nfs-ganesha 1.4

fsal_mds.c

Go to the documentation of this file.
00001 /*
00002  * vim:expandtab:shiftwidth=8:tabstop=8:
00003  *
00004  * Copyright (C) 2010 The Linux Box Corporation
00005  * All Rights Reserved
00006  * Contributor: Adam C. Emerson
00007  *
00008  * This program is free software; you can redistribute it and/or
00009  * modify it under the terms of the GNU Lesser General Public
00010  * License as published by the Free Software Foundation; either
00011  * version 3 of the License, or (at your option) any later version.
00012  *
00013  * This program is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016  * Lesser General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU Lesser General Public
00019  * License along with this library; if not, write to the Free Software
00020  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
00021  *
00022  * ---------------------------------------
00023  */
00024 
00035 #ifdef HAVE_CONFIG_H
00036 #include "config.h"
00037 #endif
00038 
00039 #include "fsal.h"
00040 #include "fsal_internal.h"
00041 #include "fsal_convert.h"
00042 #include "nfsv41.h"
00043 #include <cephfs/libcephfs.h>
00044 #include <fcntl.h>
00045 #include "HashTable.h"
00046 #include <pthread.h>
00047 #include <stdint.h>
00048 #include "fsal_types.h"
00049 #include "fsal_pnfs.h"
00050 #include "pnfs_common.h"
00051 #include "fsal_pnfs_files.h"
00052 
/*
 * Upper bound on the number of stripes in a layout pattern.
 *
 * Linux supports a stripe pattern with no more than 4096 stripes, but
 * for now we cap it at 1024 so that the encoded da_addrs do not grow
 * too large.
 */
const size_t BIGGEST_PATTERN = 1024;
00058 
00059 
00060 
00061 nfsstat4
00062 CEPHFSAL_layoutget(fsal_handle_t *exthandle,
00063                    fsal_op_context_t *extcontext,
00064                    XDR *loc_body,
00065                    const struct fsal_layoutget_arg *arg,
00066                    struct fsal_layoutget_res *res)
00067 {
00068      /* The FSAL handle as defined for the CEPH FSAL */
00069      cephfsal_handle_t* handle = (cephfsal_handle_t*) exthandle;
00070      /* The FSAL operation context as defined for the CEPH FSAL */
00071      cephfsal_op_context_t* context = (cephfsal_op_context_t*) extcontext;
00072      /* The mount passed to all*/
00073      struct ceph_mount_info *cmount = context->export_context->cmount;
00074      /* Structure containing the storage parameters of the file within
00075         the Ceph cluster. */
00076      struct ceph_file_layout file_layout;
00077      /* Width of each stripe on the file */
00078      uint32_t stripe_width = 0;
00079      /* Utility parameter */
00080      nfl_util4 util = 0;
00081      /* The last byte that can be accessed through pNFS */
00082      uint64_t last_possible_byte = 0;
00083      /* The deviceid for this layout */
00084      struct pnfs_deviceid deviceid = {0, 0};
00085      /* Data server handle */
00086      cephfsal_handle_t ds_handle;
00087      /* NFS Status */
00088      nfsstat4 nfs_status = 0;
00089 
00090      /* We support only LAYOUT4_NFSV4_1_FILES layouts */
00091 
00092      if (arg->type != LAYOUT4_NFSV4_1_FILES) {
00093           LogCrit(COMPONENT_PNFS,
00094                   "Unsupported layout type: %x",
00095                   arg->type);
00096           return NFS4ERR_UNKNOWN_LAYOUTTYPE;
00097      }
00098 
00099      /* Get basic information on the file and calculate the dimensions
00100         of the layout we can support. */
00101 
00102      memset(&file_layout, 0, sizeof(struct ceph_file_layout));
00103 
00104      ceph_ll_file_layout(cmount, VINODE(handle), &file_layout);
00105      stripe_width = file_layout.fl_stripe_unit;
00106      last_possible_byte = (BIGGEST_PATTERN * stripe_width) - 1;
00107 
00108      /* Since the Linux kernel refuses to work with any layout that
00109         doesn't cover the whole file, if a whole file layout is
00110         requested, lie.
00111 
00112         Otherwise, make sure the required layout doesn't go beyond
00113         what can be accessed through pNFS. */
00114      if (!((res->segment.offset == 0) &&
00115            (res->segment.length == NFS4_UINT64_MAX))) {
00116           struct pnfs_segment smallest_acceptable = {
00117                .io_mode = res->segment.io_mode,
00118                .offset = res->segment.offset,
00119                .length = arg->minlength
00120           };
00121           struct pnfs_segment forbidden_area = {
00122                .io_mode = res->segment.io_mode,
00123                .offset = last_possible_byte + 1,
00124                .length = NFS4_UINT64_MAX
00125           };
00126           if (pnfs_segments_overlap(smallest_acceptable,
00127                                     forbidden_area)) {
00128                LogCrit(COMPONENT_PNFS,
00129                        "Required layout extends beyond allowed region."
00130                        "offset: %"PRIu64", minlength: %" PRIu64".",
00131                        res->segment.offset,
00132                        arg->minlength);
00133                return NFS4ERR_BADLAYOUT;
00134           }
00135           res->segment.offset = 0;
00136           res->segment.length = stripe_width * BIGGEST_PATTERN;
00137           res->segment.io_mode = LAYOUTIOMODE4_RW;
00138      }
00139 
00140      /* For now, just make the low quad of the deviceid be the inode
00141         number.  With the span of the layouts constrained above, this
00142         lets us generate the device address on the fly from the
00143         deviceid rather than storing it. */
00144 
00145      deviceid.export_id = arg->export_id;
00146      deviceid.devid = VINODE(handle).ino.val;
00147 
00148      /* We return exactly one filehandle, filling in the necessary
00149         information for the DS server to speak to the Ceph OSD
00150         directly. */
00151 
00152      ds_handle = *handle;
00153      ds_handle.data.layout = file_layout;
00154      ds_handle.data.snapseq = ceph_ll_snap_seq(cmount, VINODE(handle));
00155 
00156      /* We are using sparse layouts with commit-through-DS, so our
00157         utility word contains only the stripe width, our first stripe
00158         is always at the beginning of the layout, and there is no
00159         pattern offset. */
00160 
00161      if ((stripe_width & ~NFL4_UFLG_STRIPE_UNIT_SIZE_MASK) != 0) {
00162           LogCrit(COMPONENT_PNFS,
00163                   "Ceph returned stripe width that is disallowed by NFS: "
00164                   "%"PRIu32".", stripe_width);
00165           return NFS4ERR_SERVERFAULT;
00166      }
00167      util = stripe_width;
00168 
00169      if ((nfs_status
00170           = FSAL_encode_file_layout(loc_body,
00171                                     extcontext,
00172                                     &deviceid,
00173                                     util,
00174                                     0,
00175                                     0,
00176                                     1,
00177                                     (fsal_handle_t *)&ds_handle))) {
00178           LogCrit(COMPONENT_PNFS, "Failed to encode nfsv4_1_file_layout.");
00179           return nfs_status;
00180      }
00181 
00182      /* We grant only one segment, and we want it back when the file
00183         is closed. */
00184 
00185      res->return_on_close = TRUE;
00186      res->last_segment = TRUE;
00187 
00188      return NFS4_OK;
00189 }
00190 
00191 nfsstat4
00192 CEPHFSAL_layoutreturn(fsal_handle_t* handle,
00193                       fsal_op_context_t* context,
00194                       XDR *lrf_body,
00195                       const struct fsal_layoutreturn_arg *arg)
00196 
00197 {
00198      /* Sanity check on type */
00199      if (arg->lo_type != LAYOUT4_NFSV4_1_FILES) {
00200           LogCrit(COMPONENT_PNFS,
00201                   "Unsupported layout type: %x",
00202                   arg->lo_type);
00203           return NFS4ERR_UNKNOWN_LAYOUTTYPE;
00204      }
00205 
00206      /* Since we no longer store DS addresses, we no longer have
00207         anything to free.  Later on we should unravel the Ceph client
00208         a bit more and coordinate with the Ceph MDS's notion of read
00209         and write pins, but that isn't germane until we have
00210         LAYOUTRECALL. */
00211 
00212      return NFS4_OK;
00213 }
00214 
00215 nfsstat4
00216 CEPHFSAL_layoutcommit(fsal_handle_t *exthandle,
00217                       fsal_op_context_t *extcontext,
00218                       XDR *lou_body,
00219                       const struct fsal_layoutcommit_arg *arg,
00220                       struct fsal_layoutcommit_res *res)
00221 {
00222      /* Filehandle for Ceph calls */
00223      cephfsal_handle_t* handle = (cephfsal_handle_t*) exthandle;
00224      /* Operation context */
00225      cephfsal_op_context_t* context = (cephfsal_op_context_t*) extcontext;
00226      /* Mount structure that must be supplied with each call to Ceph */
00227      struct ceph_mount_info *cmount = context->export_context->cmount;
00228      /* User ID and group ID for permissions */
00229      int uid = FSAL_OP_CONTEXT_TO_UID(context);
00230      int gid = FSAL_OP_CONTEXT_TO_GID(context);
00231      /* Old stat, so we don't truncate file or reverse time */
00232      struct stat stold;
00233      /* new stat to set time and size */
00234      struct stat stnew;
00235      /* Mask to determine exactly what gets set */
00236      int attrmask = 0;
00237      /* Error returns from Ceph */
00238      int ceph_status = 0;
00239 
00240      /* Sanity check on type */
00241      if (arg->type != LAYOUT4_NFSV4_1_FILES) {
00242           LogCrit(COMPONENT_PNFS,
00243                   "Unsupported layout type: %x",
00244                   arg->type);
00245           return NFS4ERR_UNKNOWN_LAYOUTTYPE;
00246      }
00247 
00248      /* A more proper and robust implementation of this would use Ceph
00249         caps, but we need to hack at the client to expose those before
00250         it can work. */
00251 
00252      memset(&stold, 0, sizeof(struct stat));
00253      if ((ceph_status = ceph_ll_getattr(cmount, VINODE(handle),
00254                                         &stold, uid, gid)) < 0) {
00255           if (ceph_status == -EPERM) {
00256                LogCrit(COMPONENT_PNFS,
00257                        "User %u, Group %u not permitted to get attributes "
00258                        "of file %" PRIu64 ".",
00259                        uid, gid, VINODE(handle).ino.val);
00260                return NFS4ERR_ACCESS;
00261           } else {
00262                LogCrit(COMPONENT_PNFS,
00263                        "Error %d in attempt to get attributes of "
00264                        "file %" PRIu64 ".",
00265                        -ceph_status, VINODE(handle).ino.val);
00266                return posix2nfs4_error(-ceph_status);
00267           }
00268      }
00269 
00270      memset(&stnew, 0, sizeof(struct stat));
00271      if (arg->new_offset) {
00272           if (stold.st_size < arg->last_write + 1) {
00273                attrmask |= CEPH_SETATTR_SIZE;
00274                stnew.st_size = arg->last_write + 1;
00275                res->size_supplied = TRUE;
00276                res->new_size = arg->last_write + 1;
00277           }
00278      }
00279 
00280      if ((arg->time_changed) &&
00281          (arg->new_time.seconds > stold.st_mtime)) {
00282           stnew.st_mtime = arg->new_time.seconds;
00283      } else {
00284           stnew.st_mtime = time(NULL);
00285      }
00286 
00287      attrmask |= CEPH_SETATTR_MTIME;
00288 
00289      if ((ceph_status = ceph_ll_setattr(cmount, VINODE(handle), &stnew,
00290                                         attrmask, uid, gid)) < 0) {
00291           if (ceph_status == -EPERM) {
00292                LogCrit(COMPONENT_PNFS,
00293                        "User %u, Group %u not permitted to get attributes "
00294                        "of file %" PRIu64 ".",
00295                        uid, gid, VINODE(handle).ino.val);
00296                return NFS4ERR_ACCESS;
00297           } else {
00298                LogCrit(COMPONENT_PNFS,
00299                        "Error %d in attempt to get attributes of "
00300                        "file %" PRIu64 ".",
00301                        -ceph_status, VINODE(handle).ino.val);
00302                return posix2nfs4_error(-ceph_status);
00303           }
00304      }
00305 
00306      /* This is likely universal for files. */
00307 
00308      res->commit_done = TRUE;
00309 
00310      return NFS4_OK;
00311 }
00312 
00313 nfsstat4
00314 CEPHFSAL_getdeviceinfo(fsal_op_context_t *extcontext,
00315                        XDR* da_addr_body,
00316                        layouttype4 type,
00317                        const struct pnfs_deviceid *deviceid)
00318 {
00319      /* Operation context */
00320      cephfsal_op_context_t* context = (cephfsal_op_context_t*) extcontext;
00321      /* Mount structure that must be supplied with each call to Ceph */
00322      struct ceph_mount_info *cmount = context->export_context->cmount;
00323      /* The number of Ceph OSDs in the cluster */
00324      unsigned num_osds = ceph_ll_num_osds(cmount);
00325      /* Minimal information needed to get layout info */
00326      vinodeno_t vinode;
00327      /* Structure containing the storage parameters of the file within
00328         the Ceph cluster. */
00329      struct ceph_file_layout file_layout;
00330      /* Currently, all layouts have the same number of stripes */
00331      uint32_t stripes = BIGGEST_PATTERN;
00332      /* Index for iterating over stripes */
00333      size_t stripe  = 0;
00334      /* Index for iterating over OSDs */
00335      size_t osd = 0;
00336      /* NFSv4 status code */
00337      nfsstat4 nfs_status = 0;
00338 
00339      vinode.ino.val = deviceid->devid;
00340      vinode.snapid.val = CEPH_NOSNAP;
00341 
00342      /* Sanity check on type */
00343      if (type != LAYOUT4_NFSV4_1_FILES) {
00344           LogCrit(COMPONENT_PNFS,
00345                   "Unsupported layout type: %x",
00346                   type);
00347           return NFS4ERR_UNKNOWN_LAYOUTTYPE;
00348      }
00349 
00350      /* Retrieve and calculate storage parameters of layout */
00351 
00352      memset(&file_layout, 0, sizeof(struct ceph_file_layout));
00353      ceph_ll_file_layout(cmount, vinode, &file_layout);
00354 
00355      /* As this is large, we encode as we go rather than building a
00356         structure and encoding it all at once. */
00357 
00358      /* The first entry in the nfsv4_1_file_ds_addr4 is the array of
00359         stripe indices. */
00360 
00361      /* First we encode the count of stripes.  Since our pattern
00362         doesn't repeat, we have as many indices as we do stripes. */
00363 
00364      if (!xdr_uint32_t(da_addr_body, &stripes)) {
00365           LogCrit(COMPONENT_PNFS, "Failed to encode length of "
00366                   "stripe_indices array: %" PRIu32 ".", stripes);
00367           return NFS4ERR_SERVERFAULT;
00368      }
00369 
00370      for (stripe = 0; stripe < stripes; stripe++) {
00371           uint32_t stripe_osd
00372                = stripe_osd = ceph_ll_get_stripe_osd(cmount,
00373                                                      vinode,
00374                                                      stripe,
00375                                                      &file_layout);
00376           if (stripe_osd < 0) {
00377                LogCrit(COMPONENT_PNFS, "Failed to retrieve OSD for "
00378                        "stripe %lu of file %" PRIu64 ".  Error: %u",
00379                        stripe, deviceid->devid, -stripe_osd);
00380                return NFS4ERR_SERVERFAULT;
00381           }
00382           if (!xdr_uint32_t(da_addr_body, &stripe_osd)) {
00383                LogCrit(COMPONENT_PNFS, "Failed to encode OSD for stripe %lu.",
00384                        stripe);
00385                return NFS4ERR_SERVERFAULT;
00386           }
00387      }
00388 
00389      /* The number of OSDs in our cluster is the length of our array
00390         of multipath_lists */
00391 
00392      if (!xdr_uint32_t(da_addr_body, &num_osds)) {
00393           LogCrit(COMPONENT_PNFS, "Failed to encode length of "
00394                   "multipath_ds_list array: %u", num_osds);
00395           return NFS4ERR_SERVERFAULT;
00396      }
00397 
00398      /* Since our index is the OSD number itself, we have only one
00399         host per multipath_list. */
00400 
00401      for(osd = 0; osd < num_osds; osd++) {
00402           fsal_multipath_member_t host;
00403           memset(&host, 0, sizeof(fsal_multipath_member_t));
00404           host.proto = 6;
00405           if (ceph_ll_osdaddr(cmount, osd, &host.addr) < 0) {
00406                LogCrit(COMPONENT_PNFS,
00407                        "Unable to get IP address for OSD %lu.",
00408                        osd);
00409                return NFS4ERR_SERVERFAULT;
00410           }
00411           host.port = 2049;
00412           if ((nfs_status
00413                = FSAL_encode_v4_multipath(da_addr_body,
00414                                           1,
00415                                           &host))
00416               != NFS4_OK) {
00417                return nfs_status;
00418           }
00419      }
00420 
00421      return NFS4_OK;
00422 }
00423 
00424 nfsstat4
00425 CEPHFSAL_getdevicelist(fsal_handle_t *handle,
00426                        fsal_op_context_t *context,
00427                        const struct fsal_getdevicelist_arg *arg,
00428                        struct fsal_getdevicelist_res *res)
00429 {
00430      /* Sanity check on type */
00431      if (arg->type != LAYOUT4_NFSV4_1_FILES) {
00432           LogCrit(COMPONENT_PNFS,
00433                   "Unsupported layout type: %x",
00434                   arg->type);
00435           return NFS4ERR_UNKNOWN_LAYOUTTYPE;
00436      }
00437 
00438      /* We have neither the ability nor the desire to return all valid
00439         deviceids, so we do nothing successfully. */
00440 
00441      res->count = 0;
00442      res->eof = TRUE;
00443 
00444      return NFS4_OK;
00445 }