diff options
Diffstat (limited to 'fs/ceph/caps.c')
-rw-r--r-- | fs/ceph/caps.c | 200 |
1 files changed, 128 insertions, 72 deletions
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 620daad201db..25442b40c25a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -147,7 +147,7 @@ void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta) spin_unlock(&mdsc->caps_list_lock); } -int ceph_reserve_caps(struct ceph_mds_client *mdsc, +void ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need) { int i; @@ -155,7 +155,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, int have; int alloc = 0; LIST_HEAD(newcaps); - int ret = 0; dout("reserve caps ctx=%p need=%d\n", ctx, need); @@ -174,14 +173,15 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, for (i = have; i < need; i++) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); - if (!cap) { - ret = -ENOMEM; - goto out_alloc_count; - } + if (!cap) + break; list_add(&cap->caps_item, &newcaps); alloc++; } - BUG_ON(have + alloc != need); + /* we didn't manage to reserve as much as we needed */ + if (have + alloc != need) + pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n", + ctx, need, have + alloc); spin_lock(&mdsc->caps_list_lock); mdsc->caps_total_count += alloc; @@ -197,13 +197,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", ctx, mdsc->caps_total_count, mdsc->caps_use_count, mdsc->caps_reserve_count, mdsc->caps_avail_count); - return 0; - -out_alloc_count: - /* we didn't manage to reserve as much as we needed */ - pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n", - ctx, need, have); - return ret; } int ceph_unreserve_caps(struct ceph_mds_client *mdsc, @@ -236,8 +229,10 @@ static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc, if (!ctx) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); if (cap) { + spin_lock(&mdsc->caps_list_lock); mdsc->caps_use_count++; mdsc->caps_total_count++; + spin_unlock(&mdsc->caps_list_lock); } return cap; } @@ -488,15 +483,17 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, ci->i_rdcache_gen++; /* - * if we are newly issued FILE_SHARED, clear D_COMPLETE; we + * if we are newly issued FILE_SHARED, mark dir not complete; we * don't know what happened to this directory while we didn't * have the cap. */ if ((issued & CEPH_CAP_FILE_SHARED) && (had & CEPH_CAP_FILE_SHARED) == 0) { ci->i_shared_gen++; - if (S_ISDIR(ci->vfs_inode.i_mode)) - ceph_dir_clear_complete(&ci->vfs_inode); + if (S_ISDIR(ci->vfs_inode.i_mode)) { + dout(" marking %p NOT complete\n", &ci->vfs_inode); + __ceph_dir_clear_complete(ci); + } } } @@ -551,6 +548,7 @@ retry: cap->implemented = 0; cap->mds = mds; cap->mds_wanted = 0; + cap->mseq = 0; cap->ci = ci; __insert_cap_node(ci, cap); @@ -607,10 +605,20 @@ retry: __cap_delay_requeue(mdsc, ci); } - if (flags & CEPH_CAP_FLAG_AUTH) - ci->i_auth_cap = cap; - else if (ci->i_auth_cap == cap) + if (flags & CEPH_CAP_FLAG_AUTH) { + if (ci->i_auth_cap == NULL || + ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) + ci->i_auth_cap = cap; + } else if (ci->i_auth_cap == cap) { ci->i_auth_cap = NULL; + spin_lock(&mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) { + dout(" moving %p to cap_dirty_migrating\n", inode); + list_move(&ci->i_dirty_item, + &mdsc->cap_dirty_migrating); + } + spin_unlock(&mdsc->cap_dirty_lock); + } dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", inode, ceph_vinop(inode), cap, ceph_cap_string(issued), @@ -618,7 +626,10 @@ retry: cap->cap_id = cap_id; cap->issued = issued; cap->implemented |= issued; - cap->mds_wanted |= wanted; + if (mseq > cap->mseq) + cap->mds_wanted = wanted; + else + cap->mds_wanted |= wanted; cap->seq = seq; cap->issue_seq = seq; cap->mseq = mseq; @@ -679,6 +690,15 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) if (implemented) *implemented |= cap->implemented; } + /* + * exclude caps issued by non-auth MDS, but are been revoking + * by the auth MDS. The non-auth MDS should be revoking/exporting + * these caps, but the message is delayed. + */ + if (ci->i_auth_cap) { + cap = ci->i_auth_cap; + have &= ~cap->implemented | cap->issued; + } return have; } @@ -786,22 +806,28 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) /* * Return true if mask caps are currently being revoked by an MDS. */ -int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) +int __ceph_caps_revoking_other(struct ceph_inode_info *ci, + struct ceph_cap *ocap, int mask) { - struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; struct rb_node *p; - int ret = 0; - spin_lock(&ci->i_ceph_lock); for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); - if (__cap_is_valid(cap) && - (cap->implemented & ~cap->issued & mask)) { - ret = 1; - break; - } + if (cap != ocap && __cap_is_valid(cap) && + (cap->implemented & ~cap->issued & mask)) + return 1; } + return 0; +} + +int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) +{ + struct inode *inode = &ci->vfs_inode; + int ret; + + spin_lock(&ci->i_ceph_lock); + ret = __ceph_caps_revoking_other(ci, NULL, mask); spin_unlock(&ci->i_ceph_lock); dout("ceph_caps_revoking %p %s = %d\n", inode, ceph_cap_string(mask), ret); @@ -928,7 +954,7 @@ static int send_cap_msg(struct ceph_mds_session *session, u64 size, u64 max_size, struct timespec *mtime, struct timespec *atime, u64 time_warp_seq, - uid_t uid, gid_t gid, umode_t mode, + kuid_t uid, kgid_t gid, umode_t mode, u64 xattr_version, struct ceph_buffer *xattrs_buf, u64 follows) @@ -972,8 +998,8 @@ static int send_cap_msg(struct ceph_mds_session *session, ceph_encode_timespec(&fc->atime, atime); fc->time_warp_seq = cpu_to_le32(time_warp_seq); - fc->uid = cpu_to_le32(uid); - fc->gid = cpu_to_le32(gid); + fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid)); + fc->gid = cpu_to_le32(from_kgid(&init_user_ns, gid)); fc->mode = cpu_to_le32(mode); fc->xattr_version = cpu_to_le64(xattr_version); @@ -987,9 +1013,9 @@ static int send_cap_msg(struct ceph_mds_session *session, return 0; } -static void __queue_cap_release(struct ceph_mds_session *session, - u64 ino, u64 cap_id, u32 migrate_seq, - u32 issue_seq) +void __queue_cap_release(struct ceph_mds_session *session, + u64 ino, u64 cap_id, u32 migrate_seq, + u32 issue_seq) { struct ceph_msg *msg; struct ceph_mds_cap_release *head; @@ -1005,7 +1031,7 @@ static void __queue_cap_release(struct ceph_mds_session *session, BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); head = msg->front.iov_base; - head->num = cpu_to_le32(le32_to_cpu(head->num) + 1); + le32_add_cpu(&head->num, 1); item = msg->front.iov_base + msg->front.iov_len; item->ino = cpu_to_le64(ino); item->cap_id = cpu_to_le64(cap_id); @@ -1079,8 +1105,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, struct timespec mtime, atime; int wake = 0; umode_t mode; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; struct ceph_mds_session *session; u64 xattr_version = 0; struct ceph_buffer *xattr_blob = NULL; @@ -1349,11 +1375,15 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) if (!ci->i_head_snapc) ci->i_head_snapc = ceph_get_snap_context( ci->i_snap_realm->cached_context); - dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode, - ci->i_head_snapc); + dout(" inode %p now dirty snapc %p auth cap %p\n", + &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); - list_add(&ci->i_dirty_item, &mdsc->cap_dirty); + if (ci->i_auth_cap) + list_add(&ci->i_dirty_item, &mdsc->cap_dirty); + else + list_add(&ci->i_dirty_item, + &mdsc->cap_dirty_migrating); spin_unlock(&mdsc->cap_dirty_lock); if (ci->i_flushing_caps == 0) { ihold(inode); @@ -1454,7 +1484,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; - int file_wanted, used; + int file_wanted, used, cap_used; int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ int issued, implemented, want, retain, revoking, flushing = 0; int mds = -1; /* keep track of how far we've gone through i_caps list @@ -1557,9 +1587,14 @@ retry_locked: /* NOTE: no side-effects allowed, until we take s_mutex */ + cap_used = used; + if (ci->i_auth_cap && cap != ci->i_auth_cap) + cap_used &= ~ci->i_auth_cap->issued; + revoking = cap->implemented & ~cap->issued; - dout(" mds%d cap %p issued %s implemented %s revoking %s\n", + dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n", cap->mds, cap, ceph_cap_string(cap->issued), + ceph_cap_string(cap_used), ceph_cap_string(cap->implemented), ceph_cap_string(revoking)); @@ -1587,7 +1622,7 @@ retry_locked: } /* completed revocation? going down and there are no caps? */ - if (revoking && (revoking & used) == 0) { + if (revoking && (revoking & cap_used) == 0) { dout("completed revocation of %s\n", ceph_cap_string(cap->implemented & ~cap->issued)); goto ack; @@ -1664,8 +1699,8 @@ ack: sent++; /* __send_cap drops i_ceph_lock */ - delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, - retain, flushing, NULL); + delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used, + want, retain, flushing, NULL); goto retry; /* retake i_ceph_lock and restart our cap scan. */ } @@ -1955,8 +1990,15 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, cap = ci->i_auth_cap; dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); + __ceph_flush_snaps(ci, &session, 1); + if (ci->i_flushing_caps) { + spin_lock(&mdsc->cap_dirty_lock); + list_move_tail(&ci->i_flushing_item, + &cap->session->s_cap_flushing); + spin_unlock(&mdsc->cap_dirty_lock); + delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, __ceph_caps_used(ci), __ceph_caps_wanted(ci), @@ -2027,6 +2069,17 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, goto out; } + /* finish pending truncate */ + while (ci->i_truncate_pending) { + spin_unlock(&ci->i_ceph_lock); + if (!(need & CEPH_CAP_FILE_WR)) + mutex_lock(&inode->i_mutex); + __ceph_do_pending_vmtruncate(inode); + if (!(need & CEPH_CAP_FILE_WR)) + mutex_unlock(&inode->i_mutex); + spin_lock(&ci->i_ceph_lock); + } + if (need & CEPH_CAP_FILE_WR) { if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { dout("get_cap_refs %p endoff %llu > maxsize %llu\n", @@ -2048,12 +2101,6 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, } have = __ceph_caps_issued(ci, &implemented); - /* - * disallow writes while a truncate is pending - */ - if (ci->i_truncate_pending) - have &= ~CEPH_CAP_FILE_WR; - if ((have & need) == need) { /* * Look at (implemented & ~have & not) so that we keep waiting @@ -2353,10 +2400,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { inode->i_mode = le32_to_cpu(grant->mode); - inode->i_uid = le32_to_cpu(grant->uid); - inode->i_gid = le32_to_cpu(grant->gid); + inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); + inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, - inode->i_uid, inode->i_gid); + from_kuid(&init_user_ns, inode->i_uid), + from_kgid(&init_user_ns, inode->i_gid)); } if ((issued & CEPH_CAP_LINK_EXCL) == 0) @@ -2388,7 +2436,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, &atime); /* max size increase? */ - if (max_size != ci->i_max_size) { + if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); ci->i_max_size = max_size; if (max_size >= ci->i_wanted_max_size) { @@ -2410,7 +2458,9 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, dout("mds wanted %s -> %s\n", ceph_cap_string(le32_to_cpu(grant->wanted)), ceph_cap_string(wanted)); - grant->wanted = cpu_to_le32(wanted); + /* imported cap may not have correct mds_wanted */ + if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) + check_caps = 1; } cap->seq = seq; @@ -2444,6 +2494,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } else { dout("grant: %s -> %s\n", ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); + /* non-auth MDS is revoking the newly grant caps ? */ + if (cap == ci->i_auth_cap && + __ceph_caps_revoking_other(ci, cap, newcaps)) + check_caps = 2; + cap->issued = newcaps; cap->implemented |= newcaps; /* add bits only, to * avoid stepping on a @@ -2745,6 +2800,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, /* make sure we re-request max_size, if necessary */ spin_lock(&ci->i_ceph_lock); + ci->i_wanted_max_size = 0; /* reset */ ci->i_requested_max_size = 0; spin_unlock(&ci->i_ceph_lock); } @@ -2813,6 +2869,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, (unsigned)seq); + if (op == CEPH_CAP_OP_IMPORT) + ceph_add_cap_releases(mdsc, session); + /* lookup ino */ inode = ceph_find_inode(sb, vino); ci = ceph_inode(inode); @@ -2840,8 +2899,6 @@ void ceph_handle_caps(struct ceph_mds_session *session, case CEPH_CAP_OP_IMPORT: handle_cap_import(mdsc, inode, h, session, snaptrace, snaptrace_len); - ceph_check_caps(ceph_inode(inode), 0, session); - goto done_unlocked; } /* the rest require a cap */ @@ -2858,6 +2915,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, switch (op) { case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: + case CEPH_CAP_OP_IMPORT: handle_cap_grant(inode, h, session, cap, msg->middle); goto done_unlocked; @@ -3010,21 +3068,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode, (cap->issued & unless) == 0)) { if ((cap->issued & drop) && (cap->issued & unless) == 0) { - dout("encode_inode_release %p cap %p %s -> " - "%s\n", inode, cap, + int wanted = __ceph_caps_wanted(ci); + if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0) + wanted |= cap->mds_wanted; + dout("encode_inode_release %p cap %p " + "%s -> %s, wanted %s -> %s\n", inode, cap, ceph_cap_string(cap->issued), - ceph_cap_string(cap->issued & ~drop)); + ceph_cap_string(cap->issued & ~drop), + ceph_cap_string(cap->mds_wanted), + ceph_cap_string(wanted)); + cap->issued &= ~drop; cap->implemented &= ~drop; - if (ci->i_ceph_flags & CEPH_I_NODELAY) { - int wanted = __ceph_caps_wanted(ci); - dout(" wanted %s -> %s (act %s)\n", - ceph_cap_string(cap->mds_wanted), - ceph_cap_string(cap->mds_wanted & - ~wanted), - ceph_cap_string(wanted)); - cap->mds_wanted &= wanted; - } + cap->mds_wanted = wanted; } else { dout("encode_inode_release %p cap %p %s" " (force)\n", inode, cap, |