summaryrefslogtreecommitdiff
path: root/fs/ceph/caps.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ceph/caps.c')
-rw-r--r--fs/ceph/caps.c200
1 files changed, 128 insertions, 72 deletions
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 620daad201db..25442b40c25a 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -147,7 +147,7 @@ void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
spin_unlock(&mdsc->caps_list_lock);
}
-int ceph_reserve_caps(struct ceph_mds_client *mdsc,
+void ceph_reserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx, int need)
{
int i;
@@ -155,7 +155,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
int have;
int alloc = 0;
LIST_HEAD(newcaps);
- int ret = 0;
dout("reserve caps ctx=%p need=%d\n", ctx, need);
@@ -174,14 +173,15 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
for (i = have; i < need; i++) {
cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
- if (!cap) {
- ret = -ENOMEM;
- goto out_alloc_count;
- }
+ if (!cap)
+ break;
list_add(&cap->caps_item, &newcaps);
alloc++;
}
- BUG_ON(have + alloc != need);
+ /* we didn't manage to reserve as much as we needed */
+ if (have + alloc != need)
+ pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
+ ctx, need, have + alloc);
spin_lock(&mdsc->caps_list_lock);
mdsc->caps_total_count += alloc;
@@ -197,13 +197,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
ctx, mdsc->caps_total_count, mdsc->caps_use_count,
mdsc->caps_reserve_count, mdsc->caps_avail_count);
- return 0;
-
-out_alloc_count:
- /* we didn't manage to reserve as much as we needed */
- pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
- ctx, need, have);
- return ret;
}
int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
@@ -236,8 +229,10 @@ static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
if (!ctx) {
cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
if (cap) {
+ spin_lock(&mdsc->caps_list_lock);
mdsc->caps_use_count++;
mdsc->caps_total_count++;
+ spin_unlock(&mdsc->caps_list_lock);
}
return cap;
}
@@ -488,15 +483,17 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
ci->i_rdcache_gen++;
/*
- * if we are newly issued FILE_SHARED, clear D_COMPLETE; we
+ * if we are newly issued FILE_SHARED, mark dir not complete; we
* don't know what happened to this directory while we didn't
* have the cap.
*/
if ((issued & CEPH_CAP_FILE_SHARED) &&
(had & CEPH_CAP_FILE_SHARED) == 0) {
ci->i_shared_gen++;
- if (S_ISDIR(ci->vfs_inode.i_mode))
- ceph_dir_clear_complete(&ci->vfs_inode);
+ if (S_ISDIR(ci->vfs_inode.i_mode)) {
+ dout(" marking %p NOT complete\n", &ci->vfs_inode);
+ __ceph_dir_clear_complete(ci);
+ }
}
}
@@ -551,6 +548,7 @@ retry:
cap->implemented = 0;
cap->mds = mds;
cap->mds_wanted = 0;
+ cap->mseq = 0;
cap->ci = ci;
__insert_cap_node(ci, cap);
@@ -607,10 +605,20 @@ retry:
__cap_delay_requeue(mdsc, ci);
}
- if (flags & CEPH_CAP_FLAG_AUTH)
- ci->i_auth_cap = cap;
- else if (ci->i_auth_cap == cap)
+ if (flags & CEPH_CAP_FLAG_AUTH) {
+ if (ci->i_auth_cap == NULL ||
+ ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0)
+ ci->i_auth_cap = cap;
+ } else if (ci->i_auth_cap == cap) {
ci->i_auth_cap = NULL;
+ spin_lock(&mdsc->cap_dirty_lock);
+ if (!list_empty(&ci->i_dirty_item)) {
+ dout(" moving %p to cap_dirty_migrating\n", inode);
+ list_move(&ci->i_dirty_item,
+ &mdsc->cap_dirty_migrating);
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+ }
dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
@@ -618,7 +626,10 @@ retry:
cap->cap_id = cap_id;
cap->issued = issued;
cap->implemented |= issued;
- cap->mds_wanted |= wanted;
+ if (mseq > cap->mseq)
+ cap->mds_wanted = wanted;
+ else
+ cap->mds_wanted |= wanted;
cap->seq = seq;
cap->issue_seq = seq;
cap->mseq = mseq;
@@ -679,6 +690,15 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
if (implemented)
*implemented |= cap->implemented;
}
+ /*
+ * exclude caps issued by non-auth MDS, but are been revoking
+ * by the auth MDS. The non-auth MDS should be revoking/exporting
+ * these caps, but the message is delayed.
+ */
+ if (ci->i_auth_cap) {
+ cap = ci->i_auth_cap;
+ have &= ~cap->implemented | cap->issued;
+ }
return have;
}
@@ -786,22 +806,28 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
/*
* Return true if mask caps are currently being revoked by an MDS.
*/
-int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
+ struct ceph_cap *ocap, int mask)
{
- struct inode *inode = &ci->vfs_inode;
struct ceph_cap *cap;
struct rb_node *p;
- int ret = 0;
- spin_lock(&ci->i_ceph_lock);
for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
cap = rb_entry(p, struct ceph_cap, ci_node);
- if (__cap_is_valid(cap) &&
- (cap->implemented & ~cap->issued & mask)) {
- ret = 1;
- break;
- }
+ if (cap != ocap && __cap_is_valid(cap) &&
+ (cap->implemented & ~cap->issued & mask))
+ return 1;
}
+ return 0;
+}
+
+int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+{
+ struct inode *inode = &ci->vfs_inode;
+ int ret;
+
+ spin_lock(&ci->i_ceph_lock);
+ ret = __ceph_caps_revoking_other(ci, NULL, mask);
spin_unlock(&ci->i_ceph_lock);
dout("ceph_caps_revoking %p %s = %d\n", inode,
ceph_cap_string(mask), ret);
@@ -928,7 +954,7 @@ static int send_cap_msg(struct ceph_mds_session *session,
u64 size, u64 max_size,
struct timespec *mtime, struct timespec *atime,
u64 time_warp_seq,
- uid_t uid, gid_t gid, umode_t mode,
+ kuid_t uid, kgid_t gid, umode_t mode,
u64 xattr_version,
struct ceph_buffer *xattrs_buf,
u64 follows)
@@ -972,8 +998,8 @@ static int send_cap_msg(struct ceph_mds_session *session,
ceph_encode_timespec(&fc->atime, atime);
fc->time_warp_seq = cpu_to_le32(time_warp_seq);
- fc->uid = cpu_to_le32(uid);
- fc->gid = cpu_to_le32(gid);
+ fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid));
+ fc->gid = cpu_to_le32(from_kgid(&init_user_ns, gid));
fc->mode = cpu_to_le32(mode);
fc->xattr_version = cpu_to_le64(xattr_version);
@@ -987,9 +1013,9 @@ static int send_cap_msg(struct ceph_mds_session *session,
return 0;
}
-static void __queue_cap_release(struct ceph_mds_session *session,
- u64 ino, u64 cap_id, u32 migrate_seq,
- u32 issue_seq)
+void __queue_cap_release(struct ceph_mds_session *session,
+ u64 ino, u64 cap_id, u32 migrate_seq,
+ u32 issue_seq)
{
struct ceph_msg *msg;
struct ceph_mds_cap_release *head;
@@ -1005,7 +1031,7 @@ static void __queue_cap_release(struct ceph_mds_session *session,
BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
head = msg->front.iov_base;
- head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
+ le32_add_cpu(&head->num, 1);
item = msg->front.iov_base + msg->front.iov_len;
item->ino = cpu_to_le64(ino);
item->cap_id = cpu_to_le64(cap_id);
@@ -1079,8 +1105,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
struct timespec mtime, atime;
int wake = 0;
umode_t mode;
- uid_t uid;
- gid_t gid;
+ kuid_t uid;
+ kgid_t gid;
struct ceph_mds_session *session;
u64 xattr_version = 0;
struct ceph_buffer *xattr_blob = NULL;
@@ -1349,11 +1375,15 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
if (!ci->i_head_snapc)
ci->i_head_snapc = ceph_get_snap_context(
ci->i_snap_realm->cached_context);
- dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode,
- ci->i_head_snapc);
+ dout(" inode %p now dirty snapc %p auth cap %p\n",
+ &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
BUG_ON(!list_empty(&ci->i_dirty_item));
spin_lock(&mdsc->cap_dirty_lock);
- list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
+ if (ci->i_auth_cap)
+ list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
+ else
+ list_add(&ci->i_dirty_item,
+ &mdsc->cap_dirty_migrating);
spin_unlock(&mdsc->cap_dirty_lock);
if (ci->i_flushing_caps == 0) {
ihold(inode);
@@ -1454,7 +1484,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = &ci->vfs_inode;
struct ceph_cap *cap;
- int file_wanted, used;
+ int file_wanted, used, cap_used;
int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
int issued, implemented, want, retain, revoking, flushing = 0;
int mds = -1; /* keep track of how far we've gone through i_caps list
@@ -1557,9 +1587,14 @@ retry_locked:
/* NOTE: no side-effects allowed, until we take s_mutex */
+ cap_used = used;
+ if (ci->i_auth_cap && cap != ci->i_auth_cap)
+ cap_used &= ~ci->i_auth_cap->issued;
+
revoking = cap->implemented & ~cap->issued;
- dout(" mds%d cap %p issued %s implemented %s revoking %s\n",
+ dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
cap->mds, cap, ceph_cap_string(cap->issued),
+ ceph_cap_string(cap_used),
ceph_cap_string(cap->implemented),
ceph_cap_string(revoking));
@@ -1587,7 +1622,7 @@ retry_locked:
}
/* completed revocation? going down and there are no caps? */
- if (revoking && (revoking & used) == 0) {
+ if (revoking && (revoking & cap_used) == 0) {
dout("completed revocation of %s\n",
ceph_cap_string(cap->implemented & ~cap->issued));
goto ack;
@@ -1664,8 +1699,8 @@ ack:
sent++;
/* __send_cap drops i_ceph_lock */
- delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
- retain, flushing, NULL);
+ delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used,
+ want, retain, flushing, NULL);
goto retry; /* retake i_ceph_lock and restart our cap scan. */
}
@@ -1955,8 +1990,15 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
cap = ci->i_auth_cap;
dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
+
__ceph_flush_snaps(ci, &session, 1);
+
if (ci->i_flushing_caps) {
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_move_tail(&ci->i_flushing_item,
+ &cap->session->s_cap_flushing);
+ spin_unlock(&mdsc->cap_dirty_lock);
+
delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
__ceph_caps_used(ci),
__ceph_caps_wanted(ci),
@@ -2027,6 +2069,17 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
goto out;
}
+ /* finish pending truncate */
+ while (ci->i_truncate_pending) {
+ spin_unlock(&ci->i_ceph_lock);
+ if (!(need & CEPH_CAP_FILE_WR))
+ mutex_lock(&inode->i_mutex);
+ __ceph_do_pending_vmtruncate(inode);
+ if (!(need & CEPH_CAP_FILE_WR))
+ mutex_unlock(&inode->i_mutex);
+ spin_lock(&ci->i_ceph_lock);
+ }
+
if (need & CEPH_CAP_FILE_WR) {
if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
@@ -2048,12 +2101,6 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
}
have = __ceph_caps_issued(ci, &implemented);
- /*
- * disallow writes while a truncate is pending
- */
- if (ci->i_truncate_pending)
- have &= ~CEPH_CAP_FILE_WR;
-
if ((have & need) == need) {
/*
* Look at (implemented & ~have & not) so that we keep waiting
@@ -2353,10 +2400,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
inode->i_mode = le32_to_cpu(grant->mode);
- inode->i_uid = le32_to_cpu(grant->uid);
- inode->i_gid = le32_to_cpu(grant->gid);
+ inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
+ inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
- inode->i_uid, inode->i_gid);
+ from_kuid(&init_user_ns, inode->i_uid),
+ from_kgid(&init_user_ns, inode->i_gid));
}
if ((issued & CEPH_CAP_LINK_EXCL) == 0)
@@ -2388,7 +2436,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
&atime);
/* max size increase? */
- if (max_size != ci->i_max_size) {
+ if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
ci->i_max_size = max_size;
if (max_size >= ci->i_wanted_max_size) {
@@ -2410,7 +2458,9 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
dout("mds wanted %s -> %s\n",
ceph_cap_string(le32_to_cpu(grant->wanted)),
ceph_cap_string(wanted));
- grant->wanted = cpu_to_le32(wanted);
+ /* imported cap may not have correct mds_wanted */
+ if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT)
+ check_caps = 1;
}
cap->seq = seq;
@@ -2444,6 +2494,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
} else {
dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
ceph_cap_string(newcaps));
+ /* non-auth MDS is revoking the newly grant caps ? */
+ if (cap == ci->i_auth_cap &&
+ __ceph_caps_revoking_other(ci, cap, newcaps))
+ check_caps = 2;
+
cap->issued = newcaps;
cap->implemented |= newcaps; /* add bits only, to
* avoid stepping on a
@@ -2745,6 +2800,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
/* make sure we re-request max_size, if necessary */
spin_lock(&ci->i_ceph_lock);
+ ci->i_wanted_max_size = 0; /* reset */
ci->i_requested_max_size = 0;
spin_unlock(&ci->i_ceph_lock);
}
@@ -2813,6 +2869,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
(unsigned)seq);
+ if (op == CEPH_CAP_OP_IMPORT)
+ ceph_add_cap_releases(mdsc, session);
+
/* lookup ino */
inode = ceph_find_inode(sb, vino);
ci = ceph_inode(inode);
@@ -2840,8 +2899,6 @@ void ceph_handle_caps(struct ceph_mds_session *session,
case CEPH_CAP_OP_IMPORT:
handle_cap_import(mdsc, inode, h, session,
snaptrace, snaptrace_len);
- ceph_check_caps(ceph_inode(inode), 0, session);
- goto done_unlocked;
}
/* the rest require a cap */
@@ -2858,6 +2915,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
switch (op) {
case CEPH_CAP_OP_REVOKE:
case CEPH_CAP_OP_GRANT:
+ case CEPH_CAP_OP_IMPORT:
handle_cap_grant(inode, h, session, cap, msg->middle);
goto done_unlocked;
@@ -3010,21 +3068,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
(cap->issued & unless) == 0)) {
if ((cap->issued & drop) &&
(cap->issued & unless) == 0) {
- dout("encode_inode_release %p cap %p %s -> "
- "%s\n", inode, cap,
+ int wanted = __ceph_caps_wanted(ci);
+ if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
+ wanted |= cap->mds_wanted;
+ dout("encode_inode_release %p cap %p "
+ "%s -> %s, wanted %s -> %s\n", inode, cap,
ceph_cap_string(cap->issued),
- ceph_cap_string(cap->issued & ~drop));
+ ceph_cap_string(cap->issued & ~drop),
+ ceph_cap_string(cap->mds_wanted),
+ ceph_cap_string(wanted));
+
cap->issued &= ~drop;
cap->implemented &= ~drop;
- if (ci->i_ceph_flags & CEPH_I_NODELAY) {
- int wanted = __ceph_caps_wanted(ci);
- dout(" wanted %s -> %s (act %s)\n",
- ceph_cap_string(cap->mds_wanted),
- ceph_cap_string(cap->mds_wanted &
- ~wanted),
- ceph_cap_string(wanted));
- cap->mds_wanted &= wanted;
- }
+ cap->mds_wanted = wanted;
} else {
dout("encode_inode_release %p cap %p %s"
" (force)\n", inode, cap,