From 76aa844d5b2fb8c839180d3f5874e333b297e5fd Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 6 Oct 2009 11:31:14 -0700 Subject: ceph: debugfs Basic state information is available via /sys/kernel/debug/ceph, including instances of the client, fsids, current monitor, mds and osd maps, outstanding server requests, and hooks to adjust debug levels. Signed-off-by: Sage Weil --- fs/ceph/debugfs.c | 425 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 425 insertions(+) create mode 100644 fs/ceph/debugfs.c (limited to 'fs/ceph/debugfs.c') diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c new file mode 100644 index 000000000000..9edbad32f116 --- /dev/null +++ b/fs/ceph/debugfs.c @@ -0,0 +1,425 @@ +#include "ceph_debug.h" + +#include +#include +#include +#include + +#include "super.h" +#include "mds_client.h" + +/* + * Implement /sys/kernel/debug/ceph fun + * + * /sys/kernel/debug/ceph/client* - an instance of the ceph client + * .../osdmap - current osdmap + * .../mdsmap - current mdsmap + * .../monmap - current monmap + * .../osdc - active osd requests + * .../mdsc - active mds requests + * .../monc - mon client state + * .../dentry_lru - dump contents of dentry lru + * .../caps - expose cap (reservation) stats + */ + +static struct dentry *ceph_debugfs_dir; + +static int monmap_show(struct seq_file *s, void *p) +{ + int i; + struct ceph_client *client = s->private; + + if (client->monc.monmap == NULL) + return 0; + + seq_printf(s, "epoch %d\n", client->monc.monmap->epoch); + for (i = 0; i < client->monc.monmap->num_mon; i++) { + struct ceph_entity_inst *inst = + &client->monc.monmap->mon_inst[i]; + + seq_printf(s, "\t%s%lld\t%s\n", + ENTITY_NAME(inst->name), + pr_addr(&inst->addr.in_addr)); + } + return 0; +} + +static int mdsmap_show(struct seq_file *s, void *p) +{ + int i; + struct ceph_client *client = s->private; + + if (client->mdsc.mdsmap == NULL) + return 0; + seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch); + seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root); + seq_printf(s, "session_timeout %d\n", + client->mdsc.mdsmap->m_session_timeout); + seq_printf(s, "session_autoclose %d\n", + client->mdsc.mdsmap->m_session_autoclose); + for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) { + struct ceph_entity_addr *addr = + &client->mdsc.mdsmap->m_info[i].addr; + int state = client->mdsc.mdsmap->m_info[i].state; + + seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr), + ceph_mds_state_name(state)); + } + return 0; +} + +static int osdmap_show(struct seq_file *s, void *p) +{ + int i; + struct ceph_client *client = s->private; + + if (client->osdc.osdmap == NULL) + return 0; + seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch); + seq_printf(s, "flags%s%s\n", + (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ? + " NEARFULL" : "", + (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ? + " FULL" : ""); + for (i = 0; i < client->osdc.osdmap->num_pools; i++) { + struct ceph_pg_pool_info *pool = + &client->osdc.osdmap->pg_pool[i]; + seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n", + i, pool->v.pg_num, pool->pg_num_mask, + pool->v.lpg_num, pool->lpg_num_mask); + } + for (i = 0; i < client->osdc.osdmap->max_osd; i++) { + struct ceph_entity_addr *addr = + &client->osdc.osdmap->osd_addr[i]; + int state = client->osdc.osdmap->osd_state[i]; + char sb[64]; + + seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n", + i, pr_addr(&addr->in_addr), + ((client->osdc.osdmap->osd_weight[i]*100) >> 16), + ceph_osdmap_state_str(sb, sizeof(sb), state)); + } + return 0; +} + +static int monc_show(struct seq_file *s, void *p) +{ + struct ceph_client *client = s->private; + struct ceph_mon_statfs_request *req; + u64 nexttid = 0; + int got; + struct ceph_mon_client *monc = &client->monc; + + mutex_lock(&monc->mutex); + + if (monc->have_mdsmap) + seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap); + if (monc->have_osdmap) + seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap); + if (monc->want_next_osdmap) + seq_printf(s, "want next osdmap\n"); + + while (nexttid < monc->last_tid) { + got = radix_tree_gang_lookup(&monc->statfs_request_tree, + (void **)&req, nexttid, 1); + if (got == 0) + break; + nexttid = req->tid + 1; + + seq_printf(s, "%lld statfs\n", req->tid); + } + mutex_unlock(&monc->mutex); + + return 0; +} + +static int mdsc_show(struct seq_file *s, void *p) +{ + struct ceph_client *client = s->private; + struct ceph_mds_request *req; + u64 nexttid = 0; + int got; + struct ceph_mds_client *mdsc = &client->mdsc; + int pathlen; + u64 pathbase; + char *path; + + mutex_lock(&mdsc->mutex); + while (nexttid < mdsc->last_tid) { + got = radix_tree_gang_lookup(&mdsc->request_tree, + (void **)&req, nexttid, 1); + if (got == 0) + break; + nexttid = req->r_tid + 1; + + if (req->r_request) + seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds); + else + seq_printf(s, "%lld\t(no request)\t", req->r_tid); + + seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); + + if (req->r_got_unsafe) + seq_printf(s, "\t(unsafe)"); + else + seq_printf(s, "\t"); + + if (req->r_inode) { + seq_printf(s, " #%llx", ceph_ino(req->r_inode)); + } else if (req->r_dentry) { + path = ceph_mdsc_build_path(req->r_dentry, &pathlen, + &pathbase, 0); + spin_lock(&req->r_dentry->d_lock); + seq_printf(s, " #%llx/%.*s (%s)", + ceph_ino(req->r_dentry->d_parent->d_inode), + req->r_dentry->d_name.len, + req->r_dentry->d_name.name, + path ? path : ""); + spin_unlock(&req->r_dentry->d_lock); + kfree(path); + } else if (req->r_path1) { + seq_printf(s, " #%llx/%s", req->r_ino1.ino, + req->r_path1); + } + + if (req->r_old_dentry) { + path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen, + &pathbase, 0); + spin_lock(&req->r_old_dentry->d_lock); + seq_printf(s, " #%llx/%.*s (%s)", + ceph_ino(req->r_old_dentry->d_parent->d_inode), + req->r_old_dentry->d_name.len, + req->r_old_dentry->d_name.name, + path ? path : ""); + spin_unlock(&req->r_old_dentry->d_lock); + kfree(path); + } else if (req->r_path2) { + if (req->r_ino2.ino) + seq_printf(s, " #%llx/%s", req->r_ino2.ino, + req->r_path2); + else + seq_printf(s, " %s", req->r_path2); + } + + seq_printf(s, "\n"); + } + mutex_unlock(&mdsc->mutex); + + return 0; +} + +static int osdc_show(struct seq_file *s, void *pp) +{ + struct ceph_client *client = s->private; + struct ceph_osd_client *osdc = &client->osdc; + struct rb_node *p; + + mutex_lock(&osdc->request_mutex); + for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { + struct ceph_osd_request *req; + struct ceph_osd_request_head *head; + struct ceph_osd_op *op; + int num_ops; + int opcode, olen; + int i; + + req = rb_entry(p, struct ceph_osd_request, r_node); + + seq_printf(s, "%lld\tosd%d\t", req->r_tid, + req->r_osd ? req->r_osd->o_osd : -1); + + head = req->r_request->front.iov_base; + op = (void *)(head + 1); + + num_ops = le16_to_cpu(head->num_ops); + olen = le32_to_cpu(head->object_len); + seq_printf(s, "%.*s", olen, + (const char *)(head->ops + num_ops)); + + if (req->r_reassert_version.epoch) + seq_printf(s, "\t%u'%llu", + (unsigned)le32_to_cpu(req->r_reassert_version.epoch), + le64_to_cpu(req->r_reassert_version.version)); + else + seq_printf(s, "\t"); + + for (i = 0; i < num_ops; i++) { + opcode = le16_to_cpu(op->op); + seq_printf(s, "\t%s", ceph_osd_op_name(opcode)); + op++; + } + + seq_printf(s, "\n"); + } + mutex_unlock(&osdc->request_mutex); + return 0; +} + +static int caps_show(struct seq_file *s, void *p) +{ + struct ceph_client *client = p; + int total, avail, used, reserved; + + ceph_reservation_status(client, &total, &avail, &used, &reserved); + seq_printf(s, "total\t\t%d\n" + "avail\t\t%d\n" + "used\t\t%d\n" + "reserved\t%d\n", + total, avail, used, reserved); + return 0; +} + +static int dentry_lru_show(struct seq_file *s, void *ptr) +{ + struct ceph_client *client = s->private; + struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_dentry_info *di; + + spin_lock(&mdsc->dentry_lru_lock); + list_for_each_entry(di, &mdsc->dentry_lru, lru) { + struct dentry *dentry = di->dentry; + seq_printf(s, "%p %p\t%.*s\n", + di, dentry, dentry->d_name.len, dentry->d_name.name); + } + spin_unlock(&mdsc->dentry_lru_lock); + + return 0; +} + +#define DEFINE_SHOW_FUNC(name) \ +static int name##_open(struct inode *inode, struct file *file) \ +{ \ + struct seq_file *sf; \ + int ret; \ + \ + ret = single_open(file, name, NULL); \ + sf = file->private_data; \ + sf->private = inode->i_private; \ + return ret; \ +} \ + \ +static const struct file_operations name##_fops = { \ + .open = name##_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +}; + +DEFINE_SHOW_FUNC(monmap_show) +DEFINE_SHOW_FUNC(mdsmap_show) +DEFINE_SHOW_FUNC(osdmap_show) +DEFINE_SHOW_FUNC(monc_show) +DEFINE_SHOW_FUNC(mdsc_show) +DEFINE_SHOW_FUNC(osdc_show) +DEFINE_SHOW_FUNC(dentry_lru_show) +DEFINE_SHOW_FUNC(caps_show) + +int __init ceph_debugfs_init(void) +{ + ceph_debugfs_dir = debugfs_create_dir("ceph", NULL); + if (!ceph_debugfs_dir) + return -ENOMEM; + return 0; +} + +void ceph_debugfs_cleanup(void) +{ + debugfs_remove(ceph_debugfs_dir); +} + +int ceph_debugfs_client_init(struct ceph_client *client) +{ + int ret = 0; + char name[80]; + + snprintf(name, sizeof(name), FSID_FORMAT ".client%lld", + PR_FSID(&client->monc.monmap->fsid), client->whoami); + + client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); + if (!client->debugfs_dir) + goto out; + + client->monc.debugfs_file = debugfs_create_file("monc", + 0600, + client->debugfs_dir, + client, + &monc_show_fops); + if (!client->monc.debugfs_file) + goto out; + + client->mdsc.debugfs_file = debugfs_create_file("mdsc", + 0600, + client->debugfs_dir, + client, + &mdsc_show_fops); + if (!client->mdsc.debugfs_file) + goto out; + + client->osdc.debugfs_file = debugfs_create_file("osdc", + 0600, + client->debugfs_dir, + client, + &osdc_show_fops); + if (!client->osdc.debugfs_file) + goto out; + + client->debugfs_monmap = debugfs_create_file("monmap", + 0600, + client->debugfs_dir, + client, + &monmap_show_fops); + if (!client->debugfs_monmap) + goto out; + + client->debugfs_mdsmap = debugfs_create_file("mdsmap", + 0600, + client->debugfs_dir, + client, + &mdsmap_show_fops); + if (!client->debugfs_mdsmap) + goto out; + + client->debugfs_osdmap = debugfs_create_file("osdmap", + 0600, + client->debugfs_dir, + client, + &osdmap_show_fops); + if (!client->debugfs_osdmap) + goto out; + + client->debugfs_dentry_lru = debugfs_create_file("dentry_lru", + 0600, + client->debugfs_dir, + client, + &dentry_lru_show_fops); + if (!client->debugfs_dentry_lru) + goto out; + + client->debugfs_caps = debugfs_create_file("caps", + 0400, + client->debugfs_dir, + client, + &caps_show_fops); + if (!client->debugfs_caps) + goto out; + + return 0; + +out: + ceph_debugfs_client_cleanup(client); + return ret; +} + +void ceph_debugfs_client_cleanup(struct ceph_client *client) +{ + debugfs_remove(client->debugfs_caps); + debugfs_remove(client->debugfs_dentry_lru); + debugfs_remove(client->debugfs_osdmap); + debugfs_remove(client->debugfs_mdsmap); + debugfs_remove(client->debugfs_monmap); + debugfs_remove(client->osdc.debugfs_file); + debugfs_remove(client->mdsc.debugfs_file); + debugfs_remove(client->monc.debugfs_file); + debugfs_remove(client->debugfs_dir); +} + -- cgit v1.2.3 From 039934b895c89c2bb40aa5132efe00e60b70efca Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 12 Nov 2009 15:05:52 -0800 Subject: ceph: build cleanly without CONFIG_DEBUG_FS Signed-off-by: Sage Weil --- fs/ceph/debugfs.c | 23 +++++++++++++++++++++++ fs/ceph/mds_client.h | 2 ++ fs/ceph/mon_client.h | 2 ++ fs/ceph/osd_client.h | 2 ++ fs/ceph/super.h | 2 ++ 5 files changed, 31 insertions(+) (limited to 'fs/ceph/debugfs.c') diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 9edbad32f116..9b2020696680 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -8,6 +8,8 @@ #include "super.h" #include "mds_client.h" +#ifdef CONFIG_DEBUG_FS + /* * Implement /sys/kernel/debug/ceph fun * @@ -423,3 +425,24 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client) debugfs_remove(client->debugfs_dir); } +#else // CONFIG_DEBUG_FS + +int __init ceph_debugfs_init(void) +{ + return 0; +} + +void ceph_debugfs_cleanup(void) +{ +} + +int ceph_debugfs_client_init(struct ceph_client *client) +{ + return 0; +} + +void ceph_debugfs_client_cleanup(struct ceph_client *client) +{ +} + +#endif // CONFIG_DEBUG_FS diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index f566e9c84295..0751b821f231 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -256,7 +256,9 @@ struct ceph_mds_client { spinlock_t cap_dirty_lock; /* protects above items */ wait_queue_head_t cap_flushing_wq; +#ifdef CONFIG_DEBUG_FS struct dentry *debugfs_file; +#endif spinlock_t dentry_lru_lock; struct list_head dentry_lru; diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h index 5258c5693b03..9f6db45bf469 100644 --- a/fs/ceph/mon_client.h +++ b/fs/ceph/mon_client.h @@ -78,7 +78,9 @@ struct ceph_mon_client { int want_next_osdmap; /* 1 = want, 2 = want+asked */ u32 have_osdmap, have_mdsmap; +#ifdef CONFIG_DEBUG_FS struct dentry *debugfs_file; +#endif }; extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end); diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h index 9a4addf7d651..766c8dc80aff 100644 --- a/fs/ceph/osd_client.h +++ b/fs/ceph/osd_client.h @@ -83,7 +83,9 @@ struct ceph_osd_client { struct rb_root requests; /* pending requests */ int num_requests; struct delayed_work timeout_work; +#ifdef CONFIG_DEBUG_FS struct dentry *debugfs_file; +#endif mempool_t *req_mempool; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 06b62c02f513..8aa1ffba6c0d 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -112,9 +112,11 @@ static inline unsigned long time_sub(unsigned long a, unsigned long b) */ struct ceph_client { __s64 whoami; /* my client number */ +#ifdef CONFIG_DEBUG_FS struct dentry *debugfs_monmap; struct dentry *debugfs_mdsmap, *debugfs_osdmap; struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps; +#endif struct mutex mount_mutex; /* serialize mount attempts */ struct ceph_mount_args *mount_args; -- cgit v1.2.3 From 0743304d871559cb4c7c066357de2caa60e94c2f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 18 Nov 2009 16:50:41 -0800 Subject: ceph: fix debugfs entry, simplify fsid checks We may first learn our fsid from any of the mon, osd, or mds maps (whichever the monitor sends first). Consolidate checks in a single helper. Initialize the client debugfs entry then, since we need the fsid (and global_id) for the directory name. Also remove dead mount code. Signed-off-by: Sage Weil --- fs/ceph/debugfs.c | 4 +- fs/ceph/mds_client.c | 12 +----- fs/ceph/mon_client.c | 113 ++++----------------------------------------------- fs/ceph/osd_client.c | 12 +----- fs/ceph/super.c | 32 ++++++++++++--- fs/ceph/super.h | 18 ++++---- 6 files changed, 51 insertions(+), 140 deletions(-) (limited to 'fs/ceph/debugfs.c') diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 9b2020696680..b90fc3e1ff70 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -7,6 +7,8 @@ #include "super.h" #include "mds_client.h" +#include "mon_client.h" +#include "auth.h" #ifdef CONFIG_DEBUG_FS @@ -335,7 +337,7 @@ int ceph_debugfs_client_init(struct ceph_client *client) char name[80]; snprintf(name, sizeof(name), FSID_FORMAT ".client%lld", - PR_FSID(&client->monc.monmap->fsid), client->whoami); + PR_FSID(&client->fsid), client->monc.auth->global_id); client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); if (!client->debugfs_dir) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 8a285158aecc..8d95b0f051e4 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2782,16 +2782,8 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad); ceph_decode_copy(&p, &fsid, sizeof(fsid)); - if (mdsc->client->monc.have_fsid) { - if (ceph_fsid_compare(&fsid, - &mdsc->client->monc.monmap->fsid)) { - pr_err("got mdsmap with wrong fsid\n"); - return; - } - } else { - ceph_fsid_set(&mdsc->client->monc.monmap->fsid, &fsid); - mdsc->client->monc.have_fsid = true; - } + if (ceph_check_fsid(mdsc->client, &fsid) < 0) + return; epoch = ceph_decode_32(&p); maplen = ceph_decode_32(&p); dout("handle_map epoch %u len %d\n", epoch, (int)maplen); diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c index 017d5aef8834..b742b3b3e0f3 100644 --- a/fs/ceph/mon_client.c +++ b/fs/ceph/mon_client.c @@ -320,89 +320,12 @@ int ceph_monc_open_session(struct ceph_mon_client *monc) return 0; } -#if 0 - -/* - * The monitor responds with mount ack indicate mount success. The - * included client ticket allows the client to talk to MDSs and OSDs. - */ -static void handle_mount_ack(struct ceph_mon_client *monc, struct ceph_msg *msg) -{ - struct ceph_client *client = monc->client; - struct ceph_monmap *monmap = NULL, *old = monc->monmap; - void *p, *end; - s32 result; - u32 len; - s64 cnum; - int err = -EINVAL; - - if (client->whoami >= 0) { - dout("handle_mount_ack - already mounted\n"); - return; - } - - mutex_lock(&monc->mutex); - - dout("handle_mount_ack\n"); - p = msg->front.iov_base; - end = p + msg->front.iov_len; - - ceph_decode_64_safe(&p, end, cnum, bad); - ceph_decode_32_safe(&p, end, result, bad); - ceph_decode_32_safe(&p, end, len, bad); - if (result) { - pr_err("mount denied: %.*s (%d)\n", len, (char *)p, - result); - err = result; - goto out; - } - p += len; - - ceph_decode_32_safe(&p, end, len, bad); - ceph_decode_need(&p, end, len, bad); - monmap = ceph_monmap_decode(p, p + len); - if (IS_ERR(monmap)) { - pr_err("problem decoding monmap, %d\n", - (int)PTR_ERR(monmap)); - err = -EINVAL; - goto out; - } - p += len; - - client->monc.monmap = monmap; - kfree(old); - - client->signed_ticket = NULL; - client->signed_ticket_len = 0; - - monc->want_mount = false; - - client->whoami = cnum; - client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; - client->msgr->inst.name.num = cpu_to_le64(cnum); - pr_info("client%lld fsid " FSID_FORMAT "\n", - client->whoami, PR_FSID(&client->monc.monmap->fsid)); - - ceph_debugfs_client_init(client); - __send_subscribe(monc); - - err = 0; - goto out; - -bad: - pr_err("error decoding mount_ack message\n"); -out: - client->mount_err = err; - mutex_unlock(&monc->mutex); - wake_up(&client->mount_wq); -} -#endif - /* * The monitor responds with mount ack indicate mount success. The * included client ticket allows the client to talk to MDSs and OSDs. */ -static void ceph_monc_handle_map(struct ceph_mon_client *monc, struct ceph_msg *msg) +static void ceph_monc_handle_map(struct ceph_mon_client *monc, + struct ceph_msg *msg) { struct ceph_client *client = monc->client; struct ceph_monmap *monmap = NULL, *old = monc->monmap; @@ -420,42 +343,19 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, struct ceph_msg * (int)PTR_ERR(monmap)); return; } - if (monc->have_fsid && - ceph_fsid_compare(&monmap->fsid, &monc->monmap->fsid)) { - print_hex_dump(KERN_ERR, "monmap->fsid: ", DUMP_PREFIX_NONE, 16, 1, - (void *)&monmap->fsid, 16, 0); - print_hex_dump(KERN_ERR, "monc->monmap->fsid: ", DUMP_PREFIX_NONE, 16, 1, - (void *)&monc->monmap->fsid, 16, 0); - - pr_err("fsid mismatch, got a previous map with different fsid"); + + if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) { kfree(monmap); return; } client->monc.monmap = monmap; - client->monc.have_fsid = true; kfree(old); mutex_unlock(&monc->mutex); wake_up(&client->mount_wq); } - -/* - * init client info after authentication - */ -static void __init_authenticated_client(struct ceph_mon_client *monc) -{ - struct ceph_client *client = monc->client; - - client->signed_ticket = NULL; - client->signed_ticket_len = 0; - client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; - client->msgr->inst.name.num = monc->auth->global_id; - - ceph_debugfs_client_init(client); -} - /* * statfs */ @@ -754,7 +654,10 @@ static void handle_auth_reply(struct ceph_mon_client *monc, ceph_con_send(monc->con, monc->m_auth); } else if (monc->auth->ops->is_authenticated(monc->auth)) { dout("authenticated, starting session\n"); - __init_authenticated_client(monc); + + monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; + monc->client->msgr->inst.name.num = monc->auth->global_id; + __send_subscribe(monc); __resend_statfs(monc); } diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index ca0ee68c322a..d63f192999ee 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -882,16 +882,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) /* verify fsid */ ceph_decode_need(&p, end, sizeof(fsid), bad); ceph_decode_copy(&p, &fsid, sizeof(fsid)); - if (osdc->client->monc.have_fsid) { - if (ceph_fsid_compare(&fsid, - &osdc->client->monc.monmap->fsid)) { - pr_err("got osdmap with wrong fsid, ignoring\n"); - return; - } - } else { - ceph_fsid_set(&osdc->client->monc.monmap->fsid, &fsid); - osdc->client->monc.have_fsid = true; - } + if (ceph_check_fsid(osdc->client, &fsid) < 0) + return; down_write(&osdc->map_sem); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index df05617aca86..3df6d4ab236c 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -19,6 +19,7 @@ #include "decode.h" #include "super.h" #include "mon_client.h" +#include "auth.h" /* * Ceph superblock operations @@ -510,14 +511,11 @@ static struct ceph_client *ceph_create_client(struct ceph_mount_args *args) client->sb = NULL; client->mount_state = CEPH_MOUNT_MOUNTING; - client->whoami = -1; client->mount_args = args; client->msgr = NULL; client->mount_err = 0; - client->signed_ticket = NULL; - client->signed_ticket_len = 0; err = bdi_init(&client->backing_dev_info); if (err < 0) @@ -582,8 +580,6 @@ static void ceph_destroy_client(struct ceph_client *client) ceph_monc_stop(&client->monc); ceph_osdc_stop(&client->osdc); - kfree(client->signed_ticket); - ceph_debugfs_client_cleanup(client); destroy_workqueue(client->wb_wq); destroy_workqueue(client->pg_inv_wq); @@ -599,6 +595,32 @@ static void ceph_destroy_client(struct ceph_client *client) dout("destroy_client %p done\n", client); } +/* + * Initially learn our fsid, or verify an fsid matches. + */ +int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) +{ + if (client->have_fsid) { + if (ceph_fsid_compare(&client->fsid, fsid)) { + print_hex_dump(KERN_ERR, "this fsid: ", + DUMP_PREFIX_NONE, 16, 1, + (void *)fsid, 16, 0); + print_hex_dump(KERN_ERR, " old fsid: ", + DUMP_PREFIX_NONE, 16, 1, + (void *)&client->fsid, 16, 0); + pr_err("fsid mismatch\n"); + return -1; + } + } else { + pr_info("client%lld fsid " FSID_FORMAT "\n", + client->monc.auth->global_id, PR_FSID(fsid)); + memcpy(&client->fsid, fsid, sizeof(*fsid)); + ceph_debugfs_client_init(client); + client->have_fsid = true; + } + return 0; +} + /* * true if we have the mon map (and have thus joined the cluster) */ diff --git a/fs/ceph/super.h b/fs/ceph/super.h index e0e8130959b6..de5e32414978 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -113,16 +113,11 @@ static inline unsigned long time_sub(unsigned long a, unsigned long b) * mounting the same ceph filesystem/cluster. */ struct ceph_client { - __s64 whoami; /* my client number */ -#ifdef CONFIG_DEBUG_FS - struct dentry *debugfs_monmap; - struct dentry *debugfs_mdsmap, *debugfs_osdmap; - struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps; -#endif + struct ceph_fsid fsid; + bool have_fsid; struct mutex mount_mutex; /* serialize mount attempts */ struct ceph_mount_args *mount_args; - struct ceph_fsid fsid; struct super_block *sb; @@ -130,8 +125,6 @@ struct ceph_client { wait_queue_head_t mount_wq; int mount_err; - void *signed_ticket; /* our keys to the kingdom */ - int signed_ticket_len; struct ceph_messenger *msgr; /* messenger instance */ struct ceph_mon_client monc; @@ -145,6 +138,12 @@ struct ceph_client { struct workqueue_struct *trunc_wq; struct backing_dev_info backing_dev_info; + +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_monmap; + struct dentry *debugfs_mdsmap, *debugfs_osdmap; + struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps; +#endif }; static inline struct ceph_client *ceph_client(struct super_block *sb) @@ -735,6 +734,7 @@ extern struct kmem_cache *ceph_dentry_cachep; extern struct kmem_cache *ceph_file_cachep; extern const char *ceph_msg_type_name(int type); +extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); #define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \ "%02x%02x%02x%02x%02x%02x" -- cgit v1.2.3 From 06edf046dd68ccbc7cf5f70f957a31702d0e7596 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 15 Dec 2009 14:44:32 -0800 Subject: ceph: include link to bdi in debugfs Signed-off-by: Sage Weil --- fs/ceph/debugfs.c | 7 +++++++ fs/ceph/super.h | 1 + 2 files changed, 8 insertions(+) (limited to 'fs/ceph/debugfs.c') diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index b90fc3e1ff70..441484ab7e94 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -1,5 +1,6 @@ #include "ceph_debug.h" +#include #include #include #include @@ -24,6 +25,7 @@ * .../monc - mon client state * .../dentry_lru - dump contents of dentry lru * .../caps - expose cap (reservation) stats + * .../bdi - symlink to ../../bdi/something */ static struct dentry *ceph_debugfs_dir; @@ -407,6 +409,10 @@ int ceph_debugfs_client_init(struct ceph_client *client) if (!client->debugfs_caps) goto out; + sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev)); + client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir, + name); + return 0; out: @@ -416,6 +422,7 @@ out: void ceph_debugfs_client_cleanup(struct ceph_client *client) { + debugfs_remove(client->debugfs_bdi); debugfs_remove(client->debugfs_caps); debugfs_remove(client->debugfs_dentry_lru); debugfs_remove(client->debugfs_osdmap); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index de5e32414978..2304bd2844a4 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -143,6 +143,7 @@ struct ceph_client { struct dentry *debugfs_monmap; struct dentry *debugfs_mdsmap, *debugfs_osdmap; struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps; + struct dentry *debugfs_bdi; #endif }; -- cgit v1.2.3 From 2baba25019ec564cd247af74013873d69a0b8190 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Fri, 18 Dec 2009 13:51:57 -0800 Subject: ceph: writeback congestion control Set bdi congestion bit when amount of write data in flight exceeds adjustable threshold. Signed-off-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/addr.c | 35 +++++++++++++++++++++++++++++++++-- fs/ceph/debugfs.c | 33 +++++++++++++++++++++++++++++++++ fs/ceph/super.c | 36 ++++++++++++++++++++++++++++++++++++ fs/ceph/super.h | 3 +++ 4 files changed, 105 insertions(+), 2 deletions(-) (limited to 'fs/ceph/debugfs.c') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index d0cdceb0b90b..a6850a14038e 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -47,6 +47,12 @@ * accounting is preserved. */ +#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10)) +#define CONGESTION_OFF_THRESH(congestion_kb) \ + (CONGESTION_ON_THRESH(congestion_kb) - \ + (CONGESTION_ON_THRESH(congestion_kb) >> 2)) + + /* * Dirty a page. Optimistically adjust accounting, on the assumption @@ -377,6 +383,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) { struct inode *inode; struct ceph_inode_info *ci; + struct ceph_client *client; struct ceph_osd_client *osdc; loff_t page_off = page->index << PAGE_CACHE_SHIFT; int len = PAGE_CACHE_SIZE; @@ -384,6 +391,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) int err = 0; struct ceph_snap_context *snapc; u64 snap_size = 0; + long writeback_stat; dout("writepage %p idx %lu\n", page, page->index); @@ -393,7 +401,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) } inode = page->mapping->host; ci = ceph_inode(inode); - osdc = &ceph_inode_to_client(inode)->osdc; + client = ceph_inode_to_client(inode); + osdc = &client->osdc; /* verify this is a writeable snap context */ snapc = (void *)page->private; @@ -420,6 +429,11 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) dout("writepage %p page %p index %lu on %llu~%u\n", inode, page, page->index, page_off, len); + writeback_stat = atomic_long_inc_return(&client->writeback_count); + if (writeback_stat > + CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) + set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); + set_page_writeback(page); err = ceph_osdc_writepages(osdc, ceph_vino(inode), &ci->i_layout, snapc, @@ -499,6 +513,8 @@ static void writepages_finish(struct ceph_osd_request *req, struct writeback_control *wbc = req->r_wbc; __s32 rc = -EIO; u64 bytes = 0; + struct ceph_client *client = ceph_inode_to_client(inode); + long writeback_stat; /* parse reply */ replyhead = msg->front.iov_base; @@ -524,6 +540,13 @@ static void writepages_finish(struct ceph_osd_request *req, BUG_ON(!page); WARN_ON(!PageUptodate(page)); + writeback_stat = + atomic_long_dec_return(&client->writeback_count); + if (writeback_stat < + CONGESTION_OFF_THRESH(client->mount_args->congestion_kb)) + clear_bdi_congested(&client->backing_dev_info, + BLK_RW_ASYNC); + if (i >= wrote) { dout("inode %p skipping page %p\n", inode, page); wbc->pages_skipped++; @@ -666,6 +689,7 @@ retry: u64 offset, len; struct ceph_osd_request_head *reqhead; struct ceph_osd_op *op; + long writeback_stat; next = 0; locked_pages = 0; @@ -773,6 +797,12 @@ get_more_pages: first = i; dout("%p will write page %p idx %lu\n", inode, page, page->index); + + writeback_stat = atomic_long_inc_return(&client->writeback_count); + if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) { + set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); + } + set_page_writeback(page); req->r_pages[locked_pages] = page; locked_pages++; @@ -998,7 +1028,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = file->f_dentry->d_inode; - struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; + struct ceph_client *client = ceph_inode_to_client(inode); + struct ceph_mds_client *mdsc = &client->mdsc; unsigned from = pos & (PAGE_CACHE_SIZE - 1); int check_cap = 0; diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 441484ab7e94..22d3b47fb1be 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -320,6 +320,30 @@ DEFINE_SHOW_FUNC(osdc_show) DEFINE_SHOW_FUNC(dentry_lru_show) DEFINE_SHOW_FUNC(caps_show) +static int congestion_kb_set(void *data, u64 val) +{ + struct ceph_client *client = (struct ceph_client *)data; + + if (client) + client->mount_args->congestion_kb = (int)val; + + return 0; +} + +static int congestion_kb_get(void *data, u64 *val) +{ + struct ceph_client *client = (struct ceph_client *)data; + + if (client) + *val = (u64)client->mount_args->congestion_kb; + + return 0; +} + + +DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get, + congestion_kb_set, "%llu\n"); + int __init ceph_debugfs_init(void) { ceph_debugfs_dir = debugfs_create_dir("ceph", NULL); @@ -409,6 +433,14 @@ int ceph_debugfs_client_init(struct ceph_client *client) if (!client->debugfs_caps) goto out; + client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb", + 0600, + client->debugfs_dir, + client, + &congestion_kb_fops); + if (!client->debugfs_congestion_kb) + goto out; + sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev)); client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir, name); @@ -431,6 +463,7 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client) debugfs_remove(client->osdc.debugfs_file); debugfs_remove(client->mdsc.debugfs_file); debugfs_remove(client->monc.debugfs_file); + debugfs_remove(client->debugfs_congestion_kb); debugfs_remove(client->debugfs_dir); } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 6d02a166f8ff..b9cb8cebcdc1 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -150,6 +150,35 @@ static void ceph_inode_init_once(void *foo) inode_init_once(&ci->vfs_inode); } +static int default_congestion_kb(void) +{ + int congestion_kb; + + /* + * Copied from NFS + * + * congestion size, scale with available memory. + * + * 64MB: 8192k + * 128MB: 11585k + * 256MB: 16384k + * 512MB: 23170k + * 1GB: 32768k + * 2GB: 46340k + * 4GB: 65536k + * 8GB: 92681k + * 16GB: 131072k + * + * This allows larger machines to have larger/more transfers. + * Limit the default to 256M + */ + congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); + if (congestion_kb > 256*1024) + congestion_kb = 256*1024; + + return congestion_kb; +} + static int __init init_caches(void) { ceph_inode_cachep = kmem_cache_create("ceph_inode_info", @@ -267,6 +296,7 @@ enum { Opt_caps_wanted_delay_min, Opt_caps_wanted_delay_max, Opt_readdir_max_entries, + Opt_congestion_kb, Opt_last_int, /* int args above */ Opt_snapdirname, @@ -295,6 +325,7 @@ static match_table_t arg_tokens = { {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, {Opt_readdir_max_entries, "readdir_max_entries=%d"}, + {Opt_congestion_kb, "write_congestion_kb=%d"}, /* int args above */ {Opt_snapdirname, "snapdirname=%s"}, {Opt_name, "name=%s"}, @@ -342,6 +373,7 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options, args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4; args->max_readdir = 1024; + args->congestion_kb = default_congestion_kb(); /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ err = -EINVAL; @@ -445,6 +477,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options, case Opt_readdir_max_entries: args->max_readdir = intval; break; + case Opt_congestion_kb: + args->congestion_kb = intval; + break; case Opt_noshare: args->flags |= CEPH_OPT_NOSHARE; @@ -516,6 +551,7 @@ static struct ceph_client *ceph_create_client(struct ceph_mount_args *args) client->msgr = NULL; client->mount_err = 0; + atomic_long_set(&client->writeback_count, 0); err = bdi_init(&client->backing_dev_info); if (err < 0) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 2304bd2844a4..62d9ae482d72 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -59,6 +59,7 @@ struct ceph_mount_args { int wsize; int rsize; /* max readahead */ int max_readdir; /* max readdir size */ + int congestion_kb; /* max readdir size */ int osd_timeout; char *snapdir_name; /* default ".snap" */ char *name; @@ -136,6 +137,7 @@ struct ceph_client { struct workqueue_struct *wb_wq; struct workqueue_struct *pg_inv_wq; struct workqueue_struct *trunc_wq; + atomic_long_t writeback_count; struct backing_dev_info backing_dev_info; @@ -143,6 +145,7 @@ struct ceph_client { struct dentry *debugfs_monmap; struct dentry *debugfs_mdsmap, *debugfs_osdmap; struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps; + struct dentry *debugfs_congestion_kb; struct dentry *debugfs_bdi; #endif }; -- cgit v1.2.3 From 7740a42f816790583bd8a9079337772d511af3a3 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 8 Jan 2010 15:58:25 -0800 Subject: ceph: display pgid in debugfs osd request dump Signed-off-by: Sage Weil --- fs/ceph/debugfs.c | 6 ++++-- fs/ceph/osd_client.c | 2 ++ fs/ceph/osd_client.h | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'fs/ceph/debugfs.c') diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 22d3b47fb1be..fba44b2a6086 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -231,8 +231,10 @@ static int osdc_show(struct seq_file *s, void *pp) req = rb_entry(p, struct ceph_osd_request, r_node); - seq_printf(s, "%lld\tosd%d\t", req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1); + seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid, + req->r_osd ? req->r_osd->o_osd : -1, + le32_to_cpu(req->r_pgid.pool), + le16_to_cpu(req->r_pgid.ps)); head = req->r_request->front.iov_base; op = (void *)(head + 1); diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index 80b868f7a0fc..8417e21a3cb2 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -538,6 +538,8 @@ static int __map_osds(struct ceph_osd_client *osdc, if (err) return err; pgid = reqhead->layout.ol_pgid; + req->r_pgid = pgid; + o = ceph_calc_pg_primary(osdc->osdmap, pgid); if ((req->r_osd && req->r_osd->o_osd == o && diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h index 8fef71cc4457..4162c6810a8f 100644 --- a/fs/ceph/osd_client.h +++ b/fs/ceph/osd_client.h @@ -42,6 +42,7 @@ struct ceph_osd_request { struct rb_node r_node; struct list_head r_osd_item; struct ceph_osd *r_osd; + struct ceph_pg r_pgid; struct ceph_connection *r_con_filling_pages; -- cgit v1.2.3 From 44ca18f2682eb1cfbed153849adedb79e3e19790 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 15 Feb 2010 12:08:46 -0800 Subject: ceph: use rbtree for mds requests The rbtree is a more appropriate data structure than a radix_tree. It avoids extra memory usage and simplifies the code. It also fixes a bug where the debugfs 'mdsc' file wasn't including the most recent mds request. Signed-off-by: Sage Weil --- fs/ceph/debugfs.c | 13 ++--- fs/ceph/mds_client.c | 149 +++++++++++++++++++++++++++++++-------------------- fs/ceph/mds_client.h | 4 +- 3 files changed, 97 insertions(+), 69 deletions(-) (limited to 'fs/ceph/debugfs.c') diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index fba44b2a6086..cd5dd805e4be 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -142,21 +142,16 @@ static int monc_show(struct seq_file *s, void *p) static int mdsc_show(struct seq_file *s, void *p) { struct ceph_client *client = s->private; - struct ceph_mds_request *req; - u64 nexttid = 0; - int got; struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_mds_request *req; + struct rb_node *rp; int pathlen; u64 pathbase; char *path; mutex_lock(&mdsc->mutex); - while (nexttid < mdsc->last_tid) { - got = radix_tree_gang_lookup(&mdsc->request_tree, - (void **)&req, nexttid, 1); - if (got == 0) - break; - nexttid = req->r_tid + 1; + for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) { + req = rb_entry(rp, struct ceph_mds_request, r_node); if (req->r_request) seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index aa8506bad42d..81840d6b68a4 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -255,6 +255,7 @@ static const char *session_state_name(int s) case CEPH_MDS_SESSION_OPEN: return "open"; case CEPH_MDS_SESSION_HUNG: return "hung"; case CEPH_MDS_SESSION_CLOSING: return "closing"; + case CEPH_MDS_SESSION_RESTARTING: return "restarting"; case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting"; default: return "???"; } @@ -448,10 +449,42 @@ static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc, u64 tid) { struct ceph_mds_request *req; - req = radix_tree_lookup(&mdsc->request_tree, tid); - if (req) - ceph_mdsc_get_request(req); - return req; + struct rb_node *n = mdsc->request_tree.rb_node; + + while (n) { + req = rb_entry(n, struct ceph_mds_request, r_node); + if (tid < req->r_tid) + n = n->rb_left; + else if (tid > req->r_tid) + n = n->rb_right; + else { + ceph_mdsc_get_request(req); + return req; + } + } + return NULL; +} + +static void __insert_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *new) +{ + struct rb_node **p = &mdsc->request_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_mds_request *req = NULL; + + while (*p) { + parent = *p; + req = rb_entry(parent, struct ceph_mds_request, r_node); + if (new->r_tid < req->r_tid) + p = &(*p)->rb_left; + else if (new->r_tid > req->r_tid) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->r_node, parent, p); + rb_insert_color(&new->r_node, &mdsc->request_tree); } /* @@ -469,7 +502,7 @@ static void __register_request(struct ceph_mds_client *mdsc, ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps); dout("__register_request %p tid %lld\n", req, req->r_tid); ceph_mdsc_get_request(req); - radix_tree_insert(&mdsc->request_tree, req->r_tid, (void *)req); + __insert_request(mdsc, req); if (dir) { struct ceph_inode_info *ci = ceph_inode(dir); @@ -485,7 +518,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { dout("__unregister_request %p tid %lld\n", req, req->r_tid); - radix_tree_delete(&mdsc->request_tree, req->r_tid); + rb_erase(&req->r_node, &mdsc->request_tree); ceph_mdsc_put_request(req); if (req->r_unsafe_dir) { @@ -1115,17 +1148,25 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) } /* - * return oldest (lowest) tid in request tree, 0 if none. + * return oldest (lowest) request, tid in request tree, 0 if none. * * called under mdsc->mutex. */ +static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc) +{ + if (RB_EMPTY_ROOT(&mdsc->request_tree)) + return NULL; + return rb_entry(rb_first(&mdsc->request_tree), + struct ceph_mds_request, r_node); +} + static u64 __get_oldest_tid(struct ceph_mds_client *mdsc) { - struct ceph_mds_request *first; - if (radix_tree_gang_lookup(&mdsc->request_tree, - (void **)&first, 0, 1) <= 0) - return 0; - return first->r_tid; + struct ceph_mds_request *req = __get_oldest_req(mdsc); + + if (req) + return req->r_tid; + return 0; } /* @@ -1540,26 +1581,19 @@ static void __wake_requests(struct ceph_mds_client *mdsc, */ static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all) { - struct ceph_mds_request *reqs[10]; - u64 nexttid = 0; - int i, got; + struct ceph_mds_request *req; + struct rb_node *p; dout("kick_requests mds%d\n", mds); - while (nexttid <= mdsc->last_tid) { - got = radix_tree_gang_lookup(&mdsc->request_tree, - (void **)&reqs, nexttid, 10); - if (got == 0) - break; - nexttid = reqs[got-1]->r_tid + 1; - for (i = 0; i < got; i++) { - if (reqs[i]->r_got_unsafe) - continue; - if (reqs[i]->r_session && - reqs[i]->r_session->s_mds == mds) { - dout(" kicking tid %llu\n", reqs[i]->r_tid); - put_request_session(reqs[i]); - __do_request(mdsc, reqs[i]); - } + for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) { + req = rb_entry(p, struct ceph_mds_request, r_node); + if (req->r_got_unsafe) + continue; + if (req->r_session && + req->r_session->s_mds == mds) { + dout(" kicking tid %llu\n", req->r_tid); + put_request_session(req); + __do_request(mdsc, req); } } } @@ -1748,7 +1782,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) list_del_init(&req->r_unsafe_item); /* last unsafe request during umount? */ - if (mdsc->stopping && !__get_oldest_tid(mdsc)) + if (mdsc->stopping && !__get_oldest_req(mdsc)) complete(&mdsc->safe_umount_waiters); mutex_unlock(&mdsc->mutex); goto out; @@ -2573,7 +2607,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) INIT_LIST_HEAD(&mdsc->snap_empty); spin_lock_init(&mdsc->snap_empty_lock); mdsc->last_tid = 0; - INIT_RADIX_TREE(&mdsc->request_tree, GFP_NOFS); + mdsc->request_tree = RB_ROOT; INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); mdsc->last_renew_caps = jiffies; INIT_LIST_HEAD(&mdsc->cap_delay_list); @@ -2600,20 +2634,19 @@ static void wait_requests(struct ceph_mds_client *mdsc) struct ceph_client *client = mdsc->client; mutex_lock(&mdsc->mutex); - if (__get_oldest_tid(mdsc)) { + if (__get_oldest_req(mdsc)) { mutex_unlock(&mdsc->mutex); + dout("wait_requests waiting for requests\n"); wait_for_completion_timeout(&mdsc->safe_umount_waiters, client->mount_args->mount_timeout * HZ); - mutex_lock(&mdsc->mutex); /* tear down remaining requests */ - while (radix_tree_gang_lookup(&mdsc->request_tree, - (void **)&req, 0, 1)) { + mutex_lock(&mdsc->mutex); + while ((req = __get_oldest_req(mdsc))) { dout("wait_requests timed out on tid %llu\n", req->r_tid); - radix_tree_delete(&mdsc->request_tree, req->r_tid); - ceph_mdsc_put_request(req); + __unregister_request(mdsc, req); } } mutex_unlock(&mdsc->mutex); @@ -2639,31 +2672,29 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) */ static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid) { - struct ceph_mds_request *req; - u64 next_tid = 0; - int got; + struct ceph_mds_request *req = NULL; + struct rb_node *n; mutex_lock(&mdsc->mutex); dout("wait_unsafe_requests want %lld\n", want_tid); - while (1) { - got = radix_tree_gang_lookup(&mdsc->request_tree, (void **)&req, - next_tid, 1); - if (!got) - break; - if (req->r_tid > want_tid) + req = __get_oldest_req(mdsc); + while (req && req->r_tid <= want_tid) { + if ((req->r_op & CEPH_MDS_OP_WRITE)) { + /* write op */ + ceph_mdsc_get_request(req); + mutex_unlock(&mdsc->mutex); + dout("wait_unsafe_requests wait on %llu (want %llu)\n", + req->r_tid, want_tid); + wait_for_completion(&req->r_safe_completion); + mutex_lock(&mdsc->mutex); + n = rb_next(&req->r_node); + ceph_mdsc_put_request(req); + } else { + n = rb_next(&req->r_node); + } + if (!n) break; - - next_tid = req->r_tid + 1; - if ((req->r_op & CEPH_MDS_OP_WRITE) == 0) - continue; /* not a write op */ - - ceph_mdsc_get_request(req); - mutex_unlock(&mdsc->mutex); - dout("wait_unsafe_requests wait on %llu (want %llu)\n", - req->r_tid, want_tid); - wait_for_completion(&req->r_safe_completion); - mutex_lock(&mdsc->mutex); - ceph_mdsc_put_request(req); + req = rb_entry(n, struct ceph_mds_request, r_node); } mutex_unlock(&mdsc->mutex); dout("wait_unsafe_requests done\n"); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index ee71495e27c4..98f09cd06006 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "types.h" @@ -150,6 +151,7 @@ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc, */ struct ceph_mds_request { u64 r_tid; /* transaction id */ + struct rb_node r_node; int r_op; /* mds op code */ int r_mds; @@ -249,7 +251,7 @@ struct ceph_mds_client { spinlock_t snap_empty_lock; /* protect snap_empty */ u64 last_tid; /* most recent mds request */ - struct radix_tree_root request_tree; /* pending mds requests */ + struct rb_root request_tree; /* pending mds requests */ struct delayed_work delayed_work; /* delayed work */ unsigned long last_renew_caps; /* last time we renewed our caps */ struct list_head cap_delay_list; /* caps with delayed release */ -- cgit v1.2.3 From 85ff03f6bfef7d5b59ab3aefd4773f497ffad8a4 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 15 Feb 2010 14:47:28 -0800 Subject: ceph: use rbtree for mon statfs requests An rbtree is lighter weight, particularly given we will generally have very few in-flight statfs requests. Signed-off-by: Sage Weil --- fs/ceph/debugfs.c | 14 ++++------- fs/ceph/mon_client.c | 67 ++++++++++++++++++++++++++++++++++++---------------- fs/ceph/mon_client.h | 5 ++-- 3 files changed, 54 insertions(+), 32 deletions(-) (limited to 'fs/ceph/debugfs.c') diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index cd5dd805e4be..b58bd9188692 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -112,9 +112,8 @@ static int monc_show(struct seq_file *s, void *p) { struct ceph_client *client = s->private; struct ceph_mon_statfs_request *req; - u64 nexttid = 0; - int got; struct ceph_mon_client *monc = &client->monc; + struct rb_node *rp; mutex_lock(&monc->mutex); @@ -125,17 +124,12 @@ static int monc_show(struct seq_file *s, void *p) if (monc->want_next_osdmap) seq_printf(s, "want next osdmap\n"); - while (nexttid < monc->last_tid) { - got = radix_tree_gang_lookup(&monc->statfs_request_tree, - (void **)&req, nexttid, 1); - if (got == 0) - break; - nexttid = req->tid + 1; - + for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) { + req = rb_entry(rp, struct ceph_mon_statfs_request, node); seq_printf(s, "%lld statfs\n", req->tid); } - mutex_unlock(&monc->mutex); + mutex_unlock(&monc->mutex); return 0; } diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c index fec41a0eff86..542276e60798 100644 --- a/fs/ceph/mon_client.c +++ b/fs/ceph/mon_client.c @@ -343,6 +343,46 @@ out: /* * statfs */ +static struct ceph_mon_statfs_request *__lookup_statfs( + struct ceph_mon_client *monc, u64 tid) +{ + struct ceph_mon_statfs_request *req; + struct rb_node *n = monc->statfs_request_tree.rb_node; + + while (n) { + req = rb_entry(n, struct ceph_mon_statfs_request, node); + if (tid < req->tid) + n = n->rb_left; + else if (tid > req->tid) + n = n->rb_right; + else + return req; + } + return NULL; +} + +static void __insert_statfs(struct ceph_mon_client *monc, + struct ceph_mon_statfs_request *new) +{ + struct rb_node **p = &monc->statfs_request_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_mon_statfs_request *req = NULL; + + while (*p) { + parent = *p; + req = rb_entry(parent, struct ceph_mon_statfs_request, node); + if (new->tid < req->tid) + p = &(*p)->rb_left; + else if (new->tid > req->tid) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, &monc->statfs_request_tree); +} + static void handle_statfs_reply(struct ceph_mon_client *monc, struct ceph_msg *msg) { @@ -356,7 +396,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc, dout("handle_statfs_reply %p tid %llu\n", msg, tid); mutex_lock(&monc->mutex); - req = radix_tree_lookup(&monc->statfs_request_tree, tid); + req = __lookup_statfs(monc, tid); if (req) { *req->buf = reply->st; req->result = 0; @@ -416,11 +456,7 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) req.tid = ++monc->last_tid; req.last_attempt = jiffies; req.delay = BASE_DELAY_INTERVAL; - if (radix_tree_insert(&monc->statfs_request_tree, req.tid, &req) < 0) { - mutex_unlock(&monc->mutex); - pr_err("ENOMEM in do_statfs\n"); - return -ENOMEM; - } + __insert_statfs(monc, &req); monc->num_statfs_requests++; mutex_unlock(&monc->mutex); @@ -430,7 +466,7 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) err = wait_for_completion_interruptible(&req.completion); mutex_lock(&monc->mutex); - radix_tree_delete(&monc->statfs_request_tree, req.tid); + rb_erase(&req.node, &monc->statfs_request_tree); monc->num_statfs_requests--; ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1); mutex_unlock(&monc->mutex); @@ -445,20 +481,11 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) */ static void __resend_statfs(struct ceph_mon_client *monc) { - u64 next_tid = 0; - int got; - int did = 0; struct ceph_mon_statfs_request *req; + struct rb_node *p; - while (1) { - got = radix_tree_gang_lookup(&monc->statfs_request_tree, - (void **)&req, - next_tid, 1); - if (got == 0) - break; - did++; - next_tid = req->tid + 1; - + for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) { + req = rb_entry(p, struct ceph_mon_statfs_request, node); send_statfs(monc, req); } } @@ -578,7 +605,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) monc->sub_sent = 0; INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); - INIT_RADIX_TREE(&monc->statfs_request_tree, GFP_NOFS); + monc->statfs_request_tree = RB_ROOT; monc->num_statfs_requests = 0; monc->last_tid = 0; diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h index 5ca8e48d4379..b958ad5afa06 100644 --- a/fs/ceph/mon_client.h +++ b/fs/ceph/mon_client.h @@ -2,7 +2,7 @@ #define _FS_CEPH_MON_CLIENT_H #include -#include +#include #include "messenger.h" #include "msgpool.h" @@ -45,6 +45,7 @@ struct ceph_mon_request { */ struct ceph_mon_statfs_request { u64 tid; + struct rb_node node; int result; struct ceph_statfs *buf; struct completion completion; @@ -75,7 +76,7 @@ struct ceph_mon_client { struct ceph_msgpool msgpool_auth_reply; /* pending statfs requests */ - struct radix_tree_root statfs_request_tree; + struct rb_root statfs_request_tree; int num_statfs_requests; u64 last_tid; -- cgit v1.2.3 From 85ccce43a3fc15a40ded6ae1603e3f68a17f4d24 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 17 Feb 2010 10:02:43 -0800 Subject: ceph: clean up readdir caps reservation Use a global counter for the minimum number of allocated caps instead of hard coding a check against readdir_max. This takes into account multiple client instances, and avoids examining the superblock mount options when a cap is dropped. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 23 +++++++++++++++++------ fs/ceph/debugfs.c | 13 +++++++------ fs/ceph/super.c | 5 +++++ fs/ceph/super.h | 5 ++++- 4 files changed, 33 insertions(+), 13 deletions(-) (limited to 'fs/ceph/debugfs.c') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index ab9b571dda11..f94b56faba3b 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -128,6 +128,7 @@ static int caps_total_count; /* total caps allocated */ static int caps_use_count; /* in use */ static int caps_reserve_count; /* unused, reserved */ static int caps_avail_count; /* unused, unreserved */ +static int caps_min_count; /* keep at least this many (unreserved) */ void __init ceph_caps_init(void) { @@ -149,6 +150,15 @@ void ceph_caps_finalize(void) caps_avail_count = 0; caps_use_count = 0; caps_reserve_count = 0; + caps_min_count = 0; + spin_unlock(&caps_list_lock); +} + +void ceph_adjust_min_caps(int delta) +{ + spin_lock(&caps_list_lock); + caps_min_count += delta; + BUG_ON(caps_min_count < 0); spin_unlock(&caps_list_lock); } @@ -265,12 +275,10 @@ static void put_cap(struct ceph_cap *cap, caps_reserve_count, caps_avail_count); caps_use_count--; /* - * Keep some preallocated caps around, at least enough to do a - * readdir (which needs to preallocate lots of them), to avoid - * lots of free/alloc churn. + * Keep some preallocated caps around (ceph_min_count), to + * avoid lots of free/alloc churn. */ - if (caps_avail_count >= caps_reserve_count + - ceph_client(cap->ci->vfs_inode.i_sb)->mount_args->max_readdir) { + if (caps_avail_count >= caps_reserve_count + caps_min_count) { caps_total_count--; kmem_cache_free(ceph_cap_cachep, cap); } else { @@ -289,7 +297,8 @@ static void put_cap(struct ceph_cap *cap, } void ceph_reservation_status(struct ceph_client *client, - int *total, int *avail, int *used, int *reserved) + int *total, int *avail, int *used, int *reserved, + int *min) { if (total) *total = caps_total_count; @@ -299,6 +308,8 @@ void ceph_reservation_status(struct ceph_client *client, *used = caps_use_count; if (reserved) *reserved = caps_reserve_count; + if (min) + *min = caps_min_count; } /* diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index b58bd9188692..1a47b5c25b5f 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -255,14 +255,15 @@ static int osdc_show(struct seq_file *s, void *pp) static int caps_show(struct seq_file *s, void *p) { struct ceph_client *client = p; - int total, avail, used, reserved; + int total, avail, used, reserved, min; - ceph_reservation_status(client, &total, &avail, &used, &reserved); + ceph_reservation_status(client, &total, &avail, &used, &reserved, &min); seq_printf(s, "total\t\t%d\n" - "avail\t\t%d\n" - "used\t\t%d\n" - "reserved\t%d\n", - total, avail, used, reserved); + "avail\t\t%d\n" + "used\t\t%d\n" + "reserved\t%d\n" + "min\t%d\n", + total, avail, used, reserved, min); return 0; } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 39aaf29a04a0..74953be75f8f 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -578,6 +578,9 @@ static struct ceph_client *ceph_create_client(struct ceph_mount_args *args) if (!client->wb_pagevec_pool) goto fail_trunc_wq; + /* caps */ + client->min_caps = args->max_readdir; + ceph_adjust_min_caps(client->min_caps); /* subsystems */ err = ceph_monc_init(&client->monc, client); @@ -619,6 +622,8 @@ static void ceph_destroy_client(struct ceph_client *client) ceph_monc_stop(&client->monc); ceph_osdc_stop(&client->osdc); + ceph_adjust_min_caps(-client->min_caps); + ceph_debugfs_client_cleanup(client); destroy_workqueue(client->wb_wq); destroy_workqueue(client->pg_inv_wq); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 1f3928785e12..3b5faf9980f8 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -129,6 +129,8 @@ struct ceph_client { int auth_err; + int min_caps; /* min caps i added */ + struct ceph_messenger *msgr; /* messenger instance */ struct ceph_mon_client monc; struct ceph_mds_client mdsc; @@ -557,11 +559,12 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci); extern void ceph_caps_init(void); extern void ceph_caps_finalize(void); +extern void ceph_adjust_min_caps(int delta); extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need); extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx); extern void ceph_reservation_status(struct ceph_client *client, int *total, int *avail, int *used, - int *reserved); + int *reserved, int *min); static inline struct ceph_client *ceph_inode_to_client(struct inode *inode) { -- cgit v1.2.3 From 4fc51be8fa7043ff9a1e34fef0e99214373332ac Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 16 Feb 2010 15:55:03 -0800 Subject: ceph: use rbtree for pg pools; decode new osdmap format Since we can now create and destroy pg pools, the pool ids will be sparse, and an array no longer makes sense for looking up by pool id. Use an rbtree instead. The OSDMap encoding also no longer has a max pool count (previously used to allocate the array). There is a new pool_max, that is the largest pool id we've ever used, although we don't actually need it in the client. Signed-off-by: Sage Weil --- fs/ceph/debugfs.c | 7 +-- fs/ceph/osdmap.c | 136 +++++++++++++++++++++++++++++++++++++----------------- fs/ceph/osdmap.h | 7 +-- fs/ceph/rados.h | 4 +- 4 files changed, 104 insertions(+), 50 deletions(-) (limited to 'fs/ceph/debugfs.c') diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 1a47b5c25b5f..e159f1415110 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -78,6 +78,7 @@ static int osdmap_show(struct seq_file *s, void *p) { int i; struct ceph_client *client = s->private; + struct rb_node *n; if (client->osdc.osdmap == NULL) return 0; @@ -87,11 +88,11 @@ static int osdmap_show(struct seq_file *s, void *p) " NEARFULL" : "", (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ? " FULL" : ""); - for (i = 0; i < client->osdc.osdmap->num_pools; i++) { + for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) { struct ceph_pg_pool_info *pool = - &client->osdc.osdmap->pg_pool[i]; + rb_entry(n, struct ceph_pg_pool_info, node); seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n", - i, pool->v.pg_num, pool->pg_num_mask, + pool->id, pool->v.pg_num, pool->pg_num_mask, pool->v.lpg_num, pool->lpg_num_mask); } for (i = 0; i < client->osdc.osdmap->max_osd; i++) { diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index 443fdcdb19c4..34b5696c84fd 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -328,9 +328,15 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) rb_erase(&pg->node, &map->pg_temp); kfree(pg); } + while (!RB_EMPTY_ROOT(&map->pg_pools)) { + struct ceph_pg_pool_info *pi = + rb_entry(rb_first(&map->pg_pools), + struct ceph_pg_pool_info, node); + rb_erase(&pi->node, &map->pg_pools); + kfree(pi); + } kfree(map->osd_state); kfree(map->osd_weight); - kfree(map->pg_pool); kfree(map->osd_addr); kfree(map); } @@ -432,6 +438,48 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root, return NULL; } +/* + * rbtree of pg pool info + */ +static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct ceph_pg_pool_info *pi = NULL; + + while (*p) { + parent = *p; + pi = rb_entry(parent, struct ceph_pg_pool_info, node); + if (new->id < pi->id) + p = &(*p)->rb_left; + else if (new->id > pi->id) + p = &(*p)->rb_right; + else + return -EEXIST; + } + + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, root); + return 0; +} + +static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id) +{ + struct ceph_pg_pool_info *pi; + struct rb_node *n = root->rb_node; + + while (n) { + pi = rb_entry(n, struct ceph_pg_pool_info, node); + if (id < pi->id) + n = n->rb_left; + else if (id > pi->id) + n = n->rb_right; + else + return pi; + } + return NULL; +} + /* * decode a full map. */ @@ -443,6 +491,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) u8 ev; int err = -EINVAL; void *start = *p; + struct ceph_pg_pool_info *pi; dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p)); @@ -464,32 +513,27 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) ceph_decode_copy(p, &map->created, sizeof(map->created)); ceph_decode_copy(p, &map->modified, sizeof(map->modified)); - map->num_pools = ceph_decode_32(p); - map->pg_pool = kcalloc(map->num_pools, sizeof(*map->pg_pool), - GFP_NOFS); - if (!map->pg_pool) { - err = -ENOMEM; - goto bad; - } ceph_decode_32_safe(p, end, max, bad); while (max--) { - ceph_decode_need(p, end, 4+1+sizeof(map->pg_pool->v), bad); - i = ceph_decode_32(p); - if (i >= map->num_pools) + ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); + pi = kmalloc(sizeof(*pi), GFP_NOFS); + if (!pi) goto bad; + pi->id = ceph_decode_32(p); ev = ceph_decode_8(p); /* encoding version */ if (ev > CEPH_PG_POOL_VERSION) { pr_warning("got unknown v %d > %d of ceph_pg_pool\n", ev, CEPH_PG_POOL_VERSION); goto bad; } - ceph_decode_copy(p, &map->pg_pool[i].v, - sizeof(map->pg_pool->v)); - calc_pg_masks(&map->pg_pool[i]); - p += le32_to_cpu(map->pg_pool[i].v.num_snaps) * sizeof(u64); - p += le32_to_cpu(map->pg_pool[i].v.num_removed_snap_intervals) + ceph_decode_copy(p, &pi->v, sizeof(pi->v)); + __insert_pg_pool(&map->pg_pools, pi); + calc_pg_masks(pi); + p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64); + p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; } + ceph_decode_32_safe(p, end, map->pool_max, bad); ceph_decode_32_safe(p, end, map->flags, bad); @@ -581,7 +625,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, u32 epoch = 0; struct ceph_timespec modified; u32 len, pool; - __s32 new_flags, max; + __s32 new_pool_max, new_flags, max; void *start = *p; int err = -EINVAL; u16 version; @@ -600,6 +644,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, epoch = ceph_decode_32(p); BUG_ON(epoch != map->epoch+1); ceph_decode_copy(p, &modified, sizeof(modified)); + new_pool_max = ceph_decode_32(p); new_flags = ceph_decode_32(p); /* full map? */ @@ -623,6 +668,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, /* new flags? */ if (new_flags >= 0) map->flags = new_flags; + if (new_pool_max >= 0) + map->pool_max = new_pool_max; ceph_decode_need(p, end, 5*sizeof(u32), bad); @@ -647,37 +694,42 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, ceph_decode_32_safe(p, end, len, bad); while (len--) { __u8 ev; + struct ceph_pg_pool_info *pi; ceph_decode_32_safe(p, end, pool, bad); - if (pool >= map->num_pools) { - void *pg_pool = kcalloc(pool + 1, - sizeof(*map->pg_pool), - GFP_NOFS); - if (!pg_pool) { - err = -ENOMEM; - goto bad; - } - memcpy(pg_pool, map->pg_pool, - map->num_pools * sizeof(*map->pg_pool)); - kfree(map->pg_pool); - map->pg_pool = pg_pool; - map->num_pools = pool+1; - } - ceph_decode_need(p, end, 1 + sizeof(map->pg_pool->v), bad); + ceph_decode_need(p, end, 1 + sizeof(pi->v), bad); ev = ceph_decode_8(p); /* encoding version */ if (ev > CEPH_PG_POOL_VERSION) { pr_warning("got unknown v %d > %d of ceph_pg_pool\n", ev, CEPH_PG_POOL_VERSION); goto bad; } - ceph_decode_copy(p, &map->pg_pool[pool].v, - sizeof(map->pg_pool->v)); - calc_pg_masks(&map->pg_pool[pool]); + pi = __lookup_pg_pool(&map->pg_pools, pool); + if (!pi) { + pi = kmalloc(sizeof(*pi), GFP_NOFS); + if (!pi) { + err = -ENOMEM; + goto bad; + } + pi->id = pool; + __insert_pg_pool(&map->pg_pools, pi); + } + ceph_decode_copy(p, &pi->v, sizeof(pi->v)); + calc_pg_masks(pi); } - /* old_pool (ignore) */ + /* old_pool */ ceph_decode_32_safe(p, end, len, bad); - *p += len * sizeof(u32); + while (len--) { + struct ceph_pg_pool_info *pi; + + ceph_decode_32_safe(p, end, pool, bad); + pi = __lookup_pg_pool(&map->pg_pools, pool); + if (pi) { + rb_erase(&pi->node, &map->pg_pools); + kfree(pi); + } + } /* new_up */ err = -EINVAL; @@ -861,10 +913,10 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol, unsigned ps; BUG_ON(!osdmap); - if (poolid >= osdmap->num_pools) - return -EIO; - pool = &osdmap->pg_pool[poolid]; + pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); + if (!pool) + return -EIO; ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid)); if (preferred >= 0) { ps += preferred; @@ -919,9 +971,9 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, preferred >= osdmap->crush->max_devices) preferred = -1; - if (poolid >= osdmap->num_pools) + pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); + if (!pool) return NULL; - pool = &osdmap->pg_pool[poolid]; ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, pool->v.type, pool->v.size); if (ruleno < 0) { diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h index c4af8418aa00..1fb55afb2642 100644 --- a/fs/ceph/osdmap.h +++ b/fs/ceph/osdmap.h @@ -19,6 +19,8 @@ * the change between two successive epochs, or as a fully encoded map. */ struct ceph_pg_pool_info { + struct rb_node node; + int id; struct ceph_pg_pool v; int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask; }; @@ -44,9 +46,8 @@ struct ceph_osdmap { struct ceph_entity_addr *osd_addr; struct rb_root pg_temp; - - u32 num_pools; - struct ceph_pg_pool_info *pg_pool; + struct rb_root pg_pools; + u32 pool_max; /* the CRUSH map specifies the mapping of placement groups to * the list of osds that store+replicate them. */ diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h index 1f4c78640541..26ac8b89a676 100644 --- a/fs/ceph/rados.h +++ b/fs/ceph/rados.h @@ -11,8 +11,8 @@ /* * osdmap encoding versions */ -#define CEPH_OSDMAP_INC_VERSION 3 -#define CEPH_OSDMAP_VERSION 3 +#define CEPH_OSDMAP_INC_VERSION 4 +#define CEPH_OSDMAP_VERSION 4 /* * fs id -- cgit v1.2.3