summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--fs/dlm/config.c18
-rw-r--r--fs/dlm/config.h5
-rw-r--r--fs/dlm/debug_fs.c54
-rw-r--r--fs/dlm/dlm_internal.h42
-rw-r--r--fs/dlm/lock.c16
-rw-r--r--fs/dlm/lockspace.c14
-rw-r--r--fs/dlm/lowcomms.c411
-rw-r--r--fs/dlm/lowcomms.h25
-rw-r--r--fs/dlm/member.c37
-rw-r--r--fs/dlm/midcomms.c1343
-rw-r--r--fs/dlm/midcomms.h15
-rw-r--r--fs/dlm/rcom.c123
-rw-r--r--fs/dlm/util.c10
-rw-r--r--fs/dlm/util.h2
14 files changed, 1924 insertions, 191 deletions
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 88d95d96e36c..42eee2783756 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -20,6 +20,7 @@
#include <net/sock.h>
#include "config.h"
+#include "midcomms.h"
#include "lowcomms.h"
/*
@@ -79,6 +80,9 @@ struct dlm_cluster {
unsigned int cl_new_rsb_count;
unsigned int cl_recover_callbacks;
char cl_cluster_name[DLM_LOCKSPACE_LEN];
+
+ struct dlm_spaces *sps;
+ struct dlm_comms *cms;
};
static struct dlm_cluster *config_item_to_cluster(struct config_item *i)
@@ -204,7 +208,7 @@ static int dlm_check_zero(unsigned int x)
static int dlm_check_buffer_size(unsigned int x)
{
- if (x < DEFAULT_BUFFER_SIZE)
+ if (x < DLM_MAX_SOCKET_BUFSIZE)
return -EINVAL;
return 0;
@@ -409,6 +413,9 @@ static struct config_group *make_cluster(struct config_group *g,
if (!cl || !sps || !cms)
goto fail;
+ cl->sps = sps;
+ cl->cms = cms;
+
config_group_init_type_name(&cl->group, name, &cluster_type);
config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type);
config_group_init_type_name(&cms->cs_group, "comms", &comms_type);
@@ -458,6 +465,9 @@ static void drop_cluster(struct config_group *g, struct config_item *i)
static void release_cluster(struct config_item *i)
{
struct dlm_cluster *cl = config_item_to_cluster(i);
+
+ kfree(cl->sps);
+ kfree(cl->cms);
kfree(cl);
}
@@ -532,7 +542,7 @@ static void drop_comm(struct config_group *g, struct config_item *i)
struct dlm_comm *cm = config_item_to_comm(i);
if (local_comm == cm)
local_comm = NULL;
- dlm_lowcomms_close(cm->nodeid);
+ dlm_midcomms_close(cm->nodeid);
while (cm->addr_count--)
kfree(cm->addr[cm->addr_count]);
config_item_put(i);
@@ -942,7 +952,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
#define DEFAULT_SCAN_SECS 5
#define DEFAULT_LOG_DEBUG 0
#define DEFAULT_LOG_INFO 1
-#define DEFAULT_PROTOCOL 0
+#define DEFAULT_PROTOCOL DLM_PROTO_TCP
#define DEFAULT_MARK 0
#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */
#define DEFAULT_WAITWARN_US 0
@@ -952,7 +962,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
struct dlm_config_info dlm_config = {
.ci_tcp_port = DEFAULT_TCP_PORT,
- .ci_buffer_size = DEFAULT_BUFFER_SIZE,
+ .ci_buffer_size = DLM_MAX_SOCKET_BUFSIZE,
.ci_rsbtbl_size = DEFAULT_RSBTBL_SIZE,
.ci_recover_timer = DEFAULT_RECOVER_TIMER,
.ci_toss_secs = DEFAULT_TOSS_SECS,
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index d2cd4bd20313..df92b0a07fc6 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -12,7 +12,7 @@
#ifndef __CONFIG_DOT_H__
#define __CONFIG_DOT_H__
-#define DEFAULT_BUFFER_SIZE 4096
+#define DLM_MAX_SOCKET_BUFSIZE 4096
struct dlm_config_node {
int nodeid;
@@ -23,6 +23,9 @@ struct dlm_config_node {
#define DLM_MAX_ADDR_COUNT 3
+#define DLM_PROTO_TCP 0
+#define DLM_PROTO_SCTP 1
+
struct dlm_config_info {
int ci_tcp_port;
int ci_buffer_size;
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index d5bd990bcab8..47e9d57e4cae 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -16,6 +16,7 @@
#include <linux/slab.h>
#include "dlm_internal.h"
+#include "midcomms.h"
#include "lock.h"
#define DLM_DEBUG_BUF_LEN 4096
@@ -23,6 +24,7 @@ static char debug_buf[DLM_DEBUG_BUF_LEN];
static struct mutex debug_buf_lock;
static struct dentry *dlm_root;
+static struct dentry *dlm_comms;
static char *print_lockmode(int mode)
{
@@ -738,6 +740,57 @@ void dlm_delete_debug_file(struct dlm_ls *ls)
debugfs_remove(ls->ls_debug_toss_dentry);
}
+static int dlm_state_show(struct seq_file *file, void *offset)
+{
+ seq_printf(file, "%s\n", dlm_midcomms_state(file->private));
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(dlm_state);
+
+static int dlm_flags_show(struct seq_file *file, void *offset)
+{
+ seq_printf(file, "%lu\n", dlm_midcomms_flags(file->private));
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(dlm_flags);
+
+static int dlm_send_queue_cnt_show(struct seq_file *file, void *offset)
+{
+ seq_printf(file, "%d\n", dlm_midcomms_send_queue_cnt(file->private));
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(dlm_send_queue_cnt);
+
+static int dlm_version_show(struct seq_file *file, void *offset)
+{
+ seq_printf(file, "0x%08x\n", dlm_midcomms_version(file->private));
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(dlm_version);
+
+void *dlm_create_debug_comms_file(int nodeid, void *data)
+{
+ struct dentry *d_node;
+ char name[256];
+
+ memset(name, 0, sizeof(name));
+ snprintf(name, 256, "%d", nodeid);
+
+ d_node = debugfs_create_dir(name, dlm_comms);
+ debugfs_create_file("state", 0444, d_node, data, &dlm_state_fops);
+ debugfs_create_file("flags", 0444, d_node, data, &dlm_flags_fops);
+ debugfs_create_file("send_queue_count", 0444, d_node, data,
+ &dlm_send_queue_cnt_fops);
+ debugfs_create_file("version", 0444, d_node, data, &dlm_version_fops);
+
+ return d_node;
+}
+
+void dlm_delete_debug_comms_file(void *ctx)
+{
+ debugfs_remove(ctx);
+}
+
void dlm_create_debug_file(struct dlm_ls *ls)
{
char name[DLM_LOCKSPACE_LEN + 8];
@@ -797,6 +850,7 @@ void __init dlm_register_debugfs(void)
{
mutex_init(&debug_buf_lock);
dlm_root = debugfs_create_dir("dlm", NULL);
+ dlm_comms = debugfs_create_dir("comms", dlm_root);
}
void dlm_unregister_debugfs(void)
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 04fe9f525ac7..91d1ca3a121a 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -57,9 +57,12 @@ struct dlm_header;
struct dlm_message;
struct dlm_rcom;
struct dlm_mhandle;
+struct dlm_msg;
#define log_print(fmt, args...) \
printk(KERN_ERR "dlm: "fmt"\n" , ##args)
+#define log_print_ratelimited(fmt, args...) \
+ printk_ratelimited(KERN_ERR "dlm: "fmt"\n", ##args)
#define log_error(ls, fmt, args...) \
printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
@@ -368,23 +371,33 @@ static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
/* dlm_header is first element of all structs sent between nodes */
#define DLM_HEADER_MAJOR 0x00030000
-#define DLM_HEADER_MINOR 0x00000001
+#define DLM_HEADER_MINOR 0x00000002
+
+#define DLM_VERSION_3_1 0x00030001
+#define DLM_VERSION_3_2 0x00030002
#define DLM_HEADER_SLOTS 0x00000001
#define DLM_MSG 1
#define DLM_RCOM 2
+#define DLM_OPTS 3
+#define DLM_ACK 4
+#define DLM_FIN 5
struct dlm_header {
uint32_t h_version;
- uint32_t h_lockspace;
+ union {
+ /* for DLM_MSG and DLM_RCOM */
+ uint32_t h_lockspace;
+ /* for DLM_ACK and DLM_OPTS */
+ uint32_t h_seq;
+ } u;
uint32_t h_nodeid; /* nodeid of sender */
uint16_t h_length;
uint8_t h_cmd; /* DLM_MSG, DLM_RCOM */
uint8_t h_pad;
};
-
#define DLM_MSG_REQUEST 1
#define DLM_MSG_CONVERT 2
#define DLM_MSG_UNLOCK 3
@@ -452,10 +465,29 @@ struct dlm_rcom {
char rc_buf[];
};
+struct dlm_opt_header {
+ uint16_t t_type;
+ uint16_t t_length;
+ uint32_t o_pad;
+ /* need to be 8 byte aligned */
+ char t_value[];
+};
+
+/* encapsulation header */
+struct dlm_opts {
+ struct dlm_header o_header;
+ uint8_t o_nextcmd;
+ uint8_t o_pad;
+ uint16_t o_optlen;
+ uint32_t o_pad2;
+ char o_opts[];
+};
+
union dlm_packet {
struct dlm_header header; /* common to other two */
struct dlm_message message;
struct dlm_rcom rcom;
+ struct dlm_opts opts;
};
#define DLM_RSF_NEED_SLOTS 0x00000001
@@ -722,11 +754,15 @@ void dlm_register_debugfs(void);
void dlm_unregister_debugfs(void);
void dlm_create_debug_file(struct dlm_ls *ls);
void dlm_delete_debug_file(struct dlm_ls *ls);
+void *dlm_create_debug_comms_file(int nodeid, void *data);
+void dlm_delete_debug_comms_file(void *ctx);
#else
static inline void dlm_register_debugfs(void) { }
static inline void dlm_unregister_debugfs(void) { }
static inline void dlm_create_debug_file(struct dlm_ls *ls) { }
static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
+static inline void *dlm_create_debug_comms_file(int nodeid, void *data) { return NULL; }
+static inline void dlm_delete_debug_comms_file(void *ctx) { }
#endif
#endif /* __DLM_INTERNAL_DOT_H__ */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index b93df39d0915..c502c065d007 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -59,7 +59,7 @@
#include "dlm_internal.h"
#include <linux/dlm_device.h>
#include "memory.h"
-#include "lowcomms.h"
+#include "midcomms.h"
#include "requestqueue.h"
#include "util.h"
#include "dir.h"
@@ -3534,17 +3534,17 @@ static int _create_message(struct dlm_ls *ls, int mb_len,
char *mb;
/* get_buffer gives us a message handle (mh) that we need to
- pass into lowcomms_commit and a message buffer (mb) that we
+ pass into midcomms_commit and a message buffer (mb) that we
write our data into */
- mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb);
+ mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, GFP_NOFS, &mb);
if (!mh)
return -ENOBUFS;
ms = (struct dlm_message *) mb;
ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
- ms->m_header.h_lockspace = ls->ls_global_id;
+ ms->m_header.u.h_lockspace = ls->ls_global_id;
ms->m_header.h_nodeid = dlm_our_nodeid();
ms->m_header.h_length = mb_len;
ms->m_header.h_cmd = DLM_MSG;
@@ -3589,7 +3589,7 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
{
dlm_message_out(ms);
- dlm_lowcomms_commit_buffer(mh);
+ dlm_midcomms_commit_mhandle(mh);
return 0;
}
@@ -5038,16 +5038,16 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid)
if (hd->h_nodeid != nodeid) {
log_print("invalid h_nodeid %d from %d lockspace %x",
- hd->h_nodeid, nodeid, hd->h_lockspace);
+ hd->h_nodeid, nodeid, hd->u.h_lockspace);
return;
}
- ls = dlm_find_lockspace_global(hd->h_lockspace);
+ ls = dlm_find_lockspace_global(hd->u.h_lockspace);
if (!ls) {
if (dlm_config.ci_log_debug) {
printk_ratelimited(KERN_DEBUG "dlm: invalid lockspace "
"%u from %d cmd %d type %d\n",
- hd->h_lockspace, nodeid, hd->h_cmd, type);
+ hd->u.h_lockspace, nodeid, hd->h_cmd, type);
}
if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS)
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index c14cf2b7faab..d71aba8c3e64 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -16,6 +16,7 @@
#include "member.h"
#include "recoverd.h"
#include "dir.h"
+#include "midcomms.h"
#include "lowcomms.h"
#include "config.h"
#include "memory.h"
@@ -390,7 +391,7 @@ static int threads_start(void)
}
/* Thread for sending/receiving messages for all lockspace's */
- error = dlm_lowcomms_start();
+ error = dlm_midcomms_start();
if (error) {
log_print("cannot start dlm lowcomms %d", error);
goto scand_fail;
@@ -566,7 +567,12 @@ static int new_lockspace(const char *name, const char *cluster,
mutex_init(&ls->ls_requestqueue_mutex);
mutex_init(&ls->ls_clear_proc_locks);
- ls->ls_recover_buf = kmalloc(LOWCOMMS_MAX_TX_BUFFER_LEN, GFP_NOFS);
+ /* Due backwards compatibility with 3.1 we need to use maximum
+ * possible dlm message size to be sure the message will fit and
+ * not having out of bounds issues. However on sending side 3.2
+ * might send less.
+ */
+ ls->ls_recover_buf = kmalloc(DLM_MAX_SOCKET_BUFSIZE, GFP_NOFS);
if (!ls->ls_recover_buf)
goto out_lkbidr;
@@ -698,7 +704,7 @@ int dlm_new_lockspace(const char *name, const char *cluster,
error = 0;
if (!ls_count) {
dlm_scand_stop();
- dlm_lowcomms_shutdown();
+ dlm_midcomms_shutdown();
dlm_lowcomms_stop();
}
out:
@@ -787,7 +793,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
if (ls_count == 1) {
dlm_scand_stop();
- dlm_lowcomms_shutdown();
+ dlm_midcomms_shutdown();
}
dlm_callback_stop(ls);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 166e36fcf3e4..0ea9ae35da0b 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -59,7 +59,6 @@
#include "config.h"
#define NEEDED_RMEM (4*1024*1024)
-#define CONN_HASH_SIZE 32
/* Number of messages to send before rescheduling */
#define MAX_SEND_MSG_COUNT 25
@@ -79,14 +78,20 @@ struct connection {
#define CF_CLOSING 8
#define CF_SHUTDOWN 9
#define CF_CONNECTED 10
+#define CF_RECONNECT 11
+#define CF_DELAY_CONNECT 12
+#define CF_EOF 13
struct list_head writequeue; /* List of outgoing writequeue_entries */
spinlock_t writequeue_lock;
+ atomic_t writequeue_cnt;
void (*connect_action) (struct connection *); /* What to do to connect */
void (*shutdown_action)(struct connection *con); /* What to do to shutdown */
+ bool (*eof_condition)(struct connection *con); /* What to do to eof check */
int retries;
#define MAX_CONNECT_RETRIES 3
struct hlist_node list;
struct connection *othercon;
+ struct connection *sendcon;
struct work_struct rwork; /* Receive workqueue */
struct work_struct swork; /* Send workqueue */
wait_queue_head_t shutdown_wait; /* wait for graceful shutdown */
@@ -113,7 +118,22 @@ struct writequeue_entry {
int len;
int end;
int users;
+ bool dirty;
struct connection *con;
+ struct list_head msgs;
+ struct kref ref;
+};
+
+struct dlm_msg {
+ struct writequeue_entry *entry;
+ struct dlm_msg *orig_msg;
+ bool retransmit;
+ void *ppc;
+ int len;
+ int idx; /* new()/commit() idx exchange */
+
+ struct list_head list;
+ struct kref ref;
};
struct dlm_node_addr {
@@ -155,33 +175,23 @@ static void sctp_connect_to_sock(struct connection *con);
static void tcp_connect_to_sock(struct connection *con);
static void dlm_tcp_shutdown(struct connection *con);
-/* This is deliberately very simple because most clusters have simple
- sequential nodeids, so we should be able to go straight to a connection
- struct in the array */
-static inline int nodeid_hash(int nodeid)
+static struct connection *__find_con(int nodeid, int r)
{
- return nodeid & (CONN_HASH_SIZE-1);
-}
-
-static struct connection *__find_con(int nodeid)
-{
- int r, idx;
struct connection *con;
- r = nodeid_hash(nodeid);
-
- idx = srcu_read_lock(&connections_srcu);
hlist_for_each_entry_rcu(con, &connection_hash[r], list) {
- if (con->nodeid == nodeid) {
- srcu_read_unlock(&connections_srcu, idx);
+ if (con->nodeid == nodeid)
return con;
- }
}
- srcu_read_unlock(&connections_srcu, idx);
return NULL;
}
+static bool tcp_eof_condition(struct connection *con)
+{
+ return atomic_read(&con->writequeue_cnt);
+}
+
static int dlm_con_init(struct connection *con, int nodeid)
{
con->rx_buflen = dlm_config.ci_buffer_size;
@@ -193,15 +203,23 @@ static int dlm_con_init(struct connection *con, int nodeid)
mutex_init(&con->sock_mutex);
INIT_LIST_HEAD(&con->writequeue);
spin_lock_init(&con->writequeue_lock);
+ atomic_set(&con->writequeue_cnt, 0);
INIT_WORK(&con->swork, process_send_sockets);
INIT_WORK(&con->rwork, process_recv_sockets);
init_waitqueue_head(&con->shutdown_wait);
- if (dlm_config.ci_protocol == 0) {
+ switch (dlm_config.ci_protocol) {
+ case DLM_PROTO_TCP:
con->connect_action = tcp_connect_to_sock;
con->shutdown_action = dlm_tcp_shutdown;
- } else {
+ con->eof_condition = tcp_eof_condition;
+ break;
+ case DLM_PROTO_SCTP:
con->connect_action = sctp_connect_to_sock;
+ break;
+ default:
+ kfree(con->rx_buf);
+ return -EINVAL;
}
return 0;
@@ -216,7 +234,8 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc)
struct connection *con, *tmp;
int r, ret;
- con = __find_con(nodeid);
+ r = nodeid_hash(nodeid);
+ con = __find_con(nodeid, r);
if (con || !alloc)
return con;
@@ -230,8 +249,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc)
return NULL;
}
- r = nodeid_hash(nodeid);
-
spin_lock(&connections_lock);
/* Because multiple workqueues/threads calls this function it can
* race on multiple cpu's. Instead of locking hot path __find_con()
@@ -239,7 +256,7 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc)
* under protection of connections_lock. If this is the case we
* abort our connection creation and return the existing connection.
*/
- tmp = __find_con(nodeid);
+ tmp = __find_con(nodeid, r);
if (tmp) {
spin_unlock(&connections_lock);
kfree(con->rx_buf);
@@ -256,15 +273,13 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc)
/* Loop round all connections */
static void foreach_conn(void (*conn_func)(struct connection *c))
{
- int i, idx;
+ int i;
struct connection *con;
- idx = srcu_read_lock(&connections_srcu);
for (i = 0; i < CONN_HASH_SIZE; i++) {
hlist_for_each_entry_rcu(con, &connection_hash[i], list)
conn_func(con);
}
- srcu_read_unlock(&connections_srcu, idx);
}
static struct dlm_node_addr *find_node_addr(int nodeid)
@@ -462,6 +477,9 @@ static void lowcomms_data_ready(struct sock *sk)
static void lowcomms_listen_data_ready(struct sock *sk)
{
+ if (!dlm_allow_conn)
+ return;
+
queue_work(recv_workqueue, &listen_con.rwork);
}
@@ -518,14 +536,21 @@ static void lowcomms_state_change(struct sock *sk)
int dlm_lowcomms_connect_node(int nodeid)
{
struct connection *con;
+ int idx;
if (nodeid == dlm_our_nodeid())
return 0;
+ idx = srcu_read_lock(&connections_srcu);
con = nodeid2con(nodeid, GFP_NOFS);
- if (!con)
+ if (!con) {
+ srcu_read_unlock(&connections_srcu, idx);
return -ENOMEM;
+ }
+
lowcomms_connect_sock(con);
+ srcu_read_unlock(&connections_srcu, idx);
+
return 0;
}
@@ -587,6 +612,22 @@ static void lowcomms_error_report(struct sock *sk)
dlm_config.ci_tcp_port, sk->sk_err,
sk->sk_err_soft);
}
+
+ /* below sendcon only handling */
+ if (test_bit(CF_IS_OTHERCON, &con->flags))
+ con = con->sendcon;
+
+ switch (sk->sk_err) {
+ case ECONNREFUSED:
+ set_bit(CF_DELAY_CONNECT, &con->flags);
+ break;
+ default:
+ break;
+ }
+
+ if (!test_and_set_bit(CF_RECONNECT, &con->flags))
+ queue_work(send_workqueue, &con->swork);
+
out:
read_unlock_bh(&sk->sk_callback_lock);
if (orig_report)
@@ -669,6 +710,42 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
memset((char *)saddr + *addr_len, 0, sizeof(struct sockaddr_storage) - *addr_len);
}
+static void dlm_page_release(struct kref *kref)
+{
+ struct writequeue_entry *e = container_of(kref, struct writequeue_entry,
+ ref);
+
+ __free_page(e->page);
+ kfree(e);
+}
+
+static void dlm_msg_release(struct kref *kref)
+{
+ struct dlm_msg *msg = container_of(kref, struct dlm_msg, ref);
+
+ kref_put(&msg->entry->ref, dlm_page_release);
+ kfree(msg);
+}
+
+static void free_entry(struct writequeue_entry *e)
+{
+ struct dlm_msg *msg, *tmp;
+
+ list_for_each_entry_safe(msg, tmp, &e->msgs, list) {
+ if (msg->orig_msg) {
+ msg->orig_msg->retransmit = false;
+ kref_put(&msg->orig_msg->ref, dlm_msg_release);
+ }
+
+ list_del(&msg->list);
+ kref_put(&msg->ref, dlm_msg_release);
+ }
+
+ list_del(&e->list);
+ atomic_dec(&e->con->writequeue_cnt);
+ kref_put(&e->ref, dlm_page_release);
+}
+
static void dlm_close_sock(struct socket **sock)
{
if (*sock) {
@@ -683,6 +760,7 @@ static void close_connection(struct connection *con, bool and_other,
bool tx, bool rx)
{
bool closing = test_and_set_bit(CF_CLOSING, &con->flags);
+ struct writequeue_entry *e;
if (tx && !closing && cancel_work_sync(&con->swork)) {
log_print("canceled swork for node %d", con->nodeid);
@@ -698,12 +776,35 @@ static void close_connection(struct connection *con, bool and_other,
if (con->othercon && and_other) {
/* Will only re-enter once. */
- close_connection(con->othercon, false, true, true);
+ close_connection(con->othercon, false, tx, rx);
+ }
+
+ /* if we send a writequeue entry only a half way, we drop the
+ * whole entry because reconnection and that we not start of the
+ * middle of a msg which will confuse the other end.
+ *
+ * we can always drop messages because retransmits, but what we
+ * cannot allow is to transmit half messages which may be processed
+ * at the other side.
+ *
+ * our policy is to start on a clean state when disconnects, we don't
+ * know what's send/received on transport layer in this case.
+ */
+ spin_lock(&con->writequeue_lock);
+ if (!list_empty(&con->writequeue)) {
+ e = list_first_entry(&con->writequeue, struct writequeue_entry,
+ list);
+ if (e->dirty)
+ free_entry(e);
}
+ spin_unlock(&con->writequeue_lock);
con->rx_leftover = 0;
con->retries = 0;
clear_bit(CF_CONNECTED, &con->flags);
+ clear_bit(CF_DELAY_CONNECT, &con->flags);
+ clear_bit(CF_RECONNECT, &con->flags);
+ clear_bit(CF_EOF, &con->flags);
mutex_unlock(&con->sock_mutex);
clear_bit(CF_CLOSING, &con->flags);
}
@@ -841,19 +942,26 @@ out_resched:
return -EAGAIN;
out_close:
- mutex_unlock(&con->sock_mutex);
- if (ret != -EAGAIN) {
- /* Reconnect when there is something to send */
- close_connection(con, false, true, false);
- if (ret == 0) {
- log_print("connection %p got EOF from %d",
- con, con->nodeid);
+ if (ret == 0) {
+ log_print("connection %p got EOF from %d",
+ con, con->nodeid);
+
+ if (con->eof_condition && con->eof_condition(con)) {
+ set_bit(CF_EOF, &con->flags);
+ mutex_unlock(&con->sock_mutex);
+ } else {
+ mutex_unlock(&con->sock_mutex);
+ close_connection(con, false, true, false);
+
/* handling for tcp shutdown */
clear_bit(CF_SHUTDOWN, &con->flags);
wake_up(&con->shutdown_wait);
- /* signal to breaking receive worker */
- ret = -1;
}
+
+ /* signal to breaking receive worker */
+ ret = -1;
+ } else {
+ mutex_unlock(&con->sock_mutex);
}
return ret;
}
@@ -864,16 +972,12 @@ static int accept_from_sock(struct listen_connection *con)
int result;
struct sockaddr_storage peeraddr;
struct socket *newsock;
- int len;
+ int len, idx;
int nodeid;
struct connection *newcon;
struct connection *addcon;
unsigned int mark;
- if (!dlm_allow_conn) {
- return -1;
- }
-
if (!con->sock)
return -ENOTCONN;
@@ -907,8 +1011,10 @@ static int accept_from_sock(struct listen_connection *con)
* the same time and the connections cross on the wire.
* In this case we store the incoming one in "othercon"
*/
+ idx = srcu_read_lock(&connections_srcu);
newcon = nodeid2con(nodeid, GFP_NOFS);
if (!newcon) {
+ srcu_read_unlock(&connections_srcu, idx);
result = -ENOMEM;
goto accept_err;
}
@@ -924,6 +1030,7 @@ static int accept_from_sock(struct listen_connection *con)
if (!othercon) {
log_print("failed to allocate incoming socket");
mutex_unlock(&newcon->sock_mutex);
+ srcu_read_unlock(&connections_srcu, idx);
result = -ENOMEM;
goto accept_err;
}
@@ -932,11 +1039,14 @@ static int accept_from_sock(struct listen_connection *con)
if (result < 0) {
kfree(othercon);
mutex_unlock(&newcon->sock_mutex);
+ srcu_read_unlock(&connections_srcu, idx);
goto accept_err;
}
lockdep_set_subclass(&othercon->sock_mutex, 1);
+ set_bit(CF_IS_OTHERCON, &othercon->flags);
newcon->othercon = othercon;
+ othercon->sendcon = newcon;
} else {
/* close other sock con if we have something new */
close_connection(othercon, false, true, false);
@@ -966,6 +1076,8 @@ static int accept_from_sock(struct listen_connection *con)
if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags))
queue_work(recv_workqueue, &addcon->rwork);
+ srcu_read_unlock(&connections_srcu, idx);
+
return 0;
accept_err:
@@ -977,12 +1089,6 @@ accept_err:
return result;
}
-static void free_entry(struct writequeue_entry *e)
-{
- __free_page(e->page);
- kfree(e);
-}
-
/*
* writequeue_entry_complete - try to delete and free write queue entry
* @e: write queue entry to try to delete
@@ -994,11 +1100,11 @@ static void writequeue_entry_complete(struct writequeue_entry *e, int completed)
{
e->offset += completed;
e->len -= completed;
+ /* signal that page was half way transmitted */
+ e->dirty = true;
- if (e->len == 0 && e->users == 0) {
- list_del(&e->list);
+ if (e->len == 0 && e->users == 0)
free_entry(e);
- }
}
/*
@@ -1075,7 +1181,7 @@ static void sctp_connect_to_sock(struct connection *con)
make_sockaddr(&daddr, dlm_config.ci_tcp_port, &addr_len);
- log_print("connecting to %d", con->nodeid);
+ log_print_ratelimited("connecting to %d", con->nodeid);
/* Turn off Nagle's algorithm */
sctp_sock_set_nodelay(sock->sk);
@@ -1171,7 +1277,7 @@ static void tcp_connect_to_sock(struct connection *con)
make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
- log_print("connecting to %d", con->nodeid);
+ log_print_ratelimited("connecting to %d", con->nodeid);
/* Turn off Nagle's algorithm */
tcp_sock_set_nodelay(sock->sk);
@@ -1364,12 +1470,16 @@ static struct writequeue_entry *new_writequeue_entry(struct connection *con,
entry->con = con;
entry->users = 1;
+ kref_init(&entry->ref);
+ INIT_LIST_HEAD(&entry->msgs);
return entry;
}
static struct writequeue_entry *new_wq_entry(struct connection *con, int len,
- gfp_t allocation, char **ppc)
+ gfp_t allocation, char **ppc,
+ void (*cb)(struct dlm_mhandle *mh),
+ struct dlm_mhandle *mh)
{
struct writequeue_entry *e;
@@ -1377,7 +1487,12 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len,
if (!list_empty(&con->writequeue)) {
e = list_last_entry(&con->writequeue, struct writequeue_entry, list);
if (DLM_WQ_REMAIN_BYTES(e) >= len) {
+ kref_get(&e->ref);
+
*ppc = page_address(e->page) + e->end;
+ if (cb)
+ cb(mh);
+
e->end += len;
e->users++;
spin_unlock(&con->writequeue_lock);
@@ -1391,42 +1506,92 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len,
if (!e)
return NULL;
+ kref_get(&e->ref);
*ppc = page_address(e->page);
e->end += len;
+ atomic_inc(&con->writequeue_cnt);
spin_lock(&con->writequeue_lock);
+ if (cb)
+ cb(mh);
+
list_add_tail(&e->list, &con->writequeue);
spin_unlock(&con->writequeue_lock);
return e;
};
-void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
+static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len,
+ gfp_t allocation, char **ppc,
+ void (*cb)(struct dlm_mhandle *mh),
+ struct dlm_mhandle *mh)
+{
+ struct writequeue_entry *e;
+ struct dlm_msg *msg;
+
+ msg = kzalloc(sizeof(*msg), allocation);
+ if (!msg)
+ return NULL;
+
+ kref_init(&msg->ref);
+
+ e = new_wq_entry(con, len, allocation, ppc, cb, mh);
+ if (!e) {
+ kfree(msg);
+ return NULL;
+ }
+
+ msg->ppc = *ppc;
+ msg->len = len;
+ msg->entry = e;
+
+ return msg;
+}
+
+struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
+ char **ppc, void (*cb)(struct dlm_mhandle *mh),
+ struct dlm_mhandle *mh)
{
struct connection *con;
+ struct dlm_msg *msg;
+ int idx;
- if (len > DEFAULT_BUFFER_SIZE ||
+ if (len > DLM_MAX_SOCKET_BUFSIZE ||
len < sizeof(struct dlm_header)) {
- BUILD_BUG_ON(PAGE_SIZE < DEFAULT_BUFFER_SIZE);
+ BUILD_BUG_ON(PAGE_SIZE < DLM_MAX_SOCKET_BUFSIZE);
log_print("failed to allocate a buffer of size %d", len);
WARN_ON(1);
return NULL;
}
+ idx = srcu_read_lock(&connections_srcu);
con = nodeid2con(nodeid, allocation);
- if (!con)
+ if (!con) {
+ srcu_read_unlock(&connections_srcu, idx);
return NULL;
+ }
- return new_wq_entry(con, len, allocation, ppc);
+ msg = dlm_lowcomms_new_msg_con(con, len, allocation, ppc, cb, mh);
+ if (!msg) {
+ srcu_read_unlock(&connections_srcu, idx);
+ return NULL;
+ }
+
+ /* we assume if successful commit must called */
+ msg->idx = idx;
+ return msg;
}
-void dlm_lowcomms_commit_buffer(void *mh)
+static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg)
{
- struct writequeue_entry *e = (struct writequeue_entry *)mh;
+ struct writequeue_entry *e = msg->entry;
struct connection *con = e->con;
int users;
spin_lock(&con->writequeue_lock);
+ kref_get(&msg->ref);
+ list_add(&msg->list, &e->msgs);
+
users = --e->users;
if (users)
goto out;
@@ -1442,6 +1607,42 @@ out:
return;
}
+void dlm_lowcomms_commit_msg(struct dlm_msg *msg)
+{
+ _dlm_lowcomms_commit_msg(msg);
+ srcu_read_unlock(&connections_srcu, msg->idx);
+}
+
+void dlm_lowcomms_put_msg(struct dlm_msg *msg)
+{
+ kref_put(&msg->ref, dlm_msg_release);
+}
+
+/* does not held connections_srcu, usage workqueue only */
+int dlm_lowcomms_resend_msg(struct dlm_msg *msg)
+{
+ struct dlm_msg *msg_resend;
+ char *ppc;
+
+ if (msg->retransmit)
+ return 1;
+
+ msg_resend = dlm_lowcomms_new_msg_con(msg->entry->con, msg->len,
+ GFP_ATOMIC, &ppc, NULL, NULL);
+ if (!msg_resend)
+ return -ENOMEM;
+
+ msg->retransmit = true;
+ kref_get(&msg->ref);
+ msg_resend->orig_msg = msg;
+
+ memcpy(ppc, msg->ppc, msg->len);
+ _dlm_lowcomms_commit_msg(msg_resend);
+ dlm_lowcomms_put_msg(msg_resend);
+
+ return 0;
+}
+
/* Send a message */
static void send_to_sock(struct connection *con)
{
@@ -1483,7 +1684,7 @@ static void send_to_sock(struct connection *con)
cond_resched();
goto out;
} else if (ret < 0)
- goto send_error;
+ goto out;
}
/* Don't starve people filling buffers */
@@ -1496,16 +1697,23 @@ static void send_to_sock(struct connection *con)
writequeue_entry_complete(e, ret);
}
spin_unlock(&con->writequeue_lock);
-out:
- mutex_unlock(&con->sock_mutex);
+
+ /* close if we got EOF */
+ if (test_and_clear_bit(CF_EOF, &con->flags)) {
+ mutex_unlock(&con->sock_mutex);
+ close_connection(con, false, false, true);
+
+ /* handling for tcp shutdown */
+ clear_bit(CF_SHUTDOWN, &con->flags);
+ wake_up(&con->shutdown_wait);
+ } else {
+ mutex_unlock(&con->sock_mutex);
+ }
+
return;
-send_error:
+out:
mutex_unlock(&con->sock_mutex);
- close_connection(con, false, false, true);
- /* Requeue the send work. When the work daemon runs again, it will try
- a new connection, then call this function again. */
- queue_work(send_workqueue, &con->swork);
return;
out_connect:
@@ -1520,7 +1728,6 @@ static void clean_one_writequeue(struct connection *con)
spin_lock(&con->writequeue_lock);
list_for_each_entry_safe(e, safe, &con->writequeue, list) {
- list_del(&e->list);
free_entry(e);
}
spin_unlock(&con->writequeue_lock);
@@ -1532,8 +1739,10 @@ int dlm_lowcomms_close(int nodeid)
{
struct connection *con;
struct dlm_node_addr *na;
+ int idx;
log_print("closing connection to node %d", nodeid);
+ idx = srcu_read_lock(&connections_srcu);
con = nodeid2con(nodeid, 0);
if (con) {
set_bit(CF_CLOSE, &con->flags);
@@ -1542,6 +1751,7 @@ int dlm_lowcomms_close(int nodeid)
if (con->othercon)
clean_one_writequeue(con->othercon);
}
+ srcu_read_unlock(&connections_srcu, idx);
spin_lock(&dlm_node_addrs_spin);
na = find_node_addr(nodeid);
@@ -1578,35 +1788,50 @@ static void process_send_sockets(struct work_struct *work)
{
struct connection *con = container_of(work, struct connection, swork);
+ WARN_ON(test_bit(CF_IS_OTHERCON, &con->flags));
+
clear_bit(CF_WRITE_PENDING, &con->flags);
- if (con->sock == NULL) /* not mutex protected so check it inside too */
+
+ if (test_and_clear_bit(CF_RECONNECT, &con->flags)) {
+ close_connection(con, false, false, true);
+ dlm_midcomms_unack_msg_resend(con->nodeid);
+ }
+
+ if (con->sock == NULL) { /* not mutex protected so check it inside too */
+ if (test_and_clear_bit(CF_DELAY_CONNECT, &con->flags))
+ msleep(1000);
con->connect_action(con);
+ }
if (!list_empty(&con->writequeue))
send_to_sock(con);
}
static void work_stop(void)
{
- if (recv_workqueue)
+ if (recv_workqueue) {
destroy_workqueue(recv_workqueue);
- if (send_workqueue)
+ recv_workqueue = NULL;
+ }
+
+ if (send_workqueue) {
destroy_workqueue(send_workqueue);
+ send_workqueue = NULL;
+ }
}
static int work_start(void)
{
- recv_workqueue = alloc_workqueue("dlm_recv",
- WQ_UNBOUND | WQ_MEM_RECLAIM, 1);
+ recv_workqueue = alloc_ordered_workqueue("dlm_recv", WQ_MEM_RECLAIM);
if (!recv_workqueue) {
log_print("can't start dlm_recv");
return -ENOMEM;
}
- send_workqueue = alloc_workqueue("dlm_send",
- WQ_UNBOUND | WQ_MEM_RECLAIM, 1);
+ send_workqueue = alloc_ordered_workqueue("dlm_send", WQ_MEM_RECLAIM);
if (!send_workqueue) {
log_print("can't start dlm_send");
destroy_workqueue(recv_workqueue);
+ recv_workqueue = NULL;
return -ENOMEM;
}
@@ -1621,6 +1846,8 @@ static void shutdown_conn(struct connection *con)
void dlm_lowcomms_shutdown(void)
{
+ int idx;
+
/* Set all the flags to prevent any
* socket activity.
*/
@@ -1633,7 +1860,9 @@ void dlm_lowcomms_shutdown(void)
dlm_close_sock(&listen_con.sock);
+ idx = srcu_read_lock(&connections_srcu);
foreach_conn(shutdown_conn);
+ srcu_read_unlock(&connections_srcu, idx);
}
static void _stop_conn(struct connection *con, bool and_other)
@@ -1682,7 +1911,7 @@ static void free_conn(struct connection *con)
static void work_flush(void)
{
- int ok, idx;
+ int ok;
int i;
struct connection *con;
@@ -1693,7 +1922,6 @@ static void work_flush(void)
flush_workqueue(recv_workqueue);
if (send_workqueue)
flush_workqueue(send_workqueue);
- idx = srcu_read_lock(&connections_srcu);
for (i = 0; i < CONN_HASH_SIZE && ok; i++) {
hlist_for_each_entry_rcu(con, &connection_hash[i],
list) {
@@ -1707,14 +1935,17 @@ static void work_flush(void)
}
}
}
- srcu_read_unlock(&connections_srcu, idx);
} while (!ok);
}
void dlm_lowcomms_stop(void)
{
+ int idx;
+
+ idx = srcu_read_lock(&connections_srcu);
work_flush();
foreach_conn(free_conn);
+ srcu_read_unlock(&connections_srcu, idx);
work_stop();
deinit_local();
}
@@ -1738,15 +1969,24 @@ int dlm_lowcomms_start(void)
error = work_start();
if (error)
- goto fail;
+ goto fail_local;
dlm_allow_conn = 1;
/* Start listening */
- if (dlm_config.ci_protocol == 0)
+ switch (dlm_config.ci_protocol) {
+ case DLM_PROTO_TCP:
error = tcp_listen_for_all();
- else
+ break;
+ case DLM_PROTO_SCTP:
error = sctp_listen_for_all(&listen_con);
+ break;
+ default:
+ log_print("Invalid protocol identifier %d set",
+ dlm_config.ci_protocol);
+ error = -EINVAL;
+ break;
+ }
if (error)
goto fail_unlisten;
@@ -1755,6 +1995,9 @@ int dlm_lowcomms_start(void)
fail_unlisten:
dlm_allow_conn = 0;
dlm_close_sock(&listen_con.sock);
+ work_stop();
+fail_local:
+ deinit_local();
fail:
return error;
}
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
index 48bbc4e18761..aaae7115c00d 100644
--- a/fs/dlm/lowcomms.h
+++ b/fs/dlm/lowcomms.h
@@ -12,7 +12,22 @@
#ifndef __LOWCOMMS_DOT_H__
#define __LOWCOMMS_DOT_H__
-#define LOWCOMMS_MAX_TX_BUFFER_LEN 4096
+#include "dlm_internal.h"
+
+#define DLM_MIDCOMMS_OPT_LEN sizeof(struct dlm_opts)
+#define DLM_MAX_APP_BUFSIZE (DLM_MAX_SOCKET_BUFSIZE - \
+ DLM_MIDCOMMS_OPT_LEN)
+
+#define CONN_HASH_SIZE 32
+
+/* This is deliberately very simple because most clusters have simple
+ * sequential nodeids, so we should be able to go straight to a connection
+ * struct in the array
+ */
+static inline int nodeid_hash(int nodeid)
+{
+ return nodeid & (CONN_HASH_SIZE-1);
+}
/* switch to check if dlm is running */
extern int dlm_allow_conn;
@@ -22,8 +37,12 @@ void dlm_lowcomms_shutdown(void);
void dlm_lowcomms_stop(void);
void dlm_lowcomms_exit(void);
int dlm_lowcomms_close(int nodeid);
-void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc);
-void dlm_lowcomms_commit_buffer(void *mh);
+struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
+ char **ppc, void (*cb)(struct dlm_mhandle *mh),
+ struct dlm_mhandle *mh);
+void dlm_lowcomms_commit_msg(struct dlm_msg *msg);
+void dlm_lowcomms_put_msg(struct dlm_msg *msg);
+int dlm_lowcomms_resend_msg(struct dlm_msg *msg);
int dlm_lowcomms_connect_node(int nodeid);
int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark);
int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len);
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index ceef3f2074ff..d9e1e4170eb1 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -15,6 +15,7 @@
#include "recover.h"
#include "rcom.h"
#include "config.h"
+#include "midcomms.h"
#include "lowcomms.h"
int dlm_slots_version(struct dlm_header *h)
@@ -270,7 +271,7 @@ int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size,
log_slots(ls, gen, num, NULL, array, array_size);
- max_slots = (LOWCOMMS_MAX_TX_BUFFER_LEN - sizeof(struct dlm_rcom) -
+ max_slots = (DLM_MAX_APP_BUFSIZE - sizeof(struct dlm_rcom) -
sizeof(struct rcom_config)) / sizeof(struct rcom_slot);
if (num > max_slots) {
@@ -329,6 +330,7 @@ static int dlm_add_member(struct dlm_ls *ls, struct dlm_config_node *node)
memb->nodeid = node->nodeid;
memb->weight = node->weight;
memb->comm_seq = node->comm_seq;
+ dlm_midcomms_add_member(node->nodeid);
add_ordered_member(ls, memb);
ls->ls_num_nodes++;
return 0;
@@ -359,26 +361,34 @@ int dlm_is_removed(struct dlm_ls *ls, int nodeid)
return 0;
}
-static void clear_memb_list(struct list_head *head)
+static void clear_memb_list(struct list_head *head,
+ void (*after_del)(int nodeid))
{
struct dlm_member *memb;
while (!list_empty(head)) {
memb = list_entry(head->next, struct dlm_member, list);
list_del(&memb->list);
+ if (after_del)
+ after_del(memb->nodeid);
kfree(memb);
}
}
+static void clear_members_cb(int nodeid)
+{
+ dlm_midcomms_remove_member(nodeid);
+}
+
void dlm_clear_members(struct dlm_ls *ls)
{
- clear_memb_list(&ls->ls_nodes);
+ clear_memb_list(&ls->ls_nodes, clear_members_cb);
ls->ls_num_nodes = 0;
}
void dlm_clear_members_gone(struct dlm_ls *ls)
{
- clear_memb_list(&ls->ls_nodes_gone);
+ clear_memb_list(&ls->ls_nodes_gone, NULL);
}
static void make_member_array(struct dlm_ls *ls)
@@ -552,6 +562,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
neg++;
list_move(&memb->list, &ls->ls_nodes_gone);
+ dlm_midcomms_remove_member(memb->nodeid);
ls->ls_num_nodes--;
dlm_lsop_recover_slot(ls, memb);
}
@@ -576,12 +587,18 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
*neg_out = neg;
error = ping_members(ls);
- if (!error || error == -EPROTO) {
- /* new_lockspace() may be waiting to know if the config
- is good or bad */
- ls->ls_members_result = error;
- complete(&ls->ls_members_done);
- }
+ /* error -EINTR means that a new recovery action is triggered.
+ * We ignore this recovery action and let run the new one which might
+ * have new member configuration.
+ */
+ if (error == -EINTR)
+ error = 0;
+
+ /* new_lockspace() may be waiting to know if the config
+ * is good or bad
+ */
+ ls->ls_members_result = error;
+ complete(&ls->ls_members_done);
log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes);
return error;
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index 1c6654a21ec4..e3de268898ed 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -3,7 +3,7 @@
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
-** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2004-2021 Red Hat, Inc. All rights reserved.
**
**
*******************************************************************************
@@ -12,22 +12,866 @@
/*
* midcomms.c
*
- * This is the appallingly named "mid-level" comms layer.
+ * This is the appallingly named "mid-level" comms layer. It takes care about
+ * deliver an on application layer "reliable" communication above the used
+ * lowcomms transport layer.
*
- * Its purpose is to take packets from the "real" comms layer,
- * split them up into packets and pass them to the interested
- * part of the locking mechanism.
+ * How it works:
*
- * It also takes messages from the locking layer, formats them
- * into packets and sends them to the comms layer.
+ * Each nodes keeps track of all send DLM messages in send_queue with a sequence
+ * number. The receive will send an DLM_ACK message back for every DLM message
+ * received at the other side. If a reconnect happens in lowcomms we will send
+ * all unacknowledged dlm messages again. The receiving side might drop any already
+ * received message by comparing sequence numbers.
+ *
+ * How version detection works:
+ *
+ * Due the fact that dlm has pre-configured node addresses on every side
+ * it is in it's nature that every side connects at starts to transmit
+ * dlm messages which ends in a race. However DLM_RCOM_NAMES, DLM_RCOM_STATUS
+ * and their replies are the first messages which are exchanges. Due backwards
+ * compatibility these messages are not covered by the midcomms re-transmission
+ * layer. These messages have their own re-transmission handling in the dlm
+ * application layer. The version field of every node will be set on these RCOM
+ * messages as soon as they arrived and the node isn't yet part of the nodes
+ * hash. There exists also logic to detect version mismatched if something weird
+ * going on or the first messages isn't an expected one.
+ *
+ * Termination:
+ *
+ * The midcomms layer does a 4 way handshake for termination on DLM protocol
+ * like TCP supports it with half-closed socket support. SCTP doesn't support
+ * half-closed socket, so we do it on DLM layer. Also socket shutdown() can be
+ * interrupted by .e.g. tcp reset itself. Additional there exists the othercon
+ * paradigm in lowcomms which cannot be easily without breaking backwards
+ * compatibility. A node cannot send anything to another node when a DLM_FIN
+ * message was send. There exists additional logic to print a warning if
+ * DLM wants to do it. There exists a state handling like RFC 793 but reduced
+ * to termination only. The event "member removal event" describes the cluster
+ * manager removed the node from internal lists, at this point DLM does not
+ * send any message to the other node. There exists two cases:
+ *
+ * 1. The cluster member was removed and we received a FIN
+ * OR
+ * 2. We received a FIN but the member was not removed yet
+ *
+ * One of these cases will do the CLOSE_WAIT to LAST_ACK change.
+ *
+ *
+ * +---------+
+ * | CLOSED |
+ * +---------+
+ * | add member/receive RCOM version
+ * | detection msg
+ * V
+ * +---------+
+ * | ESTAB |
+ * +---------+
+ * CLOSE | | rcv FIN
+ * ------- | | -------
+ * +---------+ snd FIN / \ snd ACK +---------+
+ * | FIN |<----------------- ------------------>| CLOSE |
+ * | WAIT-1 |------------------ | WAIT |
+ * +---------+ rcv FIN \ +---------+
+ * | rcv ACK of FIN ------- | CLOSE | member
+ * | -------------- snd ACK | ------- | removal
+ * V x V snd FIN V event
+ * +---------+ +---------+ +---------+
+ * |FINWAIT-2| | CLOSING | | LAST-ACK|
+ * +---------+ +---------+ +---------+
+ * | rcv ACK of FIN | rcv ACK of FIN |
+ * | rcv FIN -------------- | -------------- |
+ * | ------- x V x V
+ * \ snd ACK +---------+ +---------+
+ * ------------------------>| CLOSED | | CLOSED |
+ * +---------+ +---------+
+ *
+ * NOTE: any state can interrupted by midcomms_close() and state will be
+ * switched to CLOSED in case of fencing. There exists also some timeout
+ * handling when we receive the version detection RCOM messages which is
+ * made by observation.
+ *
+ * Future improvements:
+ *
+ * There exists some known issues/improvements of the dlm handling. Some
+ * of them should be done in a next major dlm version bump which makes
+ * it incompatible with previous versions.
+ *
+ * Unaligned memory access:
+ *
+ * There exists cases when the dlm message buffer length is not aligned
+ * to 8 byte. However seems nobody detected any problem with it. This
+ * can be fixed in the next major version bump of dlm.
+ *
+ * Version detection:
+ *
+ * The version detection and how it's done is related to backwards
+ * compatibility. There exists better ways to make a better handling.
+ * However this should be changed in the next major version bump of dlm.
+ *
+ * Ack handling:
+ *
+ * Currently we send an ack message for every dlm message. However we
+ * can ack multiple dlm messages with one ack by just delaying the ack
+ * message. Will reduce some traffic but makes the drop detection slower.
+ *
+ * Tail Size checking:
+ *
+ * There exists a message tail payload in e.g. DLM_MSG however we don't
+ * check it against the message length yet regarding to the receive buffer
+ * length. That need to be validated.
+ *
+ * Fencing bad nodes:
+ *
+ * At timeout places or weird sequence number behaviours we should send
+ * a fencing request to the cluster manager.
+ */
+
+/* Debug switch to enable a 5 seconds sleep waiting of a termination.
+ * This can be useful to test fencing while termination is running.
+ * This requires a setup with only gfs2 as dlm user, so that the
+ * last umount will terminate the connection.
+ *
+ * However it became useful to test, while the 5 seconds block in umount
+ * just press the reset button. In a lot of dropping the termination
+ * process can could take several seconds.
*/
+#define DLM_DEBUG_FENCE_TERMINATION 0
+
+#include <net/tcp.h>
#include "dlm_internal.h"
#include "lowcomms.h"
#include "config.h"
#include "lock.h"
+#include "util.h"
#include "midcomms.h"
+/* init value for sequence numbers for testing purpose only e.g. overflows */
+#define DLM_SEQ_INIT 0
+/* 3 minutes wait to sync ending of dlm */
+#define DLM_SHUTDOWN_TIMEOUT msecs_to_jiffies(3 * 60 * 1000)
+#define DLM_VERSION_NOT_SET 0
+
+struct midcomms_node {
+ int nodeid;
+ uint32_t version;
+ uint32_t seq_send;
+ uint32_t seq_next;
+ /* These queues are unbound because we cannot drop any message in dlm.
+ * We could send a fence signal for a specific node to the cluster
+ * manager if queues hits some maximum value, however this handling
+ * not supported yet.
+ */
+ struct list_head send_queue;
+ spinlock_t send_queue_lock;
+ atomic_t send_queue_cnt;
+#define DLM_NODE_FLAG_CLOSE 1
+#define DLM_NODE_FLAG_STOP_TX 2
+#define DLM_NODE_FLAG_STOP_RX 3
+ unsigned long flags;
+ wait_queue_head_t shutdown_wait;
+
+ /* dlm tcp termination state */
+#define DLM_CLOSED 1
+#define DLM_ESTABLISHED 2
+#define DLM_FIN_WAIT1 3
+#define DLM_FIN_WAIT2 4
+#define DLM_CLOSE_WAIT 5
+#define DLM_LAST_ACK 6
+#define DLM_CLOSING 7
+ int state;
+ spinlock_t state_lock;
+
+ /* counts how many lockspaces are using this node
+ * this refcount is necessary to determine if the
+ * node wants to disconnect.
+ */
+ int users;
+
+ /* not protected by srcu, node_hash lifetime */
+ void *debugfs;
+
+ struct hlist_node hlist;
+ struct rcu_head rcu;
+};
+
+struct dlm_mhandle {
+ const struct dlm_header *inner_hd;
+ struct midcomms_node *node;
+ struct dlm_opts *opts;
+ struct dlm_msg *msg;
+ bool committed;
+ uint32_t seq;
+
+ void (*ack_rcv)(struct midcomms_node *node);
+
+ /* get_mhandle/commit srcu idx exchange */
+ int idx;
+
+ struct list_head list;
+ struct rcu_head rcu;
+};
+
+static struct hlist_head node_hash[CONN_HASH_SIZE];
+static DEFINE_SPINLOCK(nodes_lock);
+DEFINE_STATIC_SRCU(nodes_srcu);
+
+/* This mutex prevents that midcomms_close() is running while
+ * stop() or remove(). As I experienced invalid memory access
+ * behaviours when DLM_DEBUG_FENCE_TERMINATION is enabled and
+ * resetting machines. I will end in some double deletion in nodes
+ * datastructure.
+ */
+static DEFINE_MUTEX(close_lock);
+
+static inline const char *dlm_state_str(int state)
+{
+ switch (state) {
+ case DLM_CLOSED:
+ return "CLOSED";
+ case DLM_ESTABLISHED:
+ return "ESTABLISHED";
+ case DLM_FIN_WAIT1:
+ return "FIN_WAIT1";
+ case DLM_FIN_WAIT2:
+ return "FIN_WAIT2";
+ case DLM_CLOSE_WAIT:
+ return "CLOSE_WAIT";
+ case DLM_LAST_ACK:
+ return "LAST_ACK";
+ case DLM_CLOSING:
+ return "CLOSING";
+ default:
+ return "UNKNOWN";
+ }
+}
+
+const char *dlm_midcomms_state(struct midcomms_node *node)
+{
+ return dlm_state_str(node->state);
+}
+
+unsigned long dlm_midcomms_flags(struct midcomms_node *node)
+{
+ return node->flags;
+}
+
+int dlm_midcomms_send_queue_cnt(struct midcomms_node *node)
+{
+ return atomic_read(&node->send_queue_cnt);
+}
+
+uint32_t dlm_midcomms_version(struct midcomms_node *node)
+{
+ return node->version;
+}
+
+static struct midcomms_node *__find_node(int nodeid, int r)
+{
+ struct midcomms_node *node;
+
+ hlist_for_each_entry_rcu(node, &node_hash[r], hlist) {
+ if (node->nodeid == nodeid)
+ return node;
+ }
+
+ return NULL;
+}
+
+static void dlm_mhandle_release(struct rcu_head *rcu)
+{
+ struct dlm_mhandle *mh = container_of(rcu, struct dlm_mhandle, rcu);
+
+ dlm_lowcomms_put_msg(mh->msg);
+ kfree(mh);
+}
+
+static void dlm_mhandle_delete(struct midcomms_node *node,
+ struct dlm_mhandle *mh)
+{
+ list_del_rcu(&mh->list);
+ atomic_dec(&node->send_queue_cnt);
+ call_rcu(&mh->rcu, dlm_mhandle_release);
+}
+
+static void dlm_send_queue_flush(struct midcomms_node *node)
+{
+ struct dlm_mhandle *mh;
+
+ pr_debug("flush midcomms send queue of node %d\n", node->nodeid);
+
+ rcu_read_lock();
+ spin_lock(&node->send_queue_lock);
+ list_for_each_entry_rcu(mh, &node->send_queue, list) {
+ dlm_mhandle_delete(node, mh);
+ }
+ spin_unlock(&node->send_queue_lock);
+ rcu_read_unlock();
+}
+
+static void midcomms_node_reset(struct midcomms_node *node)
+{
+ pr_debug("reset node %d\n", node->nodeid);
+
+ node->seq_next = DLM_SEQ_INIT;
+ node->seq_send = DLM_SEQ_INIT;
+ node->version = DLM_VERSION_NOT_SET;
+ node->flags = 0;
+
+ dlm_send_queue_flush(node);
+ node->state = DLM_CLOSED;
+ wake_up(&node->shutdown_wait);
+}
+
+static struct midcomms_node *nodeid2node(int nodeid, gfp_t alloc)
+{
+ struct midcomms_node *node, *tmp;
+ int r = nodeid_hash(nodeid);
+
+ node = __find_node(nodeid, r);
+ if (node || !alloc)
+ return node;
+
+ node = kmalloc(sizeof(*node), alloc);
+ if (!node)
+ return NULL;
+
+ node->nodeid = nodeid;
+ spin_lock_init(&node->state_lock);
+ spin_lock_init(&node->send_queue_lock);
+ atomic_set(&node->send_queue_cnt, 0);
+ INIT_LIST_HEAD(&node->send_queue);
+ init_waitqueue_head(&node->shutdown_wait);
+ node->users = 0;
+ midcomms_node_reset(node);
+
+ spin_lock(&nodes_lock);
+ /* check again if there was somebody else
+ * earlier here to add the node
+ */
+ tmp = __find_node(nodeid, r);
+ if (tmp) {
+ spin_unlock(&nodes_lock);
+ kfree(node);
+ return tmp;
+ }
+
+ hlist_add_head_rcu(&node->hlist, &node_hash[r]);
+ spin_unlock(&nodes_lock);
+
+ node->debugfs = dlm_create_debug_comms_file(nodeid, node);
+ return node;
+}
+
+static int dlm_send_ack(int nodeid, uint32_t seq)
+{
+ int mb_len = sizeof(struct dlm_header);
+ struct dlm_header *m_header;
+ struct dlm_msg *msg;
+ char *ppc;
+
+ msg = dlm_lowcomms_new_msg(nodeid, mb_len, GFP_NOFS, &ppc,
+ NULL, NULL);
+ if (!msg)
+ return -ENOMEM;
+
+ m_header = (struct dlm_header *)ppc;
+
+ m_header->h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+ m_header->h_nodeid = dlm_our_nodeid();
+ m_header->h_length = mb_len;
+ m_header->h_cmd = DLM_ACK;
+ m_header->u.h_seq = seq;
+
+ header_out(m_header);
+ dlm_lowcomms_commit_msg(msg);
+ dlm_lowcomms_put_msg(msg);
+
+ return 0;
+}
+
+static int dlm_send_fin(struct midcomms_node *node,
+ void (*ack_rcv)(struct midcomms_node *node))
+{
+ int mb_len = sizeof(struct dlm_header);
+ struct dlm_header *m_header;
+ struct dlm_mhandle *mh;
+ char *ppc;
+
+ mh = dlm_midcomms_get_mhandle(node->nodeid, mb_len, GFP_NOFS, &ppc);
+ if (!mh)
+ return -ENOMEM;
+
+ mh->ack_rcv = ack_rcv;
+
+ m_header = (struct dlm_header *)ppc;
+
+ m_header->h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+ m_header->h_nodeid = dlm_our_nodeid();
+ m_header->h_length = mb_len;
+ m_header->h_cmd = DLM_FIN;
+
+ header_out(m_header);
+
+ pr_debug("sending fin msg to node %d\n", node->nodeid);
+ dlm_midcomms_commit_mhandle(mh);
+ set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags);
+
+ return 0;
+}
+
+static void dlm_receive_ack(struct midcomms_node *node, uint32_t seq)
+{
+ struct dlm_mhandle *mh;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(mh, &node->send_queue, list) {
+ if (before(mh->seq, seq)) {
+ if (mh->ack_rcv)
+ mh->ack_rcv(node);
+ } else {
+ /* send queue should be ordered */
+ break;
+ }
+ }
+
+ spin_lock(&node->send_queue_lock);
+ list_for_each_entry_rcu(mh, &node->send_queue, list) {
+ if (before(mh->seq, seq)) {
+ dlm_mhandle_delete(node, mh);
+ } else {
+ /* send queue should be ordered */
+ break;
+ }
+ }
+ spin_unlock(&node->send_queue_lock);
+ rcu_read_unlock();
+}
+
+static void dlm_pas_fin_ack_rcv(struct midcomms_node *node)
+{
+ spin_lock(&node->state_lock);
+ pr_debug("receive passive fin ack from node %d with state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+
+ switch (node->state) {
+ case DLM_LAST_ACK:
+ /* DLM_CLOSED */
+ midcomms_node_reset(node);
+ break;
+ case DLM_CLOSED:
+ /* not valid but somehow we got what we want */
+ wake_up(&node->shutdown_wait);
+ break;
+ default:
+ spin_unlock(&node->state_lock);
+ log_print("%s: unexpected state: %d\n",
+ __func__, node->state);
+ WARN_ON(1);
+ return;
+ }
+ spin_unlock(&node->state_lock);
+}
+
+static void dlm_midcomms_receive_buffer(union dlm_packet *p,
+ struct midcomms_node *node,
+ uint32_t seq)
+{
+ if (seq == node->seq_next) {
+ node->seq_next++;
+ /* send ack before fin */
+ dlm_send_ack(node->nodeid, node->seq_next);
+
+ switch (p->header.h_cmd) {
+ case DLM_FIN:
+ spin_lock(&node->state_lock);
+ pr_debug("receive fin msg from node %d with state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+
+ switch (node->state) {
+ case DLM_ESTABLISHED:
+ node->state = DLM_CLOSE_WAIT;
+ pr_debug("switch node %d to state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ /* passive shutdown DLM_LAST_ACK case 1
+ * additional we check if the node is used by
+ * cluster manager events at all.
+ */
+ if (node->users == 0) {
+ node->state = DLM_LAST_ACK;
+ pr_debug("switch node %d to state %s case 1\n",
+ node->nodeid, dlm_state_str(node->state));
+ spin_unlock(&node->state_lock);
+ goto send_fin;
+ }
+ break;
+ case DLM_FIN_WAIT1:
+ node->state = DLM_CLOSING;
+ pr_debug("switch node %d to state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ break;
+ case DLM_FIN_WAIT2:
+ midcomms_node_reset(node);
+ pr_debug("switch node %d to state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ wake_up(&node->shutdown_wait);
+ break;
+ case DLM_LAST_ACK:
+ /* probably remove_member caught it, do nothing */
+ break;
+ default:
+ spin_unlock(&node->state_lock);
+ log_print("%s: unexpected state: %d\n",
+ __func__, node->state);
+ WARN_ON(1);
+ return;
+ }
+ spin_unlock(&node->state_lock);
+
+ set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags);
+ break;
+ default:
+ WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags));
+ dlm_receive_buffer(p, node->nodeid);
+ break;
+ }
+ } else {
+ /* retry to ack message which we already have by sending back
+ * current node->seq_next number as ack.
+ */
+ if (seq < node->seq_next)
+ dlm_send_ack(node->nodeid, node->seq_next);
+
+ log_print_ratelimited("ignore dlm msg because seq mismatch, seq: %u, expected: %u, nodeid: %d",
+ seq, node->seq_next, node->nodeid);
+ }
+
+ return;
+
+send_fin:
+ set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags);
+ dlm_send_fin(node, dlm_pas_fin_ack_rcv);
+}
+
+static struct midcomms_node *
+dlm_midcomms_recv_node_lookup(int nodeid, const union dlm_packet *p,
+ uint16_t msglen, int (*cb)(struct midcomms_node *node))
+{
+ struct midcomms_node *node = NULL;
+ gfp_t allocation = 0;
+ int ret;
+
+ switch (p->header.h_cmd) {
+ case DLM_RCOM:
+ if (msglen < sizeof(struct dlm_rcom)) {
+ log_print("rcom msg too small: %u, will skip this message from node %d",
+ msglen, nodeid);
+ return NULL;
+ }
+
+ switch (le32_to_cpu(p->rcom.rc_type)) {
+ case DLM_RCOM_NAMES:
+ fallthrough;
+ case DLM_RCOM_NAMES_REPLY:
+ fallthrough;
+ case DLM_RCOM_STATUS:
+ fallthrough;
+ case DLM_RCOM_STATUS_REPLY:
+ node = nodeid2node(nodeid, 0);
+ if (node) {
+ spin_lock(&node->state_lock);
+ if (node->state != DLM_ESTABLISHED)
+ pr_debug("receive begin RCOM msg from node %d with state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+
+ switch (node->state) {
+ case DLM_CLOSED:
+ node->state = DLM_ESTABLISHED;
+ pr_debug("switch node %d to state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ break;
+ case DLM_ESTABLISHED:
+ break;
+ default:
+ /* some invalid state passive shutdown
+ * was failed, we try to reset and
+ * hope it will go on.
+ */
+ log_print("reset node %d because shutdown stuck",
+ node->nodeid);
+
+ midcomms_node_reset(node);
+ node->state = DLM_ESTABLISHED;
+ break;
+ }
+ spin_unlock(&node->state_lock);
+ }
+
+ allocation = GFP_NOFS;
+ break;
+ default:
+ break;
+ }
+
+ break;
+ default:
+ break;
+ }
+
+ node = nodeid2node(nodeid, allocation);
+ if (!node) {
+ switch (p->header.h_cmd) {
+ case DLM_OPTS:
+ if (msglen < sizeof(struct dlm_opts)) {
+ log_print("opts msg too small: %u, will skip this message from node %d",
+ msglen, nodeid);
+ return NULL;
+ }
+
+ log_print_ratelimited("received dlm opts message nextcmd %d from node %d in an invalid sequence",
+ p->opts.o_nextcmd, nodeid);
+ break;
+ default:
+ log_print_ratelimited("received dlm message cmd %d from node %d in an invalid sequence",
+ p->header.h_cmd, nodeid);
+ break;
+ }
+
+ return NULL;
+ }
+
+ ret = cb(node);
+ if (ret < 0)
+ return NULL;
+
+ return node;
+}
+
+static int dlm_midcomms_version_check_3_2(struct midcomms_node *node)
+{
+ switch (node->version) {
+ case DLM_VERSION_NOT_SET:
+ node->version = DLM_VERSION_3_2;
+ log_print("version 0x%08x for node %d detected", DLM_VERSION_3_2,
+ node->nodeid);
+ break;
+ case DLM_VERSION_3_2:
+ break;
+ default:
+ log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x",
+ DLM_VERSION_3_2, node->nodeid, node->version);
+ return -1;
+ }
+
+ return 0;
+}
+
+static int dlm_opts_check_msglen(union dlm_packet *p, uint16_t msglen, int nodeid)
+{
+ int len = msglen;
+
+ /* we only trust outer header msglen because
+ * it's checked against receive buffer length.
+ */
+ if (len < sizeof(struct dlm_opts))
+ return -1;
+ len -= sizeof(struct dlm_opts);
+
+ if (len < le16_to_cpu(p->opts.o_optlen))
+ return -1;
+ len -= le16_to_cpu(p->opts.o_optlen);
+
+ switch (p->opts.o_nextcmd) {
+ case DLM_FIN:
+ if (len < sizeof(struct dlm_header)) {
+ log_print("fin too small: %d, will skip this message from node %d",
+ len, nodeid);
+ return -1;
+ }
+
+ break;
+ case DLM_MSG:
+ if (len < sizeof(struct dlm_message)) {
+ log_print("msg too small: %d, will skip this message from node %d",
+ msglen, nodeid);
+ return -1;
+ }
+
+ break;
+ case DLM_RCOM:
+ if (len < sizeof(struct dlm_rcom)) {
+ log_print("rcom msg too small: %d, will skip this message from node %d",
+ len, nodeid);
+ return -1;
+ }
+
+ break;
+ default:
+ log_print("unsupported o_nextcmd received: %u, will skip this message from node %d",
+ p->opts.o_nextcmd, nodeid);
+ return -1;
+ }
+
+ return 0;
+}
+
+static void dlm_midcomms_receive_buffer_3_2(union dlm_packet *p, int nodeid)
+{
+ uint16_t msglen = le16_to_cpu(p->header.h_length);
+ struct midcomms_node *node;
+ uint32_t seq;
+ int ret, idx;
+
+ idx = srcu_read_lock(&nodes_srcu);
+ node = dlm_midcomms_recv_node_lookup(nodeid, p, msglen,
+ dlm_midcomms_version_check_3_2);
+ if (!node)
+ goto out;
+
+ switch (p->header.h_cmd) {
+ case DLM_RCOM:
+ /* these rcom message we use to determine version.
+ * they have their own retransmission handling and
+ * are the first messages of dlm.
+ *
+ * length already checked.
+ */
+ switch (le32_to_cpu(p->rcom.rc_type)) {
+ case DLM_RCOM_NAMES:
+ fallthrough;
+ case DLM_RCOM_NAMES_REPLY:
+ fallthrough;
+ case DLM_RCOM_STATUS:
+ fallthrough;
+ case DLM_RCOM_STATUS_REPLY:
+ break;
+ default:
+ log_print("unsupported rcom type received: %u, will skip this message from node %d",
+ le32_to_cpu(p->rcom.rc_type), nodeid);
+ goto out;
+ }
+
+ WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags));
+ dlm_receive_buffer(p, nodeid);
+ break;
+ case DLM_OPTS:
+ seq = le32_to_cpu(p->header.u.h_seq);
+
+ ret = dlm_opts_check_msglen(p, msglen, nodeid);
+ if (ret < 0) {
+ log_print("opts msg too small: %u, will skip this message from node %d",
+ msglen, nodeid);
+ goto out;
+ }
+
+ p = (union dlm_packet *)((unsigned char *)p->opts.o_opts +
+ le16_to_cpu(p->opts.o_optlen));
+
+ /* recheck inner msglen just if it's not garbage */
+ msglen = le16_to_cpu(p->header.h_length);
+ switch (p->header.h_cmd) {
+ case DLM_RCOM:
+ if (msglen < sizeof(struct dlm_rcom)) {
+ log_print("inner rcom msg too small: %u, will skip this message from node %d",
+ msglen, nodeid);
+ goto out;
+ }
+
+ break;
+ case DLM_MSG:
+ if (msglen < sizeof(struct dlm_message)) {
+ log_print("inner msg too small: %u, will skip this message from node %d",
+ msglen, nodeid);
+ goto out;
+ }
+
+ break;
+ case DLM_FIN:
+ if (msglen < sizeof(struct dlm_header)) {
+ log_print("inner fin too small: %u, will skip this message from node %d",
+ msglen, nodeid);
+ goto out;
+ }
+
+ break;
+ default:
+ log_print("unsupported inner h_cmd received: %u, will skip this message from node %d",
+ msglen, nodeid);
+ goto out;
+ }
+
+ dlm_midcomms_receive_buffer(p, node, seq);
+ break;
+ case DLM_ACK:
+ seq = le32_to_cpu(p->header.u.h_seq);
+ dlm_receive_ack(node, seq);
+ break;
+ default:
+ log_print("unsupported h_cmd received: %u, will skip this message from node %d",
+ p->header.h_cmd, nodeid);
+ break;
+ }
+
+out:
+ srcu_read_unlock(&nodes_srcu, idx);
+}
+
+static int dlm_midcomms_version_check_3_1(struct midcomms_node *node)
+{
+ switch (node->version) {
+ case DLM_VERSION_NOT_SET:
+ node->version = DLM_VERSION_3_1;
+ log_print("version 0x%08x for node %d detected", DLM_VERSION_3_1,
+ node->nodeid);
+ break;
+ case DLM_VERSION_3_1:
+ break;
+ default:
+ log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x",
+ DLM_VERSION_3_1, node->nodeid, node->version);
+ return -1;
+ }
+
+ return 0;
+}
+
+static void dlm_midcomms_receive_buffer_3_1(union dlm_packet *p, int nodeid)
+{
+ uint16_t msglen = le16_to_cpu(p->header.h_length);
+ struct midcomms_node *node;
+ int idx;
+
+ idx = srcu_read_lock(&nodes_srcu);
+ node = dlm_midcomms_recv_node_lookup(nodeid, p, msglen,
+ dlm_midcomms_version_check_3_1);
+ if (!node) {
+ srcu_read_unlock(&nodes_srcu, idx);
+ return;
+ }
+ srcu_read_unlock(&nodes_srcu, idx);
+
+ switch (p->header.h_cmd) {
+ case DLM_RCOM:
+ /* length already checked */
+ break;
+ case DLM_MSG:
+ if (msglen < sizeof(struct dlm_message)) {
+ log_print("msg too small: %u, will skip this message from node %d",
+ msglen, nodeid);
+ return;
+ }
+
+ break;
+ default:
+ log_print("unsupported h_cmd received: %u, will skip this message from node %d",
+ p->header.h_cmd, nodeid);
+ return;
+ }
+
+ dlm_receive_buffer(p, nodeid);
+}
+
/*
* Called from the low-level comms layer to process a buffer of
* commands.
@@ -43,7 +887,7 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
while (len >= sizeof(struct dlm_header)) {
hd = (struct dlm_header *)ptr;
- /* no message should be more than DEFAULT_BUFFER_SIZE or
+ /* no message should be more than DLM_MAX_SOCKET_BUFSIZE or
* less than dlm_header size.
*
* Some messages does not have a 8 byte length boundary yet
@@ -55,7 +899,7 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
* the next major version bump.
*/
msglen = le16_to_cpu(hd->h_length);
- if (msglen > DEFAULT_BUFFER_SIZE ||
+ if (msglen > DLM_MAX_SOCKET_BUFSIZE ||
msglen < sizeof(struct dlm_header)) {
log_print("received invalid length header: %u from node %d, will abort message parsing",
msglen, nodeid);
@@ -68,32 +912,19 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
if (msglen > len)
break;
- switch (hd->h_cmd) {
- case DLM_MSG:
- if (msglen < sizeof(struct dlm_message)) {
- log_print("dlm msg too small: %u, will skip this message",
- msglen);
- goto skip;
- }
-
+ switch (le32_to_cpu(hd->h_version)) {
+ case DLM_VERSION_3_1:
+ dlm_midcomms_receive_buffer_3_1((union dlm_packet *)ptr, nodeid);
break;
- case DLM_RCOM:
- if (msglen < sizeof(struct dlm_rcom)) {
- log_print("dlm rcom msg too small: %u, will skip this message",
- msglen);
- goto skip;
- }
-
+ case DLM_VERSION_3_2:
+ dlm_midcomms_receive_buffer_3_2((union dlm_packet *)ptr, nodeid);
break;
default:
- log_print("unsupported h_cmd received: %u, will skip this message",
- hd->h_cmd);
- goto skip;
+ log_print("received invalid version header: %u from node %d, will skip this message",
+ le32_to_cpu(hd->h_version), nodeid);
+ break;
}
- dlm_receive_buffer((union dlm_packet *)ptr, nodeid);
-
-skip:
ret += msglen;
len -= msglen;
ptr += msglen;
@@ -102,3 +933,455 @@ skip:
return ret;
}
+void dlm_midcomms_unack_msg_resend(int nodeid)
+{
+ struct midcomms_node *node;
+ struct dlm_mhandle *mh;
+ int idx, ret;
+
+ idx = srcu_read_lock(&nodes_srcu);
+ node = nodeid2node(nodeid, 0);
+ if (!node) {
+ srcu_read_unlock(&nodes_srcu, idx);
+ return;
+ }
+
+ /* old protocol, we don't support to retransmit on failure */
+ switch (node->version) {
+ case DLM_VERSION_3_2:
+ break;
+ default:
+ srcu_read_unlock(&nodes_srcu, idx);
+ return;
+ }
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(mh, &node->send_queue, list) {
+ if (!mh->committed)
+ continue;
+
+ ret = dlm_lowcomms_resend_msg(mh->msg);
+ if (!ret)
+ log_print_ratelimited("retransmit dlm msg, seq %u, nodeid %d",
+ mh->seq, node->nodeid);
+ }
+ rcu_read_unlock();
+ srcu_read_unlock(&nodes_srcu, idx);
+}
+
+static void dlm_fill_opts_header(struct dlm_opts *opts, uint16_t inner_len,
+ uint32_t seq)
+{
+ opts->o_header.h_cmd = DLM_OPTS;
+ opts->o_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+ opts->o_header.h_nodeid = dlm_our_nodeid();
+ opts->o_header.h_length = DLM_MIDCOMMS_OPT_LEN + inner_len;
+ opts->o_header.u.h_seq = seq;
+ header_out(&opts->o_header);
+}
+
+static void midcomms_new_msg_cb(struct dlm_mhandle *mh)
+{
+ atomic_inc(&mh->node->send_queue_cnt);
+
+ spin_lock(&mh->node->send_queue_lock);
+ list_add_tail_rcu(&mh->list, &mh->node->send_queue);
+ spin_unlock(&mh->node->send_queue_lock);
+
+ mh->seq = mh->node->seq_send++;
+}
+
+static struct dlm_msg *dlm_midcomms_get_msg_3_2(struct dlm_mhandle *mh, int nodeid,
+ int len, gfp_t allocation, char **ppc)
+{
+ struct dlm_opts *opts;
+ struct dlm_msg *msg;
+
+ msg = dlm_lowcomms_new_msg(nodeid, len + DLM_MIDCOMMS_OPT_LEN,
+ allocation, ppc, midcomms_new_msg_cb, mh);
+ if (!msg)
+ return NULL;
+
+ opts = (struct dlm_opts *)*ppc;
+ mh->opts = opts;
+
+ /* add possible options here */
+ dlm_fill_opts_header(opts, len, mh->seq);
+
+ *ppc += sizeof(*opts);
+ mh->inner_hd = (const struct dlm_header *)*ppc;
+ return msg;
+}
+
+struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
+ gfp_t allocation, char **ppc)
+{
+ struct midcomms_node *node;
+ struct dlm_mhandle *mh;
+ struct dlm_msg *msg;
+ int idx;
+
+ idx = srcu_read_lock(&nodes_srcu);
+ node = nodeid2node(nodeid, 0);
+ if (!node) {
+ WARN_ON_ONCE(1);
+ goto err;
+ }
+
+ /* this is a bug, however we going on and hope it will be resolved */
+ WARN_ON(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags));
+
+ mh = kzalloc(sizeof(*mh), GFP_NOFS);
+ if (!mh)
+ goto err;
+
+ mh->idx = idx;
+ mh->node = node;
+
+ switch (node->version) {
+ case DLM_VERSION_3_1:
+ msg = dlm_lowcomms_new_msg(nodeid, len, allocation, ppc,
+ NULL, NULL);
+ if (!msg) {
+ kfree(mh);
+ goto err;
+ }
+
+ break;
+ case DLM_VERSION_3_2:
+ msg = dlm_midcomms_get_msg_3_2(mh, nodeid, len, allocation,
+ ppc);
+ if (!msg) {
+ kfree(mh);
+ goto err;
+ }
+
+ break;
+ default:
+ kfree(mh);
+ WARN_ON(1);
+ goto err;
+ }
+
+ mh->msg = msg;
+
+ /* keep in mind that is a must to call
+ * dlm_midcomms_commit_msg() which releases
+ * nodes_srcu using mh->idx which is assumed
+ * here that the application will call it.
+ */
+ return mh;
+
+err:
+ srcu_read_unlock(&nodes_srcu, idx);
+ return NULL;
+}
+
+static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh)
+{
+ /* nexthdr chain for fast lookup */
+ mh->opts->o_nextcmd = mh->inner_hd->h_cmd;
+ mh->committed = true;
+ dlm_lowcomms_commit_msg(mh->msg);
+}
+
+void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh)
+{
+ switch (mh->node->version) {
+ case DLM_VERSION_3_1:
+ srcu_read_unlock(&nodes_srcu, mh->idx);
+
+ dlm_lowcomms_commit_msg(mh->msg);
+ dlm_lowcomms_put_msg(mh->msg);
+ /* mh is not part of rcu list in this case */
+ kfree(mh);
+ break;
+ case DLM_VERSION_3_2:
+ dlm_midcomms_commit_msg_3_2(mh);
+ srcu_read_unlock(&nodes_srcu, mh->idx);
+ break;
+ default:
+ srcu_read_unlock(&nodes_srcu, mh->idx);
+ WARN_ON(1);
+ break;
+ }
+}
+
+int dlm_midcomms_start(void)
+{
+ int i;
+
+ for (i = 0; i < CONN_HASH_SIZE; i++)
+ INIT_HLIST_HEAD(&node_hash[i]);
+
+ return dlm_lowcomms_start();
+}
+
+static void dlm_act_fin_ack_rcv(struct midcomms_node *node)
+{
+ spin_lock(&node->state_lock);
+ pr_debug("receive active fin ack from node %d with state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+
+ switch (node->state) {
+ case DLM_FIN_WAIT1:
+ node->state = DLM_FIN_WAIT2;
+ pr_debug("switch node %d to state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ break;
+ case DLM_CLOSING:
+ midcomms_node_reset(node);
+ pr_debug("switch node %d to state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ wake_up(&node->shutdown_wait);
+ break;
+ case DLM_CLOSED:
+ /* not valid but somehow we got what we want */
+ wake_up(&node->shutdown_wait);
+ break;
+ default:
+ spin_unlock(&node->state_lock);
+ log_print("%s: unexpected state: %d\n",
+ __func__, node->state);
+ WARN_ON(1);
+ return;
+ }
+ spin_unlock(&node->state_lock);
+}
+
+void dlm_midcomms_add_member(int nodeid)
+{
+ struct midcomms_node *node;
+ int idx;
+
+ if (nodeid == dlm_our_nodeid())
+ return;
+
+ idx = srcu_read_lock(&nodes_srcu);
+ node = nodeid2node(nodeid, GFP_NOFS);
+ if (!node) {
+ srcu_read_unlock(&nodes_srcu, idx);
+ return;
+ }
+
+ spin_lock(&node->state_lock);
+ if (!node->users) {
+ pr_debug("receive add member from node %d with state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ switch (node->state) {
+ case DLM_ESTABLISHED:
+ break;
+ case DLM_CLOSED:
+ node->state = DLM_ESTABLISHED;
+ pr_debug("switch node %d to state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ break;
+ default:
+ /* some invalid state passive shutdown
+ * was failed, we try to reset and
+ * hope it will go on.
+ */
+ log_print("reset node %d because shutdown stuck",
+ node->nodeid);
+
+ midcomms_node_reset(node);
+ node->state = DLM_ESTABLISHED;
+ break;
+ }
+ }
+
+ node->users++;
+ pr_debug("users inc count %d\n", node->users);
+ spin_unlock(&node->state_lock);
+
+ srcu_read_unlock(&nodes_srcu, idx);
+}
+
+void dlm_midcomms_remove_member(int nodeid)
+{
+ struct midcomms_node *node;
+ int idx;
+
+ if (nodeid == dlm_our_nodeid())
+ return;
+
+ idx = srcu_read_lock(&nodes_srcu);
+ node = nodeid2node(nodeid, 0);
+ if (!node) {
+ srcu_read_unlock(&nodes_srcu, idx);
+ return;
+ }
+
+ spin_lock(&node->state_lock);
+ node->users--;
+ pr_debug("users dec count %d\n", node->users);
+
+ /* hitting users count to zero means the
+ * other side is running dlm_midcomms_stop()
+ * we meet us to have a clean disconnect.
+ */
+ if (node->users == 0) {
+ pr_debug("receive remove member from node %d with state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ switch (node->state) {
+ case DLM_ESTABLISHED:
+ break;
+ case DLM_CLOSE_WAIT:
+ /* passive shutdown DLM_LAST_ACK case 2 */
+ node->state = DLM_LAST_ACK;
+ spin_unlock(&node->state_lock);
+
+ pr_debug("switch node %d to state %s case 2\n",
+ node->nodeid, dlm_state_str(node->state));
+ goto send_fin;
+ case DLM_LAST_ACK:
+ /* probably receive fin caught it, do nothing */
+ break;
+ case DLM_CLOSED:
+ /* already gone, do nothing */
+ break;
+ default:
+ log_print("%s: unexpected state: %d\n",
+ __func__, node->state);
+ break;
+ }
+ }
+ spin_unlock(&node->state_lock);
+
+ srcu_read_unlock(&nodes_srcu, idx);
+ return;
+
+send_fin:
+ set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags);
+ dlm_send_fin(node, dlm_pas_fin_ack_rcv);
+ srcu_read_unlock(&nodes_srcu, idx);
+}
+
+static void midcomms_node_release(struct rcu_head *rcu)
+{
+ struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu);
+
+ WARN_ON(atomic_read(&node->send_queue_cnt));
+ kfree(node);
+}
+
+static void midcomms_shutdown(struct midcomms_node *node)
+{
+ int ret;
+
+ /* old protocol, we don't wait for pending operations */
+ switch (node->version) {
+ case DLM_VERSION_3_2:
+ break;
+ default:
+ return;
+ }
+
+ spin_lock(&node->state_lock);
+ pr_debug("receive active shutdown for node %d with state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ switch (node->state) {
+ case DLM_ESTABLISHED:
+ node->state = DLM_FIN_WAIT1;
+ pr_debug("switch node %d to state %s case 2\n",
+ node->nodeid, dlm_state_str(node->state));
+ break;
+ case DLM_CLOSED:
+ /* we have what we want */
+ spin_unlock(&node->state_lock);
+ return;
+ default:
+ /* busy to enter DLM_FIN_WAIT1, wait until passive
+ * done in shutdown_wait to enter DLM_CLOSED.
+ */
+ break;
+ }
+ spin_unlock(&node->state_lock);
+
+ if (node->state == DLM_FIN_WAIT1) {
+ dlm_send_fin(node, dlm_act_fin_ack_rcv);
+
+ if (DLM_DEBUG_FENCE_TERMINATION)
+ msleep(5000);
+ }
+
+ /* wait for other side dlm + fin */
+ ret = wait_event_timeout(node->shutdown_wait,
+ node->state == DLM_CLOSED ||
+ test_bit(DLM_NODE_FLAG_CLOSE, &node->flags),
+ DLM_SHUTDOWN_TIMEOUT);
+ if (!ret || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags)) {
+ pr_debug("active shutdown timed out for node %d with state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ midcomms_node_reset(node);
+ return;
+ }
+
+ pr_debug("active shutdown done for node %d with state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+}
+
+void dlm_midcomms_shutdown(void)
+{
+ struct midcomms_node *node;
+ int i, idx;
+
+ mutex_lock(&close_lock);
+ idx = srcu_read_lock(&nodes_srcu);
+ for (i = 0; i < CONN_HASH_SIZE; i++) {
+ hlist_for_each_entry_rcu(node, &node_hash[i], hlist) {
+ midcomms_shutdown(node);
+
+ dlm_delete_debug_comms_file(node->debugfs);
+
+ spin_lock(&nodes_lock);
+ hlist_del_rcu(&node->hlist);
+ spin_unlock(&nodes_lock);
+
+ call_srcu(&nodes_srcu, &node->rcu, midcomms_node_release);
+ }
+ }
+ srcu_read_unlock(&nodes_srcu, idx);
+ mutex_unlock(&close_lock);
+
+ dlm_lowcomms_shutdown();
+}
+
+int dlm_midcomms_close(int nodeid)
+{
+ struct midcomms_node *node;
+ int idx, ret;
+
+ if (nodeid == dlm_our_nodeid())
+ return 0;
+
+ idx = srcu_read_lock(&nodes_srcu);
+ /* Abort pending close/remove operation */
+ node = nodeid2node(nodeid, 0);
+ if (node) {
+ /* let shutdown waiters leave */
+ set_bit(DLM_NODE_FLAG_CLOSE, &node->flags);
+ wake_up(&node->shutdown_wait);
+ }
+ srcu_read_unlock(&nodes_srcu, idx);
+
+ synchronize_srcu(&nodes_srcu);
+
+ idx = srcu_read_lock(&nodes_srcu);
+ mutex_lock(&close_lock);
+ node = nodeid2node(nodeid, 0);
+ if (!node) {
+ mutex_unlock(&close_lock);
+ srcu_read_unlock(&nodes_srcu, idx);
+ return dlm_lowcomms_close(nodeid);
+ }
+
+ ret = dlm_lowcomms_close(nodeid);
+ spin_lock(&node->state_lock);
+ midcomms_node_reset(node);
+ spin_unlock(&node->state_lock);
+ srcu_read_unlock(&nodes_srcu, idx);
+ mutex_unlock(&close_lock);
+
+ return ret;
+}
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
index 61e90a921849..579abc6929be 100644
--- a/fs/dlm/midcomms.h
+++ b/fs/dlm/midcomms.h
@@ -12,7 +12,22 @@
#ifndef __MIDCOMMS_DOT_H__
#define __MIDCOMMS_DOT_H__
+struct midcomms_node;
+
int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int buflen);
+struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
+ gfp_t allocation, char **ppc);
+void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh);
+int dlm_midcomms_close(int nodeid);
+int dlm_midcomms_start(void);
+void dlm_midcomms_shutdown(void);
+void dlm_midcomms_add_member(int nodeid);
+void dlm_midcomms_remove_member(int nodeid);
+void dlm_midcomms_unack_msg_resend(int nodeid);
+const char *dlm_midcomms_state(struct midcomms_node *node);
+unsigned long dlm_midcomms_flags(struct midcomms_node *node);
+int dlm_midcomms_send_queue_cnt(struct midcomms_node *node);
+uint32_t dlm_midcomms_version(struct midcomms_node *node);
#endif /* __MIDCOMMS_DOT_H__ */
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index f5b1bd65728d..5651933f54a4 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -27,25 +27,15 @@ static int rcom_response(struct dlm_ls *ls)
return test_bit(LSFL_RCOM_READY, &ls->ls_flags);
}
-static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
- struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret)
+static void _create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
+ struct dlm_rcom **rc_ret, char *mb, int mb_len)
{
struct dlm_rcom *rc;
- struct dlm_mhandle *mh;
- char *mb;
- int mb_len = sizeof(struct dlm_rcom) + len;
-
- mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb);
- if (!mh) {
- log_print("create_rcom to %d type %d len %d ENOBUFS",
- to_nodeid, type, len);
- return -ENOBUFS;
- }
rc = (struct dlm_rcom *) mb;
rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
- rc->rc_header.h_lockspace = ls->ls_global_id;
+ rc->rc_header.u.h_lockspace = ls->ls_global_id;
rc->rc_header.h_nodeid = dlm_our_nodeid();
rc->rc_header.h_length = mb_len;
rc->rc_header.h_cmd = DLM_RCOM;
@@ -56,16 +46,67 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
rc->rc_seq = ls->ls_recover_seq;
spin_unlock(&ls->ls_recover_lock);
- *mh_ret = mh;
*rc_ret = rc;
+}
+
+static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
+ struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret)
+{
+ int mb_len = sizeof(struct dlm_rcom) + len;
+ struct dlm_mhandle *mh;
+ char *mb;
+
+ mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, GFP_NOFS, &mb);
+ if (!mh) {
+ log_print("%s to %d type %d len %d ENOBUFS",
+ __func__, to_nodeid, type, len);
+ return -ENOBUFS;
+ }
+
+ _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len);
+ *mh_ret = mh;
+ return 0;
+}
+
+static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type,
+ int len, struct dlm_rcom **rc_ret,
+ struct dlm_msg **msg_ret)
+{
+ int mb_len = sizeof(struct dlm_rcom) + len;
+ struct dlm_msg *msg;
+ char *mb;
+
+ msg = dlm_lowcomms_new_msg(to_nodeid, mb_len, GFP_NOFS, &mb,
+ NULL, NULL);
+ if (!msg) {
+ log_print("create_rcom to %d type %d len %d ENOBUFS",
+ to_nodeid, type, len);
+ return -ENOBUFS;
+ }
+
+ _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len);
+ *msg_ret = msg;
return 0;
}
+static void _send_rcom(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+ dlm_rcom_out(rc);
+}
+
static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
struct dlm_rcom *rc)
{
- dlm_rcom_out(rc);
- dlm_lowcomms_commit_buffer(mh);
+ _send_rcom(ls, rc);
+ dlm_midcomms_commit_mhandle(mh);
+}
+
+static void send_rcom_stateless(struct dlm_ls *ls, struct dlm_msg *msg,
+ struct dlm_rcom *rc)
+{
+ _send_rcom(ls, rc);
+ dlm_lowcomms_commit_msg(msg);
+ dlm_lowcomms_put_msg(msg);
}
static void set_rcom_status(struct dlm_ls *ls, struct rcom_status *rs,
@@ -141,7 +182,7 @@ static void disallow_sync_reply(struct dlm_ls *ls)
int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags)
{
struct dlm_rcom *rc;
- struct dlm_mhandle *mh;
+ struct dlm_msg *msg;
int error = 0;
ls->ls_recover_nodeid = nodeid;
@@ -153,17 +194,17 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags)
}
retry:
- error = create_rcom(ls, nodeid, DLM_RCOM_STATUS,
- sizeof(struct rcom_status), &rc, &mh);
+ error = create_rcom_stateless(ls, nodeid, DLM_RCOM_STATUS,
+ sizeof(struct rcom_status), &rc, &msg);
if (error)
goto out;
set_rcom_status(ls, (struct rcom_status *)rc->rc_buf, status_flags);
allow_sync_reply(ls, &rc->rc_id);
- memset(ls->ls_recover_buf, 0, LOWCOMMS_MAX_TX_BUFFER_LEN);
+ memset(ls->ls_recover_buf, 0, DLM_MAX_SOCKET_BUFSIZE);
- send_rcom(ls, mh, rc);
+ send_rcom_stateless(ls, msg, rc);
error = dlm_wait_function(ls, &rcom_response);
disallow_sync_reply(ls);
@@ -191,11 +232,11 @@ retry:
static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
struct dlm_rcom *rc;
- struct dlm_mhandle *mh;
struct rcom_status *rs;
uint32_t status;
int nodeid = rc_in->rc_header.h_nodeid;
int len = sizeof(struct rcom_config);
+ struct dlm_msg *msg;
int num_slots = 0;
int error;
@@ -218,8 +259,8 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
len += num_slots * sizeof(struct rcom_slot);
do_create:
- error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY,
- len, &rc, &mh);
+ error = create_rcom_stateless(ls, nodeid, DLM_RCOM_STATUS_REPLY,
+ len, &rc, &msg);
if (error)
return;
@@ -246,7 +287,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
spin_unlock(&ls->ls_recover_lock);
do_send:
- send_rcom(ls, mh, rc);
+ send_rcom_stateless(ls, msg, rc);
}
static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
@@ -271,21 +312,22 @@ static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
{
struct dlm_rcom *rc;
- struct dlm_mhandle *mh;
+ struct dlm_msg *msg;
int error = 0;
ls->ls_recover_nodeid = nodeid;
retry:
- error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh);
+ error = create_rcom_stateless(ls, nodeid, DLM_RCOM_NAMES, last_len,
+ &rc, &msg);
if (error)
goto out;
memcpy(rc->rc_buf, last_name, last_len);
allow_sync_reply(ls, &rc->rc_id);
- memset(ls->ls_recover_buf, 0, LOWCOMMS_MAX_TX_BUFFER_LEN);
+ memset(ls->ls_recover_buf, 0, DLM_MAX_SOCKET_BUFSIZE);
- send_rcom(ls, mh, rc);
+ send_rcom_stateless(ls, msg, rc);
error = dlm_wait_function(ls, &rcom_response);
disallow_sync_reply(ls);
@@ -298,14 +340,15 @@ retry:
static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
struct dlm_rcom *rc;
- struct dlm_mhandle *mh;
int error, inlen, outlen, nodeid;
+ struct dlm_msg *msg;
nodeid = rc_in->rc_header.h_nodeid;
inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
- outlen = LOWCOMMS_MAX_TX_BUFFER_LEN - sizeof(struct dlm_rcom);
+ outlen = DLM_MAX_APP_BUFSIZE - sizeof(struct dlm_rcom);
- error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh);
+ error = create_rcom_stateless(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen,
+ &rc, &msg);
if (error)
return;
rc->rc_id = rc_in->rc_id;
@@ -313,7 +356,7 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
nodeid);
- send_rcom(ls, mh, rc);
+ send_rcom_stateless(ls, msg, rc);
}
int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
@@ -342,10 +385,6 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid;
int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
- error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh);
- if (error)
- return;
-
/* Old code would send this special id to trigger a debug dump. */
if (rc_in->rc_id == 0xFFFFFFFF) {
log_error(ls, "receive_rcom_lookup dump from %d", nodeid);
@@ -353,6 +392,10 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
return;
}
+ error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh);
+ if (error)
+ return;
+
error = dlm_master_lookup(ls, nodeid, rc_in->rc_buf, len,
DLM_LU_RECOVER_MASTER, &ret_nodeid, NULL);
if (error)
@@ -458,14 +501,14 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
char *mb;
int mb_len = sizeof(struct dlm_rcom) + sizeof(struct rcom_config);
- mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_NOFS, &mb);
+ mh = dlm_midcomms_get_mhandle(nodeid, mb_len, GFP_NOFS, &mb);
if (!mh)
return -ENOBUFS;
rc = (struct dlm_rcom *) mb;
rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
- rc->rc_header.h_lockspace = rc_in->rc_header.h_lockspace;
+ rc->rc_header.u.h_lockspace = rc_in->rc_header.u.h_lockspace;
rc->rc_header.h_nodeid = dlm_our_nodeid();
rc->rc_header.h_length = mb_len;
rc->rc_header.h_cmd = DLM_RCOM;
@@ -479,7 +522,7 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
rf->rf_lvblen = cpu_to_le32(~0U);
dlm_rcom_out(rc);
- dlm_lowcomms_commit_buffer(mh);
+ dlm_midcomms_commit_mhandle(mh);
return 0;
}
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
index cfd0d00b19ae..58acbcc2081a 100644
--- a/fs/dlm/util.c
+++ b/fs/dlm/util.c
@@ -20,18 +20,20 @@
#define DLM_ERRNO_ETIMEDOUT 110
#define DLM_ERRNO_EINPROGRESS 115
-static void header_out(struct dlm_header *hd)
+void header_out(struct dlm_header *hd)
{
hd->h_version = cpu_to_le32(hd->h_version);
- hd->h_lockspace = cpu_to_le32(hd->h_lockspace);
+ /* does it for others u32 in union as well */
+ hd->u.h_lockspace = cpu_to_le32(hd->u.h_lockspace);
hd->h_nodeid = cpu_to_le32(hd->h_nodeid);
hd->h_length = cpu_to_le16(hd->h_length);
}
-static void header_in(struct dlm_header *hd)
+void header_in(struct dlm_header *hd)
{
hd->h_version = le32_to_cpu(hd->h_version);
- hd->h_lockspace = le32_to_cpu(hd->h_lockspace);
+ /* does it for others u32 in union as well */
+ hd->u.h_lockspace = le32_to_cpu(hd->u.h_lockspace);
hd->h_nodeid = le32_to_cpu(hd->h_nodeid);
hd->h_length = le16_to_cpu(hd->h_length);
}
diff --git a/fs/dlm/util.h b/fs/dlm/util.h
index cc719ca9397e..d46f23c7a6a0 100644
--- a/fs/dlm/util.h
+++ b/fs/dlm/util.h
@@ -15,6 +15,8 @@ void dlm_message_out(struct dlm_message *ms);
void dlm_message_in(struct dlm_message *ms);
void dlm_rcom_out(struct dlm_rcom *rc);
void dlm_rcom_in(struct dlm_rcom *rc);
+void header_out(struct dlm_header *hd);
+void header_in(struct dlm_header *hd);
#endif