From: Jeff Liu <jeff.liu@oracle.com>
To: rds-devel@oss.oracle.com
Cc: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com>,
Dan Carpenter <dan.carpenter@oracle.com>,
davem@davemloft.net, James Morris <james.l.morris@oracle.com>,
netdev@vger.kernel.org
Subject: [PATCH] RDS: Fix spinlock recursion for rds over tcp transmit
Date: Sat, 06 Oct 2012 13:42:37 +0800
Message-ID: <506FC4CD.7070509@oracle.com>
Hello,

The RDS ping/pong over TCP feature has been broken for years (2.6.39 through 3.6.0): to reply to a ping request we have to set TCP cork and call kernel_sendmsg(), both of which need to lock "struct sock *sk". However, that lock is already held by the time our rds_tcp_data_ready() callback is triggered. As a result we always run into spinlock recursion, which leads to a system panic...
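Roughly, the recursive path looks like this (the intermediate frames are from my reading of the 3.x net/rds code, so treat the exact names as illustrative rather than verified against every kernel in that range):

  tcp softirq receive path               (sock lock already held)
    -> sk->sk_data_ready()               == rds_tcp_data_ready()
      -> tcp_read_sock() -> rds_recv_incoming()
        -> rds_send_pong()
          -> rds_send_xmit()
            -> rds_tcp_xmit()
              -> kernel_sendmsg() / TCP cork
                -> lock_sock(sk)         <- tries to take the same lock again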
Given that an RDS ping is a special kind of message that does not have to be answered immediately, IMHO we can schedule the reply onto a work queue as a delayed response, which makes the TCP transport work properly. I also think we can use the system default work queue for this, to reduce the possible impact on general TCP transmission.
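In rough outline (the real code is in the patch below), the receive-side hook now only queues the request and kicks a work item, because it runs in softirq context with the sock lock held and therefore must not sleep or re-take that lock; the actual reply is sent later from process context:

  /* sketch only; see rds_tcp_pong() in the patch for the full version */
  tp = kmalloc(sizeof(*tp), GFP_ATOMIC);    /* atomic context: no sleeping */
  tp->tp_conn = conn;
  tp->tp_dport = dport;
  spin_lock(&rds_tcp_pong_lock);
  list_add_tail(&tp->tp_node, &rds_tcp_pong_list);
  spin_unlock(&rds_tcp_pong_lock);
  schedule_work(&rds_tcp_pong_work);        /* reply later, outside the sock lock */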
With the patch below I have run rds-ping (multiple pings between two hosts at the same time) and rds-stress (host A listening, host B sending packets) for half a day, and it works for me.
Thanks,
-Jeff
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
CC: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com>
CC: David S. Miller <davem@davemloft.net>
CC: James Morris <james.l.morris@oracle.com>
Signed-off-by: Jie Liu <jeff.liu@oracle.com>
---
net/rds/send.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
net/rds/tcp.h | 7 +++++
2 files changed, 82 insertions(+), 5 deletions(-)
diff --git a/net/rds/send.c b/net/rds/send.c
index 96531d4..011006e 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -38,8 +38,10 @@
#include <linux/list.h>
#include <linux/ratelimit.h>
#include <linux/export.h>
+#include <linux/workqueue.h>
#include "rds.h"
+#include "tcp.h"
/* When transmitting messages in rds_send_xmit, we need to emerge from
* time to time and briefly release the CPU. Otherwise the softlock watchdog
@@ -55,6 +57,12 @@ static int send_batch_count = 64;
module_param(send_batch_count, int, 0444);
MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
+/* RDS over TCP ping/pong */
+static void rds_tcp_pong_worker(struct work_struct *work);
+static DEFINE_SPINLOCK(rds_tcp_pong_lock);
+static LIST_HEAD(rds_tcp_pong_list);
+static DECLARE_WORK(rds_tcp_pong_work, rds_tcp_pong_worker);
+
static void rds_send_remove_from_sock(struct list_head *messages, int status);
/*
@@ -1082,11 +1090,7 @@ out:
return ret;
}
-/*
- * Reply to a ping packet.
- */
-int
-rds_send_pong(struct rds_connection *conn, __be16 dport)
+static int rds_tcp_send_pong(struct rds_connection *conn, __be16 dport)
{
struct rds_message *rm;
unsigned long flags;
@@ -1132,3 +1136,69 @@ out:
rds_message_put(rm);
return ret;
}
+
+static void rds_tcp_pong_worker(struct work_struct *work)
+{
+ struct rds_tcp_pong *tp;
+ struct rds_connection *conn;
+ __be16 dport;
+
+ spin_lock(&rds_tcp_pong_lock);
+ if (list_empty(&rds_tcp_pong_list))
+ goto out_unlock;
+
+ /*
+ * Process one TCP pong at a time to reduce the possible impact
+ * on normal transmit.
+ */
+ tp = list_entry(rds_tcp_pong_list.next, struct rds_tcp_pong, tp_node);
+ conn = tp->tp_conn;
+ dport = tp->tp_dport;
+ list_del(&tp->tp_node);
+ spin_unlock(&rds_tcp_pong_lock);
+
+ kfree(tp);
+ rds_tcp_send_pong(conn, dport);
+ goto out;
+
+out_unlock:
+ spin_unlock(&rds_tcp_pong_lock);
+out:
+ return;
+}
+
+/*
+ * The RDS over TCP transport supports ping/pong messages. However,
+ * replying directly always results in sock spinlock recursion (up to
+ * 3.7.0). To solve this, we schedule the reply onto a work queue as a
+ * delayed response, using the system default work queue.
+ */
+int rds_tcp_pong(struct rds_connection *conn, __be16 dport)
+{
+ struct rds_tcp_pong *tp;
+ int ret = 0;
+
+ tp = kmalloc(sizeof(*tp), GFP_ATOMIC);
+ if (!tp) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ tp->tp_conn = conn;
+ tp->tp_dport = dport;
+ spin_lock(&rds_tcp_pong_lock);
+ list_add_tail(&tp->tp_node, &rds_tcp_pong_list);
+ spin_unlock(&rds_tcp_pong_lock);
+ schedule_work(&rds_tcp_pong_work);
+
+out:
+ return ret;
+}
+
+/*
+ * Reply to a ping packet; TCP only.
+ */
+int rds_send_pong(struct rds_connection *conn, __be16 dport)
+{
+ return rds_tcp_pong(conn, dport);
+}
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 9cf2927..c4c7e01 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -3,6 +3,13 @@
#define RDS_TCP_PORT 16385
+/* RDS over TCP ping/pong message entry */
+struct rds_tcp_pong {
+ struct list_head tp_node;
+ struct rds_connection *tp_conn;
+ __be16 tp_dport;
+};
+
struct rds_tcp_incoming {
struct rds_incoming ti_inc;
struct sk_buff_head ti_skb_list;
--
1.7.9.5