* [PATCH 4/4] sunrpc: use SKB fragment destructors to delay completion until page is released by network stack.
From: Ian Campbell @ 2011-11-09 15:02 UTC (permalink / raw)
To: netdev-u79uwXL29TY76Z2rM5mHXA
Cc: Ian Campbell, David S. Miller, Neil Brown, J. Bruce Fields,
linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1320850895.955.172.camel-o4Be2W7LfRlXesXXhkcM7miJhflN2719@public.gmane.org>
This prevents an issue where an ACK is delayed, a retransmit is queued (either
at the RPC or TCP level) and the ACK arrives before the retransmission hits the
wire. If this happens to an NFS WRITE RPC then the write() system call
completes and the userspace process can continue, potentially modifying data
referenced by the retransmission before the retransmission occurs.
Signed-off-by: Ian Campbell <ian.campbell-Sxgqhf6Nn4DQT0dZR+AlfA@public.gmane.org>
Acked-by: Trond Myklebust <Trond.Myklebust-HgOvQuBEEgTQT0dZR+AlfA@public.gmane.org>
Cc: "David S. Miller" <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
Cc: Neil Brown <neilb-l3A5Bk7waGM@public.gmane.org>
Cc: "J. Bruce Fields" <bfields-uC3wQj2KruNg9hUCZPvPmw@public.gmane.org>
Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
---
include/linux/sunrpc/xdr.h | 2 ++
include/linux/sunrpc/xprt.h | 5 ++++-
net/sunrpc/clnt.c | 27 ++++++++++++++++++++++-----
net/sunrpc/svcsock.c | 3 ++-
net/sunrpc/xprt.c | 13 +++++++++++++
net/sunrpc/xprtsock.c | 3 ++-
6 files changed, 45 insertions(+), 8 deletions(-)
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index a20970e..172f81e 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -16,6 +16,7 @@
#include <asm/byteorder.h>
#include <asm/unaligned.h>
#include <linux/scatterlist.h>
+#include <linux/skbuff.h>
/*
* Buffer adjustment
@@ -57,6 +58,7 @@ struct xdr_buf {
tail[1]; /* Appended after page data */
struct page ** pages; /* Array of contiguous pages */
+ struct skb_frag_destructor *destructor;
unsigned int page_base, /* Start of page data */
page_len, /* Length of page data */
flags; /* Flags for data disposition */
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 15518a1..75131eb 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -92,7 +92,10 @@ struct rpc_rqst {
/* A cookie used to track the
state of the transport
connection */
-
+ struct skb_frag_destructor destructor; /* SKB paged fragment
+ * destructor for
+ * transmitted pages*/
+
/*
* Partial send handling
*/
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index c5347d2..919538d 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -61,6 +61,7 @@ static void call_reserve(struct rpc_task *task);
static void call_reserveresult(struct rpc_task *task);
static void call_allocate(struct rpc_task *task);
static void call_decode(struct rpc_task *task);
+static void call_complete(struct rpc_task *task);
static void call_bind(struct rpc_task *task);
static void call_bind_status(struct rpc_task *task);
static void call_transmit(struct rpc_task *task);
@@ -1113,6 +1114,8 @@ rpc_xdr_encode(struct rpc_task *task)
(char *)req->rq_buffer + req->rq_callsize,
req->rq_rcvsize);
+ req->rq_snd_buf.destructor = &req->destructor;
+
p = rpc_encode_header(task);
if (p == NULL) {
printk(KERN_INFO "RPC: couldn't encode RPC header, exit EIO\n");
@@ -1276,6 +1279,7 @@ call_connect_status(struct rpc_task *task)
static void
call_transmit(struct rpc_task *task)
{
+ struct rpc_rqst *req = task->tk_rqstp;
dprint_status(task);
task->tk_action = call_status;
@@ -1309,8 +1313,8 @@ call_transmit(struct rpc_task *task)
call_transmit_status(task);
if (rpc_reply_expected(task))
return;
- task->tk_action = rpc_exit_task;
- rpc_wake_up_queued_task(&task->tk_xprt->pending, task);
+ task->tk_action = call_complete;
+ skb_frag_destructor_unref(&req->destructor);
}
/*
@@ -1383,7 +1387,8 @@ call_bc_transmit(struct rpc_task *task)
return;
}
- task->tk_action = rpc_exit_task;
+ task->tk_action = call_complete;
+ skb_frag_destructor_unref(&req->destructor);
if (task->tk_status < 0) {
printk(KERN_NOTICE "RPC: Could not send backchannel reply "
"error: %d\n", task->tk_status);
@@ -1423,7 +1428,6 @@ call_bc_transmit(struct rpc_task *task)
"error: %d\n", task->tk_status);
break;
}
- rpc_wake_up_queued_task(&req->rq_xprt->pending, task);
}
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
@@ -1589,12 +1593,14 @@ call_decode(struct rpc_task *task)
return;
}
- task->tk_action = rpc_exit_task;
+ task->tk_action = call_complete;
if (decode) {
task->tk_status = rpcauth_unwrap_resp(task, decode, req, p,
task->tk_msg.rpc_resp);
}
+ rpc_sleep_on(&req->rq_xprt->pending, task, NULL);
+ skb_frag_destructor_unref(&req->destructor);
dprintk("RPC: %5u call_decode result %d\n", task->tk_pid,
task->tk_status);
return;
@@ -1609,6 +1615,17 @@ out_retry:
}
}
+/*
+ * 8. Wait for pages to be released by the network stack.
+ */
+static void
+call_complete(struct rpc_task *task)
+{
+ dprintk("RPC: %5u call_complete result %d\n",
+ task->tk_pid, task->tk_status);
+ task->tk_action = rpc_exit_task;
+}
+
static __be32 *
rpc_encode_header(struct rpc_task *task)
{
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 852a258..3685cad 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -196,7 +196,8 @@ int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
while (pglen > 0) {
if (slen == size)
flags = 0;
- result = kernel_sendpage(sock, *ppage, NULL, base, size, flags);
+ result = kernel_sendpage(sock, *ppage, xdr->destructor,
+ base, size, flags);
if (result > 0)
len += result;
if (result != size)
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index f4385e4..925aa0c 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1103,6 +1103,16 @@ static inline void xprt_init_xid(struct rpc_xprt *xprt)
xprt->xid = net_random();
}
+static int xprt_complete_skb_pages(void *calldata)
+{
+ struct rpc_task *task = calldata;
+ struct rpc_rqst *req = task->tk_rqstp;
+
+ dprintk("RPC: %5u completing skb pages\n", task->tk_pid);
+ rpc_wake_up_queued_task(&req->rq_xprt->pending, task);
+ return 0;
+}
+
static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
{
struct rpc_rqst *req = task->tk_rqstp;
@@ -1115,6 +1125,9 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt)
req->rq_xid = xprt_alloc_xid(xprt);
req->rq_release_snd_buf = NULL;
xprt_reset_majortimeo(req);
+ atomic_set(&req->destructor.ref, 1);
+ req->destructor.destroy = &xprt_complete_skb_pages;
+ req->destructor.data = task;
dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid,
req, ntohl(req->rq_xid));
}
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index f79e40e9..af3a106 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -408,7 +408,8 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
remainder -= len;
if (remainder != 0 || more)
flags |= MSG_MORE;
- err = sock->ops->sendpage(sock, *ppage, NULL, base, len, flags);
+ err = sock->ops->sendpage(sock, *ppage, xdr->destructor,
+ base, len, flags);
if (remainder == 0 || err != len)
break;
sent += err;
--
1.7.2.5
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH 3/4] net: add paged frag destructor support to kernel_sendpage.
From: Ian Campbell @ 2011-11-09 15:02 UTC (permalink / raw)
To: netdev-u79uwXL29TY76Z2rM5mHXA
Cc: Ian Campbell, David S. Miller, Alexey Kuznetsov,
Pekka Savola (ipv6), James Morris, Hideaki YOSHIFUJI,
Patrick McHardy, Trond Myklebust, Greg Kroah-Hartman,
drbd-user-cunTk1MwBs8qoQakbn7OcQ,
devel-gWbeCf7V1WCQmaza687I9mD2FQJk+8+b,
cluster-devel-H+wXaHxf7aLQT0dZR+AlfA,
ocfs2-devel-N0ozoZBvEnrZJqsBc5GL+g,
ceph-devel-u79uwXL29TY76Z2rM5mHXA,
rds-devel-N0ozoZBvEnrZJqsBc5GL+g,
linux-nfs-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1320850895.955.172.camel-o4Be2W7LfRlXesXXhkcM7miJhflN2719@public.gmane.org>
This requires adding a new argument to various sendpage hooks up and down the
stack. At the moment this parameter is always NULL.
Signed-off-by: Ian Campbell <ian.campbell-Sxgqhf6Nn4DQT0dZR+AlfA@public.gmane.org>
Cc: "David S. Miller" <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
Cc: Alexey Kuznetsov <kuznet-v/Mj1YrvjDBInbfyfbPRSQ@public.gmane.org>
Cc: "Pekka Savola (ipv6)" <pekkas-UjJjq++bwZ7HOG6cAo2yLw@public.gmane.org>
Cc: James Morris <jmorris-gx6/JNMH7DfYtjvyW6yDsg@public.gmane.org>
Cc: Hideaki YOSHIFUJI <yoshfuji-VfPWfsRibaP+Ru+s062T9g@public.gmane.org>
Cc: Patrick McHardy <kaber-dcUjhNyLwpNeoWH0uzbU5w@public.gmane.org>
Cc: Trond Myklebust <Trond.Myklebust-HgOvQuBEEgTQT0dZR+AlfA@public.gmane.org>
Cc: Greg Kroah-Hartman <gregkh-l3A5Bk7waGM@public.gmane.org>
Cc: drbd-user-cunTk1MwBs8qoQakbn7OcQ@public.gmane.org
Cc: devel-gWbeCf7V1WCQmaza687I9mD2FQJk+8+b@public.gmane.org
Cc: cluster-devel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org
Cc: ocfs2-devel-N0ozoZBvEnrZJqsBc5GL+g@public.gmane.org
Cc: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: ceph-devel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: rds-devel-N0ozoZBvEnrZJqsBc5GL+g@public.gmane.org
Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
---
drivers/block/drbd/drbd_main.c | 1 +
drivers/scsi/iscsi_tcp.c | 4 ++--
drivers/scsi/iscsi_tcp.h | 3 ++-
drivers/staging/pohmelfs/trans.c | 3 ++-
drivers/target/iscsi/iscsi_target_util.c | 3 ++-
fs/dlm/lowcomms.c | 4 ++--
fs/ocfs2/cluster/tcp.c | 1 +
include/linux/net.h | 6 +++++-
include/net/inet_common.h | 4 +++-
include/net/ip.h | 4 +++-
include/net/sock.h | 8 +++++---
include/net/tcp.h | 4 +++-
net/ceph/messenger.c | 2 +-
net/core/sock.c | 6 +++++-
net/ipv4/af_inet.c | 9 ++++++---
net/ipv4/ip_output.c | 6 ++++--
net/ipv4/tcp.c | 24 ++++++++++++++++--------
net/ipv4/udp.c | 11 ++++++-----
net/ipv4/udp_impl.h | 5 +++--
net/rds/tcp_send.c | 1 +
net/socket.c | 11 +++++++----
net/sunrpc/svcsock.c | 6 +++---
net/sunrpc/xprtsock.c | 2 +-
23 files changed, 84 insertions(+), 44 deletions(-)
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 0358e55..49c7346 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2584,6 +2584,7 @@ static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
set_fs(KERNEL_DS);
do {
sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
+ NULL,
offset, len,
msg_flags);
if (sent == -EAGAIN) {
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index 7724414..2eb6801 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -283,8 +283,8 @@ static int iscsi_sw_tcp_xmit_segment(struct iscsi_tcp_conn *tcp_conn,
if (!segment->data) {
sg = segment->sg;
offset += segment->sg_offset + sg->offset;
- r = tcp_sw_conn->sendpage(sk, sg_page(sg), offset,
- copy, flags);
+ r = tcp_sw_conn->sendpage(sk, sg_page(sg), NULL,
+ offset, copy, flags);
} else {
struct msghdr msg = { .msg_flags = flags };
struct kvec iov = {
diff --git a/drivers/scsi/iscsi_tcp.h b/drivers/scsi/iscsi_tcp.h
index 666fe09..1e23265 100644
--- a/drivers/scsi/iscsi_tcp.h
+++ b/drivers/scsi/iscsi_tcp.h
@@ -52,7 +52,8 @@ struct iscsi_sw_tcp_conn {
uint32_t sendpage_failures_cnt;
uint32_t discontiguous_hdr_cnt;
- ssize_t (*sendpage)(struct socket *, struct page *, int, size_t, int);
+ ssize_t (*sendpage)(struct socket *, struct page *,
+ struct skb_frag_destructor *, int, size_t, int);
};
struct iscsi_sw_tcp_host {
diff --git a/drivers/staging/pohmelfs/trans.c b/drivers/staging/pohmelfs/trans.c
index 36a2535..f897fdb 100644
--- a/drivers/staging/pohmelfs/trans.c
+++ b/drivers/staging/pohmelfs/trans.c
@@ -104,7 +104,8 @@ static int netfs_trans_send_pages(struct netfs_trans *t, struct netfs_state *st)
msg.msg_flags = MSG_WAITALL | (attached_pages == 1 ? 0 :
MSG_MORE);
- err = kernel_sendpage(st->socket, page, 0, size, msg.msg_flags);
+ err = kernel_sendpage(st->socket, page, NULL,
+ 0, size, msg.msg_flags);
if (err <= 0) {
printk("%s: %d/%d failed to send transaction page: t: %p, gen: %u, size: %u, err: %d.\n",
__func__, i, t->page_num, t, t->gen, size, err);
diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c
index f00137f..1ce98e9 100644
--- a/drivers/target/iscsi/iscsi_target_util.c
+++ b/drivers/target/iscsi/iscsi_target_util.c
@@ -1297,7 +1297,8 @@ send_hdr:
u32 sub_len = min_t(u32, data_len, space);
send_pg:
tx_sent = conn->sock->ops->sendpage(conn->sock,
- sg_page(sg), sg->offset + offset, sub_len, 0);
+ sg_page(sg), NULL,
+ sg->offset + offset, sub_len, 0);
if (tx_sent != sub_len) {
if (tx_sent == -EAGAIN) {
pr_err("tcp_sendpage() returned"
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 990626e..98ace05 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1342,8 +1342,8 @@ static void send_to_sock(struct connection *con)
ret = 0;
if (len) {
- ret = kernel_sendpage(con->sock, e->page, offset, len,
- msg_flags);
+ ret = kernel_sendpage(con->sock, e->page, NULL,
+ offset, len, msg_flags);
if (ret == -EAGAIN || ret == 0) {
if (ret == -EAGAIN &&
test_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags) &&
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index db5ee4b..81366a0 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -982,6 +982,7 @@ static void o2net_sendpage(struct o2net_sock_container *sc,
mutex_lock(&sc->sc_send_lock);
ret = sc->sc_sock->ops->sendpage(sc->sc_sock,
virt_to_page(kmalloced_virt),
+ NULL,
(long)kmalloced_virt & ~PAGE_MASK,
size, MSG_DONTWAIT);
mutex_unlock(&sc->sc_send_lock);
diff --git a/include/linux/net.h b/include/linux/net.h
index b299230..db562ba 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -157,6 +157,7 @@ struct kiocb;
struct sockaddr;
struct msghdr;
struct module;
+struct skb_frag_destructor;
struct proto_ops {
int family;
@@ -203,6 +204,7 @@ struct proto_ops {
int (*mmap) (struct file *file, struct socket *sock,
struct vm_area_struct * vma);
ssize_t (*sendpage) (struct socket *sock, struct page *page,
+ struct skb_frag_destructor *destroy,
int offset, size_t size, int flags);
ssize_t (*splice_read)(struct socket *sock, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len, unsigned int flags);
@@ -273,7 +275,9 @@ extern int kernel_getsockopt(struct socket *sock, int level, int optname,
char *optval, int *optlen);
extern int kernel_setsockopt(struct socket *sock, int level, int optname,
char *optval, unsigned int optlen);
-extern int kernel_sendpage(struct socket *sock, struct page *page, int offset,
+extern int kernel_sendpage(struct socket *sock, struct page *page,
+ struct skb_frag_destructor *destroy,
+ int offset,
size_t size, int flags);
extern int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg);
extern int kernel_sock_shutdown(struct socket *sock,
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index 22fac98..91cd8d0 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -21,7 +21,9 @@ extern int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
extern int inet_accept(struct socket *sock, struct socket *newsock, int flags);
extern int inet_sendmsg(struct kiocb *iocb, struct socket *sock,
struct msghdr *msg, size_t size);
-extern ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
+extern ssize_t inet_sendpage(struct socket *sock, struct page *page,
+ struct skb_frag_destructor *frag,
+ int offset,
size_t size, int flags);
extern int inet_recvmsg(struct kiocb *iocb, struct socket *sock,
struct msghdr *msg, size_t size, int flags);
diff --git a/include/net/ip.h b/include/net/ip.h
index eca0ef7..d34030c 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -114,7 +114,9 @@ extern int ip_append_data(struct sock *sk, struct flowi4 *fl4,
struct rtable **rt,
unsigned int flags);
extern int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb);
-extern ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
+extern ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4,
+ struct page *page,
+ struct skb_frag_destructor *destroy,
int offset, size_t size, int flags);
extern struct sk_buff *__ip_make_skb(struct sock *sk,
struct flowi4 *fl4,
diff --git a/include/net/sock.h b/include/net/sock.h
index 5ac682f..95ebafd 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -774,6 +774,7 @@ struct proto {
size_t len, int noblock, int flags,
int *addr_len);
int (*sendpage)(struct sock *sk, struct page *page,
+ struct skb_frag_destructor *destroy,
int offset, size_t size, int flags);
int (*bind)(struct sock *sk,
struct sockaddr *uaddr, int addr_len);
@@ -1162,9 +1163,10 @@ extern int sock_no_mmap(struct file *file,
struct socket *sock,
struct vm_area_struct *vma);
extern ssize_t sock_no_sendpage(struct socket *sock,
- struct page *page,
- int offset, size_t size,
- int flags);
+ struct page *page,
+ struct skb_frag_destructor *destroy,
+ int offset, size_t size,
+ int flags);
/*
* Functions to fill in entries in struct proto_ops when a protocol
diff --git a/include/net/tcp.h b/include/net/tcp.h
index e147f42..99fe8f3 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -322,7 +322,9 @@ extern void *tcp_v4_tw_get_peer(struct sock *sk);
extern int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t size);
-extern int tcp_sendpage(struct sock *sk, struct page *page, int offset,
+extern int tcp_sendpage(struct sock *sk, struct page *page,
+ struct skb_frag_destructor *destroy,
+ int offset,
size_t size, int flags);
extern int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
extern int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 9918e9e..65b6fc0 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -849,7 +849,7 @@ static int write_partial_msg_pages(struct ceph_connection *con)
cpu_to_le32(crc32c(tmpcrc, base, len));
con->out_msg_pos.did_page_crc = 1;
}
- ret = kernel_sendpage(con->sock, page,
+ ret = kernel_sendpage(con->sock, page, NULL,
con->out_msg_pos.page_pos + page_shift,
len,
MSG_DONTWAIT | MSG_NOSIGNAL |
diff --git a/net/core/sock.c b/net/core/sock.c
index 5a08762..8fef547 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1860,7 +1860,9 @@ int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *
}
EXPORT_SYMBOL(sock_no_mmap);
-ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
+ssize_t sock_no_sendpage(struct socket *sock, struct page *page,
+ struct skb_frag_destructor *destroy,
+ int offset, size_t size, int flags)
{
ssize_t res;
struct msghdr msg = {.msg_flags = flags};
@@ -1870,6 +1872,8 @@ ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, siz
iov.iov_len = size;
res = kernel_sendmsg(sock, &msg, &iov, 1, size);
kunmap(page);
+ /* kernel_sendmsg copies so we can destroy immediately */
+ skb_frag_destructor_unref(destroy);
return res;
}
EXPORT_SYMBOL(sock_no_sendpage);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 1b5096a..99f7fd0 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -745,7 +745,9 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
}
EXPORT_SYMBOL(inet_sendmsg);
-ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
+ssize_t inet_sendpage(struct socket *sock, struct page *page,
+ struct skb_frag_destructor *destroy,
+ int offset,
size_t size, int flags)
{
struct sock *sk = sock->sk;
@@ -758,8 +760,9 @@ ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
return -EAGAIN;
if (sk->sk_prot->sendpage)
- return sk->sk_prot->sendpage(sk, page, offset, size, flags);
- return sock_no_sendpage(sock, page, offset, size, flags);
+ return sk->sk_prot->sendpage(sk, page, destroy,
+ offset, size, flags);
+ return sock_no_sendpage(sock, page, destroy, offset, size, flags);
}
EXPORT_SYMBOL(inet_sendpage);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 3252e06..753dc7b 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1116,6 +1116,7 @@ int ip_append_data(struct sock *sk, struct flowi4 *fl4,
}
ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
+ struct skb_frag_destructor *destroy,
int offset, size_t size, int flags)
{
struct inet_sock *inet = inet_sk(sk);
@@ -1229,11 +1230,12 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
i = skb_shinfo(skb)->nr_frags;
if (len > size)
len = size;
- if (skb_can_coalesce(skb, i, page, NULL, offset)) {
+ if (skb_can_coalesce(skb, i, page, destroy, offset)) {
skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
} else if (i < MAX_SKB_FRAGS) {
- get_page(page);
skb_fill_page_desc(skb, i, page, offset, len);
+ skb_frag_set_destructor(skb, i, destroy);
+ skb_frag_ref(skb, i);
} else {
err = -EMSGSIZE;
goto error;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 018de0c..56ef323 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -757,7 +757,10 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
return mss_now;
}
-static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
+static ssize_t do_tcp_sendpages(struct sock *sk,
+ struct page **pages,
+ struct skb_frag_destructor **destructors,
+ int poffset,
size_t psize, int flags)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -783,6 +786,8 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
while (psize > 0) {
struct sk_buff *skb = tcp_write_queue_tail(sk);
struct page *page = pages[poffset / PAGE_SIZE];
+ struct skb_frag_destructor *destroy =
+ destructors ? destructors[poffset / PAGE_SIZE] : NULL;
int copy, i, can_coalesce;
int offset = poffset % PAGE_SIZE;
int size = min_t(size_t, psize, PAGE_SIZE - offset);
@@ -804,7 +809,7 @@ new_segment:
copy = size;
i = skb_shinfo(skb)->nr_frags;
- can_coalesce = skb_can_coalesce(skb, i, page, NULL, offset);
+ can_coalesce = skb_can_coalesce(skb, i, page, destroy, offset);
if (!can_coalesce && i >= MAX_SKB_FRAGS) {
tcp_mark_push(tp, skb);
goto new_segment;
@@ -815,8 +820,9 @@ new_segment:
if (can_coalesce) {
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
} else {
- get_page(page);
skb_fill_page_desc(skb, i, page, offset, copy);
+ skb_frag_set_destructor(skb, i, destroy);
+ skb_frag_ref(skb, i);
}
skb->len += copy;
@@ -871,18 +877,20 @@ out_err:
return sk_stream_error(sk, flags, err);
}
-int tcp_sendpage(struct sock *sk, struct page *page, int offset,
- size_t size, int flags)
+int tcp_sendpage(struct sock *sk, struct page *page,
+ struct skb_frag_destructor *destroy,
+ int offset, size_t size, int flags)
{
ssize_t res;
if (!(sk->sk_route_caps & NETIF_F_SG) ||
!(sk->sk_route_caps & NETIF_F_ALL_CSUM))
- return sock_no_sendpage(sk->sk_socket, page, offset, size,
- flags);
+ return sock_no_sendpage(sk->sk_socket, page, destroy,
+ offset, size, flags);
lock_sock(sk);
- res = do_tcp_sendpages(sk, &page, offset, size, flags);
+ res = do_tcp_sendpages(sk, &page, &destroy,
+ offset, size, flags);
release_sock(sk);
return res;
}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ebaa96b..b653015 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1028,8 +1028,9 @@ do_confirm:
}
EXPORT_SYMBOL(udp_sendmsg);
-int udp_sendpage(struct sock *sk, struct page *page, int offset,
- size_t size, int flags)
+int udp_sendpage(struct sock *sk, struct page *page,
+ struct skb_frag_destructor *destroy,
+ int offset, size_t size, int flags)
{
struct inet_sock *inet = inet_sk(sk);
struct udp_sock *up = udp_sk(sk);
@@ -1057,11 +1058,11 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
}
ret = ip_append_page(sk, &inet->cork.fl.u.ip4,
- page, offset, size, flags);
+ page, destroy, offset, size, flags);
if (ret == -EOPNOTSUPP) {
release_sock(sk);
- return sock_no_sendpage(sk->sk_socket, page, offset,
- size, flags);
+ return sock_no_sendpage(sk->sk_socket, page, destroy,
+ offset, size, flags);
}
if (ret < 0) {
udp_flush_pending_frames(sk);
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index aaad650..4923d82 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -23,8 +23,9 @@ extern int compat_udp_getsockopt(struct sock *sk, int level, int optname,
#endif
extern int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t len, int noblock, int flags, int *addr_len);
-extern int udp_sendpage(struct sock *sk, struct page *page, int offset,
- size_t size, int flags);
+extern int udp_sendpage(struct sock *sk, struct page *page,
+ struct skb_frag_destructor *destroy,
+ int offset, size_t size, int flags);
extern int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb);
extern void udp_destroy_sock(struct sock *sk);
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 1b4fd68..71503ad 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -119,6 +119,7 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
while (sg < rm->data.op_nents) {
ret = tc->t_sock->ops->sendpage(tc->t_sock,
sg_page(&rm->data.op_sg[sg]),
+ NULL,
rm->data.op_sg[sg].offset + off,
rm->data.op_sg[sg].length - off,
MSG_DONTWAIT|MSG_NOSIGNAL);
diff --git a/net/socket.c b/net/socket.c
index 2877647..cbd5728 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -795,7 +795,7 @@ static ssize_t sock_sendpage(struct file *file, struct page *page,
if (more)
flags |= MSG_MORE;
- return kernel_sendpage(sock, page, offset, size, flags);
+ return kernel_sendpage(sock, page, NULL, offset, size, flags);
}
static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
@@ -3352,15 +3352,18 @@ int kernel_setsockopt(struct socket *sock, int level, int optname,
}
EXPORT_SYMBOL(kernel_setsockopt);
-int kernel_sendpage(struct socket *sock, struct page *page, int offset,
+int kernel_sendpage(struct socket *sock, struct page *page,
+ struct skb_frag_destructor *destroy,
+ int offset,
size_t size, int flags)
{
sock_update_classid(sock->sk);
if (sock->ops->sendpage)
- return sock->ops->sendpage(sock, page, offset, size, flags);
+ return sock->ops->sendpage(sock, page, destroy,
+ offset, size, flags);
- return sock_no_sendpage(sock, page, offset, size, flags);
+ return sock_no_sendpage(sock, page, destroy, offset, size, flags);
}
EXPORT_SYMBOL(kernel_sendpage);
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 767d494..852a258 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -183,7 +183,7 @@ int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
/* send head */
if (slen == xdr->head[0].iov_len)
flags = 0;
- len = kernel_sendpage(sock, headpage, headoffset,
+ len = kernel_sendpage(sock, headpage, NULL, headoffset,
xdr->head[0].iov_len, flags);
if (len != xdr->head[0].iov_len)
goto out;
@@ -196,7 +196,7 @@ int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
while (pglen > 0) {
if (slen == size)
flags = 0;
- result = kernel_sendpage(sock, *ppage, base, size, flags);
+ result = kernel_sendpage(sock, *ppage, NULL, base, size, flags);
if (result > 0)
len += result;
if (result != size)
@@ -210,7 +210,7 @@ int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
/* send tail */
if (xdr->tail[0].iov_len) {
- result = kernel_sendpage(sock, tailpage, tailoffset,
+ result = kernel_sendpage(sock, tailpage, NULL, tailoffset,
xdr->tail[0].iov_len, 0);
if (result > 0)
len += result;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index d7f97ef..f79e40e9 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -408,7 +408,7 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
remainder -= len;
if (remainder != 0 || more)
flags |= MSG_MORE;
- err = sock->ops->sendpage(sock, *ppage, base, len, flags);
+ err = sock->ops->sendpage(sock, *ppage, NULL, base, len, flags);
if (remainder == 0 || err != len)
break;
sent += err;
--
1.7.2.5
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH 0/4] skb paged fragment destructors
From: Ian Campbell @ 2011-11-09 15:01 UTC (permalink / raw)
To: David Miller; +Cc: Jesse Brandeburg, netdev
The following series makes use of the skb fragment API (which is in 3.2)
to add a per-paged-fragment destructor callback. This can be used by
creators of skbs who are interested in the lifecycle of the pages
included in that skb after they have handed it off to the network stack.
I think these have all been posted before, but have been backed up
behind the skb fragment API.
The mail at [0] contains some more background and rationale, but
basically the completed series will allow entities which inject pages
into the networking stack to receive a notification when the stack has
really finished with those pages (i.e. including retransmissions,
clones, pull-ups etc.) and not just when the original skb is finished
with. This is beneficial to many subsystems which wish to inject pages
into the network stack without giving up full ownership of those pages'
lifecycle. It implements something broadly along the lines of what was
described in [1].
I have also included a patch to the RPC subsystem which uses this API to
fix the bug which I describe at [2].
I presented this work at LPC in September and there was a
question/concern raised (by Jesse Brandeburg IIRC) regarding the
overhead of adding this extra field per fragment. If I understand
correctly, it seems that there have been performance regressions
in the past with allocations outgrowing one allocation size bucket and
therefore using the next. The change in data structure size resulting
from this series is:
BEFORE AFTER
AMD64: sizeof(struct skb_frag_struct) = 16 24
sizeof(struct skb_shared_info) = 344 488
sizeof(struct sk_buff) = 240 240
i386: sizeof(struct skb_frag_struct) = 8 12
sizeof(struct skb_shared_info) = 188 260
sizeof(struct sk_buff) = 192 192
(I think these are representative of 32 and 64 bit arches generally)
On amd64 this doesn't in itself push the shared_info over a slab
boundary, but since the linear data is part of the same allocation, the
amount of linear data needed to push us into the next size is reduced
from 168 to 24 bytes, which is effectively the same thing as pushing
directly into the next size. On i386 we go straight to the next bucket
(although the 68 bytes of available slack for the linear area becomes
252 in that larger size).
I'm not sure whether this is a showstopper, or whether the particular
issue with slab still exists (or maybe it was only slab/slub/slob
specific?). I need to find some benchmark which might demonstrate the
issue (presumably something where frames are commonly 24<size<168).
Jesse, any hints on how to test this or references to the previous
occurrence(s) would be gratefully accepted.
Possible solutions all seem a bit fugly:
* suitably sized slab caches appropriate to these new sizes (no
suitable number leaps out at me...)
* split linear data allocation and shinfo allocation into two. I
suspect this will have its own performance implications? On the
positive side skb_shared_info could come from its own fixed size
pool/cache which might have some benefits
* steal a bit from a pointer to indicate destructor pointer vs regular
struct page pointer (moving the struct page into the destructor
data structure for that case). Stops us sharing a single
destructor between multiple pages, but that isn't critical
* add a bitmap to shinfo indicating the kind of each frag.
Requires modification of anywhere which propagates the page
member (doable, but another huge series)
* some sort of pointer compression scheme?
I'm sure there are others I'm not seeing right now.
Cheers,
Ian.
[0] http://marc.info/?l=linux-netdev&m=131072801125521&w=2
[1] http://marc.info/?l=linux-netdev&m=130925719513084&w=2
[2] http://marc.info/?l=linux-nfs&m=122424132729720&w=2
^ permalink raw reply
* Re: [PATCH] Phonet: set the pipe handle using setsockopt
From: Rémi Denis-Courmont @ 2011-11-09 15:00 UTC (permalink / raw)
To: ext Hemant Vilas RAMDASI; +Cc: netdev, Dinesh Kumar Sharma
In-Reply-To: <1320837653-7471-1-git-send-email-hemant.ramdasi@stericsson.com>
Inline...
Le Mercredi 9 Novembre 2011 16:50:53 ext Hemant Vilas RAMDASI a écrit :
> @@ -863,9 +902,27 @@ static int pep_sock_connect(struct sock *sk, struct
> sockaddr *addr, int len) int err;
> u8 data[4] = { 0 /* sub-blocks */, PAD, PAD, PAD };
>
> - pn->pipe_handle = 1; /* anything but INVALID_HANDLE */
> + if (pn->pipe_handle == PN_PIPE_INVALID_HANDLE)
> + pn->pipe_handle = 1; /* anything but INVALID_HANDLE */
> +
> err = pipe_handler_request(sk, PNS_PEP_CONNECT_REQ,
> - PN_PIPE_ENABLE, data, 4);
> + pn->init_enable, data, 4);
> +
> + if (err) {
> + pn->pipe_handle = PN_PIPE_INVALID_HANDLE;
This undoes the setsockopt() silently. I'm not sure you intend this?
> + return err;
> + }
> + sk->sk_state = TCP_SYN_SENT;
> + return 0;
> +}
> +
> +static int pep_sock_enable(struct sock *sk, struct sockaddr *addr, int len)
> +{
> + struct pep_sock *pn = pep_sk(sk);
> + int err;
> +
> + err = pipe_handler_request(sk, PNS_PEP_ENABLE_REQ, PAD,
> + NULL, 0);
> if (err) {
> pn->pipe_handle = PN_PIPE_INVALID_HANDLE;
> return err;
> @@ -959,6 +1016,24 @@ static int pep_setsockopt(struct sock *sk, int level,
> int optname, }
> goto out_norel;
>
> + case PNPIPE_HANDLE:
> + if (val)
> + pn->pipe_handle = val;
> + else
> + err = -EINVAL;
> + break;
Why is zero a special case? What about out-of-range values?
> +
> + case PNPIPE_ENABLE:
> + err = pep_sock_enable(sk, NULL, 0);
> + break;
This is reintroducing the problems with the older code. As far as I know,
setsockopt() needs to be idempotent, which this does not seem to be?
> +
> + case PNPIPE_INITSTATE:
> + if ((val == PN_PIPE_DISABLE) || (val == PN_PIPE_ENABLE))
> + pn->init_enable = val;
> + else
> + err = -EINVAL;
> + break;
It looks like there is no use-case for init-enabled pipes then, right? How
about dropping this extra bit of code and assuming connect()ed pipes are
always init-disabled?
> +
> default:
> err = -ENOPROTOOPT;
> }
> @@ -994,6 +1069,13 @@ static int pep_getsockopt(struct sock *sk, int level,
> int optname, return -EINVAL;
> break;
>
> + case PNPIPE_ENABLE:
> + if (sk->sk_state != TCP_ESTABLISHED)
> + return -EINVAL;
> + else
> + val = 1;
> + break;
> +
> default:
> return -ENOPROTOOPT;
> }
--
Rémi Denis-Courmont
http://www.remlab.net/
^ permalink raw reply
* Re: [PATCH 16/18] net/irda: convert au1k_ir to platform driver.
From: Ralf Baechle @ 2011-11-09 14:55 UTC (permalink / raw)
To: Manuel Lauss; +Cc: Linux-MIPS, Samuel Ortiz, netdev
In-Reply-To: <1320174224-27305-17-git-send-email-manuel.lauss@googlemail.com>
On Tue, Nov 01, 2011 at 08:03:42PM +0100, Manuel Lauss wrote:
> Moderate driver cleanup:
> convert to platform driver, get rid of board-specific code.
>
> Driver loads and runs on a DB1100 board. But since I have no other
> IrDA hardware to exchange data with I can't say whether it really sends
> and receives.
>
> Cc: Samuel Ortiz <samuel@sortiz.org>
> Cc: netdev@vger.kernel.org
> Signed-off-by: Manuel Lauss <manuel.lauss@googlemail.com>
> ---
> I'd like for this to go in via the MIPS tree since other mips patches depend
> on it.
No (n)acks or comments received so queued for 3.3. Thanks,
Ralf
^ permalink raw reply
* Re: [PATCH RESEND 01/18] MIPS: Alchemy: remove PB1000 support
From: Ralf Baechle @ 2011-11-09 14:48 UTC (permalink / raw)
To: Manuel Lauss; +Cc: Linux-MIPS, netdev, linux-pcmcia
In-Reply-To: <1320234824-28604-1-git-send-email-manuel.lauss@googlemail.com>
On Wed, Nov 02, 2011 at 12:53:44PM +0100, Manuel Lauss wrote:
> Noone seems to have test hardware or care anymore. Drop PB1000 support
> and along with it the old Alchemy PCMCIA socket driver.
Nobody (n)acked so I queued this one for 3.3.
Thanks,
Ralf
^ permalink raw reply
* Re: [PATCH] flow_cache_flush soft lockup with heavy ipsec traffic
From: Maris Paupe @ 2011-11-09 14:41 UTC (permalink / raw)
To: Steffen Klassert; +Cc: Eric Dumazet, netdev
In-Reply-To: <20111109134348.GD10138@secunet.com>
I think I found the correct fix for my problem here
http://patchwork.ozlabs.org/patch/114330/
The scenario sounds the same, I will test it, thanks.
On 11/09/11 15:43, Steffen Klassert wrote:
> On Wed, Nov 09, 2011 at 02:16:04PM +0100, Eric Dumazet wrote:
>>
>> Sorry, I dont understand your patch.
>>
>> BH are disabled by the spin_lock_bh() call.
>>
>> Once flow_cache_entry are in garbage list, nothing but garbage collector
>> can access them. I see no possible deadlock. Or there is a bug somewhere
>> and your patch avoid it.
>>
>> Whole point of using a work queue to perform garbage collect was to not
>> hold BH too long (allowing sotirq to process incoming packets), so you
>> basically remove what was done in commit 8e4795605d.
>>
>
> I guess this tries to address a problem that was already discussed here:
>
> http://patchwork.ozlabs.org/patch/116457/
>
^ permalink raw reply
* Re: [v2 PATCH 1/2] NETFILTER module xt_hmark new target for HASH based fw
From: Pablo Neira Ayuso @ 2011-11-09 14:39 UTC (permalink / raw)
To: Hans Schillstrom
Cc: Hans Schillstrom, kaber, jengelh, netfilter-devel, netdev
In-Reply-To: <b328f1j.5274a2398142147da04eb5e915d63283@obelix.schillstrom.com>
On Tue, Nov 08, 2011 at 04:12:27PM +0100, Hans Schillstrom wrote:
> >BTW, do you have some number of this running with and without
> >conntrack? It would be interesting to have.
>
> I didn't save them, but I can make a new benchmark later on.
Thanks, I'm interested in them. It can be just xt_HMARK with and
without conntrack enabled. Also make sure that you use stateful
rule-set if conntrack is enabled (thus, resulting in hashing only
once, not every packet). Otherwise, conntrack will not provide
any improvement.
^ permalink raw reply
* Re: [GIT PULL nf-next] IPVS
From: Pablo Neira Ayuso @ 2011-11-09 14:36 UTC (permalink / raw)
To: Simon Horman
Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
Julian Anastasov, Krzysztof Wilczynski
In-Reply-To: <20111109005805.GA26937@verge.net.au>
On Wed, Nov 09, 2011 at 09:58:07AM +0900, Simon Horman wrote:
> Hi Pablo,
>
> On Mon, Nov 07, 2011 at 09:29:56AM +0100, Pablo Neira Ayuso wrote:
> > Hi Simon,
> >
> > On Mon, Nov 07, 2011 at 12:07:01PM +0900, Simon Horman wrote:
> > > Hi Pablo,
> > >
> > > I am a little confused. The nf-next branch seems to have disappeared.
> > >
> > > Could you consider pulling git://github.com/horms/ipvs-next.git master
> > > to get the following changes that were in your nf-next branch.
> >
> > I was late to get it into net-next. Since net-next became net after
> > the 3.1 release, my moved those changes to net to get it into 3.2
> > once Linus announced that the merge window was opened again.
> >
> > > Or would
> > > you like me to rebase the ipvs patches (9 or the 11 changes below) on
> > > top of git://1984.lsi.us.es/net-next/.git master ?
> >
> > They are already in net davem's tree, they will be included in the
> > upcoming 3.2 release.
> >
> > http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Fdavem%2Fnet.git&a=search&h=HEAD&st=commit&s=Neira
>
> Thanks, and sorry for missing that when I checked yesterday.
>
> Could you suggest which tree and branch I should base the master branch of my
> ipvs and ipvs-next trees on? Their purposes are to provide a reference for
> people wishing to fix or enhance IPVS and a mechanism to send pull requests to
> you. As of now I am using the master branch of your net tree for both.
The 1984.lsi.us.es trees are fine.
There is no branch yet because I have no patches queued for upstream
so far. You can use master if you don't see any nf branch, OK?
^ permalink raw reply
* Re: [PATCHv4 4/9] macb: convert printk to netdev_ and friends
From: Jamie Iles @ 2011-11-09 13:55 UTC (permalink / raw)
To: Joe Perches; +Cc: Jamie Iles, netdev, arnd
In-Reply-To: <1320846368.6923.24.camel@Joe-Laptop>
On Wed, Nov 09, 2011 at 05:46:08AM -0800, Joe Perches wrote:
> On Wed, 2011-11-09 at 13:37 +0000, Jamie Iles wrote:
> > OK, here's an updated patch. Thanks again Joe!
>
> Hi Jamie, thanks for updating, one possible thing.
>
> > @@ -625,15 +625,12 @@ static int macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
> >
> > #ifdef DEBUG
> > int i;
> > - dev_dbg(&bp->pdev->dev,
> > - "start_xmit: len %u head %p data %p tail %p end %p\n",
> > - skb->len, skb->head, skb->data,
> > - skb_tail_pointer(skb), skb_end_pointer(skb));
> > - dev_dbg(&bp->pdev->dev,
> > - "data:");
> > - for (i = 0; i < 16; i++)
> > - printk(" %02x", (unsigned int)skb->data[i]);
> > - printk("\n");
> > + netdev_dbg(bp->dev,
> > + "start_xmit: len %u head %p data %p tail %p end %p\n",
> > + skb->len, skb->head, skb->data,
> > + skb_tail_pointer(skb), skb_end_pointer(skb));
> > + print_hex_dump(KERN_DEBUG, "data: ", DUMP_PREFIX_OFFSET, 16, 1,
> > + skb->data, 16, true);
> > #endif
>
> I think there an unused variable "i" warning now
> if DEBUG is #defined.
Good eyes! I've fixed that but won't repost it.
Jamie
^ permalink raw reply
* Re: [PATCHv4 4/9] macb: convert printk to netdev_ and friends
From: Joe Perches @ 2011-11-09 13:46 UTC (permalink / raw)
To: Jamie Iles; +Cc: netdev, arnd
In-Reply-To: <20111109133726.GD4253@totoro>
On Wed, 2011-11-09 at 13:37 +0000, Jamie Iles wrote:
> OK, here's an updated patch. Thanks again Joe!
Hi Jamie, thanks for updating, one possible thing.
> @@ -625,15 +625,12 @@ static int macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
>
> #ifdef DEBUG
> int i;
> - dev_dbg(&bp->pdev->dev,
> - "start_xmit: len %u head %p data %p tail %p end %p\n",
> - skb->len, skb->head, skb->data,
> - skb_tail_pointer(skb), skb_end_pointer(skb));
> - dev_dbg(&bp->pdev->dev,
> - "data:");
> - for (i = 0; i < 16; i++)
> - printk(" %02x", (unsigned int)skb->data[i]);
> - printk("\n");
> + netdev_dbg(bp->dev,
> + "start_xmit: len %u head %p data %p tail %p end %p\n",
> + skb->len, skb->head, skb->data,
> + skb_tail_pointer(skb), skb_end_pointer(skb));
> + print_hex_dump(KERN_DEBUG, "data: ", DUMP_PREFIX_OFFSET, 16, 1,
> + skb->data, 16, true);
> #endif
I think there is an unused variable "i" warning now
if DEBUG is #defined.
^ permalink raw reply
* Re: [PATCH] flow_cache_flush soft lockup with heavy ipsec traffic
From: Steffen Klassert @ 2011-11-09 13:43 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Maris Paupe, netdev
In-Reply-To: <1320844564.3916.4.camel@edumazet-HP-Compaq-6005-Pro-SFF-PC>
On Wed, Nov 09, 2011 at 02:16:04PM +0100, Eric Dumazet wrote:
>
> Sorry, I dont understand your patch.
>
> BH are disabled by the spin_lock_bh() call.
>
> Once flow_cache_entry are in garbage list, nothing but garbage collector
> can access them. I see no possible deadlock. Or there is a bug somewhere
> and your patch avoid it.
>
> Whole point of using a work queue to perform garbage collect was to not
> hold BH too long (allowing sotirq to process incoming packets), so you
> basically remove what was done in commit 8e4795605d.
>
I guess this tries to address a problem that was already discussed here:
http://patchwork.ozlabs.org/patch/116457/
^ permalink raw reply
* [PATCH Kernel-3.1.0] mdio-gpio: Add reset functionality to mdio-gpio driver.
From: Srinivas KANDAGATLA @ 2011-11-09 13:38 UTC (permalink / raw)
To: netdev; +Cc: davem, stuart.menefy
From: Srinivas Kandagatla <srinivas.kandagatla@st.com>
This patch adds phy reset functionality to mdio-bitbang driver. Now
mdio_gpio_platform_data has new member as function pointer which can be
filled at the bsp level for a callback from phy infrastructure. Also the
mdio-bitbang driver fills-in the reset function of mii_bus structure.
Without this patch the bsp level code has to take care of resetting the
PHYs on the bus, which becomes a bit hacky for every bsp, and the
phy-infrastructure is ignored as well.
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@st.com>
Cc: Stuart Menefy <stuart.menefy@st.com>
---
Hi All,
Recently, in my attempt to get the mdio-bitbang driver working to debug
an issue, I have noticed that mii bus reset functionality is
missing in the mdio-bitbang driver.
This functionality is very much needed to reset devices on the mii-bus.
Without this functionality BSP level code has to add code to do reset
before the mdio-bitbang driver probe is called.
As the mii-bus infrastructure already allows mii-bus drivers to
provide reset function callbacks, using one is a much neater way to get
reset functionality support into the mdio-bitbang driver.
This patch adds reset functionality to mdio-gpio driver.
thanks,
srini
drivers/net/phy/mdio-bitbang.c | 9 +++++++++
drivers/net/phy/mdio-gpio.c | 1 +
include/linux/mdio-bitbang.h | 3 +++
include/linux/mdio-gpio.h | 2 ++
4 files changed, 15 insertions(+), 0 deletions(-)
diff --git a/drivers/net/phy/mdio-bitbang.c b/drivers/net/phy/mdio-bitbang.c
index 6539189..4b99cfc 100644
--- a/drivers/net/phy/mdio-bitbang.c
+++ b/drivers/net/phy/mdio-bitbang.c
@@ -202,6 +202,14 @@ static int mdiobb_write(struct mii_bus *bus, int phy, int reg, u16 val)
return 0;
}
+static int mdiobb_reset(struct mii_bus *bus)
+{
+ struct mdiobb_ctrl *ctrl = bus->priv;
+ if (ctrl->ops->reset)
+ ctrl->ops->reset(bus);
+ return 0;
+}
+
struct mii_bus *alloc_mdio_bitbang(struct mdiobb_ctrl *ctrl)
{
struct mii_bus *bus;
@@ -214,6 +222,7 @@ struct mii_bus *alloc_mdio_bitbang(struct mdiobb_ctrl *ctrl)
bus->read = mdiobb_read;
bus->write = mdiobb_write;
+ bus->reset = mdiobb_reset;
bus->priv = ctrl;
return bus;
diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c
index 2843c90..367e0d0 100644
--- a/drivers/net/phy/mdio-gpio.c
+++ b/drivers/net/phy/mdio-gpio.c
@@ -97,6 +97,7 @@ static struct mii_bus * __devinit mdio_gpio_bus_init(struct device *dev,
bitbang->ctrl.ops = &mdio_gpio_ops;
bitbang->mdc = pdata->mdc;
bitbang->mdio = pdata->mdio;
+ mdio_gpio_ops.reset = pdata->reset;
new_bus = alloc_mdio_bitbang(&bitbang->ctrl);
if (!new_bus)
diff --git a/include/linux/mdio-bitbang.h b/include/linux/mdio-bitbang.h
index 8ea9a42..7c88677 100644
--- a/include/linux/mdio-bitbang.h
+++ b/include/linux/mdio-bitbang.h
@@ -27,6 +27,9 @@ struct mdiobb_ops {
/* Retrieve the state Management Data I/O pin. */
int (*get_mdio_data)(struct mdiobb_ctrl *ctrl);
+
+ /* reset callback */
+ int (*reset)(struct mii_bus *bus);
};
struct mdiobb_ctrl {
diff --git a/include/linux/mdio-gpio.h b/include/linux/mdio-gpio.h
index e9d3fdf..7c9fe3c 100644
--- a/include/linux/mdio-gpio.h
+++ b/include/linux/mdio-gpio.h
@@ -20,6 +20,8 @@ struct mdio_gpio_platform_data {
unsigned int phy_mask;
int irqs[PHY_MAX_ADDR];
+ /* reset callback */
+ int (*reset)(struct mii_bus *bus);
};
#endif /* __LINUX_MDIO_GPIO_H */
--
1.6.3.3
^ permalink raw reply related
* Re: [PATCHv4 4/9] macb: convert printk to netdev_ and friends
From: Jamie Iles @ 2011-11-09 13:37 UTC (permalink / raw)
To: Jamie Iles; +Cc: Joe Perches, netdev, arnd
In-Reply-To: <20111109131421.GC4253@totoro>
On Wed, Nov 09, 2011 at 01:14:21PM +0000, Jamie Iles wrote:
> Hi Joe,
>
> On Wed, Nov 09, 2011 at 05:10:47AM -0800, Joe Perches wrote:
> > On Tue, 2011-11-08 at 14:13 +0000, Jamie Iles wrote:
> > > macb is already using the dev_dbg() and friends helpers so use netdev_()
> > > along with a pr_fmt() definition to make the printing a little cleaner.
> > trivia...
>
> All valid comments. I'll fix all of these up.
>
> Thanks for taking a look!
OK, here's an updated patch. Thanks again Joe!
8<---
Subject: [PATCHv5] macb: convert printk to netdev_ and friends
macb is already using the dev_dbg() and friends helpers so use netdev_()
along with a pr_fmt() definition to make the printing a little cleaner.
v5: - cleanups suggested by Joe Perches
Acked-by: Jean-Christophe PLAGNIOL-VILLARD <plagnioj@jcrosoft.com>
Signed-off-by: Jamie Iles <jamie@jamieiles.com>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Nicolas Ferre <nicolas.ferre@atmel.com>
Tested-by: Jean-Christophe PLAGNIOL-VILLARD <plagnioj@jcrosoft.com>
---
drivers/net/ethernet/cadence/macb.c | 119 ++++++++++++++++------------------
1 files changed, 56 insertions(+), 63 deletions(-)
diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c
index d97d9ce..b171dc2 100644
--- a/drivers/net/ethernet/cadence/macb.c
+++ b/drivers/net/ethernet/cadence/macb.c
@@ -8,6 +8,7 @@
* published by the Free Software Foundation.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/clk.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
@@ -82,7 +83,7 @@ static void __init macb_get_hwaddr(struct macb *bp)
if (is_valid_ether_addr(addr)) {
memcpy(bp->dev->dev_addr, addr, sizeof(addr));
} else {
- dev_info(&bp->pdev->dev, "invalid hw address, using random\n");
+ netdev_info(bp->dev, "invalid hw address, using random\n");
random_ether_addr(bp->dev->dev_addr);
}
}
@@ -176,11 +177,12 @@ static void macb_handle_link_change(struct net_device *dev)
if (status_change) {
if (phydev->link)
- printk(KERN_INFO "%s: link up (%d/%s)\n",
- dev->name, phydev->speed,
- DUPLEX_FULL == phydev->duplex ? "Full":"Half");
+ netdev_info(dev, "link up (%d/%s)\n",
+ phydev->speed,
+ phydev->duplex == DUPLEX_FULL ?
+ "Full" : "Half");
else
- printk(KERN_INFO "%s: link down\n", dev->name);
+ netdev_info(dev, "link down\n");
}
}
@@ -194,7 +196,7 @@ static int macb_mii_probe(struct net_device *dev)
phydev = phy_find_first(bp->mii_bus);
if (!phydev) {
- printk (KERN_ERR "%s: no PHY found\n", dev->name);
+ netdev_err(dev, "no PHY found\n");
return -1;
}
@@ -207,7 +209,7 @@ static int macb_mii_probe(struct net_device *dev)
PHY_INTERFACE_MODE_RMII :
PHY_INTERFACE_MODE_MII);
if (ret) {
- printk(KERN_ERR "%s: Could not attach to PHY\n", dev->name);
+ netdev_err(dev, "Could not attach to PHY\n");
return ret;
}
@@ -301,14 +303,13 @@ static void macb_tx(struct macb *bp)
status = macb_readl(bp, TSR);
macb_writel(bp, TSR, status);
- dev_dbg(&bp->pdev->dev, "macb_tx status = %02lx\n",
- (unsigned long)status);
+ netdev_dbg(bp->dev, "macb_tx status = %02lx\n", (unsigned long)status);
if (status & (MACB_BIT(UND) | MACB_BIT(TSR_RLE))) {
int i;
- printk(KERN_ERR "%s: TX %s, resetting buffers\n",
- bp->dev->name, status & MACB_BIT(UND) ?
- "underrun" : "retry limit exceeded");
+ netdev_err(bp->dev, "TX %s, resetting buffers\n",
+ status & MACB_BIT(UND) ?
+ "underrun" : "retry limit exceeded");
/* Transfer ongoing, disable transmitter, to avoid confusion */
if (status & MACB_BIT(TGO))
@@ -367,8 +368,8 @@ static void macb_tx(struct macb *bp)
if (!(bufstat & MACB_BIT(TX_USED)))
break;
- dev_dbg(&bp->pdev->dev, "skb %u (data %p) TX complete\n",
- tail, skb->data);
+ netdev_dbg(bp->dev, "skb %u (data %p) TX complete\n",
+ tail, skb->data);
dma_unmap_single(&bp->pdev->dev, rp->mapping, skb->len,
DMA_TO_DEVICE);
bp->stats.tx_packets++;
@@ -393,8 +394,8 @@ static int macb_rx_frame(struct macb *bp, unsigned int first_frag,
len = MACB_BFEXT(RX_FRMLEN, bp->rx_ring[last_frag].ctrl);
- dev_dbg(&bp->pdev->dev, "macb_rx_frame frags %u - %u (len %u)\n",
- first_frag, last_frag, len);
+ netdev_dbg(bp->dev, "macb_rx_frame frags %u - %u (len %u)\n",
+ first_frag, last_frag, len);
skb = dev_alloc_skb(len + RX_OFFSET);
if (!skb) {
@@ -435,8 +436,8 @@ static int macb_rx_frame(struct macb *bp, unsigned int first_frag,
bp->stats.rx_packets++;
bp->stats.rx_bytes += len;
- dev_dbg(&bp->pdev->dev, "received skb of length %u, csum: %08x\n",
- skb->len, skb->csum);
+ netdev_dbg(bp->dev, "received skb of length %u, csum: %08x\n",
+ skb->len, skb->csum);
netif_receive_skb(skb);
return 0;
@@ -513,8 +514,8 @@ static int macb_poll(struct napi_struct *napi, int budget)
work_done = 0;
- dev_dbg(&bp->pdev->dev, "poll: status = %08lx, budget = %d\n",
- (unsigned long)status, budget);
+ netdev_dbg(bp->dev, "poll: status = %08lx, budget = %d\n",
+ (unsigned long)status, budget);
work_done = macb_rx(bp, budget);
if (work_done < budget) {
@@ -563,8 +564,7 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
macb_writel(bp, IDR, MACB_RX_INT_FLAGS);
if (napi_schedule_prep(&bp->napi)) {
- dev_dbg(&bp->pdev->dev,
- "scheduling RX softirq\n");
+ netdev_dbg(bp->dev, "scheduling RX softirq\n");
__napi_schedule(&bp->napi);
}
}
@@ -585,11 +585,11 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
if (status & MACB_BIT(HRESP)) {
/*
- * TODO: Reset the hardware, and maybe move the printk
- * to a lower-priority context as well (work queue?)
+ * TODO: Reset the hardware, and maybe move the
+ * netdev_err to a lower-priority context as well
+ * (work queue?)
*/
- printk(KERN_ERR "%s: DMA bus error: HRESP not OK\n",
- dev->name);
+ netdev_err(dev, "DMA bus error: HRESP not OK\n");
}
status = macb_readl(bp, ISR);
@@ -625,15 +625,12 @@ static int macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
#ifdef DEBUG
int i;
- dev_dbg(&bp->pdev->dev,
- "start_xmit: len %u head %p data %p tail %p end %p\n",
- skb->len, skb->head, skb->data,
- skb_tail_pointer(skb), skb_end_pointer(skb));
- dev_dbg(&bp->pdev->dev,
- "data:");
- for (i = 0; i < 16; i++)
- printk(" %02x", (unsigned int)skb->data[i]);
- printk("\n");
+ netdev_dbg(bp->dev,
+ "start_xmit: len %u head %p data %p tail %p end %p\n",
+ skb->len, skb->head, skb->data,
+ skb_tail_pointer(skb), skb_end_pointer(skb));
+ print_hex_dump(KERN_DEBUG, "data: ", DUMP_PREFIX_OFFSET, 16, 1,
+ skb->data, 16, true);
#endif
len = skb->len;
@@ -643,21 +640,20 @@ static int macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
if (TX_BUFFS_AVAIL(bp) < 1) {
netif_stop_queue(dev);
spin_unlock_irqrestore(&bp->lock, flags);
- dev_err(&bp->pdev->dev,
- "BUG! Tx Ring full when queue awake!\n");
- dev_dbg(&bp->pdev->dev, "tx_head = %u, tx_tail = %u\n",
- bp->tx_head, bp->tx_tail);
+ netdev_err(bp->dev, "BUG! Tx Ring full when queue awake!\n");
+ netdev_dbg(bp->dev, "tx_head = %u, tx_tail = %u\n",
+ bp->tx_head, bp->tx_tail);
return NETDEV_TX_BUSY;
}
entry = bp->tx_head;
- dev_dbg(&bp->pdev->dev, "Allocated ring entry %u\n", entry);
+ netdev_dbg(bp->dev, "Allocated ring entry %u\n", entry);
mapping = dma_map_single(&bp->pdev->dev, skb->data,
len, DMA_TO_DEVICE);
bp->tx_skb[entry].skb = skb;
bp->tx_skb[entry].mapping = mapping;
- dev_dbg(&bp->pdev->dev, "Mapped skb data %p to DMA addr %08lx\n",
- skb->data, (unsigned long)mapping);
+ netdev_dbg(bp->dev, "Mapped skb data %p to DMA addr %08lx\n",
+ skb->data, (unsigned long)mapping);
ctrl = MACB_BF(TX_FRMLEN, len);
ctrl |= MACB_BIT(TX_LAST);
@@ -721,27 +717,27 @@ static int macb_alloc_consistent(struct macb *bp)
&bp->rx_ring_dma, GFP_KERNEL);
if (!bp->rx_ring)
goto out_err;
- dev_dbg(&bp->pdev->dev,
- "Allocated RX ring of %d bytes at %08lx (mapped %p)\n",
- size, (unsigned long)bp->rx_ring_dma, bp->rx_ring);
+ netdev_dbg(bp->dev,
+ "Allocated RX ring of %d bytes at %08lx (mapped %p)\n",
+ size, (unsigned long)bp->rx_ring_dma, bp->rx_ring);
size = TX_RING_BYTES;
bp->tx_ring = dma_alloc_coherent(&bp->pdev->dev, size,
&bp->tx_ring_dma, GFP_KERNEL);
if (!bp->tx_ring)
goto out_err;
- dev_dbg(&bp->pdev->dev,
- "Allocated TX ring of %d bytes at %08lx (mapped %p)\n",
- size, (unsigned long)bp->tx_ring_dma, bp->tx_ring);
+ netdev_dbg(bp->dev,
+ "Allocated TX ring of %d bytes at %08lx (mapped %p)\n",
+ size, (unsigned long)bp->tx_ring_dma, bp->tx_ring);
size = RX_RING_SIZE * RX_BUFFER_SIZE;
bp->rx_buffers = dma_alloc_coherent(&bp->pdev->dev, size,
&bp->rx_buffers_dma, GFP_KERNEL);
if (!bp->rx_buffers)
goto out_err;
- dev_dbg(&bp->pdev->dev,
- "Allocated RX buffers of %d bytes at %08lx (mapped %p)\n",
- size, (unsigned long)bp->rx_buffers_dma, bp->rx_buffers);
+ netdev_dbg(bp->dev,
+ "Allocated RX buffers of %d bytes at %08lx (mapped %p)\n",
+ size, (unsigned long)bp->rx_buffers_dma, bp->rx_buffers);
return 0;
@@ -952,7 +948,7 @@ static int macb_open(struct net_device *dev)
struct macb *bp = netdev_priv(dev);
int err;
- dev_dbg(&bp->pdev->dev, "open\n");
+ netdev_dbg(bp->dev, "open\n");
/* if the phy is not yet register, retry later*/
if (!bp->phy_dev)
@@ -963,9 +959,8 @@ static int macb_open(struct net_device *dev)
err = macb_alloc_consistent(bp);
if (err) {
- printk(KERN_ERR
- "%s: Unable to allocate DMA memory (error %d)\n",
- dev->name, err);
+ netdev_err(dev, "Unable to allocate DMA memory (error %d)\n",
+ err);
return err;
}
@@ -1174,9 +1169,8 @@ static int __init macb_probe(struct platform_device *pdev)
dev->irq = platform_get_irq(pdev, 0);
err = request_irq(dev->irq, macb_interrupt, 0, dev->name, dev);
if (err) {
- printk(KERN_ERR
- "%s: Unable to request IRQ %d (error %d)\n",
- dev->name, dev->irq, err);
+ dev_err(&pdev->dev, "Unable to request IRQ %d (error %d)\n",
+ dev->irq, err);
goto err_out_iounmap;
}
@@ -1228,13 +1222,12 @@ static int __init macb_probe(struct platform_device *pdev)
platform_set_drvdata(pdev, dev);
- printk(KERN_INFO "%s: Atmel MACB at 0x%08lx irq %d (%pM)\n",
- dev->name, dev->base_addr, dev->irq, dev->dev_addr);
+ netdev_info(dev, "Atmel MACB at 0x%08lx irq %d (%pM)\n",
+ dev->base_addr, dev->irq, dev->dev_addr);
phydev = bp->phy_dev;
- printk(KERN_INFO "%s: attached PHY driver [%s] "
- "(mii_bus:phy_addr=%s, irq=%d)\n", dev->name,
- phydev->drv->name, dev_name(&phydev->dev), phydev->irq);
+ netdev_info(dev, "attached PHY driver [%s] (mii_bus:phy_addr=%s, irq=%d)\n",
+ phydev->drv->name, dev_name(&phydev->dev), phydev->irq);
return 0;
--
1.7.4.1
^ permalink raw reply related
* Re-profiling funds.
From: Liu Wang @ 2011-11-09 13:33 UTC (permalink / raw)
I am Mr. Liu Wang, an official with the International bank of Taipei,Taiwan. I
ask for your partnership in re-profiling funds.
^ permalink raw reply
* Re: [PATCH] flow_cache_flush soft lockup with heavy ipsec traffic
From: Eric Dumazet @ 2011-11-09 13:16 UTC (permalink / raw)
To: Maris Paupe; +Cc: netdev
In-Reply-To: <4EBA7038.4050702@mt.lv>
Le mercredi 09 novembre 2011 à 14:21 +0200, Maris Paupe a écrit :
> During ipsec packet processing flow_cache_flush() may get called which
> creates flow_cache_gc_taklet(), this function is guarded by mutex and
> waits until all tasklets are finished before releasing it, another
> softirq may happen during flow_cache_gc_taklet(), in case when this irq
> is packet reading from a device, it can happen that flow_cache_flush()
> gets called again and a deadlock occurs.
> Here i purpose a simple fix to this problem by disabling softirqs during
> tasklet process. It could also be fixed in ipsec processing code, but I
> am too unfamiliar with it to touch it.
>
> Signed-off-by: Maris Paupe <marisp@mt.lv>
>
> diff --git a/net/core/flow.c b/net/core/flow.c
> index 8ae42de..19ff283 100644
> --- a/net/core/flow.c
> +++ b/net/core/flow.c
> @@ -105,6 +105,7 @@ static void flow_cache_gc_task(struct work_struct *work)
> struct list_head gc_list;
> struct flow_cache_entry *fce, *n;
>
> + local_bh_disable();
> INIT_LIST_HEAD(&gc_list);
> spin_lock_bh(&flow_cache_gc_lock);
> list_splice_tail_init(&flow_cache_gc_list, &gc_list);
> @@ -112,6 +113,7 @@ static void flow_cache_gc_task(struct work_struct *work)
>
> list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
> flow_entry_kill(fce);
> + local_bh_enable();
> }
Sorry, I don't understand your patch.
BH are disabled by the spin_lock_bh() call.
Once flow_cache_entry are in garbage list, nothing but garbage collector
can access them. I see no possible deadlock. Or there is a bug somewhere
and your patch avoids it.
Whole point of using a work queue to perform garbage collect was to not
hold BH too long (allowing softirq to process incoming packets), so you
basically remove what was done in commit 8e4795605d.
Could you explain the problem you have ? Any stack trace or something ?
^ permalink raw reply
* Re: [PATCHv4 4/9] macb: convert printk to netdev_ and friends
From: Jamie Iles @ 2011-11-09 13:14 UTC (permalink / raw)
To: Joe Perches; +Cc: Jamie Iles, netdev, arnd
In-Reply-To: <1320844247.6923.18.camel@Joe-Laptop>
Hi Joe,
On Wed, Nov 09, 2011 at 05:10:47AM -0800, Joe Perches wrote:
> On Tue, 2011-11-08 at 14:13 +0000, Jamie Iles wrote:
> > macb is already using the dev_dbg() and friends helpers so use netdev_()
> > along with a pr_fmt() definition to make the printing a little cleaner.
> trivia...
All valid comments. I'll fix all of these up.
Thanks for taking a look!
Jamie
^ permalink raw reply
* Re: [PATCHv4 4/9] macb: convert printk to netdev_ and friends
From: Joe Perches @ 2011-11-09 13:10 UTC (permalink / raw)
To: Jamie Iles; +Cc: netdev, arnd
In-Reply-To: <1320761613-18641-5-git-send-email-jamie@jamieiles.com>
On Tue, 2011-11-08 at 14:13 +0000, Jamie Iles wrote:
> macb is already using the dev_dbg() and friends helpers so use netdev_()
> along with a pr_fmt() definition to make the printing a little cleaner.
trivia...
> diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c
[]
> @@ -8,6 +8,7 @@
> * published by the Free Software Foundation.
> */
>
> +#define pr_fmt(fmt) "macb: " fmt
I think this is better as
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> @@ -176,11 +177,11 @@ static void macb_handle_link_change(struct net_device *dev)
>
> if (status_change) {
> if (phydev->link)
> - printk(KERN_INFO "%s: link up (%d/%s)\n",
> - dev->name, phydev->speed,
> - DUPLEX_FULL == phydev->duplex ? "Full":"Half");
> + netdev_info(dev, "link up (%d/%s)\n", phydev->speed,
> + DUPLEX_FULL == phydev->duplex ?
> + "Full" : "Half");
Couple of very trivial style things here.
I think this is better as var == const and I also try
to keep arguments on a single line where possible and,
when not possible, to move the arguments after the format
to separate lines.
netdev_info(dev, "link up (%d/%s)\n",
phydev->speed,
phydev->duplex == DUPLEX_FULL ?
"Full" : "Half");
[]
> @@ -625,12 +624,11 @@ static int macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
>
> #ifdef DEBUG
> int i;
> - dev_dbg(&bp->pdev->dev,
> - "start_xmit: len %u head %p data %p tail %p end %p\n",
> - skb->len, skb->head, skb->data,
> - skb_tail_pointer(skb), skb_end_pointer(skb));
> - dev_dbg(&bp->pdev->dev,
> - "data:");
> + netdev_dbg(bp->dev,
> + "start_xmit: len %u head %p data %p tail %p end %p\n",
> + skb->len, skb->head, skb->data,
> + skb_tail_pointer(skb), skb_end_pointer(skb));
> + netdev_dbg(bp->dev, "data:");
> for (i = 0; i < 16; i++)
> printk(" %02x", (unsigned int)skb->data[i]);
> printk("\n");
Maybe print_hex_dump
> @@ -1228,13 +1223,13 @@ static int __init macb_probe(struct platform_device *pdev)
>
> platform_set_drvdata(pdev, dev);
>
> - printk(KERN_INFO "%s: Atmel MACB at 0x%08lx irq %d (%pM)\n",
> - dev->name, dev->base_addr, dev->irq, dev->dev_addr);
> + netdev_info(dev, "Atmel MACB at 0x%08lx irq %d (%pM)\n",
> + dev->base_addr, dev->irq, dev->dev_addr);
>
> phydev = bp->phy_dev;
> - printk(KERN_INFO "%s: attached PHY driver [%s] "
> - "(mii_bus:phy_addr=%s, irq=%d)\n", dev->name,
> - phydev->drv->name, dev_name(&phydev->dev), phydev->irq);
> + netdev_info(dev, "attached PHY driver [%s] "
> + "(mii_bus:phy_addr=%s, irq=%d)\n", phydev->drv->name,
> + dev_name(&phydev->dev), phydev->irq);
Coalescing formats can also be done to make
it slightly easier to grep.
netdev_info(dev, "attached PHY driver [%s] (mii_bus:phy_addr=%s, irq=%d)\n",
phydev->drv->name, dev_name(&phydev->dev), phydev->irq);
^ permalink raw reply
* Re: dst->obsolete has become pointless
From: Joe Perches @ 2011-11-09 12:49 UTC (permalink / raw)
To: David Miller; +Cc: steffen.klassert, netdev, timo.teras
In-Reply-To: <20111108.135901.1506278599930259562.davem@davemloft.net>
On Tue, 2011-11-08 at 13:59 -0500, David Miller wrote:
> net: Kill pointless and misleading checks on dst->obsolete.
[]
> Therefore rename it to dst->freed, and make it take on only the values
> "0" and "1".
> diff --git a/include/net/dst.h b/include/net/dst.h
[]
> @@ -55,7 +55,7 @@ struct dst_entry {
> #define DST_NOCOUNT 0x0020
>
> short error;
> - short obsolete;
> + unsigned short freed;
perhaps
bool freed;
bool __pad3;
just to mark the available space a bit more obviously.
^ permalink raw reply
* Re: [PATCH 3/4] ipv4: Fix inetpeer expiration handling
From: Steffen Klassert @ 2011-11-09 12:47 UTC (permalink / raw)
To: Gao feng; +Cc: David Miller, netdev
In-Reply-To: <4E9FBEA1.2050407@cn.fujitsu.com>
On Thu, Oct 20, 2011 at 02:24:33PM +0800, Gao feng wrote:
>
> there are several problems.
> 1: rt->peer may be null; we should call rt_bind_peer just like the code below.
If rt->peer is NULL, rt_bind_peer() sets rt->rt_peer_genid = rt_peer_genid().
So your check for rt->rt_peer_genid != rt_peer_genid() is false then and
creates cases where an unchecked peer is bound to a route.
> 2: rt->peer_pmtu_orig is null. If we haven't sent a packet before, the func check_peer_pmtu hasn't been called,
> so the peer->pmtu_orig is null.
If a peer is bound to a route during slow path route lookup, the peer
should be properly initialized with rt_init_metrics(). So
rt->peer_pmtu_orig should not be null here as far as I can see.
I still think that my original patch fixes the problem.
^ permalink raw reply
* [PATCH] iMX28 Ethernet driver fix
From: Peter Horton @ 2011-11-09 12:44 UTC (permalink / raw)
To: netdev; +Cc: linux-arm-kernel
Fix driver to correctly handle cloned SKBs. Currently any clones
of the transmit SKBs get endian-swapped by the driver. This breaks
the ATA-over-Ethernet driver for example.
Signed-off-by: Peter Horton <phorton@bitbox.co.uk>
---
drivers/net/ethernet/freescale/fec.c | 8 +++++++-
1 files changed, 7 insertions(+), 1 deletions(-)
diff --git a/drivers/net/ethernet/freescale/fec.c b/drivers/net/ethernet/freescale/fec.c
index 1124ce0..aff1fa9 100644
--- a/drivers/net/ethernet/freescale/fec.c
+++ b/drivers/net/ethernet/freescale/fec.c
@@ -312,8 +312,14 @@ fec_enet_start_xmit(struct sk_buff *skb, struct net_device *ndev)
* On some FEC implementations data must be aligned on
* 4-byte boundaries. Use bounce buffers to copy data
* and get it aligned. Ugh.
+ *
+ * We also have to use the bounce buffers if we need
+ * to do the endian swap (see below) on a cloned buffer.
*/
- if (((unsigned long) bufaddr) & FEC_ALIGNMENT) {
+ if ((((unsigned long) bufaddr) & FEC_ALIGNMENT) ||
+ ((id_entry->driver_data & FEC_QUIRK_SWAP_FRAME) &&
+ skb_cloned(skb)))
+ {
unsigned int index;
index = bdp - fep->tx_bd_base;
memcpy(fep->tx_bounce[index], skb->data, skb->len);
--
1.7.2.5
^ permalink raw reply related
* [PATCH] flow_cache_flush soft lockup with heavy ipsec traffic
From: Maris Paupe @ 2011-11-09 12:21 UTC (permalink / raw)
To: netdev
During ipsec packet processing flow_cache_flush() may get called which
creates flow_cache_gc_taklet(), this function is guarded by mutex and
waits until all tasklets are finished before releasing it, another
softirq may happen during flow_cache_gc_taklet(), in case when this irq
is packet reading from a device, it can happen that flow_cache_flush()
gets called again and a deadlock occurs.
Here I propose a simple fix to this problem by disabling softirqs during
tasklet processing. It could also be fixed in ipsec processing code, but I
am too unfamiliar with it to touch it.
Signed-off-by: Maris Paupe <marisp@mt.lv>
diff --git a/net/core/flow.c b/net/core/flow.c
index 8ae42de..19ff283 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -105,6 +105,7 @@ static void flow_cache_gc_task(struct work_struct *work)
struct list_head gc_list;
struct flow_cache_entry *fce, *n;
+ local_bh_disable();
INIT_LIST_HEAD(&gc_list);
spin_lock_bh(&flow_cache_gc_lock);
list_splice_tail_init(&flow_cache_gc_list, &gc_list);
@@ -112,6 +113,7 @@ static void flow_cache_gc_task(struct work_struct *work)
list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
flow_entry_kill(fce);
+ local_bh_enable();
}
static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task);
^ permalink raw reply related
* Re: [PATCH 2/4] ipv4: Update pmtu informations on inetpeer only for output routes
From: Steffen Klassert @ 2011-11-09 12:11 UTC (permalink / raw)
To: David Miller; +Cc: netdev
In-Reply-To: <20111108.143630.106539981030509701.davem@davemloft.net>
On Tue, Nov 08, 2011 at 02:36:30PM -0500, David Miller wrote:
>
> What I think you can do to solve this problem is explicitly use
> dst->ops->default_mtu() in ip_forward() instead of dst_mtu().
>
> That way you won't use the cached PMTU for input routes.
Yes, I already had something like that in mind. I'll do
a patch to fix it in this manner.
^ permalink raw reply
* Re: [PATCH 1/4] ipv4: Fix pmtu propagating
From: Steffen Klassert @ 2011-11-09 12:08 UTC (permalink / raw)
To: David Miller; +Cc: gaofeng, netdev
In-Reply-To: <20111108.143302.907625740390232791.davem@davemloft.net>
On Tue, Nov 08, 2011 at 02:33:02PM -0500, David Miller wrote:
> From: David Miller <davem@davemloft.net>
> Date: Tue, 08 Nov 2011 14:19:50 -0500 (EST)
>
> > I suspect that your real problem has nothing to do with UDP or RAW,
> > but rather the issue is that entries already in the routing cache
> > with a NULL peer need to be refreshed with peer information created
> > in another context.
Yes, that's the problem.
>
> So you want something like this patch:
>
Originally, I wanted to fix it with the patch below.
Given the fact that dst->obsolete is not null, this should
do the same as your patch for output routes. During the
tests with this patch I noticed a problem with that.
Unfortunately I can't remember what it was...
I'll do some investigating, perhaps I can recall it.
I did some quick tests with this and with your patch and both
seem to fix the problem at first glance.
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 511f4a7..ac189c9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2723,7 +2723,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
!((rth->rt_key_tos ^ flp4->flowi4_tos) &
(IPTOS_RT_MASK | RTO_ONLINK)) &&
net_eq(dev_net(rth->dst.dev), net) &&
- !rt_is_expired(rth)) {
+ (rth = (struct rtable *) dst_check(&rth->dst, 0)) && rth) {
dst_use(&rth->dst, jiffies);
RT_CACHE_STAT_INC(out_hit);
rcu_read_unlock_bh();
--
1.7.0.4
^ permalink raw reply related
* [PATCH] Phonet: set the pipe handle using setsockopt
From: Hemant Vilas RAMDASI @ 2011-11-09 11:20 UTC (permalink / raw)
To: remi.denis-courmont; +Cc: netdev, Dinesh Kumar Sharma, Hemant Ramdasi
From: Dinesh Kumar Sharma <dinesh.sharma@stericsson.com>
This provides the flexibility to set the pipe handle
using setsockopt and to enable the pipe the same way.
Signed-off-by: Hemant Ramdasi <hemant.ramdasi@stericsson.com>
Signed-off-by: Dinesh Kumar Sharma <dinesh.sharma@stericsson.com>
---
include/linux/phonet.h | 2 +
net/phonet/pep.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 86 insertions(+), 2 deletions(-)
diff --git a/include/linux/phonet.h b/include/linux/phonet.h
index 6fb1384..491caec 100644
--- a/include/linux/phonet.h
+++ b/include/linux/phonet.h
@@ -37,6 +37,8 @@
#define PNPIPE_ENCAP 1
#define PNPIPE_IFINDEX 2
#define PNPIPE_HANDLE 3
+#define PNPIPE_ENABLE 4
+#define PNPIPE_INITSTATE 5
#define PNADDR_ANY 0
#define PNADDR_BROADCAST 0xFC
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index f17fd84..3109563 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -167,6 +167,12 @@ static int pipe_handler_send_created_ind(struct sock *sk)
data, 4, GFP_ATOMIC);
}
+static int pipe_handler_send_enabled_ind(struct sock *sk)
+{
+ return pep_indicate(sk, PNS_PIPE_ENABLED_IND, 0 /* sub-blocks */,
+ NULL, 0, GFP_ATOMIC);
+}
+
static int pep_accept_conn(struct sock *sk, struct sk_buff *skb)
{
static const u8 data[20] = {
@@ -533,6 +539,17 @@ static int pep_connresp_rcv(struct sock *sk, struct sk_buff *skb)
return pipe_handler_send_created_ind(sk);
}
+static int pep_enableresp_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ struct pnpipehdr *hdr = pnp_hdr(skb);
+
+ if (hdr->error_code != PN_PIPE_NO_ERROR)
+ return -ECONNREFUSED;
+
+ return pipe_handler_send_enabled_ind(sk);
+}
+
+
/* Queue an skb to an actively connected sock.
* Socket lock must be held. */
static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb)
@@ -578,6 +595,28 @@ static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb)
sk->sk_state = TCP_CLOSE_WAIT;
break;
}
+ if (pn->init_enable == PN_PIPE_DISABLE)
+ sk->sk_state = TCP_SYN_RECV;
+ else {
+ sk->sk_state = TCP_ESTABLISHED;
+
+ if (!pn_flow_safe(pn->tx_fc)) {
+ atomic_set(&pn->tx_credits, 1);
+ sk->sk_write_space(sk);
+ }
+ pipe_grant_credits(sk, GFP_ATOMIC);
+
+ }
+ break;
+
+ case PNS_PEP_ENABLE_RESP:
+ if (sk->sk_state != TCP_SYN_SENT)
+ break;
+
+ if (pep_enableresp_rcv(sk, skb)) {
+ sk->sk_state = TCP_CLOSE_WAIT;
+ break;
+ }
sk->sk_state = TCP_ESTABLISHED;
if (!pn_flow_safe(pn->tx_fc)) {
@@ -863,9 +902,27 @@ static int pep_sock_connect(struct sock *sk, struct sockaddr *addr, int len)
int err;
u8 data[4] = { 0 /* sub-blocks */, PAD, PAD, PAD };
- pn->pipe_handle = 1; /* anything but INVALID_HANDLE */
+ if (pn->pipe_handle == PN_PIPE_INVALID_HANDLE)
+ pn->pipe_handle = 1; /* anything but INVALID_HANDLE */
+
err = pipe_handler_request(sk, PNS_PEP_CONNECT_REQ,
- PN_PIPE_ENABLE, data, 4);
+ pn->init_enable, data, 4);
+
+ if (err) {
+ pn->pipe_handle = PN_PIPE_INVALID_HANDLE;
+ return err;
+ }
+ sk->sk_state = TCP_SYN_SENT;
+ return 0;
+}
+
+static int pep_sock_enable(struct sock *sk, struct sockaddr *addr, int len)
+{
+ struct pep_sock *pn = pep_sk(sk);
+ int err;
+
+ err = pipe_handler_request(sk, PNS_PEP_ENABLE_REQ, PAD,
+ NULL, 0);
if (err) {
pn->pipe_handle = PN_PIPE_INVALID_HANDLE;
return err;
@@ -959,6 +1016,24 @@ static int pep_setsockopt(struct sock *sk, int level, int optname,
}
goto out_norel;
+ case PNPIPE_HANDLE:
+ if (val)
+ pn->pipe_handle = val;
+ else
+ err = -EINVAL;
+ break;
+
+ case PNPIPE_ENABLE:
+ err = pep_sock_enable(sk, NULL, 0);
+ break;
+
+ case PNPIPE_INITSTATE:
+ if ((val == PN_PIPE_DISABLE) || (val == PN_PIPE_ENABLE))
+ pn->init_enable = val;
+ else
+ err = -EINVAL;
+ break;
+
default:
err = -ENOPROTOOPT;
}
@@ -994,6 +1069,13 @@ static int pep_getsockopt(struct sock *sk, int level, int optname,
return -EINVAL;
break;
+ case PNPIPE_ENABLE:
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return -EINVAL;
+ else
+ val = 1;
+ break;
+
default:
return -ENOPROTOOPT;
}
--
1.7.4.3
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox