Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 6/18] netfilter: ipt_ULOG: Move away from NLMSG_PUT().
From: David Miller @ 2012-06-27  5:02 UTC (permalink / raw)
  To: netdev


And use nlmsg_data() while we're here too.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/ipt_ULOG.c |   15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index ba5756d..99b3f53 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -196,12 +196,15 @@ static void ipt_ulog_packet(unsigned int hooknum,
 
 	pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold);
 
-	/* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */
-	nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
-			sizeof(*pm)+copy_len);
+	nlh = nlmsg_put(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
+			sizeof(*pm)+copy_len, 0);
+	if (!nlh) {
+		pr_debug("error during nlmsg_put\n");
+		goto out_unlock;
+	}
 	ub->qlen++;
 
-	pm = NLMSG_DATA(nlh);
+	pm = nlmsg_data(nlh);
 
 	/* We might not have a timestamp, get one */
 	if (skb->tstamp.tv64 == 0)
@@ -261,13 +264,11 @@ static void ipt_ulog_packet(unsigned int hooknum,
 			nlh->nlmsg_type = NLMSG_DONE;
 		ulog_send(groupnum);
 	}
-
+out_unlock:
 	spin_unlock_bh(&ulog_lock);
 
 	return;
 
-nlmsg_failure:
-	pr_debug("error during NLMSG_PUT\n");
 alloc_failure:
 	pr_debug("Error building netlink message\n");
 	spin_unlock_bh(&ulog_lock);
-- 
1.7.10.2

^ permalink raw reply related

* [PATCH 7/18] netfilter: nfnetlink_log: Move away from NLMSG_PUT().
From: David Miller @ 2012-06-27  5:02 UTC (permalink / raw)
  To: netdev


And use nlmsg_data() while we're here too.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/nfnetlink_log.c |   29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 3c3cfc0..169ab59 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -326,18 +326,20 @@ __nfulnl_send(struct nfulnl_instance *inst)
 {
 	int status = -1;
 
-	if (inst->qlen > 1)
-		NLMSG_PUT(inst->skb, 0, 0,
-			  NLMSG_DONE,
-			  sizeof(struct nfgenmsg));
-
+	if (inst->qlen > 1) {
+		struct nlmsghdr *nlh = nlmsg_put(inst->skb, 0, 0,
+						 NLMSG_DONE,
+						 sizeof(struct nfgenmsg),
+						 0);
+		if (!nlh)
+			goto out;
+	}
 	status = nfnetlink_unicast(inst->skb, &init_net, inst->peer_pid,
 				   MSG_DONTWAIT);
 
 	inst->qlen = 0;
 	inst->skb = NULL;
-
-nlmsg_failure:
+out:
 	return status;
 }
 
@@ -380,10 +382,12 @@ __build_packet_message(struct nfulnl_instance *inst,
 	struct nfgenmsg *nfmsg;
 	sk_buff_data_t old_tail = inst->skb->tail;
 
-	nlh = NLMSG_PUT(inst->skb, 0, 0,
+	nlh = nlmsg_put(inst->skb, 0, 0,
 			NFNL_SUBSYS_ULOG << 8 | NFULNL_MSG_PACKET,
-			sizeof(struct nfgenmsg));
-	nfmsg = NLMSG_DATA(nlh);
+			sizeof(struct nfgenmsg), 0);
+	if (!nlh)
+		return -1;
+	nfmsg = nlmsg_data(nlh);
 	nfmsg->nfgen_family = pf;
 	nfmsg->version = NFNETLINK_V0;
 	nfmsg->res_id = htons(inst->group_num);
@@ -526,7 +530,7 @@ __build_packet_message(struct nfulnl_instance *inst,
 
 		if (skb_tailroom(inst->skb) < nla_total_size(data_len)) {
 			printk(KERN_WARNING "nfnetlink_log: no tailroom!\n");
-			goto nlmsg_failure;
+			return -1;
 		}
 
 		nla = (struct nlattr *)skb_put(inst->skb, nla_total_size(data_len));
@@ -540,7 +544,6 @@ __build_packet_message(struct nfulnl_instance *inst,
 	nlh->nlmsg_len = inst->skb->tail - old_tail;
 	return 0;
 
-nlmsg_failure:
 nla_put_failure:
 	PRINTR(KERN_ERR "nfnetlink_log: error creating log nlmsg\n");
 	return -1;
@@ -745,7 +748,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
 		   const struct nlmsghdr *nlh,
 		   const struct nlattr * const nfula[])
 {
-	struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	u_int16_t group_num = ntohs(nfmsg->res_id);
 	struct nfulnl_instance *inst;
 	struct nfulnl_msg_config_cmd *cmd = NULL;
-- 
1.7.10.2

^ permalink raw reply related

* [PATCH 8/18] netfilter: nfnetlink_queue_core: Move away from NLMSG_PUT().
From: David Miller @ 2012-06-27  5:02 UTC (permalink / raw)
  To: netdev


And use nlmsg_data() while we're here too.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/nfnetlink_queue_core.c |   22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
index d36b95e..a0b6492 100644
--- a/net/netfilter/nfnetlink_queue_core.c
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -274,13 +274,17 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 
 	skb = alloc_skb(size, GFP_ATOMIC);
 	if (!skb)
-		goto nlmsg_failure;
+		return NULL;
 
 	old_tail = skb->tail;
-	nlh = NLMSG_PUT(skb, 0, 0,
+	nlh = nlmsg_put(skb, 0, 0,
 			NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET,
-			sizeof(struct nfgenmsg));
-	nfmsg = NLMSG_DATA(nlh);
+			sizeof(struct nfgenmsg), 0);
+	if (!nlh) {
+		kfree_skb(skb);
+		return NULL;
+	}
+	nfmsg = nlmsg_data(nlh);
 	nfmsg->nfgen_family = entry->pf;
 	nfmsg->version = NFNETLINK_V0;
 	nfmsg->res_id = htons(queue->queue_num);
@@ -383,7 +387,8 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 
 		if (skb_tailroom(skb) < nla_total_size(data_len)) {
 			printk(KERN_WARNING "nf_queue: no tailroom!\n");
-			goto nlmsg_failure;
+			kfree_skb(skb);
+			return NULL;
 		}
 
 		nla = (struct nlattr *)skb_put(skb, nla_total_size(data_len));
@@ -400,7 +405,6 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 	nlh->nlmsg_len = skb->tail - old_tail;
 	return skb;
 
-nlmsg_failure:
 nla_put_failure:
 	if (skb)
 		kfree_skb(skb);
@@ -686,7 +690,7 @@ nfqnl_recv_verdict_batch(struct sock *ctnl, struct sk_buff *skb,
 		   const struct nlmsghdr *nlh,
 		   const struct nlattr * const nfqa[])
 {
-	struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	struct nf_queue_entry *entry, *tmp;
 	unsigned int verdict, maxid;
 	struct nfqnl_msg_verdict_hdr *vhdr;
@@ -732,7 +736,7 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
 		   const struct nlmsghdr *nlh,
 		   const struct nlattr * const nfqa[])
 {
-	struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	u_int16_t queue_num = ntohs(nfmsg->res_id);
 
 	struct nfqnl_msg_verdict_hdr *vhdr;
@@ -806,7 +810,7 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
 		  const struct nlmsghdr *nlh,
 		  const struct nlattr * const nfqa[])
 {
-	struct nfgenmsg *nfmsg = NLMSG_DATA(nlh);
+	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	u_int16_t queue_num = ntohs(nfmsg->res_id);
 	struct nfqnl_instance *queue;
 	struct nfqnl_msg_config_cmd *cmd = NULL;
-- 
1.7.10.2

^ permalink raw reply related

* [PATCH 9/18] pkt_sched: act_api: Move away from NLMSG_PUT().
From: David Miller @ 2012-06-27  5:02 UTC (permalink / raw)
  To: netdev


Move away from NLMSG_NEW() as well.

And use nlmsg_data() while we're here too.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_api.c |   59 +++++++++++++++++++++++++++------------------------
 1 file changed, 31 insertions(+), 28 deletions(-)

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 5cfb160..e3d2c78 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -652,27 +652,27 @@ tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq,
 	unsigned char *b = skb_tail_pointer(skb);
 	struct nlattr *nest;
 
-	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags);
-
-	t = NLMSG_DATA(nlh);
+	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*t), flags);
+	if (!nlh)
+		goto out_nlmsg_trim;
+	t = nlmsg_data(nlh);
 	t->tca_family = AF_UNSPEC;
 	t->tca__pad1 = 0;
 	t->tca__pad2 = 0;
 
 	nest = nla_nest_start(skb, TCA_ACT_TAB);
 	if (nest == NULL)
-		goto nla_put_failure;
+		goto out_nlmsg_trim;
 
 	if (tcf_action_dump(skb, a, bind, ref) < 0)
-		goto nla_put_failure;
+		goto out_nlmsg_trim;
 
 	nla_nest_end(skb, nest);
 
 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 	return skb->len;
 
-nla_put_failure:
-nlmsg_failure:
+out_nlmsg_trim:
 	nlmsg_trim(skb, b);
 	return -1;
 }
@@ -799,19 +799,21 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
 	if (a->ops == NULL)
 		goto err_out;
 
-	nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t));
-	t = NLMSG_DATA(nlh);
+	nlh = nlmsg_put(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t), 0);
+	if (!nlh)
+		goto out_module_put;
+	t = nlmsg_data(nlh);
 	t->tca_family = AF_UNSPEC;
 	t->tca__pad1 = 0;
 	t->tca__pad2 = 0;
 
 	nest = nla_nest_start(skb, TCA_ACT_TAB);
 	if (nest == NULL)
-		goto nla_put_failure;
+		goto out_module_put;
 
 	err = a->ops->walk(skb, &dcb, RTM_DELACTION, a);
 	if (err < 0)
-		goto nla_put_failure;
+		goto out_module_put;
 	if (err == 0)
 		goto noflush_out;
 
@@ -828,8 +830,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
 
 	return err;
 
-nla_put_failure:
-nlmsg_failure:
+out_module_put:
 	module_put(a->ops->owner);
 err_out:
 noflush_out:
@@ -919,18 +920,20 @@ static int tcf_add_notify(struct net *net, struct tc_action *a,
 
 	b = skb_tail_pointer(skb);
 
-	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags);
-	t = NLMSG_DATA(nlh);
+	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*t), flags);
+	if (!nlh)
+		goto out_kfree_skb;
+	t = nlmsg_data(nlh);
 	t->tca_family = AF_UNSPEC;
 	t->tca__pad1 = 0;
 	t->tca__pad2 = 0;
 
 	nest = nla_nest_start(skb, TCA_ACT_TAB);
 	if (nest == NULL)
-		goto nla_put_failure;
+		goto out_kfree_skb;
 
 	if (tcf_action_dump(skb, a, 0, 0) < 0)
-		goto nla_put_failure;
+		goto out_kfree_skb;
 
 	nla_nest_end(skb, nest);
 
@@ -942,8 +945,7 @@ static int tcf_add_notify(struct net *net, struct tc_action *a,
 		err = 0;
 	return err;
 
-nla_put_failure:
-nlmsg_failure:
+out_kfree_skb:
 	kfree_skb(skb);
 	return -1;
 }
@@ -1062,7 +1064,7 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 	struct tc_action_ops *a_o;
 	struct tc_action a;
 	int ret = 0;
-	struct tcamsg *t = (struct tcamsg *) NLMSG_DATA(cb->nlh);
+	struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh);
 	struct nlattr *kind = find_dump_kind(cb->nlh);
 
 	if (kind == NULL) {
@@ -1080,23 +1082,25 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 	if (a_o->walk == NULL) {
 		WARN(1, "tc_dump_action: %s !capable of dumping table\n",
 		     a_o->kind);
-		goto nla_put_failure;
+		goto out_module_put;
 	}
 
-	nlh = NLMSG_PUT(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
-			cb->nlh->nlmsg_type, sizeof(*t));
-	t = NLMSG_DATA(nlh);
+	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
+			cb->nlh->nlmsg_type, sizeof(*t), 0);
+	if (!nlh)
+		goto out_module_put;
+	t = nlmsg_data(nlh);
 	t->tca_family = AF_UNSPEC;
 	t->tca__pad1 = 0;
 	t->tca__pad2 = 0;
 
 	nest = nla_nest_start(skb, TCA_ACT_TAB);
 	if (nest == NULL)
-		goto nla_put_failure;
+		goto out_module_put;
 
 	ret = a_o->walk(skb, cb, RTM_GETACTION, &a);
 	if (ret < 0)
-		goto nla_put_failure;
+		goto out_module_put;
 
 	if (ret > 0) {
 		nla_nest_end(skb, nest);
@@ -1110,8 +1114,7 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 	module_put(a_o->owner);
 	return skb->len;
 
-nla_put_failure:
-nlmsg_failure:
+out_module_put:
 	module_put(a_o->owner);
 	nlmsg_trim(skb, b);
 	return skb->len;
-- 
1.7.10.2

^ permalink raw reply related

* [PATCH 10/18] unix_diag: Move away from NLMSG_PUT().
From: David Miller @ 2012-06-27  5:02 UTC (permalink / raw)
  To: netdev


And use nlmsg_data() while we're here too and remove useless
casts.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/diag.c |   24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/net/unix/diag.c b/net/unix/diag.c
index 7e8a24b..977ca31 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -126,10 +126,12 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r
 	struct nlmsghdr *nlh;
 	struct unix_diag_msg *rep;
 
-	nlh = NLMSG_PUT(skb, pid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep));
+	nlh = nlmsg_put(skb, pid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep), 0);
+	if (!nlh)
+		goto out_nlmsg_trim;
 	nlh->nlmsg_flags = flags;
 
-	rep = NLMSG_DATA(nlh);
+	rep = nlmsg_data(nlh);
 
 	rep->udiag_family = AF_UNIX;
 	rep->udiag_type = sk->sk_type;
@@ -139,32 +141,32 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r
 
 	if ((req->udiag_show & UDIAG_SHOW_NAME) &&
 	    sk_diag_dump_name(sk, skb))
-		goto nlmsg_failure;
+		goto out_nlmsg_trim;
 
 	if ((req->udiag_show & UDIAG_SHOW_VFS) &&
 	    sk_diag_dump_vfs(sk, skb))
-		goto nlmsg_failure;
+		goto out_nlmsg_trim;
 
 	if ((req->udiag_show & UDIAG_SHOW_PEER) &&
 	    sk_diag_dump_peer(sk, skb))
-		goto nlmsg_failure;
+		goto out_nlmsg_trim;
 
 	if ((req->udiag_show & UDIAG_SHOW_ICONS) &&
 	    sk_diag_dump_icons(sk, skb))
-		goto nlmsg_failure;
+		goto out_nlmsg_trim;
 
 	if ((req->udiag_show & UDIAG_SHOW_RQLEN) &&
 	    sk_diag_show_rqlen(sk, skb))
-		goto nlmsg_failure;
+		goto out_nlmsg_trim;
 
 	if ((req->udiag_show & UDIAG_SHOW_MEMINFO) &&
 	    sock_diag_put_meminfo(sk, skb, UNIX_DIAG_MEMINFO))
-		goto nlmsg_failure;
+		goto out_nlmsg_trim;
 
 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 	return skb->len;
 
-nlmsg_failure:
+out_nlmsg_trim:
 	nlmsg_trim(skb, b);
 	return -EMSGSIZE;
 }
@@ -189,7 +191,7 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	struct unix_diag_req *req;
 	int num, s_num, slot, s_slot;
 
-	req = NLMSG_DATA(cb->nlh);
+	req = nlmsg_data(cb->nlh);
 
 	s_slot = cb->args[0];
 	num = s_num = cb->args[1];
@@ -309,7 +311,7 @@ static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
 		};
 		return netlink_dump_start(sock_diag_nlsk, skb, h, &c);
 	} else
-		return unix_diag_get_exact(skb, h, (struct unix_diag_req *)NLMSG_DATA(h));
+		return unix_diag_get_exact(skb, h, nlmsg_data(h));
 }
 
 static const struct sock_diag_handler unix_diag_handler = {
-- 
1.7.10.2

^ permalink raw reply related

* [PATCH 11/18] selinux: netlink: Move away from NLMSG_PUT().
From: David Miller @ 2012-06-27  5:02 UTC (permalink / raw)
  To: netdev


And use nlmsg_data() while we're here too.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 security/selinux/netlink.c |   11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/security/selinux/netlink.c b/security/selinux/netlink.c
index 161e01a..8a23a35 100644
--- a/security/selinux/netlink.c
+++ b/security/selinux/netlink.c
@@ -19,6 +19,7 @@
 #include <linux/netlink.h>
 #include <linux/selinux_netlink.h>
 #include <net/net_namespace.h>
+#include <net/netlink.h>
 
 #include "security.h"
 
@@ -47,7 +48,7 @@ static void selnl_add_payload(struct nlmsghdr *nlh, int len, int msgtype, void *
 {
 	switch (msgtype) {
 	case SELNL_MSG_SETENFORCE: {
-		struct selnl_msg_setenforce *msg = NLMSG_DATA(nlh);
+		struct selnl_msg_setenforce *msg = nlmsg_data(nlh);
 
 		memset(msg, 0, len);
 		msg->val = *((int *)data);
@@ -55,7 +56,7 @@ static void selnl_add_payload(struct nlmsghdr *nlh, int len, int msgtype, void *
 	}
 
 	case SELNL_MSG_POLICYLOAD: {
-		struct selnl_msg_policyload *msg = NLMSG_DATA(nlh);
+		struct selnl_msg_policyload *msg = nlmsg_data(nlh);
 
 		memset(msg, 0, len);
 		msg->seqno = *((u32 *)data);
@@ -81,7 +82,9 @@ static void selnl_notify(int msgtype, void *data)
 		goto oom;
 
 	tmp = skb->tail;
-	nlh = NLMSG_PUT(skb, 0, 0, msgtype, len);
+	nlh = nlmsg_put(skb, 0, 0, msgtype, len, 0);
+	if (!nlh)
+		goto out_kfree_skb;
 	selnl_add_payload(nlh, len, msgtype, data);
 	nlh->nlmsg_len = skb->tail - tmp;
 	NETLINK_CB(skb).dst_group = SELNLGRP_AVC;
@@ -89,7 +92,7 @@ static void selnl_notify(int msgtype, void *data)
 out:
 	return;
 
-nlmsg_failure:
+out_kfree_skb:
 	kfree_skb(skb);
 oom:
 	printk(KERN_ERR "SELinux:  OOM in %s\n", __func__);
-- 
1.7.10.2

^ permalink raw reply related

* [PATCH 12/18] infiniband: netlink: Move away from NLMSG_NEW().
From: David Miller @ 2012-06-27  5:02 UTC (permalink / raw)
  To: netdev


And use nlmsg_data() while we're here too.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/core/netlink.c |   10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
index e497dfb..1e691dc 100644
--- a/drivers/infiniband/core/netlink.c
+++ b/drivers/infiniband/core/netlink.c
@@ -108,12 +108,14 @@ void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq,
 	unsigned char *prev_tail;
 
 	prev_tail = skb_tail_pointer(skb);
-	*nlh = NLMSG_NEW(skb, 0, seq, RDMA_NL_GET_TYPE(client, op),
-			len, NLM_F_MULTI);
+	*nlh = nlmsg_put(skb, 0, seq, RDMA_NL_GET_TYPE(client, op),
+			 len, NLM_F_MULTI);
+	if (!*nlh)
+		goto out_nlmsg_trim;
 	(*nlh)->nlmsg_len = skb_tail_pointer(skb) - prev_tail;
-	return NLMSG_DATA(*nlh);
+	return nlmsg_data(*nlh);
 
-nlmsg_failure:
+out_nlmsg_trim:
 	nlmsg_trim(skb, prev_tail);
 	return NULL;
 }
-- 
1.7.10.2

^ permalink raw reply related

* [PATCH 13/18] audit: netlink: Move away from NLMSG_NEW().
From: David Miller @ 2012-06-27  5:02 UTC (permalink / raw)
  To: netdev


And use nlmsg_data() while we're here too.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/audit.c |   23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/kernel/audit.c b/kernel/audit.c
index 1c7f2c6..30b252a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -384,7 +384,7 @@ static void audit_hold_skb(struct sk_buff *skb)
 static void audit_printk_skb(struct sk_buff *skb)
 {
 	struct nlmsghdr *nlh = nlmsg_hdr(skb);
-	char *data = NLMSG_DATA(nlh);
+	char *data = nlmsg_data(nlh);
 
 	if (nlh->nlmsg_type != AUDIT_EOE) {
 		if (printk_ratelimit())
@@ -516,14 +516,15 @@ struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
 	if (!skb)
 		return NULL;
 
-	nlh	= NLMSG_NEW(skb, pid, seq, t, size, flags);
-	data	= NLMSG_DATA(nlh);
+	nlh	= nlmsg_put(skb, pid, seq, t, size, flags);
+	if (!nlh)
+		goto out_kfree_skb;
+	data = nlmsg_data(nlh);
 	memcpy(data, payload, size);
 	return skb;
 
-nlmsg_failure:			/* Used by NLMSG_NEW */
-	if (skb)
-		kfree_skb(skb);
+out_kfree_skb:
+	kfree_skb(skb);
 	return NULL;
 }
 
@@ -680,7 +681,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	sessionid = audit_get_sessionid(current);
 	security_task_getsecid(current, &sid);
 	seq  = nlh->nlmsg_seq;
-	data = NLMSG_DATA(nlh);
+	data = nlmsg_data(nlh);
 
 	switch (msg_type) {
 	case AUDIT_GET:
@@ -1060,13 +1061,15 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
 
 	ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
 	if (!ab->skb)
-		goto nlmsg_failure;
+		goto err;
 
-	nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0);
+	nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0);
+	if (!nlh)
+		goto out_kfree_skb;
 
 	return ab;
 
-nlmsg_failure:                  /* Used by NLMSG_NEW */
+out_kfree_skb:
 	kfree_skb(ab->skb);
 	ab->skb = NULL;
 err:
-- 
1.7.10.2

^ permalink raw reply related

* [PATCH 14/18] decnet: dn_route: Move away from NLMSG_NEW().
From: David Miller @ 2012-06-27  5:02 UTC (permalink / raw)
  To: netdev


And use nlmsg_data() while we're here too.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/decnet/dn_route.c |   12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 586302e..cd584f7 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1518,8 +1518,10 @@ static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
 	unsigned char *b = skb_tail_pointer(skb);
 	long expires;
 
-	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
-	r = NLMSG_DATA(nlh);
+	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
+	if (!nlh)
+		goto out_nlmsg_trim;
+	r = nlmsg_data(nlh);
 	r->rtm_family = AF_DECnet;
 	r->rtm_dst_len = 16;
 	r->rtm_src_len = 0;
@@ -1559,7 +1561,7 @@ static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 	return skb->len;
 
-nlmsg_failure:
+out_nlmsg_trim:
 rtattr_failure:
 	nlmsg_trim(skb, b);
 	return -1;
@@ -1572,7 +1574,7 @@ static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
 {
 	struct net *net = sock_net(in_skb->sk);
 	struct rtattr **rta = arg;
-	struct rtmsg *rtm = NLMSG_DATA(nlh);
+	struct rtmsg *rtm = nlmsg_data(nlh);
 	struct dn_route *rt = NULL;
 	struct dn_skb_cb *cb;
 	int err;
@@ -1669,7 +1671,7 @@ int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb)
 
 	if (NLMSG_PAYLOAD(cb->nlh, 0) < sizeof(struct rtmsg))
 		return -EINVAL;
-	if (!(((struct rtmsg *)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED))
+	if (!(((struct rtmsg *)nlmsg_data(cb->nlh))->rtm_flags&RTM_F_CLONED))
 		return 0;
 
 	s_h = cb->args[0];
-- 
1.7.10.2

^ permalink raw reply related

* [PATCH 15/18] decnet: dn_table: Move away from NLMSG_NEW().
From: David Miller @ 2012-06-27  5:02 UTC (permalink / raw)
  To: netdev


And use nlmsg_data() while we're here too.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/decnet/dn_table.c |   11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index 650f338..92ec741 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -299,8 +299,10 @@ static int dn_fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
 	struct nlmsghdr *nlh;
 	unsigned char *b = skb_tail_pointer(skb);
 
-	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*rtm), flags);
-	rtm = NLMSG_DATA(nlh);
+	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
+	if (!nlh)
+		goto out_nlmsg_trim;
+	rtm = nlmsg_data(nlh);
 	rtm->rtm_family = AF_DECnet;
 	rtm->rtm_dst_len = dst_len;
 	rtm->rtm_src_len = 0;
@@ -348,8 +350,7 @@ static int dn_fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 	return skb->len;
 
-
-nlmsg_failure:
+out_nlmsg_trim:
 rtattr_failure:
 	nlmsg_trim(skb, b);
 	return -EMSGSIZE;
@@ -476,7 +477,7 @@ int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb)
 		return 0;
 
 	if (NLMSG_PAYLOAD(cb->nlh, 0) >= sizeof(struct rtmsg) &&
-		((struct rtmsg *)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED)
+		((struct rtmsg *)nlmsg_data(cb->nlh))->rtm_flags&RTM_F_CLONED)
 			return dn_cache_dump(skb, cb);
 
 	s_h = cb->args[0];
-- 
1.7.10.2

^ permalink raw reply related

* [PATCH 16/18] pkt_sched: cls_api: Move away from NLMSG_NEW().
From: David Miller @ 2012-06-27  5:02 UTC (permalink / raw)
  To: netdev


And use nlmsg_data() while we're here too, as well as remove
a useless cast.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c |   12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index f452f69..6dd1131 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -140,7 +140,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 	int tp_created = 0;
 
 replay:
-	t = NLMSG_DATA(n);
+	t = nlmsg_data(n);
 	protocol = TC_H_MIN(t->tcm_info);
 	prio = TC_H_MAJ(t->tcm_info);
 	nprio = prio;
@@ -349,8 +349,10 @@ static int tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp,
 	struct nlmsghdr  *nlh;
 	unsigned char *b = skb_tail_pointer(skb);
 
-	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
-	tcm = NLMSG_DATA(nlh);
+	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*tcm), flags);
+	if (!nlh)
+		goto out_nlmsg_trim;
+	tcm = nlmsg_data(nlh);
 	tcm->tcm_family = AF_UNSPEC;
 	tcm->tcm__pad1 = 0;
 	tcm->tcm__pad2 = 0;
@@ -368,7 +370,7 @@ static int tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp,
 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 	return skb->len;
 
-nlmsg_failure:
+out_nlmsg_trim:
 nla_put_failure:
 	nlmsg_trim(skb, b);
 	return -1;
@@ -418,7 +420,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
 	struct net_device *dev;
 	struct Qdisc *q;
 	struct tcf_proto *tp, **chain;
-	struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
+	struct tcmsg *tcm = nlmsg_data(cb->nlh);
 	unsigned long cl = 0;
 	const struct Qdisc_class_ops *cops;
 	struct tcf_dump_args arg;
-- 
1.7.10.2

^ permalink raw reply related

* [PATCH 17/18] pkt_sched: sch_api: Move away from NLMSG_NEW().
From: David Miller @ 2012-06-27  5:02 UTC (permalink / raw)
  To: netdev


And use nlmsg_data() while we're here too, as well as remove
a useless cast.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_api.c |   24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 085ce53..a08b4ab 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -973,7 +973,7 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 {
 	struct net *net = sock_net(skb->sk);
-	struct tcmsg *tcm = NLMSG_DATA(n);
+	struct tcmsg *tcm = nlmsg_data(n);
 	struct nlattr *tca[TCA_MAX + 1];
 	struct net_device *dev;
 	u32 clid = tcm->tcm_parent;
@@ -1046,7 +1046,7 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 
 replay:
 	/* Reinit, just in case something touches this. */
-	tcm = NLMSG_DATA(n);
+	tcm = nlmsg_data(n);
 	clid = tcm->tcm_parent;
 	q = p = NULL;
 
@@ -1193,8 +1193,10 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 	struct gnet_dump d;
 	struct qdisc_size_table *stab;
 
-	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
-	tcm = NLMSG_DATA(nlh);
+	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*tcm), flags);
+	if (!nlh)
+		goto out_nlmsg_trim;
+	tcm = nlmsg_data(nlh);
 	tcm->tcm_family = AF_UNSPEC;
 	tcm->tcm__pad1 = 0;
 	tcm->tcm__pad2 = 0;
@@ -1230,7 +1232,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 	return skb->len;
 
-nlmsg_failure:
+out_nlmsg_trim:
 nla_put_failure:
 	nlmsg_trim(skb, b);
 	return -1;
@@ -1366,7 +1368,7 @@ done:
 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 {
 	struct net *net = sock_net(skb->sk);
-	struct tcmsg *tcm = NLMSG_DATA(n);
+	struct tcmsg *tcm = nlmsg_data(n);
 	struct nlattr *tca[TCA_MAX + 1];
 	struct net_device *dev;
 	struct Qdisc *q = NULL;
@@ -1498,8 +1500,10 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
 	struct gnet_dump d;
 	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
 
-	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
-	tcm = NLMSG_DATA(nlh);
+	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*tcm), flags);
+	if (!nlh)
+		goto out_nlmsg_trim;
+	tcm = nlmsg_data(nlh);
 	tcm->tcm_family = AF_UNSPEC;
 	tcm->tcm__pad1 = 0;
 	tcm->tcm__pad2 = 0;
@@ -1525,7 +1529,7 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 	return skb->len;
 
-nlmsg_failure:
+out_nlmsg_trim:
 nla_put_failure:
 	nlmsg_trim(skb, b);
 	return -1;
@@ -1616,7 +1620,7 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
 
 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
+	struct tcmsg *tcm = nlmsg_data(cb->nlh);
 	struct net *net = sock_net(skb->sk);
 	struct netdev_queue *dev_queue;
 	struct net_device *dev;
-- 
1.7.10.2

^ permalink raw reply related

* [PATCH 18/18] netlink: Delete NLMSG_PUT and NLMSG_NEW.
From: David Miller @ 2012-06-27  5:02 UTC (permalink / raw)
  To: netdev


No longer used and a poor interface as they were macros
with embedded gotos.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h |    8 --------
 1 file changed, 8 deletions(-)

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 0f628ff..ed33f09 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -241,14 +241,6 @@ struct netlink_notify {
 struct nlmsghdr *
 __nlmsg_put(struct sk_buff *skb, u32 pid, u32 seq, int type, int len, int flags);
 
-#define NLMSG_NEW(skb, pid, seq, type, len, flags) \
-({	if (unlikely(skb_tailroom(skb) < (int)NLMSG_SPACE(len))) \
-		goto nlmsg_failure; \
-	__nlmsg_put(skb, pid, seq, type, len, flags); })
-
-#define NLMSG_PUT(skb, pid, seq, type, len) \
-	NLMSG_NEW(skb, pid, seq, type, len, 0)
-
 struct netlink_dump_control {
 	int (*dump)(struct sk_buff *skb, struct netlink_callback *);
 	int (*done)(struct netlink_callback*);
-- 
1.7.10.2

^ permalink raw reply related

* Un...Freakin...believable!
From: John @ 2012-06-27  3:59 UTC (permalink / raw)
  To: netdev

I literally would not have believed this either.

In fact, I didn't.

My friend used the "loyalty" card to get
me to give it a try...

I'm glad I did because I've made more money faster
than I ever had.

In fact, I turned that $100.00 to more thousands in the last
4 months than I can even legally say (plus, you'd probably fall
off your freakin' chair).

But tomorrow, I'll just show you the proof.

I'm doing a Live Webcast with that same friend and
we're going to just show you the back office and the results
we've gotten putting in about 10 minutes a day.

http://bit.ly/MW5hPf

This is so simple, newbies with no business experience
or marketing "know-how" are still profiting their first week.

For many people it's the first time they've ever succeeded
with any home business or online marketing

http://bit.ly/MW5hPf

Register for the Live Broadcast happening on Wednesday.

Talk soon,
John

PS: You don't have to take my word for it, just watch as we
show you exactly what's happening and you see the proof:

http://bit.ly/MW5hPf

****This email is not guaranteeing any results for any person.
Our results do not guarantee your own. *****

^ permalink raw reply

* Re: [PATCH v2 -next 4/4] tg3: Add binary sysfs file to export bulk sensor data
From: David Miller @ 2012-06-27  5:04 UTC (permalink / raw)
  To: mchan; +Cc: netdev, nsujir
In-Reply-To: <1340773249.4344.74.camel@LTIRV-MCHAN1.corp.ad.broadcom.com>

From: "Michael Chan" <mchan@broadcom.com>
Date: Tue, 26 Jun 2012 22:00:49 -0700

> On Tue, 2012-06-26 at 21:02 -0700, David Miller wrote:
>> Ben stated merely that a binary attribute existed for sysfs files.  He
>> did not, however, say that this is the path down which you should
>> implement your feature.
> 
> He said that if the data was difficult to parse in the driver, then a
> binary sysfs or private ioctl (which we would stay away from) would be
> appropriate.  There are hundreds of bytes of this data, most of which is
> not useful to the user but needed for Lights out management.  It will
> greatly bloat the tg3 driver to add code to parse all that data and
> export each one separately.

I don't want us to get into the habit of just going "oh it's LOM
stuff, binary blob."  And that's the precedent you're setting here.

It also sets up a situation where functionality could end up only
being available in proprietary binary only tools.

It's not like this is some standardized format like a DMI table or
similar, is it?

^ permalink raw reply

* Re: [PATCH v2 -next 4/4] tg3: Add binary sysfs file to export bulk sensor data
From: Michael Chan @ 2012-06-27  5:12 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, nsujir
In-Reply-To: <20120626.220459.548918770401348569.davem@davemloft.net>

On Tue, 2012-06-26 at 22:04 -0700, David Miller wrote: 
> From: "Michael Chan" <mchan@broadcom.com>
> Date: Tue, 26 Jun 2012 22:00:49 -0700
> 
> > On Tue, 2012-06-26 at 21:02 -0700, David Miller wrote:
> >> Ben stated merely that a binary attribute existed for sysfs files.  He
> >> did not, however, say that this is the path down which you should
> >> implement your feature.
> > 
> > He said that if the data was difficult to parse in the driver, then a
> > binary sysfs or private ioctl (which we would stay away from) would be
> > appropriate.  There are hundreds of bytes of this data, most of which is
> > not useful to the user but needed for Lights out management.  It will
> > greatly bloat the tg3 driver to add code to parse all that data and
> > export each one separately.
> 
> I don't want us to get into the habit of just going "oh it's LOM
> stuff, binary blob."  And that's the precedent you're setting here.
> 
> It also sets up a situation where functionality could end up only
> being available in proprietary binary only tools.

I will strongly encourage the OEM to open-source the userspace tool.  In
fact, we'll be writing it for them and we can influence them.

> 
> It's not like this is some standardized format like a DMI table or
> similar, is it?
> 

It is defined by the OEM.  I don't know whether they plan to publish it
or not, but I will check tomorrow.

^ permalink raw reply

* Re: [net-next RFC V3 PATCH 4/6] tuntap: multiqueue support
From: Jason Wang @ 2012-06-27  5:16 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: habanero, netdev, linux-kernel, krkumar2, tahm, akong, davem,
	shemminger, mashirle
In-Reply-To: <20120626104250.GC13108@redhat.com>

On 06/26/2012 06:42 PM, Michael S. Tsirkin wrote:
> On Tue, Jun 26, 2012 at 11:42:17AM +0800, Jason Wang wrote:
>> On 06/25/2012 04:25 PM, Michael S. Tsirkin wrote:
>>> On Mon, Jun 25, 2012 at 02:10:18PM +0800, Jason Wang wrote:
>>>> This patch adds multiqueue support for tap device. This is done by abstracting
>>>> each queue as a file/socket and allowing multiple sockets to be attached to the
>>>> tuntap device (an array of tun_file were stored in the tun_struct). Userspace
>>>> could write and read from those files to do the parallel packet
>>>> sending/receiving.
>>>>
>>>> Unlike the previous single queue implementation, the socket and device were
>>>> loosely coupled, each of them were allowed to go away first. In order to let the
>>>> tx path lockless, netif_tx_lock_bh() is replaced by RCU/NETIF_F_LLTX to
>>>> synchronize between data path and system call.
>>> Don't use LLTX/RCU. It's not worth it.
>>> Use something like netif_set_real_num_tx_queues.
>>>
>>>> The tx queue selection is first based on the recorded rxq index of an skb; if
>>>> there's no such index, the queue is chosen based on rx hashing (skb_get_rxhash()).
>>>>
>>>> Signed-off-by: Jason Wang<jasowang@redhat.com>
>>> Interestingly macvtap switched to hashing first:
>>> ef0002b577b52941fb147128f30bd1ecfdd3ff6d
>>> (the commit log is corrupted but see what it
>>> does in the patch).
>>> Any idea why?
>> Yes, so tap should be changed to behave the same as macvtap. I remember
>> the reason we do that is to make sure the packets of a single flow are
>> queued to a fixed socket/virtqueue, as 10g cards like ixgbe
>> choose the rx queue for a flow based on the last tx queue that the
>> packets of that flow came from. So if we use the recorded rx queue in
>> macvtap, the queue index of a flow would change as the vhost thread
>> moves among processors.
> Hmm. OTOH if you override this, if TX is sent from VCPU0, RX might land
> on VCPU1 in the guest, which is not good, right?

Yes, but that is better than making the rx move between vcpus when we use
the recorded rx queue. Flow steering is needed to make sure the tx and rx
are on the same vcpu.
>> But while testing tun/tap, one interesting thing I found is that even though
>> ixgbe has recorded the queue index during rx, it seems to be lost when
>> tap tries to transmit skbs to userspace.
> dev_pick_tx does this I think but ndo_select_queue
> should be able to get it without trouble.
>
>
>>>> ---
>>>>   drivers/net/tun.c |  371 +++++++++++++++++++++++++++++++++--------------------
>>>>   1 files changed, 232 insertions(+), 139 deletions(-)
>>>>
>>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>>> index 8233b0a..5c26757 100644
>>>> --- a/drivers/net/tun.c
>>>> +++ b/drivers/net/tun.c
>>>> @@ -107,6 +107,8 @@ struct tap_filter {
>>>>   	unsigned char	addr[FLT_EXACT_COUNT][ETH_ALEN];
>>>>   };
>>>>
>>>> +#define MAX_TAP_QUEUES (NR_CPUS<   16 ? NR_CPUS : 16)
>>> Why the limit? I am guessing you copied this from macvtap?
>>> This is problematic for a number of reasons:
>>> 	- will not play well with migration
>>> 	- will not work well for a large guest
>>>
>>> Yes, macvtap needs to be fixed too.
>>>
>>> I am guessing what it is trying to prevent is queueing
>>> up a huge number of packets?
>>> So just divide the default tx queue limit by the # of queues.
>>>
>>> And by the way, for MQ applications maybe we can finally
>>> ignore tx queue altogether and limit the total number
>>> of bytes queued?
>>> To avoid regressions we can make it large like 64M/# queues.
>>> Could be a separate patch I think, and for a single queue
>>> might need a compatible mode though I am not sure.
>>>
>>>> +
>>>>   struct tun_file {
>>>>   	struct sock sk;
>>>>   	struct socket socket;
>>>> @@ -114,16 +116,18 @@ struct tun_file {
>>>>   	int vnet_hdr_sz;
>>>>   	struct tap_filter txflt;
>>>>   	atomic_t count;
>>>> -	struct tun_struct *tun;
>>>> +	struct tun_struct __rcu *tun;
>>>>   	struct net *net;
>>>>   	struct fasync_struct *fasync;
>>>>   	unsigned int flags;
>>>> +	u16 queue_index;
>>>>   };
>>>>
>>>>   struct tun_sock;
>>>>
>>>>   struct tun_struct {
>>>> -	struct tun_file		*tfile;
>>>> +	struct tun_file		*tfiles[MAX_TAP_QUEUES];
>>>> +	unsigned int            numqueues;
>>>>   	unsigned int 		flags;
>>>>   	uid_t			owner;
>>>>   	gid_t			group;
>>>> @@ -138,80 +142,159 @@ struct tun_struct {
>>>>   #endif
>>>>   };
>>>>
>>>> -static int tun_attach(struct tun_struct *tun, struct file *file)
>>>> +static DEFINE_SPINLOCK(tun_lock);
>>>> +
>>>> +/*
>>>> + * tun_get_queue(): calculate the queue index
>>>> + *     - if skbs comes from mq nics, we can just borrow
>>>> + *     - if not, calculate from the hash
>>>> + */
>>>> +static struct tun_file *tun_get_queue(struct net_device *dev,
>>>> +				      struct sk_buff *skb)
>>>>   {
>>>> -	struct tun_file *tfile = file->private_data;
>>>> -	int err;
>>>> +	struct tun_struct *tun = netdev_priv(dev);
>>>> +	struct tun_file *tfile = NULL;
>>>> +	int numqueues = tun->numqueues;
>>>> +	__u32 rxq;
>>>>
>>>> -	ASSERT_RTNL();
>>>> +	BUG_ON(!rcu_read_lock_held());
>>>>
>>>> -	netif_tx_lock_bh(tun->dev);
>>>> +	if (!numqueues)
>>>> +		goto out;
>>>>
>>>> -	err = -EINVAL;
>>>> -	if (tfile->tun)
>>>> +	if (numqueues == 1) {
>>>> +		tfile = rcu_dereference(tun->tfiles[0]);
>>> Instead of hacks like this, you can ask for an MQ
>>> flag to be set in SETIFF. Then you won't need to
>>> handle attach/detach at random times.
>>> And most of the scary num_queues checks can go away.
>>> You can then also ask userspace about the max # of queues
>>> to expect if you want to save some memory.
>>>
>>>
>>>>   		goto out;
>>>> +	}
>>>>
>>>> -	err = -EBUSY;
>>>> -	if (tun->tfile)
>>>> +	if (likely(skb_rx_queue_recorded(skb))) {
>>>> +		rxq = skb_get_rx_queue(skb);
>>>> +
>>>> +		while (unlikely(rxq>= numqueues))
>>>> +			rxq -= numqueues;
>>>> +
>>>> +		tfile = rcu_dereference(tun->tfiles[rxq]);
>>>>   		goto out;
>>>> +	}
>>>>
>>>> -	err = 0;
>>>> -	tfile->tun = tun;
>>>> -	tun->tfile = tfile;
>>>> -	netif_carrier_on(tun->dev);
>>>> -	dev_hold(tun->dev);
>>>> -	sock_hold(&tfile->sk);
>>>> -	atomic_inc(&tfile->count);
>>>> +	/* Check if we can use flow to select a queue */
>>>> +	rxq = skb_get_rxhash(skb);
>>>> +	if (rxq) {
>>>> +		u32 idx = ((u64)rxq * numqueues)>>   32;
>>> This completely confuses me. What's the logic here?
>>> How do we even know it's in range?
>>>
>>>> +		tfile = rcu_dereference(tun->tfiles[idx]);
>>>> +		goto out;
>>>> +	}
>>>>
>>>> +	tfile = rcu_dereference(tun->tfiles[0]);
>>>>   out:
>>>> -	netif_tx_unlock_bh(tun->dev);
>>>> -	return err;
>>>> +	return tfile;
>>>>   }
>>>>
>>>> -static void __tun_detach(struct tun_struct *tun)
>>>> +static int tun_detach(struct tun_file *tfile, bool clean)
>>>>   {
>>>> -	struct tun_file *tfile = tun->tfile;
>>>> -	/* Detach from net device */
>>>> -	netif_tx_lock_bh(tun->dev);
>>>> -	netif_carrier_off(tun->dev);
>>>> -	tun->tfile = NULL;
>>>> -	netif_tx_unlock_bh(tun->dev);
>>>> -
>>>> -	/* Drop read queue */
>>>> -	skb_queue_purge(&tfile->socket.sk->sk_receive_queue);
>>>> -
>>>> -	/* Drop the extra count on the net device */
>>>> -	dev_put(tun->dev);
>>>> -}
>>>> +	struct tun_struct *tun;
>>>> +	struct net_device *dev = NULL;
>>>> +	bool destroy = false;
>>>>
>>>> -static void tun_detach(struct tun_struct *tun)
>>>> -{
>>>> -	rtnl_lock();
>>>> -	__tun_detach(tun);
>>>> -	rtnl_unlock();
>>>> -}
>>>> +	spin_lock(&tun_lock);
>>>>
>>>> -static struct tun_struct *__tun_get(struct tun_file *tfile)
>>>> -{
>>>> -	struct tun_struct *tun = NULL;
>>>> +	tun = rcu_dereference_protected(tfile->tun,
>>>> +					lockdep_is_held(&tun_lock));
>>>> +	if (tun) {
>>>> +		u16 index = tfile->queue_index;
>>>> +		BUG_ON(index>= tun->numqueues);
>>>> +		dev = tun->dev;
>>>> +
>>>> +		rcu_assign_pointer(tun->tfiles[index],
>>>> +				   tun->tfiles[tun->numqueues - 1]);
>>>> +		tun->tfiles[index]->queue_index = index;
>>>> +		rcu_assign_pointer(tfile->tun, NULL);
>>>> +		--tun->numqueues;
>>>> +		sock_put(&tfile->sk);
>>>>
>>>> -	if (atomic_inc_not_zero(&tfile->count))
>>>> -		tun = tfile->tun;
>>>> +		if (tun->numqueues == 0&&   !(tun->flags&   TUN_PERSIST))
>>>> +			destroy = true;
>>> Please don't use flags like that. Use dedicated labels and goto there on error.
>>>
>>>
>>>> +	}
>>>>
>>>> -	return tun;
>>>> +	spin_unlock(&tun_lock);
>>>> +
>>>> +	synchronize_rcu();
>>>> +	if (clean)
>>>> +		sock_put(&tfile->sk);
>>>> +
>>>> +	if (destroy) {
>>>> +		rtnl_lock();
>>>> +		if (dev->reg_state == NETREG_REGISTERED)
>>>> +			unregister_netdevice(dev);
>>>> +		rtnl_unlock();
>>>> +	}
>>>> +
>>>> +	return 0;
>>>>   }
>>>>
>>>> -static struct tun_struct *tun_get(struct file *file)
>>>> +static void tun_detach_all(struct net_device *dev)
>>>>   {
>>>> -	return __tun_get(file->private_data);
>>>> +	struct tun_struct *tun = netdev_priv(dev);
>>>> +	struct tun_file *tfile, *tfile_list[MAX_TAP_QUEUES];
>>>> +	int i, j = 0;
>>>> +
>>>> +	spin_lock(&tun_lock);
>>>> +
>>>> +	for (i = 0; i<   MAX_TAP_QUEUES&&   tun->numqueues; i++) {
>>>> +		tfile = rcu_dereference_protected(tun->tfiles[i],
>>>> +						lockdep_is_held(&tun_lock));
>>>> +		BUG_ON(!tfile);
>>>> +		wake_up_all(&tfile->wq.wait);
>>>> +		tfile_list[j++] = tfile;
>>>> +		rcu_assign_pointer(tfile->tun, NULL);
>>>> +		--tun->numqueues;
>>>> +	}
>>>> +	BUG_ON(tun->numqueues != 0);
>>>> +	/* guarantee that any future tun_attach will fail */
>>>> +	tun->numqueues = MAX_TAP_QUEUES;
>>>> +	spin_unlock(&tun_lock);
>>>> +
>>>> +	synchronize_rcu();
>>>> +	for (--j; j>= 0; j--)
>>>> +		sock_put(&tfile_list[j]->sk);
>>>>   }
>>>>
>>>> -static void tun_put(struct tun_struct *tun)
>>>> +static int tun_attach(struct tun_struct *tun, struct file *file)
>>>>   {
>>>> -	struct tun_file *tfile = tun->tfile;
>>>> +	struct tun_file *tfile = file->private_data;
>>>> +	int err;
>>>> +
>>>> +	ASSERT_RTNL();
>>>> +
>>>> +	spin_lock(&tun_lock);
>>>>
>>>> -	if (atomic_dec_and_test(&tfile->count))
>>>> -		tun_detach(tfile->tun);
>>>> +	err = -EINVAL;
>>>> +	if (rcu_dereference_protected(tfile->tun, lockdep_is_held(&tun_lock)))
>>>> +		goto out;
>>>> +
>>>> +	err = -EBUSY;
>>>> +	if (!(tun->flags&   TUN_TAP_MQ)&&   tun->numqueues == 1)
>>>> +		goto out;
>>>> +
>>>> +	if (tun->numqueues == MAX_TAP_QUEUES)
>>>> +		goto out;
>>>> +
>>>> +	err = 0;
>>>> +	tfile->queue_index = tun->numqueues;
>>>> +	rcu_assign_pointer(tfile->tun, tun);
>>>> +	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
>>>> +	sock_hold(&tfile->sk);
>>>> +	tun->numqueues++;
>>>> +
>>>> +	if (tun->numqueues == 1)
>>>> +		netif_carrier_on(tun->dev);
>>>> +
>>>> +	/* device is allowed to go away first, so no need to hold extra
>>>> +	 * refcnt. */
>>>> +
>>>> +out:
>>>> +	spin_unlock(&tun_lock);
>>>> +	return err;
>>>>   }
>>>>
>>>>   /* TAP filtering */
>>>> @@ -331,16 +414,7 @@ static const struct ethtool_ops tun_ethtool_ops;
>>>>   /* Net device detach from fd. */
>>>>   static void tun_net_uninit(struct net_device *dev)
>>>>   {
>>>> -	struct tun_struct *tun = netdev_priv(dev);
>>>> -	struct tun_file *tfile = tun->tfile;
>>>> -
>>>> -	/* Inform the methods they need to stop using the dev.
>>>> -	 */
>>>> -	if (tfile) {
>>>> -		wake_up_all(&tfile->wq.wait);
>>>> -		if (atomic_dec_and_test(&tfile->count))
>>>> -			__tun_detach(tun);
>>>> -	}
>>>> +	tun_detach_all(dev);
>>>>   }
>>>>
>>>>   /* Net device open. */
>>>> @@ -360,10 +434,10 @@ static int tun_net_close(struct net_device *dev)
>>>>   /* Net device start xmit */
>>>>   static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>>>   {
>>>> -	struct tun_struct *tun = netdev_priv(dev);
>>>> -	struct tun_file *tfile = tun->tfile;
>>>> +	struct tun_file *tfile = NULL;
>>>>
>>>> -	tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
>>>> +	rcu_read_lock();
>>>> +	tfile = tun_get_queue(dev, skb);
>>>>
>>>>   	/* Drop packet if interface is not attached */
>>>>   	if (!tfile)
>>>> @@ -381,7 +455,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>>>
>>>>   	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
>>>>   	>= dev->tx_queue_len) {
>>>> -		if (!(tun->flags&   TUN_ONE_QUEUE)) {
>>>> +		if (!(tfile->flags&   TUN_ONE_QUEUE)&&
>>> Which patch moved flags from tun to tfile?
>>>
>>>> +		    !(tfile->flags&   TUN_TAP_MQ)) {
>>>>   			/* Normal queueing mode. */
>>>>   			/* Packet scheduler handles dropping of further packets. */
>>>>   			netif_stop_queue(dev);
>>>> @@ -390,7 +465,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>>>   			 * error is more appropriate. */
>>>>   			dev->stats.tx_fifo_errors++;
>>>>   		} else {
>>>> -			/* Single queue mode.
>>>> +			/* Single queue mode or multi queue mode.
>>>>   			 * Driver handles dropping of all packets itself. */
>>> Please don't do this. Stop the queue on overrun as appropriate.
>>> ONE_QUEUE is a legacy hack.
>>>
>>> BTW we really should stop queue before we start dropping packets,
>>> but that can be a separate patch.
>>>
>>>>   			goto drop;
>>>>   		}
>>>> @@ -408,9 +483,11 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>>>   		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
>>>>   	wake_up_interruptible_poll(&tfile->wq.wait, POLLIN |
>>>>   				   POLLRDNORM | POLLRDBAND);
>>>> +	rcu_read_unlock();
>>>>   	return NETDEV_TX_OK;
>>>>
>>>>   drop:
>>>> +	rcu_read_unlock();
>>>>   	dev->stats.tx_dropped++;
>>>>   	kfree_skb(skb);
>>>>   	return NETDEV_TX_OK;
>>>> @@ -527,16 +604,22 @@ static void tun_net_init(struct net_device *dev)
>>>>   static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
>>>>   {
>>>>   	struct tun_file *tfile = file->private_data;
>>>> -	struct tun_struct *tun = __tun_get(tfile);
>>>> +	struct tun_struct *tun = NULL;
>>>>   	struct sock *sk;
>>>>   	unsigned int mask = 0;
>>>>
>>>> -	if (!tun)
>>>> +	if (!tfile)
>>>>   		return POLLERR;
>>>>
>>>> -	sk = tfile->socket.sk;
>>>> +	rcu_read_lock();
>>>> +	tun = rcu_dereference(tfile->tun);
>>>> +	if (!tun) {
>>>> +		rcu_read_unlock();
>>>> +		return POLLERR;
>>>> +	}
>>>> +	rcu_read_unlock();
>>>>
>>>> -	tun_debug(KERN_INFO, tun, "tun_chr_poll\n");
>>>> +	sk =&tfile->sk;
>>>>
>>>>   	poll_wait(file,&tfile->wq.wait, wait);
>>>>
>>>> @@ -548,10 +631,12 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
>>>>   	     sock_writeable(sk)))
>>>>   		mask |= POLLOUT | POLLWRNORM;
>>>>
>>>> -	if (tun->dev->reg_state != NETREG_REGISTERED)
>>>> +	rcu_read_lock();
>>>> +	tun = rcu_dereference(tfile->tun);
>>>> +	if (!tun || tun->dev->reg_state != NETREG_REGISTERED)
>>>>   		mask = POLLERR;
>>>> +	rcu_read_unlock();
>>>>
>>>> -	tun_put(tun);
>>>>   	return mask;
>>>>   }
>>>>
>>>> @@ -708,9 +793,12 @@ static ssize_t tun_get_user(struct tun_file *tfile,
>>>>   		skb_shinfo(skb)->gso_segs = 0;
>>>>   	}
>>>>
>>>> -	tun = __tun_get(tfile);
>>>> -	if (!tun)
>>>> +	rcu_read_lock();
>>>> +	tun = rcu_dereference(tfile->tun);
>>>> +	if (!tun) {
>>>> +		rcu_read_unlock();
>>>>   		return -EBADFD;
>>>> +	}
>>>>
>>>>   	switch (tfile->flags&   TUN_TYPE_MASK) {
>>>>   	case TUN_TUN_DEV:
>>>> @@ -720,26 +808,30 @@ static ssize_t tun_get_user(struct tun_file *tfile,
>>>>   		skb->protocol = eth_type_trans(skb, tun->dev);
>>>>   		break;
>>>>   	}
>>>> -
>>>> -	netif_rx_ni(skb);
>>>>   	tun->dev->stats.rx_packets++;
>>>>   	tun->dev->stats.rx_bytes += len;
>>>> -	tun_put(tun);
>>>> +	rcu_read_unlock();
>>>> +
>>>> +	netif_rx_ni(skb);
>>>> +
>>>>   	return count;
>>>>
>>>>   err_free:
>>>>   	count = -EINVAL;
>>>>   	kfree_skb(skb);
>>>>   err:
>>>> -	tun = __tun_get(tfile);
>>>> -	if (!tun)
>>>> +	rcu_read_lock();
>>>> +	tun = rcu_dereference(tfile->tun);
>>>> +	if (!tun) {
>>>> +		rcu_read_unlock();
>>>>   		return -EBADFD;
>>>> +	}
>>>>
>>>>   	if (drop)
>>>>   		tun->dev->stats.rx_dropped++;
>>>>   	if (error)
>>>>   		tun->dev->stats.rx_frame_errors++;
>>>> -	tun_put(tun);
>>>> +	rcu_read_unlock();
>>>>   	return count;
>>>>   }
>>>>
>>>> @@ -833,12 +925,13 @@ static ssize_t tun_put_user(struct tun_file *tfile,
>>>>   	skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
>>>>   	total += skb->len;
>>>>
>>>> -	tun = __tun_get(tfile);
>>>> +	rcu_read_lock();
>>>> +	tun = rcu_dereference(tfile->tun);
>>>>   	if (tun) {
>>>>   		tun->dev->stats.tx_packets++;
>>>>   		tun->dev->stats.tx_bytes += len;
>>>> -		tun_put(tun);
>>>>   	}
>>>> +	rcu_read_unlock();
>>>>
>>>>   	return total;
>>>>   }
>>>> @@ -869,28 +962,31 @@ static ssize_t tun_do_read(struct tun_file *tfile,
>>>>   				break;
>>>>   			}
>>>>
>>>> -			tun = __tun_get(tfile);
>>>> +			rcu_read_lock();
>>>> +			tun = rcu_dereference(tfile->tun);
>>>>   			if (!tun) {
>>>> -				ret = -EIO;
>>>> +				ret = -EBADFD;
>>> BADFD is for when you get passed something like -1 fd.
>>> Here fd is OK, it's just in a bad state so you can not do IO.
>>>
>>>
>>>> +				rcu_read_unlock();
>>>>   				break;
>>>>   			}
>>>>   			if (tun->dev->reg_state != NETREG_REGISTERED) {
>>>>   				ret = -EIO;
>>>> -				tun_put(tun);
>>>> +				rcu_read_unlock();
>>>>   				break;
>>>>   			}
>>>> -			tun_put(tun);
>>>> +			rcu_read_unlock();
>>>>
>>>>   			/* Nothing to read, let's sleep */
>>>>   			schedule();
>>>>   			continue;
>>>>   		}
>>>>
>>>> -		tun = __tun_get(tfile);
>>>> +		rcu_read_lock();
>>>> +		tun = rcu_dereference(tfile->tun);
>>>>   		if (tun) {
>>>>   			netif_wake_queue(tun->dev);
>>>> -			tun_put(tun);
>>>>   		}
>>>> +		rcu_read_unlock();
>>>>
>>>>   		ret = tun_put_user(tfile, skb, iv, len);
>>>>   		kfree_skb(skb);
>>>> @@ -1038,6 +1134,9 @@ static int tun_flags(struct tun_struct *tun)
>>>>   	if (tun->flags&   TUN_VNET_HDR)
>>>>   		flags |= IFF_VNET_HDR;
>>>>
>>>> +	if (tun->flags&   TUN_TAP_MQ)
>>>> +		flags |= IFF_MULTI_QUEUE;
>>>> +
>>>>   	return flags;
>>>>   }
>>>>
>>>> @@ -1097,8 +1196,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>   		err = tun_attach(tun, file);
>>>>   		if (err<   0)
>>>>   			return err;
>>>> -	}
>>>> -	else {
>>>> +	} else {
>>>>   		char *name;
>>>>   		unsigned long flags = 0;
>>>>
>>>> @@ -1142,6 +1240,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>   		dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
>>>>   			TUN_USER_FEATURES;
>>>>   		dev->features = dev->hw_features;
>>>> +		if (ifr->ifr_flags&   IFF_MULTI_QUEUE)
>>>> +			dev->features |= NETIF_F_LLTX;
>>>>
>>>>   		err = register_netdevice(tun->dev);
>>>>   		if (err<   0)
>>>> @@ -1154,7 +1254,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>
>>>>   		err = tun_attach(tun, file);
>>>>   		if (err<   0)
>>>> -			goto failed;
>>>> +			goto err_free_dev;
>>>>   	}
>>>>
>>>>   	tun_debug(KERN_INFO, tun, "tun_set_iff\n");
>>>> @@ -1174,6 +1274,11 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>   	else
>>>>   		tun->flags&= ~TUN_VNET_HDR;
>>>>
>>>> +	if (ifr->ifr_flags&   IFF_MULTI_QUEUE)
>>>> +		tun->flags |= TUN_TAP_MQ;
>>>> +	else
>>>> +		tun->flags&= ~TUN_TAP_MQ;
>>>> +
>>>>   	/* Cache flags from tun device */
>>>>   	tfile->flags = tun->flags;
>>>>   	/* Make sure persistent devices do not get stuck in
>>>> @@ -1187,7 +1292,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>
>>>>   err_free_dev:
>>>>   	free_netdev(dev);
>>>> -failed:
>>>>   	return err;
>>>>   }
>>>>
>>>> @@ -1264,38 +1368,40 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>>>   				(unsigned int __user*)argp);
>>>>   	}
>>>>
>>>> -	rtnl_lock();
>>>> -
>>>> -	tun = __tun_get(tfile);
>>>> -	if (cmd == TUNSETIFF&&   !tun) {
>>>> +	ret = 0;
>>>> +	if (cmd == TUNSETIFF) {
>>>> +		rtnl_lock();
>>>>   		ifr.ifr_name[IFNAMSIZ-1] = '\0';
>>>> -
>>>>   		ret = tun_set_iff(tfile->net, file,&ifr);
>>>> -
>>>> +		rtnl_unlock();
>>>>   		if (ret)
>>>> -			goto unlock;
>>>> -
>>>> +			return ret;
>>>>   		if (copy_to_user(argp,&ifr, ifreq_len))
>>>> -			ret = -EFAULT;
>>>> -		goto unlock;
>>>> +			return -EFAULT;
>>>> +		return ret;
>>>>   	}
>>>>
>>>> +	rtnl_lock();
>>>> +
>>>> +	rcu_read_lock();
>>>> +
>>>>   	ret = -EBADFD;
>>>> +	tun = rcu_dereference(tfile->tun);
>>>>   	if (!tun)
>>>>   		goto unlock;
>>>> +	else
>>>> +		ret = 0;
>>>>
>>>> -	tun_debug(KERN_INFO, tun, "tun_chr_ioctl cmd %d\n", cmd);
>>>> -
>>>> -	ret = 0;
>>>>   	switch (cmd) {
>>>>   	case TUNGETIFF:
>>>>   		ret = tun_get_iff(current->nsproxy->net_ns, tun,&ifr);
>>>> +		rcu_read_unlock();
>>>>   		if (ret)
>>>> -			break;
>>>> +			goto out;
>>>>
>>>>   		if (copy_to_user(argp,&ifr, ifreq_len))
>>>>   			ret = -EFAULT;
>>>> -		break;
>>>> +		goto out;
>>>>
>>>>   	case TUNSETNOCSUM:
>>>>   		/* Disable/Enable checksum */
>>>> @@ -1357,9 +1463,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>>>   		/* Get hw address */
>>>>   		memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
>>>>   		ifr.ifr_hwaddr.sa_family = tun->dev->type;
>>>> +		rcu_read_unlock();
>>>>   		if (copy_to_user(argp,&ifr, ifreq_len))
>>>>   			ret = -EFAULT;
>>>> -		break;
>>>> +		goto out;
>>>>
>>>>   	case SIOCSIFHWADDR:
>>>>   		/* Set hw address */
>>>> @@ -1375,9 +1482,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>>>   	}
>>>>
>>>>   unlock:
>>>> +	rcu_read_unlock();
>>>> +out:
>>>>   	rtnl_unlock();
>>>> -	if (tun)
>>>> -		tun_put(tun);
>>>>   	return ret;
>>>>   }
>>>>
>>>> @@ -1517,6 +1624,11 @@ out:
>>>>   	return ret;
>>>>   }
>>>>
>>>> +static void tun_sock_destruct(struct sock *sk)
>>>> +{
>>>> +	skb_queue_purge(&sk->sk_receive_queue);
>>>> +}
>>>> +
>>>>   static int tun_chr_open(struct inode *inode, struct file * file)
>>>>   {
>>>>   	struct net *net = current->nsproxy->net_ns;
>>>> @@ -1540,6 +1652,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
>>>>   	sock_init_data(&tfile->socket,&tfile->sk);
>>>>
>>>>   	tfile->sk.sk_write_space = tun_sock_write_space;
>>>> +	tfile->sk.sk_destruct = tun_sock_destruct;
>>>>   	tfile->sk.sk_sndbuf = INT_MAX;
>>>>   	file->private_data = tfile;
>>>>
>>>> @@ -1549,31 +1662,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
>>>>   static int tun_chr_close(struct inode *inode, struct file *file)
>>>>   {
>>>>   	struct tun_file *tfile = file->private_data;
>>>> -	struct tun_struct *tun;
>>>> -
>>>> -	tun = __tun_get(tfile);
>>>> -	if (tun) {
>>>> -		struct net_device *dev = tun->dev;
>>>> -
>>>> -		tun_debug(KERN_INFO, tun, "tun_chr_close\n");
>>>> -
>>>> -		__tun_detach(tun);
>>>> -
>>>> -		/* If desirable, unregister the netdevice. */
>>>> -		if (!(tun->flags&   TUN_PERSIST)) {
>>>> -			rtnl_lock();
>>>> -			if (dev->reg_state == NETREG_REGISTERED)
>>>> -				unregister_netdevice(dev);
>>>> -			rtnl_unlock();
>>>> -		}
>>>>
>>>> -		/* drop the reference that netdevice holds */
>>>> -		sock_put(&tfile->sk);
>>>> -
>>>> -	}
>>>> -
>>>> -	/* drop the reference that file holds */
>>>> -	sock_put(&tfile->sk);
>>>> +	tun_detach(tfile, true);
>>>>
>>>>   	return 0;
>>>>   }
>>>> @@ -1700,14 +1790,17 @@ static void tun_cleanup(void)
>>>>    * holding a reference to the file for as long as the socket is in use. */
>>>>   struct socket *tun_get_socket(struct file *file)
>>>>   {
>>>> -	struct tun_struct *tun;
>>>> +	struct tun_struct *tun = NULL;
>>>>   	struct tun_file *tfile = file->private_data;
>>>>   	if (file->f_op !=&tun_fops)
>>>>   		return ERR_PTR(-EINVAL);
>>>> -	tun = tun_get(file);
>>>> -	if (!tun)
>>>> +	rcu_read_lock();
>>>> +	tun = rcu_dereference(tfile->tun);
>>>> +	if (!tun) {
>>>> +		rcu_read_unlock();
>>>>   		return ERR_PTR(-EBADFD);
>>>> -	tun_put(tun);
>>>> +	}
>>>> +	rcu_read_unlock();
>>>>   	return&tfile->socket;
>>>>   }
>>>>   EXPORT_SYMBOL_GPL(tun_get_socket);
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply

* Re: [PATCH 7/18] netfilter: nfnetlink_log: Move away from NLMSG_PUT().
From: Joe Perches @ 2012-06-27  5:23 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20120626.220223.1090653207727010874.davem@davemloft.net>

On Tue, 2012-06-26 at 22:02 -0700, David Miller wrote:
> And use nlmsg_data() while we're here too.
[]
> diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
[]
> @@ -326,18 +326,20 @@ __nfulnl_send(struct nfulnl_instance *inst)
>  {
>  	int status = -1;
>  
> -	if (inst->qlen > 1)
> -		NLMSG_PUT(inst->skb, 0, 0,
> -			  NLMSG_DONE,
> -			  sizeof(struct nfgenmsg));
> -
> +	if (inst->qlen > 1) {
> +		struct nlmsghdr *nlh = nlmsg_put(inst->skb, 0, 0,
> +						 NLMSG_DONE,
> +						 sizeof(struct nfgenmsg),
> +						 0);
> +		if (!nlh)
> +			goto out;
> +	}

Because nlh isn't used for anything other than a test,
perhaps this is more readable as:

	if (inst->qlen > 1 &&
	    !nlmsg_put(inst->skb, 0, 0, NLMSG_DONE,
		       sizeof(struct nfgenmsg), 0))
		goto out;

^ permalink raw reply

* Re: [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
From: Hans Schillstrom @ 2012-06-27  5:23 UTC (permalink / raw)
  To: Eric Dumazet, David Miller
  Cc: subramanian.vijay@gmail.com, dave.taht@gmail.com,
	netdev@vger.kernel.org, ncardwell@google.com, therbert@google.com,
	brouer@redhat.com
In-Reply-To: <1340730156.10893.359.camel@edumazet-glaptop>

On Tuesday 26 June 2012 19:02:36 Eric Dumazet wrote:
> On Tue, 2012-06-26 at 07:34 +0200, Hans Schillstrom wrote:
> 
> > This patch didn't give much in gain actually.
> 
> With a 100Mbps link it does.
 
I was testing with a patched igb driver with TCP SYN IRQs on one core only.
There was some fault in the previous setup (RPS was also involved), because now it gives a boost of ~15%.

> With a 1Gbps link we are cpu bounded for sure.

True.

> 
> > The big cycle consumer during a syn attack is SHA sum right now, 
> > so from that perspective it's better to add aes crypto (by using AES-NI) 
> > to the syn cookies instead of SHA sum. Even if only newer x86_64 can use it.
> 
> My dev machine is able to process ~280.000 SYN (and synack) per second
> (tg3, mono queue), and sha_transform() takes ~10 % of the time according
> to perf.

My test machine is not that fast :-(
I have only 170.000 syn/synack per sec., and sha_transform() takes ~9.6%;
I have seen peaks of 16% (during 10 sec samples).

> 
> With David patch using jhash instead of SHA, I reach ~315.000 SYN per
> second.

I have similar results from ~170k to ~199k synack/sec.

BTW, 
cookie_hash() did not show up in the perf results, (< 0.08%)

^ permalink raw reply

* Re: [net-next RFC V3 PATCH 4/6] tuntap: multiqueue support
From: Jason Wang @ 2012-06-27  5:59 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: habanero, netdev, linux-kernel, krkumar2, tahm, akong, davem,
	shemminger, mashirle, Eric Dumazet
In-Reply-To: <20120626115420.GD13108@redhat.com>

On 06/26/2012 07:54 PM, Michael S. Tsirkin wrote:
> On Tue, Jun 26, 2012 at 01:52:57PM +0800, Jason Wang wrote:
>> On 06/25/2012 04:25 PM, Michael S. Tsirkin wrote:
>>> On Mon, Jun 25, 2012 at 02:10:18PM +0800, Jason Wang wrote:
>>>> This patch adds multiqueue support for tap device. This is done by abstracting
>>>> each queue as a file/socket and allowing multiple sockets to be attached to the
>>>> tuntap device (an array of tun_file were stored in the tun_struct). Userspace
>>>> could write and read from those files to do the parallel packet
>>>> sending/receiving.
>>>>
>>>> Unlike the previous single queue implementation, the socket and device were
>>>> loosely coupled, each of them were allowed to go away first. In order to let the
>>>> tx path lockless, netif_tx_lock_bh() is replaced by RCU/NETIF_F_LLTX to
>>>> synchronize between data path and system call.
>>> Don't use LLTX/RCU. It's not worth it.
>>> Use something like netif_set_real_num_tx_queues.
>>>
>> For LLTX, maybe it's better to convert it to alloc_netdev_mq() to
>> let the kernel see all queues and make the queue stopping and
>> per-queue stats easier.
>> RCU is used to handle the attaching/detaching when tun/tap is
>> sending and receiving packets which looks reasonable for me.
> Yes but do we have to allow this? How about we always ask
> userspace to attach to all active queues?

Attaching/detaching is a method to activate/deactivate a queue; if all 
queues were kept attached, then we would need another method or flag to mark the 
queue as activated/deactivated and would still need to synchronize with the data path.
>> Not
>> sure netif_set_real_num_tx_queues() can help in this situation.
> Check it out.
>
>>>> The tx queue selecting is first based on the recorded rxq index of an skb, if
>>>> there's no such one, then choosing based on rx hashing (skb_get_rxhash()).
>>>>
>>>> Signed-off-by: Jason Wang<jasowang@redhat.com>
>>> Interestingly macvtap switched to hashing first:
>>> ef0002b577b52941fb147128f30bd1ecfdd3ff6d
>>> (the commit log is corrupted but see what it
>>> does in the patch).
>>> Any idea why?
>>>
>>>> ---
>>>>   drivers/net/tun.c |  371 +++++++++++++++++++++++++++++++++--------------------
>>>>   1 files changed, 232 insertions(+), 139 deletions(-)
>>>>
>>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>>> index 8233b0a..5c26757 100644
>>>> --- a/drivers/net/tun.c
>>>> +++ b/drivers/net/tun.c
>>>> @@ -107,6 +107,8 @@ struct tap_filter {
>>>>   	unsigned char	addr[FLT_EXACT_COUNT][ETH_ALEN];
>>>>   };
>>>>
>>>> +#define MAX_TAP_QUEUES (NR_CPUS<   16 ? NR_CPUS : 16)
>>> Why the limit? I am guessing you copied this from macvtap?
>>> This is problematic for a number of reasons:
>>> 	- will not play well with migration
>>> 	- will not work well for a large guest
>>>
>>> Yes, macvtap needs to be fixed too.
>>>
>>> I am guessing what it is trying to prevent is queueing
>>> up a huge number of packets?
>>> So just divide the default tx queue limit by the # of queues.
>> Not sure,
>> another reasons I can guess:
>> - to prevent storing a large array of pointers in tun_struct or macvlan_dev.
> OK so with the limit of e.g. 1024 we'd allocate at most
> 2 pages of memory. This doesn't look too bad. 1024 is probably a
> high enough limit: modern hypervisors seem to support on the order
> of 100-200 CPUs so this leaves us some breathing space
> if we want to match a queue per guest CPU.
> Of course we need to limit the packets per queue
> in such a setup more aggressively. 1000 packets * 1000 queues
> * 64K per packet is too much.
>
>> - it may not be suitable to allow the number of virtqueues greater
>> than the number of physical queues in the card
> Maybe for macvtap, here we have no idea which card we
> are working with and how many queues it has.
>
>>> And by the way, for MQ applications maybe we can finally
>>> ignore tx queue altogether and limit the total number
>>> of bytes queued?
>>> To avoid regressions we can make it large like 64M/# queues.
>>> Could be a separate patch I think, and for a single queue
>>> might need a compatible mode though I am not sure.
>> Could you explain more about this?
>> Did you mean to have a total
>> sndbuf for all sockets that attached to tun/tap?
> Consider that we currently limit the # of
> packets queued at tun for xmit to userspace.
> Some limit is needed but # of packets sounds
> very silly - limiting the total memory
> might be more reasonable.
>
> In case of multiqueue, we really care about
> total # of packets or total memory, but a simple
> approximation could be to divide the allocation
> between active queues equally.

A possible method is to divide the TUN_READQ_SIZE by #queues, but make it 
at least equal to the vring size (256).
>
> qdisc also queues some packets, that logic is
> using # of packets anyway. So either make that
> 1000/# queues, or even set to 0 as Eric once
> suggested.
>
>>>> +
>>>>   struct tun_file {
>>>>   	struct sock sk;
>>>>   	struct socket socket;
>>>> @@ -114,16 +116,18 @@ struct tun_file {
>>>>   	int vnet_hdr_sz;
>>>>   	struct tap_filter txflt;
>>>>   	atomic_t count;
>>>> -	struct tun_struct *tun;
>>>> +	struct tun_struct __rcu *tun;
>>>>   	struct net *net;
>>>>   	struct fasync_struct *fasync;
>>>>   	unsigned int flags;
>>>> +	u16 queue_index;
>>>>   };
>>>>
>>>>   struct tun_sock;
>>>>
>>>>   struct tun_struct {
>>>> -	struct tun_file		*tfile;
>>>> +	struct tun_file		*tfiles[MAX_TAP_QUEUES];
>>>> +	unsigned int            numqueues;
>>>>   	unsigned int 		flags;
>>>>   	uid_t			owner;
>>>>   	gid_t			group;
>>>> @@ -138,80 +142,159 @@ struct tun_struct {
>>>>   #endif
>>>>   };
>>>>
>>>> -static int tun_attach(struct tun_struct *tun, struct file *file)
>>>> +static DEFINE_SPINLOCK(tun_lock);
>>>> +
>>>> +/*
>>>> + * tun_get_queue(): calculate the queue index
>>>> + *     - if skbs comes from mq nics, we can just borrow
>>>> + *     - if not, calculate from the hash
>>>> + */
>>>> +static struct tun_file *tun_get_queue(struct net_device *dev,
>>>> +				      struct sk_buff *skb)
>>>>   {
>>>> -	struct tun_file *tfile = file->private_data;
>>>> -	int err;
>>>> +	struct tun_struct *tun = netdev_priv(dev);
>>>> +	struct tun_file *tfile = NULL;
>>>> +	int numqueues = tun->numqueues;
>>>> +	__u32 rxq;
>>>>
>>>> -	ASSERT_RTNL();
>>>> +	BUG_ON(!rcu_read_lock_held());
>>>>
>>>> -	netif_tx_lock_bh(tun->dev);
>>>> +	if (!numqueues)
>>>> +		goto out;
>>>>
>>>> -	err = -EINVAL;
>>>> -	if (tfile->tun)
>>>> +	if (numqueues == 1) {
>>>> +		tfile = rcu_dereference(tun->tfiles[0]);
>>> Instead of hacks like this, you can ask for an MQ
>>> flag to be set in SETIFF. Then you won't need to
>>> handle attach/detach at random times.
>> Consider a user switching from an sq guest to an mq guest: qemu would
>> attach or detach the fd, which could not be expected in the kernel.
> Can't userspace keep it attached always, just deactivate MQ?
>
>>> And most of the scary num_queues checks can go away.
>> Even we has a MQ flag, userspace could still just attach one queue
>> to the device.
> I think we allow too much flexibility if we let
> userspace detach a random queue.

The point is to let tun/tap have the same flexibility as macvtap. Macvtap 
allows adding/deleting queues at any time and it's very easy to add 
detach/attach to macvtap. So we can easily use almost the same ioctls to 
activate/deactivate a queue at any time for both tap and macvtap.
> Maybe only allow attaching/detaching with MQ off?
> If userspace wants to attach/detach, clear MQ first?

Maybe I didn't understand the point here, but I don't see any advantages, only 
more calls to ioctl().
> Alternatively, attach/detach all queues in one ioctl?

Yes, it can be same one.
>
>>> You can then also ask userspace about the max # of queues
>>> to expect if you want to save some memory.
>>>
>> Yes, good suggestion.
>>>>   		goto out;
>>>> +	}
>>>>
>>>> -	err = -EBUSY;
>>>> -	if (tun->tfile)
>>>> +	if (likely(skb_rx_queue_recorded(skb))) {
>>>> +		rxq = skb_get_rx_queue(skb);
>>>> +
>>>> +		while (unlikely(rxq>= numqueues))
>>>> +			rxq -= numqueues;
>>>> +
>>>> +		tfile = rcu_dereference(tun->tfiles[rxq]);
>>>>   		goto out;
>>>> +	}
>>>>
>>>> -	err = 0;
>>>> -	tfile->tun = tun;
>>>> -	tun->tfile = tfile;
>>>> -	netif_carrier_on(tun->dev);
>>>> -	dev_hold(tun->dev);
>>>> -	sock_hold(&tfile->sk);
>>>> -	atomic_inc(&tfile->count);
>>>> +	/* Check if we can use flow to select a queue */
>>>> +	rxq = skb_get_rxhash(skb);
>>>> +	if (rxq) {
>>>> +		u32 idx = ((u64)rxq * numqueues)>>   32;
>>> This completely confuses me. What's the logic here?
>>> How do we even know it's in range?
>>>
>> rxq is a u32, so the result should be less than numqueues.
> Aha. So the point is to use multiply+shift instead of %?
> Please add a comment.
>

Yes sure.
>>>> +		tfile = rcu_dereference(tun->tfiles[idx]);
>>>> +		goto out;
>>>> +	}
>>>>
>>>> +	tfile = rcu_dereference(tun->tfiles[0]);
>>>>   out:
>>>> -	netif_tx_unlock_bh(tun->dev);
>>>> -	return err;
>>>> +	return tfile;
>>>>   }
>>>>
>>>> -static void __tun_detach(struct tun_struct *tun)
>>>> +static int tun_detach(struct tun_file *tfile, bool clean)
>>>>   {
>>>> -	struct tun_file *tfile = tun->tfile;
>>>> -	/* Detach from net device */
>>>> -	netif_tx_lock_bh(tun->dev);
>>>> -	netif_carrier_off(tun->dev);
>>>> -	tun->tfile = NULL;
>>>> -	netif_tx_unlock_bh(tun->dev);
>>>> -
>>>> -	/* Drop read queue */
>>>> -	skb_queue_purge(&tfile->socket.sk->sk_receive_queue);
>>>> -
>>>> -	/* Drop the extra count on the net device */
>>>> -	dev_put(tun->dev);
>>>> -}
>>>> +	struct tun_struct *tun;
>>>> +	struct net_device *dev = NULL;
>>>> +	bool destroy = false;
>>>>
>>>> -static void tun_detach(struct tun_struct *tun)
>>>> -{
>>>> -	rtnl_lock();
>>>> -	__tun_detach(tun);
>>>> -	rtnl_unlock();
>>>> -}
>>>> +	spin_lock(&tun_lock);
>>>>
>>>> -static struct tun_struct *__tun_get(struct tun_file *tfile)
>>>> -{
>>>> -	struct tun_struct *tun = NULL;
>>>> +	tun = rcu_dereference_protected(tfile->tun,
>>>> +					lockdep_is_held(&tun_lock));
>>>> +	if (tun) {
>>>> +		u16 index = tfile->queue_index;
>>>> +		BUG_ON(index>= tun->numqueues);
>>>> +		dev = tun->dev;
>>>> +
>>>> +		rcu_assign_pointer(tun->tfiles[index],
>>>> +				   tun->tfiles[tun->numqueues - 1]);
>>>> +		tun->tfiles[index]->queue_index = index;
>>>> +		rcu_assign_pointer(tfile->tun, NULL);
>>>> +		--tun->numqueues;
>>>> +		sock_put(&tfile->sk);
>>>>
>>>> -	if (atomic_inc_not_zero(&tfile->count))
>>>> -		tun = tfile->tun;
>>>> +		if (tun->numqueues == 0&&   !(tun->flags&   TUN_PERSIST))
>>>> +			destroy = true;
>>> Please don't use flags like that. Use dedicated labels and goto there on error.
>> ok.
>>>> +	}
>>>>
>>>> -	return tun;
>>>> +	spin_unlock(&tun_lock);
>>>> +
>>>> +	synchronize_rcu();
>>>> +	if (clean)
>>>> +		sock_put(&tfile->sk);
>>>> +
>>>> +	if (destroy) {
>>>> +		rtnl_lock();
>>>> +		if (dev->reg_state == NETREG_REGISTERED)
>>>> +			unregister_netdevice(dev);
>>>> +		rtnl_unlock();
>>>> +	}
>>>> +
>>>> +	return 0;
>>>>   }
>>>>
>>>> -static struct tun_struct *tun_get(struct file *file)
>>>> +static void tun_detach_all(struct net_device *dev)
>>>>   {
>>>> -	return __tun_get(file->private_data);
>>>> +	struct tun_struct *tun = netdev_priv(dev);
>>>> +	struct tun_file *tfile, *tfile_list[MAX_TAP_QUEUES];
>>>> +	int i, j = 0;
>>>> +
>>>> +	spin_lock(&tun_lock);
>>>> +
>>>> +	for (i = 0; i<   MAX_TAP_QUEUES&&   tun->numqueues; i++) {
>>>> +		tfile = rcu_dereference_protected(tun->tfiles[i],
>>>> +						lockdep_is_held(&tun_lock));
>>>> +		BUG_ON(!tfile);
>>>> +		wake_up_all(&tfile->wq.wait);
>>>> +		tfile_list[j++] = tfile;
>>>> +		rcu_assign_pointer(tfile->tun, NULL);
>>>> +		--tun->numqueues;
>>>> +	}
>>>> +	BUG_ON(tun->numqueues != 0);
>>>> +	/* guarantee that any future tun_attach will fail */
>>>> +	tun->numqueues = MAX_TAP_QUEUES;
>>>> +	spin_unlock(&tun_lock);
>>>> +
>>>> +	synchronize_rcu();
>>>> +	for (--j; j>= 0; j--)
>>>> +		sock_put(&tfile_list[j]->sk);
>>>>   }
>>>>
>>>> -static void tun_put(struct tun_struct *tun)
>>>> +static int tun_attach(struct tun_struct *tun, struct file *file)
>>>>   {
>>>> -	struct tun_file *tfile = tun->tfile;
>>>> +	struct tun_file *tfile = file->private_data;
>>>> +	int err;
>>>> +
>>>> +	ASSERT_RTNL();
>>>> +
>>>> +	spin_lock(&tun_lock);
>>>>
>>>> -	if (atomic_dec_and_test(&tfile->count))
>>>> -		tun_detach(tfile->tun);
>>>> +	err = -EINVAL;
>>>> +	if (rcu_dereference_protected(tfile->tun, lockdep_is_held(&tun_lock)))
>>>> +		goto out;
>>>> +
>>>> +	err = -EBUSY;
>>>> +	if (!(tun->flags&   TUN_TAP_MQ)&&   tun->numqueues == 1)
>>>> +		goto out;
>>>> +
>>>> +	if (tun->numqueues == MAX_TAP_QUEUES)
>>>> +		goto out;
>>>> +
>>>> +	err = 0;
>>>> +	tfile->queue_index = tun->numqueues;
>>>> +	rcu_assign_pointer(tfile->tun, tun);
>>>> +	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
>>>> +	sock_hold(&tfile->sk);
>>>> +	tun->numqueues++;
>>>> +
>>>> +	if (tun->numqueues == 1)
>>>> +		netif_carrier_on(tun->dev);
>>>> +
>>>> +	/* device is allowed to go away first, so no need to hold extra
>>>> +	 * refcnt. */
>>>> +
>>>> +out:
>>>> +	spin_unlock(&tun_lock);
>>>> +	return err;
>>>>   }
>>>>
>>>>   /* TAP filtering */
>>>> @@ -331,16 +414,7 @@ static const struct ethtool_ops tun_ethtool_ops;
>>>>   /* Net device detach from fd. */
>>>>   static void tun_net_uninit(struct net_device *dev)
>>>>   {
>>>> -	struct tun_struct *tun = netdev_priv(dev);
>>>> -	struct tun_file *tfile = tun->tfile;
>>>> -
>>>> -	/* Inform the methods they need to stop using the dev.
>>>> -	 */
>>>> -	if (tfile) {
>>>> -		wake_up_all(&tfile->wq.wait);
>>>> -		if (atomic_dec_and_test(&tfile->count))
>>>> -			__tun_detach(tun);
>>>> -	}
>>>> +	tun_detach_all(dev);
>>>>   }
>>>>
>>>>   /* Net device open. */
>>>> @@ -360,10 +434,10 @@ static int tun_net_close(struct net_device *dev)
>>>>   /* Net device start xmit */
>>>>   static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>>>   {
>>>> -	struct tun_struct *tun = netdev_priv(dev);
>>>> -	struct tun_file *tfile = tun->tfile;
>>>> +	struct tun_file *tfile = NULL;
>>>>
>>>> -	tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
>>>> +	rcu_read_lock();
>>>> +	tfile = tun_get_queue(dev, skb);
>>>>
>>>>   	/* Drop packet if interface is not attached */
>>>>   	if (!tfile)
>>>> @@ -381,7 +455,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>>>
>>>>   	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
>>>>   	>= dev->tx_queue_len) {
>>>> -		if (!(tun->flags&   TUN_ONE_QUEUE)) {
>>>> +		if (!(tfile->flags&   TUN_ONE_QUEUE)&&
>>> Which patch moved flags from tun to tfile?
>> Patch 1 cache the tun->flags in tfile, but it seems this may let the
>> flags out of sync. So we'd better to use the one in tun_struct.
>>>> +		    !(tfile->flags&   TUN_TAP_MQ)) {
>>>>   			/* Normal queueing mode. */
>>>>   			/* Packet scheduler handles dropping of further packets. */
>>>>   			netif_stop_queue(dev);
>>>> @@ -390,7 +465,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>>>   			 * error is more appropriate. */
>>>>   			dev->stats.tx_fifo_errors++;
>>>>   		} else {
>>>> -			/* Single queue mode.
>>>> +			/* Single queue mode or multi queue mode.
>>>>   			 * Driver handles dropping of all packets itself. */
>>> Please don't do this. Stop the queue on overrun as appropriate.
>>> ONE_QUEUE is a legacy hack.
>>>
>>> BTW we really should stop queue before we start dropping packets,
>>> but that can be a separate patch.
>> The problem here is the using of NETIF_F_LLTX. Kernel could only see
>> one queue even for a multiqueue tun/tap. If we use
>> netif_stop_queue(), all other queues would be stopped also.
> Another reason not to use LLTX?

Yes.
>>>>   			goto drop;
>>>>   		}
>>>> @@ -408,9 +483,11 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>>>   		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
>>>>   	wake_up_interruptible_poll(&tfile->wq.wait, POLLIN |
>>>>   				   POLLRDNORM | POLLRDBAND);
>>>> +	rcu_read_unlock();
>>>>   	return NETDEV_TX_OK;
>>>>
>>>>   drop:
>>>> +	rcu_read_unlock();
>>>>   	dev->stats.tx_dropped++;
>>>>   	kfree_skb(skb);
>>>>   	return NETDEV_TX_OK;
>>>> @@ -527,16 +604,22 @@ static void tun_net_init(struct net_device *dev)
>>>>   static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
>>>>   {
>>>>   	struct tun_file *tfile = file->private_data;
>>>> -	struct tun_struct *tun = __tun_get(tfile);
>>>> +	struct tun_struct *tun = NULL;
>>>>   	struct sock *sk;
>>>>   	unsigned int mask = 0;
>>>>
>>>> -	if (!tun)
>>>> +	if (!tfile)
>>>>   		return POLLERR;
>>>>
>>>> -	sk = tfile->socket.sk;
>>>> +	rcu_read_lock();
>>>> +	tun = rcu_dereference(tfile->tun);
>>>> +	if (!tun) {
>>>> +		rcu_read_unlock();
>>>> +		return POLLERR;
>>>> +	}
>>>> +	rcu_read_unlock();
>>>>
>>>> -	tun_debug(KERN_INFO, tun, "tun_chr_poll\n");
>>>> +	sk =&tfile->sk;
>>>>
>>>>   	poll_wait(file,&tfile->wq.wait, wait);
>>>>
>>>> @@ -548,10 +631,12 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
>>>>   	     sock_writeable(sk)))
>>>>   		mask |= POLLOUT | POLLWRNORM;
>>>>
>>>> -	if (tun->dev->reg_state != NETREG_REGISTERED)
>>>> +	rcu_read_lock();
>>>> +	tun = rcu_dereference(tfile->tun);
>>>> +	if (!tun || tun->dev->reg_state != NETREG_REGISTERED)
>>>>   		mask = POLLERR;
>>>> +	rcu_read_unlock();
>>>>
>>>> -	tun_put(tun);
>>>>   	return mask;
>>>>   }
>>>>
>>>> @@ -708,9 +793,12 @@ static ssize_t tun_get_user(struct tun_file *tfile,
>>>>   		skb_shinfo(skb)->gso_segs = 0;
>>>>   	}
>>>>
>>>> -	tun = __tun_get(tfile);
>>>> -	if (!tun)
>>>> +	rcu_read_lock();
>>>> +	tun = rcu_dereference(tfile->tun);
>>>> +	if (!tun) {
>>>> +		rcu_read_unlock();
>>>>   		return -EBADFD;
>>>> +	}
>>>>
>>>>   	switch (tfile->flags&   TUN_TYPE_MASK) {
>>>>   	case TUN_TUN_DEV:
>>>> @@ -720,26 +808,30 @@ static ssize_t tun_get_user(struct tun_file *tfile,
>>>>   		skb->protocol = eth_type_trans(skb, tun->dev);
>>>>   		break;
>>>>   	}
>>>> -
>>>> -	netif_rx_ni(skb);
>>>>   	tun->dev->stats.rx_packets++;
>>>>   	tun->dev->stats.rx_bytes += len;
>>>> -	tun_put(tun);
>>>> +	rcu_read_unlock();
>>>> +
>>>> +	netif_rx_ni(skb);
>>>> +
>>>>   	return count;
>>>>
>>>>   err_free:
>>>>   	count = -EINVAL;
>>>>   	kfree_skb(skb);
>>>>   err:
>>>> -	tun = __tun_get(tfile);
>>>> -	if (!tun)
>>>> +	rcu_read_lock();
>>>> +	tun = rcu_dereference(tfile->tun);
>>>> +	if (!tun) {
>>>> +		rcu_read_unlock();
>>>>   		return -EBADFD;
>>>> +	}
>>>>
>>>>   	if (drop)
>>>>   		tun->dev->stats.rx_dropped++;
>>>>   	if (error)
>>>>   		tun->dev->stats.rx_frame_errors++;
>>>> -	tun_put(tun);
>>>> +	rcu_read_unlock();
>>>>   	return count;
>>>>   }
>>>>
>>>> @@ -833,12 +925,13 @@ static ssize_t tun_put_user(struct tun_file *tfile,
>>>>   	skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
>>>>   	total += skb->len;
>>>>
>>>> -	tun = __tun_get(tfile);
>>>> +	rcu_read_lock();
>>>> +	tun = rcu_dereference(tfile->tun);
>>>>   	if (tun) {
>>>>   		tun->dev->stats.tx_packets++;
>>>>   		tun->dev->stats.tx_bytes += len;
>>>> -		tun_put(tun);
>>>>   	}
>>>> +	rcu_read_unlock();
>>>>
>>>>   	return total;
>>>>   }
>>>> @@ -869,28 +962,31 @@ static ssize_t tun_do_read(struct tun_file *tfile,
>>>>   				break;
>>>>   			}
>>>>
>>>> -			tun = __tun_get(tfile);
>>>> +			rcu_read_lock();
>>>> +			tun = rcu_dereference(tfile->tun);
>>>>   			if (!tun) {
>>>> -				ret = -EIO;
>>>> +				ret = -EBADFD;
>>> BADFD is for when you get passed something like -1 fd.
>>> Here fd is OK, it's just in a bad state so you can not do IO.
>>>
>> Sure.
>>>> +				rcu_read_unlock();
>>>>   				break;
>>>>   			}
>>>>   			if (tun->dev->reg_state != NETREG_REGISTERED) {
>>>>   				ret = -EIO;
>>>> -				tun_put(tun);
>>>> +				rcu_read_unlock();
>>>>   				break;
>>>>   			}
>>>> -			tun_put(tun);
>>>> +			rcu_read_unlock();
>>>>
>>>>   			/* Nothing to read, let's sleep */
>>>>   			schedule();
>>>>   			continue;
>>>>   		}
>>>>
>>>> -		tun = __tun_get(tfile);
>>>> +		rcu_read_lock();
>>>> +		tun = rcu_dereference(tfile->tun);
>>>>   		if (tun) {
>>>>   			netif_wake_queue(tun->dev);
>>>> -			tun_put(tun);
>>>>   		}
>>>> +		rcu_read_unlock();
>>>>
>>>>   		ret = tun_put_user(tfile, skb, iv, len);
>>>>   		kfree_skb(skb);
>>>> @@ -1038,6 +1134,9 @@ static int tun_flags(struct tun_struct *tun)
>>>>   	if (tun->flags&   TUN_VNET_HDR)
>>>>   		flags |= IFF_VNET_HDR;
>>>>
>>>> +	if (tun->flags&   TUN_TAP_MQ)
>>>> +		flags |= IFF_MULTI_QUEUE;
>>>> +
>>>>   	return flags;
>>>>   }
>>>>
>>>> @@ -1097,8 +1196,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>   		err = tun_attach(tun, file);
>>>>   		if (err<   0)
>>>>   			return err;
>>>> -	}
>>>> -	else {
>>>> +	} else {
>>>>   		char *name;
>>>>   		unsigned long flags = 0;
>>>>
>>>> @@ -1142,6 +1240,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>   		dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
>>>>   			TUN_USER_FEATURES;
>>>>   		dev->features = dev->hw_features;
>>>> +		if (ifr->ifr_flags&   IFF_MULTI_QUEUE)
>>>> +			dev->features |= NETIF_F_LLTX;
>>>>
>>>>   		err = register_netdevice(tun->dev);
>>>>   		if (err<   0)
>>>> @@ -1154,7 +1254,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>
>>>>   		err = tun_attach(tun, file);
>>>>   		if (err<   0)
>>>> -			goto failed;
>>>> +			goto err_free_dev;
>>>>   	}
>>>>
>>>>   	tun_debug(KERN_INFO, tun, "tun_set_iff\n");
>>>> @@ -1174,6 +1274,11 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>   	else
>>>>   		tun->flags&= ~TUN_VNET_HDR;
>>>>
>>>> +	if (ifr->ifr_flags&   IFF_MULTI_QUEUE)
>>>> +		tun->flags |= TUN_TAP_MQ;
>>>> +	else
>>>> +		tun->flags&= ~TUN_TAP_MQ;
>>>> +
>>>>   	/* Cache flags from tun device */
>>>>   	tfile->flags = tun->flags;
>>>>   	/* Make sure persistent devices do not get stuck in
>>>> @@ -1187,7 +1292,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>
>>>>   err_free_dev:
>>>>   	free_netdev(dev);
>>>> -failed:
>>>>   	return err;
>>>>   }
>>>>
>>>> @@ -1264,38 +1368,40 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>>>   				(unsigned int __user*)argp);
>>>>   	}
>>>>
>>>> -	rtnl_lock();
>>>> -
>>>> -	tun = __tun_get(tfile);
>>>> -	if (cmd == TUNSETIFF&&   !tun) {
>>>> +	ret = 0;
>>>> +	if (cmd == TUNSETIFF) {
>>>> +		rtnl_lock();
>>>>   		ifr.ifr_name[IFNAMSIZ-1] = '\0';
>>>> -
>>>>   		ret = tun_set_iff(tfile->net, file,&ifr);
>>>> -
>>>> +		rtnl_unlock();
>>>>   		if (ret)
>>>> -			goto unlock;
>>>> -
>>>> +			return ret;
>>>>   		if (copy_to_user(argp,&ifr, ifreq_len))
>>>> -			ret = -EFAULT;
>>>> -		goto unlock;
>>>> +			return -EFAULT;
>>>> +		return ret;
>>>>   	}
>>>>
>>>> +	rtnl_lock();
>>>> +
>>>> +	rcu_read_lock();
>>>> +
>>>>   	ret = -EBADFD;
>>>> +	tun = rcu_dereference(tfile->tun);
>>>>   	if (!tun)
>>>>   		goto unlock;
>>>> +	else
>>>> +		ret = 0;
>>>>
>>>> -	tun_debug(KERN_INFO, tun, "tun_chr_ioctl cmd %d\n", cmd);
>>>> -
>>>> -	ret = 0;
>>>>   	switch (cmd) {
>>>>   	case TUNGETIFF:
>>>>   		ret = tun_get_iff(current->nsproxy->net_ns, tun,&ifr);
>>>> +		rcu_read_unlock();
>>>>   		if (ret)
>>>> -			break;
>>>> +			goto out;
>>>>
>>>>   		if (copy_to_user(argp,&ifr, ifreq_len))
>>>>   			ret = -EFAULT;
>>>> -		break;
>>>> +		goto out;
>>>>
>>>>   	case TUNSETNOCSUM:
>>>>   		/* Disable/Enable checksum */
>>>> @@ -1357,9 +1463,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>>>   		/* Get hw address */
>>>>   		memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
>>>>   		ifr.ifr_hwaddr.sa_family = tun->dev->type;
>>>> +		rcu_read_unlock();
>>>>   		if (copy_to_user(argp,&ifr, ifreq_len))
>>>>   			ret = -EFAULT;
>>>> -		break;
>>>> +		goto out;
>>>>
>>>>   	case SIOCSIFHWADDR:
>>>>   		/* Set hw address */
>>>> @@ -1375,9 +1482,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>>>   	}
>>>>
>>>>   unlock:
>>>> +	rcu_read_unlock();
>>>> +out:
>>>>   	rtnl_unlock();
>>>> -	if (tun)
>>>> -		tun_put(tun);
>>>>   	return ret;
>>>>   }
>>>>
>>>> @@ -1517,6 +1624,11 @@ out:
>>>>   	return ret;
>>>>   }
>>>>
>>>> +static void tun_sock_destruct(struct sock *sk)
>>>> +{
>>>> +	skb_queue_purge(&sk->sk_receive_queue);
>>>> +}
>>>> +
>>>>   static int tun_chr_open(struct inode *inode, struct file * file)
>>>>   {
>>>>   	struct net *net = current->nsproxy->net_ns;
>>>> @@ -1540,6 +1652,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
>>>>   	sock_init_data(&tfile->socket,&tfile->sk);
>>>>
>>>>   	tfile->sk.sk_write_space = tun_sock_write_space;
>>>> +	tfile->sk.sk_destruct = tun_sock_destruct;
>>>>   	tfile->sk.sk_sndbuf = INT_MAX;
>>>>   	file->private_data = tfile;
>>>>
>>>> @@ -1549,31 +1662,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
>>>>   static int tun_chr_close(struct inode *inode, struct file *file)
>>>>   {
>>>>   	struct tun_file *tfile = file->private_data;
>>>> -	struct tun_struct *tun;
>>>> -
>>>> -	tun = __tun_get(tfile);
>>>> -	if (tun) {
>>>> -		struct net_device *dev = tun->dev;
>>>> -
>>>> -		tun_debug(KERN_INFO, tun, "tun_chr_close\n");
>>>> -
>>>> -		__tun_detach(tun);
>>>> -
>>>> -		/* If desirable, unregister the netdevice. */
>>>> -		if (!(tun->flags&   TUN_PERSIST)) {
>>>> -			rtnl_lock();
>>>> -			if (dev->reg_state == NETREG_REGISTERED)
>>>> -				unregister_netdevice(dev);
>>>> -			rtnl_unlock();
>>>> -		}
>>>>
>>>> -		/* drop the reference that netdevice holds */
>>>> -		sock_put(&tfile->sk);
>>>> -
>>>> -	}
>>>> -
>>>> -	/* drop the reference that file holds */
>>>> -	sock_put(&tfile->sk);
>>>> +	tun_detach(tfile, true);
>>>>
>>>>   	return 0;
>>>>   }
>>>> @@ -1700,14 +1790,17 @@ static void tun_cleanup(void)
>>>>    * holding a reference to the file for as long as the socket is in use. */
>>>>   struct socket *tun_get_socket(struct file *file)
>>>>   {
>>>> -	struct tun_struct *tun;
>>>> +	struct tun_struct *tun = NULL;
>>>>   	struct tun_file *tfile = file->private_data;
>>>>   	if (file->f_op !=&tun_fops)
>>>>   		return ERR_PTR(-EINVAL);
>>>> -	tun = tun_get(file);
>>>> -	if (!tun)
>>>> +	rcu_read_lock();
>>>> +	tun = rcu_dereference(tfile->tun);
>>>> +	if (!tun) {
>>>> +		rcu_read_unlock();
>>>>   		return ERR_PTR(-EBADFD);
>>>> -	tun_put(tun);
>>>> +	}
>>>> +	rcu_read_unlock();
>>>>   	return&tfile->socket;
>>>>   }
>>>>   EXPORT_SYMBOL_GPL(tun_get_socket);

^ permalink raw reply

* [PATCH net-next 1/4 v2] net: sh_eth: remove unnecessary function
From: Shimoda, Yoshihiro @ 2012-06-27  5:59 UTC (permalink / raw)
  To: netdev; +Cc: SH-Linux

The sh_eth_timer() called mod_timer() for itself. So, this patch
removes the function.

Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
---
 about v2:
  - rebase the latest net-next.git
  - I don't submit the NAPI patch this time

 drivers/net/ethernet/renesas/sh_eth.c |   22 ----------------------
 drivers/net/ethernet/renesas/sh_eth.h |    1 -
 2 files changed, 0 insertions(+), 23 deletions(-)

diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index 326cb91..cf0bc31 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -1339,14 +1339,6 @@ other_irq:
 	return ret;
 }

-static void sh_eth_timer(unsigned long data)
-{
-	struct net_device *ndev = (struct net_device *)data;
-	struct sh_eth_private *mdp = netdev_priv(ndev);
-
-	mod_timer(&mdp->timer, jiffies + (10 * HZ));
-}
-
 /* PHY state control function */
 static void sh_eth_adjust_link(struct net_device *ndev)
 {
@@ -1594,11 +1586,6 @@ static int sh_eth_open(struct net_device *ndev)
 	if (ret)
 		goto out_free_irq;

-	/* Set the timer to check for link beat. */
-	init_timer(&mdp->timer);
-	mdp->timer.expires = (jiffies + (24 * HZ)) / 10;/* 2.4 sec. */
-	setup_timer(&mdp->timer, sh_eth_timer, (unsigned long)ndev);
-
 	return ret;

 out_free_irq:
@@ -1623,9 +1610,6 @@ static void sh_eth_tx_timeout(struct net_device *ndev)
 	/* tx_errors count up */
 	ndev->stats.tx_errors++;

-	/* timer off */
-	del_timer_sync(&mdp->timer);
-
 	/* Free all the skbuffs in the Rx queue. */
 	for (i = 0; i < RX_RING_SIZE; i++) {
 		rxdesc = &mdp->rx_ring[i];
@@ -1643,10 +1627,6 @@ static void sh_eth_tx_timeout(struct net_device *ndev)

 	/* device init */
 	sh_eth_dev_init(ndev);
-
-	/* timer on */
-	mdp->timer.expires = (jiffies + (24 * HZ)) / 10;/* 2.4 sec. */
-	add_timer(&mdp->timer);
 }

 /* Packet transmit function */
@@ -1719,8 +1699,6 @@ static int sh_eth_close(struct net_device *ndev)

 	free_irq(ndev->irq, ndev);

-	del_timer_sync(&mdp->timer);
-
 	/* Free all the skbuffs in the Rx queue. */
 	sh_eth_ring_free(ndev);

diff --git a/drivers/net/ethernet/renesas/sh_eth.h b/drivers/net/ethernet/renesas/sh_eth.h
index d6763b1..5af3f2a 100644
--- a/drivers/net/ethernet/renesas/sh_eth.h
+++ b/drivers/net/ethernet/renesas/sh_eth.h
@@ -772,7 +772,6 @@ struct sh_eth_private {
 	struct sh_eth_txdesc *tx_ring;
 	struct sk_buff **rx_skbuff;
 	struct sk_buff **tx_skbuff;
-	struct timer_list timer;
 	spinlock_t lock;
 	u32 cur_rx, dirty_rx;	/* Producer/consumer ring indices */
 	u32 cur_tx, dirty_tx;
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next 2/4 v2] net: sh_eth: remove unnecessary members/definitions
From: Shimoda, Yoshihiro @ 2012-06-27  5:59 UTC (permalink / raw)
  To: netdev; +Cc: SH-Linux

This patch removes unnecessary members in sh_eth_private.
This patch also removes unnecessary definitions in sh_eth.h

Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
---
 about v2:
  - rebase the latest net-next.git

 drivers/net/ethernet/renesas/sh_eth.c |    7 +---
 drivers/net/ethernet/renesas/sh_eth.h |   69 ---------------------------------
 2 files changed, 1 insertions(+), 75 deletions(-)

diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index cf0bc31..43e76d2 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -941,7 +941,6 @@ static int sh_eth_dev_init(struct net_device *ndev)
 {
 	int ret = 0;
 	struct sh_eth_private *mdp = netdev_priv(ndev);
-	u_int32_t rx_int_var, tx_int_var;
 	u32 val;

 	/* Soft Reset */
@@ -971,9 +970,7 @@ static int sh_eth_dev_init(struct net_device *ndev)
 	/* Frame recv control */
 	sh_eth_write(ndev, mdp->cd->rmcr_value, RMCR);

-	rx_int_var = mdp->rx_int_var = DESC_I_RINT8 | DESC_I_RINT5;
-	tx_int_var = mdp->tx_int_var = DESC_I_TINT2;
-	sh_eth_write(ndev, rx_int_var | tx_int_var, TRSCER);
+	sh_eth_write(ndev, DESC_I_RINT8 | DESC_I_RINT5 | DESC_I_TINT2, TRSCER);

 	if (mdp->cd->bculr)
 		sh_eth_write(ndev, 0x800, BCULR);	/* Burst sycle set */
@@ -2336,8 +2333,6 @@ static int sh_eth_drv_probe(struct platform_device *pdev)

 	/* debug message level */
 	mdp->msg_enable = SH_ETH_DEF_MSG_ENABLE;
-	mdp->post_rx = POST_RX >> (devno << 1);
-	mdp->post_fw = POST_FW >> (devno << 1);

 	/* read and set MAC address */
 	read_mac_address(ndev, pd->mac_addr);
diff --git a/drivers/net/ethernet/renesas/sh_eth.h b/drivers/net/ethernet/renesas/sh_eth.h
index 5af3f2a..37a0702 100644
--- a/drivers/net/ethernet/renesas/sh_eth.h
+++ b/drivers/net/ethernet/renesas/sh_eth.h
@@ -585,71 +585,6 @@ enum RPADIR_BIT {
 /* FDR */
 #define DEFAULT_FDR_INIT	0x00000707

-enum phy_offsets {
-	PHY_CTRL = 0, PHY_STAT = 1, PHY_IDT1 = 2, PHY_IDT2 = 3,
-	PHY_ANA = 4, PHY_ANL = 5, PHY_ANE = 6,
-	PHY_16 = 16,
-};
-
-/* PHY_CTRL */
-enum PHY_CTRL_BIT {
-	PHY_C_RESET = 0x8000, PHY_C_LOOPBK = 0x4000, PHY_C_SPEEDSL = 0x2000,
-	PHY_C_ANEGEN = 0x1000, PHY_C_PWRDN = 0x0800, PHY_C_ISO = 0x0400,
-	PHY_C_RANEG = 0x0200, PHY_C_DUPLEX = 0x0100, PHY_C_COLT = 0x0080,
-};
-#define DM9161_PHY_C_ANEGEN 0	/* auto nego special */
-
-/* PHY_STAT */
-enum PHY_STAT_BIT {
-	PHY_S_100T4 = 0x8000, PHY_S_100X_F = 0x4000, PHY_S_100X_H = 0x2000,
-	PHY_S_10T_F = 0x1000, PHY_S_10T_H = 0x0800, PHY_S_ANEGC = 0x0020,
-	PHY_S_RFAULT = 0x0010, PHY_S_ANEGA = 0x0008, PHY_S_LINK = 0x0004,
-	PHY_S_JAB = 0x0002, PHY_S_EXTD = 0x0001,
-};
-
-/* PHY_ANA */
-enum PHY_ANA_BIT {
-	PHY_A_NP = 0x8000, PHY_A_ACK = 0x4000, PHY_A_RF = 0x2000,
-	PHY_A_FCS = 0x0400, PHY_A_T4 = 0x0200, PHY_A_FDX = 0x0100,
-	PHY_A_HDX = 0x0080, PHY_A_10FDX = 0x0040, PHY_A_10HDX = 0x0020,
-	PHY_A_SEL = 0x001e,
-};
-/* PHY_ANL */
-enum PHY_ANL_BIT {
-	PHY_L_NP = 0x8000, PHY_L_ACK = 0x4000, PHY_L_RF = 0x2000,
-	PHY_L_FCS = 0x0400, PHY_L_T4 = 0x0200, PHY_L_FDX = 0x0100,
-	PHY_L_HDX = 0x0080, PHY_L_10FDX = 0x0040, PHY_L_10HDX = 0x0020,
-	PHY_L_SEL = 0x001f,
-};
-
-/* PHY_ANE */
-enum PHY_ANE_BIT {
-	PHY_E_PDF = 0x0010, PHY_E_LPNPA = 0x0008, PHY_E_NPA = 0x0004,
-	PHY_E_PRX = 0x0002, PHY_E_LPANEGA = 0x0001,
-};
-
-/* DM9161 */
-enum PHY_16_BIT {
-	PHY_16_BP4B45 = 0x8000, PHY_16_BPSCR = 0x4000, PHY_16_BPALIGN = 0x2000,
-	PHY_16_BP_ADPOK = 0x1000, PHY_16_Repeatmode = 0x0800,
-	PHY_16_TXselect = 0x0400,
-	PHY_16_Rsvd = 0x0200, PHY_16_RMIIEnable = 0x0100,
-	PHY_16_Force100LNK = 0x0080,
-	PHY_16_APDLED_CTL = 0x0040, PHY_16_COLLED_CTL = 0x0020,
-	PHY_16_RPDCTR_EN = 0x0010,
-	PHY_16_ResetStMch = 0x0008, PHY_16_PreamSupr = 0x0004,
-	PHY_16_Sleepmode = 0x0002,
-	PHY_16_RemoteLoopOut = 0x0001,
-};
-
-#define POST_RX		0x08
-#define POST_FW		0x04
-#define POST0_RX	(POST_RX)
-#define POST0_FW	(POST_FW)
-#define POST1_RX	(POST_RX >> 2)
-#define POST1_FW	(POST_FW >> 2)
-#define POST_ALL	(POST0_RX | POST0_FW | POST1_RX | POST1_FW)
-
 /* ARSTR */
 enum ARSTR_BIT { ARSTR_ARSTR = 0x00000001, };

@@ -786,10 +721,6 @@ struct sh_eth_private {
 	int msg_enable;
 	int speed;
 	int duplex;
-	u32 rx_int_var, tx_int_var;	/* interrupt control variables */
-	char post_rx;		/* POST receive */
-	char post_fw;		/* POST forward */
-	struct net_device_stats tsu_stats;	/* TSU forward status */
 	int port;		/* for TSU */
 	int vlan_num_ids;	/* for VLAN tag filter */

-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next 4/4 v2] net: sh_eth: add support for set_ringparam/get_ringparam
From: Shimoda, Yoshihiro @ 2012-06-27  6:00 UTC (permalink / raw)
  To: netdev; +Cc: SH-Linux

This patch adds support for ethtool's set_ringparam() and get_ringparam()
operations.

Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
---
 about v2:
  - rebase the latest net-next.git

 drivers/net/ethernet/renesas/sh_eth.c |  139 +++++++++++++++++++++++++--------
 drivers/net/ethernet/renesas/sh_eth.h |    6 ++
 2 files changed, 112 insertions(+), 33 deletions(-)

diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index 2dd2ff5..af0b867 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -782,7 +782,7 @@ static void sh_eth_ring_free(struct net_device *ndev)

 	/* Free Rx skb ringbuffer */
 	if (mdp->rx_skbuff) {
-		for (i = 0; i < RX_RING_SIZE; i++) {
+		for (i = 0; i < mdp->num_rx_ring; i++) {
 			if (mdp->rx_skbuff[i])
 				dev_kfree_skb(mdp->rx_skbuff[i]);
 		}
@@ -792,7 +792,7 @@ static void sh_eth_ring_free(struct net_device *ndev)

 	/* Free Tx skb ringbuffer */
 	if (mdp->tx_skbuff) {
-		for (i = 0; i < TX_RING_SIZE; i++) {
+		for (i = 0; i < mdp->num_tx_ring; i++) {
 			if (mdp->tx_skbuff[i])
 				dev_kfree_skb(mdp->tx_skbuff[i]);
 		}
@@ -809,8 +809,8 @@ static void sh_eth_ring_format(struct net_device *ndev)
 	struct sk_buff *skb;
 	struct sh_eth_rxdesc *rxdesc = NULL;
 	struct sh_eth_txdesc *txdesc = NULL;
-	int rx_ringsize = sizeof(*rxdesc) * RX_RING_SIZE;
-	int tx_ringsize = sizeof(*txdesc) * TX_RING_SIZE;
+	int rx_ringsize = sizeof(*rxdesc) * mdp->num_rx_ring;
+	int tx_ringsize = sizeof(*txdesc) * mdp->num_tx_ring;

 	mdp->cur_rx = mdp->cur_tx = 0;
 	mdp->dirty_rx = mdp->dirty_tx = 0;
@@ -818,7 +818,7 @@ static void sh_eth_ring_format(struct net_device *ndev)
 	memset(mdp->rx_ring, 0, rx_ringsize);

 	/* build Rx ring buffer */
-	for (i = 0; i < RX_RING_SIZE; i++) {
+	for (i = 0; i < mdp->num_rx_ring; i++) {
 		/* skb */
 		mdp->rx_skbuff[i] = NULL;
 		skb = netdev_alloc_skb(ndev, mdp->rx_buf_sz);
@@ -844,7 +844,7 @@ static void sh_eth_ring_format(struct net_device *ndev)
 		}
 	}

-	mdp->dirty_rx = (u32) (i - RX_RING_SIZE);
+	mdp->dirty_rx = (u32) (i - mdp->num_rx_ring);

 	/* Mark the last entry as wrapping the ring. */
 	rxdesc->status |= cpu_to_edmac(mdp, RD_RDEL);
@@ -852,7 +852,7 @@ static void sh_eth_ring_format(struct net_device *ndev)
 	memset(mdp->tx_ring, 0, tx_ringsize);

 	/* build Tx ring buffer */
-	for (i = 0; i < TX_RING_SIZE; i++) {
+	for (i = 0; i < mdp->num_tx_ring; i++) {
 		mdp->tx_skbuff[i] = NULL;
 		txdesc = &mdp->tx_ring[i];
 		txdesc->status = cpu_to_edmac(mdp, TD_TFP);
@@ -886,7 +886,7 @@ static int sh_eth_ring_init(struct net_device *ndev)
 		mdp->rx_buf_sz += NET_IP_ALIGN;

 	/* Allocate RX and TX skb rings */
-	mdp->rx_skbuff = kmalloc(sizeof(*mdp->rx_skbuff) * RX_RING_SIZE,
+	mdp->rx_skbuff = kmalloc(sizeof(*mdp->rx_skbuff) * mdp->num_rx_ring,
 				GFP_KERNEL);
 	if (!mdp->rx_skbuff) {
 		dev_err(&ndev->dev, "Cannot allocate Rx skb\n");
@@ -894,7 +894,7 @@ static int sh_eth_ring_init(struct net_device *ndev)
 		return ret;
 	}

-	mdp->tx_skbuff = kmalloc(sizeof(*mdp->tx_skbuff) * TX_RING_SIZE,
+	mdp->tx_skbuff = kmalloc(sizeof(*mdp->tx_skbuff) * mdp->num_tx_ring,
 				GFP_KERNEL);
 	if (!mdp->tx_skbuff) {
 		dev_err(&ndev->dev, "Cannot allocate Tx skb\n");
@@ -903,7 +903,7 @@ static int sh_eth_ring_init(struct net_device *ndev)
 	}

 	/* Allocate all Rx descriptors. */
-	rx_ringsize = sizeof(struct sh_eth_rxdesc) * RX_RING_SIZE;
+	rx_ringsize = sizeof(struct sh_eth_rxdesc) * mdp->num_rx_ring;
 	mdp->rx_ring = dma_alloc_coherent(NULL, rx_ringsize, &mdp->rx_desc_dma,
 			GFP_KERNEL);

@@ -917,7 +917,7 @@ static int sh_eth_ring_init(struct net_device *ndev)
 	mdp->dirty_rx = 0;

 	/* Allocate all Tx descriptors. */
-	tx_ringsize = sizeof(struct sh_eth_txdesc) * TX_RING_SIZE;
+	tx_ringsize = sizeof(struct sh_eth_txdesc) * mdp->num_tx_ring;
 	mdp->tx_ring = dma_alloc_coherent(NULL, tx_ringsize, &mdp->tx_desc_dma,
 			GFP_KERNEL);
 	if (!mdp->tx_ring) {
@@ -946,21 +946,21 @@ static void sh_eth_free_dma_buffer(struct sh_eth_private *mdp)
 	int ringsize;

 	if (mdp->rx_ring) {
-		ringsize = sizeof(struct sh_eth_rxdesc) * RX_RING_SIZE;
+		ringsize = sizeof(struct sh_eth_rxdesc) * mdp->num_rx_ring;
 		dma_free_coherent(NULL, ringsize, mdp->rx_ring,
 				  mdp->rx_desc_dma);
 		mdp->rx_ring = NULL;
 	}

 	if (mdp->tx_ring) {
-		ringsize = sizeof(struct sh_eth_txdesc) * TX_RING_SIZE;
+		ringsize = sizeof(struct sh_eth_txdesc) * mdp->num_tx_ring;
 		dma_free_coherent(NULL, ringsize, mdp->tx_ring,
 				  mdp->tx_desc_dma);
 		mdp->tx_ring = NULL;
 	}
 }

-static int sh_eth_dev_init(struct net_device *ndev)
+static int sh_eth_dev_init(struct net_device *ndev, bool start)
 {
 	int ret = 0;
 	struct sh_eth_private *mdp = netdev_priv(ndev);
@@ -1008,7 +1008,8 @@ static int sh_eth_dev_init(struct net_device *ndev)
 		     RFLR);

 	sh_eth_write(ndev, sh_eth_read(ndev, EESR), EESR);
-	sh_eth_write(ndev, mdp->cd->eesipr_value, EESIPR);
+	if (start)
+		sh_eth_write(ndev, mdp->cd->eesipr_value, EESIPR);

 	/* PAUSE Prohibition */
 	val = (sh_eth_read(ndev, ECMR) & ECMR_DM) |
@@ -1023,7 +1024,8 @@ static int sh_eth_dev_init(struct net_device *ndev)
 	sh_eth_write(ndev, mdp->cd->ecsr_value, ECSR);

 	/* E-MAC Interrupt Enable register */
-	sh_eth_write(ndev, mdp->cd->ecsipr_value, ECSIPR);
+	if (start)
+		sh_eth_write(ndev, mdp->cd->ecsipr_value, ECSIPR);

 	/* Set MAC address */
 	update_mac_address(ndev);
@@ -1036,10 +1038,12 @@ static int sh_eth_dev_init(struct net_device *ndev)
 	if (mdp->cd->tpauser)
 		sh_eth_write(ndev, TPAUSER_UNLIMITED, TPAUSER);

-	/* Setting the Rx mode will start the Rx process. */
-	sh_eth_write(ndev, EDRRR_R, EDRRR);
+	if (start) {
+		/* Setting the Rx mode will start the Rx process. */
+		sh_eth_write(ndev, EDRRR_R, EDRRR);

-	netif_start_queue(ndev);
+		netif_start_queue(ndev);
+	}

 out:
 	return ret;
@@ -1054,7 +1058,7 @@ static int sh_eth_txfree(struct net_device *ndev)
 	int entry = 0;

 	for (; mdp->cur_tx - mdp->dirty_tx > 0; mdp->dirty_tx++) {
-		entry = mdp->dirty_tx % TX_RING_SIZE;
+		entry = mdp->dirty_tx % mdp->num_tx_ring;
 		txdesc = &mdp->tx_ring[entry];
 		if (txdesc->status & cpu_to_edmac(mdp, TD_TACT))
 			break;
@@ -1067,7 +1071,7 @@ static int sh_eth_txfree(struct net_device *ndev)
 			freeNum++;
 		}
 		txdesc->status = cpu_to_edmac(mdp, TD_TFP);
-		if (entry >= TX_RING_SIZE - 1)
+		if (entry >= mdp->num_tx_ring - 1)
 			txdesc->status |= cpu_to_edmac(mdp, TD_TDLE);

 		ndev->stats.tx_packets++;
@@ -1082,8 +1086,8 @@ static int sh_eth_rx(struct net_device *ndev, u32 intr_status)
 	struct sh_eth_private *mdp = netdev_priv(ndev);
 	struct sh_eth_rxdesc *rxdesc;

-	int entry = mdp->cur_rx % RX_RING_SIZE;
-	int boguscnt = (mdp->dirty_rx + RX_RING_SIZE) - mdp->cur_rx;
+	int entry = mdp->cur_rx % mdp->num_rx_ring;
+	int boguscnt = (mdp->dirty_rx + mdp->num_rx_ring) - mdp->cur_rx;
 	struct sk_buff *skb;
 	u16 pkt_len = 0;
 	u32 desc_status;
@@ -1134,13 +1138,13 @@ static int sh_eth_rx(struct net_device *ndev, u32 intr_status)
 			ndev->stats.rx_bytes += pkt_len;
 		}
 		rxdesc->status |= cpu_to_edmac(mdp, RD_RACT);
-		entry = (++mdp->cur_rx) % RX_RING_SIZE;
+		entry = (++mdp->cur_rx) % mdp->num_rx_ring;
 		rxdesc = &mdp->rx_ring[entry];
 	}

 	/* Refill the Rx ring buffers. */
 	for (; mdp->cur_rx - mdp->dirty_rx > 0; mdp->dirty_rx++) {
-		entry = mdp->dirty_rx % RX_RING_SIZE;
+		entry = mdp->dirty_rx % mdp->num_rx_ring;
 		rxdesc = &mdp->rx_ring[entry];
 		/* The size of the buffer is 16 byte boundary. */
 		rxdesc->buffer_length = ALIGN(mdp->rx_buf_sz, 16);
@@ -1157,7 +1161,7 @@ static int sh_eth_rx(struct net_device *ndev, u32 intr_status)
 			skb_checksum_none_assert(skb);
 			rxdesc->addr = virt_to_phys(PTR_ALIGN(skb->data, 4));
 		}
-		if (entry >= RX_RING_SIZE - 1)
+		if (entry >= mdp->num_rx_ring - 1)
 			rxdesc->status |=
 				cpu_to_edmac(mdp, RD_RACT | RD_RFP | RD_RDEL);
 		else
@@ -1557,6 +1561,71 @@ static void sh_eth_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
 	}
 }

+static void sh_eth_get_ringparam(struct net_device *ndev,
+				 struct ethtool_ringparam *ring)
+{
+	struct sh_eth_private *mdp = netdev_priv(ndev);
+
+	ring->rx_max_pending = RX_RING_MAX;
+	ring->tx_max_pending = TX_RING_MAX;
+	ring->rx_pending = mdp->num_rx_ring;
+	ring->tx_pending = mdp->num_tx_ring;
+}
+
+static int sh_eth_set_ringparam(struct net_device *ndev,
+				struct ethtool_ringparam *ring)
+{
+	struct sh_eth_private *mdp = netdev_priv(ndev);
+	int ret;
+
+	if (ring->tx_pending > TX_RING_MAX ||
+	    ring->rx_pending > RX_RING_MAX ||
+	    ring->tx_pending < TX_RING_MIN ||
+	    ring->rx_pending < RX_RING_MIN)
+		return -EINVAL;
+	if (ring->rx_mini_pending || ring->rx_jumbo_pending)
+		return -EINVAL;
+
+	if (netif_running(ndev)) {
+		netif_tx_disable(ndev);
+		/* Disable interrupts by clearing the interrupt mask. */
+		sh_eth_write(ndev, 0x0000, EESIPR);
+		/* Stop the chip's Tx and Rx processes. */
+		sh_eth_write(ndev, 0, EDTRR);
+		sh_eth_write(ndev, 0, EDRRR);
+		synchronize_irq(ndev->irq);
+	}
+
+	/* Free all the skbuffs in the Rx queue. */
+	sh_eth_ring_free(ndev);
+	/* Free DMA buffer */
+	sh_eth_free_dma_buffer(mdp);
+
+	/* Set new parameters */
+	mdp->num_rx_ring = ring->rx_pending;
+	mdp->num_tx_ring = ring->tx_pending;
+
+	ret = sh_eth_ring_init(ndev);
+	if (ret < 0) {
+		dev_err(&ndev->dev, "%s: sh_eth_ring_init failed.\n", __func__);
+		return ret;
+	}
+	ret = sh_eth_dev_init(ndev, false);
+	if (ret < 0) {
+		dev_err(&ndev->dev, "%s: sh_eth_dev_init failed.\n", __func__);
+		return ret;
+	}
+
+	if (netif_running(ndev)) {
+		sh_eth_write(ndev, mdp->cd->eesipr_value, EESIPR);
+		/* Setting the Rx mode will start the Rx process. */
+		sh_eth_write(ndev, EDRRR_R, EDRRR);
+		netif_wake_queue(ndev);
+	}
+
+	return 0;
+}
+
 static const struct ethtool_ops sh_eth_ethtool_ops = {
 	.get_settings	= sh_eth_get_settings,
 	.set_settings	= sh_eth_set_settings,
@@ -1567,6 +1636,8 @@ static const struct ethtool_ops sh_eth_ethtool_ops = {
 	.get_strings	= sh_eth_get_strings,
 	.get_ethtool_stats  = sh_eth_get_ethtool_stats,
 	.get_sset_count     = sh_eth_get_sset_count,
+	.get_ringparam	= sh_eth_get_ringparam,
+	.set_ringparam	= sh_eth_set_ringparam,
 };

 /* network device open function */
@@ -1597,7 +1668,7 @@ static int sh_eth_open(struct net_device *ndev)
 		goto out_free_irq;

 	/* device init */
-	ret = sh_eth_dev_init(ndev);
+	ret = sh_eth_dev_init(ndev, true);
 	if (ret)
 		goto out_free_irq;

@@ -1631,7 +1702,7 @@ static void sh_eth_tx_timeout(struct net_device *ndev)
 	ndev->stats.tx_errors++;

 	/* Free all the skbuffs in the Rx queue. */
-	for (i = 0; i < RX_RING_SIZE; i++) {
+	for (i = 0; i < mdp->num_rx_ring; i++) {
 		rxdesc = &mdp->rx_ring[i];
 		rxdesc->status = 0;
 		rxdesc->addr = 0xBADF00D0;
@@ -1639,14 +1710,14 @@ static void sh_eth_tx_timeout(struct net_device *ndev)
 			dev_kfree_skb(mdp->rx_skbuff[i]);
 		mdp->rx_skbuff[i] = NULL;
 	}
-	for (i = 0; i < TX_RING_SIZE; i++) {
+	for (i = 0; i < mdp->num_tx_ring; i++) {
 		if (mdp->tx_skbuff[i])
 			dev_kfree_skb(mdp->tx_skbuff[i]);
 		mdp->tx_skbuff[i] = NULL;
 	}

 	/* device init */
-	sh_eth_dev_init(ndev);
+	sh_eth_dev_init(ndev, true);
 }

 /* Packet transmit function */
@@ -1658,7 +1729,7 @@ static int sh_eth_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 	unsigned long flags;

 	spin_lock_irqsave(&mdp->lock, flags);
-	if ((mdp->cur_tx - mdp->dirty_tx) >= (TX_RING_SIZE - 4)) {
+	if ((mdp->cur_tx - mdp->dirty_tx) >= (mdp->num_tx_ring - 4)) {
 		if (!sh_eth_txfree(ndev)) {
 			if (netif_msg_tx_queued(mdp))
 				dev_warn(&ndev->dev, "TxFD exhausted.\n");
@@ -1669,7 +1740,7 @@ static int sh_eth_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 	}
 	spin_unlock_irqrestore(&mdp->lock, flags);

-	entry = mdp->cur_tx % TX_RING_SIZE;
+	entry = mdp->cur_tx % mdp->num_tx_ring;
 	mdp->tx_skbuff[entry] = skb;
 	txdesc = &mdp->tx_ring[entry];
 	/* soft swap. */
@@ -1683,7 +1754,7 @@ static int sh_eth_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 	else
 		txdesc->buffer_length = skb->len;

-	if (entry >= TX_RING_SIZE - 1)
+	if (entry >= mdp->num_tx_ring - 1)
 		txdesc->status |= cpu_to_edmac(mdp, TD_TACT | TD_TDLE);
 	else
 		txdesc->status |= cpu_to_edmac(mdp, TD_TACT);
@@ -2313,6 +2384,8 @@ static int sh_eth_drv_probe(struct platform_device *pdev)
 	ether_setup(ndev);

 	mdp = netdev_priv(ndev);
+	mdp->num_tx_ring = TX_RING_SIZE;
+	mdp->num_rx_ring = RX_RING_SIZE;
 	mdp->addr = ioremap(res->start, resource_size(res));
 	if (mdp->addr == NULL) {
 		ret = -ENOMEM;
diff --git a/drivers/net/ethernet/renesas/sh_eth.h b/drivers/net/ethernet/renesas/sh_eth.h
index 37a0702..bae84fd 100644
--- a/drivers/net/ethernet/renesas/sh_eth.h
+++ b/drivers/net/ethernet/renesas/sh_eth.h
@@ -27,6 +27,10 @@
 #define TX_TIMEOUT	(5*HZ)
 #define TX_RING_SIZE	64	/* Tx ring size */
 #define RX_RING_SIZE	64	/* Rx ring size */
+#define TX_RING_MIN	64
+#define RX_RING_MIN	64
+#define TX_RING_MAX	1024
+#define RX_RING_MAX	1024
 #define ETHERSMALL		60
 #define PKT_BUF_SZ		1538
 #define SH_ETH_TSU_TIMEOUT_MS	500
@@ -701,6 +705,8 @@ struct sh_eth_private {
 	const u16 *reg_offset;
 	void __iomem *addr;
 	void __iomem *tsu_addr;
+	u32 num_rx_ring;
+	u32 num_tx_ring;
 	dma_addr_t rx_desc_dma;
 	dma_addr_t tx_desc_dma;
 	struct sh_eth_rxdesc *rx_ring;
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next 3/4 v2] net: sh_eth: fix up the buffer pointers
From: Shimoda, Yoshihiro @ 2012-06-27  6:00 UTC (permalink / raw)
  To: netdev; +Cc: SH-Linux

After freeing the buffer, the driver should change the value of
the pointer to NULL.

Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
---
 about v2:
  - rebase the latest net-next.git

 drivers/net/ethernet/renesas/sh_eth.c |   31 ++++++++++++++++++++++++-------
 1 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index 43e76d2..2dd2ff5 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -788,6 +788,7 @@ static void sh_eth_ring_free(struct net_device *ndev)
 		}
 	}
 	kfree(mdp->rx_skbuff);
+	mdp->rx_skbuff = NULL;

 	/* Free Tx skb ringbuffer */
 	if (mdp->tx_skbuff) {
@@ -797,6 +798,7 @@ static void sh_eth_ring_free(struct net_device *ndev)
 		}
 	}
 	kfree(mdp->tx_skbuff);
+	mdp->tx_skbuff = NULL;
 }

 /* format skb and descriptor buffer */
@@ -933,10 +935,31 @@ desc_ring_free:
 skb_ring_free:
 	/* Free Rx and Tx skb ring buffer */
 	sh_eth_ring_free(ndev);
+	mdp->tx_ring = NULL;
+	mdp->rx_ring = NULL;

 	return ret;
 }

+static void sh_eth_free_dma_buffer(struct sh_eth_private *mdp)
+{
+	int ringsize;
+
+	if (mdp->rx_ring) {
+		ringsize = sizeof(struct sh_eth_rxdesc) * RX_RING_SIZE;
+		dma_free_coherent(NULL, ringsize, mdp->rx_ring,
+				  mdp->rx_desc_dma);
+		mdp->rx_ring = NULL;
+	}
+
+	if (mdp->tx_ring) {
+		ringsize = sizeof(struct sh_eth_txdesc) * TX_RING_SIZE;
+		dma_free_coherent(NULL, ringsize, mdp->tx_ring,
+				  mdp->tx_desc_dma);
+		mdp->tx_ring = NULL;
+	}
+}
+
 static int sh_eth_dev_init(struct net_device *ndev)
 {
 	int ret = 0;
@@ -1677,7 +1700,6 @@ static int sh_eth_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 static int sh_eth_close(struct net_device *ndev)
 {
 	struct sh_eth_private *mdp = netdev_priv(ndev);
-	int ringsize;

 	netif_stop_queue(ndev);

@@ -1700,12 +1722,7 @@ static int sh_eth_close(struct net_device *ndev)
 	sh_eth_ring_free(ndev);

 	/* free DMA buffer */
-	ringsize = sizeof(struct sh_eth_rxdesc) * RX_RING_SIZE;
-	dma_free_coherent(NULL, ringsize, mdp->rx_ring, mdp->rx_desc_dma);
-
-	/* free DMA buffer */
-	ringsize = sizeof(struct sh_eth_txdesc) * TX_RING_SIZE;
-	dma_free_coherent(NULL, ringsize, mdp->tx_ring, mdp->tx_desc_dma);
+	sh_eth_free_dma_buffer(mdp);

 	pm_runtime_put_sync(&mdp->pdev->dev);

-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next] be2net: Fix to trim skb for padded vlan packets to workaround an ASIC Bug
From: Somnath Kotur @ 2012-06-27  6:04 UTC (permalink / raw)
  To: netdev; +Cc: davem, Somnath Kotur

Also refactored existing code a bit to provide placeholders for another ASIC Bug
workaround that will be checked-in soon after this.

Signed-off-by: Somnath Kotur <somnath.kotur@emulex.com>
---
 drivers/net/ethernet/emulex/benet/be.h      |    5 ++
 drivers/net/ethernet/emulex/benet/be_main.c |   56 ++++++++++++++++++++-------
 2 files changed, 47 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h
index 7b5cc2b..7a71fb6 100644
--- a/drivers/net/ethernet/emulex/benet/be.h
+++ b/drivers/net/ethernet/emulex/benet/be.h
@@ -573,6 +573,11 @@ static inline u8 is_udp_pkt(struct sk_buff *skb)
 	return val;
 }
 
+static inline bool is_ipv4_pkt(struct sk_buff *skb)
+{
+	return skb->protocol == ntohs(ETH_P_IP) && ip_hdr(skb)->version == 4;
+}
+
 static inline void be_vf_eth_addr_generate(struct be_adapter *adapter, u8 *mac)
 {
 	u32 addr;
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index a28896d..22e2c04 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -577,6 +577,11 @@ static inline u16 be_get_tx_vlan_tag(struct be_adapter *adapter,
 	return vlan_tag;
 }
 
+static int be_vlan_tag_chk(struct be_adapter *adapter, struct sk_buff *skb)
+{
+	return vlan_tx_tag_present(skb) || adapter->pvid;
+}
+
 static void wrb_fill_hdr(struct be_adapter *adapter, struct be_eth_hdr_wrb *hdr,
 		struct sk_buff *skb, u32 wrb_cnt, u32 len)
 {
@@ -704,33 +709,56 @@ dma_err:
 	return 0;
 }
 
+static struct sk_buff *be_insert_vlan_in_pkt(struct be_adapter *adapter,
+					     struct sk_buff *skb)
+{
+	u16 vlan_tag = 0;
+
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (unlikely(!skb))
+		return skb;
+
+	if (vlan_tx_tag_present(skb)) {
+		vlan_tag = be_get_tx_vlan_tag(adapter, skb);
+		__vlan_put_tag(skb, vlan_tag);
+		skb->vlan_tci = 0;
+	}
+
+	return skb;
+}
+
 static netdev_tx_t be_xmit(struct sk_buff *skb,
 			struct net_device *netdev)
 {
 	struct be_adapter *adapter = netdev_priv(netdev);
 	struct be_tx_obj *txo = &adapter->tx_obj[skb_get_queue_mapping(skb)];
 	struct be_queue_info *txq = &txo->q;
+	struct iphdr *ip = NULL;
 	u32 wrb_cnt = 0, copied = 0;
-	u32 start = txq->head;
+	u32 start = txq->head, eth_hdr_len;
 	bool dummy_wrb, stopped = false;
 
-	/* For vlan tagged pkts, BE
-	 * 1) calculates checksum even when CSO is not requested
-	 * 2) calculates checksum wrongly for padded pkt less than
-	 * 60 bytes long.
-	 * As a workaround disable TX vlan offloading in such cases.
+	eth_hdr_len = ntohs(skb->protocol) == ETH_P_8021Q ?
+		VLAN_ETH_HLEN : ETH_HLEN;
+
+	/* HW has a bug which considers padding bytes as legal
+	 * and modifies the IPv4 hdr's 'tot_len' field
 	 */
-	if (vlan_tx_tag_present(skb) &&
-	    (skb->ip_summed != CHECKSUM_PARTIAL || skb->len <= 60)) {
-		skb = skb_share_check(skb, GFP_ATOMIC);
-		if (unlikely(!skb))
-			goto tx_drop;
+	if (skb->len <= 60 && be_vlan_tag_chk(adapter, skb) &&
+			is_ipv4_pkt(skb)) {
+		ip = (struct iphdr *)ip_hdr(skb);
+		pskb_trim(skb, eth_hdr_len + ntohs(ip->tot_len));
+	}
 
-		skb = __vlan_put_tag(skb, be_get_tx_vlan_tag(adapter, skb));
+	/* HW has a bug wherein it will calculate CSUM for VLAN
+	 * pkts even though it is disabled.
+	 * Manually insert VLAN in pkt.
+	 */
+	if (skb->ip_summed != CHECKSUM_PARTIAL &&
+			be_vlan_tag_chk(adapter, skb)) {
+		skb = be_insert_vlan_in_pkt(adapter, skb);
 		if (unlikely(!skb))
 			goto tx_drop;
-
-		skb->vlan_tci = 0;
 	}
 
 	wrb_cnt = wrb_cnt_for_skb(adapter, skb, &dummy_wrb);
-- 
1.5.6.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox