Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH] Phonet: set the pipe handle using setsockopt
From: Hemant Vilas RAMDASI @ 2011-11-09 11:20 UTC (permalink / raw)
  To: remi.denis-courmont; +Cc: netdev, Dinesh Kumar Sharma, Hemant Ramdasi

From: Dinesh Kumar Sharma <dinesh.sharma@stericsson.com>

Allow user space to set the pipe handle with setsockopt(PNPIPE_HANDLE),
to select the initial pipe state with PNPIPE_INITSTATE, and to enable
the pipe afterwards with PNPIPE_ENABLE.

Signed-off-by: Hemant Ramdasi <hemant.ramdasi@stericsson.com>
Signed-off-by: Dinesh Kumar Sharma <dinesh.sharma@stericsson.com>
---
 include/linux/phonet.h |    2 +
 net/phonet/pep.c       |   86 ++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 86 insertions(+), 2 deletions(-)

diff --git a/include/linux/phonet.h b/include/linux/phonet.h
index 6fb1384..491caec 100644
--- a/include/linux/phonet.h
+++ b/include/linux/phonet.h
@@ -37,6 +37,8 @@
 #define PNPIPE_ENCAP		1
 #define PNPIPE_IFINDEX		2
 #define PNPIPE_HANDLE		3
+#define PNPIPE_ENABLE		4
+#define PNPIPE_INITSTATE	5
 
 #define PNADDR_ANY		0
 #define PNADDR_BROADCAST	0xFC
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index f17fd84..3109563 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -167,6 +167,12 @@ static int pipe_handler_send_created_ind(struct sock *sk)
 				data, 4, GFP_ATOMIC);
 }
 
+static int pipe_handler_send_enabled_ind(struct sock *sk)
+{
+	return pep_indicate(sk, PNS_PIPE_ENABLED_IND, 0 /* sub-blocks */,
+				NULL, 0, GFP_ATOMIC);
+}
+
 static int pep_accept_conn(struct sock *sk, struct sk_buff *skb)
 {
 	static const u8 data[20] = {
@@ -533,6 +539,17 @@ static int pep_connresp_rcv(struct sock *sk, struct sk_buff *skb)
 	return pipe_handler_send_created_ind(sk);
 }
 
+static int pep_enableresp_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct pnpipehdr *hdr = pnp_hdr(skb);
+
+	if (hdr->error_code != PN_PIPE_NO_ERROR)
+		return -ECONNREFUSED;
+
+	return pipe_handler_send_enabled_ind(sk);
+}
+
+
 /* Queue an skb to an actively connected sock.
  * Socket lock must be held. */
 static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb)
@@ -578,6 +595,28 @@ static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb)
 			sk->sk_state = TCP_CLOSE_WAIT;
 			break;
 		}
+		if (pn->init_enable == PN_PIPE_DISABLE)
+			sk->sk_state = TCP_SYN_RECV;
+		else {
+			sk->sk_state = TCP_ESTABLISHED;
+
+			if (!pn_flow_safe(pn->tx_fc)) {
+				atomic_set(&pn->tx_credits, 1);
+				sk->sk_write_space(sk);
+			}
+			pipe_grant_credits(sk, GFP_ATOMIC);
+
+		}
+		break;
+
+	case PNS_PEP_ENABLE_RESP:
+		if (sk->sk_state != TCP_SYN_SENT)
+			break;
+
+		if (pep_enableresp_rcv(sk, skb)) {
+			sk->sk_state = TCP_CLOSE_WAIT;
+			break;
+		}
 
 		sk->sk_state = TCP_ESTABLISHED;
 		if (!pn_flow_safe(pn->tx_fc)) {
@@ -863,9 +902,27 @@ static int pep_sock_connect(struct sock *sk, struct sockaddr *addr, int len)
 	int err;
 	u8 data[4] = { 0 /* sub-blocks */, PAD, PAD, PAD };
 
-	pn->pipe_handle = 1; /* anything but INVALID_HANDLE */
+	if (pn->pipe_handle == PN_PIPE_INVALID_HANDLE)
+		pn->pipe_handle = 1; /* anything but INVALID_HANDLE */
+
 	err = pipe_handler_request(sk, PNS_PEP_CONNECT_REQ,
-					PN_PIPE_ENABLE, data, 4);
+					pn->init_enable, data, 4);
+
+	if (err) {
+		pn->pipe_handle = PN_PIPE_INVALID_HANDLE;
+		return err;
+	}
+	sk->sk_state = TCP_SYN_SENT;
+	return 0;
+}
+
+static int pep_sock_enable(struct sock *sk, struct sockaddr *addr, int len)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	int err;
+
+	err = pipe_handler_request(sk, PNS_PEP_ENABLE_REQ, PAD,
+				NULL, 0);
 	if (err) {
 		pn->pipe_handle = PN_PIPE_INVALID_HANDLE;
 		return err;
@@ -959,6 +1016,24 @@ static int pep_setsockopt(struct sock *sk, int level, int optname,
 		}
 		goto out_norel;
 
+	case PNPIPE_HANDLE:
+		if (val)
+			pn->pipe_handle = val;
+		else
+			err = -EINVAL;
+		break;
+
+	case PNPIPE_ENABLE:
+		err = pep_sock_enable(sk, NULL, 0);
+		break;
+
+	case PNPIPE_INITSTATE:
+		if ((val == PN_PIPE_DISABLE) || (val == PN_PIPE_ENABLE))
+			pn->init_enable = val;
+		else
+			err = -EINVAL;
+		break;
+
 	default:
 		err = -ENOPROTOOPT;
 	}
@@ -994,6 +1069,13 @@ static int pep_getsockopt(struct sock *sk, int level, int optname,
 			return -EINVAL;
 		break;
 
+	case PNPIPE_ENABLE:
+		if (sk->sk_state != TCP_ESTABLISHED)
+			return -EINVAL;
+		else
+			val = 1;
+		break;
+
 	default:
 		return -ENOPROTOOPT;
 	}
-- 
1.7.4.3

^ permalink raw reply related

* [PATCH V4 net-next] neigh: new unresolved queue limits
From: Eric Dumazet @ 2011-11-09 11:14 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.

$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms

--- 192.168.20.108 ping statistics ---
2 packets transmitted, 1 received, 50% packet loss, time 1001ms
rtt min/avg/max/mdev = 0.322/0.322/0.322/0.000 ms

Increasing unres_qlen can be dangerous, since an attacker might try to
fill many queues with many packets and consume all memory.

Switch to a bytes limit (limiting queued skbs truesize), and allow a
default limit of 64Kbytes per unresolved neighbour. This new limit seems
big, but as a single packet can consume 64Kbytes, the old limit of 3
packets allowed up to about 192Kbytes per neighbour, so the new limit
actually reduces the memory window offered to attackers.

unres_qlen is kept for compatibility, but internally converted to/from
bytes limit.

# cd /proc/sys/net/ipv4/neigh/default/
# grep . unres_qlen*
unres_qlen:31
unres_qlen_bytes:65536
# echo 10 >unres_qlen
# grep . unres_qlen*
unres_qlen:10
unres_qlen_bytes:21540
# echo 30000 >unres_qlen_bytes
# grep . unres_qlen*
unres_qlen:14
unres_qlen_bytes:30000

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 Documentation/networking/ip-sysctl.txt |   10 +
 include/linux/neighbour.h              |    1 
 include/net/neighbour.h                |    3 
 net/atm/clip.c                         |    2 
 net/core/neighbour.c                   |  163 +++++++++++++++--------
 net/decnet/dn_neigh.c                  |    2 
 net/ipv4/arp.c                         |    2 
 net/ipv6/ndisc.c                       |    2 
 8 files changed, 129 insertions(+), 56 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index f049a1c..b886706 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -31,6 +31,16 @@ neigh/default/gc_thresh3 - INTEGER
 	when using large numbers of interfaces and when communicating
 	with large numbers of directly-connected peers.
 
+neigh/default/unres_qlen_bytes - INTEGER
+	The maximum number of bytes which may be used by packets
+	queued for each unresolved address by other network layers.
+	(added in linux 3.3)
+
+neigh/default/unres_qlen - INTEGER
+	The maximum number of packets which may be queued for each
+	unresolved address by other network layers.
+	(deprecated in linux 3.3) : use unres_qlen_bytes instead.
+
 mtu_expires - INTEGER
 	Time, in seconds, that cached PMTU information is kept.
 
diff --git a/include/linux/neighbour.h b/include/linux/neighbour.h
index a7003b7..b188f68 100644
--- a/include/linux/neighbour.h
+++ b/include/linux/neighbour.h
@@ -116,6 +116,7 @@ enum {
 	NDTPA_PROXY_DELAY,		/* u64, msecs */
 	NDTPA_PROXY_QLEN,		/* u32 */
 	NDTPA_LOCKTIME,			/* u64, msecs */
+	NDTPA_QUEUE_LENBYTES,		/* u32 */
 	__NDTPA_MAX
 };
 #define NDTPA_MAX (__NDTPA_MAX - 1)
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 2720884..7ae5acf 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -59,7 +59,7 @@ struct neigh_parms {
 	int	reachable_time;
 	int	delay_probe_time;
 
-	int	queue_len;
+	int	queue_len_bytes;
 	int	ucast_probes;
 	int	app_probes;
 	int	mcast_probes;
@@ -99,6 +99,7 @@ struct neighbour {
 	rwlock_t		lock;
 	atomic_t		refcnt;
 	struct sk_buff_head	arp_queue;
+	unsigned int		arp_queue_len_bytes;
 	struct timer_list	timer;
 	unsigned long		used;
 	atomic_t		probes;
diff --git a/net/atm/clip.c b/net/atm/clip.c
index 8523940..32c41b8 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -329,7 +329,7 @@ static struct neigh_table clip_tbl = {
 		.gc_staletime 		= 60 * HZ,
 		.reachable_time 	= 30 * HZ,
 		.delay_probe_time 	= 5 * HZ,
-		.queue_len 		= 3,
+		.queue_len_bytes 	= 64 * 1024,
 		.ucast_probes 		= 3,
 		.mcast_probes 		= 3,
 		.anycast_delay 		= 1 * HZ,
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 039d51e..173d9b6 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -238,6 +238,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
 				   it to safe state.
 				 */
 				skb_queue_purge(&n->arp_queue);
+				n->arp_queue_len_bytes = 0;
 				n->output = neigh_blackhole;
 				if (n->nud_state & NUD_VALID)
 					n->nud_state = NUD_NOARP;
@@ -702,6 +703,7 @@ void neigh_destroy(struct neighbour *neigh)
 		printk(KERN_WARNING "Impossible event.\n");
 
 	skb_queue_purge(&neigh->arp_queue);
+	neigh->arp_queue_len_bytes = 0;
 
 	dev_put(neigh->dev);
 	neigh_parms_put(neigh->parms);
@@ -842,6 +844,7 @@ static void neigh_invalidate(struct neighbour *neigh)
 		write_lock(&neigh->lock);
 	}
 	skb_queue_purge(&neigh->arp_queue);
+	neigh->arp_queue_len_bytes = 0;
 }
 
 static void neigh_probe(struct neighbour *neigh)
@@ -980,15 +983,20 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
 
 	if (neigh->nud_state == NUD_INCOMPLETE) {
 		if (skb) {
-			if (skb_queue_len(&neigh->arp_queue) >=
-			    neigh->parms->queue_len) {
+			while (neigh->arp_queue_len_bytes + skb->truesize >
+			       neigh->parms->queue_len_bytes) {
 				struct sk_buff *buff;
+
 				buff = __skb_dequeue(&neigh->arp_queue);
+				if (!buff)
+					break;
+				neigh->arp_queue_len_bytes -= buff->truesize;
 				kfree_skb(buff);
 				NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
 			}
 			skb_dst_force(skb);
 			__skb_queue_tail(&neigh->arp_queue, skb);
+			neigh->arp_queue_len_bytes += skb->truesize;
 		}
 		rc = 1;
 	}
@@ -1175,6 +1183,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
 			write_lock_bh(&neigh->lock);
 		}
 		skb_queue_purge(&neigh->arp_queue);
+		neigh->arp_queue_len_bytes = 0;
 	}
 out:
 	if (update_isrouter) {
@@ -1747,7 +1756,11 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
 		NLA_PUT_U32(skb, NDTPA_IFINDEX, parms->dev->ifindex);
 
 	NLA_PUT_U32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt));
-	NLA_PUT_U32(skb, NDTPA_QUEUE_LEN, parms->queue_len);
+	NLA_PUT_U32(skb, NDTPA_QUEUE_LENBYTES, parms->queue_len_bytes);
+	/* approximate value for deprecated QUEUE_LEN (in packets) */
+	NLA_PUT_U32(skb, NDTPA_QUEUE_LEN,
+		    DIV_ROUND_UP(parms->queue_len_bytes,
+				 SKB_TRUESIZE(ETH_FRAME_LEN)));
 	NLA_PUT_U32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen);
 	NLA_PUT_U32(skb, NDTPA_APP_PROBES, parms->app_probes);
 	NLA_PUT_U32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes);
@@ -1897,6 +1910,7 @@ static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = {
 static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = {
 	[NDTPA_IFINDEX]			= { .type = NLA_U32 },
 	[NDTPA_QUEUE_LEN]		= { .type = NLA_U32 },
+	[NDTPA_QUEUE_LENBYTES]		= { .type = NLA_U32 },
 	[NDTPA_PROXY_QLEN]		= { .type = NLA_U32 },
 	[NDTPA_APP_PROBES]		= { .type = NLA_U32 },
 	[NDTPA_UCAST_PROBES]		= { .type = NLA_U32 },
@@ -1974,7 +1988,11 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 
 			switch (i) {
 			case NDTPA_QUEUE_LEN:
-				p->queue_len = nla_get_u32(tbp[i]);
+				p->queue_len_bytes = nla_get_u32(tbp[i]) *
+						     SKB_TRUESIZE(ETH_FRAME_LEN);
+				break;
+			case NDTPA_QUEUE_LENBYTES:
+				p->queue_len_bytes = nla_get_u32(tbp[i]);
 				break;
 			case NDTPA_PROXY_QLEN:
 				p->proxy_qlen = nla_get_u32(tbp[i]);
@@ -2635,117 +2653,158 @@ EXPORT_SYMBOL(neigh_app_ns);
 
 #ifdef CONFIG_SYSCTL
 
-#define NEIGH_VARS_MAX 19
+static int proc_unres_qlen(ctl_table *ctl, int write, void __user *buffer,
+			   size_t *lenp, loff_t *ppos)
+{
+	int size, ret;
+	ctl_table tmp = *ctl;
+
+	tmp.data = &size;
+	size = DIV_ROUND_UP(*(int *)ctl->data, SKB_TRUESIZE(ETH_FRAME_LEN));
+	ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+	if (write && !ret)
+		*(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN);
+	return ret;
+}
+
+enum {
+	NEIGH_VAR_MCAST_PROBE,
+	NEIGH_VAR_UCAST_PROBE,
+	NEIGH_VAR_APP_PROBE,
+	NEIGH_VAR_RETRANS_TIME,
+	NEIGH_VAR_BASE_REACHABLE_TIME,
+	NEIGH_VAR_DELAY_PROBE_TIME,
+	NEIGH_VAR_GC_STALETIME,
+	NEIGH_VAR_QUEUE_LEN,
+	NEIGH_VAR_QUEUE_LEN_BYTES,
+	NEIGH_VAR_PROXY_QLEN,
+	NEIGH_VAR_ANYCAST_DELAY,
+	NEIGH_VAR_PROXY_DELAY,
+	NEIGH_VAR_LOCKTIME,
+	NEIGH_VAR_RETRANS_TIME_MS,
+	NEIGH_VAR_BASE_REACHABLE_TIME_MS,
+	NEIGH_VAR_GC_INTERVAL,
+	NEIGH_VAR_GC_THRESH1,
+	NEIGH_VAR_GC_THRESH2,
+	NEIGH_VAR_GC_THRESH3,
+	NEIGH_VAR_MAX
+};
 
 static struct neigh_sysctl_table {
 	struct ctl_table_header *sysctl_header;
-	struct ctl_table neigh_vars[NEIGH_VARS_MAX];
+	struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1];
 	char *dev_name;
 } neigh_sysctl_template __read_mostly = {
 	.neigh_vars = {
-		{
+		[NEIGH_VAR_MCAST_PROBE] = {
 			.procname	= "mcast_solicit",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
-		{
+		[NEIGH_VAR_UCAST_PROBE] = {
 			.procname	= "ucast_solicit",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
-		{
+		[NEIGH_VAR_APP_PROBE] = {
 			.procname	= "app_solicit",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
-		{
+		[NEIGH_VAR_RETRANS_TIME] = {
 			.procname	= "retrans_time",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec_userhz_jiffies,
 		},
-		{
+		[NEIGH_VAR_BASE_REACHABLE_TIME] = {
 			.procname	= "base_reachable_time",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec_jiffies,
 		},
-		{
+		[NEIGH_VAR_DELAY_PROBE_TIME] = {
 			.procname	= "delay_first_probe_time",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec_jiffies,
 		},
-		{
+		[NEIGH_VAR_GC_STALETIME] = {
 			.procname	= "gc_stale_time",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec_jiffies,
 		},
-		{
+		[NEIGH_VAR_QUEUE_LEN] = {
 			.procname	= "unres_qlen",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
+			.proc_handler	= proc_unres_qlen,
+		},
+		[NEIGH_VAR_QUEUE_LEN_BYTES] = {
+			.procname	= "unres_qlen_bytes",
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
-		{
+		[NEIGH_VAR_PROXY_QLEN] = {
 			.procname	= "proxy_qlen",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
-		{
+		[NEIGH_VAR_ANYCAST_DELAY] = {
 			.procname	= "anycast_delay",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec_userhz_jiffies,
 		},
-		{
+		[NEIGH_VAR_PROXY_DELAY] = {
 			.procname	= "proxy_delay",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec_userhz_jiffies,
 		},
-		{
+		[NEIGH_VAR_LOCKTIME] = {
 			.procname	= "locktime",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec_userhz_jiffies,
 		},
-		{
+		[NEIGH_VAR_RETRANS_TIME_MS] = {
 			.procname	= "retrans_time_ms",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec_ms_jiffies,
 		},
-		{
+		[NEIGH_VAR_BASE_REACHABLE_TIME_MS] = {
 			.procname	= "base_reachable_time_ms",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec_ms_jiffies,
 		},
-		{
+		[NEIGH_VAR_GC_INTERVAL] = {
 			.procname	= "gc_interval",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec_jiffies,
 		},
-		{
+		[NEIGH_VAR_GC_THRESH1] = {
 			.procname	= "gc_thresh1",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
-		{
+		[NEIGH_VAR_GC_THRESH2] = {
 			.procname	= "gc_thresh2",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
-		{
+		[NEIGH_VAR_GC_THRESH3] = {
 			.procname	= "gc_thresh3",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
@@ -2778,47 +2837,49 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
 	if (!t)
 		goto err;
 
-	t->neigh_vars[0].data  = &p->mcast_probes;
-	t->neigh_vars[1].data  = &p->ucast_probes;
-	t->neigh_vars[2].data  = &p->app_probes;
-	t->neigh_vars[3].data  = &p->retrans_time;
-	t->neigh_vars[4].data  = &p->base_reachable_time;
-	t->neigh_vars[5].data  = &p->delay_probe_time;
-	t->neigh_vars[6].data  = &p->gc_staletime;
-	t->neigh_vars[7].data  = &p->queue_len;
-	t->neigh_vars[8].data  = &p->proxy_qlen;
-	t->neigh_vars[9].data  = &p->anycast_delay;
-	t->neigh_vars[10].data = &p->proxy_delay;
-	t->neigh_vars[11].data = &p->locktime;
-	t->neigh_vars[12].data  = &p->retrans_time;
-	t->neigh_vars[13].data  = &p->base_reachable_time;
+	t->neigh_vars[NEIGH_VAR_MCAST_PROBE].data  = &p->mcast_probes;
+	t->neigh_vars[NEIGH_VAR_UCAST_PROBE].data  = &p->ucast_probes;
+	t->neigh_vars[NEIGH_VAR_APP_PROBE].data  = &p->app_probes;
+	t->neigh_vars[NEIGH_VAR_RETRANS_TIME].data  = &p->retrans_time;
+	t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].data  = &p->base_reachable_time;
+	t->neigh_vars[NEIGH_VAR_DELAY_PROBE_TIME].data  = &p->delay_probe_time;
+	t->neigh_vars[NEIGH_VAR_GC_STALETIME].data  = &p->gc_staletime;
+	t->neigh_vars[NEIGH_VAR_QUEUE_LEN].data  = &p->queue_len_bytes;
+	t->neigh_vars[NEIGH_VAR_QUEUE_LEN_BYTES].data  = &p->queue_len_bytes;
+	t->neigh_vars[NEIGH_VAR_PROXY_QLEN].data  = &p->proxy_qlen;
+	t->neigh_vars[NEIGH_VAR_ANYCAST_DELAY].data  = &p->anycast_delay;
+	t->neigh_vars[NEIGH_VAR_PROXY_DELAY].data = &p->proxy_delay;
+	t->neigh_vars[NEIGH_VAR_LOCKTIME].data = &p->locktime;
+	t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].data  = &p->retrans_time;
+	t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].data  = &p->base_reachable_time;
 
 	if (dev) {
 		dev_name_source = dev->name;
 		/* Terminate the table early */
-		memset(&t->neigh_vars[14], 0, sizeof(t->neigh_vars[14]));
+		memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0,
+		       sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL]));
 	} else {
 		dev_name_source = neigh_path[NEIGH_CTL_PATH_DEV].procname;
-		t->neigh_vars[14].data = (int *)(p + 1);
-		t->neigh_vars[15].data = (int *)(p + 1) + 1;
-		t->neigh_vars[16].data = (int *)(p + 1) + 2;
-		t->neigh_vars[17].data = (int *)(p + 1) + 3;
+		t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = (int *)(p + 1);
+		t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = (int *)(p + 1) + 1;
+		t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = (int *)(p + 1) + 2;
+		t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = (int *)(p + 1) + 3;
 	}
 
 
 	if (handler) {
 		/* RetransTime */
-		t->neigh_vars[3].proc_handler = handler;
-		t->neigh_vars[3].extra1 = dev;
+		t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler;
+		t->neigh_vars[NEIGH_VAR_RETRANS_TIME].extra1 = dev;
 		/* ReachableTime */
-		t->neigh_vars[4].proc_handler = handler;
-		t->neigh_vars[4].extra1 = dev;
+		t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler;
+		t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].extra1 = dev;
 		/* RetransTime (in milliseconds)*/
-		t->neigh_vars[12].proc_handler = handler;
-		t->neigh_vars[12].extra1 = dev;
+		t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler;
+		t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].extra1 = dev;
 		/* ReachableTime (in milliseconds) */
-		t->neigh_vars[13].proc_handler = handler;
-		t->neigh_vars[13].extra1 = dev;
+		t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler;
+		t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].extra1 = dev;
 	}
 
 	t->dev_name = kstrdup(dev_name_source, GFP_KERNEL);
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index 7f0eb08..fb8b096 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -107,7 +107,7 @@ struct neigh_table dn_neigh_table = {
 		.gc_staletime =	60 * HZ,
 		.reachable_time =		30 * HZ,
 		.delay_probe_time =	5 * HZ,
-		.queue_len =		3,
+		.queue_len_bytes =	64*1024,
 		.ucast_probes =	0,
 		.app_probes =		0,
 		.mcast_probes =	0,
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 96a164a..d732827 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -177,7 +177,7 @@ struct neigh_table arp_tbl = {
 		.gc_staletime		= 60 * HZ,
 		.reachable_time		= 30 * HZ,
 		.delay_probe_time	= 5 * HZ,
-		.queue_len		= 3,
+		.queue_len_bytes	= 64*1024,
 		.ucast_probes		= 3,
 		.mcast_probes		= 3,
 		.anycast_delay		= 1 * HZ,
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 44e5b7f..4a20982 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -141,7 +141,7 @@ struct neigh_table nd_tbl = {
 		.gc_staletime		= 60 * HZ,
 		.reachable_time		= ND_REACHABLE_TIME,
 		.delay_probe_time	= 5 * HZ,
-		.queue_len		= 3,
+		.queue_len_bytes	= 64*1024,
 		.ucast_probes		= 3,
 		.mcast_probes		= 3,
 		.anycast_delay		= 1 * HZ,

^ permalink raw reply related

* Re: [PATCH V3 net-next] neigh: new unresolved queue limits
From: Eric Dumazet @ 2011-11-09 11:10 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <1320836664.2315.23.camel@edumazet-HP-Compaq-6005-Pro-SFF-PC>

Le mercredi 09 novembre 2011 à 12:04 +0100, Eric Dumazet a écrit :
> unres_qlen is the number of frames we are able to queue per unresolved
> neighbour. Its default value (3) was never changed and is responsible
> for strange drops, especially if IP fragments are used, or multiple
> sessions start in parallel. Even a single tcp flow can hit this limit.
> 

> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Oh well, I forgot one bit (the nl_ntbl_parm_policy entry):

 [NDTPA_QUEUE_LENBYTES]          = { .type = NLA_U32 },

I am sending a V4 ASAP

^ permalink raw reply

* [PATCH V3 net-next] neigh: new unresolved queue limits
From: Eric Dumazet @ 2011-11-09 11:04 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <1320825325.26025.51.camel@edumazet-laptop>

unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.

$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms

--- 192.168.20.108 ping statistics ---
2 packets transmitted, 1 received, 50% packet loss, time 1001ms
rtt min/avg/max/mdev = 0.322/0.322/0.322/0.000 ms

Increasing unres_qlen can be dangerous, since an attacker might try to
fill many queues with many packets and consume all memory.

Switch to a bytes limit (limiting queued skbs truesize), and allow a
default limit of 64Kbytes per unresolved neighbour. This new limit seems
big, but as a single packet can consume 64Kbytes, the old limit of 3
packets allowed up to about 192Kbytes per neighbour, so the new limit
actually reduces the memory window offered to attackers.

unres_qlen is kept for compatibility, but internally converted to/from
bytes limit.

# cd /proc/sys/net/ipv4/neigh/default/
# grep . unres_qlen*
unres_qlen:31
unres_qlen_bytes:65536
# echo 10 >unres_qlen
# grep . unres_qlen*
unres_qlen:10
unres_qlen_bytes:21540
# echo 30000 >unres_qlen_bytes
# grep . unres_qlen*
unres_qlen:14
unres_qlen_bytes:30000

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 Documentation/networking/ip-sysctl.txt |   10 +
 include/linux/neighbour.h              |    1 
 include/net/neighbour.h                |    3 
 net/atm/clip.c                         |    2 
 net/core/neighbour.c                   |  162 +++++++++++++++--------
 net/decnet/dn_neigh.c                  |    2 
 net/ipv4/arp.c                         |    2 
 net/ipv6/ndisc.c                       |    2 
 8 files changed, 128 insertions(+), 56 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index f049a1c..b886706 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -31,6 +31,16 @@ neigh/default/gc_thresh3 - INTEGER
 	when using large numbers of interfaces and when communicating
 	with large numbers of directly-connected peers.
 
+neigh/default/unres_qlen_bytes - INTEGER
+	The maximum number of bytes which may be used by packets
+	queued for each unresolved address by other network layers.
+	(added in linux 3.3)
+
+neigh/default/unres_qlen - INTEGER
+	The maximum number of packets which may be queued for each
+	unresolved address by other network layers.
+	(deprecated in linux 3.3) : use unres_qlen_bytes instead.
+
 mtu_expires - INTEGER
 	Time, in seconds, that cached PMTU information is kept.
 
diff --git a/include/linux/neighbour.h b/include/linux/neighbour.h
index a7003b7..b188f68 100644
--- a/include/linux/neighbour.h
+++ b/include/linux/neighbour.h
@@ -116,6 +116,7 @@ enum {
 	NDTPA_PROXY_DELAY,		/* u64, msecs */
 	NDTPA_PROXY_QLEN,		/* u32 */
 	NDTPA_LOCKTIME,			/* u64, msecs */
+	NDTPA_QUEUE_LENBYTES,		/* u32 */
 	__NDTPA_MAX
 };
 #define NDTPA_MAX (__NDTPA_MAX - 1)
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 2720884..7ae5acf 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -59,7 +59,7 @@ struct neigh_parms {
 	int	reachable_time;
 	int	delay_probe_time;
 
-	int	queue_len;
+	int	queue_len_bytes;
 	int	ucast_probes;
 	int	app_probes;
 	int	mcast_probes;
@@ -99,6 +99,7 @@ struct neighbour {
 	rwlock_t		lock;
 	atomic_t		refcnt;
 	struct sk_buff_head	arp_queue;
+	unsigned int		arp_queue_len_bytes;
 	struct timer_list	timer;
 	unsigned long		used;
 	atomic_t		probes;
diff --git a/net/atm/clip.c b/net/atm/clip.c
index 8523940..32c41b8 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -329,7 +329,7 @@ static struct neigh_table clip_tbl = {
 		.gc_staletime 		= 60 * HZ,
 		.reachable_time 	= 30 * HZ,
 		.delay_probe_time 	= 5 * HZ,
-		.queue_len 		= 3,
+		.queue_len_bytes 	= 64 * 1024,
 		.ucast_probes 		= 3,
 		.mcast_probes 		= 3,
 		.anycast_delay 		= 1 * HZ,
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 039d51e..05e2f38 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -238,6 +238,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
 				   it to safe state.
 				 */
 				skb_queue_purge(&n->arp_queue);
+				n->arp_queue_len_bytes = 0;
 				n->output = neigh_blackhole;
 				if (n->nud_state & NUD_VALID)
 					n->nud_state = NUD_NOARP;
@@ -702,6 +703,7 @@ void neigh_destroy(struct neighbour *neigh)
 		printk(KERN_WARNING "Impossible event.\n");
 
 	skb_queue_purge(&neigh->arp_queue);
+	neigh->arp_queue_len_bytes = 0;
 
 	dev_put(neigh->dev);
 	neigh_parms_put(neigh->parms);
@@ -842,6 +844,7 @@ static void neigh_invalidate(struct neighbour *neigh)
 		write_lock(&neigh->lock);
 	}
 	skb_queue_purge(&neigh->arp_queue);
+	neigh->arp_queue_len_bytes = 0;
 }
 
 static void neigh_probe(struct neighbour *neigh)
@@ -980,15 +983,20 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
 
 	if (neigh->nud_state == NUD_INCOMPLETE) {
 		if (skb) {
-			if (skb_queue_len(&neigh->arp_queue) >=
-			    neigh->parms->queue_len) {
+			while (neigh->arp_queue_len_bytes + skb->truesize >
+			       neigh->parms->queue_len_bytes) {
 				struct sk_buff *buff;
+
 				buff = __skb_dequeue(&neigh->arp_queue);
+				if (!buff)
+					break;
+				neigh->arp_queue_len_bytes -= buff->truesize;
 				kfree_skb(buff);
 				NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
 			}
 			skb_dst_force(skb);
 			__skb_queue_tail(&neigh->arp_queue, skb);
+			neigh->arp_queue_len_bytes += skb->truesize;
 		}
 		rc = 1;
 	}
@@ -1175,6 +1183,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
 			write_lock_bh(&neigh->lock);
 		}
 		skb_queue_purge(&neigh->arp_queue);
+		neigh->arp_queue_len_bytes = 0;
 	}
 out:
 	if (update_isrouter) {
@@ -1747,7 +1756,11 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
 		NLA_PUT_U32(skb, NDTPA_IFINDEX, parms->dev->ifindex);
 
 	NLA_PUT_U32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt));
-	NLA_PUT_U32(skb, NDTPA_QUEUE_LEN, parms->queue_len);
+	NLA_PUT_U32(skb, NDTPA_QUEUE_LENBYTES, parms->queue_len_bytes);
+	/* approximate value for deprecated QUEUE_LEN (in packets) */
+	NLA_PUT_U32(skb, NDTPA_QUEUE_LEN,
+		    DIV_ROUND_UP(parms->queue_len_bytes,
+				 SKB_TRUESIZE(ETH_FRAME_LEN)));
 	NLA_PUT_U32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen);
 	NLA_PUT_U32(skb, NDTPA_APP_PROBES, parms->app_probes);
 	NLA_PUT_U32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes);
@@ -1974,7 +1987,11 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 
 			switch (i) {
 			case NDTPA_QUEUE_LEN:
-				p->queue_len = nla_get_u32(tbp[i]);
+				p->queue_len_bytes = nla_get_u32(tbp[i]) *
+						     SKB_TRUESIZE(ETH_FRAME_LEN);
+				break;
+			case NDTPA_QUEUE_LENBYTES:
+				p->queue_len_bytes = nla_get_u32(tbp[i]);
 				break;
 			case NDTPA_PROXY_QLEN:
 				p->proxy_qlen = nla_get_u32(tbp[i]);
@@ -2635,117 +2652,158 @@ EXPORT_SYMBOL(neigh_app_ns);
 
 #ifdef CONFIG_SYSCTL
 
-#define NEIGH_VARS_MAX 19
+static int proc_unres_qlen(ctl_table *ctl, int write, void __user *buffer,
+			   size_t *lenp, loff_t *ppos)
+{
+	int size, ret;
+	ctl_table tmp = *ctl;
+
+	tmp.data = &size;
+	size = DIV_ROUND_UP(*(int *)ctl->data, SKB_TRUESIZE(ETH_FRAME_LEN));
+	ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+	if (write && !ret)
+		*(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN);
+	return ret;
+}
+
+enum {
+	NEIGH_VAR_MCAST_PROBE,
+	NEIGH_VAR_UCAST_PROBE,
+	NEIGH_VAR_APP_PROBE,
+	NEIGH_VAR_RETRANS_TIME,
+	NEIGH_VAR_BASE_REACHABLE_TIME,
+	NEIGH_VAR_DELAY_PROBE_TIME,
+	NEIGH_VAR_GC_STALETIME,
+	NEIGH_VAR_QUEUE_LEN,
+	NEIGH_VAR_QUEUE_LEN_BYTES,
+	NEIGH_VAR_PROXY_QLEN,
+	NEIGH_VAR_ANYCAST_DELAY,
+	NEIGH_VAR_PROXY_DELAY,
+	NEIGH_VAR_LOCKTIME,
+	NEIGH_VAR_RETRANS_TIME_MS,
+	NEIGH_VAR_BASE_REACHABLE_TIME_MS,
+	NEIGH_VAR_GC_INTERVAL,
+	NEIGH_VAR_GC_THRESH1,
+	NEIGH_VAR_GC_THRESH2,
+	NEIGH_VAR_GC_THRESH3,
+	NEIGH_VAR_MAX
+};
 
 static struct neigh_sysctl_table {
 	struct ctl_table_header *sysctl_header;
-	struct ctl_table neigh_vars[NEIGH_VARS_MAX];
+	struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1];
 	char *dev_name;
 } neigh_sysctl_template __read_mostly = {
 	.neigh_vars = {
-		{
+		[NEIGH_VAR_MCAST_PROBE] = {
 			.procname	= "mcast_solicit",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
-		{
+		[NEIGH_VAR_UCAST_PROBE] = {
 			.procname	= "ucast_solicit",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
-		{
+		[NEIGH_VAR_APP_PROBE] = {
 			.procname	= "app_solicit",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
-		{
+		[NEIGH_VAR_RETRANS_TIME] = {
 			.procname	= "retrans_time",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec_userhz_jiffies,
 		},
-		{
+		[NEIGH_VAR_BASE_REACHABLE_TIME] = {
 			.procname	= "base_reachable_time",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec_jiffies,
 		},
-		{
+		[NEIGH_VAR_DELAY_PROBE_TIME] = {
 			.procname	= "delay_first_probe_time",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec_jiffies,
 		},
-		{
+		[NEIGH_VAR_GC_STALETIME] = {
 			.procname	= "gc_stale_time",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec_jiffies,
 		},
-		{
+		[NEIGH_VAR_QUEUE_LEN] = {
 			.procname	= "unres_qlen",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
+			.proc_handler	= proc_unres_qlen,
+		},
+		[NEIGH_VAR_QUEUE_LEN_BYTES] = {
+			.procname	= "unres_qlen_bytes",
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
-		{
+		[NEIGH_VAR_PROXY_QLEN] = {
 			.procname	= "proxy_qlen",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
-		{
+		[NEIGH_VAR_ANYCAST_DELAY] = {
 			.procname	= "anycast_delay",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec_userhz_jiffies,
 		},
-		{
+		[NEIGH_VAR_PROXY_DELAY] = {
 			.procname	= "proxy_delay",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec_userhz_jiffies,
 		},
-		{
+		[NEIGH_VAR_LOCKTIME] = {
 			.procname	= "locktime",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec_userhz_jiffies,
 		},
-		{
+		[NEIGH_VAR_RETRANS_TIME_MS] = {
 			.procname	= "retrans_time_ms",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec_ms_jiffies,
 		},
-		{
+		[NEIGH_VAR_BASE_REACHABLE_TIME_MS] = {
 			.procname	= "base_reachable_time_ms",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec_ms_jiffies,
 		},
-		{
+		[NEIGH_VAR_GC_INTERVAL] = {
 			.procname	= "gc_interval",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec_jiffies,
 		},
-		{
+		[NEIGH_VAR_GC_THRESH1] = {
 			.procname	= "gc_thresh1",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
-		{
+		[NEIGH_VAR_GC_THRESH2] = {
 			.procname	= "gc_thresh2",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
-		{
+		[NEIGH_VAR_GC_THRESH3] = {
 			.procname	= "gc_thresh3",
 			.maxlen		= sizeof(int),
 			.mode		= 0644,
@@ -2778,47 +2836,49 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
 	if (!t)
 		goto err;
 
-	t->neigh_vars[0].data  = &p->mcast_probes;
-	t->neigh_vars[1].data  = &p->ucast_probes;
-	t->neigh_vars[2].data  = &p->app_probes;
-	t->neigh_vars[3].data  = &p->retrans_time;
-	t->neigh_vars[4].data  = &p->base_reachable_time;
-	t->neigh_vars[5].data  = &p->delay_probe_time;
-	t->neigh_vars[6].data  = &p->gc_staletime;
-	t->neigh_vars[7].data  = &p->queue_len;
-	t->neigh_vars[8].data  = &p->proxy_qlen;
-	t->neigh_vars[9].data  = &p->anycast_delay;
-	t->neigh_vars[10].data = &p->proxy_delay;
-	t->neigh_vars[11].data = &p->locktime;
-	t->neigh_vars[12].data  = &p->retrans_time;
-	t->neigh_vars[13].data  = &p->base_reachable_time;
+	t->neigh_vars[NEIGH_VAR_MCAST_PROBE].data  = &p->mcast_probes;
+	t->neigh_vars[NEIGH_VAR_UCAST_PROBE].data  = &p->ucast_probes;
+	t->neigh_vars[NEIGH_VAR_APP_PROBE].data  = &p->app_probes;
+	t->neigh_vars[NEIGH_VAR_RETRANS_TIME].data  = &p->retrans_time;
+	t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].data  = &p->base_reachable_time;
+	t->neigh_vars[NEIGH_VAR_DELAY_PROBE_TIME].data  = &p->delay_probe_time;
+	t->neigh_vars[NEIGH_VAR_GC_STALETIME].data  = &p->gc_staletime;
+	t->neigh_vars[NEIGH_VAR_QUEUE_LEN].data  = &p->queue_len_bytes;
+	t->neigh_vars[NEIGH_VAR_QUEUE_LEN_BYTES].data  = &p->queue_len_bytes;
+	t->neigh_vars[NEIGH_VAR_PROXY_QLEN].data  = &p->proxy_qlen;
+	t->neigh_vars[NEIGH_VAR_ANYCAST_DELAY].data  = &p->anycast_delay;
+	t->neigh_vars[NEIGH_VAR_PROXY_DELAY].data = &p->proxy_delay;
+	t->neigh_vars[NEIGH_VAR_LOCKTIME].data = &p->locktime;
+	t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].data  = &p->retrans_time;
+	t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].data  = &p->base_reachable_time;
 
 	if (dev) {
 		dev_name_source = dev->name;
 		/* Terminate the table early */
-		memset(&t->neigh_vars[14], 0, sizeof(t->neigh_vars[14]));
+		memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0,
+		       sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL]));
 	} else {
 		dev_name_source = neigh_path[NEIGH_CTL_PATH_DEV].procname;
-		t->neigh_vars[14].data = (int *)(p + 1);
-		t->neigh_vars[15].data = (int *)(p + 1) + 1;
-		t->neigh_vars[16].data = (int *)(p + 1) + 2;
-		t->neigh_vars[17].data = (int *)(p + 1) + 3;
+		t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = (int *)(p + 1);
+		t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = (int *)(p + 1) + 1;
+		t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = (int *)(p + 1) + 2;
+		t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = (int *)(p + 1) + 3;
 	}
 
 
 	if (handler) {
 		/* RetransTime */
-		t->neigh_vars[3].proc_handler = handler;
-		t->neigh_vars[3].extra1 = dev;
+		t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler;
+		t->neigh_vars[NEIGH_VAR_RETRANS_TIME].extra1 = dev;
 		/* ReachableTime */
-		t->neigh_vars[4].proc_handler = handler;
-		t->neigh_vars[4].extra1 = dev;
+		t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler;
+		t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].extra1 = dev;
 		/* RetransTime (in milliseconds)*/
-		t->neigh_vars[12].proc_handler = handler;
-		t->neigh_vars[12].extra1 = dev;
+		t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler;
+		t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].extra1 = dev;
 		/* ReachableTime (in milliseconds) */
-		t->neigh_vars[13].proc_handler = handler;
-		t->neigh_vars[13].extra1 = dev;
+		t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler;
+		t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].extra1 = dev;
 	}
 
 	t->dev_name = kstrdup(dev_name_source, GFP_KERNEL);
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index 7f0eb08..fb8b096 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -107,7 +107,7 @@ struct neigh_table dn_neigh_table = {
 		.gc_staletime =	60 * HZ,
 		.reachable_time =		30 * HZ,
 		.delay_probe_time =	5 * HZ,
-		.queue_len =		3,
+		.queue_len =		64*1024,
 		.ucast_probes =	0,
 		.app_probes =		0,
 		.mcast_probes =	0,
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 96a164a..d732827 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -177,7 +177,7 @@ struct neigh_table arp_tbl = {
 		.gc_staletime		= 60 * HZ,
 		.reachable_time		= 30 * HZ,
 		.delay_probe_time	= 5 * HZ,
-		.queue_len		= 3,
+		.queue_len_bytes	= 64*1024,
 		.ucast_probes		= 3,
 		.mcast_probes		= 3,
 		.anycast_delay		= 1 * HZ,
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 44e5b7f..4a20982 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -141,7 +141,7 @@ struct neigh_table nd_tbl = {
 		.gc_staletime		= 60 * HZ,
 		.reachable_time		= ND_REACHABLE_TIME,
 		.delay_probe_time	= 5 * HZ,
-		.queue_len		= 3,
+		.queue_len_bytes	= 64*1024,
 		.ucast_probes		= 3,
 		.mcast_probes		= 3,
 		.anycast_delay		= 1 * HZ,

^ permalink raw reply related

* Re: patch "workflow" - what deferred state means?
From: Maz The Northener @ 2011-11-09  9:57 UTC (permalink / raw)
  To: David Miller, Matti Vaittinen; +Cc: netdev
In-Reply-To: <20111107.151954.624621133639707298.davem@davemloft.net>

On Mon, Nov 7, 2011 at 10:19 PM, David Miller <davem@davemloft.net> wrote:
> From: Maz The Northener <mazziesaccount@gmail.com>
> Date: Mon, 7 Nov 2011 22:08:00 +0200
>
>> I was talking about http://patchwork.ozlabs.org/patch/123407/ and
>> patchwork.ozlabs.org/patch/123406/
>
> This kind of work should be resubmitted when net-next opens back up.
>

I assume now would be the correct time, right? If I am correct, net-next
is open now that Linux 3.2-rc1 is out (if I am wrong, please tell me how I
can know when net-next is open).
So I should resubmit the patches. I just fetched snapshot from
net-next.git at kernel.org, and tried applying the patches. Applying
succeeded but with offset. So should I redo patches on top of some
other kernel, and if so, which one?

-Matti

^ permalink raw reply

* Re: config NET_ETHERNET
From: Jeff Kirsher @ 2011-11-09  9:28 UTC (permalink / raw)
  To: David Miller; +Cc: pebolle@tiscali.nl, netdev
In-Reply-To: <20111109.001540.816179448716894786.davem@davemloft.net>

[-- Attachment #1: Type: text/plain, Size: 1847 bytes --]

On Tue, 2011-11-08 at 21:15 -0800, David Miller wrote:
> From: Paul Bolle <pebolle@tiscali.nl>
> Date: Wed, 09 Nov 2011 01:41:15 +0100
> 
> > 0) The Kconfig symbol NET_ETHERNET got dropped in commit f860b0522f
> > ("drivers/net: Kconfig and Makefile cleanup").
> > 
> > 1) However, there are still a number of references to that symbol
> and to
> > the macro CONFIG_NET_ETHERNET:
> >     $ git grep -n "NET_ETHERNET\>"  | grep -v defconfig:
> >     arch/cris/arch-v10/drivers/Kconfig:6:     select NET_ETHERNET
> >     arch/cris/arch-v32/drivers/Kconfig:6:     select NET_ETHERNET
> >     drivers/s390/net/Kconfig:7:       depends on CCW && NETDEVICES
> && (NET_ETHERNET || TR || FDDI)
> >     drivers/s390/net/lcs.c:53:#if !defined(CONFIG_NET_ETHERNET) && \
> >     drivers/s390/net/lcs.c:1637:#ifdef CONFIG_NET_ETHERNET
> >     drivers/s390/net/lcs.c:2169:#ifdef CONFIG_NET_ETHERNET
> > 
> > 2) It's trivial to draft a patch which drops both that Kconfig
> symbol
> > and the macro (and everything that could be considered dead code by
> > now). But perhaps this needs a more sophisticated solution.
> 
> The best thing to do for now is to switch these references over
> to use just plain "ETHERNET" and that's what I'll commit into
> the net tree.
> 
> --------------------
> [PATCH] net: Fix references to deleted NET_ETHERNET Kconfig setting.
> 
> Change them over to plain "ETHERNET"
> 
> Reported-by: Paul Bolle <pebolle@tiscali.nl>
> Signed-off-by: David S. Miller <davem@davemloft.net>
> ---
>  arch/cris/arch-v10/drivers/Kconfig |    2 +-
>  arch/cris/arch-v32/drivers/Kconfig |    2 +-
>  drivers/s390/net/Kconfig           |    2 +-
>  drivers/s390/net/lcs.c             |    6 +++---
>  4 files changed, 6 insertions(+), 6 deletions(-) 

Acked-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* [PATCH v2] net: add wireless TX status socket option
From: Johannes Berg @ 2011-11-09  9:15 UTC (permalink / raw)
  To: John Linville; +Cc: linux-wireless, netdev
In-Reply-To: <1320317796.3950.43.camel@jlt3.sipsolutions.net>

From: Johannes Berg <johannes.berg@intel.com>

The 802.1X EAPOL handshake hostapd does requires
knowing whether the frame was ack'ed by the peer.
Currently, we fudge this pretty badly by not even
transmitting the frame as a normal data frame but
injecting it with radiotap and getting the status
out of radiotap monitor as well. This is rather
complex, confuses users (mon.wlan0 presence) and
doesn't work with all hardware.

To get rid of that hack, introduce a real wifi TX
status option for data frame transmissions.

This works similar to the existing TX timestamping
in that it reflects the SKB back to the socket's
error queue with a SCM_WIFI_STATUS cmsg that has
an int indicating ACK status (0/1).

Since it is possible that at some point we will
want to have TX timestamping and wifi status in a
single errqueue SKB (there's little point in not
doing that), redefine SO_EE_ORIGIN_TIMESTAMPING
to SO_EE_ORIGIN_TXSTATUS which can collect more
than just the timestamp; keep the old constant
as an alias of course. Currently the internal APIs
don't make that possible, but it wouldn't be hard
to split them up in a way that makes it possible.

Thanks to Neil Horman for helping me figure out
the functions that add the control messages.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
v2: rebase onto net-next coming into wireless-next

 arch/alpha/include/asm/socket.h   |    3 +++
 arch/arm/include/asm/socket.h     |    3 +++
 arch/avr32/include/asm/socket.h   |    3 +++
 arch/cris/include/asm/socket.h    |    3 +++
 arch/frv/include/asm/socket.h     |    3 +++
 arch/h8300/include/asm/socket.h   |    3 +++
 arch/ia64/include/asm/socket.h    |    3 +++
 arch/m32r/include/asm/socket.h    |    3 +++
 arch/m68k/include/asm/socket.h    |    3 +++
 arch/mips/include/asm/socket.h    |    3 +++
 arch/mn10300/include/asm/socket.h |    3 +++
 arch/parisc/include/asm/socket.h  |    3 +++
 arch/powerpc/include/asm/socket.h |    3 +++
 arch/s390/include/asm/socket.h    |    3 +++
 arch/sparc/include/asm/socket.h   |    3 +++
 arch/xtensa/include/asm/socket.h  |    3 +++
 include/asm-generic/socket.h      |    3 +++
 include/linux/errqueue.h          |    3 ++-
 include/linux/skbuff.h            |   19 +++++++++++++++++--
 include/net/sock.h                |    6 ++++++
 net/core/skbuff.c                 |   20 ++++++++++++++++++++
 net/core/sock.c                   |    9 +++++++++
 net/socket.c                      |   18 ++++++++++++++++++
 23 files changed, 123 insertions(+), 3 deletions(-)

--- a/include/asm-generic/socket.h	2011-11-08 22:32:30.000000000 +0100
+++ b/include/asm-generic/socket.h	2011-11-09 10:12:02.000000000 +0100
@@ -64,4 +64,7 @@
 #define SO_DOMAIN		39
 
 #define SO_RXQ_OVFL             40
+
+#define SO_WIFI_STATUS		41
+#define SCM_WIFI_STATUS	SO_WIFI_STATUS
 #endif /* __ASM_GENERIC_SOCKET_H */
--- a/net/core/sock.c	2011-11-09 10:07:34.000000000 +0100
+++ b/net/core/sock.c	2011-11-09 10:12:02.000000000 +0100
@@ -740,6 +740,11 @@ set_rcvbuf:
 	case SO_RXQ_OVFL:
 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
 		break;
+
+	case SO_WIFI_STATUS:
+		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
+		break;
+
 	default:
 		ret = -ENOPROTOOPT;
 		break;
@@ -961,6 +966,10 @@ int sock_getsockopt(struct socket *sock,
 		v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
 		break;
 
+	case SO_WIFI_STATUS:
+		v.val = !!sock_flag(sk, SOCK_WIFI_STATUS);
+		break;
+
 	default:
 		return -ENOPROTOOPT;
 	}
--- a/include/net/sock.h	2011-11-09 10:07:30.000000000 +0100
+++ b/include/net/sock.h	2011-11-09 10:12:02.000000000 +0100
@@ -563,6 +563,7 @@ enum sock_flags {
 	SOCK_FASYNC, /* fasync() active */
 	SOCK_RXQ_OVFL,
 	SOCK_ZEROCOPY, /* buffers from userspace */
+	SOCK_WIFI_STATUS, /* push wifi status to userspace */
 };
 
 static inline void sock_copy_flags(struct sock *nsk, struct sock *osk)
@@ -1714,6 +1715,8 @@ static inline int sock_intr_errno(long t
 
 extern void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 	struct sk_buff *skb);
+extern void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
+	struct sk_buff *skb);
 
 static __inline__ void
 sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
@@ -1741,6 +1744,9 @@ sock_recv_timestamp(struct msghdr *msg,
 		__sock_recv_timestamp(msg, sk, skb);
 	else
 		sk->sk_stamp = kt;
+
+	if (sock_flag(sk, SOCK_WIFI_STATUS) && skb->wifi_acked_valid)
+		__sock_recv_wifi_status(msg, sk, skb);
 }
 
 extern void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
--- a/arch/alpha/include/asm/socket.h	2011-11-08 22:32:30.000000000 +0100
+++ b/arch/alpha/include/asm/socket.h	2011-11-09 10:12:02.000000000 +0100
@@ -69,6 +69,9 @@
 
 #define SO_RXQ_OVFL             40
 
+#define SO_WIFI_STATUS		41
+#define SCM_WIFI_STATUS		SO_WIFI_STATUS
+
 /* O_NONBLOCK clashes with the bits used for socket types.  Therefore we
  * have to define SOCK_NONBLOCK to a different value here.
  */
--- a/arch/arm/include/asm/socket.h	2011-11-08 22:32:30.000000000 +0100
+++ b/arch/arm/include/asm/socket.h	2011-11-09 10:12:02.000000000 +0100
@@ -62,4 +62,7 @@
 
 #define SO_RXQ_OVFL             40
 
+#define SO_WIFI_STATUS		41
+#define SCM_WIFI_STATUS		SO_WIFI_STATUS
+
 #endif /* _ASM_SOCKET_H */
--- a/arch/avr32/include/asm/socket.h	2011-11-08 22:32:29.000000000 +0100
+++ b/arch/avr32/include/asm/socket.h	2011-11-09 10:12:02.000000000 +0100
@@ -62,4 +62,7 @@
 
 #define SO_RXQ_OVFL             40
 
+#define SO_WIFI_STATUS		41
+#define SCM_WIFI_STATUS		SO_WIFI_STATUS
+
 #endif /* __ASM_AVR32_SOCKET_H */
--- a/arch/cris/include/asm/socket.h	2011-11-08 22:32:30.000000000 +0100
+++ b/arch/cris/include/asm/socket.h	2011-11-09 10:12:02.000000000 +0100
@@ -64,6 +64,9 @@
 
 #define SO_RXQ_OVFL             40
 
+#define SO_WIFI_STATUS		41
+#define SCM_WIFI_STATUS		SO_WIFI_STATUS
+
 #endif /* _ASM_SOCKET_H */
 
 
--- a/arch/frv/include/asm/socket.h	2011-11-08 22:32:29.000000000 +0100
+++ b/arch/frv/include/asm/socket.h	2011-11-09 10:12:02.000000000 +0100
@@ -62,5 +62,8 @@
 
 #define SO_RXQ_OVFL             40
 
+#define SO_WIFI_STATUS		41
+#define SCM_WIFI_STATUS		SO_WIFI_STATUS
+
 #endif /* _ASM_SOCKET_H */
 
--- a/arch/h8300/include/asm/socket.h	2011-11-08 22:32:29.000000000 +0100
+++ b/arch/h8300/include/asm/socket.h	2011-11-09 10:12:02.000000000 +0100
@@ -62,4 +62,7 @@
 
 #define SO_RXQ_OVFL             40
 
+#define SO_WIFI_STATUS		41
+#define SCM_WIFI_STATUS		SO_WIFI_STATUS
+
 #endif /* _ASM_SOCKET_H */
--- a/arch/ia64/include/asm/socket.h	2011-11-08 22:32:30.000000000 +0100
+++ b/arch/ia64/include/asm/socket.h	2011-11-09 10:12:02.000000000 +0100
@@ -71,4 +71,7 @@
 
 #define SO_RXQ_OVFL             40
 
+#define SO_WIFI_STATUS		41
+#define SCM_WIFI_STATUS		SO_WIFI_STATUS
+
 #endif /* _ASM_IA64_SOCKET_H */
--- a/arch/m32r/include/asm/socket.h	2011-11-08 22:32:30.000000000 +0100
+++ b/arch/m32r/include/asm/socket.h	2011-11-09 10:12:02.000000000 +0100
@@ -62,4 +62,7 @@
 
 #define SO_RXQ_OVFL             40
 
+#define SO_WIFI_STATUS		41
+#define SCM_WIFI_STATUS		SO_WIFI_STATUS
+
 #endif /* _ASM_M32R_SOCKET_H */
--- a/arch/m68k/include/asm/socket.h	2011-11-08 22:32:29.000000000 +0100
+++ b/arch/m68k/include/asm/socket.h	2011-11-09 10:12:02.000000000 +0100
@@ -62,4 +62,7 @@
 
 #define SO_RXQ_OVFL             40
 
+#define SO_WIFI_STATUS		41
+#define SCM_WIFI_STATUS		SO_WIFI_STATUS
+
 #endif /* _ASM_SOCKET_H */
--- a/arch/mips/include/asm/socket.h	2011-11-08 22:32:29.000000000 +0100
+++ b/arch/mips/include/asm/socket.h	2011-11-09 10:12:02.000000000 +0100
@@ -82,6 +82,9 @@ To add: #define SO_REUSEPORT 0x0200	/* A
 
 #define SO_RXQ_OVFL             40
 
+#define SO_WIFI_STATUS		41
+#define SCM_WIFI_STATUS		SO_WIFI_STATUS
+
 #ifdef __KERNEL__
 
 /** sock_type - Socket types
--- a/arch/mn10300/include/asm/socket.h	2011-11-08 22:32:29.000000000 +0100
+++ b/arch/mn10300/include/asm/socket.h	2011-11-09 10:12:02.000000000 +0100
@@ -62,4 +62,7 @@
 
 #define SO_RXQ_OVFL             40
 
+#define SO_WIFI_STATUS		41
+#define SCM_WIFI_STATUS		SO_WIFI_STATUS
+
 #endif /* _ASM_SOCKET_H */
--- a/arch/parisc/include/asm/socket.h	2011-11-08 22:32:29.000000000 +0100
+++ b/arch/parisc/include/asm/socket.h	2011-11-09 10:12:02.000000000 +0100
@@ -61,6 +61,9 @@
 
 #define SO_RXQ_OVFL             0x4021
 
+#define SO_WIFI_STATUS		0x4022
+#define SCM_WIFI_STATUS		SO_WIFI_STATUS
+
 /* O_NONBLOCK clashes with the bits used for socket types.  Therefore we
  * have to define SOCK_NONBLOCK to a different value here.
  */
--- a/arch/powerpc/include/asm/socket.h	2011-11-08 22:32:29.000000000 +0100
+++ b/arch/powerpc/include/asm/socket.h	2011-11-09 10:12:02.000000000 +0100
@@ -69,4 +69,7 @@
 
 #define SO_RXQ_OVFL             40
 
+#define SO_WIFI_STATUS		41
+#define SCM_WIFI_STATUS		SO_WIFI_STATUS
+
 #endif	/* _ASM_POWERPC_SOCKET_H */
--- a/arch/s390/include/asm/socket.h	2011-11-08 22:32:29.000000000 +0100
+++ b/arch/s390/include/asm/socket.h	2011-11-09 10:12:02.000000000 +0100
@@ -70,4 +70,7 @@
 
 #define SO_RXQ_OVFL             40
 
+#define SO_WIFI_STATUS		41
+#define SCM_WIFI_STATUS		SO_WIFI_STATUS
+
 #endif /* _ASM_SOCKET_H */
--- a/arch/sparc/include/asm/socket.h	2011-11-08 22:32:29.000000000 +0100
+++ b/arch/sparc/include/asm/socket.h	2011-11-09 10:12:02.000000000 +0100
@@ -58,6 +58,9 @@
 
 #define SO_RXQ_OVFL             0x0024
 
+#define SO_WIFI_STATUS		0x0025
+#define SCM_WIFI_STATUS		SO_WIFI_STATUS
+
 /* Security levels - as per NRL IPv6 - don't actually do anything */
 #define SO_SECURITY_AUTHENTICATION		0x5001
 #define SO_SECURITY_ENCRYPTION_TRANSPORT	0x5002
--- a/arch/xtensa/include/asm/socket.h	2011-11-08 22:32:29.000000000 +0100
+++ b/arch/xtensa/include/asm/socket.h	2011-11-09 10:12:02.000000000 +0100
@@ -73,4 +73,7 @@
 
 #define SO_RXQ_OVFL             40
 
+#define SO_WIFI_STATUS		41
+#define SCM_WIFI_STATUS		SO_WIFI_STATUS
+
 #endif	/* _XTENSA_SOCKET_H */
--- a/include/linux/errqueue.h	2011-11-08 22:32:30.000000000 +0100
+++ b/include/linux/errqueue.h	2011-11-09 10:12:02.000000000 +0100
@@ -17,7 +17,8 @@ struct sock_extended_err {
 #define SO_EE_ORIGIN_LOCAL	1
 #define SO_EE_ORIGIN_ICMP	2
 #define SO_EE_ORIGIN_ICMP6	3
-#define SO_EE_ORIGIN_TIMESTAMPING 4
+#define SO_EE_ORIGIN_TXSTATUS	4
+#define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS
 
 #define SO_EE_OFFENDER(ee)	((struct sockaddr*)((ee)+1))
 
--- a/include/linux/skbuff.h	2011-11-09 10:07:30.000000000 +0100
+++ b/include/linux/skbuff.h	2011-11-09 10:12:41.000000000 +0100
@@ -218,6 +218,9 @@ enum {
 
 	/* device driver supports TX zero-copy buffers */
 	SKBTX_DEV_ZEROCOPY = 1 << 4,
+
+	/* generate wifi status information (where possible) */
+	SKBTX_WIFI_STATUS = 1 << 5,
 };
 
 /*
@@ -352,6 +355,8 @@ typedef unsigned char *sk_buff_data_t;
  *	@ooo_okay: allow the mapping of a socket to a queue to be changed
  *	@l4_rxhash: indicate rxhash is a canonical 4-tuple hash over transport
  *		ports.
+ *	@wifi_acked_valid: wifi_acked was set
+ *	@wifi_acked: whether frame was acked on wifi or not
  *	@dma_cookie: a cookie to one of several possible DMA operations
  *		done by skb DMA functions
  *	@secmark: security marking
@@ -445,10 +450,11 @@ struct sk_buff {
 #endif
 	__u8			ooo_okay:1;
 	__u8			l4_rxhash:1;
+	__u8			wifi_acked_valid:1;
+	__u8			wifi_acked:1;
+	/* 10/12 bit hole (depending on ndisc_nodetype presence) */
 	kmemcheck_bitfield_end(flags2);
 
-	/* 0/13 bit hole */
-
 #ifdef CONFIG_NET_DMA
 	dma_cookie_t		dma_cookie;
 #endif
@@ -2263,6 +2269,15 @@ static inline void skb_tx_timestamp(stru
 	sw_tx_timestamp(skb);
 }
 
+/**
+ * skb_complete_wifi_ack - deliver skb with wifi status
+ *
+ * @skb: the original outgoing packet
+ * @acked: ack status
+ *
+ */
+void skb_complete_wifi_ack(struct sk_buff *skb, bool acked);
+
 extern __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len);
 extern __sum16 __skb_checksum_complete(struct sk_buff *skb);
 
--- a/net/core/skbuff.c	2011-11-09 10:07:34.000000000 +0100
+++ b/net/core/skbuff.c	2011-11-09 10:12:02.000000000 +0100
@@ -3169,6 +3169,26 @@ void skb_tstamp_tx(struct sk_buff *orig_
 }
 EXPORT_SYMBOL_GPL(skb_tstamp_tx);
 
+void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
+{
+	struct sock *sk = skb->sk;
+	struct sock_exterr_skb *serr;
+	int err;
+
+	skb->wifi_acked_valid = 1;
+	skb->wifi_acked = acked;
+
+	serr = SKB_EXT_ERR(skb);
+	memset(serr, 0, sizeof(*serr));
+	serr->ee.ee_errno = ENOMSG;
+	serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
+
+	err = sock_queue_err_skb(sk, skb);
+	if (err)
+		kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
+
 
 /**
  * skb_partial_csum_set - set up and verify partial csum values for packet
--- a/net/socket.c	2011-11-09 10:07:34.000000000 +0100
+++ b/net/socket.c	2011-11-09 10:12:02.000000000 +0100
@@ -538,6 +538,8 @@ int sock_tx_timestamp(struct sock *sk, _
 		*tx_flags |= SKBTX_HW_TSTAMP;
 	if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
 		*tx_flags |= SKBTX_SW_TSTAMP;
+	if (sock_flag(sk, SOCK_WIFI_STATUS))
+		*tx_flags |= SKBTX_WIFI_STATUS;
 	return 0;
 }
 EXPORT_SYMBOL(sock_tx_timestamp);
@@ -674,6 +676,22 @@ void __sock_recv_timestamp(struct msghdr
 }
 EXPORT_SYMBOL_GPL(__sock_recv_timestamp);
 
+void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
+	struct sk_buff *skb)
+{
+	int ack;
+
+	if (!sock_flag(sk, SOCK_WIFI_STATUS))
+		return;
+	if (!skb->wifi_acked_valid)
+		return;
+
+	ack = skb->wifi_acked;
+
+	put_cmsg(msg, SOL_SOCKET, SCM_WIFI_STATUS, sizeof(ack), &ack);
+}
+EXPORT_SYMBOL_GPL(__sock_recv_wifi_status);
+
 static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk,
 				   struct sk_buff *skb)
 {

^ permalink raw reply

* Re: Large file copy to NFS mounted directory causes delay in other application packets
From: Eric Dumazet @ 2011-11-09  8:59 UTC (permalink / raw)
  To: Manavalan Krishnan; +Cc: linux-kernel@vger.kernel.org, netdev
In-Reply-To: <1320826389.47194.YahooMailNeo@web160716.mail.bf1.yahoo.com>

Please don't top-post on these lists, thanks.

Le mercredi 09 novembre 2011 à 00:13 -0800, Manavalan Krishnan a écrit :
> (1) NFS is using TCP
> (2) yes eth0 is dedicated to heartbeat and eth1 is dedicated to NFS
> (3) I notice the following at the system where file copy is occuring
> 
> The kernel Recv-Q of the heartbeat application socket grows but not delivered to the socket recv call. 
> Here is the netstat output.
> 
> Proto  Recv-Q  Send-Q   Local Address         Foreign Address
> 
> udp    11522                0  *:23435                     *:*
> 

OK so the sending side is OK : The delay is at receiver side.

Note that since netstat shows receive queue has some skbs, it should be
available to heartbeat daemon immediately.

> As soon as I stop the file transfer, the socket recv call receives the packets and Recv-Q goes 0.
> (4) The server has 4 cpu cores and 25G RAM
> 

1) How many nfsd threads are running ?
   grep th /proc/net/rpc/nfsd

2) What kind of NIC do you use ?
   lsmod , lspci

3) Hmm, are IRQ to eth0/eth1 handled by same cpu ?
  grep eth /proc/interrupts

4) You could try to cpu affine all nfsd to cpu0,cpu1,cpu2  and heartbeat
daemon to cpu3.
   man taskset

5) You could 'strace -ttt' heartbeat daemon to check if it is not
blocked on some local disk access (it competes with all nfsd threads)

^ permalink raw reply

* Re: Large file copy to NFS mounted directory causes delay in other application packets
From: Manavalan Krishnan @ 2011-11-09  8:42 UTC (permalink / raw)
  To: Dave Taht, Eric Dumazet, linux-kernel@vger.kernel.org; +Cc: netdev
In-Reply-To: <CAA93jw4gDuHyG508zxRzyn+MJ4gL5m958OaCWJxoDsaLgjdnqg@mail.gmail.com>

>When I see behavior like this I keep thinking interactions between overlarge large txqueuelens, somewhat busted TCP offloads on NICs, and that pfifo_fast must die in favor of fair queuing and/or diffserv classification. But seeing >it on two different nics implies that your switch (which I assume is shared) is possibly to blame...
>(I see bufferbloat everywhere, but mostly because it's what I work on)

>Is this NFS over TCP? Does the HA daemons prioritize packets at all? Does your switch? Does your qdiscs? How deep are your buffers on the network cards and txqueuelens and switch? 
>(eric's other questions below are probably more valid)


The HA daemons do not prioritize packets. Could you please provide info on how to prioritize application packets?
We tried different switches and the same problem occurs, so the switch may not be the issue here.
The switch and qdiscs do not prioritize the packets.
Here are network buffers used in the servers

txqueuelen is 1000 

net.core.netdev_max_backlog = 1000
net.core.rmem_default = 262144
net.core.rmem_max = 262144
net.core.wmem_default = 129024
net.core.wmem_max = 131071
net.ipv4.tcp_rmem = 4096    87380    4194304
net.ipv4.tcp_wmem = 4096    16384    4194304
net.ipv4.tcp_mem = 196608    262144    393216

Thanks

^ permalink raw reply

* [PATCH] wireless: libertas: fix unaligned le64 accesses
From: Steven Miao @ 2011-11-09  8:30 UTC (permalink / raw)
  To: Dan Williams, John W. Linville, libertas-dev
  Cc: Steven Miao, open list:NETWORKING [WIREL...,
	open list:NETWORKING DRIVERS, open list

use get_unaligned_le64() to get timestamp

Signed-off-by: Steven Miao <realmz6@gmail.com>
---
 drivers/net/wireless/libertas/cfg.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/wireless/libertas/cfg.c b/drivers/net/wireless/libertas/cfg.c
index b456a53..f23c2b1 100644
--- a/drivers/net/wireless/libertas/cfg.c
+++ b/drivers/net/wireless/libertas/cfg.c
@@ -630,7 +630,7 @@ static int lbs_ret_scan(struct lbs_private *priv, unsigned long dummy,
 			if (channel &&
 			    !(channel->flags & IEEE80211_CHAN_DISABLED))
 				cfg80211_inform_bss(wiphy, channel,
-					bssid, le64_to_cpu(*(__le64 *)tsfdesc),
+					bssid, get_unaligned_le64(tsfdesc),
 					capa, intvl, ie, ielen,
 					LBS_SCAN_RSSI_TO_MBM(rssi),
 					GFP_KERNEL);
-- 
1.7.0.4

^ permalink raw reply related

* Re: Large file copy to NFS mounted directory causes delay in other application packets
From: Manavalan Krishnan @ 2011-11-09  8:13 UTC (permalink / raw)
  To: Eric Dumazet, linux-kernel@vger.kernel.org; +Cc: linux-kernel, netdev
In-Reply-To: <1320819875.26025.48.camel@edumazet-laptop>

(1) NFS is using TCP
(2) yes eth0 is dedicated to heartbeat and eth1 is dedicated to NFS
(3) I notice the following on the system where the file copy is occurring

The kernel Recv-Q of the heartbeat application socket grows, but the data is not delivered to the socket recv call.
Here is the netstat output.

Proto  Recv-Q  Send-Q   Local Address         Foreign Address

udp    11522                0  *:23435                     *:*

As soon as I stop the file transfer, the socket recv call receives the packets and Recv-Q goes 0.
(4) The server has 4 cpu cores and 25G RAM

________________________________
From: Eric Dumazet <eric.dumazet@gmail.com>
To: Manavalan Krishnan <manavalan_k@yahoo.com>
Cc: "linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>; netdev <netdev@vger.kernel.org>
Sent: Tuesday, November 8, 2011 10:24 PM
Subject: Re: Large file copy to NFS mounted directory causes delay in other application packets

Le mardi 08 novembre 2011 à 21:28 -0800, Manavalan Krishnan a écrit :
> Hi All
> 
> I have two systems with two network interfaces each(eth0 and eth1). I
> am running linux-HA (heartbeat deamon) on both the systems and they
> use eth0 for exchanging heartbeats. I have NFS mounted directory in
> one system and the NFS client uses the interface eth1. 
> 
> I try to copy a large file to NFS mounted directory. But the heartbeat
> daemons misses the heartbeat packets from peers while copy is under
> progress. I did tcpdump and found that the heartbeat packets are
> delayed for few seconds before sent out on eth0. When I stop the file
> copy, the heartbeats are delivered properly. It seems linux kernel
> somehow giving priority for NFS packets(generated from the file copy)
> over other application packets.
> 
> Any thoughts on this behavior? Is there any way we can avoid this so
> that application packets get equal chance while large file copy to NFS
> mounted directory under progress?
> 
CC netdev

1) Is your NFS using UDP or TCP ?
2) Is your eth0 dedicated to heartbeats and eth1 to NFS traffic ?
3) How do you know heartbeats are delayed ?
4) Is your server CPU bounded ?

Thanks


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply

* [net-next-2.6 PATCH 6/6 v4] macvlan: Add support to get MAC/VLAN filter netdev ops
From: Roopa Prabhu @ 2011-11-09  7:56 UTC (permalink / raw)
  To: netdev, davem
  Cc: chrisw, sri, dragos.tatulea, kvm, arnd, mst, gregory.v.rose,
	mchan, dwang2, shemminger, eric.dumazet, kaber, benve
In-Reply-To: <20111109075449.13549.58135.stgit@rhel6.1>

From: Roopa Prabhu <roprabhu@cisco.com>

This patch adds support to get MAC and VLAN filter netdev ops
on a macvlan interface. It adds support for get_rx_filter_addr_size,
get_rx_filter_vlan_size, fill_rx_filter_addr and fill_rx_filter_vlan
netdev ops

Signed-off-by: Roopa Prabhu <roprabhu@cisco.com>
Signed-off-by: Christian Benvenuti <benve@cisco.com>
Signed-off-by: David Wang <dwang2@cisco.com>
---
 drivers/net/macvlan.c |  158 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 158 insertions(+), 0 deletions(-)


diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index c2dea97..8a5320b 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -623,6 +623,55 @@ static int macvlan_set_rx_filter_vlan(struct net_device *dev, int vf,
 	return 0;
 }
 
+static size_t macvlan_get_rx_filter_vlan_size(const struct net_device *dev,
+					      int vf)
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+	struct net_device *lowerdev = vlan->lowerdev;
+	const struct net_device_ops *ops = lowerdev->netdev_ops;
+
+	if (vf != SELF_VF)
+		return -EINVAL;
+
+	switch (vlan->mode) {
+	case MACVLAN_MODE_PASSTHRU:
+		if (ops->ndo_get_rx_filter_vlan_size)
+			return ops->ndo_get_rx_filter_vlan_size(dev, vf);
+		/* IFLA_RX_FILTER_VLAN_BITMAP */
+		return nla_total_size(VLAN_BITMAP_SIZE);
+	default:
+		return 0;
+	}
+}
+
+static int macvlan_get_rx_filter_vlan(const struct net_device *dev, int vf,
+				      struct sk_buff *skb)
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+	struct net_device *lowerdev = vlan->lowerdev;
+	const struct net_device_ops *ops = lowerdev->netdev_ops;
+
+	if (vf != SELF_VF)
+		return -EINVAL;
+
+	switch (vlan->mode) {
+	case MACVLAN_MODE_PASSTHRU:
+		if (ops->ndo_get_rx_filter_vlan)
+			return ops->ndo_get_rx_filter_vlan(dev, vf, skb);
+
+		NLA_PUT(skb, IFLA_RX_FILTER_VLAN_BITMAP, VLAN_BITMAP_SIZE,
+			vlan->vlan_filter);
+		break;
+	default:
+		return -ENODATA; /* No data to Fill */
+	}
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
 static int macvlan_addr_in_hw_list(struct netdev_hw_addr_list *list,
 				   u8 *addr, int addrlen)
 {
@@ -802,6 +851,111 @@ static int macvlan_set_rx_filter_addr(struct net_device *dev, int vf,
 	return 0;
 }
 
+static size_t macvlan_get_rx_filter_addr_passthru_size(
+			const struct net_device *dev, int vf)
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+	struct net_device *lowerdev = vlan->lowerdev;
+	const struct net_device_ops *ops = lowerdev->netdev_ops;
+	size_t size;
+
+	if (ops->ndo_get_rx_filter_addr_size)
+		return ops->ndo_get_rx_filter_addr_size(dev, vf);
+
+	/* IFLA_RX_FILTER_ADDR_FLAGS */
+	size = nla_total_size(sizeof(u32));
+
+	if (netdev_uc_count(dev))
+		/* IFLA_RX_FILTER_ADDR_UC_LIST */
+		size += nla_total_size(netdev_uc_count(dev) *
+				       ETH_ALEN * sizeof(struct nlattr));
+
+	if (netdev_mc_count(dev))
+		/* IFLA_RX_FILTER_ADDR_MC_LIST */
+		size += nla_total_size(netdev_mc_count(dev) *
+				       ETH_ALEN * sizeof(struct nlattr));
+
+	return size;
+}
+
+static size_t macvlan_get_rx_filter_addr_size(const struct net_device *dev,
+					      int vf)
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+
+	if (vf != SELF_VF)
+		return -EINVAL;
+
+	switch (vlan->mode) {
+	case MACVLAN_MODE_PASSTHRU:
+		return macvlan_get_rx_filter_addr_passthru_size(dev, vf);
+	default:
+		return 0;
+	}
+}
+
+static int macvlan_get_rx_filter_addr_passthru(const struct net_device *dev,
+					       int vf, struct sk_buff *skb)
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+	struct net_device *lowerdev = vlan->lowerdev;
+	const struct net_device_ops *ops = lowerdev->netdev_ops;
+	struct nlattr *uninitialized_var(uc_list), *mc_list;
+	struct netdev_hw_addr *ha;
+
+	if (ops->ndo_get_rx_filter_addr)
+		return ops->ndo_get_rx_filter_addr(dev, vf, skb);
+
+	NLA_PUT_U32(skb, IFLA_RX_FILTER_ADDR_FLAGS,
+		dev->flags & RX_FILTER_FLAGS);
+
+	if (netdev_uc_count(dev)) {
+		uc_list = nla_nest_start(skb, IFLA_RX_FILTER_ADDR_UC_LIST);
+		if (uc_list == NULL)
+			goto nla_put_failure;
+
+		netdev_for_each_uc_addr(ha, dev) {
+			NLA_PUT(skb, IFLA_ADDR_LIST_ENTRY, ETH_ALEN, ha->addr);
+		}
+		nla_nest_end(skb, uc_list);
+	}
+
+	if (netdev_mc_count(dev)) {
+		mc_list = nla_nest_start(skb, IFLA_RX_FILTER_ADDR_MC_LIST);
+		if (mc_list == NULL)
+			goto nla_uc_list_cancel;
+
+		netdev_for_each_mc_addr(ha, dev) {
+			NLA_PUT(skb, IFLA_ADDR_LIST_ENTRY, ETH_ALEN, ha->addr);
+		}
+		nla_nest_end(skb, mc_list);
+	}
+
+	return 0;
+
+nla_uc_list_cancel:
+	if (netdev_uc_count(dev))
+		nla_nest_cancel(skb, uc_list);
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static int macvlan_get_rx_filter_addr(const struct net_device *dev, int vf,
+				      struct sk_buff *skb)
+{
+	struct macvlan_dev *vlan = netdev_priv(dev);
+
+	if (vf != SELF_VF)
+		return -EINVAL;
+
+	switch (vlan->mode) {
+	case MACVLAN_MODE_PASSTHRU:
+		return macvlan_get_rx_filter_addr_passthru(dev, vf, skb);
+	default:
+		return -ENODATA; /* No data to Fill */
+	}
+}
+
 static void macvlan_ethtool_get_drvinfo(struct net_device *dev,
 					struct ethtool_drvinfo *drvinfo)
 {
@@ -838,7 +992,11 @@ static const struct net_device_ops macvlan_netdev_ops = {
 	.ndo_vlan_rx_add_vid		= macvlan_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid		= macvlan_vlan_rx_kill_vid,
 	.ndo_set_rx_filter_addr		= macvlan_set_rx_filter_addr,
+	.ndo_get_rx_filter_addr_size	= macvlan_get_rx_filter_addr_size,
+	.ndo_get_rx_filter_addr		= macvlan_get_rx_filter_addr,
 	.ndo_set_rx_filter_vlan		= macvlan_set_rx_filter_vlan,
+	.ndo_get_rx_filter_vlan_size	= macvlan_get_rx_filter_vlan_size,
+	.ndo_get_rx_filter_vlan		= macvlan_get_rx_filter_vlan,
 };
 
 void macvlan_common_setup(struct net_device *dev)


^ permalink raw reply related

* [net-next-2.6 PATCH 4/6 v4] rtnetlink: Add support to get MAC/VLAN filters
From: Roopa Prabhu @ 2011-11-09  7:55 UTC (permalink / raw)
  To: netdev, davem
  Cc: chrisw, sri, dragos.tatulea, kvm, arnd, mst, gregory.v.rose,
	mchan, dwang2, shemminger, eric.dumazet, kaber, benve
In-Reply-To: <20111109075449.13549.58135.stgit@rhel6.1>

From: Roopa Prabhu <roprabhu@cisco.com>

This patch adds support in rtnetlink for IFLA_VF_RX_FILTERS and
IFLA_RX_FILTER get. It gets the size of the filters using
netdev_ops->ndo_get_rx_filter_addr_size and
netdev_ops->ndo_get_rx_filter_vlan_size, and fills them in using
netdev_ops->ndo_get_rx_filter_addr and netdev_ops->ndo_get_rx_filter_vlan.
In the case of IFLA_VF_RX_FILTERS it loops through all VFs to get the filter
data.

Signed-off-by: Roopa Prabhu <roprabhu@cisco.com>
Signed-off-by: Christian Benvenuti <benve@cisco.com>
Signed-off-by: David Wang <dwang2@cisco.com>
---
 net/core/rtnetlink.c |  159 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 158 insertions(+), 1 deletions(-)


diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a042910..ea861b4 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -475,6 +475,62 @@ static size_t rtnl_link_get_af_size(const struct net_device *dev)
 	return size;
 }
 
+static size_t rtnl_vf_rx_filter_size(const struct net_device *dev, int vf)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+	size_t size;
+
+	/* IFLA_RX_FILTER  or IFLA_VF_RX_FILTER */
+	size = nla_total_size(sizeof(struct nlattr));
+
+	if (vf != SELF_VF)
+		size = nla_total_size(4); /* IFLA_RX_FILTER_VF */
+
+	if (ops->ndo_get_rx_filter_addr_size) {
+		size_t rx_filter_addr_size =
+				ops->ndo_get_rx_filter_addr_size(dev, vf);
+
+		if (rx_filter_addr_size)
+			/* IFLA_RX_FILTER_ADDR */
+			size += nla_total_size(sizeof(struct nlattr)) +
+					rx_filter_addr_size;
+	}
+
+	if (ops->ndo_get_rx_filter_vlan_size) {
+		size_t rx_filter_vlan_size =
+				ops->ndo_get_rx_filter_vlan_size(dev, vf);
+
+		if (rx_filter_vlan_size)
+			/* IFLA_RX_FILTER_VLAN */
+			size += nla_total_size(sizeof(struct nlattr)) +
+					rx_filter_vlan_size;
+	}
+
+	return size;
+}
+
+static size_t rtnl_rx_filter_size(const struct net_device *dev)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+	int vf = SELF_VF;
+	size_t size;
+
+	if (!ops->ndo_get_rx_filter_addr_size &&
+	    !ops->ndo_get_rx_filter_vlan_size)
+		return 0;
+
+	size = rtnl_vf_rx_filter_size(dev, vf); /* SELF_VF */
+
+	if (dev->dev.parent && dev_num_vf(dev->dev.parent)) {
+		/* IFLA_VF_RX_FILTERS */
+		size = nla_total_size(sizeof(struct nlattr));
+		for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++)
+			size += rtnl_vf_rx_filter_size(dev, vf);
+	}
+
+	return size;
+}
+
 static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev)
 {
 	const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
@@ -513,6 +569,102 @@ out:
 	return err;
 }
 
+static int rtnl_vf_rx_filter_fill(struct sk_buff *skb,
+				  const struct net_device *dev, int vf)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+	struct nlattr *addr_filter = NULL, *vlan_filter = NULL;
+	struct nlattr *rx_filter;
+	int err = -EMSGSIZE;
+	int filter_attrtype =
+		(vf == SELF_VF ? IFLA_RX_FILTER : IFLA_VF_RX_FILTER);
+
+	rx_filter = nla_nest_start(skb, filter_attrtype);
+	if (rx_filter == NULL)
+		goto nla_put_failure;
+
+	if (vf != SELF_VF)
+		NLA_PUT_U32(skb, IFLA_RX_FILTER_VF, vf);
+
+	if (ops->ndo_get_rx_filter_addr) {
+		addr_filter = nla_nest_start(skb, IFLA_RX_FILTER_ADDR);
+		if (addr_filter == NULL)
+			goto err_cancel_rx_filter;
+		err = ops->ndo_get_rx_filter_addr(dev, vf, skb);
+		if (err == -ENODATA)
+			nla_nest_cancel(skb, addr_filter);
+		else if (err < 0)
+			goto err_cancel_addr_filter;
+		else
+			nla_nest_end(skb, addr_filter);
+	}
+
+	if (ops->ndo_get_rx_filter_vlan) {
+		vlan_filter = nla_nest_start(skb, IFLA_RX_FILTER_VLAN);
+		if (vlan_filter == NULL)
+			goto err_cancel_addr_filter;
+		err = ops->ndo_get_rx_filter_vlan(dev, vf, skb);
+		if (err == -ENODATA)
+			nla_nest_cancel(skb, vlan_filter);
+		else if (err)
+			goto err_cancel_vlan_filter;
+		else
+			nla_nest_end(skb, vlan_filter);
+	}
+	nla_nest_end(skb, rx_filter);
+
+	return 0;
+
+err_cancel_vlan_filter:
+	if (vlan_filter)
+		nla_nest_cancel(skb, vlan_filter);
+err_cancel_addr_filter:
+	if (addr_filter)
+		nla_nest_cancel(skb, addr_filter);
+err_cancel_rx_filter:
+	nla_nest_cancel(skb, rx_filter);
+nla_put_failure:
+	return err;
+}
+
+static int rtnl_rx_filter_fill(struct sk_buff *skb,
+			       const struct net_device *dev)
+{
+	struct nlattr *vf_rx_filters = NULL;
+	int vf = SELF_VF;
+	int err;
+
+	if (!dev->netdev_ops->ndo_get_rx_filter_addr &&
+	    !dev->netdev_ops->ndo_get_rx_filter_vlan)
+		return 0;
+
+	err = rtnl_vf_rx_filter_fill(skb, dev, vf); /* SELF_VF */
+	if (err)
+		return err;
+
+	if (dev->dev.parent && dev_num_vf(dev->dev.parent)) {
+		vf_rx_filters = nla_nest_start(skb, IFLA_VF_RX_FILTERS);
+		if (!vf_rx_filters)
+			return -EMSGSIZE;
+
+		for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++) {
+			err = rtnl_vf_rx_filter_fill(skb, dev, vf);
+			if (err == -EMSGSIZE)
+				goto err_cancel_nest_vf_rx_filters;
+		}
+
+		nla_nest_end(skb, vf_rx_filters);
+	}
+
+	return 0;
+
+err_cancel_nest_vf_rx_filters:
+	if (vf_rx_filters)
+		nla_nest_cancel(skb, vf_rx_filters);
+
+	return err;
+}
+
 static const int rtm_min[RTM_NR_FAMILIES] =
 {
 	[RTM_FAM(RTM_NEWLINK)]      = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
@@ -786,7 +938,9 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev)
 	       + rtnl_vfinfo_size(dev) /* IFLA_VFINFO_LIST */
 	       + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
 	       + rtnl_link_get_size(dev) /* IFLA_LINKINFO */
-	       + rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */
+	       + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */
+		/* IFLA_VF_RX_FILTERS + IFLA_RX_FILTER */
+	       + rtnl_rx_filter_size(dev);
 }
 
 static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
@@ -996,6 +1150,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 	if (rtnl_port_fill(skb, dev))
 		goto nla_put_failure;
 
+	if (rtnl_rx_filter_fill(skb, dev) < 0)
+		goto nla_put_failure;
+
 	if (dev->rtnl_link_ops) {
 		if (rtnl_link_fill(skb, dev) < 0)
 			goto nla_put_failure;


^ permalink raw reply related

* [net-next-2.6 PATCH 3/6 v4] rtnetlink: Add support to set MAC/VLAN filters
From: Roopa Prabhu @ 2011-11-09  7:55 UTC (permalink / raw)
  To: netdev, davem
  Cc: chrisw, sri, dragos.tatulea, kvm, arnd, mst, gregory.v.rose,
	mchan, dwang2, shemminger, eric.dumazet, kaber, benve
In-Reply-To: <20111109075449.13549.58135.stgit@rhel6.1>

From: Roopa Prabhu <roprabhu@cisco.com>

This patch adds support in rtnetlink for IFLA_RX_FILTER and
IFLA_VF_RX_FILTERS set. It calls netdev_ops->ndo_set_rx_filter_addr and
netdev_ops->ndo_set_rx_filter_vlan.

Signed-off-by: Roopa Prabhu <roprabhu@cisco.com>
Signed-off-by: Christian Benvenuti <benve@cisco.com>
Signed-off-by: David Wang <dwang2@cisco.com>
---
 include/linux/if_link.h |    2 +
 net/core/rtnetlink.c    |  101 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 103 insertions(+), 0 deletions(-)


diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 74a9f17..a8c2c14 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -268,6 +268,8 @@ enum macvlan_mode {
 
 /* SR-IOV virtual function management section */
 
+#define SELF_VF		-1
+
 enum {
 	IFLA_VF_INFO_UNSPEC,
 	IFLA_VF_INFO,
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 9eead8e..a042910 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1294,6 +1294,66 @@ static int do_set_master(struct net_device *dev, int ifindex)
 	return 0;
 }
 
+static int do_set_rx_filter(struct net_device *dev, int vf,
+			    struct nlattr *rx_filter[],
+			    int *modified)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+	int err = 0;
+
+	if (rx_filter[IFLA_RX_FILTER_ADDR]) {
+		struct nlattr *addr_filter[IFLA_RX_FILTER_ADDR_MAX+1];
+
+		if (!ops->ndo_set_rx_filter_addr) {
+			err = -EOPNOTSUPP;
+			goto errout;
+		}
+
+		err = nla_parse_nested(addr_filter, IFLA_RX_FILTER_ADDR_MAX,
+				rx_filter[IFLA_RX_FILTER_ADDR],
+				ifla_addr_filter_policy);
+		if (err < 0)
+			goto errout;
+
+		if (addr_filter[IFLA_RX_FILTER_ADDR_FLAGS]) {
+			unsigned int flags = nla_get_u32(
+					addr_filter[IFLA_RX_FILTER_ADDR_FLAGS]);
+			if (flags & ~RX_FILTER_FLAGS) {
+				err = -EINVAL;
+				goto errout;
+			}
+		}
+
+		err = ops->ndo_set_rx_filter_addr(dev, vf, addr_filter);
+		if (err < 0)
+			goto errout;
+		*modified = 1;
+	}
+
+	if (rx_filter[IFLA_RX_FILTER_VLAN]) {
+		struct nlattr *vlan_filter[IFLA_RX_FILTER_VLAN_MAX+1];
+
+		if (!ops->ndo_set_rx_filter_vlan) {
+			err = -EOPNOTSUPP;
+			goto errout;
+		}
+
+		err = nla_parse_nested(vlan_filter, IFLA_RX_FILTER_VLAN_MAX,
+				rx_filter[IFLA_RX_FILTER_VLAN],
+				ifla_vlan_filter_policy);
+		if (err < 0)
+			goto errout;
+
+		err = ops->ndo_set_rx_filter_vlan(dev, vf, vlan_filter);
+		if (err < 0)
+			goto errout;
+		*modified = 1;
+	}
+
+errout:
+	return err;
+}
+
 static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 		      struct nlattr **tb, char *ifname, int modified)
 {
@@ -1515,6 +1575,47 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 			modified = 1;
 		}
 	}
+
+	if (tb[IFLA_VF_RX_FILTERS]) {
+		struct nlattr *vf_rx_filter[IFLA_RX_FILTER_MAX+1];
+		struct nlattr *attr;
+		int vf;
+		int rem;
+
+		nla_for_each_nested(attr, tb[IFLA_VF_RX_FILTERS], rem) {
+			if (nla_type(attr) != IFLA_VF_RX_FILTER)
+				continue;
+			err = nla_parse_nested(vf_rx_filter, IFLA_RX_FILTER_MAX,
+					attr, ifla_rx_filter_policy);
+			if (err < 0)
+				goto errout;
+
+			if (!vf_rx_filter[IFLA_RX_FILTER_VF]) {
+				err = -EOPNOTSUPP;
+				goto errout;
+			}
+			vf = nla_get_u32(vf_rx_filter[IFLA_RX_FILTER_VF]);
+
+			err = do_set_rx_filter(dev, vf, vf_rx_filter,
+					 &modified);
+			if (err < 0)
+				goto errout;
+		}
+	}
+
+	if (tb[IFLA_RX_FILTER]) {
+		struct nlattr *rx_filter[IFLA_RX_FILTER_MAX+1];
+
+		err = nla_parse_nested(rx_filter, IFLA_RX_FILTER_MAX,
+				tb[IFLA_RX_FILTER], ifla_rx_filter_policy);
+		if (err < 0)
+			goto errout;
+
+		err = do_set_rx_filter(dev, SELF_VF, rx_filter, &modified);
+		if (err < 0)
+			goto errout;
+	}
+
 	err = 0;
 
 errout:


^ permalink raw reply related

* [net-next-2.6 PATCH 0/6 v4] macvlan: MAC Address filtering support for passthru mode
From: Roopa Prabhu @ 2011-11-09  7:55 UTC (permalink / raw)
  To: netdev, davem
  Cc: chrisw, sri, dragos.tatulea, kvm, arnd, mst, gregory.v.rose,
	mchan, dwang2, shemminger, eric.dumazet, kaber, benve

v3 -> v4
- Removed RFC in subject-prefix
- Regenerated patches over latest net-next
(no code changes)

Thanks to Greg Rose <gregory.v.rose@intel.com> for evaluating v3

v2 -> v3
- Moved set and get filter ops from rtnl_link_ops to netdev_ops
- Support for SRIOV VFs.
	[Note: The get filters msg (in the way current get rtnetlink handles
	it) might get too big for SRIOV vfs. This patch follows existing sriov 
	vf get code and tries to accommodate filters for all VF's in a PF. 
        And for the SRIOV case I have only tested the fact that the VF 
	arguments are getting delivered to rtnetlink correctly. The code
	follows existing sriov vf handling code so rest of it should work fine]
- Fixed all op and netlink attribute names to start with IFLA_RX_FILTER
- Changed macvlan filter ops to call corresponding lowerdev op if lowerdev 
  supports it for passthru mode. Else it falls back on macvlan handling the 
  filters locally as in v1 and v2

v1 -> v2
- Instead of TUNSETTXFILTER introduced rtnetlink interface for the same


Background and details:
=======================
Today macvtap, as used in virtualized environments, has no support for
propagating MAC, VLAN and interface flags from the guest to the lowerdev.
This means that, to be able to register additional VLANs, unicast and
multicast addresses or change pkt filter flags in the guest, the lowerdev
has to be put in promiscuous mode. Today the only macvlan mode that supports
this is the PASSTHRU mode, and it puts the lowerdev in promiscuous mode.

PASSTHRU mode was added primarily for the SRIOV usecase. In PASSTHRU mode 
there is a 1-1 mapping between macvtap and physical NIC or VF.

There are two problems with putting the lowerdev (i.e. SRIOV VFs) in
promiscuous mode:
	- Some SRIOV cards don't support promiscuous mode today (a thread on
	the Intel driver indicates that:
	http://lists.openwall.net/netdev/2011/09/27/6)
	- For the SRIOV NICs that support it, putting the lowerdev in
	promiscuous mode leads to additional traffic being sent up to the
	guest virtio-net to filter, resulting in extra overhead.
	
Both of the above problems can be solved by offloading filtering to the
lowerdev hw, i.e. the lowerdev does not need to be in promiscuous mode as
long as the guest filters are passed down to the lowerdev.

This patch basically adds the infrastructure to set and get MAC and VLAN 
filters on an interface via rtnetlink. It adds new netlink msg and netdev
ops for the same. And implements these ops in macvlan for passthru mode.

- Netlink interface:
    This patch provides the following netlink interface to set mac and vlan
    filters :

    Interface to set RX filter on a SRIOV VF:
    [IFLA_VF_RX_FILTERS] = {
    	[IFLA_VF_RX_FILTER] = {
    		[IFLA_RX_FILTER_VF]
    		[IFLA_RX_FILTER_ADDR] = {
    			[IFLA_RX_FILTER_ADDR_FLAGS]
    			[IFLA_RX_FILTER_ADDR_UC_LIST] = {
    				[IFLA_ADDR_LIST_ENTRY]
    			}
    			[IFLA_RX_FILTER_ADDR_MC_LIST] = {
    				[IFLA_ADDR_LIST_ENTRY]
    			}
    		}
    		[IFLA_RX_FILTER_VLAN] = {
    			[IFLA_RX_FILTER_VLAN_BITMAP]
    		}
    	}
    	...
    }
    
    Interface to set RX filter on any network interface:
    [IFLA_RX_FILTER] = {
    	[IFLA_RX_FILTER_VF]
    	[IFLA_RX_FILTER_ADDR] = {
    		[IFLA_RX_FILTER_ADDR_FLAGS]
    		[IFLA_RX_FILTER_ADDR_UC_LIST] = {
    			[IFLA_ADDR_LIST_ENTRY]
    		}
    		[IFLA_RX_FILTER_ADDR_MC_LIST] = {
    			[IFLA_ADDR_LIST_ENTRY]
    		}
    	}
    	[IFLA_RX_FILTER_VLAN] = {
    		[IFLA_RX_FILTER_VLAN_BITMAP]
	}
    } 

    Note1: The IFLA_RX_FILTER_VLAN is a nested attribute, but contains only 
    IFLA_RX_FILTER_VLAN_BITMAP today. The idea is that the IFLA_RX_FILTER_VLAN 
    can be extended tomorrow to have a vlan list if some implementations 
    prefer a list instead. 

    And it provides the following netdev_ops to set/get MAC/VLAN filters:

    int                     (*ndo_set_rx_filter_addr)(
	                                        struct net_device *dev, int vf,
                                                struct nlattr *tb[]);
    int                     (*ndo_set_rx_filter_vlan)(
                                                struct net_device *dev, int vf,
                                                struct nlattr *tb[]);
    size_t                  (*ndo_get_rx_filter_addr_size)(
                                                const struct net_device *dev,
                                                int vf);
    size_t                  (*ndo_get_rx_filter_vlan_size)(
                                                const struct net_device *dev,
                                                int vf);
    int                     (*ndo_get_rx_filter_addr)(
                                                const struct net_device *dev,
                                                int vf, struct sk_buff *skb);
    int                     (*ndo_get_rx_filter_vlan)(
                                                const struct net_device *dev,
                                                int vf, struct sk_buff *skb);

Some answers to questions that were raised during the review:
- Protection against address spoofing:
	- This patch adds filtering support only for macvtap PASSTHRU 
	Mode. PASSTHRU mode is used mainly with SRIOV VF's. And SRIOV VF's 
	come with anti mac/vlan spoofing support in the lowerdev driver. 
	(netdev infrastructure to support this was added recently 
	with IFLA_VF_SPOOFCHK). For 802.1Qbh devices, the port profile has a 
	knob to enable/disable anti spoof check. Lowerdevice drivers also 
	enforce limits on the number of address registrations allowed. 
	For non-SRIOV VF's it is the responsibility of the lowerdev driver
	to implement any such protection. The current netdev hooks for
	SRIOV VF's spoof check could be extended to accommodate any network
	interface in the future.

- Support for multiqueue devices: Enable filtering on individual queues (?):
	As I understand from the thread between Michael and Greg, the
	VMdq Linux implementation is not in yet and I don't know how it is
	going to take shape. But Intel VMdq devices do accept filters on a
	per-queue basis. Since the netdev infrastructure for VMdq is not in
	yet, it is hard to say how this patch can support it.

	This patch makes use of the current netdev infrastructure for setting
	address and vlan filters. And if that changes for vmdq tomorrow,
	then the work that this patch represents can be modified to accommodate
	vmdq devices at that time.

	So i dont see a huge problem with this patch coming in the way for
	vmdq devices.

- Support for non-PASSTHRU mode:
	I started implementing this. But there are a couple of problems.	
	- Today, in non-PASSTHRU cases macvlan_handle_frame assumes that 
	every macvlan device has a single unique mac.
	And the macvlans are hashed on that single mac address. 
	To support filtering for non-PASSTHRU mode in addition to this 
	patch the following needs to be done:
		- non-passthru mode with a single macvlan over a lower dev
		can be treated as PASSTHRU case
		- For non-PASSTHRU mode with multiple macvlans over a single 
		lower dev:  
			- Multiple unicast mac's now need to be hashed to the 
			same macvlan device. The macvlan hash needs to change 
			for lookup based on any one of the multiple unicast 
			addresses a macvlan is interested in
			- We need to consider vlans during the lookup too
			- So the macvlan device hash needs to hash on both mac 
			and vlan
		- But the support for filtering in non-PASSTHRU mode can be 
		built on this patch

This patch series implements the following 
01/6 rtnetlink: Netlink interface for setting MAC and VLAN filters
02/6 netdev: Add netdev_ops to set and get MAC/VLAN rx filters
03/6 rtnetlink: Add support to set MAC/VLAN filters
04/6 rtnetlink: Add support to get MAC/VLAN filters
05/6 macvlan: Add support to set MAC/VLAN filter netdev ops
06/6 macvlan: Add support to get MAC/VLAN filter netdev ops

Please comment. Thanks.

Signed-off-by: Roopa Prabhu <roprabhu@cisco.com>
Signed-off-by: Christian Benvenuti <benve@cisco.com>
Signed-off-by: David Wang <dwang2@cisco.com>

^ permalink raw reply

* Re: [PATCH] neigh: replace unres_qlen by unres_qlen_bytes
From: Eric Dumazet @ 2011-11-09  7:55 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20111109.001843.1753987502002673227.davem@davemloft.net>

Le mercredi 09 novembre 2011 à 00:18 -0500, David Miller a écrit :
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Wed, 09 Nov 2011 01:14:16 +0100
> 
> > unres_qlen is the number of frames we are able to queue per unresolved
> > neighbour. Its default value (3) was never changed and is responsible
> > for strange drops, especially if IP fragments are used, or multiple
> > sessions start in parallel. TCP initial congestion window is now bigger
> > than 3.
> 
> BTW, it has been observed in practice that if a long living connection
> suddenly sends a burst of traffic after a very long idle period
> (hitting ARP expiry) or something invalidates the ARP entry in use, we
> will drop frames.  Because even if the ARP reply comes "fast" it's
> never quick enough to beat the burst of frames.
> 
> And if this happens in a scenario where such lost packets potentially
> mean lost money...

I'll submit a more complete patch, including a fallback support of
unres_qlen, and one missing initializer in nl_ntbl_parm_policy[]

+	[NDTPA_QUEUE_LENBYTES]          = { .type = NLA_U32 },

 

^ permalink raw reply

* RE: [PATCH] r8169: more driver shutdown WoL regression.
From: hayeswang @ 2011-11-09  7:49 UTC (permalink / raw)
  To: 'Francois Romieu', netdev
  Cc: 'Stefan Becker', 'David Miller'
In-Reply-To: <20111108223502.GA20437@electric-eye.fr.zoreil.com>

Francois Romieu [mailto:romieu@fr.zoreil.com] 
> Sent: Wednesday, November 09, 2011 6:35 AM
> To: netdev@vger.kernel.org
> Cc: Stefan Becker; David Miller; Hayeswang
> Subject: [PATCH] r8169: more driver shutdown WoL regression.
> 
> Almost the same narrative as 649b3b8c4e8681de443b4dc9e387c3036369e02e
> but with more experimental data.
> 
> Stefan Becker has reported that the same kind of fix as the one
> introduced in 649b3b8c4e8681de443b4dc9e387c3036369e02e ("r8169: fix
> driver shutdown WoL regression") before 3.1 was released is required
> for his 8168c (RTL_GIGA_MAC_VER_22).
> 
> I have tested a few chipsets as well:
> - without patch, shutdown + WoL works fine for :
>   o RTL_GIGA_MAC_VER_30 (8105e and 8105evc)
>   o RTL_GIGA_MAC_VER_33 (8168ed)
>   o RTL_GIGA_MAC_VER_34 (8168evl)
>   o RTL_GIGA_MAC_VER_35 (8168f)
>   o RTL_GIGA_MAC_VER_06 (plain old PCI 8169sc)
> - without patch, shutdown + WoL is broken with :
>   o RTL_GIGA_MAC_VER_26 (8168d-vb-gr)
>   o RTL_GIGA_MAC_VER_25 (8168d-gr)
>   o RTL_GIGA_MAC_VER_12 (8168b)
>   o RTL_GIGA_MAC_VER_09 (both 8102e-vb-gr and 8103e-gr)
> 

I am confused by your results. According to the information from hw and my
tests, the chips which need RxConfig enabled for WOL are the 8105e series,
8168e series, 8168evl, and 8168f series. The previous chips, including 8168d,
work fine for WOL without enabling RxConfig.
PS. I tested 8111d (RTL_GIGA_MAC_VER_25) and 8111e (RTL_GIGA_MAC_VER_33) with
kernel 3.1.0.

Besides, I find rtl_shutdown would call rtl8169_net_suspend, rtl8169_net_suspend
would call rtl_pll_power_down, and rtl_pll_power_down would call
r810x_pll_power_down or r8168_pll_power_down. Finally, I find
rtl_wol_suspend_quirk would be called. Is it necessary to call
rtl_wol_suspend_quirk again in rtl_shutdown?

> I have widened rtl_wol_suspend_quirk a bit beyond those data 
> to include
> a broader subset of chipsets from the same families, thus 
> including the
> 8168cp and 8168dp.
> 
> Signed-off-by: Francois Romieu <romieu@fr.zoreil.com>
> Tested-by: Stefan Becker <chemobejk@gmail.com>
> Cc: Hayes <hayeswang@realtek.com>
> ---
> 
>  Hayes, any insight ?
> 
>  drivers/net/ethernet/realtek/r8169.c |   12 ++++++++++++
>  1 files changed, 12 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/net/ethernet/realtek/r8169.c 
> b/drivers/net/ethernet/realtek/r8169.c
> index 92b45f0..829674d 100644
> --- a/drivers/net/ethernet/realtek/r8169.c
> +++ b/drivers/net/ethernet/realtek/r8169.c
> @@ -3496,6 +3496,18 @@ static void 
> rtl_wol_suspend_quirk(struct rtl8169_private *tp)
>  	void __iomem *ioaddr = tp->mmio_addr;
>  
>  	switch (tp->mac_version) {
> +	case RTL_GIGA_MAC_VER_07:
> +	case RTL_GIGA_MAC_VER_08:
> +	case RTL_GIGA_MAC_VER_09:
> +	case RTL_GIGA_MAC_VER_11:
> +	case RTL_GIGA_MAC_VER_12:
> +	case RTL_GIGA_MAC_VER_17:
> +	case RTL_GIGA_MAC_VER_19:
> +	case RTL_GIGA_MAC_VER_20:
> +	case RTL_GIGA_MAC_VER_21:
> +	case RTL_GIGA_MAC_VER_22:
> +	case RTL_GIGA_MAC_VER_25:
> +	case RTL_GIGA_MAC_VER_26:
>  	case RTL_GIGA_MAC_VER_29:
>  	case RTL_GIGA_MAC_VER_30:
>  	case RTL_GIGA_MAC_VER_32:

^ permalink raw reply

* [PATCH] ipv4: fix for ip_options_rcv_srr() daddr update.
From: Li Wei @ 2011-11-09  7:39 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev

When opt->srr_is_hit is set, skb_rtable(skb) has been updated for
'nexthop'. Since iph->daddr should always equal skb_rtable->rt_dst,
we need to update iph->daddr as well.

Signed-off-by: Li Wei <lw@cn.fujitsu.com>
---
 net/ipv4/ip_options.c |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index ec93335..05d20cc 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -640,6 +640,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)
 	}
 	if (srrptr <= srrspace) {
 		opt->srr_is_hit = 1;
+		iph->daddr = nexthop;
 		opt->is_changed = 1;
 	}
 	return 0;
-- 
1.7.3.2

^ permalink raw reply related

* Re: [PATCH] ipv4: fix a bug in SRR option matching.
From: Li Wei @ 2011-11-09  7:37 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20111108.120621.691425261290061620.davem@davemloft.net>

> From: Li Wei <lw@cn.fujitsu.com>
> Date: Tue, 08 Nov 2011 15:56:40 +0800
> 
>> Since commit 7be799a7 (ipv4: Remove rt->rt_dst reference from
>> ip_forward_options()) and commit 0374d9ce (ipv4: Kill spurious
>> write to iph->daddr in ip_forward_options()) we use iph->daddr
>> for SRR option matching and assume iph->daddr equals to rt->rt_dst,
>> Unfortunately skb_rtable(skb) has been updated in ip_options_rcv_srr()
>> for the nexthop in SRR option but iph->daddr *not* updated,
>> We should use the updated rt->rt_dst for SRR option matching
>> and update iph->daddr here.
>>
>> Signed-off-by: Li Wei <lw@cn.fujitsu.com>
> 
> Please replace this by whatever logic ip_options_rcv_srr() uses to
> determine the destination address.
> 
> I would strongly encourage you, when fixing bugs like this, to use
> as a hint the intentions of the commit which introduced the bug.  And
> try as hard as possible to retain the goals of the guilty commit.
> 
> In this case, that means not introducing references to rt->rt_dst
> back into the code.
> 
> Thank you.
> 
> 

Thank you for your advice. I reviewed the code again and think that, as you said
in commit def57687, "No matter what kind of header mangling occurs due to IP
options processing, rt->rt_dst will always equal iph->daddr in the packet",
iph->daddr in ip_options_rcv_srr() should be updated as well, since skb_rtable(skb)
has been updated for 'nexthop'. Then we can elide all rt->rt_dst references
in ip_forward() and ip_forward_options().

I will submit another patch to fix this bug.

^ permalink raw reply

* re: usbnet: fix oops in usbnet_start_xmit
From: Dan Carpenter @ 2011-11-09  7:34 UTC (permalink / raw)
  To: khlebnikov-GEFAQzZX7r8dnm+yROfE0A
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-usb-u79uwXL29TY76Z2rM5mHXA

Hello Konstantin Khlebnikov,

This is a semi-automatic email about new static checker warnings.

The patch 23ba07991dad: "usbnet: fix oops in usbnet_start_xmit" from 
Nov 7, 2011, leads to the following Smatch complaint:

drivers/net/usb/usbnet.c +1077 usbnet_start_xmit()
	 error: we previously assumed 'skb' could be null (see line 1060)

drivers/net/usb/usbnet.c
  1059	
  1060		if (skb)
                    ^^^
check introduced here.

  1061			skb_tx_timestamp(skb);
  1062	
  1063		// some devices want funky USB-level framing, for
  1064		// win32 driver (usually) and/or hardware quirks
  1065		if (info->tx_fixup) {
  1066			skb = info->tx_fixup (dev, skb, GFP_ATOMIC);
  1067			if (!skb) {
  1068				if (netif_msg_tx_err(dev)) {
  1069					netif_dbg(dev, tx_err, dev->net, "can't tx_fixup skb\n");
  1070					goto drop;
  1071				} else {
  1072					/* cdc_ncm collected packet; waits for more */
  1073					goto not_drop;
  1074				}
  1075			}
  1076		}
  1077		length = skb->len;
                         ^^^^^^^^
dereference without checking.

  1078	
  1079		if (!(urb = usb_alloc_urb (0, GFP_ATOMIC))) {

regards,
dan carpenter

--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: Add IPSec IP Range in Linux kernel
From: Daniil Stolnikov @ 2011-11-09  7:25 UTC (permalink / raw)
  To: Herbert Xu
  Cc: linux-kernel, netdev, linux-crypto, linux-security-module, davem,
	adobriyan, peter.p.waskiewicz.jr, davem
In-Reply-To: <20111109032729.GA11312@gondor.apana.org.au>

> Simple, you break a range policy into parts that can be expressed
> as network/mask and install multiple policies.  The actual policies
> in the kernel just have to have the same effect as the one you
> negotiated with the other side, it does not have to look the same.

> This is also why you can do the same thing with masks + netfilter.

> Cheers,


Please describe in detail, including the schema and sample configuration files!

The fact is that I have already tried some options, and they did not work. Here is a link to a Russian-language discussion of my problem: http://www.opennet.ru/openforum/vsluhforumID10/4941.html.

Once again, here is a diagram of what should be implemented:

                 (server)                                                        (client 1)
                 _______                                                          _______
                 |      |                                                        |      |---192.168.7.1
                 |      |                                                        |      |---192.168.7.2
192.168.1.0/24---|      |192.168.5.1/24----------------------------192.168.5.2/24|      |---192.168.7.3
                 |      |                       |                                |      |---192.168.7.4
                 |______|                       |                                |______|---192.168.7.5      
                                                |
                                                |                                 (client 2)
                                                |                                _______
                                                |                                |      |---192.168.7.6
                                                |                                |      |---192.168.7.7
                                                -------------------192.168.5.3/24|      |---192.168.7.8
                                                |                                |      |---192.168.7.9
                                                |                                |______|---192.168.7.10
                                                |
                                                |                                 (client 3)
                                                |                                _______
                                                |                                |      |---192.168.7.11
                                                |                                |      |---192.168.7.12
                                                -------------------192.168.5.4/24|      |---192.168.7.13
                                                                                 |      |---192.168.7.14
                                                                                 |______|---192.168.7.15


Tried to do the following:

1) network to host. The connection was of course established, but not all traffic was encrypted.
2) host to host. Same result as 1.
3) network to IP range. I tried different configurations, but the connection did not succeed.

So I came to the conclusion that when we specify an IP range in the ZyWALL IPSec settings, we actually prescribe what kind of traffic will be encrypted. The rest will be routed but not encrypted. Correct me if I'm wrong.
If it is not too much trouble, please post the configuration files!

^ permalink raw reply

* Re: Large file copy to NFS mounted directory causes delay in other application packets
From: Eric Dumazet @ 2011-11-09  6:24 UTC (permalink / raw)
  To: Manavalan Krishnan; +Cc: linux-kernel@vger.kernel.org, netdev
In-Reply-To: <1320816503.73813.YahooMailNeo@web160718.mail.bf1.yahoo.com>

Le mardi 08 novembre 2011 à 21:28 -0800, Manavalan Krishnan a écrit :
> Hi All
> 
> I have two systems with two network interfaces each(eth0 and eth1). I
> am running linux-HA (heartbeat daemon) on both the systems and they
> use eth0 for exchanging heartbeats. I have NFS mounted directory in
> one system and the NFS client uses the interface eth1. 
> 
> I try to copy a large file to the NFS mounted directory. But the heartbeat
> daemons miss the heartbeat packets from peers while the copy is in
> progress. I did a tcpdump and found that the heartbeat packets are
> delayed for a few seconds before being sent out on eth0. When I stop the file
> copy, the heartbeats are delivered properly. It seems the linux kernel
> is somehow giving priority to NFS packets (generated from the file copy)
> over other application packets.
> 
> Any thoughts on this behavior? Is there any way we can avoid this so
> that application packets get equal chance while large file copy to NFS
> mounted directory under progress?
> 
CC netdev

1) Is your NFS using UDP or TCP ?
2) Is your eth0 dedicated to heartbeats and eth1 to NFS traffic ?
3) How do you know heartbeats are delayed ?
4) Is your server CPU bound ?

Thanks

^ permalink raw reply

* Re: [PATCH 2/5] net/sunrpc: use kstrtoul, etc
From: Julia Lawall @ 2011-11-09  6:15 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: J. Bruce Fields, kernel-janitors, Neil Brown, Trond Myklebust,
	David S. Miller, linux-nfs, netdev, linux-kernel
In-Reply-To: <alpine.DEB.2.02.1111082113380.1880@hadrien>

In looking through some examples, I see, e.g.:

 	if (strict_strtoul(buf, 10, &val) < 0)
  		return -EINVAL;
 	if (val < 1 || val > 2)
  		return -EINVAL;

In this case the only valid values are 1 and 2, which are much smaller 
than the u8 range.  Is it useful to use kstrtou8 anyway?  I see that 
kstrtou8 returns -ERANGE not -EINVAL when the value is out of bounds.  If 
kstrtou8 is to be used, should the subsequent if (val < 1 || val > 2) now 
return -ERANGE to be consistent?

julia

^ permalink raw reply

* Re: [PATCH] neigh: replace unres_qlen by unres_qlen_bytes
From: David Miller @ 2011-11-09  5:18 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1320797656.26025.43.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 09 Nov 2011 01:14:16 +0100

> unres_qlen is the number of frames we are able to queue per unresolved
> neighbour. Its default value (3) was never changed and is responsible
> for strange drops, especially if IP fragments are used, or multiple
> sessions start in parallel. TCP initial congestion window is now bigger
> than 3.

BTW, it has been observed in practice that if a long living connection
suddenly sends a burst of traffic after a very long idle period
(hitting ARP expiry) or something invalidates the ARP entry in use, we
will drop frames.  Because even if the ARP reply comes "fast" it's
never quick enough to beat the burst of frames.

And if this happens in a scenario where such lost packets potentially
mean lost money...

^ permalink raw reply

* Re: Add IPSec IP Range in Linux kernel
From: Herbert Xu @ 2011-11-09  3:27 UTC (permalink / raw)
  To: Daniil Stolnikov
  Cc: davem, linux-kernel, netdev, linux-crypto, linux-security-module,
	adobriyan, peter.p.waskiewicz.jr
In-Reply-To: <552673196.20111109103207@mail.ru>

Daniil Stolnikov <danila.st@mail.ru> wrote:
>> Like I said, if you want address ranges, ask the userland IPSEC daemon
>> authors to synthesize it.
> 
> In this message on the strongswan-users mailing list, http://marc.info/?l=strongswan-users&m=130613736616488&w=4, they say that their product has support for IP ranges, but the Linux stack is based on network masks. So I do not understand how this would work without support at the kernel level. How will the policies be coordinated?

Simple, you break a range policy into parts that can be expressed
as network/mask and install multiple policies.  The actual policies
in the kernel just have to have the same effect as the one you
negotiated with the other side, it does not have to look the same.

This is also why you can do the same thing with masks + netfilter.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox