Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 4/7] neigh: Do not set tbl->entry_size in ipv4/ipv6 neigh tables.
From: David Miller @ 2011-12-01  3:41 UTC (permalink / raw)
  To: netdev


Let the core self-size the neigh entry based upon the key length.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/atm/clip.c   |    1 -
 net/ipv4/arp.c   |    1 -
 net/ipv6/ndisc.c |    1 -
 3 files changed, 0 insertions(+), 3 deletions(-)

diff --git a/net/atm/clip.c b/net/atm/clip.c
index aea7cad..b1c7ada 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -322,7 +322,6 @@ static u32 clip_hash(const void *pkey, const struct net_device *dev, __u32 rnd)
 
 static struct neigh_table clip_tbl = {
 	.family 	= AF_INET,
-	.entry_size 	= sizeof(struct neighbour)+sizeof(struct atmarp_entry),
 	.key_len 	= 4,
 	.hash 		= clip_hash,
 	.constructor 	= clip_constructor,
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 5c29ac5..fd4b3e8 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -164,7 +164,6 @@ static const struct neigh_ops arp_broken_ops = {
 
 struct neigh_table arp_tbl = {
 	.family		= AF_INET,
-	.entry_size	= sizeof(struct neighbour) + 4,
 	.key_len	= 4,
 	.hash		= arp_hash,
 	.constructor	= arp_constructor,
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 2854705..cfb9709 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -126,7 +126,6 @@ static const struct neigh_ops ndisc_direct_ops = {
 
 struct neigh_table nd_tbl = {
 	.family =	AF_INET6,
-	.entry_size =	sizeof(struct neighbour) + sizeof(struct in6_addr),
 	.key_len =	sizeof(struct in6_addr),
 	.hash =		ndisc_hash,
 	.constructor =	ndisc_constructor,
-- 
1.7.6.4

^ permalink raw reply related

* [PATCH 3/7] neigh: Add infrastructure for allocating device neigh privates.
From: David Miller @ 2011-12-01  3:41 UTC (permalink / raw)
  To: netdev


netdev->neigh_priv_len records the private area length.

This will trigger for neigh_table objects which set tbl->entry_size
to zero, and the first instances of this will be forthcoming.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/ulp/ipoib/ipoib_main.c |    2 ++
 include/linux/netdevice.h                 |    1 +
 net/atm/clip.c                            |    1 +
 net/core/neighbour.c                      |   14 +++++++++++---
 4 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index efd7a96..57ae9b9 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1218,6 +1218,8 @@ static struct net_device *ipoib_add_port(const char *format,
 	priv->dev->mtu  = IPOIB_UD_MTU(priv->max_ib_mtu);
 	priv->mcast_mtu  = priv->admin_mtu = priv->dev->mtu;
 
+	priv->dev->neigh_priv_len = sizeof(struct ipoib_neigh);
+
 	result = ib_query_pkey(hca, port, 0, &priv->pkey);
 	if (result) {
 		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 97edb32..5462c2c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1080,6 +1080,7 @@ struct net_device {
 	unsigned char		perm_addr[MAX_ADDR_LEN]; /* permanent hw address */
 	unsigned char		addr_assign_type; /* hw address assignment type */
 	unsigned char		addr_len;	/* hardware address length	*/
+	unsigned char		neigh_priv_len;
 	unsigned short          dev_id;		/* for shared network cards */
 
 	spinlock_t		addr_list_lock;
diff --git a/net/atm/clip.c b/net/atm/clip.c
index 11439a7..aea7cad 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -535,6 +535,7 @@ static void clip_setup(struct net_device *dev)
 {
 	dev->netdev_ops = &clip_netdev_ops;
 	dev->type = ARPHRD_ATM;
+	dev->neigh_priv_len = sizeof(struct atmarp_entry);
 	dev->hard_header_len = RFC1483LLC_LEN;
 	dev->mtu = RFC1626_MTU;
 	dev->tx_queue_len = 100;	/* "normal" queue (packets) */
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 661ad12..ef750ff 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -273,7 +273,7 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
 }
 EXPORT_SYMBOL(neigh_ifdown);
 
-static struct neighbour *neigh_alloc(struct neigh_table *tbl)
+static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)
 {
 	struct neighbour *n = NULL;
 	unsigned long now = jiffies;
@@ -288,7 +288,15 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl)
 			goto out_entries;
 	}
 
-	n = kzalloc(tbl->entry_size, GFP_ATOMIC);
+	if (tbl->entry_size)
+		n = kzalloc(tbl->entry_size, GFP_ATOMIC);
+	else {
+		int sz = sizeof(*n) + tbl->key_len;
+
+		sz = ALIGN(sz, NEIGH_PRIV_ALIGN);
+		sz += dev->neigh_priv_len;
+		n = kzalloc(sz, GFP_ATOMIC);
+	}
 	if (!n)
 		goto out_entries;
 
@@ -463,7 +471,7 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
 	u32 hash_val;
 	int key_len = tbl->key_len;
 	int error;
-	struct neighbour *n1, *rc, *n = neigh_alloc(tbl);
+	struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev);
 	struct neigh_hash_table *nht;
 
 	if (!n) {
-- 
1.7.6.4

^ permalink raw reply related

* [PATCH 2/7] neigh: Get rid of neigh_table->kmem_cachep
From: David Miller @ 2011-12-01  3:41 UTC (permalink / raw)
  To: netdev


We are going to alloc for device specific private areas for
neighbour entries, and in order to do that we have to move
away from the fixed allocation size enforced by using
neigh_table->kmem_cachep

As a nice side effect we can now use kfree_rcu().

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/neighbour.h |    1 -
 net/core/neighbour.c    |   18 ++----------------
 2 files changed, 2 insertions(+), 17 deletions(-)

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 87c0e5c..e31f0a8 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -173,7 +173,6 @@ struct neigh_table {
 	atomic_t		entries;
 	rwlock_t		lock;
 	unsigned long		last_rand;
-	struct kmem_cache	*kmem_cachep;
 	struct neigh_statistics	__percpu *stats;
 	struct neigh_hash_table __rcu *nht;
 	struct pneigh_entry	**phash_buckets;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 27d3fef..661ad12 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -288,7 +288,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl)
 			goto out_entries;
 	}
 
-	n = kmem_cache_zalloc(tbl->kmem_cachep, GFP_ATOMIC);
+	n = kzalloc(tbl->entry_size, GFP_ATOMIC);
 	if (!n)
 		goto out_entries;
 
@@ -678,12 +678,6 @@ static inline void neigh_parms_put(struct neigh_parms *parms)
 		neigh_parms_destroy(parms);
 }
 
-static void neigh_destroy_rcu(struct rcu_head *head)
-{
-	struct neighbour *neigh = container_of(head, struct neighbour, rcu);
-
-	kmem_cache_free(neigh->tbl->kmem_cachep, neigh);
-}
 /*
  *	neighbour must already be out of the table;
  *
@@ -711,7 +705,7 @@ void neigh_destroy(struct neighbour *neigh)
 	NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh);
 
 	atomic_dec(&neigh->tbl->entries);
-	call_rcu(&neigh->rcu, neigh_destroy_rcu);
+	kfree_rcu(neigh, rcu);
 }
 EXPORT_SYMBOL(neigh_destroy);
 
@@ -1486,11 +1480,6 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl)
 	tbl->parms.reachable_time =
 			  neigh_rand_reach_time(tbl->parms.base_reachable_time);
 
-	if (!tbl->kmem_cachep)
-		tbl->kmem_cachep =
-			kmem_cache_create(tbl->id, tbl->entry_size, 0,
-					  SLAB_HWCACHE_ALIGN|SLAB_PANIC,
-					  NULL);
 	tbl->stats = alloc_percpu(struct neigh_statistics);
 	if (!tbl->stats)
 		panic("cannot create neighbour cache statistics");
@@ -1575,9 +1564,6 @@ int neigh_table_clear(struct neigh_table *tbl)
 	free_percpu(tbl->stats);
 	tbl->stats = NULL;
 
-	kmem_cache_destroy(tbl->kmem_cachep);
-	tbl->kmem_cachep = NULL;
-
 	return 0;
 }
 EXPORT_SYMBOL(neigh_table_clear);
-- 
1.7.6.4

^ permalink raw reply related

* [PATCH 1/7] neigh: Create mechanism for generic neigh private areas.
From: David Miller @ 2011-12-01  3:41 UTC (permalink / raw)
  To: netdev


The implementation private sits right after the primary_key memory.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/neighbour.h |    7 +++++++
 1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 7ae5acf..87c0e5c 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -179,6 +179,13 @@ struct neigh_table {
 	struct pneigh_entry	**phash_buckets;
 };
 
+#define NEIGH_PRIV_ALIGN	sizeof(long long)
+
+static inline void *neighbour_priv(const struct neighbour *n)
+{
+	return (char *)n + ALIGN(sizeof(*n) + n->tbl->key_len, NEIGH_PRIV_ALIGN);
+}
+
 /* flags for neigh_update() */
 #define NEIGH_UPDATE_F_OVERRIDE			0x00000001
 #define NEIGH_UPDATE_F_WEAK_OVERRIDE		0x00000002
-- 
1.7.6.4

^ permalink raw reply related

* [PATCH 0/7] Respin of dynamic neigh patches.
From: David Miller @ 2011-12-01  3:41 UTC (permalink / raw)
  To: netdev

Just is just a respin of the neighbour layer patches I posted some
time ago which allows us to get rid of the ATM specific code in
net/ipv4/route.c

Committed to net-next

^ permalink raw reply

* Re: [PATCH] netconsole: implement ipv4 tos support
From: David Miller @ 2011-12-01  3:37 UTC (permalink / raw)
  To: shemminger; +Cc: zenczykowski, maze, netdev
In-Reply-To: <20111130173435.5e63afda@nehalam.linuxnetplumber.net>

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Wed, 30 Nov 2011 17:34:35 -0800

> On Wed, 30 Nov 2011 17:18:50 -0800
> Maciej Żenczykowski <zenczykowski@gmail.com> wrote:
> 
>> From: Maciej Żenczykowski <maze@google.com>
>> 
>> Signed-off-by: Maciej Żenczykowski <maze@google.com>
> 
> Why make it an option? TOS should always be set to interactive
> traffic (like telnet and slogin)

Next they will ask for TTL as well.

Nope, sorry, we'll not be turning netconsole into a cluster-f*ck of
every configuration option anyone can come up with.

^ permalink raw reply

* Re: Is there any necessary to add multicast route for a loopback device?
From: David Miller @ 2011-12-01  3:35 UTC (permalink / raw)
  To: lw; +Cc: netdev
In-Reply-To: <4ED6E6A8.3080702@cn.fujitsu.com>

From: Li Wei <lw@cn.fujitsu.com>
Date: Thu, 01 Dec 2011 10:30:00 +0800

> what's your opinion?

I have many higher priority tasks than this issue, so it is a poor
idea to try and force me to look into this.

If no other developer cares to answer, maybe it isn't all that
important.

^ permalink raw reply

* Re: [PATCH v3 net-next 2/2] netem: add cell concept to simulate special MAC behavior
From: Eric Dumazet @ 2011-12-01  3:30 UTC (permalink / raw)
  To: Hagen Paul Pfeifer; +Cc: netdev, Stephen Hemminger
In-Reply-To: <1322691627-20551-2-git-send-email-hagen@jauu.net>

Le mercredi 30 novembre 2011 à 23:20 +0100, Hagen Paul Pfeifer a écrit :
> This extension can be used to simulate special link layer
> characteristics. Simulate because packet data is not modified, only the
> calculation base is changed to delay a packet based on the original
> packet size and artificial cell information.
> 
> packet_overhead can be used to simulate a link layer header compression
> scheme (e.g. set packet_overhead to -20) or with a positive
> packet_overhead value an additional MAC header can be simulated. It is
> also possible to "replace" the 14 byte Ethernet header with something
> else.
> 
> cell_size and cell_overhead can be used to simulate link layer schemes,
> based on cells, like some TDMA schemes. Another application area are MAC
> schemes using a link layer fragmentation with a (small) header each.
> Cell size is the maximum amount of data bytes within one cell. Cell
> overhead is an additional variable to change the per-cell-overhead (e.g.
> 5 byte header per fragment).
> 
> Example (5 kbit/s, 20 byte per packet overhead, cell-size 100 byte, per
> cell overhead 5 byte):
> 
> 	tc qdisc add dev eth0 root netem rate 5kbit 20 100 5
> 
> Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net>
> ---
> 
> The actual version of packet_len_2_sched_time() address Eric's div/mod
> instruction concerns. I benchmarked the version in the patch with the
> following version:
> 
> 
> 	if (q->cell_size) {
> 		u32 mod_carry = len % q->cell_size;
> 		u32 cells     = len / q->cell_size;
> 		if (mod_carry)
> 			mod_carry = (len > q->cell_size || !cells) ?
> 				q->cell_size - mod_carry : len - mod_carry;
> 
> 		if (q->cell_overhead) {
> 			if (mod_carry)
> 				++cells;
> 			len += cells * q->cell_overhead;
> 		}
> 		len += mod_carry;
> 	}
> 	return len;
> 
> 
> The patch version is a little bit faster for "all" packet sizes. For common
> cases (e.g. max. 1000 byte packets, cellsize 100 byte, the patch version
> exhibit significant improvements). IMHO the actual version is also more
> understandable. Replace div and mod by do_div() was not that successful.
> 
> 
>  include/linux/pkt_sched.h |    3 +++
>  net/sched/sch_netem.c     |   32 +++++++++++++++++++++++++++++---
>  2 files changed, 32 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
> index 26c37ca..63845cf 100644
> --- a/include/linux/pkt_sched.h
> +++ b/include/linux/pkt_sched.h
> @@ -498,6 +498,9 @@ struct tc_netem_corrupt {
>  
>  struct tc_netem_rate {
>  	__u32	rate;	/* byte/s */
> +	__s32   packet_overhead;
> +	__u32   cell_size;
> +	__s32   cell_overhead;
>  };
>  
>  enum {
> diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
> index 9b7af9f..bcd2b3f 100644
> --- a/net/sched/sch_netem.c
> +++ b/net/sched/sch_netem.c
> @@ -80,6 +80,9 @@ struct netem_sched_data {
>  	u32 reorder;
>  	u32 corrupt;
>  	u32 rate;
> +	s32 packet_overhead;
> +	u32 cell_size;
> +	s32 cell_overhead;
>  
>  	struct crndstate {
>  		u32 last;
> @@ -299,9 +302,26 @@ static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
>  	return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
>  }
>  
> -static psched_time_t packet_len_2_sched_time(unsigned int len, u32 rate)
> +static psched_time_t packet_len_2_sched_time(unsigned int len,
> +					     struct netem_sched_data *q)
>  {
> -	return PSCHED_NS2TICKS((u64)len * NSEC_PER_SEC / rate);
> +	u32 cells = 0;
> +	u32 datalen;
> +
> +	len += q->packet_overhead;
> +
> +	if (q->cell_size) {
> +		for (datalen = len; datalen >  q->cell_size; datalen -= q->cell_size)
> +			cells++;

Oh well.. you can exit this loop with data len = q->cell_size


Hmm, take a look at reciprocal divide ...

(include/linux/reciprocal_div.h)


Instead of :

u32 cells     = len / q->cell_size;

You set once q->cell_size_reciprocal = reciprocal_value(q->cell_size);
(in Qdisc init)

Then you do :

cells = reciprocal_divide(len, q->cell_size_reciprocal);

Thats a multiply instead of a divide. On many cpus thats a lot faster.

Think about a super packet (TSO) of 65000 bytes and cell_size=64

> +
> +		if (q->cell_overhead)
> +			len += cells * q->cell_overhead;
> +
> +		if (datalen)
> +			len += (q->cell_size - datalen);
> +	}
> +
> +	return PSCHED_NS2TICKS((u64)len * NSEC_PER_SEC / q->rate);
>  }
>  
>  /*
> @@ -381,7 +401,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
>  		if (q->rate) {
>  			struct sk_buff_head *list = &q->qdisc->q;
>  
> -			delay += packet_len_2_sched_time(skb->len, q->rate);
> +			delay += packet_len_2_sched_time(skb->len, q);
>  
>  			if (!skb_queue_empty(list)) {
>  				/*
> @@ -565,6 +585,9 @@ static void get_rate(struct Qdisc *sch, const struct nlattr *attr)
>  	const struct tc_netem_rate *r = nla_data(attr);
>  
>  	q->rate = r->rate;
> +	q->packet_overhead = r->packet_overhead;
> +	q->cell_size       = r->cell_size;
> +	q->cell_overhead   = r->cell_overhead;
>  }
>  
>  static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
> @@ -906,6 +929,9 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
>  	NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);
>  
>  	rate.rate = q->rate;
> +	rate.packet_overhead = q->packet_overhead;
> +	rate.cell_size       = q->cell_size;
> +	rate.cell_overhead   = q->cell_overhead;
>  	NLA_PUT(skb, TCA_NETEM_RATE, sizeof(rate), &rate);
>  
>  	if (dump_loss_model(q, skb) != 0)

^ permalink raw reply

* [PATCH] net/core: fix rollback handler in register_netdevice_notifier
From: roy.qing.li @ 2011-12-01  2:37 UTC (permalink / raw)
  To: netdev

From: RongQing.Li <roy.qing.li@gmail.com>

Within nested statements, the break statement terminates only the
do, for, switch, or while statement that immediately encloses it,
So replace the break with goto.

Signed-off-by: RongQing.Li <roy.qing.li@gmail.com>
---
 net/core/dev.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 278463e..88876fa 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1387,7 +1387,7 @@ rollback:
 	for_each_net(net) {
 		for_each_netdev(net, dev) {
 			if (dev == last)
-				break;
+				goto outroll;
 
 			if (dev->flags & IFF_UP) {
 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
@@ -1398,6 +1398,7 @@ rollback:
 		}
 	}
 
+outroll:
 	raw_notifier_chain_unregister(&netdev_chain, nb);
 	goto unlock;
 }
-- 
1.7.1

^ permalink raw reply related

* Re: Is there any necessary to add multicast route for a loopback device?
From: Li Wei @ 2011-12-01  2:30 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev
In-Reply-To: <4ED5DD7D.10908@cn.fujitsu.com>

Hi David,

what's your opinion?

> Hi all,
> 
> I found that since Fedora 15, it use systemd as the 'init'. When systemd start 
> it would config the ipv6 address(::1/128) for 'lo' and start it. While adding 
> this address to 'lo', kernel will call addrconf_add_mroute() to set a multicast
> route for 'lo' with rt->dst.error = -ENETUNREACH. After that, when I send multi-
> cast message, the route subsystem return a route with .error set to NETUNREACH.
> 
> Is there any necessary to add multicast route for a loopback device? 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 

^ permalink raw reply

* Re: [PATCH] netconsole: implement ipv4 tos support
From: Stephen Hemminger @ 2011-12-01  2:27 UTC (permalink / raw)
  To: Maciej Żenczykowski; +Cc: netdev
In-Reply-To: <CANP3RGfwUr3ho3Tz_cfVz4CBJWMktOb7MDbUYYXz7XfjErw7Zg@mail.gmail.com>

On Wed, 30 Nov 2011 18:11:32 -0800
Maciej Żenczykowski <zenczykowski@gmail.com> wrote:

> > Why make it an option? TOS should always be set to interactive
> > traffic (like telnet and slogin)
> 
> Because 'interactive' doesn't mean anything.  You don't know what
> value of tos defines 'interactive' traffic on my network.
> I may also consider network console/debug/dump/etc traffic less
> important then say serving web traffic, and thus not even want it to
> be considered 'interactive' in the first place.
> 
> TOS values are relevant within a LAN/WAN/Organization/AS, but are not
> relevant internet-wide.
> Almost all AS-boundary gateways/routers will do TOS remarking to their
> own internal specifications.
> 
> - Maciej

Giving the user choice in general is good, but network configuration
is already confusing enough.

Although interpretation of TOS is per organization, in practice the
values are standardized in places like RFC4594. For example, routing protocols
all use IPTOS_PREC_INTERNETCONTROL = 0xc0

^ permalink raw reply

* Re: [PATCH] netconsole: implement ipv4 tos support
From: Maciej Żenczykowski @ 2011-12-01  2:11 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev
In-Reply-To: <20111130173435.5e63afda@nehalam.linuxnetplumber.net>

> Why make it an option? TOS should always be set to interactive
> traffic (like telnet and slogin)

Because 'interactive' doesn't mean anything.  You don't know what
value of tos defines 'interactive' traffic on my network.
I may also consider network console/debug/dump/etc traffic less
important then say serving web traffic, and thus not even want it to
be considered 'interactive' in the first place.

TOS values are relevant within a LAN/WAN/Organization/AS, but are not
relevant internet-wide.
Almost all AS-boundary gateways/routers will do TOS remarking to their
own internal specifications.

- Maciej

^ permalink raw reply

* Bug in computing data_len in tcp_sendmsg?
From: Tom Herbert @ 2011-12-01  1:48 UTC (permalink / raw)
  To: Linux Netdev List; +Cc: Eric Dumazet, David Miller

I believe that skb->data_len might no be computed correctly in
tcp_sendmsg.  Specifically, when skb_add_data_nocache (or
skb_add_data) is called skb->data_len is not updated (skb_put only
updates skb->len).  This results in the datalen in the head skbuf
being zero so any subsequent uses of the value lead to incorrect
results.  For instance, skb_headlen returns the length of the head
skbu data and not just that of the headers.  If I'm reading this
correctly, it's a pretty fundamental bug.

I don't have a fix for this yet.

Tom

^ permalink raw reply

* Re: [PATCH] netconsole: implement ipv4 tos support
From: Stephen Hemminger @ 2011-12-01  1:34 UTC (permalink / raw)
  To: Maciej Żenczykowski; +Cc: Maciej Żenczykowski, netdev
In-Reply-To: <1322702330-31325-1-git-send-email-zenczykowski@gmail.com>

On Wed, 30 Nov 2011 17:18:50 -0800
Maciej Żenczykowski <zenczykowski@gmail.com> wrote:

> From: Maciej Żenczykowski <maze@google.com>
> 
> Signed-off-by: Maciej Żenczykowski <maze@google.com>

Why make it an option? TOS should always be set to interactive
traffic (like telnet and slogin)

^ permalink raw reply

* [PATCH] netconsole: implement ipv4 tos support
From: Maciej Żenczykowski @ 2011-12-01  1:18 UTC (permalink / raw)
  To: Maciej Żenczykowski; +Cc: netdev, Maciej Żenczykowski

From: Maciej Żenczykowski <maze@google.com>

Signed-off-by: Maciej Żenczykowski <maze@google.com>
---
 Documentation/networking/netconsole.txt |    1 +
 drivers/net/netconsole.c                |   28 ++++++++++++++++++++++++++++
 include/linux/netpoll.h                 |    1 +
 net/core/netpoll.c                      |    4 +++-
 4 files changed, 33 insertions(+), 1 deletions(-)

diff --git a/Documentation/networking/netconsole.txt b/Documentation/networking/netconsole.txt
index 8d02207..a02374f 100644
--- a/Documentation/networking/netconsole.txt
+++ b/Documentation/networking/netconsole.txt
@@ -94,6 +94,7 @@ The interface exposes these parameters of a netconsole target to userspace:
 	remote_ip	Remote agent's IP address		(read-write)
 	local_mac	Local interface's MAC address		(read-only)
 	remote_mac	Remote agent's MAC address		(read-write)
+	tos		TOS byte value to utilize		(read-write)
 
 The "enabled" attribute is also used to control whether the parameters of
 a target can be updated or not -- you can modify the parameters of only
diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c
index e888202..d24d5df 100644
--- a/drivers/net/netconsole.c
+++ b/drivers/net/netconsole.c
@@ -90,6 +90,7 @@ static DEFINE_SPINLOCK(target_list_lock);
  *		remote_ip	(read-write)
  *		local_mac	(read-only)
  *		remote_mac	(read-write)
+ *		tos		(read-write)
  */
 struct netconsole_target {
 	struct list_head	list;
@@ -221,6 +222,7 @@ static void free_param_target(struct netconsole_target *nt)
  *				|	remote_ip
  *				|	local_mac
  *				|	remote_mac
+ *				|	tos
  *				|
  *				<target>/...
  */
@@ -288,6 +290,11 @@ static ssize_t show_remote_mac(struct netconsole_target *nt, char *buf)
 	return snprintf(buf, PAGE_SIZE, "%pM\n", nt->np.remote_mac);
 }
 
+static ssize_t show_tos(struct netconsole_target *nt, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", nt->np.tos);
+}
+
 /*
  * This one is special -- targets created through the configfs interface
  * are not enabled (and the corresponding netpoll activated) by default.
@@ -451,6 +458,25 @@ static ssize_t store_remote_mac(struct netconsole_target *nt,
 	return strnlen(buf, count);
 }
 
+static ssize_t store_tos(struct netconsole_target *nt,
+			 const char *buf,
+			 size_t count)
+{
+	int rv;
+
+	if (nt->enabled) {
+		printk(KERN_ERR "netconsole: target (%s) is enabled, "
+				"disable to update parameters\n",
+				config_item_name(&nt->item));
+		return -EINVAL;
+	}
+
+	rv = kstrtou8(buf, 10, &nt->np.tos);
+	if (rv < 0)
+		return rv;
+	return strnlen(buf, count);
+}
+
 /*
  * Attribute definitions for netconsole_target.
  */
@@ -471,6 +497,7 @@ NETCONSOLE_TARGET_ATTR_RW(local_ip);
 NETCONSOLE_TARGET_ATTR_RW(remote_ip);
 NETCONSOLE_TARGET_ATTR_RO(local_mac);
 NETCONSOLE_TARGET_ATTR_RW(remote_mac);
+NETCONSOLE_TARGET_ATTR_RW(tos);
 
 static struct configfs_attribute *netconsole_target_attrs[] = {
 	&netconsole_target_enabled.attr,
@@ -481,6 +508,7 @@ static struct configfs_attribute *netconsole_target_attrs[] = {
 	&netconsole_target_remote_ip.attr,
 	&netconsole_target_local_mac.attr,
 	&netconsole_target_remote_mac.attr,
+	&netconsole_target_tos.attr,
 	NULL,
 };
 
diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index 5dfa091..d659c8b 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -21,6 +21,7 @@ struct netpoll {
 	__be32 local_ip, remote_ip;
 	u16 local_port, remote_port;
 	u8 remote_mac[ETH_ALEN];
+	u8 tos;
 
 	struct list_head rx; /* rx_np list element */
 };
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 0d38808..b01ce07 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -388,7 +388,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
 
 	/* iph->version = 4; iph->ihl = 5; */
 	put_unaligned(0x45, (unsigned char *)iph);
-	iph->tos      = 0;
+	iph->tos      = np->tos;
 	put_unaligned(htons(ip_len), &(iph->tot_len));
 	iph->id       = 0;
 	iph->frag_off = 0;
@@ -639,6 +639,8 @@ void netpoll_print_options(struct netpoll *np)
 			 np->name, &np->remote_ip);
 	printk(KERN_INFO "%s: remote ethernet address %pM\n",
 	                 np->name, np->remote_mac);
+	printk(KERN_INFO "%s: tos value %d (0x%02x)\n",
+	                 np->name, np->tos, np->tos);
 }
 EXPORT_SYMBOL(netpoll_print_options);
 
-- 
1.7.3.1

^ permalink raw reply related

* Re: [PATCH v4 06/10] e1000e: Support for byte queue limits
From: Tom Herbert @ 2011-12-01  1:04 UTC (permalink / raw)
  To: Jesse Brandeburg
  Cc: davem@davemloft.net, netdev@vger.kernel.org, Eric Dumazet
In-Reply-To: <20111129130137.000009ce@unknown>

> whats wrong with using total_tx_bytes or buffer_info->bytecount?  it
> contains the "bytes on the wire" value which will be slightly larger
> than skb->len, but avoids warming the skb->len cacheline unnecessarily.
>

This makes sense.  I made the changes, but the limits computed are
much higher than with using sbk->len.  I suspect there may be a bug in
GSO path.

For instance, in the driver at:

  bytecount = ((segs - 1) * skb_headlen(skb)) + skb->len;

I see cases like:

  segs=34, skb_header_len(skb)=70, skb->len=49298 so bytecount=51608

Which seems reasonable... but, I also see things like:

  segs=45, skb_header_len(skb)=1522, skb->len=65226, so bytecount= 132194
                                                      ^^^^
Which doesn't seem right at all.  I am thinking there may be a bug
setting skb->data_len improperly.

Tom

^ permalink raw reply

* Re: [v4 PATCH 1/2] NETFILTER module xt_hmark, new target for HASH based fwmark
From: Hans Schillstrom @ 2011-12-01  0:52 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: Patrick McHardy, jengelh, netfilter-devel, netdev,
	hans.schillstrom
In-Reply-To: <20111130182815.GC20336@1984>


On Wednesday, November 30, 2011 19:28:15 Pablo Neira Ayuso wrote:
> On Wed, Nov 30, 2011 at 04:27:26PM +0100, Patrick McHardy wrote:
> > On 11/28/2011 10:36 AM, Hans Schillstrom wrote:
> > >>If you don't want to use conntrack in your setup and you want to handle
> > >>fragments, then you have to configure HMARK to calculate the hashing
> > >>based on the network addresses. If you want to fully support fragments,
> > >>then enable conntrack and you can configure HMARK to calculate the
> > >>hashing based on network address + transport bits.
> > >>
> > >>Fix this by removing the fragmentation handling, then assume that
> > >>people can select between two hashing configuration for HMARK. One
> > >>based for network address which is fragment-safe, one that uses the
> > >>transport layer information, that requires conntrack. Otherwise, I
> > >>don't see a sane way to handle this situation.
> > >Correct me if I'm wrong here,
> > >If conntrack is enabled hmark don't see the packet until it is reassembled and
> > >in that case the fragmentation header is removed.
> > >
> > >So, with conntrack HMARK will operate on full packets not fragments
> > >without conntrack ports will not be used on any fragment
> > 
> > Correct.
> 
> To complete what Patrick said. They are collected but not linearized.
> That's why you have to use skb_header_pointer.
OK, thanks
I'll will do that.

> 
> > You don't necessarily need conntrack for defragmentation though,
> > we've moved defragmentation to a seperate module for TPROXY. You
> > can depend on that and get defragmentation without full
> > connection tracking.
> 
> Indeed, I missed this. That way you can skip conntrack but solving the
> broken fragments handling.

In a cluster with inputs on many different blades (with a Virtual IP address) you can
receive the fragments on different blades and in that case there should not be any
defrag in that node. HMARK will just produce the same fw-mark for all fragments.

In our case defrag will happens in next hop i.e. before going into IPVS.

As mention earlier, this needs to be documented better.
I will add this to the man page.

Thanks
Hans

^ permalink raw reply

* Re: [v4 PATCH 1/2] NETFILTER module xt_hmark, new target for HASH based fwmark
From: Hans Schillstrom @ 2011-12-01  0:25 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: pablo, jengelh, netfilter-devel, netdev, hans.schillstrom
In-Reply-To: <4ED65107.1040309@trash.net>

On Wednesday, November 30, 2011 16:51:35 Patrick McHardy wrote:
> On 11/25/2011 10:36 AM, Hans Schillstrom wrote:
> > +__u32 hmark_v6(struct sk_buff *skb, const struct xt_action_param *par)
> > +{
> > +	struct xt_hmark_info *info = (struct xt_hmark_info *)par->targinfo;
> > +	int nhoff, poff, hdrlen;
> > +	u32 addr1, addr2, hash;
> > +	struct ipv6hdr *ip6;
> > +	u8 nexthdr;
> > +	int frag = 0, ip6hdrlvl = 0;	/* Header level */
> > +	struct ipv6_opt_hdr _hdr, *hp;
> > +	union {
> > +		u32 v32;
> > +		u16 v16[2];
> > +	} ports;
> > +
> > +	ports.v32 = 0;
> > +	nhoff = skb_network_offset(skb);
> > +
> > +hdr_new:
> > +	/* Get header info */
> > +	ip6 = (struct ipv6hdr *) (skb->data + nhoff);
> > +	nexthdr = ip6->nexthdr;
> > +	hdrlen = sizeof(struct ipv6hdr);
> > +	hp = skb_header_pointer(skb, nhoff + hdrlen, sizeof(_hdr),&_hdr);
> > +
> > +	while (nexthdr) {
> > +		switch (nexthdr) {
> > +		case IPPROTO_ICMPV6:
> > +			/* ICMP Error then move ptr to inner header */
> > +			if (get_inner6_hdr(skb,&nhoff, hdrlen)) {
> 
> This doesn't look right. You assume the ICMPv6 header is following
> the IPv6 header with any other headers in between. If there are
> other headers, hdrlen will contain the length of the last header.
 
RFC-4443  "Every ICMPv6 message is preceded by an IPv6 header and zero or more IPv6 extension headers."
hdrlen is actually previous header length in bytes, to be correct.
nhoff is the sum of processed headers.
So in case of an icmp the nhoff will be updated, and hdrlen preset to ipv6hdr size

> > +				ip6hdrlvl++;
> > +				if (!pskb_may_pull(skb, sizeof(_hdr) + nhoff))
> > +					return XT_CONTINUE;
> > +				goto hdr_new;
> > +			}
> > +			nhoff += hdrlen;
> > +			goto hdr_rdy;
> > +
> > +		case NEXTHDR_FRAGMENT:
> > +			if (!ip6hdrlvl) /* Do not use ports if fragmented */
> > +				frag = 1;
> 
> Shouldn't you also check for fragment offset == 0 here?

According to the RFC "Initialized to zero for transmission; ignored on reception"

> The fragment header also doesn't include the length, so
> using ipv6_optlen() below is incorrect.

True, it has a fixed size, of 8 octets
I'll fix that. 
(as long as it is zero it will work :-)

> > +			break;
> > +
> > +		/* End of hdr traversing cont. with ports and hash calc. */
> > +		case NEXTHDR_IPV6:	/* Do not process tunnels */
> 
> That comment looks misleading, you do seem to process them?

Ooops a "return XT_CONTINUE;" seems to be missing here.

> 
> > +		case NEXTHDR_TCP:
> > +		case NEXTHDR_UDP:
> > +		case NEXTHDR_ESP:
> > +		case NEXTHDR_AUTH:
> 
> Don't you want to use the port numbers if only authentication
> without encryption is used?

with esp or ah the SPI will be used instead of ports.
Useful or not I don't know since they are asymmetric in terms of a flow.

> 
> > +		case NEXTHDR_SCTP:
> > +		case NEXTHDR_NONE:	/* Last hdr of something unknown */
> > +			nhoff += hdrlen;
> > +			goto hdr_rdy;
> > +		default:
> > +			return XT_CONTINUE;
> > +		}
> > +		if (!hp)
> > +			return XT_CONTINUE;
> > +		nhoff += hdrlen;	/* eat current header */
> > +		nexthdr =  hp->nexthdr;	/* Next header */
> > +		hdrlen = ipv6_optlen(hp);
> > +		hp = skb_header_pointer(skb, nhoff + hdrlen, sizeof(_hdr),
> > +					&_hdr);
> > +
> > +		if (!pskb_may_pull(skb, nhoff))
> > +			return XT_CONTINUE;
> > +	}
> 
> And final question, why not simply use ipv6_skip_exthdr()?

problems with fragments...
But when looking into ipv6_skip_exthdr() again I realize that handling of 
NEXTHDR_HOP NEXTHDR_ROUTING, and NEXTHDR_DEST is wrong.

It think I need to rewrite this part a bit

just skip this headers;
	case NEXTHDR_HOP:
	case NEXTHDR_ROUTING:
	case NEXTHDR_DEST:
		break;

and exit on this:
	case NEXTHDR_IPV6:
	case	NEXTHDR_NONE:
	default:
		return XT_CONTINUE;	
> 
> > +hdr_rdy:
> > +...
> 
> 

Thanks
Hans

^ permalink raw reply

* Re: [net-next-2.6 PATCH 0/6 v4] macvlan: MAC Address filtering support for passthru mode
From: Chris Wright @ 2011-11-30 23:39 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: Chris Wright, Ben Hutchings, Greg Rose, Roopa Prabhu,
	netdev@vger.kernel.org, davem@davemloft.net,
	dragos.tatulea@gmail.com, kvm@vger.kernel.org, arnd@arndb.de,
	mst@redhat.com, mchan@broadcom.com, dwang2@cisco.com,
	shemminger@vyatta.com, eric.dumazet@gmail.com, kaber@trash.net,
	benve@cisco.com
In-Reply-To: <4ED6BCA6.8040701@us.ibm.com>

* Sridhar Samudrala (sri@us.ibm.com) wrote:
> On 11/30/2011 3:00 PM, Chris Wright wrote:
> >        physical port
> >              |
> >+------------+------------+
> >|         +-----+         |
> >|         | VEB |         |
> >|         +-----+         |
> >|        /   |   \        |
> >|       /    |    \       |
> >|      /     |     \      |
> >+-----+------+------+-----+
> >       |      |       |
> >      PF    VF 1    VF 2
> >      /       |       |
> >  +---+---+  VM4  +---+---+
> >  |  sw   |       |macvtap|
> >  | switch|       +---+---+
> >  +-+-+-+-+           |
> >    / | \            VM5
> >   /  |  \
> >VM1 VM2 VM3
> >
> >This has VMs 1-3 hanging of the PF via a linux bridge (traditional hv
> >switching), VM4 directly owning VF1 (pci device assignement), and VM5
> >indirectly owning VF2 (macvtap passthrough, that started this whole
> >thing).
> >
> >So, I'm understanding you saying that VM4 or VM4 sending a packet to VM1
> >goes in to VEB, out PF, and into linux bridging code, rigth?  At which
> >point the PF is in promiscuous mode (btw, same does not work if bridge is
> >attached to VF, at least for some VFs, due to lack of promiscuous mode).
> >
> >>Packets sent from a guest with a VF to the address of another guest with
> >>a VF need to be forwarded similarly, but the driver should be able to
> >>infer that from (3).
> >Right, and that works currently for the case where both guests are like
> >VM4, they directly own the VF via PCI device assignement.  But for VM4
> >to talk to VM5, VF3 is not in promiscuous mode and has a different MAC
> >address than VM5's vNIC.  If the embedded bridge does not learn, and
> >nobody programmed it to fwd frames for VM5 via VF3...
> I think you are referring to VF2. There is no VF3 in your picture.

*sigh*  (also meant 'VM4 or VM5' up above, not 'VM4 or VM4')...

> In macvtap passthru mode, VF2 will be set to the same mac address as VM5's
> MAC.  So VM4 should be be able to talk to VM5.

yes (i think macvtap in bridging or vepa mode w/ single VM has that issue,
not passthru)

> >I believe this is what Roopa's patch will allow.  The question now is
> >whether there's a better way to handle this?
> My understanding is that Roopa's patch will allow setting additional mac
> addresses to
> VM5 without the need to put VF5 in promiscous mode.

Thanks for your corrections Sridar.

cheers,
-chris

^ permalink raw reply

* Re: [net-next-2.6 PATCH 0/6 v4] macvlan: MAC Address filtering support for passthru mode
From: Sridhar Samudrala @ 2011-11-30 23:30 UTC (permalink / raw)
  To: Chris Wright
  Cc: Ben Hutchings, Greg Rose, Roopa Prabhu, netdev@vger.kernel.org,
	davem@davemloft.net, dragos.tatulea@gmail.com,
	kvm@vger.kernel.org, arnd@arndb.de, mst@redhat.com,
	mchan@broadcom.com, dwang2@cisco.com, shemminger@vyatta.com,
	eric.dumazet@gmail.com, kaber@trash.net, benve@cisco.com
In-Reply-To: <20111130230049.GE29071@x200.localdomain>

On 11/30/2011 3:00 PM, Chris Wright wrote:
> * Ben Hutchings (bhutchings@solarflare.com) wrote:
>> On Wed, 2011-11-30 at 13:04 -0800, Chris Wright wrote:
>>> I agree that it's confusing.  Couldn't you simplify your ascii art
>>> (hopefully removing hw assumptions about receive processing, and
>>> completely ignoring vlans for the moment) to something like:
>>>
>>>               |RX
>>>               v
>>> +------------+-------------+
>>> |     +------+--------+    |
>>> |     | RX MAC filter |    |
>>> |     |and port select|    |
>>> |     +---------------+    |
>>> |            /|\           |
>>> |           / | \   match 2|
>>> |          /  v  \         |
>>> |         /match  \        |
>>> |        /  1 |    \       |
>>> |       /     |     \      |
>>> |match /      |      \     |
>>> |  0  /       |       \    |
>>> |    v        |        v   |
>>> |    |        |        |   |
>>> +----+--------+--------+---+
>>>       |        |        |
>>>      PF       VF 1     VF 2
>>>
>>> And there's an unclear number of ways to update "RX MAC filter and port
>>> select" table.
>>>
>>> 1) PF ndo_set_mac_addr
>>> I expect that to be implicit to match 0.
>>>
>>> 2) PF ndo_set_rx_mode
>>> Less clear, but I'd still expect these to implicitly match 0
>>>
>>> 3) PF ndo_set_vf_mac
>>> I expect these to be an explicit match to VF N (given the interface
>>> specifices which VF's MAC is being programmed).
>> I'm not sure whether this is supposed to implicitly add to the MAC
>> filter or whether that has to be changed too.  That's the main
>> difference between my models (a) and (b).
> I see now.  I wasn't entirely clear on the difference before.  It's also
> going to be hw specific.  I think (Intel folks can verify) that the
> Intel SR-IOV devices have a single global unicast exact match table,
> for example.
>
>> There's also PF ndo_set_vf_vlan.
> Right, although I had mentioned I was trying to limit just to MAC
> filtering to simplify.
>
>>> 4) VF ndo_set_mac_addr
>>> This one may or may not be allowed (setting MAC+port if the VF is owned
>>> by a guest is likely not allowed), but would expect an implicit VF N.
>>>
>>> 5) VF ndo_set_rx_mode
>>> Same as 4) above.
>> So this is where we are today.
> Cool, good that we agree there.
>
>>> 6) PF or VF? ndo_set_rx_filter_addr
>>> The new proposal, which has an explicit VF, although when it's VF_SELF
>>> I'm not clear if this is just the same as 5) above?
>>>
>>> Have I missed anything?
>> Any physical port can be bridged to a mixture of guests with and without
>> their own VFs.  Packets sent from a guest with a VF to the address of a
>> guest without a VF need to be forwarded to the PF rather than the
>> physical port, but none of the drivers currently get to know about those
>> addresses.
> To clarify, do you mean something like this?
>
>         physical port
>               |
> +------------+------------+
> |         +-----+         |
> |         | VEB |         |
> |         +-----+         |
> |        /   |   \        |
> |       /    |    \       |
> |      /     |     \      |
> +-----+------+------+-----+
>        |      |       |
>       PF    VF 1    VF 2
>       /       |       |
>   +---+---+  VM4  +---+---+
>   |  sw   |       |macvtap|
>   | switch|       +---+---+
>   +-+-+-+-+           |
>     / | \            VM5
>    /  |  \
> VM1 VM2 VM3
>
> This has VMs 1-3 hanging of the PF via a linux bridge (traditional hv
> switching), VM4 directly owning VF1 (pci device assignement), and VM5
> indirectly owning VF2 (macvtap passthrough, that started this whole
> thing).
>
> So, I'm understanding you saying that VM4 or VM4 sending a packet to VM1
> goes in to VEB, out PF, and into linux bridging code, rigth?  At which
> point the PF is in promiscuous mode (btw, same does not work if bridge is
> attached to VF, at least for some VFs, due to lack of promiscuous mode).
>
>> Packets sent from a guest with a VF to the address of another guest with
>> a VF need to be forwarded similarly, but the driver should be able to
>> infer that from (3).
> Right, and that works currently for the case where both guests are like
> VM4, they directly own the VF via PCI device assignement.  But for VM4
> to talk to VM5, VF3 is not in promiscuous mode and has a different MAC
> address than VM5's vNIC.  If the embedded bridge does not learn, and
> nobody programmed it to fwd frames for VM5 via VF3...
I think you are referring to VF2. There is no VF3 in your picture.
In macvtap passthru mode, VF2 will be set to the same mac address as 
VM5's MAC.
So VM4 should be be able to talk to VM5.
>
> I believe this is what Roopa's patch will allow.  The question now is
> whether there's a better way to handle this?
My understanding is that Roopa's patch will allow setting additional mac 
addresses to
VM5 without the need to put VF5 in promiscous mode.

Thanks
Sridhar
>
> In my mind, we'd model the NIC's embedded bridge as, well, a bridge.
> And set anti-spoofing, port mirroring, port mac/vlan filtering, etc via
> that bridge.
>
> thanks,
> -chris
>

^ permalink raw reply

* Re: [RFC iproute2] sch_red: TC_RED_HARDDROP
From: Thomas Graf @ 2011-11-30 23:29 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, Stephen Hemminger
In-Reply-To: <1322684749.2602.3.camel@edumazet-laptop>

On Wed, Nov 30, 2011 at 09:25:49PM +0100, Eric Dumazet wrote:
> In 2005, TC_RED_HARDDROP kernel support was added to RED and GRED
> (commit bdc450a0bb1 [PKT_SCHED]: (G)RED: Introduce hard dropping), but
> current iproute2 doesnt have user land support.
> 
> Is there some patch waiting somewhere, or should we :
> 
> - Remove kernel support, since nobody uses it.
> 
> - Add iproute2 support.

I used it in a project which configured the GRED qdisc directly from
the application.

I thought I added iproute2 support back then but obviously must have
missed it or the patch got lost. I'll send a patch to Stephen.

^ permalink raw reply

* RE: [net-next-2.6 PATCH 0/6 v4] macvlan: MAC Address filtering support for passthru mode
From: Rose, Gregory V @ 2011-11-30 23:19 UTC (permalink / raw)
  To: Chris Wright, Ben Hutchings
  Cc: Roopa Prabhu, netdev@vger.kernel.org, davem@davemloft.net,
	sri@us.ibm.com, dragos.tatulea@gmail.com, kvm@vger.kernel.org,
	arnd@arndb.de, mst@redhat.com, mchan@broadcom.com,
	dwang2@cisco.com, shemminger@vyatta.com, eric.dumazet@gmail.com,
	kaber@trash.net, benve@cisco.com
In-Reply-To: <20111130230049.GE29071@x200.localdomain>

> -----Original Message-----
> From: Chris Wright [mailto:chrisw@redhat.com]
> Sent: Wednesday, November 30, 2011 3:01 PM
> To: Ben Hutchings
> Cc: Chris Wright; Rose, Gregory V; Roopa Prabhu; netdev@vger.kernel.org;
> davem@davemloft.net; sri@us.ibm.com; dragos.tatulea@gmail.com;
> kvm@vger.kernel.org; arnd@arndb.de; mst@redhat.com; mchan@broadcom.com;
> dwang2@cisco.com; shemminger@vyatta.com; eric.dumazet@gmail.com;
> kaber@trash.net; benve@cisco.com
> Subject: Re: [net-next-2.6 PATCH 0/6 v4] macvlan: MAC Address filtering
> support for passthru mode
> 
> * Ben Hutchings (bhutchings@solarflare.com) wrote:
> > On Wed, 2011-11-30 at 13:04 -0800, Chris Wright wrote:
> > > I agree that it's confusing.  Couldn't you simplify your ascii art
> > > (hopefully removing hw assumptions about receive processing, and
> > > completely ignoring vlans for the moment) to something like:
> > >
> > >              |RX
> > >              v
> > > +------------+-------------+
> > > |     +------+--------+    |
> > > |     | RX MAC filter |    |
> > > |     |and port select|    |
> > > |     +---------------+    |
> > > |            /|\           |
> > > |           / | \   match 2|
> > > |          /  v  \         |
> > > |         /match  \        |
> > > |        /  1 |    \       |
> > > |       /     |     \      |
> > > |match /      |      \     |
> > > |  0  /       |       \    |
> > > |    v        |        v   |
> > > |    |        |        |   |
> > > +----+--------+--------+---+
> > >      |        |        |
> > >     PF       VF 1     VF 2
> > >
> > > And there's an unclear number of ways to update "RX MAC filter and
> port
> > > select" table.
> > >
> > > 1) PF ndo_set_mac_addr
> > > I expect that to be implicit to match 0.
> > >
> > > 2) PF ndo_set_rx_mode
> > > Less clear, but I'd still expect these to implicitly match 0
> > >
> > > 3) PF ndo_set_vf_mac
> > > I expect these to be an explicit match to VF N (given the interface
> > > specifices which VF's MAC is being programmed).
> >
> > I'm not sure whether this is supposed to implicitly add to the MAC
> > filter or whether that has to be changed too.  That's the main
> > difference between my models (a) and (b).
> 
> I see now.  I wasn't entirely clear on the difference before.  It's also
> going to be hw specific.  I think (Intel folks can verify) that the
> Intel SR-IOV devices have a single global unicast exact match table,
> for example.
> 
> > There's also PF ndo_set_vf_vlan.
> 
> Right, although I had mentioned I was trying to limit just to MAC
> filtering to simplify.
> 
> > > 4) VF ndo_set_mac_addr
> > > This one may or may not be allowed (setting MAC+port if the VF is
> owned
> > > by a guest is likely not allowed), but would expect an implicit VF N.
> > >
> > > 5) VF ndo_set_rx_mode
> > > Same as 4) above.
> >
> > So this is where we are today.
> 
> Cool, good that we agree there.
> 
> > > 6) PF or VF? ndo_set_rx_filter_addr
> > > The new proposal, which has an explicit VF, although when it's VF_SELF
> > > I'm not clear if this is just the same as 5) above?
> > >
> > > Have I missed anything?
> >
> > Any physical port can be bridged to a mixture of guests with and without
> > their own VFs.  Packets sent from a guest with a VF to the address of a
> > guest without a VF need to be forwarded to the PF rather than the
> > physical port, but none of the drivers currently get to know about those
> > addresses.
> 
> To clarify, do you mean something like this?
> 
>        physical port
>              |
> +------------+------------+
> |         +-----+         |
> |         | VEB |         |
> |         +-----+         |
> |        /   |   \        |
> |       /    |    \       |
> |      /     |     \      |
> +-----+------+------+-----+
>       |      |       |
>      PF    VF 1    VF 2
>      /       |       |
>  +---+---+  VM4  +---+---+
>  |  sw   |       |macvtap|
>  | switch|       +---+---+
>  +-+-+-+-+           |
>    / | \            VM5
>   /  |  \
> VM1 VM2 VM3
> 
> This has VMs 1-3 hanging of the PF via a linux bridge (traditional hv
> switching), VM4 directly owning VF1 (pci device assignement), and VM5
> indirectly owning VF2 (macvtap passthrough, that started this whole
> thing).
> 
> So, I'm understanding you saying that VM4 or VM4 sending a packet to VM1
> goes in to VEB, out PF, and into linux bridging code, rigth?  At which
> point the PF is in promiscuous mode (btw, same does not work if bridge is
> attached to VF, at least for some VFs, due to lack of promiscuous mode).
> 
> > Packets sent from a guest with a VF to the address of another guest with
> > a VF need to be forwarded similarly, but the driver should be able to
> > infer that from (3).
> 
> Right, and that works currently for the case where both guests are like
> VM4, they directly own the VF via PCI device assignement.  But for VM4
> to talk to VM5, VF3 is not in promiscuous mode and has a different MAC
> address than VM5's vNIC.  If the embedded bridge does not learn, and
> nobody programmed it to fwd frames for VM5 via VF3...
> 
> I believe this is what Roopa's patch will allow.  The question now is
> whether there's a better way to handle this?
> 
> In my mind, we'd model the NIC's embedded bridge as, well, a bridge.
> And set anti-spoofing, port mirroring, port mac/vlan filtering, etc via
> that bridge.

If there was some way to push the bridge forwarding database down to the
underlying HW so that the filters could be programmed into the HW for
non-learning VEBs that would work too.

This hole has existed for a very long time, years now.  It'd be nice to get
it fixed.  If the community direction is to extend the current bridging
interface then that's fine, we'll go that way.

- Greg


^ permalink raw reply

* [PATCH] caif: Remove unused attributes from struct cflayer
From: Sjur Brændeland @ 2011-11-30 23:02 UTC (permalink / raw)
  To: netdev, David Miller; +Cc: Sjur Brændeland
In-Reply-To: <20111130.170457.1002117290486935116.davem@davemloft.net>

Signed-off-by: Sjur Brændeland <sjur.brandeland@stericsson.com>
---
Hi Dave,

>...
>Looks like this code was always buggy, passing the index in the type parameter
>and the type in the instance parameter.
>

Yes, you're right. The type and prio isn't used anyway,
so let's just kill it off.

Regards,
Sjur

 include/net/caif/caif_layer.h |    4 ----
 1 files changed, 0 insertions(+), 4 deletions(-)

diff --git a/include/net/caif/caif_layer.h b/include/net/caif/caif_layer.h
index 35bc788..0f3a391 100644
--- a/include/net/caif/caif_layer.h
+++ b/include/net/caif/caif_layer.h
@@ -121,9 +121,7 @@ enum caif_direction {
  * @transmit:	Packet transmit funciton.
  * @ctrlcmd:	Used for control signalling upwards in the stack.
  * @modemcmd:	Used for control signaling downwards in the stack.
- * @prio:	Priority of this layer.
  * @id:		The identity of this layer
- * @type:	The type of this layer
  * @name:	Name of the layer.
  *
  *  This structure defines the layered structure in CAIF.
@@ -230,9 +228,7 @@ struct cflayer {
 	 */
 	int (*modemcmd) (struct cflayer *layr, enum caif_modemcmd ctrl);
 
-	unsigned short prio;
 	unsigned int id;
-	unsigned int type;
 	char name[CAIF_LAYER_NAME_SZ];
 };
 
-- 
1.7.0.4

^ permalink raw reply related

* Re: [net-next-2.6 PATCH 0/6 v4] macvlan: MAC Address filtering support for passthru mode
From: Chris Wright @ 2011-11-30 23:00 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: Chris Wright, Greg Rose, Roopa Prabhu, netdev@vger.kernel.org,
	davem@davemloft.net, sri@us.ibm.com, dragos.tatulea@gmail.com,
	kvm@vger.kernel.org, arnd@arndb.de, mst@redhat.com,
	mchan@broadcom.com, dwang2@cisco.com, shemminger@vyatta.com,
	eric.dumazet@gmail.com, kaber@trash.net, benve@cisco.com
In-Reply-To: <1322688843.2762.45.camel@bwh-desktop>

* Ben Hutchings (bhutchings@solarflare.com) wrote:
> On Wed, 2011-11-30 at 13:04 -0800, Chris Wright wrote:
> > I agree that it's confusing.  Couldn't you simplify your ascii art
> > (hopefully removing hw assumptions about receive processing, and
> > completely ignoring vlans for the moment) to something like:
> >
> >              |RX
> >              v
> > +------------+-------------+
> > |     +------+--------+    |
> > |     | RX MAC filter |    |
> > |     |and port select|    |
> > |     +---------------+    |
> > |            /|\           |
> > |           / | \   match 2|
> > |          /  v  \         |
> > |         /match  \        |
> > |        /  1 |    \       |
> > |       /     |     \      |
> > |match /      |      \     |
> > |  0  /       |       \    |
> > |    v        |        v   |
> > |    |        |        |   |
> > +----+--------+--------+---+
> >      |        |        |
> >     PF       VF 1     VF 2
> > 
> > And there's an unclear number of ways to update "RX MAC filter and port
> > select" table.
> > 
> > 1) PF ndo_set_mac_addr
> > I expect that to be implicit to match 0.
> > 
> > 2) PF ndo_set_rx_mode
> > Less clear, but I'd still expect these to implicitly match 0
> > 
> > 3) PF ndo_set_vf_mac
> > I expect these to be an explicit match to VF N (given the interface
> > specifices which VF's MAC is being programmed).
> 
> I'm not sure whether this is supposed to implicitly add to the MAC
> filter or whether that has to be changed too.  That's the main
> difference between my models (a) and (b).

I see now.  I wasn't entirely clear on the difference before.  It's also
going to be hw specific.  I think (Intel folks can verify) that the
Intel SR-IOV devices have a single global unicast exact match table,
for example.

> There's also PF ndo_set_vf_vlan.

Right, although I had mentioned I was trying to limit just to MAC
filtering to simplify.

> > 4) VF ndo_set_mac_addr
> > This one may or may not be allowed (setting MAC+port if the VF is owned
> > by a guest is likely not allowed), but would expect an implicit VF N.
> > 
> > 5) VF ndo_set_rx_mode
> > Same as 4) above.
> 
> So this is where we are today.

Cool, good that we agree there.

> > 6) PF or VF? ndo_set_rx_filter_addr
> > The new proposal, which has an explicit VF, although when it's VF_SELF
> > I'm not clear if this is just the same as 5) above?
> > 
> > Have I missed anything?
> 
> Any physical port can be bridged to a mixture of guests with and without
> their own VFs.  Packets sent from a guest with a VF to the address of a
> guest without a VF need to be forwarded to the PF rather than the
> physical port, but none of the drivers currently get to know about those
> addresses.

To clarify, do you mean something like this?

       physical port
             |
+------------+------------+
|         +-----+         |
|         | VEB |         |
|         +-----+         |
|        /   |   \        |
|       /    |    \       |
|      /     |     \      |
+-----+------+------+-----+
      |      |       |
     PF    VF 1    VF 2
     /       |       | 
 +---+---+  VM4  +---+---+
 |  sw   |       |macvtap|
 | switch|       +---+---+
 +-+-+-+-+           |
   / | \            VM5
  /  |  \
VM1 VM2 VM3

This has VMs 1-3 hanging of the PF via a linux bridge (traditional hv
switching), VM4 directly owning VF1 (pci device assignement), and VM5
indirectly owning VF2 (macvtap passthrough, that started this whole
thing).

So, I'm understanding you saying that VM4 or VM4 sending a packet to VM1
goes in to VEB, out PF, and into linux bridging code, rigth?  At which
point the PF is in promiscuous mode (btw, same does not work if bridge is
attached to VF, at least for some VFs, due to lack of promiscuous mode).

> Packets sent from a guest with a VF to the address of another guest with
> a VF need to be forwarded similarly, but the driver should be able to
> infer that from (3).

Right, and that works currently for the case where both guests are like
VM4, they directly own the VF via PCI device assignement.  But for VM4
to talk to VM5, VF3 is not in promiscuous mode and has a different MAC
address than VM5's vNIC.  If the embedded bridge does not learn, and
nobody programmed it to fwd frames for VM5 via VF3...

I believe this is what Roopa's patch will allow.  The question now is
whether there's a better way to handle this?

In my mind, we'd model the NIC's embedded bridge as, well, a bridge.
And set anti-spoofing, port mirroring, port mac/vlan filtering, etc via
that bridge.

thanks,
-chris

^ permalink raw reply

* Re: [PATCH 10/10] sfc: Support for byte queue limits
From: Ben Hutchings @ 2011-11-30 22:51 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, therbert, netdev
In-Reply-To: <20111130.171239.236955684775039413.davem@davemloft.net>

On Wed, 2011-11-30 at 17:12 -0500, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Wed, 30 Nov 2011 13:44:03 +0100
> 
> > [PATCH net-next] sfc: fix race in efx_enqueue_skb_tso()
> > 
> > As soon as skb is pushed to hardware, it can be completed and freed, so
> > we should not dereference skb anymore.
> > 
> > Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> 
> Applied, thanks Eric.

Thanks to all.

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox