Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH] e100: expose broadcast_disabled as a module option
From: Erwan Velu @ 2010-04-23 21:03 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Jeff Kirsher, netdev, David Miller, linux-kernel,
	jesse.brandeburg, bruce.w.allan, alexander.h.duyck,
	peter.p.waskiewicz.jr, john.ronciak
In-Reply-To: <20100423135816.23f5861f@nehalam>

I first tried "ifconfig -broadcast" without any success, so I forced
the driver to unset IFF_BROADCAST; the interface no longer showed
the BROADCAST option with ifconfig. But I didn't notice any reduction
in the number of context switches on my host.

I found the broadcast_disabled option far more effective when
considering the CPU impact.


2010/4/23 Stephen Hemminger <shemminger@vyatta.com>:
> On Fri, 23 Apr 2010 13:22:22 -0700
> Jeff Kirsher <jeffrey.t.kirsher@intel.com> wrote:
>
>> On Fri, Apr 23, 2010 at 13:14, Erwan Velu <erwanaliasr1@gmail.com> wrote:
>> > Hi folks,
>> >
>> > I've been facing a very noisy network where hundreds broadcast packets
>> > were generated every second.
>> > When this traffic can't be controlled at the source, there is a side
>> > effect on some systems.
>> > I was having some idle systems that will never be targeted by this
>> > broadcast traffic that got loaded just by receiving that "flood".
>> > I mean by loaded that this light hardware was generating 300
>> > context/switches per second.
>> >
>> > I was looking for many options to avoid this traffic to disturb this
>> > hosts and I discovered that the e100 driver was featuring a
>> > "broadcast_disabled" configure option.
>> > I realize that this option is not controllable, so I wrote this simple
>> > patch that expose this option as a module option.
>> > This allow me to tell this hosts not to listen anymore this traffic.
>> >
>> > The result is clearly good as my systems are now running at 21
>> > context/switches while being idle.
>> > Hope this patch isn't too bad and could help others that faces the same problem.
>> >
>> > Patch can be downloaded here :
>> > http://konilope.linuxeries.org/e100_broadcast_disabled.patch
>> >
>> > Even if gmail is eating the inlined, patch, at least that make it
>> > easier to read it for humans.
>> > If the patch is acked, the downloaded one will be more clean ;)
>> >
>> > This patch was generated on top of the latest 2.6 torvald's git.
>> > Cheers,
>> > Erwan
>> >
>> > Signed-off-by: Erwan Velu <erwanaliasr1@gmail.com>
>> >
>> > diff --git a/drivers/net/e100.c b/drivers/net/e100.c
>> > index b997e57..2ba582f 100644
>> > --- a/drivers/net/e100.c
>> > +++ b/drivers/net/e100.c
>> > @@ -194,12 +194,15 @@ MODULE_FIRMWARE(FIRMWARE_D102E);
>> >  static int debug = 3;
>> >  static int eeprom_bad_csum_allow = 0;
>> >  static int use_io = 0;
>> > +static int broadcast_disabled = 0;
>> >  module_param(debug, int, 0);
>> >  module_param(eeprom_bad_csum_allow, int, 0);
>> >  module_param(use_io, int, 0);
>> > +module_param(broadcast_disabled, int, 0);
>> >  MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
>> >  MODULE_PARM_DESC(eeprom_bad_csum_allow, "Allow bad eeprom checksums");
>> >  MODULE_PARM_DESC(use_io, "Force use of i/o access mode");
>> > +MODULE_PARM_DESC(broadcast_disabled, "Filter broadcast packets
>> > (0=disabled (default), 1=enabled)");
>> >  #define DPRINTK(nlevel, klevel, fmt, args...) \
>> >        (void)((NETIF_MSG_##nlevel & nic->msg_enable) && \
>> >        printk(KERN_##klevel PFX "%s: %s: " fmt, nic->netdev->name, \
>> > @@ -1131,6 +1134,8 @@ static void e100_configure(struct nic *nic,
>> > struct cb *cb, struct sk_buff *skb)
>> >                config->promiscuous_mode = 0x1;         /* 1=on, 0=off */
>> >        }
>> >
>> > +       config->broadcast_disabled = broadcast_disabled; /* Broadcast filtering */
>> > +
>> >        if (nic->flags & multicast_all)
>> >                config->multicast_all = 0x1;            /* 1=accept, 0=no */
>> > --
>>
>> Adding Netdev...
>>
>
> What is wrong with using existing IFF_BROADCAST flag?
>
>
> --
>

^ permalink raw reply

* Re: [PATCH] e100: expose broadcast_disabled as a module option
From: Stephen Hemminger @ 2010-04-23 20:58 UTC (permalink / raw)
  To: Jeff Kirsher
  Cc: Erwan Velu, netdev, David Miller, linux-kernel, jesse.brandeburg,
	bruce.w.allan, alexander.h.duyck, peter.p.waskiewicz.jr,
	john.ronciak
In-Reply-To: <h2p9929d2391004231322r23528f32z8447a711a29e28ea@mail.gmail.com>

On Fri, 23 Apr 2010 13:22:22 -0700
Jeff Kirsher <jeffrey.t.kirsher@intel.com> wrote:

> On Fri, Apr 23, 2010 at 13:14, Erwan Velu <erwanaliasr1@gmail.com> wrote:
> > Hi folks,
> >
> > I've been facing a very noisy network where hundreds broadcast packets
> > were generated every second.
> > When this traffic can't be controlled at the source, there is a side
> > effect on some systems.
> > I was having some idle systems that will never be targeted by this
> > broadcast traffic that got loaded just by receiving that "flood".
> > I mean by loaded that this light hardware was generating 300
> > context/switches per second.
> >
> > I was looking for many options to avoid this traffic to disturb this
> > hosts and I discovered that the e100 driver was featuring a
> > "broadcast_disabled" configure option.
> > I realize that this option is not controllable, so I wrote this simple
> > patch that expose this option as a module option.
> > This allow me to tell this hosts not to listen anymore this traffic.
> >
> > The result is clearly good as my systems are now running at 21
> > context/switches while being idle.
> > Hope this patch isn't too bad and could help others that faces the same problem.
> >
> > Patch can be downloaded here :
> > http://konilope.linuxeries.org/e100_broadcast_disabled.patch
> >
> > Even if gmail is eating the inlined, patch, at least that make it
> > easier to read it for humans.
> > If the patch is acked, the downloaded one will be more clean ;)
> >
> > This patch was generated on top of the latest 2.6 torvald's git.
> > Cheers,
> > Erwan
> >
> > Signed-off-by: Erwan Velu <erwanaliasr1@gmail.com>
> >
> > diff --git a/drivers/net/e100.c b/drivers/net/e100.c
> > index b997e57..2ba582f 100644
> > --- a/drivers/net/e100.c
> > +++ b/drivers/net/e100.c
> > @@ -194,12 +194,15 @@ MODULE_FIRMWARE(FIRMWARE_D102E);
> >  static int debug = 3;
> >  static int eeprom_bad_csum_allow = 0;
> >  static int use_io = 0;
> > +static int broadcast_disabled = 0;
> >  module_param(debug, int, 0);
> >  module_param(eeprom_bad_csum_allow, int, 0);
> >  module_param(use_io, int, 0);
> > +module_param(broadcast_disabled, int, 0);
> >  MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
> >  MODULE_PARM_DESC(eeprom_bad_csum_allow, "Allow bad eeprom checksums");
> >  MODULE_PARM_DESC(use_io, "Force use of i/o access mode");
> > +MODULE_PARM_DESC(broadcast_disabled, "Filter broadcast packets
> > (0=disabled (default), 1=enabled)");
> >  #define DPRINTK(nlevel, klevel, fmt, args...) \
> >        (void)((NETIF_MSG_##nlevel & nic->msg_enable) && \
> >        printk(KERN_##klevel PFX "%s: %s: " fmt, nic->netdev->name, \
> > @@ -1131,6 +1134,8 @@ static void e100_configure(struct nic *nic,
> > struct cb *cb, struct sk_buff *skb)
> >                config->promiscuous_mode = 0x1;         /* 1=on, 0=off */
> >        }
> >
> > +       config->broadcast_disabled = broadcast_disabled; /* Broadcast filtering */
> > +
> >        if (nic->flags & multicast_all)
> >                config->multicast_all = 0x1;            /* 1=accept, 0=no */
> > --
> 
> Adding Netdev...
> 

What is wrong with using existing IFF_BROADCAST flag?


-- 

^ permalink raw reply

* Re: DDoS attack causing bad effect on conntrack searches
From: Eric Dumazet @ 2010-04-23 20:57 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: paulmck, Patrick McHardy, Changli Gao, hawk,
	Linux Kernel Network Hackers, Netfilter Developers
In-Reply-To: <Pine.LNX.4.64.1004222213290.10919@ask.diku.dk>

Le jeudi 22 avril 2010 à 22:38 +0200, Jesper Dangaard Brouer a écrit :

> 
> I think it's plausible, there is a lot of modification going on.
> Approx 40.000 deletes/sec and 40.000 inserts/sec.
> The hash bucket size is 300032, and with 80000 modifications/sec, we are 
> (potentially) changing 26.6% of the hash chains each second.
> 
> As can be seen from the graphs:
>   http://people.netfilter.org/hawk/DDoS/2010-04-12__001/list.html
> 
> Notice that primarily CPU2 is doing the 40k deletes/sec, while CPU1 is 
> caught searching...
> 
> 
> > maybe hash table has one slot :)
> 
> Guess I have to reproduce the DoS attack in a testlab (I will first have 
> time Tuesday).  So we can determine if its bad hashing or restart of the 
> search loop.
> 
> 
> The traffic pattern was fairly simple:
> 
> 200-byte UDP packets, coming from approx 60 source IPs, going to one 
> destination IP.  The UDP destination port number was varied in the range 
> of 1 to 6000.   The source UDP port was varied a bit more, some ranging 
> from 32768 to 61000, and some from 1028 to 5000.
> 
> 

Re-reading this, I am not sure there is a real problem on RCU as you
pointed out.

With 800.000 entries in a 300.032-bucket hash table, each lookup hits
about 3 entries (aka 'searches' in conntrack stats)

300.000 packets/second -> 900.000 'searches' per second.

If you have four cpus all trying to insert/delete entries in parallel,
they all hit the central conntrack lock.

In a DDoS scenario, every packet needs to take this lock twice:
once to free an old conntrack (early drop), and once to insert a new entry.

To scale this, the only way would be to have an array of locks, like we
have for TCP/UDP hash tables.

I did some tests here, with a multiqueue card, flooded with 300.000
packets/second, 65.536 source IPs, millions of flows, and nothing wrong
happened (apart from packet drops, of course)

My two cpus were 100% busy, but only after tweaking smp_affinities: on
the first try, irqbalance put the "01" mask on both queues, so only one
ksoftirq was working and the other cpu was idle :(




^ permalink raw reply

* Re: [PATCH net-next-2.6] rps: consistent rxhash
From: David Miller @ 2010-04-23 20:44 UTC (permalink / raw)
  To: therbert; +Cc: eric.dumazet, franco, xiaosuo, netdev
In-Reply-To: <g2m65634d661004211212t13714cccyd27936c520515684@mail.gmail.com>

From: Tom Herbert <therbert@google.com>
Date: Wed, 21 Apr 2010 12:12:41 -0700

> On Tue, Apr 20, 2010 at 2:41 PM, David Miller <davem@davemloft.net> wrote:
>> Eric, do you remember that "TCP friends" rough patch I sent you last
>> year that essentially made TCP sockets over loopback behave like
>> AF_UNIX ones and just queue the SKBs directly to the destination
>> socket without doing any protocol work?
>>
> 
> This is sounds very interesting!  Could you post a patch? :-)

I'll see if I can find it, I sent it to Eric more than a year
ago...

The basic scheme was pretty simple:

1) Add "struct sock *friend" to struct sk_buff

2) TCP initial handshake SYN and SYN+ACK transmits set "skb->friend =
   sk" and TCP receive path notices this and stores this 'friend'
   socket pointer locally in the newly created connection socket.

   The purpose of skb->friend is to let the receiving socket on
   loopback see that the other end is on the local system and
   can be directly communicated to.

3) TCP sendmsg queues data directly to sk->friend's receive queue
   instead of sending TCP protocol packets.

The only complications come from making sendmsg and recvmsg not
try to do all of the sequence handling and checking, stuff like
that.  Also, URG would need to be dealt with somehow too.

I'm sure someone suitably motivated could get a working patch
going in no time :-)

^ permalink raw reply

* Re: [PATCH] NIU support for skb->rxhash
From: David Miller @ 2010-04-23 20:28 UTC (permalink / raw)
  To: therbert; +Cc: eric.dumazet, netdev
In-Reply-To: <h2z65634d661004230832o4f7a6d35ub207bc301ee8925c@mail.gmail.com>

From: Tom Herbert <therbert@google.com>
Date: Fri, 23 Apr 2010 08:32:02 -0700

>> I looked into implementing this and it doesn't work.  The
>> problem is GRO wants to look into the packet very early
>> and we want to batch GRO a set of packets into a big packet
>> before shooting them over to a remote cpu.
>>
> 
> Can you reconsider? :-)  The majority of our servers see packet loads
> which don't allow for much batching (a lot of small RPC messages), so
> for those GRO is mostly unnecessary overhead and mechanisms that
> improve unbatched packet performance are compelling.  Also, if a
> device already does LRO, I don't see that GRO could add a lot of value
> anyway.

LRO is extremely discouraged, because it has to be disabled
when any form of forwarding or bridging is enabled.  LRO is
done such that the input packet stream cannot be reconstituted
on transmit.

GRO on the other hand, allows proper reconstitution of the input
packet stream so it can be enabled unconditionally.

We are encouraging hardware manufacturers to tweak their receive
batching offload such that it matches the rules imposed by GRO
which allow proper reconstitution on transmit.

The fact is the code path is there and it is going to be enabled all
the time, so we have to cope with it.

^ permalink raw reply

* Re: [PATCH] e100: expose broadcast_disabled as a module option
From: Jeff Kirsher @ 2010-04-23 20:22 UTC (permalink / raw)
  To: Erwan Velu, netdev, David Miller
  Cc: linux-kernel, jesse.brandeburg, bruce.w.allan, alexander.h.duyck,
	peter.p.waskiewicz.jr, john.ronciak
In-Reply-To: <l2nb43bf5491004231314i13503c67yeccfc54bc1cae850@mail.gmail.com>

On Fri, Apr 23, 2010 at 13:14, Erwan Velu <erwanaliasr1@gmail.com> wrote:
> Hi folks,
>
> I've been facing a very noisy network where hundreds broadcast packets
> were generated every second.
> When this traffic can't be controlled at the source, there is a side
> effect on some systems.
> I was having some idle systems that will never be targeted by this
> broadcast traffic that got loaded just by receiving that "flood".
> I mean by loaded that this light hardware was generating 300
> context/switches per second.
>
> I was looking for many options to avoid this traffic to disturb this
> hosts and I discovered that the e100 driver was featuring a
> "broadcast_disabled" configure option.
> I realize that this option is not controllable, so I wrote this simple
> patch that expose this option as a module option.
> This allow me to tell this hosts not to listen anymore this traffic.
>
> The result is clearly good as my systems are now running at 21
> context/switches while being idle.
> Hope this patch isn't too bad and could help others that faces the same problem.
>
> Patch can be downloaded here :
> http://konilope.linuxeries.org/e100_broadcast_disabled.patch
>
> Even if gmail is eating the inlined, patch, at least that make it
> easier to read it for humans.
> If the patch is acked, the downloaded one will be more clean ;)
>
> This patch was generated on top of the latest 2.6 torvald's git.
> Cheers,
> Erwan
>
> Signed-off-by: Erwan Velu <erwanaliasr1@gmail.com>
>
> diff --git a/drivers/net/e100.c b/drivers/net/e100.c
> index b997e57..2ba582f 100644
> --- a/drivers/net/e100.c
> +++ b/drivers/net/e100.c
> @@ -194,12 +194,15 @@ MODULE_FIRMWARE(FIRMWARE_D102E);
>  static int debug = 3;
>  static int eeprom_bad_csum_allow = 0;
>  static int use_io = 0;
> +static int broadcast_disabled = 0;
>  module_param(debug, int, 0);
>  module_param(eeprom_bad_csum_allow, int, 0);
>  module_param(use_io, int, 0);
> +module_param(broadcast_disabled, int, 0);
>  MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
>  MODULE_PARM_DESC(eeprom_bad_csum_allow, "Allow bad eeprom checksums");
>  MODULE_PARM_DESC(use_io, "Force use of i/o access mode");
> +MODULE_PARM_DESC(broadcast_disabled, "Filter broadcast packets
> (0=disabled (default), 1=enabled)");
>  #define DPRINTK(nlevel, klevel, fmt, args...) \
>        (void)((NETIF_MSG_##nlevel & nic->msg_enable) && \
>        printk(KERN_##klevel PFX "%s: %s: " fmt, nic->netdev->name, \
> @@ -1131,6 +1134,8 @@ static void e100_configure(struct nic *nic,
> struct cb *cb, struct sk_buff *skb)
>                config->promiscuous_mode = 0x1;         /* 1=on, 0=off */
>        }
>
> +       config->broadcast_disabled = broadcast_disabled; /* Broadcast filtering */
> +
>        if (nic->flags & multicast_all)
>                config->multicast_all = 0x1;            /* 1=accept, 0=no */
> --

Adding Netdev...

-- 
Cheers,
Jeff

^ permalink raw reply

* [PATCHv5] add mergeable receiver buffers support to vhost
From: David L Stevens @ 2010-04-23 20:06 UTC (permalink / raw)
  To: mst, rusty, kvm, virtualization; +Cc: netdev

This patch adds mergeable receive buffers support to vhost.

Signed-off-by: David L Stevens <dlstevens@us.ibm.com>

diff -ruNp net-next-v0/drivers/vhost/net.c net-next-v5/drivers/vhost/net.c
--- net-next-v0/drivers/vhost/net.c	2010-04-22 11:31:57.000000000 -0700
+++ net-next-v5/drivers/vhost/net.c	2010-04-22 12:41:17.000000000 -0700
@@ -109,7 +109,7 @@ static void handle_tx(struct vhost_net *
 	};
 	size_t len, total_len = 0;
 	int err, wmem;
-	size_t hdr_size;
+	size_t vhost_hlen;
 	struct socket *sock = rcu_dereference(vq->private_data);
 	if (!sock)
 		return;
@@ -128,13 +128,13 @@ static void handle_tx(struct vhost_net *
 
 	if (wmem < sock->sk->sk_sndbuf / 2)
 		tx_poll_stop(net);
-	hdr_size = vq->hdr_size;
+	vhost_hlen = vq->vhost_hlen;
 
 	for (;;) {
-		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
-					 ARRAY_SIZE(vq->iov),
-					 &out, &in,
-					 NULL, NULL);
+		head = vhost_get_desc(&net->dev, vq, vq->iov,
+				      ARRAY_SIZE(vq->iov),
+				      &out, &in,
+				      NULL, NULL);
 		/* Nothing new?  Wait for eventfd to tell us they refilled. */
 		if (head == vq->num) {
 			wmem = atomic_read(&sock->sk->sk_wmem_alloc);
@@ -155,20 +155,20 @@ static void handle_tx(struct vhost_net *
 			break;
 		}
 		/* Skip header. TODO: support TSO. */
-		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
+		s = move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, out);
 		msg.msg_iovlen = out;
 		len = iov_length(vq->iov, out);
 		/* Sanity check */
 		if (!len) {
 			vq_err(vq, "Unexpected header len for TX: "
 			       "%zd expected %zd\n",
-			       iov_length(vq->hdr, s), hdr_size);
+			       iov_length(vq->hdr, s), vhost_hlen);
 			break;
 		}
 		/* TODO: Check specific error and bomb out unless ENOBUFS? */
 		err = sock->ops->sendmsg(NULL, sock, &msg, len);
 		if (unlikely(err < 0)) {
-			vhost_discard_vq_desc(vq);
+			vhost_discard_desc(vq, 1);
 			tx_poll_start(net, sock);
 			break;
 		}
@@ -187,12 +187,25 @@ static void handle_tx(struct vhost_net *
 	unuse_mm(net->dev.mm);
 }
 
+static int vhost_head_len(struct vhost_virtqueue *vq, struct sock *sk)
+{
+	struct sk_buff *head;
+	int len = 0;
+
+	lock_sock(sk);
+	head = skb_peek(&sk->sk_receive_queue);
+	if (head)
+		len = head->len + vq->sock_hlen;
+	release_sock(sk);
+	return len;
+}
+
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_rx(struct vhost_net *net)
 {
 	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
-	unsigned head, out, in, log, s;
+	unsigned in, log, s;
 	struct vhost_log *vq_log;
 	struct msghdr msg = {
 		.msg_name = NULL,
@@ -203,14 +216,14 @@ static void handle_rx(struct vhost_net *
 		.msg_flags = MSG_DONTWAIT,
 	};
 
-	struct virtio_net_hdr hdr = {
-		.flags = 0,
-		.gso_type = VIRTIO_NET_HDR_GSO_NONE
+	struct virtio_net_hdr_mrg_rxbuf hdr = {
+		.hdr.flags = 0,
+		.hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
 	};
 
 	size_t len, total_len = 0;
-	int err;
-	size_t hdr_size;
+	int err, headcount, datalen;
+	size_t vhost_hlen;
 	struct socket *sock = rcu_dereference(vq->private_data);
 	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
 		return;
@@ -218,18 +231,18 @@ static void handle_rx(struct vhost_net *
 	use_mm(net->dev.mm);
 	mutex_lock(&vq->mutex);
 	vhost_disable_notify(vq);
-	hdr_size = vq->hdr_size;
+	vhost_hlen = vq->vhost_hlen;
 
 	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 		vq->log : NULL;
 
-	for (;;) {
-		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
-					 ARRAY_SIZE(vq->iov),
-					 &out, &in,
-					 vq_log, &log);
+	while ((datalen = vhost_head_len(vq, sock->sk))) {
+		headcount = vhost_get_desc_n(vq, vq->heads, datalen+vhost_hlen,
+					     &in, vq_log, &log);
+		if (headcount < 0)
+			break;
 		/* OK, now we need to know about added descriptors. */
-		if (head == vq->num) {
+		if (!headcount) {
 			if (unlikely(vhost_enable_notify(vq))) {
 				/* They have slipped one in as we were
 				 * doing that: check again. */
@@ -241,46 +254,54 @@ static void handle_rx(struct vhost_net *
 			break;
 		}
 		/* We don't need to be notified again. */
-		if (out) {
-			vq_err(vq, "Unexpected descriptor format for RX: "
-			       "out %d, int %d\n",
-			       out, in);
-			break;
-		}
-		/* Skip header. TODO: support TSO/mergeable rx buffers. */
-		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
+		/* Skip header. TODO: support TSO. */
+		s = move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
 		msg.msg_iovlen = in;
 		len = iov_length(vq->iov, in);
 		/* Sanity check */
 		if (!len) {
 			vq_err(vq, "Unexpected header len for RX: "
 			       "%zd expected %zd\n",
-			       iov_length(vq->hdr, s), hdr_size);
+			       iov_length(vq->hdr, s), vhost_hlen);
 			break;
 		}
 		err = sock->ops->recvmsg(NULL, sock, &msg,
 					 len, MSG_DONTWAIT | MSG_TRUNC);
 		/* TODO: Check specific error and bomb out unless EAGAIN? */
 		if (err < 0) {
-			vhost_discard_vq_desc(vq);
+			vhost_discard_desc(vq, headcount);
 			break;
 		}
-		/* TODO: Should check and handle checksum. */
-		if (err > len) {
-			pr_err("Discarded truncated rx packet: "
-			       " len %d > %zd\n", err, len);
-			vhost_discard_vq_desc(vq);
+		if (err != datalen) {
+			pr_err("Discarded rx packet: "
+			       " len %d, expected %zd\n", err, datalen);
+			vhost_discard_desc(vq, headcount);
 			continue;
 		}
 		len = err;
-		err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
+		err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr,
+				     vhost_hlen);
 		if (err) {
 			vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
 			       vq->iov->iov_base, err);
 			break;
 		}
-		len += hdr_size;
-		vhost_add_used_and_signal(&net->dev, vq, head, len);
+		/* TODO: Should check and handle checksum. */
+		if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF)) {
+			struct virtio_net_hdr_mrg_rxbuf hdr;
+			struct iovec *iov = vhost_hlen ? vq->hdr : vq->iov;
+
+			if (memcpy_toiovecend(iov, (unsigned char *)&headcount,
+				      offsetof(typeof(hdr), num_buffers),
+				      sizeof(hdr.num_buffers))) {
+				vq_err(vq, "Failed num_buffers write");
+				vhost_discard_desc(vq, headcount);
+				break;
+			}
+		}
+		len += vhost_hlen;
+		vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
+					    headcount);
 		if (unlikely(vq_log))
 			vhost_log_write(vq, vq_log, log, len);
 		total_len += len;
@@ -561,9 +582,24 @@ done:
 
 static int vhost_net_set_features(struct vhost_net *n, u64 features)
 {
-	size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
-		sizeof(struct virtio_net_hdr) : 0;
+	size_t vhost_hlen;
+	size_t sock_hlen;
 	int i;
+
+	if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
+		/* vhost provides vnet_hdr */
+		vhost_hlen = sizeof(struct virtio_net_hdr);
+		if (features & (1 << VIRTIO_NET_F_MRG_RXBUF))
+			vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+		sock_hlen = 0;
+	} else {
+		/* socket provides vnet_hdr */
+		vhost_hlen = 0;
+		if (features & (1 << VIRTIO_NET_F_MRG_RXBUF))
+			sock_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+		else
+			sock_hlen = sizeof(struct virtio_net_hdr);
+	}
 	mutex_lock(&n->dev.mutex);
 	if ((features & (1 << VHOST_F_LOG_ALL)) &&
 	    !vhost_log_access_ok(&n->dev)) {
@@ -574,7 +610,8 @@ static int vhost_net_set_features(struct
 	smp_wmb();
 	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
 		mutex_lock(&n->vqs[i].mutex);
-		n->vqs[i].hdr_size = hdr_size;
+		n->vqs[i].vhost_hlen = vhost_hlen;
+		n->vqs[i].sock_hlen = sock_hlen;
 		mutex_unlock(&n->vqs[i].mutex);
 	}
 	vhost_net_flush(n);
diff -ruNp net-next-v0/drivers/vhost/vhost.c net-next-v5/drivers/vhost/vhost.c
--- net-next-v0/drivers/vhost/vhost.c	2010-04-22 11:31:57.000000000 -0700
+++ net-next-v5/drivers/vhost/vhost.c	2010-04-22 12:19:59.000000000 -0700
@@ -114,7 +114,8 @@ static void vhost_vq_reset(struct vhost_
 	vq->used_flags = 0;
 	vq->log_used = false;
 	vq->log_addr = -1ull;
-	vq->hdr_size = 0;
+	vq->vhost_hlen = 0;
+	vq->sock_hlen = 0;
 	vq->private_data = NULL;
 	vq->log_base = NULL;
 	vq->error_ctx = NULL;
@@ -861,6 +862,53 @@ static unsigned get_indirect(struct vhos
 	return 0;
 }
 
+/* This is a multi-buffer version of vhost_get_vq_desc
+ * @vq		- the relevant virtqueue
+ * datalen	- data length we'll be reading
+ * @iovcount	- returned count of io vectors we fill
+ * @log		- vhost log
+ * @log_num	- log offset
+ *	returns number of buffer heads allocated, negative on error
+ */
+int vhost_get_desc_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
+		     int datalen, int *iovcount, struct vhost_log *log,
+		     unsigned int *log_num)
+{
+	int out, in;
+	int seg = 0;		/* iov index */
+	int hc = 0;		/* head count */
+	int rv;
+
+	while (datalen > 0) {
+		if (hc >= VHOST_NET_MAX_SG) {
+			rv = -ENOBUFS;
+			goto err;
+		}
+		heads[hc].id = vhost_get_desc(vq->dev, vq, vq->iov+seg,
+					      ARRAY_SIZE(vq->iov)-seg, &out,
+					      &in, log, log_num);
+		if (heads[hc].id == vq->num) {
+			rv = 0;
+			goto err;
+		}
+		if (out || in <= 0) {
+			vq_err(vq, "unexpected descriptor format for RX: "
+				"out %d, in %d\n", out, in);
+			rv = -EINVAL;
+			goto err;
+		}
+		heads[hc].len = iov_length(vq->iov+seg, in);
+		datalen -= heads[hc].len;
+		hc++;
+		seg += in;
+	}
+	*iovcount = seg;
+	return hc;
+err:
+	vhost_discard_desc(vq, hc);
+	return rv;
+}
+
 /* This looks in the virtqueue and for the first available buffer, and converts
  * it to an iovec for convenient access.  Since descriptors consist of some
  * number of output then some number of input descriptors, it's actually two
@@ -868,7 +916,7 @@ static unsigned get_indirect(struct vhos
  *
  * This function returns the descriptor number found, or vq->num (which
  * is never a valid descriptor number) if none was found. */
-unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
+unsigned vhost_get_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
 			   struct iovec iov[], unsigned int iov_size,
 			   unsigned int *out_num, unsigned int *in_num,
 			   struct vhost_log *log, unsigned int *log_num)
@@ -986,9 +1034,9 @@ unsigned vhost_get_vq_desc(struct vhost_
 }
 
 /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
-void vhost_discard_vq_desc(struct vhost_virtqueue *vq)
+void vhost_discard_desc(struct vhost_virtqueue *vq, int n)
 {
-	vq->last_avail_idx--;
+	vq->last_avail_idx -= n;
 }
 
 /* After we've used one of their buffers, we tell them about it.  We'll then
@@ -1017,6 +1065,54 @@ int vhost_add_used(struct vhost_virtqueu
 	if (unlikely(vq->log_used)) {
 		/* Make sure data is seen before log. */
 		smp_wmb();
+		log_write(vq->log_base, vq->log_addr + sizeof *vq->used->ring *
+			  (vq->last_used_idx % vq->num),
+			  sizeof *vq->used->ring);
+		log_write(vq->log_base, vq->log_addr, sizeof *vq->used->ring);
+		if (vq->log_ctx)
+			eventfd_signal(vq->log_ctx, 1);
+	}
+	vq->last_used_idx++;
+	return 0;
+}
+
+/* After we've used one of their buffers, we tell them about it.  We'll then
+ * want to notify the guest, using eventfd. */
+int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
+		   int count)
+{
+	struct vring_used_elem *used;
+	int start, n;
+
+	if (count <= 0)
+		return -EINVAL;
+
+	start = vq->last_used_idx % vq->num;
+	if (vq->num - start < count)
+		n = vq->num - start;
+	else
+		n = count;
+	used = vq->used->ring + start;
+	if (copy_to_user(used, heads, sizeof(heads[0])*n)) {
+		vq_err(vq, "Failed to write used");
+		return -EFAULT;
+	}
+	if (n < count) {	/* wrapped the ring */
+		used = vq->used->ring;
+		if (copy_to_user(used, heads+n, sizeof(heads[0])*(count-n))) {
+			vq_err(vq, "Failed to write used");
+			return -EFAULT;
+		}
+	}
+	/* Make sure buffer is written before we update index. */
+	smp_wmb();
+	if (put_user(vq->last_used_idx+count, &vq->used->idx)) {
+		vq_err(vq, "Failed to increment used idx");
+		return -EFAULT;
+	}
+	if (unlikely(vq->log_used)) {
+		/* Make sure data is seen before log. */
+		smp_wmb();
 		/* Log used ring entry write. */
 		log_write(vq->log_base,
 			  vq->log_addr +
@@ -1029,7 +1125,7 @@ int vhost_add_used(struct vhost_virtqueu
 		if (vq->log_ctx)
 			eventfd_signal(vq->log_ctx, 1);
 	}
-	vq->last_used_idx++;
+	vq->last_used_idx += count;
 	return 0;
 }
 
@@ -1062,6 +1158,15 @@ void vhost_add_used_and_signal(struct vh
 	vhost_signal(dev, vq);
 }
 
+/* multi-buffer version of vhost_add_used_and_signal */
+void vhost_add_used_and_signal_n(struct vhost_dev *dev,
+				 struct vhost_virtqueue *vq,
+				 struct vring_used_elem *heads, int count)
+{
+	vhost_add_used_n(vq, heads, count);
+	vhost_signal(dev, vq);
+}
+
 /* OK, now we need to know about added descriptors. */
 bool vhost_enable_notify(struct vhost_virtqueue *vq)
 {
@@ -1086,7 +1191,7 @@ bool vhost_enable_notify(struct vhost_vi
 		return false;
 	}
 
-	return avail_idx != vq->last_avail_idx;
+	return avail_idx != vq->avail_idx;
 }
 
 /* We don't need to be notified again. */
diff -ruNp net-next-v0/drivers/vhost/vhost.h net-next-v5/drivers/vhost/vhost.h
--- net-next-v0/drivers/vhost/vhost.h	2010-03-22 12:04:38.000000000 -0700
+++ net-next-v5/drivers/vhost/vhost.h	2010-04-22 11:35:54.000000000 -0700
@@ -84,7 +84,9 @@ struct vhost_virtqueue {
 	struct iovec indirect[VHOST_NET_MAX_SG];
 	struct iovec iov[VHOST_NET_MAX_SG];
 	struct iovec hdr[VHOST_NET_MAX_SG];
-	size_t hdr_size;
+	size_t vhost_hlen;
+	size_t sock_hlen;
+	struct vring_used_elem heads[VHOST_NET_MAX_SG];
 	/* We use a kind of RCU to access private pointer.
 	 * All readers access it from workqueue, which makes it possible to
 	 * flush the workqueue instead of synchronize_rcu. Therefore readers do
@@ -120,16 +122,23 @@ long vhost_dev_ioctl(struct vhost_dev *,
 int vhost_vq_access_ok(struct vhost_virtqueue *vq);
 int vhost_log_access_ok(struct vhost_dev *);
 
-unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
+int vhost_get_desc_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
+		     int datalen, int *iovcount, struct vhost_log *log,
+		     unsigned int *log_num);
+unsigned vhost_get_desc(struct vhost_dev *, struct vhost_virtqueue *,
 			   struct iovec iov[], unsigned int iov_count,
 			   unsigned int *out_num, unsigned int *in_num,
 			   struct vhost_log *log, unsigned int *log_num);
-void vhost_discard_vq_desc(struct vhost_virtqueue *);
+void vhost_discard_desc(struct vhost_virtqueue *, int);
 
 int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
-void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
+int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
+		    int count);
 void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
-			       unsigned int head, int len);
+			       unsigned int id, int len);
+void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
+			       struct vring_used_elem *heads, int count);
+void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
 void vhost_disable_notify(struct vhost_virtqueue *);
 bool vhost_enable_notify(struct vhost_virtqueue *);
 
@@ -149,7 +158,8 @@ enum {
 	VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) |
 			 (1 << VIRTIO_RING_F_INDIRECT_DESC) |
 			 (1 << VHOST_F_LOG_ALL) |
-			 (1 << VHOST_NET_F_VIRTIO_NET_HDR),
+			 (1 << VHOST_NET_F_VIRTIO_NET_HDR) |
+			 (1 << VIRTIO_NET_F_MRG_RXBUF),
 };
 
 static inline int vhost_has_feature(struct vhost_dev *dev, int bit)



^ permalink raw reply

* Re: eSwitch management
From: Chris Wright @ 2010-04-23 19:44 UTC (permalink / raw)
  To: Anirban Chakraborty
  Cc: Chris Wright, Scott Feldman, David Miller, netdev@vger.kernel.org,
	Arnd Bergmann, Ameen Rahman, Amit Salecha, Rajesh Borundia,
	shemminger
In-Reply-To: <8A4C54B1-B5E5-461D-9699-38526B9CEBF4@qlogic.com>

* Anirban Chakraborty (anirban.chakraborty@qlogic.com) wrote:
> On Apr 23, 2010, at 9:23 AM, Chris Wright wrote:
> > * Anirban Chakraborty (anirban.chakraborty@qlogic.com) wrote:
> >> It looks like ifla_vf_info does contain most of the data set. But if I use it, what NETLINK protocol family should I use in my driver to receive netlink messages? Do I need to create a private protocol family?
> > 
> > No, you don't need to use netlink in your driver.  You just need to fill
> > in the relevant net_device_ops in your driver init.  Specifically:
> > 
> > *      SR-IOV management functions.
> > * int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac);
> > * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan, u8 qos);
> > * int (*ndo_set_vf_tx_rate)(struct net_device *dev, int vf, int rate);
> > * int (*ndo_get_vf_config)(struct net_device *dev,
> > *                          int vf, struct ifla_vf_info *ivf);
> > 
> > These are all operating on a VF indexed internally w/in the driver, so it's
> > a little cumbersome to use from userspace.
> 
> > These are all intended for VFs and are configurable from PF.

Yes, and while the set of callbacks can change, they are always tied to
some net_device (typically the PF) that knows how to make hardware
settings on behalf of a VF.

> However, in our case, there are multiple physical NIC functions on a
> port which are configurable by the eswitch.

Is there a PCI function that represents the switch?  Or a special PCI
NIC function that has VEB mgmt plane access?  And do you have examples
of configuration that you'll do here?

> So, what we are setting
> is essentially switch ports, rather than configuring any setting on the
> physical functions. If netlink doesn't fly, is sysfs going to work?

Before we go to implementation specifics (i.e. netlink vs. sysfs, and my
guess is sysfs isn't going to be the right fit), let's step back and
look at what needs setting.

> If
> we allocate a buffer and fill it up with user space tools that the driver
> grabs it and does the configuration itself?

One idea that has been discussed in the past is to create essentially
a pluggable set of bridge_ops.  The first step would be purely internal
shuffling, to make the existing sw bridge code go through the bridge_ops.
The second step would be making your driver for whichever PCI function
you have that supports managing the bridge create a net_device which is
a bridge during driver init.  And now normal brctl can call into your
VEB via the bridge_ops callbacks. </handwave>

But this too starts w/ looking at what the management requirements are
for your bridge.  Can you enumerate those?

thanks,
-chris

^ permalink raw reply

* Re: [PATCH] RCU: don't turn off lockdep when find suspicious rcu_dereference_check() usage
From: Paul E. McKenney @ 2010-04-23 19:42 UTC (permalink / raw)
  To: Miles Lane
  Cc: Vivek Goyal, Eric Paris, Lai Jiangshan, Ingo Molnar,
	Peter Zijlstra, LKML, nauman, eric.dumazet, netdev, Jens Axboe,
	Gui Jianfeng, Li Zefan
In-Reply-To: <h2xa44ae5cd1004230550uf734c89eo2b1d1945d446068c@mail.gmail.com>

On Fri, Apr 23, 2010 at 08:50:59AM -0400, Miles Lane wrote:
> Hi Paul,
> There has been a bit of back and forth, and I am not sure what patches
> I should test now.
> Could you send me a bundle of whatever needs testing now?

Hello, Miles,

I am posting my set as replies to this message.  There are a couple
of KVM fixes that are going up via Avi's tree, and a number of networking
fixes that are going up via Dave Miller's tree -- a number of these
are against quickly changing code, so it didn't make sense for me to
keep them separately.

I believe that the two splats below are addressed by this patch set
carried in the networking tree:

	https://patchwork.kernel.org/patch/90754/

							Thanx, Paul

> I currently have a build of 2.6.34-rc5-git3 with the same patch I
> tested before applied.
> I notice a few minor differences in the warnings given.  I suspect
> these do not indicate
> new issues, since the trace from <IRQ> through <EOI> is the same as before.
> 
> [   60.174809] [ INFO: suspicious rcu_dereference_check() usage. ]
> [   60.174812] ---------------------------------------------------
> [   60.174816] net/mac80211/sta_info.c:886 invoked
> rcu_dereference_check() without protection!
> [   60.174820]
> [   60.174821] other info that might help us debug this:
> [   60.174822]
> [   60.174825]
> [   60.174826] rcu_scheduler_active = 1, debug_locks = 1
> [   60.174829] no locks held by wpa_supplicant/3973.
> [   60.174832]
> [   60.174833] stack backtrace:
> [   60.174838] Pid: 3973, comm: wpa_supplicant Not tainted 2.6.34-rc5-git3 #19
> [   60.174841] Call Trace:
> [   60.174844]  <IRQ>  [<ffffffff81067faa>] lockdep_rcu_dereference+0x9d/0xa5
> [   60.174873]  [<ffffffffa014e9ae>]
> ieee80211_find_sta_by_hw+0x46/0x10f [mac80211]
> [   60.174886]  [<ffffffffa014ea8e>] ieee80211_find_sta+0x17/0x19 [mac80211]
> [   60.174902]  [<ffffffffa01a60f2>] iwl_tx_queue_reclaim+0xdb/0x1b1 [iwlcore]
> [   60.174909]  [<ffffffff81068417>] ? mark_lock+0x2d/0x235
> [   60.174920]  [<ffffffffa01d5f1c>] iwl5000_rx_reply_tx+0x4a9/0x556 [iwlagn]
> [   60.174927]  [<ffffffff8120a2d3>] ? is_swiotlb_buffer+0x2e/0x3b
> [   60.174936]  [<ffffffffa01cebf4>] iwl_rx_handle+0x163/0x2b5 [iwlagn]
> [   60.174943]  [<ffffffff810688f0>] ? trace_hardirqs_on_caller+0xfa/0x13f
> [   60.174952]  [<ffffffffa01cf3ac>] iwl_irq_tasklet+0x2bb/0x3c0 [iwlagn]
> [   60.174959]  [<ffffffff810411df>] tasklet_action+0xa7/0x10f
> [   60.174965]  [<ffffffff810421f1>] __do_softirq+0x144/0x252
> [   60.174972]  [<ffffffff81003a8c>] call_softirq+0x1c/0x34
> [   60.174977]  [<ffffffff810050e4>] do_softirq+0x38/0x80
> [   60.174982]  [<ffffffff81041cbe>] irq_exit+0x45/0x94
> [   60.174987]  [<ffffffff81004829>] do_IRQ+0xad/0xc4
> [   60.174994]  [<ffffffff813cfb13>] ret_from_intr+0x0/0xf
> [   60.174997]  <EOI>  [<ffffffff810e5114>] ? kmem_cache_alloc+0xa9/0x15f
> [   60.175010]  [<ffffffff81342182>] ? __alloc_skb+0x3d/0x155
> [   60.175016]  [<ffffffff81342182>] __alloc_skb+0x3d/0x155
> [   60.175023]  [<ffffffff8133d237>] sock_alloc_send_pskb+0xc0/0x2e5
> [   60.175030]  [<ffffffff8133d46c>] sock_alloc_send_skb+0x10/0x12
> [   60.175036]  [<ffffffff813b1ab5>] unix_stream_sendmsg+0x117/0x2e2
> [   60.175044]  [<ffffffff811bdca8>] ? avc_has_perm+0x57/0x69
> [   60.175050]  [<ffffffff8133b892>] ? sock_aio_write+0x0/0xcf
> [   60.175056]  [<ffffffff813392c2>] __sock_sendmsg+0x59/0x64
> [   60.175062]  [<ffffffff8133b94d>] sock_aio_write+0xbb/0xcf
> [   60.175069]  [<ffffffff810e98b1>] do_sync_readv_writev+0xbc/0xfb
> [   60.175077]  [<ffffffff811c1726>] ? selinux_file_permission+0xa2/0xaf
> [   60.175082]  [<ffffffff810e9638>] ? copy_from_user+0x2a/0x2c
> [   60.175089]  [<ffffffff811baf85>] ? security_file_permission+0x11/0x13
> [   60.175095]  [<ffffffff810ea64e>] do_readv_writev+0xa2/0x122
> [   60.175101]  [<ffffffff810ead3b>] ? fcheck_files+0x8f/0xc9
> [   60.175107]  [<ffffffff810ea70c>] vfs_writev+0x3e/0x49
> [   60.175113]  [<ffffffff810ea7f2>] sys_writev+0x45/0x8e
> [   60.175119]  [<ffffffff81002b6b>] system_call_fastpath+0x16/0x1b
> 
> [   60.223213] [ INFO: suspicious rcu_dereference_check() usage. ]
> [   60.223216] ---------------------------------------------------
> [   60.223221] net/mac80211/sta_info.c:886 invoked
> rcu_dereference_check() without protection!
> [   60.223224]
> [   60.223225] other info that might help us debug this:
> [   60.223227]
> [   60.223230]
> [   60.223230] rcu_scheduler_active = 1, debug_locks = 1
> [   60.223234] no locks held by udisks-daemon/4398.
> [   60.223236]
> [   60.223237] stack backtrace:
> [   60.223242] Pid: 4398, comm: udisks-daemon Not tainted 2.6.34-rc5-git3 #19
> [   60.223245] Call Trace:
> [   60.223249]  <IRQ>  [<ffffffff81067faa>] lockdep_rcu_dereference+0x9d/0xa5
> [   60.223275]  [<ffffffffa014e9fe>]
> ieee80211_find_sta_by_hw+0x96/0x10f [mac80211]
> [   60.223288]  [<ffffffffa014ea8e>] ieee80211_find_sta+0x17/0x19 [mac80211]
> [   60.223304]  [<ffffffffa01a60f2>] iwl_tx_queue_reclaim+0xdb/0x1b1 [iwlcore]
> [   60.223310]  [<ffffffff81068417>] ? mark_lock+0x2d/0x235
> [   60.223321]  [<ffffffffa01d5f1c>] iwl5000_rx_reply_tx+0x4a9/0x556 [iwlagn]
> [   60.223329]  [<ffffffff8120a2d3>] ? is_swiotlb_buffer+0x2e/0x3b
> [   60.223338]  [<ffffffffa01cebf4>] iwl_rx_handle+0x163/0x2b5 [iwlagn]
> [   60.223344]  [<ffffffff810688f0>] ? trace_hardirqs_on_caller+0xfa/0x13f
> [   60.223353]  [<ffffffffa01cf3ac>] iwl_irq_tasklet+0x2bb/0x3c0 [iwlagn]
> [   60.223360]  [<ffffffff810411df>] tasklet_action+0xa7/0x10f
> [   60.223367]  [<ffffffff810421f1>] __do_softirq+0x144/0x252
> [   60.223374]  [<ffffffff81003a8c>] call_softirq+0x1c/0x34
> [   60.223379]  [<ffffffff810050e4>] do_softirq+0x38/0x80
> [   60.223384]  [<ffffffff81041cbe>] irq_exit+0x45/0x94
> [   60.223389]  [<ffffffff81004829>] do_IRQ+0xad/0xc4
> [   60.223396]  [<ffffffff813cfb13>] ret_from_intr+0x0/0xf
> [   60.223399]  <EOI>  [<ffffffff810e34f1>] ? kmem_cache_free+0xb0/0x134
> [   60.223412]  [<ffffffff810f391a>] ? putname+0x2d/0x36
> [   60.223417]  [<ffffffff810f391a>] putname+0x2d/0x36
> [   60.223423]  [<ffffffff810f5536>] user_path_at+0x5f/0x8e
> [   60.223429]  [<ffffffff81068671>] ? mark_held_locks+0x52/0x70
> [   60.223435]  [<ffffffff810e34ee>] ? kmem_cache_free+0xad/0x134
> [   60.223441]  [<ffffffff8106890a>] ? trace_hardirqs_on_caller+0x114/0x13f
> [   60.223447]  [<ffffffff81068942>] ? trace_hardirqs_on+0xd/0xf
> [   60.223454]  [<ffffffff810ed93f>] vfs_fstatat+0x32/0x5d
> [   60.223460]  [<ffffffff810ed9bb>] vfs_lstat+0x19/0x1b
> [   60.223465]  [<ffffffff810ed9d7>] sys_newlstat+0x1a/0x38
> [   60.223471]  [<ffffffff8106890a>] ? trace_hardirqs_on_caller+0x114/0x13f
> [   60.223477]  [<ffffffff813cec00>] ? trace_hardirqs_on_thunk+0x3a/0x3f
> [   60.223485]  [<ffffffff81002b6b>] system_call_fastpath+0x16/0x1b

^ permalink raw reply

* Re: [RFC 2/2] phylib: Convert MDIO bitbang to new MDIO 45 format
From: Andy Fleming @ 2010-04-23 19:39 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: davem, netdev
In-Reply-To: <1272018128.11697.37.camel@localhost>


On Apr 23, 2010, at 5:22 AM, Ben Hutchings wrote:

> On Thu, 2010-04-22 at 23:38 -0500, Andy Fleming wrote:
>> Now that we've added somewhat more complete MDIO 45 support to the PHY
>> Lib, convert the MDIO bitbang driver to use this new infrastructure.
>> 
>> Signed-off-by: Andy Fleming <afleming@freescale.com>
>> ---
>> drivers/net/phy/mdio-bitbang.c |   23 +++++++++++------------
>> 1 files changed, 11 insertions(+), 12 deletions(-)
>> 
>> diff --git a/drivers/net/phy/mdio-bitbang.c b/drivers/net/phy/mdio-bitbang.c
>> index 2f6f02e..4c0c89b 100644
>> --- a/drivers/net/phy/mdio-bitbang.c
>> +++ b/drivers/net/phy/mdio-bitbang.c
> [...]
>> @@ -157,9 +154,10 @@ static int mdiobb_read(struct mii_bus *bus, int phy, int devad, int reg)
>> 	struct mdiobb_ctrl *ctrl = bus->priv;
>> 	int ret, i;
>> 
>> -	if (reg & MII_ADDR_C45) {
>> -		reg = mdiobb_cmd_addr(ctrl, phy, reg);
>> -		mdiobb_cmd(ctrl, MDIO_C45_READ, phy, reg);
>> +	/* Clause 22 PHYs only use devad = 0, and Clause 45 only use nonzero */
>> +	if (devad) {
>> +		mdiobb_cmd_addr(ctrl, phy, devad, reg);
>> +		mdiobb_cmd(ctrl, MDIO_C45_READ, phy, devad);
>> 	} else
>> 		mdiobb_cmd(ctrl, MDIO_READ, phy, reg);
>> 
> [...]
> 
> I don't believe there's any protocol requirement in clause 45 that
> devad != 0 (although the address is not allocated).  In the mdio module
> I played safe and defined MDIO_DEVAD_NONE == -1 to indicate a clause 22
> request.


Yeah, best to play it safe.  I'm also realizing that the bus probing code implicitly assumes that a bus either supports clause 45 (and therefore uses device addresses) or does not support it at all.  If we support both on the same bus, the probe will not catch any clause 22 PHYs.

I will fix.

Also, thank you for your work on the mdio code!

Andy

^ permalink raw reply

* pull request: wireless-next-2.6 2010-04-23
From: John W. Linville @ 2010-04-23 19:01 UTC (permalink / raw)
  To: davem-fT/PcQaiUtIeIZ0/mPfg9Q
  Cc: linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA

Dave,

Yet another huge batch of updates intended for 2.6.35.  The ath9k driver
in particular gets a lot of attention, and the iwlwifi team continues
its usual strong showing.

Please let me know if there are problems!  Again, this is for the
'for-davem' branch where I have pre-resolved some merge conflicts.

Thanks,

John

---

The following changes since commit c68ed255265968c3948fa2678bf59d15c471b055:
  Tom Herbert (1):
        bnx2x: add support for receive hashing

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-next-2.6.git for-davem

Abhijeet Kolekar (1):
      iwlwifi: add debugfs ops to iwlwifi

Benoit Papillault (1):
      ath5k/ath9k: Fix 64 bits TSF reads

Christian Lamparter (1):
      p54pci: fix serious sparse warning

Dan Williams (2):
      libertas: consolidate SDIO firmware wait code
      libertas: Davinci platforms need more time loading helper firmware

Daniel Halperin (1):
      mac80211: fix typo in comments

Daniel Yingqiang Ma (1):
      ath9k: Group Key fix for VAPs

David Kilroy (3):
      orinoco: implement set_wiphy_params
      orinoco: use cfg80211_find_ie
      orinoco: have sparse check endian issues

Felix Fietkau (23):
      ath9k_hw: add silicon revision macros for AR9300
      ath9k_hw: add a macro for abstracting generic timer access
      ath9k_hw: fix a missing hex prefix for a register mask
      ath9k_hw: add simple register abstraction for some AR9300 registers
      ath9k_hw: add support for GPIO differences on AR9003
      ath9k_hw: Add AR9003 PHY register definitions
      ath9k_hw: Set the channel on AR9003
      ath9k_hw: Implement PLL control on AR9003
      ath9k_hw: Implement spur mitigation on AR9003
      ath9k_hw: Split off ANI control to the PHY ops
      ath9k: Add Rx EDMA support
      ath9k_hw: Split out the function for reading the noise floor
      ath9k_hw: move AR9280 PCI EEPROM fix to eeprom_def.c
      ath9k_hw: Update ath9k_hw_set_dma for AR9300
      ath9k: check for specific rx stuck conditions and recover from them
      ath9k: clean up tx buffer handling
      ath9k: update the MCS mask for MCS16 and above
      ath9k: update the ath_max_4ms_framelen table
      ath9k: reduce the bits_per_symbol table size, support more streams
      ath9k: initialize the number of tx/rx streams correctly
      mac80211: add flags for STBC (Space-Time Block Coding)
      ath9k: add support for Tx and Rx STBC
      ath9k: set the STBC flag in rate control if the peer supports it

Grazvydas Ignotas (3):
      wl1251: read default MAC address from EEPROM when available
      wl1251: register platform_device to pass board data
      wl1251: add support for dedicated IRQ line

Helmut Schaa (2):
      rt2x00: add txdesc parameter to write_tx_data
      rt2x00: rt2800pci: fix tx path by not accessing the skb after it was DMA mapped

Holger Schurig (2):
      mac80211: sample survey implementation for mac80211 & hwsim
      ath5k: basic support for survey

Johannes Berg (14):
      iwlwifi: remove scan_bands logic
      iwlwifi: correct atomic bitops usage
      iwlwifi: remove next_scan_jiffies
      iwlwifi: remove scan_pass_start
      iwlwifi: rename priv->scan to priv->scan_cmd
      iwlwifi: trigger scan synchronously
      iwlwifi: make BT coex config a virtual method
      iwlwifi: rename TX_CMD_FLG_BT_DIS_MSK
      iwlwifi: don't check monitor for scanning
      iwlwifi: remove monitor check
      iwlwifi: make scan antenna forcing more generic
      mac80211: fix stopping RX BA session from timer
      mac80211: add missing newline
      radiotap parser: fix endian annotation

John W. Linville (2):
      Merge branch 'wireless-next-2.6' of git://git.kernel.org/.../iwlwifi/iwlwifi-2.6
      Merge branch 'master' into for-davem

Juuso Oikarinen (2):
      mac80211: Prevent running sta_cleanup timer unnecessarily
      mac80211: Fix ieee80211_sta_conn_mon_timer with hw connection monitoring

Larry Finger (1):
      rtl818x: Move configuration details to the rtl818x directory

Luis R. Rodriguez (54):
      ath9k_hw: start building an abstraction layer for hardware routines
      ath9k_hw: AR9003 does not have AR_RC_AHB skip its setting
      ath9k_hw: remove wrapper ath9k_hw_write_regs()
      ath9k_hw: Move some RF ops to the private callbacks
      ath9k_hw: skip PLL initialization on AR9003 on Power-On-Reset
      ath9k_hw: add some comments for ath9k_set_power_network_sleep()
      ath9k_hw: add a private callback for PLL control computation
      ath9k_hw: Add AR9003 PHY support
      ath9k_hw: move init config and default after chip is up
      ath9k_hw: add the AR9003 ar9003_hw_macversion_supported()
      ath9k_hw: disable ANI for AR9003
      ath9k: disable the MIB interrupt if ANI is disabled
      ath9k_hw: add common channel select helpers for ar900[23]
      ath9k_hw: split initvals.h by hardware family
      ath9k_hw: add initvals for the AR9003 hardware family
      ath9k_hw: add helpers for processing the AR9003 INI
      ath9k_hw: add all the AR9003 PHY callbacks
      ath9k_hw: add a helper for Power Amplifier calibration for AR9002
      ath9k_hw: add a helper for the OLC tem compensation for AR9002
      ath9k_hw: rename PA calib for AR9287
      ath9k_hw: shift code for AR9280 OLC temp comp
      ath9k_hw: move the AR9280 OLC temp comp to its own helper
      ath9k_hw: simplify OLC temp compensation for AR9002
      ath9k_hw: rename the PA calib routines to match their families
      ath9k_hw: rename getNoiseFloorThresh() to ath9k_hw_loadnf()
      ath9k_hw: move the cal AR9100 calibration settings
      ath9k_hw: split calib code by hardware families
      ath9k_hw: add the AR9003 ar9003_hw_init_cal callback
      ath9k_hw: add the config_pci_powersave AR9003 callback
      ath9k_hw: split the generic hardware code by hardware family
      ath9k_hw: move the cck channel 14 INI to the AR9002 hw code
      ath9k_hw: move TX/RX gain INI stuff to its own hardware family code
      ath9k_hw: abstract the AR_PHY_AGC_CONTROL register access
      ath9k_hw: abstract loading noisefloor
      ath9k_hw: fill in the callbacks for calibration for AR9003
      ath9k_hw: complete AR9003 calibration
      ath9k_hw: rename eep_AR9287_ops to eep_ar9287_ops
      ath9k_hw: restore mac address reading logic
      ath9k_hw: add OFDM spur mitigation for AR9003
      ath9k_hw: move the RF claim stuff to AR9002 hardware family
      ath9k_hw: add the AR9300 SREV hw name print
      ath9k_hw: add TX/RX gain register initialization for AR9003
      ath9k_hw: skip asynch fifo enablement to AR9003
      ath9k_hw: skip WEP aggregation enable code for AR9003
      ath9k_hw: move AR9002 mac ops to its own file
      ath9k: add RXLP and RXHP to debugfs counters
      ath9k_hw: enable CRC check of descriptors for AR9003
      ath9k_hw: set cwmin and cwmax to 0 for for AR9003 upon txq reset
      mac80211: add LDPC control flag
      ath9k_hw: add LDPC support for AR9003
      ath9k: add LDPC support
      ath9k_hw: add the PCI ID for the first AR9300 device
      ath9k_hw: make two initvals consto for the AR9001 family
      ath9k_hw: make all AR9002 initvals use u32

Nishant Sarmukadam (1):
      cfg80211: Avoid sending IWEVASSOCREQIE and IWEVASSOCRESPIE events with NULL event body

Reinette Chatre (1):
      Merge branch 'wireless-2.6' into wireless-next-2.6

Samuel Ortiz (2):
      iwmc3200wifi: Fix sparse warnings
      iwmc3200wifi: check sparse endianness annotations

Senthil Balasubramanian (5):
      ath9k_hw: Add the PCI IDs for AR9300 and fill up the pci_id_tables
      ath9k_hw: update the chip tests for AR9003
      ath9k_hw: prevent reset control register zeroing on AR9003 reset
      ath9k_hw: the eep_map is used only for AR9280 PCI card ini fixup
      ath9k_hw: Implement AR9003 eeprom callbacks

Shanyu Zhao (2):
      iwlwifi: bring up 6000 Series 2x2 AGN Gen2 adapters
      iwlwifi: remove redundant iwl_dump_lq_cmd()

Stanislaw Gruszka (3):
      iwlwifi: check scan request ie_len
      iwlwifi: initialize iwl_wimax_coex_cmd.flags
      mac80211: document IEEE80211_CONF_CHANGE_QOS

Sujith (10):
      ath9k_htc: Cleanup beacon configuration
      ath: Add buffered register write operations
      ath9k_htc: Implement multiple register write support
      ath9k_hw: Add macros for multiple register writes
      ath9k_hw: Relocate Opmode initialization
      ath9k_hw: Use buffered register writes
      ath9k_htc: Remove GPIO set on unload
      ath9k_htc: Add dropped SKB count to debugfs
      ath9k_htc: Handle WMI timeouts properly
      ath9k_htc: Fix sparse endian warnings

Vasanthakumar Thiagarajan (26):
      ath9k_hw: Add hw cap flag for EDMA for the AR9003 family
      ath9k_hw: Fill few hw cap for edma
      ath9k_hw: Add abstraction for rx enable
      ath9k_hw: Fill rx_enable() for the AR9003 hardware family
      ath9k_hw: Add few routines for rx edma support
      ath9k_hw: Define tx control struct for AR9003
      ath9k_hw: Move code which populates ds_data to ath9k_hw
      ath9k_hw: Add abstraction to set/get link pointer
      ath9k: Use abstraction to get link pointer
      ath9k: Use memcpy in ath_clone_txbuf()
      ath9k: Remove ATH9K_TX_SW_ABORTED and introduce a bool for this purpose
      ath9k: Make bf_desc of ath_buf opaque
      ath9k_hw: Abstract the routine which returns interrupt status
      ath9k_hw: Initialize interrupt mask for AR9003
      ath9k_hw: Fill get_isr() for AR9003
      ath9k_hw: Configure Tx interrupt mitigation timer
      ath9k: Load SW filtered NF values and start NF cal during full reset for AR9003
      ath9k_hw: Define abstraction for tx desc access
      ath9k_hw: Add function to configure tx status ring buffer
      ath9k_hw: Fill descriptor abstrations for AR9003
      ath9k: Setup appropriate tx desc for regular dma and edma
      ath9k: Initialize and configure tx status for EDMA
      ath9k_hw: Compute pointer checksum over the link descriptor
      ath9k: Add Tx EDMA support
      ath9k: Enable TXOK and TXERR interrupts for TX EDMA
      ath9k_hw: Abort rx if hw is not coming out of full sleep in reset

Wey-Yi Guy (7):
      iwlwifi: set correct single/dual stream mask
      iwlwifi: more generic eeprom defines
      iwlwifi: remove duplicated debug functions
      iwlwifi: add hw revision for 6000g2 NIC
      iwlwifi: PA type for 6000g2 series
      iwlwifi: sanity check for turn on aggregation tid
      iwlwifi: more code clean up for agn devices

Xose Vazquez Perez (2):
      wireless: rt2x00: rt2800usb: identify Hawking devices
      wireless: rt2x00: rt2800usb: identify Allwin devices

 drivers/net/wireless/Kconfig                       |   85 +-
 drivers/net/wireless/ath/ath.h                     |   14 +-
 drivers/net/wireless/ath/ath5k/base.c              |   19 +
 drivers/net/wireless/ath/ath5k/pcu.c               |   31 +-
 drivers/net/wireless/ath/ath9k/Makefile            |   16 +-
 drivers/net/wireless/ath/ath9k/ani.c               |  208 +--
 drivers/net/wireless/ath/ath9k/ar5008_initvals.h   |  742 +++++++
 drivers/net/wireless/ath/ath9k/ar5008_phy.c        | 1375 +++++++++++++
 drivers/net/wireless/ath/ath9k/ar9001_initvals.h   | 1254 ++++++++++++
 drivers/net/wireless/ath/ath9k/ar9002_calib.c      | 1000 ++++++++++
 drivers/net/wireless/ath/ath9k/ar9002_hw.c         |  593 ++++++
 .../ath/ath9k/{initvals.h => ar9002_initvals.h}    | 2052 +-------------------
 drivers/net/wireless/ath/ath9k/ar9002_mac.c        |  480 +++++
 drivers/net/wireless/ath/ath9k/ar9002_phy.c        |  539 +++++
 drivers/net/wireless/ath/ath9k/ar9002_phy.h        |  572 ++++++
 drivers/net/wireless/ath/ath9k/ar9003_calib.c      |  802 ++++++++
 drivers/net/wireless/ath/ath9k/ar9003_eeprom.c     | 1856 ++++++++++++++++++
 drivers/net/wireless/ath/ath9k/ar9003_eeprom.h     |  323 +++
 drivers/net/wireless/ath/ath9k/ar9003_hw.c         |  205 ++
 drivers/net/wireless/ath/ath9k/ar9003_initvals.h   | 1793 +++++++++++++++++
 drivers/net/wireless/ath/ath9k/ar9003_mac.c        |  611 ++++++
 drivers/net/wireless/ath/ath9k/ar9003_mac.h        |  120 ++
 drivers/net/wireless/ath/ath9k/ar9003_phy.c        | 1142 +++++++++++
 drivers/net/wireless/ath/ath9k/ar9003_phy.h        |  847 ++++++++
 drivers/net/wireless/ath/ath9k/ath9k.h             |   24 +-
 drivers/net/wireless/ath/ath9k/beacon.c            |    5 +-
 drivers/net/wireless/ath/ath9k/calib.c             | 1089 +----------
 drivers/net/wireless/ath/ath9k/calib.h             |   19 +-
 drivers/net/wireless/ath/ath9k/common.h            |    4 +-
 drivers/net/wireless/ath/ath9k/debug.c             |   22 +-
 drivers/net/wireless/ath/ath9k/debug.h             |    4 +
 drivers/net/wireless/ath/ath9k/eeprom.c            |    9 +-
 drivers/net/wireless/ath/ath9k/eeprom.h            |   22 +-
 drivers/net/wireless/ath/ath9k/eeprom_4k.c         |   17 +-
 drivers/net/wireless/ath/ath9k/eeprom_9287.c       |    9 +-
 drivers/net/wireless/ath/ath9k/eeprom_def.c        |   13 +-
 drivers/net/wireless/ath/ath9k/hif_usb.c           |   13 +-
 drivers/net/wireless/ath/ath9k/htc.h               |   19 +-
 drivers/net/wireless/ath/ath9k/htc_drv_beacon.c    |   29 +-
 drivers/net/wireless/ath/ath9k/htc_drv_init.c      |  104 +-
 drivers/net/wireless/ath/ath9k/htc_drv_main.c      |   20 +-
 drivers/net/wireless/ath/ath9k/htc_drv_txrx.c      |   11 +-
 drivers/net/wireless/ath/ath9k/htc_hst.c           |    8 +-
 drivers/net/wireless/ath/ath9k/htc_hst.h           |   24 +-
 drivers/net/wireless/ath/ath9k/hw-ops.h            |  280 +++
 drivers/net/wireless/ath/ath9k/hw.c                | 1761 ++++-------------
 drivers/net/wireless/ath/ath9k/hw.h                |  253 +++-
 drivers/net/wireless/ath/ath9k/init.c              |   83 +-
 drivers/net/wireless/ath/ath9k/mac.c               |  490 ++---
 drivers/net/wireless/ath/ath9k/mac.h               |   67 +-
 drivers/net/wireless/ath/ath9k/main.c              |   82 +-
 drivers/net/wireless/ath/ath9k/pci.c               |    1 +
 drivers/net/wireless/ath/ath9k/phy.c               |  978 ----------
 drivers/net/wireless/ath/ath9k/phy.h               |  584 +------
 drivers/net/wireless/ath/ath9k/rc.c                |   13 +
 drivers/net/wireless/ath/ath9k/recv.c              |  518 ++++-
 drivers/net/wireless/ath/ath9k/reg.h               |  167 ++-
 drivers/net/wireless/ath/ath9k/wmi.c               |   16 +-
 drivers/net/wireless/ath/ath9k/wmi.h               |   19 +-
 drivers/net/wireless/ath/ath9k/xmit.c              |  488 ++++--
 drivers/net/wireless/iwlwifi/Makefile              |    1 +
 drivers/net/wireless/iwlwifi/iwl-1000.c            |    6 +
 drivers/net/wireless/iwlwifi/iwl-3945.c            |    2 +
 drivers/net/wireless/iwlwifi/iwl-3945.h            |    3 +
 drivers/net/wireless/iwlwifi/iwl-4965.c            |   15 +-
 drivers/net/wireless/iwlwifi/iwl-5000.c            |   11 +
 drivers/net/wireless/iwlwifi/iwl-6000.c            |   34 +-
 drivers/net/wireless/iwlwifi/iwl-agn-debugfs.c     |  834 ++++++++
 drivers/net/wireless/iwlwifi/iwl-agn-debugfs.h     |   56 +
 drivers/net/wireless/iwlwifi/iwl-agn-hcmd.c        |    2 +
 drivers/net/wireless/iwlwifi/iwl-agn-lib.c         |  403 ++++-
 drivers/net/wireless/iwlwifi/iwl-agn-rs.c          |   47 +-
 drivers/net/wireless/iwlwifi/iwl-agn-tx.c          |   42 +-
 drivers/net/wireless/iwlwifi/iwl-agn-ucode.c       |   36 +-
 drivers/net/wireless/iwlwifi/iwl-agn.c             |   50 +-
 drivers/net/wireless/iwlwifi/iwl-agn.h             |    3 +
 drivers/net/wireless/iwlwifi/iwl-commands.h        |    2 +-
 drivers/net/wireless/iwlwifi/iwl-core.c            |   46 +-
 drivers/net/wireless/iwlwifi/iwl-core.h            |   18 +-
 drivers/net/wireless/iwlwifi/iwl-csr.h             |    1 +
 drivers/net/wireless/iwlwifi/iwl-debug.h           |    2 +
 drivers/net/wireless/iwlwifi/iwl-debugfs.c         |  770 +-------
 drivers/net/wireless/iwlwifi/iwl-dev.h             |    9 +-
 drivers/net/wireless/iwlwifi/iwl-eeprom.h          |   32 +-
 drivers/net/wireless/iwlwifi/iwl-prph.h            |   80 +-
 drivers/net/wireless/iwlwifi/iwl-scan.c            |  506 +-----
 drivers/net/wireless/iwlwifi/iwl-sta.c             |   13 +-
 drivers/net/wireless/iwlwifi/iwl3945-base.c        |   87 +-
 drivers/net/wireless/iwmc3200wifi/Makefile         |    2 +
 drivers/net/wireless/iwmc3200wifi/rx.c             |    3 +-
 drivers/net/wireless/iwmc3200wifi/trace.h          |    4 +-
 drivers/net/wireless/iwmc3200wifi/tx.c             |    4 +-
 drivers/net/wireless/libertas/if_sdio.c            |  103 +-
 drivers/net/wireless/mac80211_hwsim.c              |   28 +
 drivers/net/wireless/orinoco/Makefile              |    3 +
 drivers/net/wireless/orinoco/cfg.c                 |   88 +-
 drivers/net/wireless/orinoco/hw.c                  |   26 +
 drivers/net/wireless/orinoco/main.h                |   12 -
 drivers/net/wireless/orinoco/orinoco.h             |    2 +
 drivers/net/wireless/orinoco/scan.c                |    4 +-
 drivers/net/wireless/orinoco/wext.c                |  183 +--
 drivers/net/wireless/p54/p54pci.c                  |    2 +-
 drivers/net/wireless/rt2x00/rt2800pci.c            |   34 +-
 drivers/net/wireless/rt2x00/rt2800usb.c            |   26 +-
 drivers/net/wireless/rt2x00/rt2x00.h               |    3 +-
 drivers/net/wireless/rt2x00/rt2x00pci.c            |    3 +-
 drivers/net/wireless/rt2x00/rt2x00pci.h            |    3 +-
 drivers/net/wireless/rt2x00/rt2x00queue.c          |    3 +-
 drivers/net/wireless/rt2x00/rt2x00usb.c            |    3 +-
 drivers/net/wireless/rt2x00/rt2x00usb.h            |    3 +-
 drivers/net/wireless/rtl818x/Kconfig               |   88 +
 drivers/net/wireless/wl12xx/wl1251_main.c          |   63 +
 drivers/net/wireless/wl12xx/wl1251_reg.h           |    7 +
 drivers/net/wireless/wl12xx/wl1251_sdio.c          |   96 +-
 include/linux/ieee80211.h                          |    1 +
 include/linux/spi/wl12xx.h                         |    2 +
 include/net/cfg80211.h                             |    2 +-
 include/net/mac80211.h                             |   17 +-
 net/mac80211/agg-rx.c                              |   18 +-
 net/mac80211/agg-tx.c                              |    2 +-
 net/mac80211/cfg.c                                 |   12 +
 net/mac80211/driver-ops.h                          |    9 +
 net/mac80211/key.c                                 |    1 +
 net/mac80211/mlme.c                                |    5 +
 net/mac80211/sta_info.c                            |   13 +-
 net/mac80211/tx.c                                  |    7 +
 net/wireless/sme.c                                 |   16 +-
 127 files changed, 19021 insertions(+), 8935 deletions(-)
 create mode 100644 drivers/net/wireless/ath/ath9k/ar5008_initvals.h
 create mode 100644 drivers/net/wireless/ath/ath9k/ar5008_phy.c
 create mode 100644 drivers/net/wireless/ath/ath9k/ar9001_initvals.h
 create mode 100644 drivers/net/wireless/ath/ath9k/ar9002_calib.c
 create mode 100644 drivers/net/wireless/ath/ath9k/ar9002_hw.c
 rename drivers/net/wireless/ath/ath9k/{initvals.h => ar9002_initvals.h} (78%)
 create mode 100644 drivers/net/wireless/ath/ath9k/ar9002_mac.c
 create mode 100644 drivers/net/wireless/ath/ath9k/ar9002_phy.c
 create mode 100644 drivers/net/wireless/ath/ath9k/ar9002_phy.h
 create mode 100644 drivers/net/wireless/ath/ath9k/ar9003_calib.c
 create mode 100644 drivers/net/wireless/ath/ath9k/ar9003_eeprom.c
 create mode 100644 drivers/net/wireless/ath/ath9k/ar9003_eeprom.h
 create mode 100644 drivers/net/wireless/ath/ath9k/ar9003_hw.c
 create mode 100644 drivers/net/wireless/ath/ath9k/ar9003_initvals.h
 create mode 100644 drivers/net/wireless/ath/ath9k/ar9003_mac.c
 create mode 100644 drivers/net/wireless/ath/ath9k/ar9003_mac.h
 create mode 100644 drivers/net/wireless/ath/ath9k/ar9003_phy.c
 create mode 100644 drivers/net/wireless/ath/ath9k/ar9003_phy.h
 create mode 100644 drivers/net/wireless/ath/ath9k/hw-ops.h
 delete mode 100644 drivers/net/wireless/ath/ath9k/phy.c
 create mode 100644 drivers/net/wireless/iwlwifi/iwl-agn-debugfs.c
 create mode 100644 drivers/net/wireless/iwlwifi/iwl-agn-debugfs.h
 create mode 100644 drivers/net/wireless/rtl818x/Kconfig

Omnibus patch is available here:

	http://www.kernel.org/pub/linux/kernel/people/linville/wireless-next-2.6-2010-04-23.patch.bz2

-- 
John W. Linville		Someday the world will need a hero, and you
linville-2XuSBdqkA4R54TAoqtyWWQ@public.gmane.org			might be all we have.  Be ready.
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: eSwitch management
From: Anirban Chakraborty @ 2010-04-23 19:00 UTC (permalink / raw)
  To: Chris Wright
  Cc: Scott Feldman, David Miller, netdev@vger.kernel.org,
	Arnd Bergmann, Ameen Rahman, Amit Salecha, Rajesh Borundia
In-Reply-To: <20100423162307.GI30693@x200.localdomain>


On Apr 23, 2010, at 9:23 AM, Chris Wright wrote:

> * Anirban Chakraborty (anirban.chakraborty@qlogic.com) wrote:
>> 
>> On Apr 22, 2010, at 6:29 PM, Scott Feldman wrote:
>> 
>>> On 4/22/10 5:47 PM, "Scott Feldman" <scofeldm@cisco.com> wrote:
>>> 
>>>> On 4/22/10 4:16 PM, "Anirban Chakraborty" <anirban.chakraborty@qlogic.com>
>>>> wrote:
>>>> 
>>>>> I am following the discussions on iovnl patch closely. While it is going to
>>>>> take some time for iovnl patch to be reviewed and accepted, what would be the
>>>>> interim approach to manage the eswitch in NIC? We need to add support in
>>>>> qlcnic driver to configure the eswitch in our 10G NIC. Some of the things
>>>>> that
>>>>> we need to set to the switch are setting a port's VLAN, tx bandwidth etc. We
>>>>> would like to set these parameters for a bunch of ports at the start of the
>>>>> day and set it to the eswitch.
>>>> 
>>>> Are any of these settings covered in DCB?  (net/dcb/dcbnl.c).  Maybe you can
>>>> get a start there?  Not sure not knowing your device requirements.
>>> 
>>> Or maybe the RTM_SETLINK IFLA_VF_* ops in include/linux/if_link.h?  Those
>>> seem like what you're looking for.  I'm looking at moving iovnl here as well
>>> for port-profile.
>> 
>> It looks like ifla_vf_info does contain most of the data set. But if I use it, what NETLINK protocol family should I use in my driver to receive netlink messages? Do I need to create a private protocol family?
> 
> No, you don't need to use netlink in your driver.  You just need to fill
> in the relevant net_device_ops in your driver init.  Specifically:
> 
> *      SR-IOV management functions.
> * int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac);
> * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan, u8 qos);
> * int (*ndo_set_vf_tx_rate)(struct net_device *dev, int vf, int rate);
> * int (*ndo_get_vf_config)(struct net_device *dev,
> *                          int vf, struct ifla_vf_info *ivf);
> 
> These are all operating on a VF indexed internally w/in the driver, so it's
> a little cumbersome to use from userspace.

These are all intended for VFs and are configurable from PF. However, in our case, there are multiple physical NIC functions on a port which are configurable by the eswitch. So, what we are setting is essentially switch ports, rather than configuring any setting on the physical functions. If netlink doesn't fly, is sysfs going to work? What if we allocate a buffer and fill it up with user space tools, and the driver grabs it and does the configuration itself?

thanks,
Anirban



^ permalink raw reply

* RESPONSE NEEDED !
From: Dr Raymond Kuo Fung Chien @ 2010-04-23 16:50 UTC (permalink / raw)





Dear Friend,

I am Dr Raymond Kuo Fung CHIEN Executive Director and Chief Financial
Officer of the operations of the Hang Seng Bank Ltd.
Befor the U.S and Iraqi war our client Mr.Fayez A Mohammed a business
merchant made a fixed deposit of USD30 Million for 2Yrs where i was the
only one that knew about his deposits.

Upon maturity during the war in 2003,Fayez,his wife and only daugther died
in a bomb blast that hits His Resident.
Investigations showed that he didnt declear any next of kin.As a
foreigner,I want you to stand as the next of kin to claim the fund because
soon the fund will be claimed by my government if no one comes for it.

I have an attorney that will prepare all the documents to back you up as
the next of kin to Mr.Fayez A.Mohammed.Plz let me know your willingness so
that i can provide you with more details of this transaction.

contact me on email: drfungch111@yahoo.com.hk


1. Full name and Age

2. Occupation

3. Private/office phone number

4. Current residential address

Kind Regards,
Dr Raymond Kuo Fung CHIEN.


^ permalink raw reply

* [PATCH 2/2] gianfar: Fix potential oops during OF address translation
From: Anton Vorontsov @ 2010-04-23 17:12 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Sandeep Gopalpet, linuxppc-dev

gianfar driver may pass NULL pointer to the of_translate_address(),
which may lead to a kernel oops. Fix this by using of_iomap(), which
is also much simpler and shorter.

Signed-off-by: Anton Vorontsov <avorontsov@mvista.com>
---
 drivers/net/gianfar.c |    6 +-----
 1 files changed, 1 insertions(+), 5 deletions(-)

diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index 080d1ce..df49af3 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -549,12 +549,8 @@ static int gfar_parse_group(struct device_node *np,
 		struct gfar_private *priv, const char *model)
 {
 	u32 *queue_mask;
-	u64 addr, size;
-
-	addr = of_translate_address(np,
-			of_get_address(np, 0, &size, NULL));
-	priv->gfargrp[priv->num_grps].regs = ioremap(addr, size);
 
+	priv->gfargrp[priv->num_grps].regs = of_iomap(np, 0);
 	if (!priv->gfargrp[priv->num_grps].regs)
 		return -ENOMEM;
 
-- 
1.7.0.5

^ permalink raw reply related

* [PATCH 1/2] fsl_pq_mdio: Fix kernel oops during OF address translation
From: Anton Vorontsov @ 2010-04-23 17:12 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Sandeep Gopalpet, linuxppc-dev

Old P1020RDB device trees were not specifying tbipa address for
MDIO nodes, which is now causing this kernel oops:

 ...
 eth2: TX BD ring size for Q[6]: 256
 eth2: TX BD ring size for Q[7]: 256
 Unable to handle kernel paging request for data at address 0x00000000
 Faulting instruction address: 0xc0015504
 Oops: Kernel access of bad area, sig: 11 [#1]
 ...
 NIP [c0015504] memcpy+0x3c/0x9c
 LR [c000a9f8] __of_translate_address+0xfc/0x21c
 Call Trace:
 [df839e00] [c000a94c] __of_translate_address+0x50/0x21c (unreliable)
 [df839e50] [c01a33e8] get_gfar_tbipa+0xb0/0xe0
 ...

The old device trees are buggy, though having a dead ethernet is
better than a dead kernel, so fix the issue by using of_iomap().

Also, a somewhat similar issue exists in the probe() routine, though
there the oops is only a possibility. Nonetheless, fix it too.

Signed-off-by: Anton Vorontsov <avorontsov@mvista.com>
---
 drivers/net/fsl_pq_mdio.c |   20 ++++++++++++++------
 1 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/net/fsl_pq_mdio.c b/drivers/net/fsl_pq_mdio.c
index d5160ed..3acac5f 100644
--- a/drivers/net/fsl_pq_mdio.c
+++ b/drivers/net/fsl_pq_mdio.c
@@ -205,8 +205,6 @@ static int fsl_pq_mdio_find_free(struct mii_bus *new_bus)
 static u32 __iomem *get_gfar_tbipa(struct fsl_pq_mdio __iomem *regs, struct device_node *np)
 {
 	struct gfar __iomem *enet_regs;
-	u32 __iomem *ioremap_tbipa;
-	u64 addr, size;
 
 	/*
 	 * This is mildly evil, but so is our hardware for doing this.
@@ -220,9 +218,7 @@ static u32 __iomem *get_gfar_tbipa(struct fsl_pq_mdio __iomem *regs, struct devi
 		return &enet_regs->tbipa;
 	} else if (of_device_is_compatible(np, "fsl,etsec2-mdio") ||
 			of_device_is_compatible(np, "fsl,etsec2-tbi")) {
-		addr = of_translate_address(np, of_get_address(np, 1, &size, NULL));
-		ioremap_tbipa = ioremap(addr, size);
-		return ioremap_tbipa;
+		return of_iomap(np, 1);
 	} else
 		return NULL;
 }
@@ -279,6 +275,7 @@ static int fsl_pq_mdio_probe(struct of_device *ofdev,
 	u32 __iomem *tbipa;
 	struct mii_bus *new_bus;
 	int tbiaddr = -1;
+	const u32 *addrp;
 	u64 addr = 0, size = 0;
 	int err = 0;
 
@@ -297,8 +294,19 @@ static int fsl_pq_mdio_probe(struct of_device *ofdev,
 	new_bus->priv = priv;
 	fsl_pq_mdio_bus_name(new_bus->id, np);
 
+	addrp = of_get_address(np, 0, &size, NULL);
+	if (!addrp) {
+		err = -EINVAL;
+		goto err_free_bus;
+	}
+
 	/* Set the PHY base address */
-	addr = of_translate_address(np, of_get_address(np, 0, &size, NULL));
+	addr = of_translate_address(np, addrp);
+	if (addr == OF_BAD_ADDR) {
+		err = -EINVAL;
+		goto err_free_bus;
+	}
+
 	map = ioremap(addr, size);
 	if (!map) {
 		err = -ENOMEM;
-- 
1.7.0.5

^ permalink raw reply related

* Re: eSwitch management
From: Chris Wright @ 2010-04-23 16:23 UTC (permalink / raw)
  To: Anirban Chakraborty
  Cc: Scott Feldman, David Miller, netdev@vger.kernel.org,
	chrisw@redhat.com, Arnd Bergmann, Ameen Rahman, Amit Salecha,
	Rajesh Borundia
In-Reply-To: <DD92D5A8-1ECC-4440-BE81-ABDCC6847021@qlogic.com>

* Anirban Chakraborty (anirban.chakraborty@qlogic.com) wrote:
> 
> On Apr 22, 2010, at 6:29 PM, Scott Feldman wrote:
> 
> > On 4/22/10 5:47 PM, "Scott Feldman" <scofeldm@cisco.com> wrote:
> > 
> >> On 4/22/10 4:16 PM, "Anirban Chakraborty" <anirban.chakraborty@qlogic.com>
> >> wrote:
> >> 
> >>> I am following the discussions on iovnl patch closely. While it is going to
> >>> take some time for iovnl patch to be reviewed and accepted, what would be the
> >>> interim approach to manage the eswitch in NIC? We need to add support in
> >>> qlcnic driver to configure the eswitch in our 10G NIC. Some of the things
> >>> that
> >>> we need to set to the switch are setting a port's VLAN, tx bandwidth etc. We
> >>> would like to set these parameters for a bunch of ports at the start of the
> >>> day and set it to the eswitch.
> >> 
> >> Are any of these settings covered in DCB?  (net/dcb/dcbnl.c).  Maybe you can
> >> get a start there?  Not sure not knowing your device requirements.
> > 
> > Or maybe the RTM_SETLINK IFLA_VF_* ops in include/linux/if_link.h?  Those
> > seem like what you're looking for.  I'm looking at moving iovnl here as well
> > for port-profile.
> 
> It looks like ifla_vf_info does contain most of the data set. But if I use it, what NETLINK protocol family should I use in my driver to receive netlink messages? Do I need to create a private protocol family?

No, you don't need to use netlink in your driver.  You just need to fill
in the relevant net_device_ops in your driver init.  Specifically:

 *      SR-IOV management functions.
 * int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac);
 * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan, u8 qos);
 * int (*ndo_set_vf_tx_rate)(struct net_device *dev, int vf, int rate);
 * int (*ndo_get_vf_config)(struct net_device *dev,
 *                          int vf, struct ifla_vf_info *ivf);

These are all operating on a VF indexed internally w/in the driver, so it's
a little cumbersome to use from userspace.

thanks,
-chris

^ permalink raw reply

* Re: [PATCH] e100: Fix the TX workqueue race
From: Jeff Garzik @ 2010-04-23 16:20 UTC (permalink / raw)
  To: Alan Cox; +Cc: e1000-devel, netdev
In-Reply-To: <20100423143356.7092.45260.stgit@localhost.localdomain>

On 04/23/2010 10:34 AM, Alan Cox wrote:
> I'd assumed someone would have picked up on this and fixed it using rtnl_lock
> as was suggested but it seems to have fallen through the cracks ?
>
> Anyway this is I assume what was meant ?
>
> ---
>
> Nothing stops the workqueue being left to run in parallel with close or a
> few other operations. This causes double unmaps and the like.
>
> See kerneloops.org #1041230 for an example
>
> Signed-off-by: Alan Cox<alan@linux.intel.com>

Acked-by: Jeff Garzik <jgarzik@redhat.com>

Glad someone finally fixed this, it has bugged me for years...



------------------------------------------------------------------------------
_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel® Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply

* Re: [PATCH] NIU support for skb->rxhash
From: Tom Herbert @ 2010-04-23 15:32 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, netdev
In-Reply-To: <20100423.011456.48472321.davem@davemloft.net>

> I looked into implementing this and it doesn't work.  The
> problem is GRO wants to look into the packet very early
> and we want to batch GRO a set of packets into a big packet
> before shooting them over to a remote cpu.
>

Can you reconsider? :-)  The majority of our servers see packet loads
which don't allow for much batching (a lot of small RPC messages), so
for those GRO is mostly unnecessary overhead and mechanisms that
improve unbatched packet performance are compelling.  Also, if a
device already does LRO, I don't see that GRO could add a lot of value
anyway.

Tom

> This reminds me that we can start using ->rxhash as a quick
> mismatch check in the GRO flow matcher.
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply

* Re: [PATCH 1/7] Topcliff GbE: Add The Main code
From: Arnd Bergmann @ 2010-04-23 15:27 UTC (permalink / raw)
  To: Masayuki Ohtake; +Cc: NETDEV, Wang, Yong Y, Wang, Qi, Intel OTC, Andrew

On Friday 23 April 2010, Masayuki Ohtake wrote:
> From: Masayuki Ohtake <masa-korg@dsn.okisemi.com>
> 
> This patch adds the Main code of GbE driver for Topcliff.
> The GbE driver needs all patch[1/7 to 7/7].
> 
> Signed-off-by: Masayuki Ohtake <masa-korg@dsn.okisemi.com>

I already commented on the "Topcliff PHUB: Add The Packet Hub driver"
submission. Many of my comments there apply here as well, but
there are a few more things that you may want to address in
future submissions:

> +static int
> +pch_gbe_probe(struct pci_dev *pdev, const struct pci_device_id *pci_id);
> +static void pch_gbe_remove(struct pci_dev *pdev);
> +static int pch_gbe_suspend(struct pci_dev *pdev, pm_message_t state);
> +static int pch_gbe_resume(struct pci_dev *pdev);

Ideally, static functions are ordered such that the caller is
last, so you can drop all of the forward declarations like these.

> +/*!
> + * @ingroup PCI driver Layer
> + * @struct  pch_gbe_pcidev_id
> + * @brief   PCI Device ID Table
> + * @remarks
> + *  This is an instance of pci_device_id structure defined in linux/pci.h,
> + *  and holds information of the PCI devices that are supported by this
> driver.
> + */
> +static const struct pci_device_id pch_gbe_pcidev_id[3] = {
> + {.vendor = PCI_VENDOR_ID_INTEL,
> +  .device = PCI_DEVICE_ID_INTEL_IOH1_GBE,
> +  .subvendor = PCI_ANY_ID,
> +  .subdevice = PCI_ANY_ID,
> +  .class = (PCI_CLASS_NETWORK_ETHERNET << 8),
> +  .class_mask = (0xFFFF00)
> +  },
> + /* required last entry */
> + {0}
> +};

Your array size above is three, but you only define two members.
Better make the array automatically sized. Also, it's clearer to
use the PCI_DEVICE_CLASS() helper macro, e.g.

static const struct pci_device_id pch_gbe_pcidev_id[] = {
	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOH1_GBE) },
	{ 0 },
};

	Arnd

^ permalink raw reply

* Re: [PATCH 1/7] Topcliff GbE: Add The Main code [1/3]
From: Stephen Hemminger @ 2010-04-23 15:26 UTC (permalink / raw)
  To: Masayuki Ohtake; +Cc: NETDEV, Wang, Yong Y, Wang, Qi, Intel OTC, Andrew
In-Reply-To: <002d01cae2dd$1f6e42d0$66f8800a@maildom.okisemi.com>

On Fri, 23 Apr 2010 20:56:25 +0900
"Masayuki Ohtake" <masa-korg@dsn.okisemi.com> wrote:

Even though the patch was sent as an attachment, long lines were
wrapped.

Do you want this to go directly to kernel, or do you want help
fixing coding issues by submitting to staging tree?

The code uses a comment style that is kind of like the existing
docbook comment style; why not convert it to use the official
docbook style for examples look at other kernel code:
/**
 *	dev_alloc_skb - allocate an skbuff for receiving
 *	@length: length to allocate
 *
 *	Allocate a new &sk_buff and assign it a usage count of one. The
 *	buffer has unspecified headroom built in. Users should allocate
 *	the headroom they think they need without accounting for the
 *	built in space. The built in space is used for optimisations.
 *
 *	%NULL is returned if there is no free memory. Although this function
 *	allocates memory it can be called from an interrupt.
 */

The code is also indented with a non-standard indentation format.
Please read Documentation/CodingStyle.  Indentation is supposed
to be 4 characters (and using tabs of 8 characters).

The PCI device table could be changed to:

static DEFINE_PCI_DEVICE_TABLE(pch_gbe_pcidev_id) = {
    { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOH1_GBE) },
    { 0 }
};
Also PCI_DEVICE_ID_INTEL_IOH1_GBE is not defined anywhere I can see.

^ permalink raw reply

* Re: [PATCH 1/4] Fix acquiring socket lock before reading RTNETLINK response
From: Dan Smith @ 2010-04-23 15:24 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg; +Cc: netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1272034539-19899-2-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>

DS> Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>

I went too deep with my send-email command line.  This patch isn't
related to the others, so please disregard.

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply

* [PATCH] e100: Fix the TX workqueue race
From: Alan Cox @ 2010-04-23 14:34 UTC (permalink / raw)
  To: netdev, e1000-devel

I'd assumed someone would have picked up on this and fixed it using rtnl_lock
as was suggested but it seems to have fallen through the cracks ?

Anyway this is I assume what was meant ?

---

Nothing stops the workqueue being left to run in parallel with close or a
few other operations. This causes double unmaps and the like.

See kerneloops.org #1041230 for an example

Signed-off-by: Alan Cox <alan@linux.intel.com>
---

 drivers/net/e100.c |    9 +++++++--
 1 files changed, 7 insertions(+), 2 deletions(-)


diff --git a/drivers/net/e100.c b/drivers/net/e100.c
index 3e8d000..859e833 100644
--- a/drivers/net/e100.c
+++ b/drivers/net/e100.c
@@ -2280,8 +2280,13 @@ static void e100_tx_timeout_task(struct work_struct *work)
 
 	netif_printk(nic, tx_err, KERN_DEBUG, nic->netdev,
 		     "scb.status=0x%02X\n", ioread8(&nic->csr->scb.status));
-	e100_down(netdev_priv(netdev));
-	e100_up(netdev_priv(netdev));
+
+	rtnl_lock();
+	if (netif_running(dev)) {
+		e100_down(netdev_priv(netdev));
+		e100_up(netdev_priv(netdev));
+	}
+	rtnl_unlock();
 }
 
 static int e100_loopback_test(struct nic *nic, enum loopback loopback_mode)


^ permalink raw reply related

* Re: IPv6: race condition in __ipv6_ifa_notify() and dst_free() ?
From: Herbert Xu @ 2010-04-23 15:05 UTC (permalink / raw)
  To: David Miller; +Cc: jbohac, yoshfuji, netdev, shemminger
In-Reply-To: <20100423021000.GA21777@gondor.apana.org.au>

On Fri, Apr 23, 2010 at 10:10:00AM +0800, Herbert Xu wrote:
> 
> I will post an updated patch later today to deal with that.

Just got back from a business trip.

This stuff is more broken than I thought.  For example, we perform
a number of actions when DAD succeeds, e.g., joining an anycast
group.  However, this is not synchronised with respect to address
deletion at all, so if DAD succeeds just as someone deletes the
address, you can easily get stuck on that anycast group.

I will try to untangle this mess tomorrow.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu 许志壬 <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH] Topcliff PHUB: Add The Packet Hub driver [1/2]
From: Arnd Bergmann @ 2010-04-23 14:57 UTC (permalink / raw)
  To: Masayuki Ohtake; +Cc: NETDEV, Wang, Yong Y, Wang, Qi, andrew.chih.howe.khor
In-Reply-To: <000501cae2eb$de97e950$66f8800a@maildom.okisemi.com>

On Friday 23 April 2010, Masayuki Ohtake wrote:
>From: Masayuki Ohtake <masa-korg@dsn.okisemi.com>
>
>This patch adds the Packet Hub driver for Topcliff.
>Patch created against 2.6.33.1

Thank you for your submission. Since this is the first time
you posted this patch, there is naturally a lot that needs
to be improved on before the driver can be merged. I'll
go through this in detail so you can improve it for the
next submission.

First of all, you need to decide where you want the code
to go. The quickest path is to get it into drivers/staging,
which can accept code that does not yet follow the kernel
coding style or may still need changes to its user interface,
see http://www.kroah.com/log/linux/linux-staging-update.html

If you want to go straight into the regular supported driver
directory, that is also ok but requires more upfront work on
coding style and addressing review comments like the ones
I make below. Addressing the comments typically means that
you either change your code as suggested or that you argue
that your version is actually correct.

The patch description above needs to tell the reader what
the driver actually does and what the hardware is for.

>Signed-off-by: Masayuki Ohtake <masa-korg@dsn.okisemi.com>
>---
> drivers/char/Kconfig                                 |    7++
> drivers/char/Makefile                                |    2
> drivers/char/pch_phub/Makefile                       |    9
> drivers/char/pch_phub/pch_common.h                   |    147
> drivers/char/pch_phub/pch_debug.h                    |    58
> drivers/char/pch_phub/pch_phub.c                     |    375
> drivers/char/pch_phub/pch_phub.h                     |    195
> drivers/char/pch_phub/pch_phub_hal.c                 |    544
> drivers/char/pch_phub/pch_phub_hal.h                 |    124
> drivers/char/pch_phub/pch_phub_pci.c                 |    499
>+++++++++++++++++++++++++++++++ 10 files changed, 1960 insertions(+)

You have submitted this driver to the netdev list, but put
the code into drivers/char. If this is really a network driver,
it should probably go into drivers/net, otherwise it needs to be
reviewed on the main linux-kernel mailing list.

If you want it to be applied as a staging driver first, change
the code location to drivers/staging first.

>diff -urN linux-2.6.33.1/drivers/char/Kconfig
>topcliff-2.6.33.1/drivers/char/Kconfig
>--- linux-2.6.33.1/drivers/char/Kconfig 2010-03-16 01:09:39.000000000 +0900
>+++ topcliff-2.6.33.1/drivers/char/Kconfig 2010-04-14 18:19:10.000000000
>+0900
>@@ -4,6 +4,13 @@

Evidently, your email client breaks line-wrapping. This means that it's
not possible to apply the patch. Please see Documentation/email-clients.txt
on how to fix this.

To make sure this does not happen again, you can send the patch to yourself
and try to apply it from the email as a test before you send it to a
mailing list.

> menu "Character devices"
>
>+config PCH_PHUB
>+        tristate "PCH PHUB"
>+        depends on PCI
>+        help
>+          If you say yes to this option, support will be included for the
>+          PCH Packet Hub Host controller.
>+
> config VT
>  bool "Virtual terminal" if EMBEDDED
>  depends on !S390

This description also should be more helpful. This could be the same
text that you will put in the patch description above.

>diff -urN linux-2.6.33.1/drivers/char/pch_phub/pch_common.h
>topcliff-2.6.33.1/drivers/char/pch_phub/pch_common.h
>--- linux-2.6.33.1/drivers/char/pch_phub/pch_common.h 1970-01-01
>09:00:00.000000000 +0900
>+++ topcliff-2.6.33.1/drivers/char/pch_phub/pch_common.h 2010-04-14
>15:29:48.000000000 +0900
>@@ -0,0 +1,147 @@
>+/*!
>+ * @file pch_common.h
>+ * @brief Provides the macro definitions used by all files.
>+ * @version 1.0.0.0
>+ * @section

You seem to use a tool for processing the comments into documentation,
which is good. However, the syntax you use is incompatible with the
one used in the kernel and should be changed to the 'kerneldoc'
format, see Documentation/kernel-doc-nano-HOWTO.txt.

>+/*! @ingroup Global
>+@def      PCH_WRITE8
>+@brief   Macro for writing 8 bit data to an io/mem address
>+*/
>+#define PCH_WRITE8(val, addr)   iowrite8((val), (void __iomem *)(addr))
>+/*! @ingroup Global
>+@def      PCH_LOG
>+@brief   Macro for writing 16 bit data to an io/mem address
>+*/
>+#define PCH_WRITE16(val, addr)  iowrite16((val), (void __iomem *)(addr))
>+/*! @ingroup Global
>+@def      PCH_LOG
>+@brief   Macro for writing 32 bit data to an io/mem address

In general, wrapping kernel functions in a driver specific macro that
does not do anything else is discouraged. It's best to delete these
macros and change the code to use the underlying interfaces directly.

>+#ifndef __PCH_DEBUG_H__
>+#define __PCH_DEBUG_H__
>+
>+#ifdef MODULE
>+#define PCH_LOG(level, fmt, args...) printk(level "%s:" fmt "\n",\
>+       THIS_MODULE->name, ##args)
>+#else
>+#define PCH_LOG(level, fmt, args...) printk(level "%s:" fmt "\n" ,\
>+        __FILE__, ##args)
>+#endif
>+
>+
>+#ifdef DEBUG
>+ #define PCH_DEBUG(fmt, args...) PCH_LOG(KERN_DEBUG, fmt, ##args)
>+#else
>+ #define PCH_DEBUG(fmt, args...)
>+#endif
>+
>+#ifdef PCH_TRACE_ENABLED
>+ #define PCH_TRACE PCH_DEBUG
>+#else
>+ #define PCH_TRACE(fmt, args...)
>+#endif
>+
>+#define PCH_TRACE_ENTER PCH_TRACE("Enter %s", __func__)
>+#define PCH_TRACE_EXIT  PCH_TRACE("Exit %s", __func__)

For these macros, we already have existing interfaces in the kernel,
you should remove yours and use dev_dbg, dev_info, pr_debug etc.

The tracing functions can probably just be removed here. If you
feel that you need tracing, please take a look at the kernel
tracing subsystem. There is an excellent series of articles
about tracing at http://lwn.net/Articles/383362/.

>+/**
>+ * file_operations structure initialization
>+ */
>+const struct file_operations pch_phub_fops = {
>+ .owner = THIS_MODULE,
>+ .open = pch_phub_open,
>+ .release = pch_phub_release,
>+ .ioctl = pch_phub_ioctl,
>+};

New code should use the 'unlocked_ioctl' callback instead of 'ioctl'.

The whitespace in this patch is damaged: indentation of code should
be done with tabs instead of spaces. It's not clear if the code is
written like this, or it was damaged by your email client.

If the problem is part of your actual code, have a look at
Documentation/CodingStyle for what it should look like.

>+/*function implementations*/
>+
>+/*! @ingroup PHUB_InterfaceLayerAPI
>+  @fn  int pch_phub_open( struct inode *inode,struct file *file)
>+  @remarks  Implements the Initializing and opening of the Packet Hub
>module.
>+  @param  inode  [@ref INOUT] Contains the reference of the inode
>+         structure
>+  @param  file  [@ref INOUT] Contains the reference of the file structure
>+  @retval returnvalue [@ref OUT] contains the result for the concerned
>+         attempt.
>+  The result would generally comprise of success code
>+  or failure code. The failure code will indicate reason for
>+  failure.
>+  @see
>+  EBUSY
>+  */
>+int pch_phub_open(struct inode *inode, struct file *file)
>+{

Since this is not an exported interface, the function should
probably be marked 'static'.

>+ int ret_value = PCH_PHUB_SUCCESS;
>+ struct pch_phub_reqt *p_pch_phub_reqt;
>+ unsigned long addr_offset;
>+ unsigned long data;
>+ unsigned long mask;
>+
>+ do {
>+  if (pch_phub_suspended == true) {
>+   PCH_LOG(KERN_ERR, "pch_phub_ioctl : "
>+    "suspend initiated returning =%d\n",
>+    PCH_PHUB_FAIL);
>+   ret_value = PCH_PHUB_FAIL;
>+   break;
>+  }

Using the do { ... } while (0) construct in this way is not wrong,
but unconventional. The way this is normally done in Linux is to
have a goto target at the end.

The macros PCH_PHUB_SUCCESS and PCH_PHUB_FAIL should probably
be removed, because they are not standard error codes. By convention,
you should return 0 for success in ioctl or one of the errno.h
values for failure, as you do below.

>+  p_pch_phub_reqt = (struct pch_phub_reqt *)arg;
>+  ret_value =
>+   copy_from_user((void *)&addr_offset,
>+    (void *)&p_pch_phub_reqt->addr_offset,
>+    sizeof(addr_offset));
>+  if (ret_value) {
>+   PCH_LOG(KERN_ERR, "pch_phub_ioctl : "
>+    "copy_from_user fail returning =%d\n",
>+    -EFAULT);
>+   ret_value = -EFAULT;
>+   break;
>+  }

It is much easier to use get_user() here than copy_from_user.

The definition of struct pch_phub_reqt is problematic because
it contains members of type 'unsigned long'. This means that
a 32 bit user process uses a different data structure than a 
64 bit kernel.

Ideally, you only pass a single integer as an ioctl argument.
There are cases where it needs to be a data structure, but
if that happens, you should only use member types like __u32
or __u64, not long or pointer.

>+  switch (cmd) {
>+  case IOCTL_PHUB_READ_REG:
>+   {
>+
>+    pch_phub_read_reg(addr_offset, &data);
>+    PCH_DEBUG("pch_phub_ioctl  : Invoked "
>+     "pch_phub_read_reg successfully\n");
>+
>+    ret_value =
>+        copy_to_user((void *)&p_pch_phub_reqt->
>+       data, (void *)&data,
>+       sizeof(data));
>+    if (ret_value) {
>+     PCH_LOG(KERN_ERR, "pch_phub_ioctl : "
>+     "copy_to_user fail returning =%d\n",
>+      -EFAULT);
>+     ret_value = -EFAULT;
>+     break;
>+    }
>+    break;
>+   }
>+
>+  case IOCTL_PHUB_WRITE_REG:
>+   {
>+
>+    ret_value =
>+        copy_from_user((void *)&data,
>+         (void *)&p_pch_phub_reqt->
>+         data, sizeof(data));
>+    if (ret_value) {
>+     PCH_LOG(KERN_ERR, "pch_phub_ioctl : "
>+     "copy_from_user fail returning =%d\n",
>+      -EFAULT);
>+     ret_value = -EFAULT;
>+     break;
>+    }
>+    pch_phub_write_reg(addr_offset, data);
>+    PCH_DEBUG("pch_phub_ioctl  : Invoked "
>+     "pch_phub_write_reg successfully\n");
>+    break;
>+   }

My feeling is that this ioctl interface is too
low-level in general. You only export access to specific
registers, not to functionality exposed by them.
The best kernel interfaces are defined in a way that
is independent of the underlying hardware and
convert generic commands into device specific commands.

If you really want to allow direct register access,
a simpler way would be to map the memory into the user
address space using the mmap() operation and not
provide any ioctl commands.

I don't see any range checking on the addr_offset
argument, which means that a malicious user can use this
function to access not only your device, but any data
in the kernel address space.

Note that your open count does not protect the
hardware from concurrent access, because a file
descriptor can be shared by multiple user threads.
You can probably safely drop that count.

>+/*! @ingroup PHUB_InterfaceLayer
>+  @def IOCTL_PHUB_READ_REG
>+  @brief Outlines the read register function signature.
>+  */
>+#define IOCTL_PHUB_READ_REG (_IOW(PHUB_IOCTL_MAGIC, 1, unsigned long))

If I read your code correctly, you actually pass a struct pch_phub_reqt
argument, not an unsigned long argument, so this definition should be
changed accordingly. The same applies to the other ioctl commands.

Your patch continues in a second email, which is not how you normally
split submissions. When there is a logical separation between parts
of the driver, make multiple patches, each with a separate description
of what the patch does.

In case of this driver, that does not seem necessary. In fact, it seems
to me like it is simple enough to become a single source file, which
would simplify it even more because you no longer need header files
defining the interface between parts of the driver.

	Arnd

^ permalink raw reply

* [PATCH 4/4] C/R: inet4 and inet6 unicast routes
From: Dan Smith @ 2010-04-23 14:55 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg; +Cc: netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1272034539-19899-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>

This patch adds support for checkpointing and restoring route information.
It keeps enough information to restore basic routes at the level of detail
of /proc/net/route.  It uses RTNETLINK to extract the information during
checkpoint and also to insert it back during restore.  This gives us a
nice layer of isolation between us and the various "fib" implementations.

Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 include/linux/checkpoint_hdr.h |   31 +++
 net/checkpoint_dev.c           |  412 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 442 insertions(+), 1 deletions(-)

diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 633c9b0..187d706 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -23,6 +23,7 @@
 #include <sys/un.h>
 #include <netinet/in.h>
 #endif
+#include <linux/if.h>
 
 /*
  * /usr/include/linux/security.h is not exported to userspace, so
@@ -783,6 +784,7 @@ struct ckpt_hdr_file_socket {
 struct ckpt_hdr_netns {
 	struct ckpt_hdr h;
 	__s32 this_ref;
+	__u32 routes;
 } __attribute__((aligned(8)));
 
 enum ckpt_netdev_types {
@@ -837,6 +839,35 @@ struct ckpt_netdev_addr {
 	} __attribute__((aligned(8)));
 } __attribute__((aligned(8)));
 
+enum ckpt_route_types {
+	CKPT_ROUTE_IPV4,
+	CKPT_ROUTE_IPV6,
+	CKPT_ROUTE_MAX
+};
+
+#define CKPT_ROUTE_FLAG_GW 1
+
+struct ckpt_route {
+	__u16 type;
+	__u16 flags;
+
+	union {
+		struct {
+			__be32 inet4_len;          /* mask length (bits) */
+			__u32  inet4_met;          /* metric             */
+			__be32 inet4_dst;          /* route address      */
+			__be32 inet4_gwy;          /* gateway address    */
+		};
+		struct {
+			__u32 inet6_len;           /* mask length (bits) */
+			__u32 inet6_met;           /* metric             */
+			struct in6_addr inet6_dst; /* route address      */
+			struct in6_addr inet6_gwy; /* gateway address    */
+		};
+	} __attribute__((aligned(8)));
+	char dev[IFNAMSIZ+1];
+} __attribute__((aligned(8)));
+
 struct ckpt_hdr_eventpoll_items {
 	struct ckpt_hdr h;
 	__s32  epfile_objref;
diff --git a/net/checkpoint_dev.c b/net/checkpoint_dev.c
index df8b16a..b34d1f2 100644
--- a/net/checkpoint_dev.c
+++ b/net/checkpoint_dev.c
@@ -17,9 +17,11 @@
 #include <linux/checkpoint_hdr.h>
 #include <linux/deferqueue.h>
 #include <linux/module.h>
+#include <linux/fib_rules.h>
 
 #include <net/net_namespace.h>
 #include <net/sch_generic.h>
+#include <net/ipv6.h>
 
 struct veth_newlink {
 	char *peer;
@@ -107,6 +109,22 @@ static int __kern_dev_ioctl(struct net *net, unsigned int cmd, void *arg)
 	return ret;
 }
 
+static void debug_route(struct ckpt_route *route)
+{
+	if (route->type == CKPT_ROUTE_IPV4)
+		ckpt_debug("inet4 route %pI4/%i gw %pI4 metric %i dev %s\n",
+			   &route->inet4_dst, route->inet4_len,
+			   &route->inet4_gwy, route->inet4_met,
+			   route->dev);
+	else if (route->type == CKPT_ROUTE_IPV6)
+		ckpt_debug("inet6 route %pI6/%i gw %pI6 metric %i dev %s\n",
+			   &route->inet6_dst, route->inet6_len,
+			   &route->inet6_gwy, route->inet6_met,
+			   route->dev);
+	else
+		ckpt_debug("unknown route type %i\n", route->type);
+}
+
 static struct socket *rtnl_open(struct net *net)
 {
 	struct socket *sock;
@@ -313,11 +331,236 @@ int checkpoint_netdev(struct ckpt_ctx *ctx, void *ptr)
 	return ret;
 }
 
+static int rtnl_dump_routes(struct socket *rtnl, int family)
+{
+	struct sk_buff *skb;
+	struct rtmsg *rtm;
+	int flags = NLM_F_ROOT | NLM_F_REQUEST;
+	struct msghdr msg;
+	struct kvec kvec;
+	struct nlmsghdr *nlh;
+	int ret = -ENOMEM;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	nlh = nlmsg_put(skb, 0, 0, RTM_GETROUTE, sizeof(*rtm), flags);
+	if (!nlh)
+		goto out;
+
+	rtm = nlmsg_data(nlh);
+	memset(rtm, 0, sizeof(*rtm));
+	rtm->rtm_family = family;
+
+	nlmsg_end(skb, nlh);
+
+	memset(&msg, 0, sizeof(msg));
+	kvec.iov_len = skb->len;
+	kvec.iov_base = skb->head;
+
+	ret = kernel_sendmsg(rtnl, &msg, &kvec, 1, kvec.iov_len);
+	if ((ret >= 0) && (ret != skb->len))
+		ret = -EIO;
+ out:
+	kfree_skb(skb);
+	return ret;
+}
+
+static int rtnl_process_inet4_route(struct net *net,
+				    struct rtmsg *rtm,
+				    struct nlattr **tb,
+				    struct ckpt_route *route)
+{
+	if (rtm->rtm_type != RTN_UNICAST)
+		return 0; /* skip non-unicast routes */
+
+	route->type = CKPT_ROUTE_IPV4;
+	route->inet4_len = rtm->rtm_dst_len;
+
+	if (tb[RTA_DST])
+		route->inet4_dst = htonl(nla_get_u32(tb[RTA_DST]));
+	if (tb[RTA_GATEWAY]) {
+		route->flags |= CKPT_ROUTE_FLAG_GW;
+		route->inet4_gwy = htonl(nla_get_u32(tb[RTA_GATEWAY]));
+	}
+	if (tb[RTA_PRIORITY])
+		route->inet4_met = nla_get_u32(tb[RTA_PRIORITY]);
+
+	if (tb[RTA_OIF]) {
+		struct net_device *dev;
+
+		dev = dev_get_by_index(net, nla_get_u32(tb[RTA_OIF]));
+		if (dev) {
+			strncpy(route->dev, dev->name, IFNAMSIZ);
+			dev_put(dev);
+		}
+	}
+
+	debug_route(route);
+
+	return 1; /* save this route */
+}
+
+static int rtnl_process_inet6_route(struct net *net,
+				    struct rtmsg *rtm,
+				    struct nlattr **tb,
+				    struct ckpt_route *route)
+{
+	if (rtm->rtm_type != RTN_UNICAST)
+		return 0; /* skip non-unicast routes */
+
+	route->type = CKPT_ROUTE_IPV6;
+	route->inet6_len = rtm->rtm_dst_len;
+
+	if (tb[RTA_DST])
+		ipv6_addr_copy(&route->inet6_dst, nla_data(tb[RTA_DST]));
+	if (tb[RTA_GATEWAY]) {
+		route->flags |= CKPT_ROUTE_FLAG_GW;
+		ipv6_addr_copy(&route->inet6_gwy, nla_data(tb[RTA_GATEWAY]));
+	}
+	if (tb[RTA_PRIORITY])
+		route->inet6_met = nla_get_u32(tb[RTA_PRIORITY]);
+
+	if (tb[RTA_OIF]) {
+		struct net_device *dev;
+
+		dev = dev_get_by_index(net, nla_get_u32(tb[RTA_OIF]));
+		if (dev) {
+			strncpy(route->dev, dev->name, IFNAMSIZ);
+			dev_put(dev);
+		}
+	}
+
+	debug_route(route);
+
+	return 1;
+}
+
+static int rtnl_process_routes(struct net *net,
+			       struct nlmsghdr *nlh, int len,
+			       struct ckpt_route *routes,
+			       int idx, int max)
+{
+	struct nlmsghdr *i;
+
+	for (i = nlh; NLMSG_OK(i, len); i = NLMSG_NEXT(i, len)) {
+		struct ckpt_route *route = &routes[idx];
+		struct rtmsg *rtm = NLMSG_DATA(i);
+		struct nlattr *tb[FRA_MAX+1];
+		int ret;
+
+		if (idx >= max)
+			return -E2BIG;
+
+		if (i->nlmsg_type == NLMSG_DONE)
+			break;
+		else if (nlh->nlmsg_type != RTM_NEWROUTE) {
+			struct nlmsgerr *errmsg = nlmsg_data(nlh);
+			return errmsg->error;
+		}
+
+		ret = nlmsg_parse(i, sizeof(*rtm), tb, FRA_MAX, NULL);
+		if (ret < 0)
+			return ret;
+
+		memset(route, 0, sizeof(*route));
+
+		if (rtm->rtm_family == AF_INET)
+			ret = rtnl_process_inet4_route(net, rtm, tb, route);
+		else if (rtm->rtm_family == AF_INET6)
+			ret = rtnl_process_inet6_route(net, rtm, tb, route);
+		else
+			ret = 0; /* skip */
+		if (ret < 0)
+			return ret;
+		else if (ret)
+			idx += 1;
+	}
+
+	return idx;
+}
+
+static int rtnl_get_routes(struct net *net, int family,
+			   struct ckpt_route *routes, int idx, int max)
+{
+	int ret;
+	long timeo = MAX_SCHEDULE_TIMEOUT;
+	struct nlmsghdr *nlh;
+	struct sk_buff *skb = NULL;
+	struct socket *rtnl = NULL;
+
+	rtnl = rtnl_open(net);
+	if (IS_ERR(rtnl))
+		return PTR_ERR(rtnl);
+
+	ret = rtnl_dump_routes(rtnl, family);
+	if (ret < 0)
+		goto out;
+
+	lock_sock(rtnl->sk);
+	ret = sk_wait_data(rtnl->sk, &timeo);
+	if (ret)
+		skb = skb_dequeue(&rtnl->sk->sk_receive_queue);
+	release_sock(rtnl->sk);
+	if (!skb) {
+		ret = -EIO;
+		goto out;
+	}
+
+	nlh = nlmsg_hdr(skb);
+	if (!nlh) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = rtnl_process_routes(net, nlh, skb->len, routes, idx, max);
+ out:
+	rtnl_close(rtnl);
+	kfree_skb(skb);
+	return ret;
+}
+
+int checkpoint_netns_routes(struct ckpt_ctx *ctx, struct net *net,
+			    struct ckpt_route **_routes)
+{
+	struct ckpt_route *routes = NULL;
+	int max = 32;
+	int idx;
+	int families[] = {AF_INET, AF_INET6, 0};
+	int family;
+ retry:
+	idx = 0;
+	kfree(routes);
+	routes = kmalloc(max * sizeof(*routes), GFP_KERNEL);
+	if (!routes)
+		return -ENOMEM;
+
+	for (family = 0; families[family]; family++) {
+		idx = rtnl_get_routes(net, families[family], routes, idx, max);
+		if (idx == -E2BIG) {
+			max *= 2;
+			goto retry;
+		} else if (idx < 0)
+			break;
+	}
+
+	if (idx < 0) {
+		kfree(routes);
+		routes = NULL;
+		ckpt_err(ctx, idx, "error saving routes\n");
+	}
+	*_routes = routes;
+
+	return idx;
+}
+
 int checkpoint_netns(struct ckpt_ctx *ctx, void *ptr)
 {
 	struct net *net = ptr;
 	struct net_device *dev;
 	struct ckpt_hdr_netns *h;
+	struct ckpt_route *routes = NULL;
 	int ret;
 
 	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_NET_NS);
@@ -327,10 +570,19 @@ int checkpoint_netns(struct ckpt_ctx *ctx, void *ptr)
 	h->this_ref = ckpt_obj_lookup(ctx, net, CKPT_OBJ_NET_NS);
 	BUG_ON(h->this_ref <= 0);
 
+	ret = checkpoint_netns_routes(ctx, net, &routes);
+	if (ret < 0)
+		goto out;
+	h->routes = ret;
+
 	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
 	if (ret < 0)
 		goto out;
 
+	ret = ckpt_write_buffer(ctx, routes, h->routes * sizeof(*routes));
+	if (ret < 0)
+		goto out;
+
 	for_each_netdev(net, dev) {
 		if (dev->netdev_ops->ndo_checkpoint)
 			ret = checkpoint_obj(ctx, dev, CKPT_OBJ_NETDEV);
@@ -347,6 +599,7 @@ int checkpoint_netns(struct ckpt_ctx *ctx, void *ptr)
 	}
  out:
 	ckpt_hdr_put(ctx, h);
+	kfree(routes);
 
 	return ret;
 }
@@ -862,10 +1115,145 @@ void *restore_netdev(struct ckpt_ctx *ctx)
 	return dev;
 }
 
+static int rtnl_restore_route(struct net *net, struct ckpt_route *route)
+{
+	struct sk_buff *skb;
+	struct rtmsg *rtm;
+	int flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK;
+	struct nlmsghdr *nlh;
+	int ret = 0;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	nlh = nlmsg_put(skb, 0, 0, RTM_NEWROUTE, sizeof(*rtm), flags);
+	if (!nlh) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	rtm = nlmsg_data(nlh);
+	memset(rtm, 0, sizeof(*rtm));
+
+	rtm->rtm_table = RT_TABLE_MAIN;
+	rtm->rtm_protocol = RTPROT_BOOT;
+	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
+	rtm->rtm_type = RTN_UNICAST;
+
+	if (route->dev[0]) {
+		struct net_device *dev;
+
+		dev = dev_get_by_name(net, route->dev);
+		if (!dev) {
+			ckpt_debug("unable to find dev %s for route\n",
+				   route->dev);
+			ret = -EINVAL;
+			goto out;
+		}
+		nla_put_u32(skb, RTA_OIF, dev->ifindex);
+		dev_put(dev);
+	}
+
+	if (route->type == CKPT_ROUTE_IPV4) {
+		rtm->rtm_family = AF_INET;
+		rtm->rtm_dst_len = route->inet4_len;
+
+		nla_put_u32(skb, RTA_DST, route->inet4_dst);
+		if (route->flags & CKPT_ROUTE_FLAG_GW)
+			nla_put_u32(skb, RTA_GATEWAY, route->inet4_gwy);
+		nla_put_u32(skb, RTA_PRIORITY, route->inet4_met);
+	} else if (route->type == CKPT_ROUTE_IPV6) {
+		int len = sizeof(route->inet6_dst);
+
+		if (ipv6_addr_scope(&route->inet6_dst))
+			goto out; /* Skip non-global scope routes */
+
+		rtm->rtm_family = AF_INET6;
+		rtm->rtm_dst_len = route->inet6_len;
+
+		nla_put(skb, RTA_DST, len, &route->inet6_dst);
+		if (route->flags & CKPT_ROUTE_FLAG_GW)
+			nla_put(skb, RTA_GATEWAY, len, &route->inet6_gwy);
+		nla_put_u32(skb, RTA_PRIORITY, route->inet6_met);
+	} else {
+		ckpt_debug("unsupported route type %i\n", route->type);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	nlmsg_end(skb, nlh);
+
+	debug_route(route);
+
+	ret = rtnl_do(net, skb);
+ out:
+	kfree_skb(skb);
+	return ret;
+}
+
+static int restore_routes(struct net *net, struct ckpt_route *routes, int count)
+{
+	int i;
+	int ret = 0;
+
+	for (i = 0; i < count; i++) {
+		struct ckpt_route *route = &routes[i];
+
+		ret = rtnl_restore_route(net, route);
+		if (ret == -EEXIST)
+			/* Some routes have been implied by device addresses */
+			continue;
+		else if (ret < 0)
+			break;
+	}
+
+	return ret;
+}
+
+struct dq_routes {
+	struct ckpt_ctx *ctx;
+	struct net *net;
+	struct ckpt_route *routes;
+	int count;
+};
+
+static int deferred_restore_routes(void *data)
+{
+	struct dq_routes *dq = data;
+	int ret;
+
+	ret = restore_routes(dq->net, dq->routes, dq->count);
+	if (ret < 0)
+		ckpt_err(dq->ctx, ret, "failed to restore routes\n");
+
+	kfree(dq->routes);
+
+	return ret;
+}
+
+static int defer_restore_routes(struct ckpt_ctx *ctx,
+				struct net *net,
+				struct ckpt_route *routes,
+				int count)
+{
+	struct dq_routes dq;
+
+	dq.ctx = ctx;
+	dq.net = net;
+	dq.routes = routes;
+	dq.count = count;
+
+	return deferqueue_add(ctx->files_deferq, &dq, sizeof(dq),
+			      deferred_restore_routes, NULL);
+}
+
 void *restore_netns(struct ckpt_ctx *ctx)
 {
 	struct ckpt_hdr_netns *h;
 	struct net *net;
+	struct ckpt_route *routes = NULL;
+	int ret;
 
 	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_NET_NS);
 	if (IS_ERR(h)) {
@@ -873,12 +1261,34 @@ void *restore_netns(struct ckpt_ctx *ctx)
 		return h;
 	}
 
+	ret = ckpt_read_payload(ctx, (void **)&routes,
+				h->routes * sizeof(*routes), CKPT_HDR_BUFFER);
+	if (ret < 0) {
+		ckpt_err(ctx, ret, "Unable to read routes buffer\n");
+		net = ERR_PTR(ret);
+		goto out;
+	}
+
 	if (h->this_ref != 0) {
 		net = copy_net_ns(CLONE_NEWNET, current->nsproxy->net_ns);
 		if (IS_ERR(net))
 			goto out;
-	} else
+
+		ret = defer_restore_routes(ctx, net, routes, h->routes);
+		if (ret < 0) {
+			kfree(routes);
+			put_net(net);
+			net = ERR_PTR(ret);
+		}
+	} else {
+		if (h->routes) {
+			net = ERR_PTR(-EINVAL);
+			ckpt_err(ctx, -EINVAL,
+				 "Parent netns claims to have routes\n");
+			goto out;
+		}
 		net = current->nsproxy->net_ns;
+	}
  out:
 	ckpt_hdr_put(ctx, h);
 
-- 
1.6.2.5

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox