Netdev List
 help / color / mirror / Atom feed
* [PATCH v4 2/2] macvtap: restore vlan header on user read
From: Basil Gor @ 2012-05-04  8:55 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Eric W. Biederman, David S. Miller, netdev

Ethernet vlan header is not on the packet and kept in the skb->vlan_tci
when it comes from lower dev. This patch inserts vlan header in user
buffer during skb copy on user read.

Signed-off-by: Basil Gor <basil.gor@gmail.com>
---
 drivers/net/macvtap.c |   43 ++++++++++++++++++++++++++++++++++++++-----
 1 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 0427c65..cb8fd50 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -1,5 +1,6 @@
 #include <linux/etherdevice.h>
 #include <linux/if_macvlan.h>
+#include <linux/if_vlan.h>
 #include <linux/interrupt.h>
 #include <linux/nsproxy.h>
 #include <linux/compat.h>
@@ -759,6 +760,8 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
 	struct macvlan_dev *vlan;
 	int ret;
 	int vnet_hdr_len = 0;
+	int vlan_offset = 0;
+	int copied;
 
 	if (q->flags & IFF_VNET_HDR) {
 		struct virtio_net_hdr vnet_hdr;
@@ -773,18 +776,48 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
 		if (memcpy_toiovecend(iv, (void *)&vnet_hdr, 0, sizeof(vnet_hdr)))
 			return -EFAULT;
 	}
+	copied = vnet_hdr_len;
+
+	if (!vlan_tx_tag_present(skb))
+		len = min_t(int, skb->len, len);
+	else {
+		int copy;
+		struct {
+			__be16 h_vlan_proto;
+			__be16 h_vlan_TCI;
+		} veth;
+		veth.h_vlan_proto = htons(ETH_P_8021Q);
+		veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb));
+
+		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
+		len = min_t(int, skb->len + VLAN_HLEN, len);
+
+		copy = min_t(int, vlan_offset, len);
+		ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy);
+		len -= copy;
+		copied += copy;
+		if (ret || !len)
+			goto done;
+
+		copy = min_t(int, sizeof(veth), len);
+		ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy);
+		len -= copy;
+		copied += copy;
+		if (ret || !len)
+			goto done;
+	}
 
-	len = min_t(int, skb->len, len);
-
-	ret = skb_copy_datagram_const_iovec(skb, 0, iv, vnet_hdr_len, len);
+	ret = skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len);
+	copied += len;
 
+done:
 	rcu_read_lock_bh();
 	vlan = rcu_dereference_bh(q->vlan);
 	if (vlan)
-		macvlan_count_rx(vlan, len, ret == 0, 0);
+		macvlan_count_rx(vlan, copied - vnet_hdr_len, ret == 0, 0);
 	rcu_read_unlock_bh();
 
-	return ret ? ret : (len + vnet_hdr_len);
+	return ret ? ret : copied;
 }
 
 static ssize_t macvtap_do_read(struct macvtap_queue *q, struct kiocb *iocb,
-- 
1.7.6.5

^ permalink raw reply related

* [PATCH v4 1/2] vhost-net: fix handle_rx buffer size
From: Basil Gor @ 2012-05-04  8:55 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Eric W. Biederman, David S. Miller, netdev

Take vlan header length into account, when vlan id is stored as
vlan_tci. Otherwise tagged packets comming from macvtap will be
truncated.

Signed-off-by: Basil Gor <basil.gor@gmail.com>
---
 drivers/vhost/net.c |    7 ++++++-
 1 files changed, 6 insertions(+), 1 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 1f21d2a..5c17010 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -24,6 +24,7 @@
 #include <linux/if_arp.h>
 #include <linux/if_tun.h>
 #include <linux/if_macvlan.h>
+#include <linux/if_vlan.h>
 
 #include <net/sock.h>
 
@@ -283,8 +284,12 @@ static int peek_head_len(struct sock *sk)
 
 	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
 	head = skb_peek(&sk->sk_receive_queue);
-	if (likely(head))
+	if (likely(head)) {
 		len = head->len;
+		if (vlan_tx_tag_present(head))
+			len += VLAN_HLEN;
+	}
+
 	spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
 	return len;
 }
-- 
1.7.6.5

^ permalink raw reply related

* [patch net-next] net: IP_MULTICAST_IF setsockopt now recognizes struct mreq
From: Jiri Pirko @ 2012-05-04  8:37 UTC (permalink / raw)
  To: netdev; +Cc: davem, eric.dumazet, kuznet, jmorris, yoshfuji, kaber

Until now, struct mreq has not been recognized and it was worked with
as with struct in_addr. That means imr_multiaddr was copied to
imr_address. So do recognize struct mreq here and copy that correctly.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 net/ipv4/ip_sockglue.c |   13 +++++++++----
 1 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 51c6c67..0d11f23 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -673,10 +673,15 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 				break;
 		} else {
 			memset(&mreq, 0, sizeof(mreq));
-			if (optlen >= sizeof(struct in_addr) &&
-			    copy_from_user(&mreq.imr_address, optval,
-					   sizeof(struct in_addr)))
-				break;
+			if (optlen >= sizeof(struct ip_mreq)) {
+				if (copy_from_user(&mreq, optval,
+						   sizeof(struct ip_mreq)))
+					break;
+			} else if (optlen >= sizeof(struct in_addr)) {
+				if (copy_from_user(&mreq.imr_address, optval,
+						   sizeof(struct in_addr)))
+					break;
+			}
 		}
 
 		if (!mreq.imr_ifindex) {
-- 
1.7.9.1

^ permalink raw reply related

* Re: [PATCH v2] RPS: Sparse connection optimizations - v2
From: Eric Dumazet @ 2012-05-04  7:47 UTC (permalink / raw)
  To: Deng-Cheng Zhu; +Cc: Tom Herbert, davem, netdev
In-Reply-To: <4FA35A3D.8000205@mips.com>

On Fri, 2012-05-04 at 12:25 +0800, Deng-Cheng Zhu wrote:
> On 05/04/2012 11:22 AM, Tom Herbert wrote:
> >> +struct cpu_flow {
> >> +       struct net_device *dev;
> >> +       u32 rxhash;
> >> +       unsigned long ts;
> >> +};
> >
> > This seems like overkill, we already have the rps_flow_table and this
> > used in accelerated RFS so the device can also take advantage of
> > steering.
> 
> I think the mechanisms of rps_dev_flow_table and cpu_flow (in this
> patch) are different: The former works along with rps_sock_flow_table
> whose CPU info is based on recvmsg by the application. But for the tests
> like what I did, there's no application involved.
> 
> 
> Deng-Cheng

I really suggest you speak with MIPS arch maintainers about these IRQ
being all serviced by CPU0.

Adding tweaks in network stack to lower the impact of this huge problem
is a no go.

^ permalink raw reply

* [mac802154][question] MAC layer RX datapath design
From: Alexander Smirnov @ 2012-05-04  7:34 UTC (permalink / raw)
  To: David Miller; +Cc: Dmitry Eremin-Solenikov, open list:NETWORKING [GENERAL]

Hi David,

Two months ago I sent a patch-set for the basic mac802154 support. The problem
was in the design of RX datapath. The doubtful patch is listed in the end.

Your comments were:

> If packets are properly pushed into the stack via the core receive
> packet processing of the networking, you should always be in softirq
> context and therefore have no need to a workqueue to get out of hardirq
> context.
> Please redo this code so that it processes RX packets properly and
> consistently with how other protocol layers do things, which is taking
> packets from the netif_receive_skb() et al. paths via core protocol
> demux and therefore running always in softirq context.

Please see below my thoughts, and Dmitry Eremin-Solenikov, please
correct me if I'm wrong and you see this mail:

The IEEE 802.15.4 stack contains 3 main levels:
 - ieee802154 - 802.15.4 address family/netlink interface
 - mac802154 - 802.15.4 MAC layer
 - physical - device drivers

The IEEE 802.15.4 standard defines 4 MAC packet types:
 - beacon frame
 - MAC command frame
 - acknowledgement frame
 - data frame

and only the data frame should be pushed to the upper layer. Others must be
managed by MAC layer only. And only if the frame represents a data burst, MAC
layer calls netif_rx and sends packet to the network queue (which is handled
by the ieee802154 layer).

So that's the reason why workqueue was designed as a middle "medium" between
drivers and MAC layer - to communicate MAC-specific information. And the overall
conception is based on mac80211 code in mainline.


If this conception is still undesirable, could you, or anybody else, please
give me any hint how to improve it?


With best regards,
Alexander


8<--

diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c
new file mode 100644
index 0000000..3446379
--- /dev/null
+++ b/net/mac802154/rx.c
@@ -0,0 +1,109 @@
...
...
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/workqueue.h>
+#include <linux/netdevice.h>
+#include <linux/crc-ccitt.h>
+
+#include <net/mac802154.h>
+#include <net/ieee802154_netdev.h>
+
+#include "mac802154.h"
+
+static void
+mac802154_subif_rx(struct ieee802154_dev *hw, struct sk_buff *skb, u8 lqi)
+{
+       struct mac802154_priv *priv = mac802154_to_priv(hw);
+
+       mac_cb(skb)->lqi = lqi;
+       skb->protocol = htons(ETH_P_IEEE802154);
+       skb_reset_mac_header(skb);
+
+       BUILD_BUG_ON(sizeof(struct ieee802154_mac_cb) > sizeof(skb->cb));
+
+       if (!(priv->hw.flags & IEEE802154_HW_OMIT_CKSUM)) {
+               u16 crc;
+
+               if (skb->len < 2) {
+                       pr_debug("(%s): got invalid frame\n", __func__);
+                       goto out;
+               }
+               crc = crc_ccitt(0, skb->data, skb->len);
+               if (crc) {
+                       pr_debug("(%s): CRC mismatch\n", __func__);
+                       goto out;
+               }
+               skb_trim(skb, skb->len - 2); /* CRC */
+       }
+
+out:
+       dev_kfree_skb(skb);
+       return;
+}
+
+struct rx_work {
+       struct sk_buff *skb;
+       struct work_struct work;
+       struct ieee802154_dev *dev;
+       u8 lqi;
+};
+
+static void mac802154_rx_worker(struct work_struct *work)
+{
+       struct rx_work *rw = container_of(work, struct rx_work, work);
+       struct sk_buff *skb = rw->skb;
+
+       mac802154_subif_rx(rw->dev, skb, rw->lqi);
+       kfree(rw);
+}
+
+void
+ieee802154_rx_irqsafe(struct ieee802154_dev *dev, struct sk_buff *skb, u8 lqi)
+       struct mac802154_priv *priv = mac802154_to_priv(dev);
+       struct rx_work *work;
+
+       if (!skb)
+               return;
+
+       work = kzalloc(sizeof(struct rx_work), GFP_ATOMIC);
+       if (!work)
+               return;
+
+       INIT_WORK(&work->work, mac802154_rx_worker);
+       work->skb = skb;
+       work->dev = dev;
+       work->lqi = lqi;
+
+       queue_work(priv->dev_workqueue, &work->work);
+}
+EXPORT_SYMBOL(ieee802154_rx_irqsafe);
+
+void mac802154_rx(struct ieee802154_dev *dev, struct sk_buff *skb, u8 lqi)
+{
+       if (skb)
+               mac802154_subif_rx(dev, skb, lqi);
+}
+EXPORT_SYMBOL(mac802154_rx);

^ permalink raw reply related

* Re: [PATCH 7/9] net: add skb_orphan_frags to copy aside frags with destructors
From: David Miller @ 2012-05-04  6:54 UTC (permalink / raw)
  To: mst; +Cc: ian.campbell, netdev, eric.dumazet
In-Reply-To: <20120503210826.GA30504@redhat.com>

From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Fri, 4 May 2012 00:10:24 +0300

> Hmm we orphan skbs when we loop them back so how about reusing the
> skb->destructor for this?

That's one possibility.

But I fear we're about to toss Ian into yet another rabbit hole. :-)

Let's try to converge on something quickly as I think integration of
his work has been delayed enough as-is.

^ permalink raw reply

* Re: [PATCHv2 4/7] mISDN: Make layer1 timer 3 value configurable
From: Karsten Keil @ 2012-05-04  6:47 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, isdn
In-Reply-To: <20120503.170830.1574473660496726919.davem@davemloft.net>

Am 03.05.2012 23:08, schrieb David Miller:
> From: Karsten Keil <kkeil@linux-pingi.de>
> Date: Thu,  3 May 2012 17:47:29 +0200
> 
>> @@ -372,6 +372,7 @@ clear_channelmap(u_int nr, u_char *map)
>>  #define MISDN_CTRL_RX_OFF		0x0100
>>  #define MISDN_CTRL_FILL_EMPTY		0x0200
>>  #define MISDN_CTRL_GETPEER		0x0400
>> +#define MISDN_CTRL_L1_TIMER3		0x0800
>>  #define MISDN_CTRL_HW_FEATURES_OP	0x2000
>>  #define MISDN_CTRL_HW_FEATURES		0x2001
>>  #define MISDN_CTRL_HFC_OP		0x4000
> 
> This define is completely unused by this patch.
> 
> To say that I'm frustrated by this process would be an understatement.
> 
> 

It is used in the next patch of the series. I usually make different
patches for core infrastructure  changes and the low level driver changes.
If you think it would be better to do this in one patch - I can provide
a new series.

Karsten

^ permalink raw reply

* Re: [PATCH v4] tilegx network driver: initial support
From: David Miller @ 2012-05-04  6:42 UTC (permalink / raw)
  To: cmetcalf; +Cc: arnd, linux-kernel, netdev
In-Reply-To: <201205031936.q43Ja9Xb031644@lab-41.internal.tilera.com>

From: Chris Metcalf <cmetcalf@tilera.com>
Date: Thu, 3 May 2012 12:41:56 -0400

> +/* First, "tile_net_init_module()" initializes each network cpu to
> + * handle incoming packets, and initializes all the network devices.
> + *
> + * Then, "ifconfig DEVICE up" calls "tile_net_open()", which will
> + * turn on packet processing, if needed.
> + *
> + * If "ifconfig DEVICE down" is called, it uses "tile_net_stop()" to
> + * stop egress, and possibly turn off packet processing.
> + *
> + * We start out with the ingress IRQ enabled on each CPU.  When it
> + * fires, it is automatically disabled, and we call "napi_schedule()".
> + * This will cause "tile_net_poll()" to be called, which will pull
> + * packets from the netio queue, filtering them out, or passing them
> + * to "netif_receive_skb()".  If our budget is exhausted, we will
> + * return, knowing we will be called again later.  Otherwise, we
> + * reenable the ingress IRQ, and call "napi_complete()".

This is not the place where you document how the generic networking
brings devices up and down, and what driver methods are called during
those actions.

Imagine if every driver writer decided to do this.

> +#define TILE_NET_MAX_COMPS 64
> +
> +

Please get rid of all of these more-than-one empty line sequences.

> +#define ROUND_UP(n, align) (((n) + (align) - 1) & -(align))

This is ALIGN() from linux/kernel.h, please us it.

At this rate I anticipate at least 20 rounds of review, this driver
still needs quite a bit of work.

^ permalink raw reply

* Re: [net-next PATCH v4 0/8] Managing the forwarding database(FDB)
From: Sridhar Samudrala @ 2012-05-04  5:43 UTC (permalink / raw)
  To: John Fastabend
  Cc: Roopa Prabhu, Michael S. Tsirkin, shemminger, bhutchings, hadi,
	jeffrey.t.kirsher, netdev, gregory.v.rose, krkumar2
In-Reply-To: <4FA2DEA5.6050802@intel.com>

On 5/3/2012 12:38 PM, John Fastabend wrote:
> On 5/2/2012 4:36 PM, Sridhar Samudrala wrote:
>> On 5/2/2012 2:52 PM, John Fastabend wrote:
>>> On 5/2/2012 8:08 AM, Michael S. Tsirkin wrote:
>>>> On Sun, Apr 15, 2012 at 01:06:37PM -0400, David Miller wrote:
>>>>> From: John Fastabend<john.r.fastabend@intel.com>
>>>>> Date: Sun, 15 Apr 2012 09:43:51 -0700
>>>>>
>>>>>> The following series is a submission for net-next to allow
>>>>>> embedded switches and other stacked devices other then the
>>>>>> Linux bridge to manage a forwarding database.
>>>>>>
>>>>>> Previously discussed here,
>>>>>>
>>>>>> http://lists.openwall.net/netdev/2012/03/19/26
>>>>>>
>>>>>> v4: propagate return codes correctly for ndo_dflt_Fdb_dump()
>>>>>>
>>>>>> v3: resolve the macvlan patch 8/8 to fix a dev_set_promiscuity()
>>>>>>       error and add the flags field to change and get link routines.
>>>>>>
>>>>>> v2: addressed feedback from Ben Hutchings resolving a typo in the
>>>>>>       multicast add/del routines and improving the error handling
>>>>>>       when both NTF_SELF and NTF_MASTER are set.
>>>>>>
>>>>>> I've tested this with 'br' tool published by Stephen Hemminger
>>>>>> soon to be renamed 'bridge' I believe and various traffic
>>>>>> generators mostly pktgen, ping, and netperf.
>>>>> All applied, if we need any more tweaks we can just add them
>>>>> on top of this work.
>>>>>
>>>>> Thanks John.
>>>> John, do you plan to update kvm userspace to use this interface?
>>>>
>>> No immediate plans. I would really appreciate it if you or one
>>> of the IBM developers working in this space took it on. Of course
>>> if no one steps up I guess I can eventually get at it but it will
>>> be sometime. For now I've been doing this manually with the bridge
>>> tool yet to be published.
>>>
>>>
>> Does this mean that when we add an interface to a bridge, it need not be put in promiscuous mode and
>> add/delete fdb entries dynamically?
> The net/bridge will automatically put the interface in promisc mode
> when the device is attached. We do need to add/delete fdb entries
> though to allow forwarding packets from the virtual function and
> any emulated devices e.g. tap devices on the bridge.

Consider the following scenario where we have a SR-IOV NIC with 1 PF
and 2 VFs (VF1 & VF2).
- eth0 is the PF which is attached to bridge br0 and connected to 2 VMs 
VM1 and VM2.
- eth1 is the VF1 terminated on the host and assigned to VM3 via 
macvtap0 in passthru mode.
- VF2 is directly assigned to VM4 via pci-device assignment.

  VM1      VM2         VM3           VM4
(mac1)  (mac2)     (mac3)         (mac4)
  |        |           |             |
  |        |           |             |
vnet0   vnet1         |             |
  |        |           |             |
  \        /           |             |
   \      /            |             |
     br0            macvtap0         |
      |              (mac3)          |
      |                |             |
     eth0            eth1            |
      |              (mac3)          |
      |               |              |
    ------------------------------------
   | PF              VF1           VF2  |
   |                                    |
   |                 VEB                |
   ------------------------------------

In this setup, i think when VM1 and VM2 come up, mac1 and mac2 have to 
be added to the
embedded bridge's fdb.  Once we add these 2 entries, all the 4 VMs can 
talk to each other.
Is this correct?

Now, if VM1 or VM2 wants to add secondary mac addresses, i think we need 
qemu to add a new fdb
entry when it receives add mac address command via virtio control vq.

Can we add multiple mac addresses to VFs? For example VM3 and VM4 trying 
to add a secondary mac address.

What about VMs trying to create VLANs? I think this will work on VM1 and 
VM2. However with VM3
and VM4, i think we need qemu to add vlans to the VFs when the VMs 
create them.

Thanks
Sridhar

^ permalink raw reply

* Re: linux-next: manual merge of the net-next tree with the wireless tree
From: Guy, Wey-Yi @ 2012-05-04  4:35 UTC (permalink / raw)
  To: Stephen Rothwell
  Cc: David Miller, netdev, linux-next, linux-kernel, Eric Dumazet,
	John W. Linville, alexander.duyck, alexander.h.duyck,
	jeffrey.t.kirsher, linux-wireless
In-Reply-To: <20120504132138.d0142070434e502dfcb8a980@canb.auug.org.au>

On Fri, 2012-05-04 at 13:21 +1000, Stephen Rothwell wrote:
> Hi all,
> 
> Today's linux-next merge of the net-next tree got conflicts in
> drivers/net/wireless/iwlwifi/iwl-trans.h,
> drivers/net/wireless/iwlwifi/iwl-trans-pcie-rx.c, and
> drivers/net/wireless/iwlwifi/iwl-agn-rx.c between commit ed90542b0ce5
> ("iwlwifi: fix skb truesize underestimation") from the wireless tree and
> various commits from the net-next tree.
> 
> This was anticipated and I have applied the fix supplied by John (see
> below just to check).  Thanks to John for this!

Thanks everyone to help address this, it is my miss to cause all these trouble

Wey

^ permalink raw reply

* Re: [PATCH v2] RPS: Sparse connection optimizations - v2
From: Deng-Cheng Zhu @ 2012-05-04  4:25 UTC (permalink / raw)
  To: Tom Herbert; +Cc: davem, netdev, eric.dumazet
In-Reply-To: <CA+mtBx_J=fm3+k6_NzMH9KCdrYpZPwT2XBQe4r424eNDCKzazg@mail.gmail.com>

On 05/04/2012 11:22 AM, Tom Herbert wrote:
>> +struct cpu_flow {
>> +       struct net_device *dev;
>> +       u32 rxhash;
>> +       unsigned long ts;
>> +};
>
> This seems like overkill, we already have the rps_flow_table and this
> used in accelerated RFS so the device can also take advantage of
> steering.

I think the mechanisms of rps_dev_flow_table and cpu_flow (in this
patch) are different: The former works along with rps_sock_flow_table
whose CPU info is based on recvmsg by the application. But for the tests
like what I did, there's no application involved.


Deng-Cheng

^ permalink raw reply

* Re: [PATCH v2] RPS: Sparse connection optimizations - v2
From: Deng-Cheng Zhu @ 2012-05-04  3:39 UTC (permalink / raw)
  To: Tom Herbert; +Cc: davem, netdev, eric.dumazet
In-Reply-To: <CA+mtBx_J=fm3+k6_NzMH9KCdrYpZPwT2XBQe4r424eNDCKzazg@mail.gmail.com>

On 05/04/2012 11:22 AM, Tom Herbert wrote:
>> +struct cpu_flow {
>> +       struct net_device *dev;
>> +       u32 rxhash;
>> +       unsigned long ts;
>> +};
>
> This seems like overkill, we already have the rps_flow_table and this
> used in accelerated RFS so the device can also take advantage of
> steering.  Maybe somehow program that table for your sparse flows?

In fact I did ever try something different in rps_flow_cnt (except for
rps_cpus, the only tunable thing relating to RPS in sysfs, am I missing
something?) and found no effect in my tests (iperf between 2 PCs via
Malta which works as router and uses iptables/NAT+RPS)...


Deng-Cheng

^ permalink raw reply

* Re: [PATCH v2] RPS: Sparse connection optimizations - v2
From: Tom Herbert @ 2012-05-04  3:22 UTC (permalink / raw)
  To: Deng-Cheng Zhu; +Cc: davem, netdev, eric.dumazet
In-Reply-To: <1336035412-2161-1-git-send-email-dczhu@mips.com>

> +struct cpu_flow {
> +       struct net_device *dev;
> +       u32 rxhash;
> +       unsigned long ts;
> +};

This seems like overkill, we already have the rps_flow_table and this
used in accelerated RFS so the device can also take advantage of
steering.  Maybe somehow program that table for your sparse flows?

Tom

> +#endif
> +
>  /*
>  * This structure holds an RPS map which can be of variable length.  The
>  * map is an array of CPUs.
> diff --git a/net/Kconfig b/net/Kconfig
> index e07272d..d5aa682 100644
> --- a/net/Kconfig
> +++ b/net/Kconfig
> @@ -222,6 +222,28 @@ config RPS
>        depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
>        default y
>
> +config RPS_SPARSE_FLOW_OPTIMIZATION
> +       bool "RPS optimizations for sparse flows"
> +       depends on RPS
> +       default n
> +       ---help---
> +         This feature will try to map some network flows to consecutive
> +         CPUs in the RPS map. It will bring in some per packet overhead
> +         but should be able to do good to network throughput in the case
> +         of low number of connections while not much affecting other
> +         cases. (e.g. relatively consistent and high bandwidth in single
> +         connection tests).
> +
> +config NR_RPS_MAP_LOOPS
> +       int "Number of loops walking RPS map before hash indexing (1-5)"
> +       range 1 5
> +       depends on RPS_SPARSE_FLOW_OPTIMIZATION
> +       default "4"
> +       ---help---
> +         It defines how many loops to go through the RPS map while
> +         determing target CPU to process the incoming packet. After that,
> +         the decision will fall back on hash indexing the RPS map.
> +
>  config RFS_ACCEL
>        boolean
>        depends on RPS && GENERIC_HARDIRQS
> diff --git a/net/core/dev.c b/net/core/dev.c
> index c25d453..92e292b 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2698,6 +2698,61 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
>        return rflow;
>  }
>
> +#ifdef CONFIG_RPS_SPARSE_FLOW_OPTIMIZATION
> +static DEFINE_PER_CPU(struct cpu_flow [CONFIG_NR_RPS_MAP_LOOPS], cpu_flows);
> +static unsigned long hash_active;
> +
> +#define FLOW_INACTIVE(now, base) (time_after((now), (base) + HZ) || \
> +                        unlikely(time_before((now), (base))))
> +
> +static u16 find_cpu(const struct rps_map *map, const struct sk_buff *skb)
> +{
> +       struct cpu_flow *flow;
> +       u16 cpu;
> +       int i, l, do_alloc = 0;
> +       unsigned long now = jiffies;
> +
> +retry:
> +       for (l = 0; l < CONFIG_NR_RPS_MAP_LOOPS; l++) {
> +               for (i = map->len - 1; i >= 0; i--) {
> +                       cpu = map->cpus[i];
> +                       flow = &per_cpu(cpu_flows, cpu)[l];
> +
> +                       if (do_alloc) {
> +                               if (flow->dev == NULL ||
> +                                   FLOW_INACTIVE(now, flow->ts)) {
> +                                       flow->dev = skb->dev;
> +                                       flow->rxhash = skb->rxhash;
> +                                       flow->ts = now;
> +                                       return cpu;
> +                               }
> +                       } else {
> +                               /*
> +                                * Unlike hash indexing, this avoids packet
> +                                * processing imbalance across CPUs.
> +                                */
> +                               if (flow->rxhash == skb->rxhash &&
> +                                   flow->dev == skb->dev &&
> +                                   !FLOW_INACTIVE(now, flow->ts)) {
> +                                       flow->ts = now;
> +                                       return cpu;
> +                               }
> +                       }
> +               }
> +       }
> +
> +       if (FLOW_INACTIVE(now, hash_active) && do_alloc == 0) {
> +               do_alloc = 1;
> +               goto retry;
> +       }
> +
> +       /* For all other flows */
> +       hash_active = now;
> +
> +       return map->cpus[((u64) skb->rxhash * map->len) >> 32];
> +}
> +#endif
> +
>  /*
>  * get_rps_cpu is called from netif_receive_skb and returns the target
>  * CPU from the RPS map of the receiving queue for a given skb.
> @@ -2780,7 +2835,11 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
>        }
>
>        if (map) {
> +#ifdef CONFIG_RPS_SPARSE_FLOW_OPTIMIZATION
> +               tcpu = find_cpu(map, skb);
> +#else
>                tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
> +#endif
>
>                if (cpu_online(tcpu)) {
>                        cpu = tcpu;
> --
> 1.7.1
>

^ permalink raw reply

* linux-next: manual merge of the net-next tree with the wireless tree
From: Stephen Rothwell @ 2012-05-04  3:21 UTC (permalink / raw)
  To: David Miller, netdev
  Cc: linux-next, linux-kernel, Eric Dumazet, John W. Linville,
	Guy, Wey-Yi, alexander.duyck, alexander.h.duyck,
	jeffrey.t.kirsher, linux-wireless

[-- Attachment #1: Type: text/plain, Size: 7266 bytes --]

Hi all,

Today's linux-next merge of the net-next tree got conflicts in
drivers/net/wireless/iwlwifi/iwl-trans.h,
drivers/net/wireless/iwlwifi/iwl-trans-pcie-rx.c, and
drivers/net/wireless/iwlwifi/iwl-agn-rx.c between commit ed90542b0ce5
("iwlwifi: fix skb truesize underestimation") from the wireless tree and
various commits from the net-next tree.

This was anticipated and I have applied the fix supplied by John (see
below just to check).  Thanks to John for this!
-- 
Cheers,
Stephen Rothwell                    sfr@canb.auug.org.au

diff --cc drivers/net/wireless/iwlwifi/iwl-agn-rx.c
index 2247460,f941223..0000000
--- a/drivers/net/wireless/iwlwifi/iwl-agn-rx.c
+++ b/drivers/net/wireless/iwlwifi/iwl-agn-rx.c
@@@ -787,25 -752,15 +751,24 @@@ static void iwlagn_pass_packet_to_mac80
  	    iwlagn_set_decrypted_flag(priv, hdr, ampdu_status, stats))
  		return;
  
 -	skb = dev_alloc_skb(128);
 +	/* Dont use dev_alloc_skb(), we'll have enough headroom once
 +	 * ieee80211_hdr pulled.
 +	 */
 +	skb = alloc_skb(128, GFP_ATOMIC);
  	if (!skb) {
 -		IWL_ERR(priv, "dev_alloc_skb failed\n");
 +		IWL_ERR(priv, "alloc_skb failed\n");
  		return;
  	}
 +	hdrlen = min_t(unsigned int, len, skb_tailroom(skb));
 +	memcpy(skb_put(skb, hdrlen), hdr, hdrlen);
 +	fraglen = len - hdrlen;
 +
 +	if (fraglen) {
- 		int offset = (void *)hdr + hdrlen - rxb_addr(rxb);
++		int offset = (void *)hdr - rxb_addr(rxb) + rxb_offset(rxb);
  
 -	offset = (void *)hdr - rxb_addr(rxb) + rxb_offset(rxb);
 -	p = rxb_steal_page(rxb);
 -	skb_add_rx_frag(skb, 0, p, offset, len, len);
 +		skb_add_rx_frag(skb, 0, rxb_steal_page(rxb), offset,
 +				fraglen, rxb->truesize);
 +	}
- 	iwl_update_stats(priv, false, fc, len);
  
  	/*
  	* Wake any queues that were stopped due to a passive channel tx
diff --cc drivers/net/wireless/iwlwifi/iwl-trans-pcie-rx.c
index aa7aea1,d2239aa..0000000
--- a/drivers/net/wireless/iwlwifi/iwl-trans-pcie-rx.c
+++ b/drivers/net/wireless/iwlwifi/iwl-trans-pcie-rx.c
@@@ -374,72 -373,89 +373,90 @@@ static void iwl_rx_handle_rxbuf(struct 
  	if (WARN_ON(!rxb))
  		return;
  
- 	rxcb.truesize = PAGE_SIZE << hw_params(trans).rx_page_order;
- 	dma_unmap_page(trans->dev, rxb->page_dma,
- 		       rxcb.truesize,
- 		       DMA_FROM_DEVICE);
- 
- 	rxcb._page = rxb->page;
- 	pkt = rxb_addr(&rxcb);
- 
- 	IWL_DEBUG_RX(trans, "%s, 0x%02x\n",
- 		     get_cmd_string(pkt->hdr.cmd), pkt->hdr.cmd);
+ 	dma_unmap_page(trans->dev, rxb->page_dma, max_len, DMA_FROM_DEVICE);
  
+ 	while (offset + sizeof(u32) + sizeof(struct iwl_cmd_header) < max_len) {
+ 		struct iwl_rx_packet *pkt;
+ 		struct iwl_device_cmd *cmd;
+ 		u16 sequence;
+ 		bool reclaim;
+ 		int index, cmd_index, err, len;
+ 		struct iwl_rx_cmd_buffer rxcb = {
+ 			._offset = offset,
+ 			._page = rxb->page,
+ 			._page_stolen = false,
++			.truesize = max_len,
+ 		};
  
- 	len = le32_to_cpu(pkt->len_n_flags) & FH_RSCSR_FRAME_SIZE_MSK;
- 	len += sizeof(u32); /* account for status word */
- 	trace_iwlwifi_dev_rx(trans->dev, pkt, len);
+ 		pkt = rxb_addr(&rxcb);
  
- 	/* Reclaim a command buffer only if this packet is a response
- 	 *   to a (driver-originated) command.
- 	 * If the packet (e.g. Rx frame) originated from uCode,
- 	 *   there is no command buffer to reclaim.
- 	 * Ucode should set SEQ_RX_FRAME bit if ucode-originated,
- 	 *   but apparently a few don't get set; catch them here. */
- 	reclaim = !(pkt->hdr.sequence & SEQ_RX_FRAME);
- 	if (reclaim) {
- 		int i;
+ 		if (pkt->len_n_flags == cpu_to_le32(FH_RSCSR_FRAME_INVALID))
+ 			break;
  
- 		for (i = 0; i < trans_pcie->n_no_reclaim_cmds; i++) {
- 			if (trans_pcie->no_reclaim_cmds[i] == pkt->hdr.cmd) {
- 				reclaim = false;
- 				break;
+ 		IWL_DEBUG_RX(trans, "cmd at offset %d: %s (0x%.2x)\n",
+ 			rxcb._offset,
+ 			trans_pcie_get_cmd_string(trans_pcie, pkt->hdr.cmd),
+ 			pkt->hdr.cmd);
+ 
+ 		len = le32_to_cpu(pkt->len_n_flags) & FH_RSCSR_FRAME_SIZE_MSK;
+ 		len += sizeof(u32); /* account for status word */
+ 		trace_iwlwifi_dev_rx(trans->dev, pkt, len);
+ 
+ 		/* Reclaim a command buffer only if this packet is a response
+ 		 *   to a (driver-originated) command.
+ 		 * If the packet (e.g. Rx frame) originated from uCode,
+ 		 *   there is no command buffer to reclaim.
+ 		 * Ucode should set SEQ_RX_FRAME bit if ucode-originated,
+ 		 *   but apparently a few don't get set; catch them here. */
+ 		reclaim = !(pkt->hdr.sequence & SEQ_RX_FRAME);
+ 		if (reclaim) {
+ 			int i;
+ 
+ 			for (i = 0; i < trans_pcie->n_no_reclaim_cmds; i++) {
+ 				if (trans_pcie->no_reclaim_cmds[i] ==
+ 							pkt->hdr.cmd) {
+ 					reclaim = false;
+ 					break;
+ 				}
  			}
  		}
- 	}
  
- 	sequence = le16_to_cpu(pkt->hdr.sequence);
- 	index = SEQ_TO_INDEX(sequence);
- 	cmd_index = get_cmd_index(&txq->q, index);
+ 		sequence = le16_to_cpu(pkt->hdr.sequence);
+ 		index = SEQ_TO_INDEX(sequence);
+ 		cmd_index = get_cmd_index(&txq->q, index);
  
- 	if (reclaim)
- 		cmd = txq->cmd[cmd_index];
- 	else
- 		cmd = NULL;
+ 		if (reclaim)
+ 			cmd = txq->entries[cmd_index].cmd;
+ 		else
+ 			cmd = NULL;
  
- 	err = iwl_op_mode_rx(trans->op_mode, &rxcb, cmd);
+ 		err = iwl_op_mode_rx(trans->op_mode, &rxcb, cmd);
  
- 	/*
- 	 * XXX: After here, we should always check rxcb._page
- 	 * against NULL before touching it or its virtual
- 	 * memory (pkt). Because some rx_handler might have
- 	 * already taken or freed the pages.
- 	 */
+ 		/*
+ 		 * After here, we should always check rxcb._page_stolen,
+ 		 * if it is true then one of the handlers took the page.
+ 		 */
  
- 	if (reclaim) {
- 		/* Invoke any callbacks, transfer the buffer to caller,
- 		 * and fire off the (possibly) blocking
- 		 * iwl_trans_send_cmd()
- 		 * as we reclaim the driver command queue */
- 		if (rxcb._page)
- 			iwl_tx_cmd_complete(trans, &rxcb, err);
- 		else
- 			IWL_WARN(trans, "Claim null rxb?\n");
+ 		if (reclaim) {
+ 			/* Invoke any callbacks, transfer the buffer to caller,
+ 			 * and fire off the (possibly) blocking
+ 			 * iwl_trans_send_cmd()
+ 			 * as we reclaim the driver command queue */
+ 			if (!rxcb._page_stolen)
+ 				iwl_tx_cmd_complete(trans, &rxcb, err);
+ 			else
+ 				IWL_WARN(trans, "Claim null rxb?\n");
+ 		}
+ 
+ 		page_stolen |= rxcb._page_stolen;
+ 		offset += ALIGN(len, FH_RSCSR_FRAME_ALIGN);
  	}
  
- 	/* page was stolen from us */
- 	if (rxcb._page == NULL)
+ 	/* page was stolen from us -- free our reference */
+ 	if (page_stolen) {
+ 		__free_pages(rxb->page, trans_pcie->rx_page_order);
  		rxb->page = NULL;
+ 	}
  
  	/* Reuse the page if possible. For notification packets and
  	 * SKBs that fail to Rx correctly, add them back into the
diff --cc drivers/net/wireless/iwlwifi/iwl-trans.h
index fdf9788,7018d31..0000000
--- a/drivers/net/wireless/iwlwifi/iwl-trans.h
+++ b/drivers/net/wireless/iwlwifi/iwl-trans.h
@@@ -260,7 -256,8 +256,9 @@@ static inline void iwl_free_resp(struc
  
  struct iwl_rx_cmd_buffer {
  	struct page *_page;
+ 	int _offset;
+ 	bool _page_stolen;
 +	unsigned int truesize;
  };
  
  static inline void *rxb_addr(struct iwl_rx_cmd_buffer *r)

[-- Attachment #2: Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* Re: [PATCH v3 2/2] macvtap: restore vlan header on user read
From: David Miller @ 2012-05-04  1:06 UTC (permalink / raw)
  To: basil.gor; +Cc: mst, ebiederm, netdev
In-Reply-To: <20120503234732.GA26349@nanobar>

From: Basil Gor <basil.gor@gmail.com>
Date: Fri, 4 May 2012 03:47:33 +0400

> On Fri, May 04, 2012 at 02:30:29AM +0300, Michael S. Tsirkin wrote:
>> On Fri, May 04, 2012 at 03:11:52AM +0400, Basil Gor wrote:
>> > Below is a bit reworked version I've tested.
>> 
>> Will review thanks.
>> 
>> Can you add the signature according to the rules pls?
> 
> Copy vlan header to user when vlan id is stored in skb->vlan_tci
> 
> Signed-off-by: Basil Gor <basil.gor@gmail.com>

This doesn't work, you have to make the patch submission a
completely fresh one.  With a full, complete, commit message.

Not as a reply to another email posting.

^ permalink raw reply

* Re: [PATCH v3 2/2] macvtap: restore vlan header on user read
From: Basil Gor @ 2012-05-03 23:47 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Eric W. Biederman, David S. Miller, netdev
In-Reply-To: <20120503233029.GA31083@redhat.com>

On Fri, May 04, 2012 at 02:30:29AM +0300, Michael S. Tsirkin wrote:
> On Fri, May 04, 2012 at 03:11:52AM +0400, Basil Gor wrote:
> > Below is a bit reworked version I've tested.
> 
> Will review thanks.
> 
> Can you add the signature according to the rules pls?

Copy vlan header to user when vlan id is stored in skb->vlan_tci

Signed-off-by: Basil Gor <basil.gor@gmail.com>
---
 drivers/net/macvtap.c |   43 ++++++++++++++++++++++++++++++++++++++-----
 1 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 0427c65..cb8fd50 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -1,5 +1,6 @@
 #include <linux/etherdevice.h>
 #include <linux/if_macvlan.h>
+#include <linux/if_vlan.h>
 #include <linux/interrupt.h>
 #include <linux/nsproxy.h>
 #include <linux/compat.h>
@@ -759,6 +760,8 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
 	struct macvlan_dev *vlan;
 	int ret;
 	int vnet_hdr_len = 0;
+	int vlan_offset = 0;
+	int copied;
 
 	if (q->flags & IFF_VNET_HDR) {
 		struct virtio_net_hdr vnet_hdr;
@@ -773,18 +776,48 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
 		if (memcpy_toiovecend(iv, (void *)&vnet_hdr, 0, sizeof(vnet_hdr)))
 			return -EFAULT;
 	}
+	copied = vnet_hdr_len;
+
+	if (!vlan_tx_tag_present(skb))
+		len = min_t(int, skb->len, len);
+	else {
+		int copy;
+		struct {
+			__be16 h_vlan_proto;
+			__be16 h_vlan_TCI;
+		} veth;
+		veth.h_vlan_proto = htons(ETH_P_8021Q);
+		veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb));
+
+		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
+		len = min_t(int, skb->len + VLAN_HLEN, len);
+
+		copy = min_t(int, vlan_offset, len);
+		ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy);
+		len -= copy;
+		copied += copy;
+		if (ret || !len)
+			goto done;
+
+		copy = min_t(int, sizeof(veth), len);
+		ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy);
+		len -= copy;
+		copied += copy;
+		if (ret || !len)
+			goto done;
+	}
 
-	len = min_t(int, skb->len, len);
-
-	ret = skb_copy_datagram_const_iovec(skb, 0, iv, vnet_hdr_len, len);
+	ret = skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len);
+	copied += len;
 
+done:
 	rcu_read_lock_bh();
 	vlan = rcu_dereference_bh(q->vlan);
 	if (vlan)
-		macvlan_count_rx(vlan, len, ret == 0, 0);
+		macvlan_count_rx(vlan, copied - vnet_hdr_len, ret == 0, 0);
 	rcu_read_unlock_bh();
 
-	return ret ? ret : (len + vnet_hdr_len);
+	return ret ? ret : copied;
 }
 
 static ssize_t macvtap_do_read(struct macvtap_queue *q, struct kiocb *iocb,
-- 
1.7.6.5

^ permalink raw reply related

* Re: [PATCH v3 2/2] macvtap: restore vlan header on user read
From: Michael S. Tsirkin @ 2012-05-03 23:30 UTC (permalink / raw)
  To: Basil Gor; +Cc: Eric W. Biederman, David S. Miller, netdev
In-Reply-To: <20120503231152.GA8602@nanobar>

On Fri, May 04, 2012 at 03:11:52AM +0400, Basil Gor wrote:
> Below is a bit reworked version I've tested.

Will review thanks.

Can you add the signature according to the rules pls?

> ---
>  drivers/net/macvtap.c |   43 ++++++++++++++++++++++++++++++++++++++-----
>  1 files changed, 38 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
> index 0427c65..cb8fd50 100644
> --- a/drivers/net/macvtap.c
> +++ b/drivers/net/macvtap.c
> @@ -1,5 +1,6 @@
>  #include <linux/etherdevice.h>
>  #include <linux/if_macvlan.h>
> +#include <linux/if_vlan.h>
>  #include <linux/interrupt.h>
>  #include <linux/nsproxy.h>
>  #include <linux/compat.h>
> @@ -759,6 +760,8 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
>  	struct macvlan_dev *vlan;
>  	int ret;
>  	int vnet_hdr_len = 0;
> +	int vlan_offset = 0;
> +	int copied;
>  
>  	if (q->flags & IFF_VNET_HDR) {
>  		struct virtio_net_hdr vnet_hdr;
> @@ -773,18 +776,48 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
>  		if (memcpy_toiovecend(iv, (void *)&vnet_hdr, 0, sizeof(vnet_hdr)))
>  			return -EFAULT;
>  	}
> +	copied = vnet_hdr_len;
> +
> +	if (!vlan_tx_tag_present(skb))
> +		len = min_t(int, skb->len, len);
> +	else {
> +		int copy;
> +		struct {
> +			__be16 h_vlan_proto;
> +			__be16 h_vlan_TCI;
> +		} veth;
> +		veth.h_vlan_proto = htons(ETH_P_8021Q);
> +		veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb));
> +
> +		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
> +		len = min_t(int, skb->len + VLAN_HLEN, len);
> +
> +		copy = min_t(int, vlan_offset, len);
> +		ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy);
> +		len -= copy;
> +		copied += copy;
> +		if (ret || !len)
> +			goto done;
> +
> +		copy = min_t(int, sizeof(veth), len);
> +		ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy);
> +		len -= copy;
> +		copied += copy;
> +		if (ret || !len)
> +			goto done;
> +	}
>  
> -	len = min_t(int, skb->len, len);
> -
> -	ret = skb_copy_datagram_const_iovec(skb, 0, iv, vnet_hdr_len, len);
> +	ret = skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len);
> +	copied += len;
>  
> +done:
>  	rcu_read_lock_bh();
>  	vlan = rcu_dereference_bh(q->vlan);
>  	if (vlan)
> -		macvlan_count_rx(vlan, len, ret == 0, 0);
> +		macvlan_count_rx(vlan, copied - vnet_hdr_len, ret == 0, 0);
>  	rcu_read_unlock_bh();
>  
> -	return ret ? ret : (len + vnet_hdr_len);
> +	return ret ? ret : copied;
>  }
>  
>  static ssize_t macvtap_do_read(struct macvtap_queue *q, struct kiocb *iocb,
> -- 
> 1.7.6.5

^ permalink raw reply

* Re: arcnet: protocol is buggy messages in 3.0 kernel, not in 2.33 kernel
From: Francois Romieu @ 2012-05-03 23:23 UTC (permalink / raw)
  To: Rob Janssen; +Cc: netdev
In-Reply-To: <CAJDhSdc5J1xzvxYhvru6kjqAbq1NAtvWQCu-tXbi4aE_NXGXrg@mail.gmail.com>

Rob Janssen <rob.janssen76@gmail.com> :
[...]
> Does anyone have what causes this message in the 3.0 kernel and/or how
> to fix this ?

Wild guess: hard_header_len is sizeof(struct archdr). archdr embeds a huge
ethhdr but the header build helper only accounts for ARC_HDR_SIZE +
RFC1201_HDR_SIZE.

-- 
Ueimor

^ permalink raw reply

* Re: [PATCH v3 2/2] macvtap: restore vlan header on user read
From: Basil Gor @ 2012-05-03 23:11 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Eric W. Biederman, David S. Miller, netdev
In-Reply-To: <20120503152225.GA25579@nanobar>

On Thu, May 03, 2012 at 07:22:25PM +0400, Basil Gor wrote:
> On Thu, May 03, 2012 at 05:31:10PM +0300, Michael S. Tsirkin wrote:
> > On Thu, May 03, 2012 at 06:37:46AM -0700, Eric W. Biederman wrote:
> > > "Michael S. Tsirkin" <mst@redhat.com> writes:
> > > 
> > > > On Wed, Apr 25, 2012 at 10:31:25PM -0700, Eric W. Biederman wrote:
> > > >> Basil Gor <basil.gor@gmail.com> writes:
> > > >> 
> > > >> > Vlan tag is restored during buffer transmit to a network device (bridge
> > > >> > port) in bridging code in case of tun/tap driver. In case of macvtap it
> > > >> > has to be done explicitly. Otherwise vlan_tci is ignored and user always
> > > >> > gets untagged packets.
> > > >> 
> > > >> We could quibble about efficiencies but this looks good except for
> > > >> macvtap_recvmsg which isn't setting the auxdata for the vlan header.
> > > >> 
> > > >> Eric
> > > >
> > > > Right. I'm guessing we need to support old userspace
> > > > so if there's auxdata, put vlan there but if not,
> > > > put the vlan in the packet like this patch does.
> > > 
> > > This patch isn't horrible.
> > > 
> > > Still why copy the skb when you can just split the copy to userspace
> > > into a couple of pieces?
> > > 
> > > We don't need to change the skb and changing the skb looks like
> > > it is likely to confuse things and cause bugs because we are
> > > not working with a consistent model of how vlan information
> > > is encoded.
> > > 
> > > Still something needs to happen and this works in more cases even if it
> > > isn't perfect.
> > > 
> > > Eric
> > 
> > Absolutely. And it's easier than I thought.
> > So we can do something like the below (warning: compiled only).
> > Basil - want to take a look?
> 
> Sure, I'll give it a try.
> Thanks
> 
> Basil Gor
> 
> > My only concern if we put this logic in an out of way
> > driver like macvtap will people remember to update it?
> > Maybe better to update skb_copy_datagram_const_iovec which is in core?
> > 
> > 
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > 
> > diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
> > index 0427c65..5a1724c 100644
> > --- a/drivers/net/macvtap.c
> > +++ b/drivers/net/macvtap.c
> > @@ -1,5 +1,6 @@
> >  #include <linux/etherdevice.h>
> >  #include <linux/if_macvlan.h>
> > +#include <linux/if_vlan.h>
> >  #include <linux/interrupt.h>
> >  #include <linux/nsproxy.h>
> >  #include <linux/compat.h>
> > @@ -759,6 +760,7 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
> >  	struct macvlan_dev *vlan;
> >  	int ret;
> >  	int vnet_hdr_len = 0;
> > +	int vlan_offset = 0;
> >  
> >  	if (q->flags & IFF_VNET_HDR) {
> >  		struct virtio_net_hdr vnet_hdr;
> > @@ -776,8 +778,29 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
> >  
> >  	len = min_t(int, skb->len, len);
        skb->len + VLAN_HLEN if vlan tag present
> >  
> > -	ret = skb_copy_datagram_const_iovec(skb, 0, iv, vnet_hdr_len, len);
> > +	if (vlan_tx_tag_present(skb)) {
> > +		struct {
> > +			__be16 h_vlan_proto;
> > +			__be16 h_vlan_TCI;
> > +		} veth;
> > + 		veth.h_vlan_proto = htons(ETH_P_8021Q);
> > + 		veth.h_vlan_TCI = vlan_tx_tag_get(skb);
                htons(vlan_tx_tag_get(skb))
> > +
> > +		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
> > +		ret = skb_copy_datagram_const_iovec(skb, 0, iv, vnet_hdr_len,
> > +						    vlan_offset);
                do we need to count how much we copy carefully?
> > +		if (ret)
> > +			goto done;
> > +		ret = memcpy_toiovecend(iv, (unsigned char *)&veth, vlan_offset,
> > +					sizeof veth);
                offset has to be vnet_hdr_len + vlan_offset
> > +		if (ret)
> > +			goto done;
> > +		vlan_offset += sizeof veth;
                offset to use in next copy: vnet_hdr_len + vlan_offset + sizeof(veth)
> > +	}
> > +	ret = skb_copy_datagram_const_iovec(skb, vlan_offset, iv, vnet_hdr_len,
> > +					    len);
> >  
> > +done:
> >  	rcu_read_lock_bh();
> >  	vlan = rcu_dereference_bh(q->vlan);
> >  	if (vlan)

Below is a bit reworked version I've tested.

---
 drivers/net/macvtap.c |   43 ++++++++++++++++++++++++++++++++++++++-----
 1 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 0427c65..cb8fd50 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -1,5 +1,6 @@
 #include <linux/etherdevice.h>
 #include <linux/if_macvlan.h>
+#include <linux/if_vlan.h>
 #include <linux/interrupt.h>
 #include <linux/nsproxy.h>
 #include <linux/compat.h>
@@ -759,6 +760,8 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
 	struct macvlan_dev *vlan;
 	int ret;
 	int vnet_hdr_len = 0;
+	int vlan_offset = 0;
+	int copied;
 
 	if (q->flags & IFF_VNET_HDR) {
 		struct virtio_net_hdr vnet_hdr;
@@ -773,18 +776,48 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
 		if (memcpy_toiovecend(iv, (void *)&vnet_hdr, 0, sizeof(vnet_hdr)))
 			return -EFAULT;
 	}
+	copied = vnet_hdr_len;
+
+	if (!vlan_tx_tag_present(skb))
+		len = min_t(int, skb->len, len);
+	else {
+		int copy;
+		struct {
+			__be16 h_vlan_proto;
+			__be16 h_vlan_TCI;
+		} veth;
+		veth.h_vlan_proto = htons(ETH_P_8021Q);
+		veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb));
+
+		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
+		len = min_t(int, skb->len + VLAN_HLEN, len);
+
+		copy = min_t(int, vlan_offset, len);
+		ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy);
+		len -= copy;
+		copied += copy;
+		if (ret || !len)
+			goto done;
+
+		copy = min_t(int, sizeof(veth), len);
+		ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy);
+		len -= copy;
+		copied += copy;
+		if (ret || !len)
+			goto done;
+	}
 
-	len = min_t(int, skb->len, len);
-
-	ret = skb_copy_datagram_const_iovec(skb, 0, iv, vnet_hdr_len, len);
+	ret = skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len);
+	copied += len;
 
+done:
 	rcu_read_lock_bh();
 	vlan = rcu_dereference_bh(q->vlan);
 	if (vlan)
-		macvlan_count_rx(vlan, len, ret == 0, 0);
+		macvlan_count_rx(vlan, copied - vnet_hdr_len, ret == 0, 0);
 	rcu_read_unlock_bh();
 
-	return ret ? ret : (len + vnet_hdr_len);
+	return ret ? ret : copied;
 }
 
 static ssize_t macvtap_do_read(struct macvtap_queue *q, struct kiocb *iocb,
-- 
1.7.6.5

^ permalink raw reply related

* Re: [PATCH 2/4] ipgre: follow state of lower device
From: Stephen Hemminger @ 2012-05-03 22:40 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20120414.145302.154475725141872738.davem@davemloft.net>

On Sat, 14 Apr 2012 14:53:02 -0400 (EDT)
David Miller <davem@davemloft.net> wrote:

> From: Stephen Hemminger <shemminger@vyatta.com>
> Date: Thu, 12 Apr 2012 09:31:17 -0700
> 
> > GRE tunnels like other layered devices should propogate
> > carrier and RFC2863 state from lower device to tunnel.
> > 
> > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> 
> Like others I don't like the ugly hash traversal.
> 
> A small hash on ifindex, iflink, or whatever ought to be easy and make
> the code look much nicer.
> 
> Longer term project is that a lot of this tunneling code can be
> commonized at some point.

The whole set of tunnels needs to be cleaned up to be something modular, clean
and cached like the code in OpenVswitch.

^ permalink raw reply

* Re: [PATCH] net: davinci_emac: Add pre_open, post_stop platform callbacks
From: Kevin Hilman @ 2012-05-03 21:32 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: Bedia, Vaibhav, Mark A. Greer, netdev@vger.kernel.org,
	linux-omap@vger.kernel.org, linux-arm-kernel@lists.infradead.org
In-Reply-To: <1336076542.2998.23.camel@bwh-desktop.uk.solarflarecom.com>

Ben Hutchings <bhutchings@solarflare.com> writes:

> On Thu, 2012-05-03 at 19:25 +0000, Bedia, Vaibhav wrote:
>> On Fri, May 04, 2012 at 00:16:32, Mark A. Greer wrote:
>> [...]
>> > > 
>> > > So, if I understood this correctly, it's effectively like blocking a low power
>> > > state transition (here wfi execution) when EMAC is active?
>> > 
>> > Assuming "it" is my patch, correct.
>> > 
>> 
>> Recently I was thinking about how to get certain drivers to disallow some or all
>> low power states and to me this also seems to fall in a similar category.
>> 
>> One of the suggestions that I got was to check if the 'wakeup' entry associated with
>> the device under sysfs could be leveraged for this. The PM code could maintain
>> a whitelist (or blacklist) of devices and it decides the low power state to enter
>> based on the 'wakeup' entries associated with these devices. In this particular case,
>> maybe the driver could simply set this entry to non-wakeup capable when necessary and
>> then let the PM code take care of skipping the wfi execution.
>> 
>> Thoughts/brickbats welcome :)
>
> You can maybe (ab)use the pm_qos mechanism for this.

I thought of using this too, but it doesn't actually solve the problem:

Using PM QoS, you can avoid hitting the deeper idle states by setting a
very low wakeup latency.  However, on ARM platforms, even the shallowest
idle states use the WFI instruction, and the EMAC would still not be
able to wake the system from WFI.  A possibility would be define the
shallowest idle state to be one that doesn't call WFI and just does
cpu_relax().  However, that would only work for CPUidle since PM QoS
constraints are only checked by CPUidle.  So, a non-CPUidle kernel would
still have this bug. :(

Ultimately, this is just broken HW.  This network HW was bolted onto an
existing SoC without consideration for wakeup capabilities.  The result
is that any use of this device with networking has to completely disable
SoC power management.

Kevin

^ permalink raw reply

* Re: sky2 still badly broken
From: Stephen Hemminger @ 2012-05-03 21:25 UTC (permalink / raw)
  To: Niccolò Belli; +Cc: netdev
In-Reply-To: <4FA2DE52.306@linuxsystems.it>

On Thu, 03 May 2012 21:36:50 +0200
Niccolò Belli <darkbasic@linuxsystems.it> wrote:

> Il 03/05/2012 17:23, Stephen Hemminger ha scritto:
> > The receiver on some versions of the chip can't keep up with full speed
> > of 1G bit/sec. The receive  FIFO has hardware issues, and since I don't
> > work for Marvell, working around the problem is guesswork.
> 
> Just one question: if that is true why don't I have any problem with a 
> point to point gigabit link? I did transfer 3GB from ram to ram (ramfs) 
> at 100+MB/s without any issue in a point to point setup, as soon as I 
> attach it to the switch I have problems (very low transfer rate and tons 
> of rx errors).

I don't know but the PHY interaction is different with auto-crossover.

^ permalink raw reply

* Re: [PATCH iproute2 ] Update man8 Makefile
From: Stephen Hemminger @ 2012-05-03 21:16 UTC (permalink / raw)
  To: Vijay Subramanian; +Cc: netdev, Christoph J. Thompson
In-Reply-To: <1336076146-5464-1-git-send-email-subramanian.vijay@gmail.com>

On Thu,  3 May 2012 13:15:46 -0700
Vijay Subramanian <subramanian.vijay@gmail.com> wrote:

> Commit (761a1e60 iproute2 - Split up manual page installation )
> introduced man/man8/Makefile but did not add all the man pages.
> This patch adds the missing man pages for installation.
> 
> Signed-off-by: Vijay Subramanian <subramanian.vijay@gmail.com>
> ---
>  man/man8/Makefile |    6 +++++-
>  1 files changed, 5 insertions(+), 1 deletions(-)
> 
> diff --git a/man/man8/Makefile b/man/man8/Makefile
> index 5d64012..6873a4b 100644
> --- a/man/man8/Makefile
> +++ b/man/man8/Makefile
> @@ -3,7 +3,11 @@ TARGETS = ip-address.8 ip-link.8 ip-route.8
>  MAN8PAGES = $(TARGETS) ip.8 arpd.8 lnstat.8 routel.8 rtacct.8 rtmon.8 ss.8 \
>  	tc-bfifo.8 tc-cbq-details.8 tc-cbq.8 tc-drr.8 tc-htb.8 \
>  	tc-pfifo.8 tc-pfifo_fast.8 tc-prio.8 tc-red.8 tc-sfq.8 \
> -	tc-tbf.8 tc.8 rtstat.8 ctstat.8 nstat.8 routef.8
> +	tc-tbf.8 tc.8 rtstat.8 ctstat.8 nstat.8 routef.8 \
> +	tc-sfb.8 tc-netem.8 tc-choke.8 ip-tunnel.8 ip-rule.8 ip-ntable.8 \
> +	ip-monitor.8 tc-stab.8 tc-hfsc.8 ip-xfrm.8 ip-netns.8 \
> +	ip-neighbour.8 ip-mroute.8 ip-maddress.8 ip-addrlabel.8
> +
>  
>  all: $(TARGETS)
>  

Applied, thanks

^ permalink raw reply

* Re: [PATCH 7/9] net: add skb_orphan_frags to copy aside frags with destructors
From: Michael S. Tsirkin @ 2012-05-03 21:10 UTC (permalink / raw)
  To: David Miller; +Cc: ian.campbell, netdev, eric.dumazet
In-Reply-To: <20120503.135532.1038384417514973419.davem@davemloft.net>

On Thu, May 03, 2012 at 01:55:32PM -0400, David Miller wrote:
> From: "Michael S. Tsirkin" <mst@redhat.com>
> Date: Thu, 3 May 2012 18:41:43 +0300
> 
> > On Thu, May 03, 2012 at 03:56:09PM +0100, Ian Campbell wrote:
> >> This should be used by drivers which need to hold on to an skb for an extended
> >> (perhaps unbounded) period of time. e.g. the tun driver which relies on
> >> userspace consuming the skb.
> >> 
> >> Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
> >> Cc: mst@redhat.com
> > 
> > 
> > Right. But local sockets queue at socket forever as well.
> > I think this should be called in skb_set_owner_r?
> > 
> > This might somewhat penalize speed for local clients in the name
> > of correctness but these are rare so being correct is
> > more important I think.
> 
> But, on the other hand, putting the check into skb_set_owner_r() is a
> not so nice test to have in the fast path of every socket receive.

True.  Hmm we orphan skbs when we loop them back
so how about reusing the skb->destructor for this?

We could teach pskb_copy pskb_expand_head etc that
when skb with this flag is cloned (expand head etc)
destructor would be set to a function that copies frags.
(clone is less of a fast path so I think adding
 a branch there is less of an issue).

Of course destructor is also called from kfree_skb
but we could clear this flag before the call
in kfree_skb so that destructor can distinguish.

-- 
MST

^ permalink raw reply

* Re: [PATCHv2 4/7] mISDN: Make layer1 timer 3 value configurable
From: David Miller @ 2012-05-03 21:08 UTC (permalink / raw)
  To: kkeil; +Cc: netdev, isdn
In-Reply-To: <1336060052-27119-5-git-send-email-kkeil@linux-pingi.de>

From: Karsten Keil <kkeil@linux-pingi.de>
Date: Thu,  3 May 2012 17:47:29 +0200

> @@ -372,6 +372,7 @@ clear_channelmap(u_int nr, u_char *map)
>  #define MISDN_CTRL_RX_OFF		0x0100
>  #define MISDN_CTRL_FILL_EMPTY		0x0200
>  #define MISDN_CTRL_GETPEER		0x0400
> +#define MISDN_CTRL_L1_TIMER3		0x0800
>  #define MISDN_CTRL_HW_FEATURES_OP	0x2000
>  #define MISDN_CTRL_HW_FEATURES		0x2001
>  #define MISDN_CTRL_HFC_OP		0x4000

This define is completely unused by this patch.

To say that I'm frustrated by this process would be an understatement.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox