* [PATCH net-next] tcp: md5: add more const attributes
From: Eric Dumazet @ 2011-10-24 6:35 UTC (permalink / raw)
To: David Miller; +Cc: netdev
Now tcp_md5_hash_header() has a const tcphdr argument, we can add more
const attributes to callers.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
include/net/tcp.h | 17 +++++++++--------
net/ipv4/tcp_ipv4.c | 12 ++++++------
net/ipv6/tcp_ipv6.c | 13 +++++++------
3 files changed, 22 insertions(+), 20 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 910cc29..ed0e814 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1185,8 +1185,9 @@ struct tcp_md5sig_pool {
/* - functions */
extern int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
- struct sock *sk, struct request_sock *req,
- struct sk_buff *skb);
+ const struct sock *sk,
+ const struct request_sock *req,
+ const struct sk_buff *skb);
extern struct tcp_md5sig_key * tcp_v4_md5_lookup(struct sock *sk,
struct sock *addr_sk);
extern int tcp_v4_md5_do_add(struct sock *sk, __be32 addr, u8 *newkey,
@@ -1448,9 +1449,9 @@ struct tcp_sock_af_ops {
struct sock *addr_sk);
int (*calc_md5_hash) (char *location,
struct tcp_md5sig_key *md5,
- struct sock *sk,
- struct request_sock *req,
- struct sk_buff *skb);
+ const struct sock *sk,
+ const struct request_sock *req,
+ const struct sk_buff *skb);
int (*md5_add) (struct sock *sk,
struct sock *addr_sk,
u8 *newkey,
@@ -1467,9 +1468,9 @@ struct tcp_request_sock_ops {
struct request_sock *req);
int (*calc_md5_hash) (char *location,
struct tcp_md5sig_key *md5,
- struct sock *sk,
- struct request_sock *req,
- struct sk_buff *skb);
+ const struct sock *sk,
+ const struct request_sock *req,
+ const struct sk_buff *skb);
#endif
};
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 955c925..1dad7e9 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -92,7 +92,7 @@ EXPORT_SYMBOL(sysctl_tcp_low_latency);
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
__be32 addr);
static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
- __be32 daddr, __be32 saddr, struct tcphdr *th);
+ __be32 daddr, __be32 saddr, const struct tcphdr *th);
#else
static inline
struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
@@ -1090,7 +1090,7 @@ static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
}
static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
- __be32 daddr, __be32 saddr, struct tcphdr *th)
+ __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
struct tcp_md5sig_pool *hp;
struct hash_desc *desc;
@@ -1122,12 +1122,12 @@ clear_hash_noput:
}
int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
- struct sock *sk, struct request_sock *req,
- struct sk_buff *skb)
+ const struct sock *sk, const struct request_sock *req,
+ const struct sk_buff *skb)
{
struct tcp_md5sig_pool *hp;
struct hash_desc *desc;
- struct tcphdr *th = tcp_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
__be32 saddr, daddr;
if (sk) {
@@ -1172,7 +1172,7 @@ clear_hash_noput:
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
-static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
+static int tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
{
/*
* This gets called for each TCP segment that arrives
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index da2ada8..c8683fc 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -761,7 +761,7 @@ static int tcp_v6_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
const struct in6_addr *daddr, struct in6_addr *saddr,
- struct tcphdr *th)
+ const struct tcphdr *th)
{
struct tcp_md5sig_pool *hp;
struct hash_desc *desc;
@@ -793,13 +793,14 @@ clear_hash_noput:
}
static int tcp_v6_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
- struct sock *sk, struct request_sock *req,
- struct sk_buff *skb)
+ const struct sock *sk,
+ const struct request_sock *req,
+ const struct sk_buff *skb)
{
const struct in6_addr *saddr, *daddr;
struct tcp_md5sig_pool *hp;
struct hash_desc *desc;
- struct tcphdr *th = tcp_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
if (sk) {
saddr = &inet6_sk(sk)->saddr;
@@ -842,12 +843,12 @@ clear_hash_noput:
return 1;
}
-static int tcp_v6_inbound_md5_hash (struct sock *sk, struct sk_buff *skb)
+static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
{
const __u8 *hash_location = NULL;
struct tcp_md5sig_key *hash_expected;
const struct ipv6hdr *ip6h = ipv6_hdr(skb);
- struct tcphdr *th = tcp_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
int genhash;
u8 newhash[16];
^ permalink raw reply related
* Re: [PATCH 0/5] macvtap fixes.
From: Michael S. Tsirkin @ 2011-10-24 6:31 UTC (permalink / raw)
To: David Miller; +Cc: ebiederm, netdev, arnd, jasowang, ian.campbell, mashirley
In-Reply-To: <20111021.025607.2193225431127507358.davem@redhat.com>
On Fri, Oct 21, 2011 at 02:56:07AM -0400, David Miller wrote:
> From: ebiederm@xmission.com (Eric W. Biederman)
> Date: Thu, 20 Oct 2011 07:24:59 -0700
>
> >
> > This series of patches fixes a series of minor bugs in the macvtap code.
> >
> > The fixes to handle failures in newlink and the change in how we handle
> > minor device number allocations are particularly significant.
>
> Applied to net-next, thanks.
The 'Don't leak unreceived packets' patch probably makes sense in 'net'.
--
MST
^ permalink raw reply
* Re: [PATCH net-next] Add ethtool -g support to virtio_net
From: David Miller @ 2011-10-24 6:15 UTC (permalink / raw)
To: mst; +Cc: rusty, raj, netdev, virtualization
In-Reply-To: <20111024053103.GA24528@redhat.com>
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Mon, 24 Oct 2011 07:31:05 +0200
> On Sun, Oct 23, 2011 at 11:26:04PM -0400, David Miller wrote:
>> From: Rusty Russell <rusty@rustcorp.com.au>
>> Date: Thu, 20 Oct 2011 18:25:24 +1030
>>
>> > On Wed, 19 Oct 2011 11:10:59 -0700 (PDT), raj@tardy.cup.hp.com (Rick Jones) wrote:
>> >> From: Rick Jones <rick.jones2@hp.com>
>> >>
>> >> Add support for reporting ring sizes via ethtool -g to the virtio_net
>> >> driver.
>> >>
>> >> Signed-off-by: Rick Jones <rick.jones2@hp.com>
>> >
>> > Acked-by: Rusty Russell <rusty@rustcorp.com.au>
>> >
>> > MST, want me to take this, or do you?
>>
>> I can also take it directly, just let me know.
>
> Please do, thanks very much.
>
> Acked-by: Michael S. Tsirkin <mst@redhat.com>
>
Done.
^ permalink raw reply
* Re: [PATCH net-next] Add ethtool -g support to virtio_net
From: Rusty Russell @ 2011-10-24 4:41 UTC (permalink / raw)
To: David Miller; +Cc: raj, netdev, mst, virtualization
In-Reply-To: <20111023.232604.18246342474494526.davem@davemloft.net>
On Sun, 23 Oct 2011 23:26:04 -0400 (EDT), David Miller <davem@davemloft.net> wrote:
> From: Rusty Russell <rusty@rustcorp.com.au>
> Date: Thu, 20 Oct 2011 18:25:24 +1030
>
> > On Wed, 19 Oct 2011 11:10:59 -0700 (PDT), raj@tardy.cup.hp.com (Rick Jones) wrote:
> >> From: Rick Jones <rick.jones2@hp.com>
> >>
> >> Add support for reporting ring sizes via ethtool -g to the virtio_net
> >> driver.
> >>
> >> Signed-off-by: Rick Jones <rick.jones2@hp.com>
> >
> > Acked-by: Rusty Russell <rusty@rustcorp.com.au>
> >
> > MST, want me to take this, or do you?
>
> I can also take it directly, just let me know.
I'm happy with that.
Thanks,
Rusty.
^ permalink raw reply
* Re: [net-next-2.6 PATCH 7/8 RFC v2] macvtap: Add support to set MAC/VLAN filter rtnl link operations
From: Michael S. Tsirkin @ 2011-10-24 5:57 UTC (permalink / raw)
To: Roopa Prabhu
Cc: netdev, sri, dragos.tatulea, arnd, kvm, davem, mchan, dwang2,
shemminger, eric.dumazet, kaber, benve
In-Reply-To: <20111019062630.7242.99374.stgit@savbu-pc100.cisco.com>
On Tue, Oct 18, 2011 at 11:26:30PM -0700, Roopa Prabhu wrote:
> From: Roopa Prabhu <roprabhu@cisco.com>
>
> This patch adds support to set MAC and VLAN filter rtnl_link_ops
> on a macvtap interface. It adds support for set_rx_addr_filter and
> set_rx_vlan_filter rtnl link operations. These operations inturn call the
> equivalent operations defined in macvlan
>
> Signed-off-by: Roopa Prabhu <roprabhu@cisco.com>
> Signed-off-by: Christian Benvenuti <benve@cisco.com>
> Signed-off-by: David Wang <dwang2@cisco.com>
> ---
> drivers/net/macvtap.c | 22 ++++++++++++++++++----
> 1 files changed, 18 insertions(+), 4 deletions(-)
>
>
> diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
> index 3da5578..8a2cb59 100644
> --- a/drivers/net/macvtap.c
> +++ b/drivers/net/macvtap.c
> @@ -273,6 +273,18 @@ static int macvtap_receive(struct sk_buff *skb)
> return macvtap_forward(skb->dev, skb);
> }
>
> +static int macvtap_set_rx_addr_filter(struct net_device *dev,
> + struct nlattr *tb[])
> +{
> + return macvlan_set_rx_addr_filter(dev, tb);
> +}
> +
> +static int macvtap_set_rx_vlan_filter(struct net_device *dev,
> + struct nlattr *tb[])
> +{
> + return macvlan_set_rx_vlan_filter(dev, tb);
> +}
> +
> static int macvtap_newlink(struct net *src_net,
> struct net_device *dev,
> struct nlattr *tb[],
Can use macvlanXXX directly here? Why wrap it?
> @@ -317,10 +329,12 @@ static void macvtap_setup(struct net_device *dev)
> }
>
> static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
> - .kind = "macvtap",
> - .setup = macvtap_setup,
> - .newlink = macvtap_newlink,
> - .dellink = macvtap_dellink,
> + .kind = "macvtap",
> + .setup = macvtap_setup,
> + .newlink = macvtap_newlink,
> + .dellink = macvtap_dellink,
> + .set_rx_addr_filter = macvtap_set_rx_addr_filter,
> + .set_rx_vlan_filter = macvtap_set_rx_vlan_filter,
> };
>
>
^ permalink raw reply
* Re: [net-next-2.6 PATCH 8/8 RFC v2] macvtap: Add support to get MAC/VLAN filter rtnl link operations
From: Michael S. Tsirkin @ 2011-10-24 5:56 UTC (permalink / raw)
To: Roopa Prabhu
Cc: netdev, sri, dragos.tatulea, arnd, kvm, davem, mchan, dwang2,
shemminger, eric.dumazet, kaber, benve
In-Reply-To: <20111019062636.7242.46889.stgit@savbu-pc100.cisco.com>
On Tue, Oct 18, 2011 at 11:26:36PM -0700, Roopa Prabhu wrote:
> From: Roopa Prabhu <roprabhu@cisco.com>
>
> This patch adds support to get MAC and VLAN filter rtnl_link_ops
> on a macvtap interface. It adds support for get_rx_addr_filter_size,
> get_rx_vlan_filter_size, fill_rx_addr_filter and fill_rx_vlan_filter
> rtnl link operations. Calls equivalent macvlan operations.
>
> Signed-off-by: Roopa Prabhu <roprabhu@cisco.com>
> Signed-off-by: Christian Benvenuti <benve@cisco.com>
> Signed-off-by: David Wang <dwang2@cisco.com>
> ---
> drivers/net/macvtap.c | 27 +++++++++++++++++++++++++++
> 1 files changed, 27 insertions(+), 0 deletions(-)
>
>
> diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
> index 8a2cb59..9b40de7 100644
> --- a/drivers/net/macvtap.c
> +++ b/drivers/net/macvtap.c
> @@ -285,6 +285,29 @@ static int macvtap_set_rx_vlan_filter(struct net_device *dev,
> return macvlan_set_rx_vlan_filter(dev, tb);
> }
>
> +static int macvtap_fill_rx_addr_filter(struct sk_buff *skb,
> + const struct net_device *dev)
> +{
> + return macvlan_fill_rx_addr_filter(skb, dev);
> +}
> +
> +static int macvtap_fill_rx_vlan_filter(struct sk_buff *skb,
> + const struct net_device *dev)
> +{
> + return macvlan_fill_rx_vlan_filter(skb, dev);
> +}
> +
> +static size_t macvtap_get_rx_addr_filter_size(const struct net_device *dev)
> +{
> + return macvlan_get_rx_addr_filter_size(dev);
> +}
> +
> +static size_t macvtap_get_rx_vlan_filter_size(const struct net_device *dev)
> +{
> + return macvlan_get_rx_vlan_filter_size(dev);
> +}
So why do we need the above wrappers? Can't use macvlanXXX directly?
> +
> +
don't add double emoty lines pls.
> static int macvtap_newlink(struct net *src_net,
> struct net_device *dev,
> struct nlattr *tb[],
> @@ -335,6 +358,10 @@ static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
> .dellink = macvtap_dellink,
> .set_rx_addr_filter = macvtap_set_rx_addr_filter,
> .set_rx_vlan_filter = macvtap_set_rx_vlan_filter,
> + .get_rx_addr_filter_size = macvtap_get_rx_addr_filter_size,
> + .get_rx_vlan_filter_size = macvtap_get_rx_vlan_filter_size,
> + .fill_rx_addr_filter = macvtap_fill_rx_addr_filter,
> + .fill_rx_vlan_filter = macvtap_fill_rx_vlan_filter,
> };
>
>
^ permalink raw reply
* Re: [PATCH net-next] tcp: avoid frag allocation for small frames
From: David Miller @ 2011-10-24 5:53 UTC (permalink / raw)
To: eric.dumazet; +Cc: netdev
In-Reply-To: <1319286237.6180.49.camel@edumazet-laptop>
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sat, 22 Oct 2011 14:23:57 +0200
> tcp_sendmsg() uses select_size() helper to choose skb head size when a
> new skb must be allocated.
>
> If GSO is enabled for the socket, current strategy is to force all
> payload data to be outside of headroom, in PAGE fragments.
>
> This strategy is not welcome for small packets, wasting memory.
>
> Experiments show that best results are obtained when using 2048 bytes
> for skb head (This includes the skb overhead and various headers)
>
> This patch provides better len/truesize ratios for packets sent to
> loopback device, and reduce memory needs for in-flight loopback packets,
> particularly on arches with big pages.
>
> If a sender sends many 1-byte packets to an unresponsive application,
> receiver rmem_alloc will grow faster and will stop queuing these packets
> sooner, or will collapse its receive queue to free excess memory.
>
> netperf -t TCP_RR results are improved by ~4 %, and many workloads are
> improved as well (tbench, mysql...)
>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Applied, thanks Eric.
^ permalink raw reply
* Re: [net-next-2.6 PATCH 0/8 RFC v2] macvlan: MAC Address filtering support for passthru mode
From: Michael S. Tsirkin @ 2011-10-24 5:47 UTC (permalink / raw)
To: Roopa Prabhu
Cc: netdev, sri, dragos.tatulea, arnd, kvm, davem, mchan, dwang2,
shemminger, eric.dumazet, kaber, benve
In-Reply-To: <20111019062543.7242.3969.stgit@savbu-pc100.cisco.com>
On Tue, Oct 18, 2011 at 11:25:54PM -0700, Roopa Prabhu wrote:
> v1 version of this RFC patch was posted at http://www.spinics.net/lists/netdev/msg174245.html
>
> Today macvtap used in virtualized environment does not have support to
> propagate MAC, VLAN and interface flags from guest to lowerdev.
> Which means to be able to register additional VLANs, unicast and multicast
> addresses or change pkt filter flags in the guest, the lowerdev has to be
> put in promisocous mode. Today the only macvlan mode that supports this is
> the PASSTHRU mode and it puts the lower dev in promiscous mode.
>
> PASSTHRU mode was added primarily for the SRIOV usecase. In PASSTHRU mode
> there is a 1-1 mapping between macvtap and physical NIC or VF.
>
> There are two problems with putting the lowerdev in promiscous mode (ie SRIOV
> VF's):
> - Some SRIOV cards dont support promiscous mode today (Thread on Intel
> driver indicates that http://lists.openwall.net/netdev/2011/09/27/6)
> - For the SRIOV NICs that support it, Putting the lowerdev in
> promiscous mode leads to additional traffic being sent up to the
> guest virtio-net to filter result in extra overheads.
>
> Both the above problems can be solved by offloading filtering to the
> lowerdev hw. ie lowerdev does not need to be in promiscous mode as
> long as the guest filters are passed down to the lowerdev.
>
> This patch basically adds the infrastructure to set and get MAC and VLAN
> filters on an interface via rtnetlink. And adds support in macvlan and macvtap
> to allow set and get filter operations.
Looks sane to me. Some minor comments below.
> Earlier version of this patch provided the TUNSETTXFILTER macvtap interface
> for setting address filtering. In response to feedback, This version
> introduces a netlink interface for the same.
>
> Response to some of the questions raised during v1:
>
> - Netlink interface:
> This patch provides the following netlink interface to set mac and vlan
> filters :
> [IFLA_RX_FILTER] = {
> [IFLA_ADDR_FILTER] = {
> [IFLA_ADDR_FILTER_FLAGS]
> [IFLA_ADDR_FILTER_UC_LIST] = {
> [IFLA_ADDR_LIST_ENTRY]
> }
> [IFLA_ADDR_FILTER_MC_LIST] = {
> [IFLA_ADDR_LIST_ENTRY]
> }
> }
> [IFLA_VLAN_FILTER] = {
> [IFLA_VLAN_BITMAP]
> }
> }
>
> Note: The IFLA_VLAN_FILTER is a nested attribute and contains only
> IFLA_VLAN_BITMAP today. The idea is that the IFLA_VLAN_FILTER can
> be extended tomorrow to use a vlan list option if some implementations
> prefer a list instead.
>
> And it provides the following rtnl_link_ops to set/get MAC/VLAN filters:
>
> int (*set_rx_addr_filter)(struct net_device *dev,
> struct nlattr *tb[]);
> int (*set_rx_vlan_filter)(struct net_device *dev,
> struct nlattr *tb[]);
> size_t (*get_rx_addr_filter_size)(const struct
> net_device *dev);
> size_t (*get_rx_vlan_filter_size)(const struct
> net_device *dev);
> int (*fill_rx_addr_filter)(struct sk_buff *skb,
> const struct net_device *dev);
> int (*fill_rx_vlan_filter)(struct sk_buff *skb,
> const struct net_device *dev);
>
>
> Note: The choice of rtnl_link_ops was because I saw the use case for
> this in virtual devices that need to do filtering in sw like macvlan
> and tun. Hw devices usually have filtering in hw with netdev->uc and
> mc lists to indicate active filters. But I can move from rtnl_link_ops
> to netdev_ops if that is the preferred way to go and if there is a
> need to support this interface on all kinds of interfaces.
> Please suggest.
>
> - Protection against address spoofing:
> - This patch adds filtering support only for macvtap PASSTHRU
> Mode. PASSTHRU mode is used mainly with SRIOV VF's. And SRIOV VF's
> come with anti mac/vlan spoofing support. (Recently added
> IFLA_VF_SPOOFCHK). In 802.1Qbh case the port profile has a knob to
> enable/disable anti spoof check. Lowerdevice drivers also enforce limits
> on the number of address registrations allowed.
>
> - Support for multiqueue devices: Enable filtering on individual queues (?):
> AFAIK, there is no netdev interface to install per queue hw
> filters for a multi queue interface. And also I dont know of any hw
> that provides an interface to set hw filters on a per queue basis.
VMDq hardware would support this, no?
> A multi queue device appears as a single lowerdev (ie netdev) and
> uses the same uc and mc lists to setup unicast and multicast hw filters.
> So i dont see a huge problem with this patch coming in the way for
> multi queue devices.
>
> - Support for non-PASSTHRU mode:
> I started implementing this. But there are a couple of problems.
> - The lowerdev may not be a SRIOV VF and may not have
> anti spoof capability
Anti-spoofing a really a separate feature, isn't it?
> - Today, in non-PASSTHRU cases macvlan_handle_frame assumes that
> every macvlan device on top of the lowerdev has a single unique mac.
> And the macvlans are hashed on that single mac address.
> To support filtering for non-PASSTHRU mode in addition to this
> patch the following needs to be done:
> - non-passthru mode with a single macvlan over a lower dev
> can be treated as PASSTHRU case
> - For non-PASSTHRU mode with multiple macvlans over a single
> lower dev:
> - Multiple unicast mac's now need to be hashed to the
> same macvlan device. The macvlan hash needs to change
> for lookup based on any one of the multiple unicast
> addresses a macvlan is interested in
> - We need to consider vlans during the lookup too
> - So the macvlan device hash needs to hash on both mac
> and vlan
It might be useful to expose the filters to the device.
> - But the support for filtering in non-PASSTHRU mode can be
> built on this patch
Agree, this can be added gradually.
> This patch series implements the following
> 01/8 rtnetlink: Netlink interface for setting MAC and VLAN filters
> 02/8 rtnetlink: Add rtnl link operations for MAC address and VLAN filtering
> 03/8 rtnetlink: Add support to set MAC/VLAN filters
> 04/8 rtnetlink: Add support to get MAC/VLAN filters
> 05/8 macvlan: Add support to set MAC/VLAN filter rtnl link operations
> 06/8 macvlan: Add support to get MAC/VLAN filter rtnl link operations
> 07/8 macvtap: Add support to set MAC/VLAN filter rtnl link operations
> 08/8 macvtap: Add support to get MAC/VLAN filter rtnl link operations
>
> Please comment. Thanks.
>
> Signed-off-by: Roopa Prabhu <roprabhu@cisco.com>
> Signed-off-by: Christian Benvenuti <benve@cisco.com>
> Signed-off-by: David Wang <dwang2@cisco.com>
^ permalink raw reply
* Re: [PATCH net-next] Add ethtool -g support to virtio_net
From: Michael S. Tsirkin @ 2011-10-24 5:31 UTC (permalink / raw)
To: David Miller; +Cc: rusty, raj, netdev, virtualization
In-Reply-To: <20111023.232604.18246342474494526.davem@davemloft.net>
On Sun, Oct 23, 2011 at 11:26:04PM -0400, David Miller wrote:
> From: Rusty Russell <rusty@rustcorp.com.au>
> Date: Thu, 20 Oct 2011 18:25:24 +1030
>
> > On Wed, 19 Oct 2011 11:10:59 -0700 (PDT), raj@tardy.cup.hp.com (Rick Jones) wrote:
> >> From: Rick Jones <rick.jones2@hp.com>
> >>
> >> Add support for reporting ring sizes via ethtool -g to the virtio_net
> >> driver.
> >>
> >> Signed-off-by: Rick Jones <rick.jones2@hp.com>
> >
> > Acked-by: Rusty Russell <rusty@rustcorp.com.au>
> >
> > MST, want me to take this, or do you?
>
> I can also take it directly, just let me know.
Please do, thanks very much.
Acked-by: Michael S. Tsirkin <mst@redhat.com>
--
MST
^ permalink raw reply
* Re: [RFC v2 PATCH 5/4 PATCH] virtio-net: send gratuitous packet when needed
From: Michael S. Tsirkin @ 2011-10-24 5:25 UTC (permalink / raw)
To: Rusty Russell
Cc: aliguori, kvm, quintela, jan.kiszka, Jason Wang, qemu-devel,
blauwirbel, netdev, pbonzini
In-Reply-To: <87ty6z10f8.fsf@rustcorp.com.au>
On Mon, Oct 24, 2011 at 02:54:59PM +1030, Rusty Russell wrote:
> On Sat, 22 Oct 2011 13:43:11 +0800, Jason Wang <jasowang@redhat.com> wrote:
> > This make let virtio-net driver can send gratituous packet by a new
> > config bit - VIRTIO_NET_S_ANNOUNCE in each config update
> > interrupt. When this bit is set by backend, the driver would schedule
> > a workqueue to send gratituous packet through NETDEV_NOTIFY_PEERS.
> >
> > This feature is negotiated through bit VIRTIO_NET_F_GUEST_ANNOUNCE.
> >
> > Signed-off-by: Jason Wang <jasowang@redhat.com>
>
> This seems like a huge layering violation. Imagine this in real
> hardware, for example.
commits 06c4648d46d1b757d6b9591a86810be79818b60c
and 99606477a5888b0ead0284fecb13417b1da8e3af
document the need for this:
NETDEV_NOTIFY_PEERS notifier indicates that a device moved to a
different physical link.
and
In real hardware such notifications are only
generated when the device comes up or the address changes.
So hypervisor could get the same behaviour by sending link up/down
events, this is just an optimization so guest won't do
unecessary stuff like try to reconfigure an IP address.
Maybe LOCATION_CHANGE would be a better name?
> There may be a good reason why virtual devices might want this kind of
> reconfiguration cheat, which is unnecessary for normal machines,
I think yes, the difference with real hardware is guest can change
location without link getting dropped.
FWIW, Xen seems to use this capability too.
> but
> it'd have to be spelled out clearly in the spec to justify it...
>
> Cheers,
> Rusty.
Agree, and I'd like to see the spec too. The interface seems
to involve the guest clearing the status bit when it detects
an event?
Also - how does it interact with the link up event?
We probably don't want to schedule this when we detect
a link status change or during initialization, as
this patch seems to do? What if link goes down
while the work is running? Is that OK?
--
MST
^ permalink raw reply
* Re: -next: NET_VENDOR_8390 dependencies
From: Jeff Kirsher @ 2011-10-24 5:07 UTC (permalink / raw)
To: Geert Uytterhoeven; +Cc: netdev@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <CAMuHMdW6hC1JchxVL9-4f58a8Vg7cF+97BbKePxjEDHY3H8wNw@mail.gmail.com>
[-- Attachment #1: Type: text/plain, Size: 2060 bytes --]
On Sun, 2011-10-23 at 14:21 -0700, Geert Uytterhoeven wrote:
> drivers/net/ethernet/8390/Kconfig:
>
> config NET_VENDOR_8390
> bool "National Semi-conductor 8390 devices"
> default y
> depends on NET_VENDOR_NATSEMI && (AMIGA_PCMCIA || PCI || SUPERH || \
> ISA || MCA || EISA || MAC || M32R || MACH_TX49XX || \
> MCA_LEGACY || H8300 || ARM || MIPS || ZORRO || PCMCIA || \
> EXPERIMENTAL)
> ---help---
> If you have a network (Ethernet) card belonging to this class, say Y
> and read the Ethernet-HOWTO, available from
> <http://www.tldp.org/docs.html#howto>.
>
> Note that the answer to this question doesn't directly affect the
> kernel: saying N will just cause the configurator to skip all
> the questions about Western Digital cards. If you say Y, you will be
> asked for your specific card in the following questions.
>
> So NET_VENDOR_8390 depends on NET_VENDOR_NATSEMI.
>
> drivers/net/ethernet/natsemi/Kconfig:
>
> config NET_VENDOR_NATSEMI
> bool "National Semi-conductor devices"
> default y
> depends on MCA || MAC || MACH_JAZZ || PCI || XTENSA_PLATFORM_XT2000
> ---help---
> If you have a network (Ethernet) card belonging to this class, say Y
> and read the Ethernet-HOWTO, available from
> <http://www.tldp.org/docs.html#howto>.
>
> Note that the answer to this question doesn't directly affect the
> kernel: saying N will just cause the configurator to skip all
> the questions about National Semi-conductor devices. If you say Y,
> you will be asked for your specific card in the following questions.
>
> But NET_VENDOR_NATSEMI will never be true for several of the other
> dependencies of NET_VENDOR_8390 (e.g. AMIGA_PCMCIA, EISA, H8300, ARM,
> ZORRO, PCMCIA)?
>
You are correct, I will put together a patch to fix that. Thanks!
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 836 bytes --]
^ permalink raw reply
* Re: [RFD] Network configuration data in sysfs
From: David Miller @ 2011-10-24 4:59 UTC (permalink / raw)
To: kirill
Cc: kay.sievers, netdev, kuznet, jmorris, yoshfuji, kaber, gregkh,
gladkov.alexey
In-Reply-To: <20111024042400.GA31344@shutemov.name>
From: "Kirill A. Shutemov" <kirill@shutemov.name>
Date: Mon, 24 Oct 2011 07:24:00 +0300
> On Sun, Oct 23, 2011 at 11:24:16PM -0400, David Miller wrote:
>> From: "Kirill A. Shutemov" <kirill@shutemov.name>
>> Date: Mon, 24 Oct 2011 04:34:07 +0300
>>
>> You can use netlink to perform any configuration change you want, or
>> to view any network configuration setting.
>
> You need /sbin/ip or similar tool to do this, right?
I'm talking about udev using netlink natively.
^ permalink raw reply
* Re: [PATCH 9/9] make net/core/scm.c uid comparisons user namespace aware
From: Eric W. Biederman @ 2011-10-24 4:27 UTC (permalink / raw)
To: Serge E. Hallyn
Cc: Serge E. Hallyn, linux-kernel, akpm, oleg, richard, mikevs,
segoon, gregkh, dhowells, eparis, netdev
In-Reply-To: <20111024041529.GA23618@hallyn.com>
"Serge E. Hallyn" <serge@hallyn.com> writes:
> Never mind, it all gets a little convoluted, but I see how it works,
> and - when the time comes - how to do it right for userns. :) Sorry
> about that.
No problem.
Eric
^ permalink raw reply
* Re: [RFC v2 PATCH 5/4 PATCH] virtio-net: send gratuitous packet when needed
From: Rusty Russell @ 2011-10-24 4:24 UTC (permalink / raw)
To: Jason Wang, aliguori, quintela, jan.kiszka, mst, qemu-devel,
blauwirbel
Cc: pbonzini, kvm, netdev
In-Reply-To: <20111022054311.21798.3340.stgit@dhcp-8-146.nay.redhat.com>
On Sat, 22 Oct 2011 13:43:11 +0800, Jason Wang <jasowang@redhat.com> wrote:
> This make let virtio-net driver can send gratituous packet by a new
> config bit - VIRTIO_NET_S_ANNOUNCE in each config update
> interrupt. When this bit is set by backend, the driver would schedule
> a workqueue to send gratituous packet through NETDEV_NOTIFY_PEERS.
>
> This feature is negotiated through bit VIRTIO_NET_F_GUEST_ANNOUNCE.
>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
This seems like a huge layering violation. Imagine this in real
hardware, for example.
There may be a good reason why virtual devices might want this kind of
reconfiguration cheat, which is unnecessary for normal machines, but
it'd have to be spelled out clearly in the spec to justify it...
Cheers,
Rusty.
^ permalink raw reply
* Re: [RFD] Network configuration data in sysfs
From: Kirill A. Shutemov @ 2011-10-24 4:24 UTC (permalink / raw)
To: David Miller
Cc: kay.sievers, netdev, kuznet, jmorris, yoshfuji, kaber, gregkh,
gladkov.alexey
In-Reply-To: <20111023.232416.1038111296509565828.davem@davemloft.net>
On Sun, Oct 23, 2011 at 11:24:16PM -0400, David Miller wrote:
> From: "Kirill A. Shutemov" <kirill@shutemov.name>
> Date: Mon, 24 Oct 2011 04:34:07 +0300
>
> > On Sun, Oct 23, 2011 at 08:49:43PM -0400, David Miller wrote:
> >> From: "Kirill A. Shutemov" <kirill@shutemov.name>
> >> Date: Mon, 24 Oct 2011 02:35:58 +0300
> >>
> >> > Is there something fundamental preventing us to have sysfs interface for
> >> > network configuration?
> >>
> >> Netlink already provides everything sysfs would potentially provide as
> >> well as event tracking.
> >
> > Not everything. You still need special utilities to view/change the
> > configuration.
>
> You can use netlink to perform any configuration change you want, or
> to view any network configuration setting.
You need /sbin/ip or similar tool to do this, right?
--
Kirill A. Shutemov
^ permalink raw reply
* Re: [PATCH 9/9] make net/core/scm.c uid comparisons user namespace aware
From: Serge E. Hallyn @ 2011-10-24 4:15 UTC (permalink / raw)
To: Serge E. Hallyn
Cc: Eric W. Biederman, linux-kernel, akpm, oleg, richard, mikevs,
segoon, gregkh, dhowells, eparis, netdev
In-Reply-To: <20111020141440.GA6201@sergelap>
Quoting Serge E. Hallyn (serge.hallyn@canonical.com):
> Quoting Eric W. Biederman (ebiederm@xmission.com):
> > "Serge E. Hallyn" <serge@hallyn.com> writes:
> >
> > > Quoting Eric W. Biederman (ebiederm@xmission.com):
> > >> Serge Hallyn <serge@hallyn.com> writes:
> > >>
> > >> > From: "Serge E. Hallyn" <serge.hallyn@canonical.com>
> > >> >
> > >> > Currently uids are compared without regard for the user namespace.
> > >> > Fix that to prevent tasks in a different user namespace from
> > >> > wrongly matching on SCM_CREDENTIALS.
> > >> >
> > >> > In the past, either your uids had to match, or you had to have
> > >> > CAP_SETXID. In a namespaced world, you must either (both be in the
> > >> > same user namespace and have your uids match), or you must have
> > >> > CAP_SETXID targeted at the other user namespace. The latter can
> > >> > happen for instance if uid 500 created a new user namespace and
> > >> > now interacts with uid 0 in it.
> > >>
> > >> Serge this approach is wrong.
> > >
> > > Thanks for looking, Eric.
> > >
> > >> Because we pass the cred and the pid through the socket socket itself
> > >> is just a conduit and should be ignored in this context.
> > >
> > > Ok, that makes sense, but
> > >
> > >> The only interesting test should be are you allowed to impersonate other
> > >> users in your current userk namespace.
> > >
> > > Why in your current user namespace? Shouldn't it be in the
> > > target user ns? I understand it could be wrong to tie the
> > > user ns owning the socket to the target userns (though I still
> > > kind of like it), but just because I have CAP_SETUID in my
> > > own user_ns doesn't mean I should be able to pose as another
> > > uid in your user_ns.
> >
> > First and foremost it is important that you be able if you have the
> > capability to impersonate other users in your current user namespace.
> > That is what the capability actually controls.
> >
> > None of this allows you to impersonate any user in any other user
> > namespace. The translation between users prevents that.
> >
> > > (Now I also see that cred_to_ucred() translates to the current
> > > user_ns, so that should have been a hint to me before about
> > > your intent, but I'm not convinced I agree with your intent).
> > >
> > > And you do the same with the pid. Why is that a valid assumption?
> >
> > Yes. Basically all the code is allow you to impersonate people you
> > would have been able to impersonate before. If your target is in
> > another namespace you can not fool them.
> >
> > With pids the logic should be a lot clearer. Pretend to be a pid you can
> > see in your current pid namespace. Lookup and convert to struct pid aka
> > the namespace agnostic object. On output return the pid value that
>
> No. That conversion is happending before the user-specified pid is
> set.
Never mind, it all gets a little convoluted, but I see how it works,
and - when the time comes - how to do it right for userns. :) Sorry
about that.
thanks,
-serge
^ permalink raw reply
* Re: Flow classifier proto-dst and TOS (and proto-src)
From: Eric Dumazet @ 2011-10-24 4:02 UTC (permalink / raw)
To: Dan Siemon; +Cc: netdev
In-Reply-To: <1319418232.20602.16.camel@ganymede>
Le dimanche 23 octobre 2011 à 21:03 -0400, Dan Siemon a écrit :
> On Mon, 2011-10-17 at 08:09 +0200, Eric Dumazet wrote:
> > Le samedi 15 octobre 2011 à 12:51 -0400, Dan Siemon a écrit :
> > > cls_flow.c: flow_get_proto_dst()
> > >
> > > The proto-dst key returns the destination port for UDP, TCP and a few
> > > other protocols [see proto_ports_offset()]. For ICMP and IPIP it falls
> > > back to:
> > >
> > > return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;
> > >
> > > Since Linux maintains a dst_entry for each TOS value this causes the
> > > returned value to be affected by the TOS which is unexpected and
> > > probably broken.
> >
> > Hi Dan
> >
> > I think Patrick did this on purpose, because of of the lack of
> > perturbation in cls_flow.c : If all these frames were mapped to a single
> > flow, they might interfere with an other regular flow and hurt it.
> >
> > I dont qualify existing code as buggy. Its about fallback behavior
> > anyway (I dont think its even documented)
>
> Thanks for the review Eric.
>
> Won't virtually all uses of proto-dst also use the dst key anyway? In
> which case this fallback does nothing except make the TOS effect the
> hash output because the dst will be the same and dst_entry would be the
> same if it wasn't for the different TOS (by far the common case). I
> don't see the value of the unintuitive behavior.
>
> I'm not certain this is a problem but also note that including TOS will
> mean that packets within a tunnel will be reordered if 'tos inherit' is
> set on the tunnel and only the typical src,dst,proto,proto-src,proto-dst
> is used. Again, probably not expected.
Hmm, it seems cls_flow now supports "perturbation", so my prior argument
is not true anymore. I guess your patch would be fine, but I prefer
waiting Patrick review on it ;)
^ permalink raw reply
* Re: [PATCH net -v2] [BUGFIX] bonding: use flush_delayed_work_sync in bond_close
From: HAYASAKA Mitsuo @ 2011-10-24 4:00 UTC (permalink / raw)
To: Jay Vosburgh
Cc: netdev, Américo Wang, Stephen Hemminger, Andy Gospodarek,
linux-kernel, yrl.pp-manager.tt
In-Reply-To: <14766.1319245142@death>
(2011/10/22 9:59), Jay Vosburgh wrote:
> Jay Vosburgh <fubar@us.ibm.com> wrote:
>
>> Américo Wang <xiyou.wangcong@gmail.com> wrote:
>>
>>> On Thu, Oct 20, 2011 at 3:09 AM, Jay Vosburgh <fubar@us.ibm.com> wrote:
>>>> Stephen Hemminger <shemminger@vyatta.com> wrote:
>>>>
>>>>> On Wed, 19 Oct 2011 11:01:02 -0700
>>>>> Jay Vosburgh <fubar@us.ibm.com> wrote:
>>>>>
>>>>>> Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com> wrote:
>>>>>>
>>>>>>> The bond_close() calls cancel_delayed_work() to cancel delayed works.
>>>>>>> It, however, cannot cancel works that were already queued in workqueue.
>>>>>>> The bond_open() initializes work->data, and proccess_one_work() refers
>>>>>>> get_work_cwq(work)->wq->flags. The get_work_cwq() returns NULL when
>>>>>>> work->data has been initialized. Thus, a panic occurs.
>>>>>>>
>>>>>>> This patch uses flush_delayed_work_sync() instead of cancel_delayed_work()
>>>>>>> in bond_close(). It cancels delayed timer and waits for work to finish
>>>>>>> execution. So, it can avoid the null pointer dereference due to the
>>>>>>> parallel executions of proccess_one_work() and initializing proccess
>>>>>>> of bond_open().
>>>>>>
>>>>>> I'm setting up to test this. I have a dim recollection that we
>>>>>> tried this some years ago, and there was a different deadlock that
>>>>>> manifested through the flush path. Perhaps changes since then have
>>>>>> removed that problem.
>>>>>>
>>>>>> -J
>>>>>
>>>>> Won't this deadlock on RTNL. The problem is that:
>>>>>
>>>>> CPU0 CPU1
>>>>> rtnl_lock
>>>>> bond_close
>>>>> delayed_work
>>>>> mii_work
>>>>> read_lock(bond->lock);
>>>>> read_unlock(bond->lock);
>>>>> rtnl_lock... waiting for CPU0
>>>>> flush_delayed_work_sync
>>>>> waiting for delayed_work to finish...
>>>>
>>>> Yah, that was it. We discussed this a couple of years ago in
>>>> regards to a similar patch:
>>>>
>>>> http://lists.openwall.net/netdev/2009/12/17/3
>>>>
>>>> The short version is that we could rework the rtnl_lock inside
>>>> the montiors to be conditional and retry on failure (where "retry" means
>>>> "reschedule the work and try again later," not "spin retrying on rtnl").
>>>> That should permit the use of flush or cancel to terminate the work
>>>> items.
>>>
>>> Yes? Even if we use rtnl_trylock(), doesn't flush_delayed_work_sync()
>>> still queue the pending delayed work and wait for it to be finished?
>>
>> Yes, it does. The original patch wants to use flush instead of
>> cancel to wait for the work to finish, because there's evidently a
>> possibility of getting back into bond_open before the work item
>> executes, and bond_open would reinitialize the work queue and corrupt
>> the queued work item.
>>
>> The original patch series, and recipe for destruction, is here:
>>
>> http://www.spinics.net/lists/netdev/msg176382.html
>>
>> I've been unable to reproduce the work queue panic locally,
>> although it sounds plausible.
>>
>> Mitsuo: can you provide the precise bonding configuration you're
>> using to induce the problem? Driver options, number and type of slaves,
>> etc.
>>
>>> Maybe I am too blind, why do we need rtnl_lock for cancel_delayed_work()
>>> inside bond_close()?
>>
>> We don't need RTNL for cancel/flush. However, bond_close is an
>> ndo_stop operation, and is called in the dev_close path, which always
>> occurs under RTNL. The mii / arp monitor work functions separately
>> acquire RTNL if they need to perform various failover related
>> operations.
>>
>> I'm working on a patch that should resolve the mii / arp monitor
>> RTNL problem as I described above (if rtnl_trylock fails, punt and
>> reschedule the work). I need to rearrange the netdev_bonding_change
>> stuff a bit as well, since it acquires RTNL separately.
>>
>> Once these changes are made to mii / arp monitor, then
>> bond_close can call flush instead of cancel, which should eliminate the
>> original problem described at the top.
>
> Just an update: there are three functions that may deadlock if
> the cancel work calls are changed to flush_sync. There are two
> rtnl_lock calls in each of the bond_mii_monitor and
> bond_activebackup_arp_mon functions, and one more in the
> bond_alb_monitor.
>
> Still testing to make sure I haven't missed anything, and I
> still haven't been able to reproduce Mitsuo's original failure.
The interval of mii_mon was set to 1 to reproduce this bug easily and
the 802.3ad mode was used. Then, I executed the following command.
# while true; do ifconfig bond0 down; done &
# while true; do ifconfig bond0 up; done &
This bug rarely occurs since it is the severe timing problem.
I found that it is more easily to reproduce this bug when using guest OS.
For example, it took one to three days for me to reproduce it on host OS,
but some hours on guest OS.
Thanks.
>
> -J
>
> ---
> -Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com
>
>
^ permalink raw reply
* Re: [PATCH] cls_flow: Add tunnel support to the flow classifier
From: Eric Dumazet @ 2011-10-24 3:59 UTC (permalink / raw)
To: Dan Siemon; +Cc: netdev
In-Reply-To: <1319426081.2517.20.camel@edumazet-laptop>
Le lundi 24 octobre 2011 à 05:14 +0200, Eric Dumazet a écrit :
> If you prefer, I can do the preliminary work
>
Since it is a bit tricky, I finished it.
It'll be easier for you to use existing functions without copy/paste,
since I added an "nhoff" argument that you can play with (skipping
tunnel header)
Please test it !
Thanks
[PATCH net-next] net_sched: cls_flow: use skb_header_pointer()
Dan Siemon would like to add tunnelling support to cls_flow
This preliminary patch introduces use of skb_header_pointer() to help
this task, while avoiding skb head reallocation because of deep packet
inspection.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
net/sched/cls_flow.c | 188 ++++++++++++++++++++---------------------
1 file changed, 96 insertions(+), 92 deletions(-)
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 6994214..9e087d8 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -65,132 +65,134 @@ static inline u32 addr_fold(void *addr)
return (a & 0xFFFFFFFF) ^ (BITS_PER_LONG > 32 ? a >> 32 : 0);
}
-static u32 flow_get_src(struct sk_buff *skb)
+static u32 flow_get_src(const struct sk_buff *skb, int nhoff)
{
+ __be32 *data = NULL, hdata;
+
switch (skb->protocol) {
case htons(ETH_P_IP):
- if (pskb_network_may_pull(skb, sizeof(struct iphdr)))
- return ntohl(ip_hdr(skb)->saddr);
+ data = skb_header_pointer(skb,
+ nhoff + offsetof(struct iphdr,
+ saddr),
+ 4, &hdata);
break;
case htons(ETH_P_IPV6):
- if (pskb_network_may_pull(skb, sizeof(struct ipv6hdr)))
- return ntohl(ipv6_hdr(skb)->saddr.s6_addr32[3]);
+ data = skb_header_pointer(skb,
+ nhoff + offsetof(struct ipv6hdr,
+ saddr.s6_addr32[3]),
+ 4, &hdata);
break;
}
+ if (data)
+ return ntohl(*data);
return addr_fold(skb->sk);
}
-static u32 flow_get_dst(struct sk_buff *skb)
+static u32 flow_get_dst(const struct sk_buff *skb, int nhoff)
{
+ __be32 *data = NULL, hdata;
+
switch (skb->protocol) {
case htons(ETH_P_IP):
- if (pskb_network_may_pull(skb, sizeof(struct iphdr)))
- return ntohl(ip_hdr(skb)->daddr);
+ data = skb_header_pointer(skb,
+ nhoff + offsetof(struct iphdr,
+ daddr),
+ 4, &hdata);
break;
case htons(ETH_P_IPV6):
- if (pskb_network_may_pull(skb, sizeof(struct ipv6hdr)))
- return ntohl(ipv6_hdr(skb)->daddr.s6_addr32[3]);
+ data = skb_header_pointer(skb,
+ nhoff + offsetof(struct ipv6hdr,
+ daddr.s6_addr32[3]),
+ 4, &hdata);
break;
}
+ if (data)
+ return ntohl(*data);
return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;
}
-static u32 flow_get_proto(struct sk_buff *skb)
+static u32 flow_get_proto(const struct sk_buff *skb, int nhoff)
{
+ __u8 *data = NULL, hdata;
+
switch (skb->protocol) {
case htons(ETH_P_IP):
- return pskb_network_may_pull(skb, sizeof(struct iphdr)) ?
- ip_hdr(skb)->protocol : 0;
+ data = skb_header_pointer(skb,
+ nhoff + offsetof(struct iphdr,
+ protocol),
+ 1, &hdata);
+ break;
case htons(ETH_P_IPV6):
- return pskb_network_may_pull(skb, sizeof(struct ipv6hdr)) ?
- ipv6_hdr(skb)->nexthdr : 0;
- default:
- return 0;
+ data = skb_header_pointer(skb,
+ nhoff + offsetof(struct ipv6hdr,
+ nexthdr),
+ 1, &hdata);
+ break;
}
+ if (data)
+ return *data;
+ return 0;
}
-static u32 flow_get_proto_src(struct sk_buff *skb)
+/* helper function to get either src or dst port */
+static __be16 *flow_get_proto_common(const struct sk_buff *skb, int nhoff,
+ __be16 *_port, int dst)
{
+ __be16 *port = NULL;
+ int poff;
+
switch (skb->protocol) {
case htons(ETH_P_IP): {
- struct iphdr *iph;
- int poff;
+ struct iphdr *iph, _iph;
- if (!pskb_network_may_pull(skb, sizeof(*iph)))
+ iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
+ if (!iph)
break;
- iph = ip_hdr(skb);
if (ip_is_fragment(iph))
break;
poff = proto_ports_offset(iph->protocol);
- if (poff >= 0 &&
- pskb_network_may_pull(skb, iph->ihl * 4 + 2 + poff)) {
- iph = ip_hdr(skb);
- return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 +
- poff));
- }
+ if (poff >= 0)
+ port = skb_header_pointer(skb,
+ nhoff + iph->ihl * 4 + poff + dst,
+ sizeof(*_port), _port);
break;
}
case htons(ETH_P_IPV6): {
- struct ipv6hdr *iph;
- int poff;
+ struct ipv6hdr *iph, _iph;
- if (!pskb_network_may_pull(skb, sizeof(*iph)))
+ iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
+ if (!iph)
break;
- iph = ipv6_hdr(skb);
poff = proto_ports_offset(iph->nexthdr);
- if (poff >= 0 &&
- pskb_network_may_pull(skb, sizeof(*iph) + poff + 2)) {
- iph = ipv6_hdr(skb);
- return ntohs(*(__be16 *)((void *)iph + sizeof(*iph) +
- poff));
- }
+ if (poff >= 0)
+ port = skb_header_pointer(skb,
+ nhoff + sizeof(*iph) + poff + dst,
+ sizeof(*_port), _port);
break;
}
}
- return addr_fold(skb->sk);
+ return port;
}
-static u32 flow_get_proto_dst(struct sk_buff *skb)
+static u32 flow_get_proto_src(const struct sk_buff *skb, int nhoff)
{
- switch (skb->protocol) {
- case htons(ETH_P_IP): {
- struct iphdr *iph;
- int poff;
+ __be16 _port, *port = flow_get_proto_common(skb, nhoff, &_port, 0);
- if (!pskb_network_may_pull(skb, sizeof(*iph)))
- break;
- iph = ip_hdr(skb);
- if (ip_is_fragment(iph))
- break;
- poff = proto_ports_offset(iph->protocol);
- if (poff >= 0 &&
- pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) {
- iph = ip_hdr(skb);
- return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 +
- 2 + poff));
- }
- break;
- }
- case htons(ETH_P_IPV6): {
- struct ipv6hdr *iph;
- int poff;
+ if (port)
+ return ntohs(*port);
- if (!pskb_network_may_pull(skb, sizeof(*iph)))
- break;
- iph = ipv6_hdr(skb);
- poff = proto_ports_offset(iph->nexthdr);
- if (poff >= 0 &&
- pskb_network_may_pull(skb, sizeof(*iph) + poff + 4)) {
- iph = ipv6_hdr(skb);
- return ntohs(*(__be16 *)((void *)iph + sizeof(*iph) +
- poff + 2));
- }
- break;
- }
- }
+ return addr_fold(skb->sk);
+}
+
+static u32 flow_get_proto_dst(const struct sk_buff *skb, int nhoff)
+{
+ __be16 _port, *port = flow_get_proto_common(skb, nhoff, &_port, 2);
+
+ if (port)
+ return ntohs(*port);
return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;
}
@@ -223,7 +225,7 @@ static u32 flow_get_nfct(const struct sk_buff *skb)
#define CTTUPLE(skb, member) \
({ \
enum ip_conntrack_info ctinfo; \
- struct nf_conn *ct = nf_ct_get(skb, &ctinfo); \
+ const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); \
if (ct == NULL) \
goto fallback; \
ct->tuplehash[CTINFO2DIR(ctinfo)].tuple.member; \
@@ -236,7 +238,7 @@ static u32 flow_get_nfct(const struct sk_buff *skb)
})
#endif
-static u32 flow_get_nfct_src(struct sk_buff *skb)
+static u32 flow_get_nfct_src(const struct sk_buff *skb, int nhoff)
{
switch (skb->protocol) {
case htons(ETH_P_IP):
@@ -245,10 +247,10 @@ static u32 flow_get_nfct_src(struct sk_buff *skb)
return ntohl(CTTUPLE(skb, src.u3.ip6[3]));
}
fallback:
- return flow_get_src(skb);
+ return flow_get_src(skb, nhoff);
}
-static u32 flow_get_nfct_dst(struct sk_buff *skb)
+static u32 flow_get_nfct_dst(const struct sk_buff *skb, int nhoff)
{
switch (skb->protocol) {
case htons(ETH_P_IP):
@@ -257,21 +259,21 @@ static u32 flow_get_nfct_dst(struct sk_buff *skb)
return ntohl(CTTUPLE(skb, dst.u3.ip6[3]));
}
fallback:
- return flow_get_dst(skb);
+ return flow_get_dst(skb, nhoff);
}
-static u32 flow_get_nfct_proto_src(struct sk_buff *skb)
+static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, int nhoff)
{
return ntohs(CTTUPLE(skb, src.u.all));
fallback:
- return flow_get_proto_src(skb);
+ return flow_get_proto_src(skb, nhoff);
}
-static u32 flow_get_nfct_proto_dst(struct sk_buff *skb)
+static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, int nhoff)
{
return ntohs(CTTUPLE(skb, dst.u.all));
fallback:
- return flow_get_proto_dst(skb);
+ return flow_get_proto_dst(skb, nhoff);
}
static u32 flow_get_rtclassid(const struct sk_buff *skb)
@@ -313,17 +315,19 @@ static u32 flow_get_rxhash(struct sk_buff *skb)
static u32 flow_key_get(struct sk_buff *skb, int key)
{
+ int nhoff = skb_network_offset(skb);
+
switch (key) {
case FLOW_KEY_SRC:
- return flow_get_src(skb);
+ return flow_get_src(skb, nhoff);
case FLOW_KEY_DST:
- return flow_get_dst(skb);
+ return flow_get_dst(skb, nhoff);
case FLOW_KEY_PROTO:
- return flow_get_proto(skb);
+ return flow_get_proto(skb, nhoff);
case FLOW_KEY_PROTO_SRC:
- return flow_get_proto_src(skb);
+ return flow_get_proto_src(skb, nhoff);
case FLOW_KEY_PROTO_DST:
- return flow_get_proto_dst(skb);
+ return flow_get_proto_dst(skb, nhoff);
case FLOW_KEY_IIF:
return flow_get_iif(skb);
case FLOW_KEY_PRIORITY:
@@ -333,13 +337,13 @@ static u32 flow_key_get(struct sk_buff *skb, int key)
case FLOW_KEY_NFCT:
return flow_get_nfct(skb);
case FLOW_KEY_NFCT_SRC:
- return flow_get_nfct_src(skb);
+ return flow_get_nfct_src(skb, nhoff);
case FLOW_KEY_NFCT_DST:
- return flow_get_nfct_dst(skb);
+ return flow_get_nfct_dst(skb, nhoff);
case FLOW_KEY_NFCT_PROTO_SRC:
- return flow_get_nfct_proto_src(skb);
+ return flow_get_nfct_proto_src(skb, nhoff);
case FLOW_KEY_NFCT_PROTO_DST:
- return flow_get_nfct_proto_dst(skb);
+ return flow_get_nfct_proto_dst(skb, nhoff);
case FLOW_KEY_RTCLASSID:
return flow_get_rtclassid(skb);
case FLOW_KEY_SKUID:
^ permalink raw reply related
* Re: [PATCH net-next] Add ethtool -g support to virtio_net
From: David Miller @ 2011-10-24 3:26 UTC (permalink / raw)
To: rusty; +Cc: raj, netdev, mst, virtualization
In-Reply-To: <8762jk2j2r.fsf@rustcorp.com.au>
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 20 Oct 2011 18:25:24 +1030
> On Wed, 19 Oct 2011 11:10:59 -0700 (PDT), raj@tardy.cup.hp.com (Rick Jones) wrote:
>> From: Rick Jones <rick.jones2@hp.com>
>>
>> Add support for reporting ring sizes via ethtool -g to the virtio_net
>> driver.
>>
>> Signed-off-by: Rick Jones <rick.jones2@hp.com>
>
> Acked-by: Rusty Russell <rusty@rustcorp.com.au>
>
> MST, want me to take this, or do you?
I can also take it directly, just let me know.
^ permalink raw reply
* Re: [RFD] Network configuration data in sysfs
From: David Miller @ 2011-10-24 3:24 UTC (permalink / raw)
To: kirill
Cc: kay.sievers, netdev, kuznet, jmorris, yoshfuji, kaber, gregkh,
gladkov.alexey
In-Reply-To: <20111024013406.GA24158@shutemov.name>
From: "Kirill A. Shutemov" <kirill@shutemov.name>
Date: Mon, 24 Oct 2011 04:34:07 +0300
> On Sun, Oct 23, 2011 at 08:49:43PM -0400, David Miller wrote:
>> From: "Kirill A. Shutemov" <kirill@shutemov.name>
>> Date: Mon, 24 Oct 2011 02:35:58 +0300
>>
>> > Is there something fundamental preventing us to have sysfs interface for
>> > network configuration?
>>
>> Netlink already provides everything sysfs would potentially provide as
>> well as event tracking.
>
> Not everything. You still need special utilities to view/change the
> configuration.
You can use netlink to perform any configuration change you want, or
to view any network configuration setting.
^ permalink raw reply
* Re: [PATCH] cls_flow: Add tunnel support to the flow classifier
From: Eric Dumazet @ 2011-10-24 3:14 UTC (permalink / raw)
To: Dan Siemon; +Cc: netdev
In-Reply-To: <1319419287.20602.21.camel@ganymede>
Le dimanche 23 octobre 2011 à 21:21 -0400, Dan Siemon a écrit :
> Thanks for the review.
>
> Are you arguing this use case isn't worth addressing or that there is a
> more efficient way to implement this with less code?
>
Its worth doing it, but also needs more efficient code ;)
As long as we were reading only bytes in the first 64 bytes of the
frame, existing code was probably fine and efficient.
If we want to add features and features, this is going to ask more bytes
so can trigger expensive skb head reallocs.
> > IPv6 part is also a bit limited : It assumes TCP/UDP headers are the
> > first ones. Maybe its time to use ipv6_skip_exthdr() ?
>
> I noticed this too but the existing src-proto and dst-proto don't handle
> this case either. Maybe I can look into fixing those as well.
>
Yes.
> > Note also that if we pull (with pskb_network_may_pull()) too many bytes,
> > we kill routing performance on paged frags devices, wich are now
> > becoming very common.
>
> I don't know what paged frag devices means but I trust you are correct :)
>
> The existing keys also use pskb_network_may_pull(). Should they be changed as well?
>
A frame delivered by such device has for example 64 bytes present in skb
head, but remaining of data sits in attached fragment(s).
For example :
drivers/net/ethernet/emulex/benet/be.h
/* Number of bytes of an RX frame that are copied to skb->data */
#define BE_HDR_LEN ((u16) 64)
This works well if this fragment stay as is until being delivered to
userland, or forwarded.
Using pskb_network_may_pull() on data present on fragment might force to
reallocate skb head because it was too small, including a copy of struct
skb_shared_info, and all headroom (usually 64 bytes were reserved by
dev_alloc_skb()).
Adding tunnelling code definitely can increase the max offset of
inspected data from the frame beyond 64.
skb_header_pointer() can access to frag data without reallocations.
You can find many use examples in net/sched/cls_u32.c & net/netfilter
If you prefer, I can do the preliminary work
Here is a patch to give a hint :
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 6994214..cda6bf1 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -65,19 +65,27 @@ static inline u32 addr_fold(void *addr)
return (a & 0xFFFFFFFF) ^ (BITS_PER_LONG > 32 ? a >> 32 : 0);
}
-static u32 flow_get_src(struct sk_buff *skb)
+static u32 flow_get_src(const struct sk_buff *skb, int nhoff)
{
+ __be32 *data = NULL, hdata;
+
switch (skb->protocol) {
case htons(ETH_P_IP):
- if (pskb_network_may_pull(skb, sizeof(struct iphdr)))
- return ntohl(ip_hdr(skb)->saddr);
+ data = skb_header_pointer(skb,
+ nhoff + offsetof(struct iphdr,
+ saddr),
+ 4, &hdata);
break;
case htons(ETH_P_IPV6):
- if (pskb_network_may_pull(skb, sizeof(struct ipv6hdr)))
- return ntohl(ipv6_hdr(skb)->saddr.s6_addr32[3]);
+ data = skb_header_pointer(skb,
+ nhoff + offsetof(struct ipv6hdr,
+ saddr.s6_addr32[3]),
+ 4, &hdata);
break;
}
+ if (data)
+ return ntohl(*data);
return addr_fold(skb->sk);
}
@@ -236,7 +244,7 @@ static u32 flow_get_nfct(const struct sk_buff *skb)
})
#endif
-static u32 flow_get_nfct_src(struct sk_buff *skb)
+static u32 flow_get_nfct_src(const struct sk_buff *skb, int nhoff)
{
switch (skb->protocol) {
case htons(ETH_P_IP):
@@ -245,7 +253,7 @@ static u32 flow_get_nfct_src(struct sk_buff *skb)
return ntohl(CTTUPLE(skb, src.u3.ip6[3]));
}
fallback:
- return flow_get_src(skb);
+ return flow_get_src(skb, nhoff);
}
static u32 flow_get_nfct_dst(struct sk_buff *skb)
@@ -313,9 +321,11 @@ static u32 flow_get_rxhash(struct sk_buff *skb)
static u32 flow_key_get(struct sk_buff *skb, int key)
{
+ int nhoff = skb_network_offset(skb);
+
switch (key) {
case FLOW_KEY_SRC:
- return flow_get_src(skb);
+ return flow_get_src(skb, nhoff);
case FLOW_KEY_DST:
return flow_get_dst(skb);
case FLOW_KEY_PROTO:
@@ -333,7 +343,7 @@ static u32 flow_key_get(struct sk_buff *skb, int key)
case FLOW_KEY_NFCT:
return flow_get_nfct(skb);
case FLOW_KEY_NFCT_SRC:
- return flow_get_nfct_src(skb);
+ return flow_get_nfct_src(skb, nhoff);
case FLOW_KEY_NFCT_DST:
return flow_get_nfct_dst(skb);
case FLOW_KEY_NFCT_PROTO_SRC:
^ permalink raw reply related
* Re: [RFD] Network configuration data in sysfs
From: Kirill A. Shutemov @ 2011-10-24 1:34 UTC (permalink / raw)
To: David Miller, kay.sievers
Cc: netdev, kuznet, jmorris, yoshfuji, kaber, gregkh, gladkov.alexey
In-Reply-To: <20111023.204943.1509703901636083438.davem@davemloft.net>
On Sun, Oct 23, 2011 at 08:49:43PM -0400, David Miller wrote:
> From: "Kirill A. Shutemov" <kirill@shutemov.name>
> Date: Mon, 24 Oct 2011 02:35:58 +0300
>
> > Is there something fundamental preventing us to have sysfs interface for
> > network configuration?
>
> Netlink already provides everything sysfs would potentially provide as
> well as event tracking.
Not everything. You still need special utilities to view/change the
configuration.
> udev could just listen to a netlink socket and notice all changes to
> addresses, routes, and device states.
Yes, potentially. I'm not sure if it fits well to udev design.
Kay, any comments?
--
Kirill A. Shutemov
^ permalink raw reply
* Re: [PATCH] cls_flow: Add tunnel support to the flow classifier
From: Dan Siemon @ 2011-10-24 1:21 UTC (permalink / raw)
To: Eric Dumazet; +Cc: netdev
In-Reply-To: <1318833623.2500.45.camel@edumazet-laptop>
[-- Attachment #1: Type: text/plain, Size: 2039 bytes --]
On Mon, 2011-10-17 at 08:40 +0200, Eric Dumazet wrote:
> Le dimanche 16 octobre 2011 à 19:06 -0400, Dan Siemon a écrit :
> > When used on an interface carrying tunneled traffic the flow classifier
> > can't look into the tunnels so all of the traffic within the tunnel is
> > treated as a single flow. This does not allow any type of intelligent
> > queuing to occur. This patch adds new keys to the flow classifier which
> > look inside the tunnel. Presently IP-IP, IP-IPv6, IPv6-IPv6 and IPv6-IP
> > tunnels are supported.
> >
> > If you are interested I have posted some background and experimental
> > results at:
> > http://www.coverfire.com/archives/2011/10/16/making-the-linux-flow-classifier-tunnel-aware/
> >
> > The related iproute2 patch can be found at the above URL as well.
> >
> > Signed-off-by: Dan Siemon <dan@coverfire.com>
> >
>
> Hi Dan
>
> You're adding a lot of code (omitting the diffstat :( ) for a specific
> usage, yet GRE tunnels are not supported.
Thanks for the review.
Are you arguing this use case isn't worth addressing or that there is a
more efficient way to implement this with less code?
> IPv6 part is also a bit limited : It assumes TCP/UDP headers are the
> first ones. Maybe its time to use ipv6_skip_exthdr() ?
I noticed this too but the existing src-proto and dst-proto don't handle
this case either. Maybe I can look into fixing those as well.
> Note also that if we pull (with pskb_network_may_pull()) too many bytes,
> we kill routing performance on paged frags devices, wich are now
> becoming very common.
I don't know what paged frag devices means but I trust you are correct :)
The existing keys also use pskb_network_may_pull(). Should they be changed as well?
> Adding tunnel support and deep packet inspection might require the use
> of skb_header_pointer() wich does the copy of needed data without
> requiring expensive reallocation of skb head.
I'll look into this but it may be a while before I have an updated
patch.
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 836 bytes --]
^ permalink raw reply
* Re: [PATCH net-next] Add ethtool -g support to virtio_net
From: Rusty Russell @ 2011-10-20 7:55 UTC (permalink / raw)
To: Rick Jones, netdev, mst, virtualization
In-Reply-To: <20111019181059.C644A29003F6@tardy>
On Wed, 19 Oct 2011 11:10:59 -0700 (PDT), raj@tardy.cup.hp.com (Rick Jones) wrote:
> From: Rick Jones <rick.jones2@hp.com>
>
> Add support for reporting ring sizes via ethtool -g to the virtio_net
> driver.
>
> Signed-off-by: Rick Jones <rick.jones2@hp.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
MST, want me to take this, or do you?
Cheers,
Rusty.
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox