Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH] net: 8139cp: convert to hw_features
From: Michał Mirosław @ 2011-04-09 10:58 UTC (permalink / raw)
  To: netdev

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
---
 drivers/net/8139cp.c |   46 +++++++++++++++++-----------------------------
 1 files changed, 17 insertions(+), 29 deletions(-)

diff --git a/drivers/net/8139cp.c b/drivers/net/8139cp.c
index dd16e83..10c4505 100644
--- a/drivers/net/8139cp.c
+++ b/drivers/net/8139cp.c
@@ -758,8 +758,7 @@ static netdev_tx_t cp_start_xmit (struct sk_buff *skb,
 
 	entry = cp->tx_head;
 	eor = (entry == (CP_TX_RING_SIZE - 1)) ? RingEnd : 0;
-	if (dev->features & NETIF_F_TSO)
-		mss = skb_shinfo(skb)->gso_size;
+	mss = skb_shinfo(skb)->gso_size;
 
 	if (skb_shinfo(skb)->nr_frags == 0) {
 		struct cp_desc *txd = &cp->tx_ring[entry];
@@ -1416,32 +1415,23 @@ static void cp_set_msglevel(struct net_device *dev, u32 value)
 	cp->msg_enable = value;
 }
 
-static u32 cp_get_rx_csum(struct net_device *dev)
+static int cp_set_features(struct net_device *dev, u32 features)
 {
 	struct cp_private *cp = netdev_priv(dev);
-	return (cpr16(CpCmd) & RxChkSum) ? 1 : 0;
-}
+	unsigned long flags;
 
-static int cp_set_rx_csum(struct net_device *dev, u32 data)
-{
-	struct cp_private *cp = netdev_priv(dev);
-	u16 cmd = cp->cpcmd, newcmd;
+	if (!((dev->features ^ features) & NETIF_F_RXCSUM))
+		return 0;
 
-	newcmd = cmd;
+	spin_lock_irqsave(&cp->lock, flags);
 
-	if (data)
-		newcmd |= RxChkSum;
+	if (features & NETIF_F_RXCSUM)
+		cp->cpcmd |= RxChkSum;
 	else
-		newcmd &= ~RxChkSum;
+		cp->cpcmd &= ~RxChkSum;
 
-	if (newcmd != cmd) {
-		unsigned long flags;
-
-		spin_lock_irqsave(&cp->lock, flags);
-		cp->cpcmd = newcmd;
-		cpw16_f(CpCmd, newcmd);
-		spin_unlock_irqrestore(&cp->lock, flags);
-	}
+	cpw16_f(CpCmd, cp->cpcmd);
+	spin_unlock_irqrestore(&cp->lock, flags);
 
 	return 0;
 }
@@ -1554,11 +1544,6 @@ static const struct ethtool_ops cp_ethtool_ops = {
 	.get_link		= ethtool_op_get_link,
 	.get_msglevel		= cp_get_msglevel,
 	.set_msglevel		= cp_set_msglevel,
-	.get_rx_csum		= cp_get_rx_csum,
-	.set_rx_csum		= cp_set_rx_csum,
-	.set_tx_csum		= ethtool_op_set_tx_csum, /* local! */
-	.set_sg			= ethtool_op_set_sg,
-	.set_tso		= ethtool_op_set_tso,
 	.get_regs		= cp_get_regs,
 	.get_wol		= cp_get_wol,
 	.set_wol		= cp_set_wol,
@@ -1831,6 +1816,7 @@ static const struct net_device_ops cp_netdev_ops = {
 	.ndo_do_ioctl		= cp_ioctl,
 	.ndo_start_xmit		= cp_start_xmit,
 	.ndo_tx_timeout		= cp_tx_timeout,
+	.ndo_set_features	= cp_set_features,
 #if CP_VLAN_TAG_USED
 	.ndo_vlan_rx_register	= cp_vlan_rx_register,
 #endif
@@ -1934,6 +1920,9 @@ static int cp_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 	cp->cpcmd = (pci_using_dac ? PCIDAC : 0) |
 		    PCIMulRW | RxChkSum | CpRxOn | CpTxOn;
 
+	dev->features |= NETIF_F_RXCSUM;
+	dev->hw_features |= NETIF_F_RXCSUM;
+
 	regs = ioremap(pciaddr, CP_REGS_SIZE);
 	if (!regs) {
 		rc = -EIO;
@@ -1966,9 +1955,8 @@ static int cp_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (pci_using_dac)
 		dev->features |= NETIF_F_HIGHDMA;
 
-#if 0 /* disabled by default until verified */
-	dev->features |= NETIF_F_TSO;
-#endif
+	/* disabled by default until verified */
+	dev->hw_features |= NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO;
 
 	dev->irq = pdev->irq;
 
-- 
1.7.2.5


^ permalink raw reply related

* Re: [net-next-2.6 PATCH] v3 ethtool: add ntuple flow specifier data to network flow classifier
From: Ben Hutchings @ 2011-04-09 10:38 UTC (permalink / raw)
  To: Alexander Duyck; +Cc: davem, jeffrey.t.kirsher, netdev
In-Reply-To: <20110409040159.6064.79138.stgit@gitlad.jf.intel.com>

On Fri, 2011-04-08 at 21:01 -0700, Alexander Duyck wrote:
> This change is meant to add ntuple data extensions to the rx network flow
> classification specifiers.  The idea is to allow ntuple to be displayed via
> the network flow classification interface.
> 
> The first patch had some left over stuff from the original flow extension
> flags I had added.  That bit is removed in this patch.
> 
> The second had some left over comments that stated we ignored bits in the
> masks when we actually match them.
> 
> This work is based on input from Ben Hutchings.
> 
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Reviewed-by: Ben Hutchings <bhutchings@solarflare.com>

> ---
> 
>  include/linux/ethtool.h |   53 ++++++++++++++++++++++++++++-------------------
>  net/socket.c            |   14 ++++++------
>  2 files changed, 39 insertions(+), 28 deletions(-)
> 
> diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
> index c04d131..c7eff13 100644
> --- a/include/linux/ethtool.h
> +++ b/include/linux/ethtool.h
> @@ -380,27 +380,42 @@ struct ethtool_usrip4_spec {
>  	__u8    proto;
>  };
>  
> +union ethtool_flow_union {
> +	struct ethtool_tcpip4_spec		tcp_ip4_spec;
> +	struct ethtool_tcpip4_spec		udp_ip4_spec;
> +	struct ethtool_tcpip4_spec		sctp_ip4_spec;
> +	struct ethtool_ah_espip4_spec		ah_ip4_spec;
> +	struct ethtool_ah_espip4_spec		esp_ip4_spec;
> +	struct ethtool_usrip4_spec		usr_ip4_spec;
> +	struct ethhdr				ether_spec;
> +	__u8					hdata[60];
> +};
> +
> +struct ethtool_flow_ext {
> +	__be16	vlan_etype;
> +	__be16	vlan_tci;
> +	__be32	data[2];
> +};
> +
>  /**
>   * struct ethtool_rx_flow_spec - specification for RX flow filter
>   * @flow_type: Type of match to perform, e.g. %TCP_V4_FLOW
>   * @h_u: Flow fields to match (dependent on @flow_type)
> - * @m_u: Masks for flow field bits to be ignored
> + * @h_ext: Additional fields to match
> + * @m_u: Masks for flow field bits to be matched
> + * @m_ext: Masks for additional field bits to be matched
> + *	Note, all additional fields must be ignored unless @flow_type
> + *	includes the %FLOW_EXT flag.
>   * @ring_cookie: RX ring/queue index to deliver to, or %RX_CLS_FLOW_DISC
>   *	if packets should be discarded
>   * @location: Index of filter in hardware table
>   */
>  struct ethtool_rx_flow_spec {
>  	__u32		flow_type;
> -	union {
> -		struct ethtool_tcpip4_spec		tcp_ip4_spec;
> -		struct ethtool_tcpip4_spec		udp_ip4_spec;
> -		struct ethtool_tcpip4_spec		sctp_ip4_spec;
> -		struct ethtool_ah_espip4_spec		ah_ip4_spec;
> -		struct ethtool_ah_espip4_spec		esp_ip4_spec;
> -		struct ethtool_usrip4_spec		usr_ip4_spec;
> -		struct ethhdr				ether_spec;
> -		__u8					hdata[72];
> -	} h_u, m_u;
> +	union ethtool_flow_union h_u;
> +	struct ethtool_flow_ext h_ext;
> +	union ethtool_flow_union m_u;
> +	struct ethtool_flow_ext m_ext;
>  	__u64		ring_cookie;
>  	__u32		location;
>  };
> @@ -458,16 +473,10 @@ struct ethtool_rxnfc {
>  
>  struct compat_ethtool_rx_flow_spec {
>  	u32		flow_type;
> -	union {
> -		struct ethtool_tcpip4_spec		tcp_ip4_spec;
> -		struct ethtool_tcpip4_spec		udp_ip4_spec;
> -		struct ethtool_tcpip4_spec		sctp_ip4_spec;
> -		struct ethtool_ah_espip4_spec		ah_ip4_spec;
> -		struct ethtool_ah_espip4_spec		esp_ip4_spec;
> -		struct ethtool_usrip4_spec		usr_ip4_spec;
> -		struct ethhdr				ether_spec;
> -		u8					hdata[72];
> -	} h_u, m_u;
> +	union ethtool_flow_union h_u;
> +	struct ethtool_flow_ext h_ext;
> +	union ethtool_flow_union m_u;
> +	struct ethtool_flow_ext m_ext;
>  	compat_u64	ring_cookie;
>  	u32		location;
>  };
> @@ -1072,6 +1081,8 @@ struct ethtool_ops {
>  #define	IPV4_FLOW	0x10	/* hash only */
>  #define	IPV6_FLOW	0x11	/* hash only */
>  #define	ETHER_FLOW	0x12	/* spec only (ether_spec) */
> +/* Flag to enable additional fields in struct ethtool_rx_flow_spec */
> +#define	FLOW_EXT	0x80000000
>  
>  /* L3-L4 network traffic flow hash options */
>  #define	RXH_L2DA	(1 << 1)
> diff --git a/net/socket.c b/net/socket.c
> index 5212447..575c84f 100644
> --- a/net/socket.c
> +++ b/net/socket.c
> @@ -2643,13 +2643,13 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32)
>  		return -EFAULT;
>  
>  	if (convert_in) {
> -		/* We expect there to be holes between fs.m_u and
> +		/* We expect there to be holes between fs.m_ext and
>  		 * fs.ring_cookie and at the end of fs, but nowhere else.
>  		 */
> -		BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_u) +
> -			     sizeof(compat_rxnfc->fs.m_u) !=
> -			     offsetof(struct ethtool_rxnfc, fs.m_u) +
> -			     sizeof(rxnfc->fs.m_u));
> +		BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_ext) +
> +			     sizeof(compat_rxnfc->fs.m_ext) !=
> +			     offsetof(struct ethtool_rxnfc, fs.m_ext) +
> +			     sizeof(rxnfc->fs.m_ext));
>  		BUILD_BUG_ON(
>  			offsetof(struct compat_ethtool_rxnfc, fs.location) -
>  			offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) !=
> @@ -2657,7 +2657,7 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32)
>  			offsetof(struct ethtool_rxnfc, fs.ring_cookie));
>  
>  		if (copy_in_user(rxnfc, compat_rxnfc,
> -				 (void *)(&rxnfc->fs.m_u + 1) -
> +				 (void *)(&rxnfc->fs.m_ext + 1) -
>  				 (void *)rxnfc) ||
>  		    copy_in_user(&rxnfc->fs.ring_cookie,
>  				 &compat_rxnfc->fs.ring_cookie,
> @@ -2674,7 +2674,7 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32)
>  
>  	if (convert_out) {
>  		if (copy_in_user(compat_rxnfc, rxnfc,
> -				 (const void *)(&rxnfc->fs.m_u + 1) -
> +				 (const void *)(&rxnfc->fs.m_ext + 1) -
>  				 (const void *)rxnfc) ||
>  		    copy_in_user(&compat_rxnfc->fs.ring_cookie,
>  				 &rxnfc->fs.ring_cookie,
> 

-- 
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* [PATCH] net: ipv4: add IPPROTO_ICMP socket kind
From: Vasiliy Kulikov @ 2011-04-09 10:15 UTC (permalink / raw)
  To: linux-kernel
  Cc: netdev, Pavel Kankovsky, Solar Designer, Kees Cook, Dan Rosenberg,
	Eugene Teo, Nelson Elhage, David S. Miller, Alexey Kuznetsov,
	Pekka Savola, James Morris, Hideaki YOSHIFUJI, Patrick McHardy

This patch adds IPPROTO_ICMP socket kind.  It makes it possible to send
ICMP_ECHO messages and receive the corresponding ICMP_ECHOREPLY messages
without any special privileges.  In other words, the patch makes it
possible to implement setuid-less and CAP_NET_RAW-less /bin/ping.  In
order not to increase the kernel's attack surface (in case of
vulnerabilities in the newly added code), the new functionality is
disabled by default, but can be enabled at boot by Linux distributions
that support it, optionally restricted to a group or a group range
(see below).

Similar functionality is implemented in Mac OS X:
http://www.manpagez.com/man/4/icmp/

A new ping socket is created with

    socket(PF_INET, SOCK_DGRAM, IPPROTO_ICMP)

Message identifiers (octets 4-5 of ICMP header) are interpreted as local
ports. Addresses are stored in struct sockaddr_in. No port numbers are
reserved for privileged processes, port 0 is reserved for API ("let the
kernel pick a free number"). There is no notion of remote ports, remote
port numbers provided by the user (e.g. in connect()) are ignored.

Data sent and received include ICMP headers. This is deliberate to:
1) Avoid the need to transport header values like sequence numbers by
other means.
2) Make it easier to port existing programs using raw sockets.

ICMP headers given to send() are checked and sanitized. The type must be
ICMP_ECHO and the code must be zero (future extensions might relax this,
see below). The id is set to the number (local port) of the socket, the
checksum is always recomputed.

ICMP reply packets received from the network are demultiplexed according
to their id's, and are returned by recv() without any modifications.
IP header information and ICMP errors of those packets may be obtained
via ancillary data (IP_RECVTTL, IP_RETOPTS, and IP_RECVERR). ICMP source
quenches and redirects are reported as fake errors via the error queue
(IP_RECVERR); the next hop address for redirects is saved to ee_info (in
network order).

socket(2) is restricted to the group range specified in
"/proc/sys/net/ipv4/ping_group_range".  It is "1 0" by default; because
the lower bound exceeds the upper bound the range is empty, meaning that
nobody (not even root) may create ping sockets.  Setting it to "100 100"
would grant permission to a single group (GID 100); setting it to
"0 65535" would enable it for the world.

The existing code might be (in the unlikely case anyone needs it)
extended rather easily to handle other similar pairs of ICMP messages
(Timestamp/Reply, Information Request/Reply, Address Mask Request/Reply
etc.).


Userspace ping util & patch for it:
http://openwall.info/wiki/segoon/ping

A revision of this patch (for RHEL5/OpenVZ kernels) is in use in
Owl-current, such as in the 2011/03/12 LiveCD ISOs:
http://mirrors.kernel.org/openwall/Owl/current/iso/

For Openwall GNU/*/Linux it is the last step on the road to a
setuid-less distro.

Initially this functionality was written by Pavel Kankovsky (CC'ed him)
for linux 2.4.32, but unfortunately it was never made public.

Reference to the previous discussion:
http://lwn.net/Articles/420801/


All ping options (-b, -p, -Q, -R, -s, -t, -T, -M, -I) have been tested
with the patch.

Changes since RFCv2:
    - fixed checksumming bug.
    - CAP_NET_RAW may not create icmp sockets anymore.

Changes since RFCv1:
    - minor cleanups.
    - introduced sysctl'able group range to restrict socket(2).

Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
---
 include/net/netns/ipv4.h   |    2 +
 include/net/ping.h         |   69 ++++
 net/ipv4/Kconfig           |   21 +
 net/ipv4/Makefile          |    1 +
 net/ipv4/af_inet.c         |   36 ++
 net/ipv4/icmp.c            |   14 +-
 net/ipv4/ping.c            |  933 ++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/sysctl_net_ipv4.c |   90 +++++
 8 files changed, 1165 insertions(+), 1 deletions(-)
 create mode 100644 include/net/ping.h
 create mode 100644 net/ipv4/ping.c

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index d68c3f1..ff3bb61 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -55,6 +55,8 @@ struct netns_ipv4 {
 	int sysctl_rt_cache_rebuild_count;
 	int current_rt_cache_rebuild_count;
 
+	unsigned int sysctl_ping_group_range[2];
+
 	atomic_t rt_genid;
 
 #ifdef CONFIG_IP_MROUTE
diff --git a/include/net/ping.h b/include/net/ping.h
new file mode 100644
index 0000000..32ad20a
--- /dev/null
+++ b/include/net/ping.h
@@ -0,0 +1,69 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Definitions for the "ping" module.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+#ifndef _PING_H
+#define _PING_H
+
+#include <net/netns/hash.h>
+
+#ifdef CONFIG_IP_PING_DEBUG
+#define ping_debug(fmt, x...) printk(KERN_INFO fmt, ## x)
+#else
+#define ping_debug(fmt, x...) do {} while (0)
+#endif
+
+/* PING_HTABLE_SIZE must be power of 2 */
+#define PING_HTABLE_SIZE 	64
+#define PING_HTABLE_MASK 	(PING_HTABLE_SIZE-1)
+
+#define ping_portaddr_for_each_entry(__sk, node, list) \
+	hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node)
+
+/*
+ * gid_t is either uint or ushort.  We want to pass it to
+ * proc_dointvec_minmax(), so it must not be larger than INT_MAX
+ */
+#define GID_T_MAX (((gid_t)~0U) >> 1)
+
+struct ping_table {
+	struct hlist_nulls_head	hash[PING_HTABLE_SIZE];
+	rwlock_t		lock;
+};
+
+struct ping_iter_state {
+	struct seq_net_private  p;
+	int			bucket;
+};
+
+extern struct proto ping_prot;
+
+
+#ifdef CONFIG_IP_PING
+#define icmp_echoreply ping_rcv
+#else
+#define icmp_echoreply icmp_discard
+#endif
+
+extern void ping_rcv(struct sk_buff *);
+extern void ping_err(struct sk_buff *, u32 info);
+
+extern void inet_get_ping_group_range_net(struct net *net, unsigned int *low, unsigned int *high);
+
+#ifdef CONFIG_PROC_FS
+extern int __init ping_proc_init(void);
+extern void ping_proc_exit(void);
+#endif
+
+void __init ping_init(void);
+
+
+#endif /* _PING_H */
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index a5a1050..cf64f35 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -14,6 +14,27 @@ config IP_MULTICAST
 	  <file:Documentation/networking/multicast.txt>. For most people, it's
 	  safe to say N.
 
+config IP_PING
+	bool "IP: ping socket"
+	depends on EXPERIMENTAL
+	help
+	  This option introduces a new kind of sockets - "ping sockets".
+
+	  A ping socket makes it possible to send ICMP Echo messages and receive
+	  corresponding ICMP Echo Reply messages without any special privileges.
+	  In other words, it makes it possible to implement setuid-less /bin/ping.
+
+	  A new ping socket is created with socket(PF_INET, SOCK_DGRAM, IPPROTO_ICMP).
+
+config IP_PING_DEBUG
+	bool "IP: ping socket debug output"
+	depends on IP_PING
+	default n
+	help
+	  Enable the inclusion of debug code in the ICMP ping sockets.
+	  Be aware that doing this will impact performance.
+	  If unsure say N.
+
 config IP_ADVANCED_ROUTER
 	bool "IP: advanced router"
 	---help---
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 4978d22..3a37479 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -19,6 +19,7 @@ obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
 obj-$(CONFIG_IP_MROUTE) += ipmr.o
+obj-$(CONFIG_IP_PING) += ping.o
 obj-$(CONFIG_NET_IPIP) += ipip.o
 obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
 obj-$(CONFIG_NET_IPGRE) += ip_gre.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 45b89d7..a707d3e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -105,6 +105,7 @@
 #include <net/tcp.h>
 #include <net/udp.h>
 #include <net/udplite.h>
+#include <net/ping.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
 #include <net/raw.h>
@@ -1008,6 +1009,16 @@ static struct inet_protosw inetsw_array[] =
 		.flags =      INET_PROTOSW_PERMANENT,
        },
 
+#ifdef CONFIG_IP_PING
+       {
+		.type =       SOCK_DGRAM,
+		.protocol =   IPPROTO_ICMP,
+		.prot =       &ping_prot,
+		.ops =        &inet_dgram_ops,
+		.no_check =   UDP_CSUM_DEFAULT,
+		.flags =      INET_PROTOSW_REUSE,
+       },
+#endif
 
        {
 	       .type =       SOCK_RAW,
@@ -1528,6 +1539,9 @@ static const struct net_protocol udp_protocol = {
 
 static const struct net_protocol icmp_protocol = {
 	.handler =	icmp_rcv,
+#ifdef CONFIG_IP_PING
+	.err_handler =	ping_err,
+#endif
 	.no_policy =	1,
 	.netns_ok =	1,
 };
@@ -1643,6 +1657,12 @@ static int __init inet_init(void)
 	if (rc)
 		goto out_unregister_udp_proto;
 
+#ifdef CONFIG_IP_PING
+	rc = proto_register(&ping_prot, 1);
+	if (rc)
+		goto out_unregister_raw_proto;
+#endif
+
 	/*
 	 *	Tell SOCKET that we are alive...
 	 */
@@ -1698,6 +1718,10 @@ static int __init inet_init(void)
 	/* Add UDP-Lite (RFC 3828) */
 	udplite4_register();
 
+#ifdef CONFIG_IP_PING
+	ping_init();
+#endif
+
 	/*
 	 *	Set the ICMP layer up
 	 */
@@ -1728,6 +1752,10 @@ static int __init inet_init(void)
 	rc = 0;
 out:
 	return rc;
+#ifdef CONFIG_IP_PING
+out_unregister_raw_proto:
+	proto_unregister(&raw_prot);
+#endif
 out_unregister_udp_proto:
 	proto_unregister(&udp_prot);
 out_unregister_tcp_proto:
@@ -1752,11 +1780,19 @@ static int __init ipv4_proc_init(void)
 		goto out_tcp;
 	if (udp4_proc_init())
 		goto out_udp;
+#ifdef CONFIG_IP_PING
+	if (ping_proc_init())
+		goto out_ping;
+#endif
 	if (ip_misc_proc_init())
 		goto out_misc;
 out:
 	return rc;
 out_misc:
+#ifdef CONFIG_IP_PING
+	ping_proc_exit();
+out_ping:
+#endif
 	udp4_proc_exit();
 out_udp:
 	tcp4_proc_exit();
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 4aa1b7f..7a52374 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -83,6 +83,7 @@
 #include <net/tcp.h>
 #include <net/udp.h>
 #include <net/raw.h>
+#include <net/ping.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
 #include <linux/errno.h>
@@ -798,6 +799,17 @@ static void icmp_redirect(struct sk_buff *skb)
 			       iph->saddr, skb->dev);
 		break;
 	}
+
+#ifdef CONFIG_IP_PING
+	/* Ping wants to see redirects.
+         * Let's pretend they are errors of sorts... */
+	if (iph->protocol == IPPROTO_ICMP &&
+	    iph->ihl >= 5 &&
+	    pskb_may_pull(skb, (iph->ihl<<2)+8)) {
+		ping_err(skb, icmp_hdr(skb)->un.gateway);
+	}
+#endif
+
 out:
 	return;
 out_err:
@@ -1058,7 +1070,7 @@ error:
  */
 static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
 	[ICMP_ECHOREPLY] = {
-		.handler = icmp_discard,
+		.handler = icmp_echoreply,
 	},
 	[1] = {
 		.handler = icmp_discard,
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
new file mode 100644
index 0000000..16a4683
--- /dev/null
+++ b/net/ipv4/ping.c
@@ -0,0 +1,933 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		"Ping" sockets
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Based on ipv4/udp.c code.
+ *
+ * Authors:	Vasiliy Kulikov / Openwall (for Linux 2.6),
+ *		Pavel Kankovsky (for Linux 2.4.32)
+ *
+ * Pavel gave all rights to bugs to Vasiliy,
+ * none of the bugs are Pavel's now.
+ *
+ */
+
+#include <asm/system.h>
+#include <linux/uaccess.h>
+#include <asm/ioctls.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <net/sock.h>
+#include <net/ping.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/route.h>
+#include <net/inet_common.h>
+#include <net/checksum.h>
+
+
+struct ping_table ping_table __read_mostly;
+
+u16 ping_port_rover;
+
+static inline int ping_hashfn(struct net *net, unsigned num, unsigned mask)
+{
+	int res = (num + net_hash_mix(net)) & mask;
+	ping_debug("hash(%d) = %d\n", num, res);
+	return res;
+}
+
+static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table,
+					     struct net *net, unsigned num)
+{
+	return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)];
+}
+
+static int ping_v4_get_port(struct sock *sk, unsigned short ident)
+{
+	struct hlist_nulls_node *node;
+	struct hlist_nulls_head *hlist;
+	struct inet_sock *isk, *isk2;
+	struct sock *sk2 = NULL;
+
+	isk = inet_sk(sk);
+	write_lock_bh(&ping_table.lock);
+	if (ident == 0) {
+		u32 i;
+		u16 result = ping_port_rover + 1;
+
+		for (i = 0; i < (1L << 16); i++, result++) {
+			if (!result)
+				result++; /* avoid zero */
+			hlist = ping_hashslot(&ping_table, sock_net(sk),
+					    result);
+			ping_portaddr_for_each_entry(sk2, node, hlist) {
+				isk2 = inet_sk(sk2);
+
+				if (isk2->inet_num == result)
+					goto next_port;
+			}
+
+			/* found */
+			ping_port_rover = ident = result;
+			break;
+next_port:
+			;
+		}
+		if (i >= (1L << 16))
+			goto fail;
+	} else {
+		hlist = ping_hashslot(&ping_table, sock_net(sk), ident);
+		ping_portaddr_for_each_entry(sk2, node, hlist) {
+			isk2 = inet_sk(sk2);
+
+			if ((isk2->inet_num == ident) &&
+			    (sk2 != sk) &&
+			    (!sk2->sk_reuse || !sk->sk_reuse))
+				goto fail;
+		}
+	}
+
+	ping_debug("found port/ident = %d\n", ident);
+	isk->inet_num = ident;
+	if (sk_unhashed(sk)) {
+		ping_debug("was not hashed\n");
+		sock_hold(sk);
+		hlist_nulls_add_head(&sk->sk_nulls_node, hlist);
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+	}
+	write_unlock_bh(&ping_table.lock);
+	return 0;
+
+fail:
+	write_unlock_bh(&ping_table.lock);
+	return 1;
+}
+
+static void ping_v4_hash(struct sock *sk)
+{
+	ping_debug("ping_v4_hash(sk->port=%u)\n", inet_sk(sk)->inet_num);
+	BUG(); /* "Please do not press this button again." */
+}
+
+static void ping_v4_unhash(struct sock *sk)
+{
+	struct inet_sock *isk = inet_sk(sk);
+	ping_debug("ping_v4_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num);
+	if (sk_hashed(sk)) {
+		struct hlist_nulls_head *hslot;
+
+		hslot = ping_hashslot(&ping_table, sock_net(sk), isk->inet_num);
+		write_lock_bh(&ping_table.lock);
+		hlist_nulls_del(&sk->sk_nulls_node);
+		sock_put(sk);
+		isk->inet_num = isk->inet_sport = 0;
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+		write_unlock_bh(&ping_table.lock);
+	}
+}
+
+struct sock *ping_v4_lookup(struct net *net, u32 saddr, u32 daddr,
+	 u16 ident, int dif)
+{
+	struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident);
+	struct sock *sk = NULL;
+	struct inet_sock *isk;
+	struct hlist_nulls_node *hnode;
+
+	ping_debug("try to find: num = %d, daddr = %ld, dif = %d\n",
+			 (int)ident, (unsigned long)daddr, dif);
+	read_lock_bh(&ping_table.lock);
+
+	ping_portaddr_for_each_entry(sk, hnode, hslot) {
+		isk = inet_sk(sk);
+
+		ping_debug("found: %p: num = %d, daddr = %ld, dif = %d\n", sk,
+			 (int)isk->inet_num, (unsigned long)isk->inet_rcv_saddr,
+			 sk->sk_bound_dev_if);
+
+		ping_debug("iterate\n");
+		if (isk->inet_num != ident)
+			continue;
+		if (isk->inet_rcv_saddr && isk->inet_rcv_saddr != daddr)
+			continue;
+		if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
+			continue;
+
+		sock_hold(sk);
+		goto exit;
+	}
+
+	sk = NULL;
+exit:
+	read_unlock_bh(&ping_table.lock);
+
+	return sk;
+}
+
+static int ping_init_sock(struct sock *sk)
+{
+	struct net *net = sock_net(sk);
+	gid_t group = current_egid();
+	gid_t range[2];
+	struct group_info *group_info = get_current_groups();
+	int i, j, count = group_info->ngroups;
+
+	inet_get_ping_group_range_net(net, range, range+1);
+	if (range[0] <= group && group <= range[1])
+		return 0;
+
+	for (i = 0; i < group_info->nblocks; i++) {
+		int cp_count = min_t(int, NGROUPS_PER_BLOCK, count);
+
+		for (j = 0; j < cp_count; j++) {
+			group = group_info->blocks[i][j];
+			if (range[0] <= group && group <= range[1])
+				return 0;
+		}
+
+		count -= cp_count;
+	}
+
+	return -EACCES;
+}
+
+static void ping_close(struct sock *sk, long timeout)
+{
+	ping_debug("ping_close(sk=%p,sk->num=%u)\n",
+		inet_sk(sk), inet_sk(sk)->inet_num);
+	ping_debug("isk->refcnt = %d\n", sk->sk_refcnt.counter);
+
+	sk_common_release(sk);
+}
+
+/*
+ * We need our own bind because there are no privileged id's == local ports.
+ * Moreover, we don't allow binding to multi- and broadcast addresses.
+ */
+
+static int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
+	struct inet_sock *isk = inet_sk(sk);
+	unsigned short snum;
+	int chk_addr_ret;
+	int err;
+
+	if (addr_len < sizeof(struct sockaddr_in))
+		return -EINVAL;
+
+	ping_debug("ping_v4_bind(sk=%p,sa_addr=%08x,sa_port=%d)\n",
+		sk, addr->sin_addr.s_addr, ntohs(addr->sin_port));
+
+	chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
+	if (addr->sin_addr.s_addr == INADDR_ANY)
+		chk_addr_ret = RTN_LOCAL;
+
+	if ((sysctl_ip_nonlocal_bind == 0 &&
+	    isk->freebind == 0 && isk->transparent == 0 &&
+	     chk_addr_ret != RTN_LOCAL) ||
+	    chk_addr_ret == RTN_MULTICAST ||
+	    chk_addr_ret == RTN_BROADCAST)
+		return -EADDRNOTAVAIL;
+
+	lock_sock(sk);
+
+	err = -EINVAL;
+	if (isk->inet_num != 0)
+		goto out;
+
+	err = -EADDRINUSE;
+	isk->inet_rcv_saddr = isk->inet_saddr = addr->sin_addr.s_addr;
+	snum = ntohs(addr->sin_port);
+	if (ping_v4_get_port(sk, snum) != 0) {
+		isk->inet_saddr = isk->inet_rcv_saddr = 0;
+		goto out;
+	}
+
+	ping_debug("after bind(): num = %d, daddr = %ld, dif = %d\n",
+		(int)isk->inet_num,
+		(unsigned long) isk->inet_rcv_saddr,
+		(int)sk->sk_bound_dev_if);
+
+	err = 0;
+	if (isk->inet_rcv_saddr)
+		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+	if (snum)
+		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
+	isk->inet_sport = htons(isk->inet_num);
+	isk->inet_daddr = 0;
+	isk->inet_dport = 0;
+	sk_dst_reset(sk);
+out:
+	release_sock(sk);
+	ping_debug("ping_v4_bind -> %d\n", err);
+	return err;
+}
+
+/*
+ * Is this a supported type of ICMP message?
+ */
+
+static inline int ping_supported(int type, int code)
+{
+	if (type == ICMP_ECHO && code == 0)
+		return 1;
+	return 0;
+}
+
+/*
+ * This routine is called by the ICMP module when it gets some
+ * sort of error condition.
+ */
+
+static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+
+void ping_err(struct sk_buff *skb, u32 info)
+{
+	struct iphdr *iph = (struct iphdr *)skb->data;
+	struct icmphdr *icmph = (struct icmphdr *)(skb->data+(iph->ihl<<2));
+	struct inet_sock *inet_sock;
+	int type = icmph->type;
+	int code = icmph->code;
+	struct net *net = dev_net(skb->dev);
+	struct sock *sk;
+	int harderr;
+	int err;
+
+	/* We assume the packet has already been checked by icmp_unreach */
+
+	if (!ping_supported(icmph->type, icmph->code))
+		return;
+
+	ping_debug("ping_err(type=%04x,code=%04x,id=%04x,seq=%04x)\n", type,
+		code, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence));
+
+	sk = ping_v4_lookup(net, iph->daddr, iph->saddr,
+			    ntohs(icmph->un.echo.id), skb->dev->ifindex);
+	if (sk == NULL) {
+		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+		ping_debug("no socket, dropping\n");
+		return;	/* No socket for error */
+	}
+	ping_debug("err on socket %p\n", sk);
+
+	err = 0;
+	harderr = 0;
+	inet_sock = inet_sk(sk);
+
+	switch (type) {
+	default:
+	case ICMP_TIME_EXCEEDED:
+		err = EHOSTUNREACH;
+		break;
+	case ICMP_SOURCE_QUENCH:
+		/* This is not a real error but ping wants to see it.
+		 * Report it with some fake errno. */
+		err = EREMOTEIO;
+		break;
+	case ICMP_PARAMETERPROB:
+		err = EPROTO;
+		harderr = 1;
+		break;
+	case ICMP_DEST_UNREACH:
+		if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
+			if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) {
+				err = EMSGSIZE;
+				harderr = 1;
+				break;
+			}
+			goto out;
+		}
+		err = EHOSTUNREACH;
+		if (code <= NR_ICMP_UNREACH) {
+			harderr = icmp_err_convert[code].fatal;
+			err = icmp_err_convert[code].errno;
+		}
+		break;
+	case ICMP_REDIRECT:
+		/* See ICMP_SOURCE_QUENCH */
+		err = EREMOTEIO;
+		break;
+	}
+
+	/*
+	 *      RFC1122: OK.  Passes ICMP errors back to application, as per
+	 *	4.1.3.3.
+	 */
+	if (!inet_sock->recverr) {
+		if (!harderr || sk->sk_state != TCP_ESTABLISHED)
+			goto out;
+	} else {
+		ip_icmp_error(sk, skb, err, 0 /* no remote port */,
+			 info, (u8 *)icmph);
+	}
+	sk->sk_err = err;
+	sk->sk_error_report(sk);
+out:
+	sock_put(sk);
+}
+
+/*
+ *	Copy and checksum an ICMP Echo packet from user space into a buffer.
+ */
+
+struct pingfakehdr {
+	struct icmphdr icmph;
+	struct iovec *iov;
+	u32 wcheck;
+};
+
+static int ping_getfrag(void *from, char * to,
+			int offset, int fraglen, int odd, struct sk_buff *skb)
+{
+	struct pingfakehdr *pfh = (struct pingfakehdr *)from;
+
+	if (offset == 0) {
+		if (fraglen < sizeof(struct icmphdr))
+			BUG();
+		if (csum_partial_copy_fromiovecend(to + sizeof(struct icmphdr),
+			    pfh->iov, 0, fraglen - sizeof(struct icmphdr),
+			    &pfh->wcheck))
+			return -EFAULT;
+
+		return 0;
+	}
+	if (offset < sizeof(struct icmphdr))
+		BUG();
+	if (csum_partial_copy_fromiovecend
+			(to, pfh->iov, offset - sizeof(struct icmphdr),
+			 fraglen, &pfh->wcheck))
+		return -EFAULT;
+	return 0;
+}
+
+static int ping_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh)
+{
+	struct sk_buff *skb = skb_peek(&sk->sk_write_queue);
+
+	pfh->wcheck = csum_partial((char *)&pfh->icmph,
+		sizeof(struct icmphdr), pfh->wcheck);
+	pfh->icmph.checksum = csum_fold(pfh->wcheck);
+	memcpy(icmp_hdr(skb), &pfh->icmph, sizeof(struct icmphdr));
+	skb->ip_summed = CHECKSUM_NONE;
+	return ip_push_pending_frames(sk);
+}
+
+int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		 size_t len)
+{
+	struct inet_sock *isk = inet_sk(sk);
+	struct ipcm_cookie ipc;
+	struct icmphdr user_icmph;
+	struct pingfakehdr pfh;
+	struct rtable *rt = NULL;
+	int free = 0;
+	u32 saddr, daddr;
+	u8  tos;
+	int err;
+
+	ping_debug("ping_sendmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num);
+
+
+	if (len > 0xFFFF)
+		return -EMSGSIZE;
+
+	/*
+	 *	Check the flags.
+	 */
+
+	/* Mirror BSD error message compatibility */
+	if (msg->msg_flags & MSG_OOB)
+		return -EOPNOTSUPP;
+
+	/*
+	 *	Fetch the ICMP header provided by the userland.
+	 *	iovec is modified!
+	 */
+
+	if (memcpy_fromiovec((u8 *)&user_icmph, msg->msg_iov,
+			     sizeof(struct icmphdr)))
+		return -EFAULT;
+	if (!ping_supported(user_icmph.type, user_icmph.code))
+		return -EINVAL;
+
+	/*
+	 *	Get and verify the address.
+	 */
+
+	if (msg->msg_name) {
+		struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
+		if (msg->msg_namelen < sizeof(*usin))
+			return -EINVAL;
+		if (usin->sin_family != AF_INET)
+			return -EINVAL;
+		daddr = usin->sin_addr.s_addr;
+		/* no remote port */
+	} else {
+		if (sk->sk_state != TCP_ESTABLISHED)
+			return -EDESTADDRREQ;
+		daddr = isk->inet_daddr;
+		/* no remote port */
+	}
+
+	ipc.addr = isk->inet_saddr;
+	ipc.opt = NULL;
+	ipc.oif = sk->sk_bound_dev_if;
+
+	if (msg->msg_controllen) {
+		err = ip_cmsg_send(sock_net(sk), msg, &ipc);
+		if (err)
+			return err;
+		if (ipc.opt)
+			free = 1;
+	}
+	if (!ipc.opt)
+		ipc.opt = isk->opt;
+
+	saddr = ipc.addr;
+	ipc.addr = daddr;
+
+	if (ipc.opt && ipc.opt->srr) {
+		if (!daddr)
+			return -EINVAL;
+		daddr = ipc.opt->faddr;
+	}
+	tos = RT_TOS(isk->tos);
+	if (sock_flag(sk, SOCK_LOCALROUTE) ||
+	    (msg->msg_flags&MSG_DONTROUTE) ||
+	    (ipc.opt && ipc.opt->is_strictroute)) {
+		tos |= RTO_ONLINK;
+	}
+
+	if (ipv4_is_multicast(daddr)) {
+		if (!ipc.oif)
+			ipc.oif = isk->mc_index;
+		if (!saddr)
+			saddr = isk->mc_addr;
+	}
+
+	{
+		struct flowi fl = { .oif = ipc.oif,
+				    .mark = sk->sk_mark,
+				    .nl_u = { .ip4_u = {
+						.daddr = daddr,
+						.saddr = saddr,
+						.tos = tos } },
+				    .proto = IPPROTO_ICMP,
+				    .flags = inet_sk_flowi_flags(sk),
+		};
+
+		struct net *net = sock_net(sk);
+
+		security_sk_classify_flow(sk, &fl);
+		err = ip_route_output_flow(net, &rt, &fl, sk, 1);
+		if (err) {
+			if (err == -ENETUNREACH)
+				IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+			goto out;
+		}
+
+		err = -EACCES;
+		if ((rt->rt_flags & RTCF_BROADCAST) &&
+		    !sock_flag(sk, SOCK_BROADCAST))
+			goto out;
+	}
+
+	if (msg->msg_flags & MSG_CONFIRM)
+		goto do_confirm;
+back_from_confirm:
+
+	if (!ipc.addr)
+		ipc.addr = rt->rt_dst;
+
+	lock_sock(sk);
+
+	pfh.icmph.type = user_icmph.type; /* already checked */
+	pfh.icmph.code = user_icmph.code; /* ditto */
+	pfh.icmph.checksum = 0;
+	pfh.icmph.un.echo.id = isk->inet_sport;
+	pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence;
+	pfh.iov = msg->msg_iov;
+	pfh.wcheck = 0;
+
+	err = ip_append_data(sk, ping_getfrag, &pfh, len,
+			0, &ipc, &rt,
+			msg->msg_flags);
+	if (err)
+		ip_flush_pending_frames(sk);
+	else
+		err = ping_push_pending_frames(sk, &pfh);
+	release_sock(sk);
+
+out:
+	ip_rt_put(rt);
+	if (free)
+		kfree(ipc.opt);
+	if (!err) {
+		icmp_out_count(sock_net(sk), user_icmph.type);
+		return len;
+	}
+	return err;
+
+do_confirm:
+	dst_confirm(&rt->dst);
+	if (!(msg->msg_flags & MSG_PROBE) || len)
+		goto back_from_confirm;
+	err = 0;
+	goto out;
+}
+
+/*
+ *	IOCTL requests applicable to the UDP^H^H^HICMP protocol
+ */
+
+int ping_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+	ping_debug("ping_ioctl(sk=%p,sk->num=%u,cmd=%d,arg=%lu)\n",
+		inet_sk(sk), inet_sk(sk)->inet_num, cmd, arg);
+	switch (cmd) {
+	case SIOCOUTQ:
+	case SIOCINQ:
+		return udp_ioctl(sk, cmd, arg);
+	default:
+		return -ENOIOCTLCMD;
+	}
+}
+
+int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		 size_t len, int noblock, int flags, int *addr_len)
+{
+	struct inet_sock *isk = inet_sk(sk);
+	struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
+	struct sk_buff *skb;
+	int copied, err = -EOPNOTSUPP;	/* returned by the MSG_OOB bail-out */
+
+	ping_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num);
+
+	if (flags & MSG_OOB)
+		goto out;
+
+	if (addr_len)
+		*addr_len = sizeof(*sin);
+
+	if (flags & MSG_ERRQUEUE)
+		return ip_recv_error(sk, msg, len);
+
+	skb = skb_recv_datagram(sk, flags, noblock, &err);
+	if (!skb)
+		goto out;
+
+	copied = skb->len;
+	if (copied > len) {
+		msg->msg_flags |= MSG_TRUNC;
+		copied = len;
+	}
+
+	/* Don't bother checking the checksum */
+	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	if (err)
+		goto done;
+
+	sock_recv_timestamp(msg, sk, skb);
+
+	/* Copy the address. */
+	if (sin) {
+		sin->sin_family = AF_INET;
+		sin->sin_port = 0 /* skb->h.uh->source */;
+		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+		memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+	}
+	if (isk->cmsg_flags)
+		ip_cmsg_recv(msg, skb);
+	err = copied;
+
+done:
+	skb_free_datagram(sk, skb);
+out:
+	ping_debug("ping_recvmsg -> %d\n", err);
+	return err;
+}
+
+static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	ping_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n",
+		inet_sk(sk), inet_sk(sk)->inet_num, skb);
+	if (sock_queue_rcv_skb(sk, skb) < 0) {
+		ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_INERRORS);
+		kfree_skb(skb);
+		ping_debug("ping_queue_rcv_skb -> failed\n");
+		return -1;
+	}
+	return 0;
+}
+
+
+/*
+ *	All we need to do is get the socket.
+ */
+
+void ping_rcv(struct sk_buff *skb)
+{
+	struct sock *sk;
+	struct net *net = dev_net(skb->dev);
+	struct iphdr *iph = ip_hdr(skb);
+	struct icmphdr *icmph = icmp_hdr(skb);
+	u32 saddr = iph->saddr;
+	u32 daddr = iph->daddr;
+
+	/* We assume the packet has already been checked by icmp_rcv */
+
+	ping_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n",
+		skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence));
+
+	/* Push ICMP header back */
+	skb_push(skb, skb->data - (u8 *)icmph);
+
+	sk = ping_v4_lookup(net, saddr, daddr, ntohs(icmph->un.echo.id),
+			    skb->dev->ifindex);
+	if (sk != NULL) {
+		ping_debug("rcv on socket %p\n", sk);
+		ping_queue_rcv_skb(sk, skb_get(skb));
+		sock_put(sk);
+		return;
+	}
+	ping_debug("no socket, dropping\n");
+
+	/* We're called from icmp_rcv(). kfree_skb() is done there. */
+}
+
+struct proto ping_prot = {
+	.name =		"PING",
+	.owner =	THIS_MODULE,
+	.init =		ping_init_sock,
+	.close =	ping_close,
+	.connect =	ip4_datagram_connect,
+	.disconnect =	udp_disconnect,
+	.ioctl =	ping_ioctl,
+	.setsockopt =	ip_setsockopt,
+	.getsockopt =	ip_getsockopt,
+	.sendmsg =	ping_sendmsg,
+	.recvmsg =	ping_recvmsg,
+	.bind =		ping_bind,
+	.backlog_rcv =	ping_queue_rcv_skb,
+	.hash =		ping_v4_hash,
+	.unhash =	ping_v4_unhash,
+	.get_port =	ping_v4_get_port,
+	.obj_size =	sizeof(struct inet_sock),
+};
+EXPORT_SYMBOL(ping_prot);
+
+#ifdef CONFIG_PROC_FS
+
+static struct sock *ping_get_first(struct seq_file *seq, int start)
+{
+	struct sock *sk;
+	struct ping_iter_state *state = seq->private;
+	struct net *net = seq_file_net(seq);
+
+	for (state->bucket = start; state->bucket < PING_HTABLE_SIZE;
+	     ++state->bucket) {
+		struct hlist_nulls_node *node;
+		struct hlist_nulls_head *hslot = &ping_table.hash[state->bucket];
+
+		if (hlist_nulls_empty(hslot))
+			continue;
+
+		sk_nulls_for_each(sk, node, hslot) {
+			if (net_eq(sock_net(sk), net))
+				goto found;
+		}
+	}
+	sk = NULL;
+found:
+	return sk;
+}
+
+static struct sock *ping_get_next(struct seq_file *seq, struct sock *sk)
+{
+	struct ping_iter_state *state = seq->private;
+	struct net *net = seq_file_net(seq);
+
+	do {
+		sk = sk_nulls_next(sk);
+	} while (sk && (!net_eq(sock_net(sk), net)));
+
+	if (!sk)
+		return ping_get_first(seq, state->bucket + 1);
+	return sk;
+}
+
+static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct sock *sk = ping_get_first(seq, 0);
+
+	if (sk)
+		while (pos && (sk = ping_get_next(seq, sk)) != NULL)
+			--pos;
+	return pos ? NULL : sk;
+}
+
+static void *ping_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct ping_iter_state *state = seq->private;
+	state->bucket = 0;
+
+	read_lock_bh(&ping_table.lock);
+
+	return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
+}
+
+static void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct sock *sk;
+
+	if (v == SEQ_START_TOKEN)
+		sk = ping_get_idx(seq, 0);
+	else
+		sk = ping_get_next(seq, v);
+
+	++*pos;
+	return sk;
+}
+
+static void ping_seq_stop(struct seq_file *seq, void *v)
+{
+	read_unlock_bh(&ping_table.lock);
+}
+
+static void ping_format_sock(struct sock *sp, struct seq_file *f,
+		int bucket, int *len)
+{
+	struct inet_sock *inet = inet_sk(sp);
+	__be32 dest = inet->inet_daddr;
+	__be32 src = inet->inet_rcv_saddr;
+	__u16 destp = ntohs(inet->inet_dport);
+	__u16 srcp = ntohs(inet->inet_sport);
+
+	seq_printf(f, "%5d: %08X:%04X %08X:%04X"
+		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n",
+		bucket, src, srcp, dest, destp, sp->sk_state,
+		sk_wmem_alloc_get(sp),
+		sk_rmem_alloc_get(sp),
+		0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
+		atomic_read(&sp->sk_refcnt), sp,
+		atomic_read(&sp->sk_drops), len);
+}
+
+static int ping_seq_show(struct seq_file *seq, void *v)
+{
+	if (v == SEQ_START_TOKEN)
+		seq_printf(seq, "%-127s\n",
+			   "  sl  local_address rem_address   st tx_queue "
+			   "rx_queue tr tm->when retrnsmt   uid  timeout "
+			   "inode ref pointer drops");
+	else {
+		struct ping_iter_state *state = seq->private;
+		int len;
+
+		ping_format_sock(v, seq, state->bucket, &len);
+		seq_printf(seq, "%*s\n", 127 - len, "");
+	}
+	return 0;
+}
+
+static const struct seq_operations ping_seq_ops = {
+	.show		= ping_seq_show,
+	.start		= ping_seq_start,
+	.next		= ping_seq_next,
+	.stop		= ping_seq_stop,
+};
+
+static int ping_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &ping_seq_ops,
+			   sizeof(struct ping_iter_state));
+}
+
+static const struct file_operations ping_seq_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ping_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_net,
+};
+
+static const char ping_proc_name[] = "icmp";
+
+static int ping_proc_register(struct net *net)
+{
+	struct proc_dir_entry *p;
+	int rc = 0;
+
+	p = proc_create_data(ping_proc_name, S_IRUGO, net->proc_net,
+			     &ping_seq_fops, NULL);
+	if (!p)
+		rc = -ENOMEM;
+	return rc;
+}
+
+static void ping_proc_unregister(struct net *net)
+{
+	proc_net_remove(net, ping_proc_name);
+}
+
+
+static int __net_init ping_proc_init_net(struct net *net)
+{
+	return ping_proc_register(net);
+}
+
+static void __net_exit ping_proc_exit_net(struct net *net)
+{
+	ping_proc_unregister(net);
+}
+
+static struct pernet_operations ping_net_ops = {
+	.init = ping_proc_init_net,
+	.exit = ping_proc_exit_net,
+};
+
+int __init ping_proc_init(void)
+{
+	return register_pernet_subsys(&ping_net_ops);
+}
+
+void ping_proc_exit(void)
+{
+	unregister_pernet_subsys(&ping_net_ops);
+}
+
+#endif
+
+void __init ping_init(void)
+{
+	int i;
+
+	for (i = 0; i < PING_HTABLE_SIZE; i++)
+		INIT_HLIST_NULLS_HEAD(&ping_table.hash[i], i);
+	rwlock_init(&ping_table.lock);
+}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 1a45665..9b406d7 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -13,6 +13,7 @@
 #include <linux/seqlock.h>
 #include <linux/init.h>
 #include <linux/slab.h>
+#include <linux/nsproxy.h>
 #include <net/snmp.h>
 #include <net/icmp.h>
 #include <net/ip.h>
@@ -21,6 +22,7 @@
 #include <net/udp.h>
 #include <net/cipso_ipv4.h>
 #include <net/inet_frag.h>
+#include <net/ping.h>
 
 static int zero;
 static int tcp_retr1_max = 255;
@@ -30,6 +32,10 @@ static int tcp_adv_win_scale_min = -31;
 static int tcp_adv_win_scale_max = 31;
 static int ip_ttl_min = 1;
 static int ip_ttl_max = 255;
+#ifdef CONFIG_IP_PING
+static int ip_ping_group_range_min[] = { 0, 0 };
+static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
+#endif
 
 /* Update system visible IP port range */
 static void set_local_port_range(int range[2])
@@ -68,6 +74,67 @@ static int ipv4_local_port_range(ctl_table *table, int write,
 	return ret;
 }
 
+#ifdef CONFIG_IP_PING
+
+void inet_get_ping_group_range_net(struct net *net, gid_t *low, gid_t *high)
+{
+	gid_t *data = net->ipv4.sysctl_ping_group_range;
+	unsigned seq;
+	do {
+		seq = read_seqbegin(&sysctl_local_ports.lock);
+
+		*low = data[0];
+		*high = data[1];
+	} while (read_seqretry(&sysctl_local_ports.lock, seq));
+}
+
+void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high)
+{
+	gid_t *data = table->data;
+	unsigned seq;
+	do {
+		seq = read_seqbegin(&sysctl_local_ports.lock);
+
+		*low = data[0];
+		*high = data[1];
+	} while (read_seqretry(&sysctl_local_ports.lock, seq));
+}
+
+/* Update system visible ping group range */
+static void set_ping_group_range(struct ctl_table *table, int range[2])
+{
+	gid_t *data = table->data;
+	write_seqlock(&sysctl_local_ports.lock);
+	data[0] = range[0];
+	data[1] = range[1];
+	write_sequnlock(&sysctl_local_ports.lock);
+}
+
+/* Validate changes from /proc interface. */
+static int ipv4_ping_group_range(ctl_table *table, int write,
+				 void __user *buffer,
+				 size_t *lenp, loff_t *ppos)
+{
+	int ret;
+	gid_t range[2];
+	ctl_table tmp = {
+		.data = &range,
+		.maxlen = sizeof(range),
+		.mode = table->mode,
+		.extra1 = &ip_ping_group_range_min,
+		.extra2 = &ip_ping_group_range_max,
+	};
+
+	inet_get_ping_group_range_table(table, range, range + 1);
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+	if (write && ret == 0)
+		set_ping_group_range(table, range);
+
+	return ret;
+}
+#endif
+
 static int proc_tcp_congestion_control(ctl_table *ctl, int write,
 				       void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -680,6 +747,15 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+#ifdef CONFIG_IP_PING
+	{
+		.procname	= "ping_group_range",
+		.data		= &init_net.ipv4.sysctl_ping_group_range,
+		.maxlen		= sizeof(init_net.ipv4.sysctl_ping_group_range),
+		.mode		= 0644,
+		.proc_handler	= ipv4_ping_group_range,
+	},
+#endif
 	{ }
 };
 
@@ -714,8 +790,22 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 			&net->ipv4.sysctl_icmp_ratemask;
 		table[6].data =
 			&net->ipv4.sysctl_rt_cache_rebuild_count;
+#ifdef CONFIG_IP_PING
+		table[7].data =
+			&net->ipv4.sysctl_ping_group_range;
+#endif
+
 	}
 
+#ifdef CONFIG_IP_PING
+	/*
+	 * Sane defaults - nobody may create ping sockets.
+	 * Boot scripts should set this to distro-specific group.
+	 */
+	net->ipv4.sysctl_ping_group_range[0] = 1;
+	net->ipv4.sysctl_ping_group_range[1] = 0;
+#endif
+
 	net->ipv4.sysctl_rt_cache_rebuild_count = 4;
 
 	net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
-- 
1.7.0.4

^ permalink raw reply related

* Re: Kernel panic when using bridge
From: Hiroaki SHIMODA @ 2011-04-09  7:19 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Scot Doyle, netdev
In-Reply-To: <4D9FE5BE.6060600@scotdoyle.com>

On Fri, 08 Apr 2011 23:51:10 -0500
Scot Doyle <lkml@scotdoyle.com> wrote:

> On 04/08/2011 02:17 PM, Stephen Hemminger wrote:
> > Please reproduce with exactly 2.6.39-rc2 there were some bug fixes
> > to make sure that header was initialized.
> 
> Hi Stephen, here's another panic with 2.6.39-rc2 (git commit 
> bb3c90f0de7b34995b5e35cf5dc97a3d428b3761) using default kernel config 
> options.
> 
> # sysctl -a | grep bridge
> net.bridge.bridge-nf-call-arptables = 1
> net.bridge.bridge-nf-call-iptables = 1
> net.bridge.bridge-nf-call-ip6tables = 1
> net.bridge.bridge-nf-filter-vlan-tagged = 0
> net.bridge.bridge-nf-filter-pppoe-tagged = 0
> 
> # /etc/network/interfaces
> auto lo
> iface lo inet loopback
> auto br0
> iface br0 inet static
>      address x.y.z.237
>      netmask 255.255.255.224
>      gateway x.y.z.225
>      bridge_ports    eth3
>      bridge_stp    off
>      bridge_maxwait    0
>      bridge_fd    0
> auto br0:1
> iface br0:1 inet static
>      address 10.0.0.1
>      netmask 255.255.255.0
> auto br0:2
> iface br0:2 inet static
>      address 10.0.1.1
>      netmask 255.255.255.0
> 
> ------
> 
> [ 1691.681069] BUG: unable to handle kernel NULL pointer dereference at 
> 00000000000000cc
> [ 1691.688879] IP: [<ffffffff8129fb8d>] ip_options_compile+0x1c1/0x435
> [ 1691.695126] PGD 0
> [ 1691.697131] Oops: 0000 [#1] SMP
> [ 1691.700357] last sysfs file: /sys/devices/virtual/misc/kvm/uevent
> [ 1691.706418] CPU 0
> [ 1691.708241] Modules linked in: kvm_intel kvm bridge stp loop snd_pcm 
> snd_timer snd soundcore snd_page_alloc tpm_tis i7core_edac psmouse ghes 
> tpm evdev edac_core pcspkr serio_raw processor tpm_bios button dcdbas 
> thermal_sys hed power_meter ext2 mbcache dm_mod raid1 md_mod sd_mod 
> crc_t10dif usb_storage uas uhci_hcd mpt2sas scsi_transport_sas 
> raid_class ehci_hcd igb scsi_mod usbcore dca bnx2 [last unloaded: 
> scsi_wait_scan]
> [ 1691.745849]
> [ 1691.747330] Pid: 0, comm: swapper Not tainted 2.6.39-rc2+ #3 Dell 
> Inc. PowerEdge R510/0DPRKF
> [ 1691.755752] RIP: 0010:[<ffffffff8129fb8d>]  [<ffffffff8129fb8d>] 
> ip_options_compile+0x1c1/0x435
> [ 1691.764418] RSP: 0018:ffff88042f203af0  EFLAGS: 00010286
> [ 1691.769702] RAX: 0000000000000024 RBX: ffff88041c9fa900 RCX: 
> ffff880403466865
> [ 1691.776800] RDX: 0000000000000027 RSI: 0000000000000000 RDI: 
> ffffffff817e6100
> [ 1691.783899] RBP: ffff880403466863 R08: ffffffffa01ade89 R09: 
> ffff88042f203c58
> [ 1691.790997] R10: ffffe1c4ff103b40 R11: 0000000000000004 R12: 
> ffff88041c9fa928
> [ 1691.798095] R13: 0000000000000027 R14: ffff88040346684e R15: 
> 0000000000000027
> [ 1691.805194] FS:  0000000000000000(0000) GS:ffff88042f200000(0000) 
> knlGS:0000000000000000
> [ 1691.813245] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> [ 1691.818960] CR2: 00000000000000cc CR3: 0000000001603000 CR4: 
> 00000000000006f0
> [ 1691.826058] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
> 0000000000000000
> [ 1691.833156] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 
> 0000000000000400
> [ 1691.840254] Process swapper (pid: 0, threadinfo ffffffff81600000, 
> task ffffffff8160b020)
> [ 1691.848303] Stack:
> [ 1691.850300]  ffff88042ec02900 ffff8804051ac740 0000000000000000 
> ffffffff817e6100
> [ 1691.857693]  0000000000000282 ffffffff810ec848 0000000000000282 
> ffff88041c9fa928
> [ 1691.865085]  ffff88041c9fa900 ffff8804038e8000 ffff88040346684e 
> ffff8804038e8000
> [ 1691.872480] Call Trace:
> [ 1691.874910] <IRQ>
> [ 1691.877005]  [<ffffffff810ec848>] ? __slab_free+0x80/0x14a
> [ 1691.882465]  [<ffffffffa01b1e3a>] ? br_parse_ip_options+0x133/0x1a0 
> [bridge]
> [ 1691.889480]  [<ffffffffa01b2bd8>] ? br_nf_pre_routing+0x348/0x3cb 
> [bridge]
> [ 1691.896324]  [<ffffffff8119d88f>] ? cpumask_next_and+0x2b/0x3a
> [ 1691.902127]  [<ffffffff81298517>] ? nf_iterate+0x41/0x7e
> [ 1691.907413]  [<ffffffffa01ade89>] ? NF_HOOK.clone.4+0x56/0x56 [bridge]
> [ 1691.913908]  [<ffffffffa01ade89>] ? NF_HOOK.clone.4+0x56/0x56 [bridge]
> [ 1691.920402]  [<ffffffff812985c7>] ? nf_hook_slow+0x73/0x114
> [ 1691.925947]  [<ffffffffa01ade89>] ? NF_HOOK.clone.4+0x56/0x56 [bridge]
> [ 1691.932442]  [<ffffffffa01ade89>] ? NF_HOOK.clone.4+0x56/0x56 [bridge]
> [ 1691.938937]  [<ffffffffa01ade6f>] ? NF_HOOK.clone.4+0x3c/0x56 [bridge]
> [ 1691.945432]  [<ffffffff810ee373>] ? 
> __kmalloc_node_track_caller+0xd4/0x10d
> [ 1691.952274]  [<ffffffffa01ae1e5>] ? br_handle_frame+0x195/0x1ac [bridge]
> [ 1691.958942]  [<ffffffffa01ae050>] ? 
> br_handle_frame_finish+0x1c7/0x1c7 [bridge]
> [ 1691.966217]  [<ffffffff812764df>] ? __netif_receive_skb+0x2a7/0x450
> [ 1691.972452]  [<ffffffff81276918>] ? netif_receive_skb+0x52/0x58
> [ 1691.978340]  [<ffffffff81276e1a>] ? napi_gro_receive+0x1f/0x2f
> [ 1691.984143]  [<ffffffff812769ef>] ? napi_skb_finish+0x1c/0x31
> [ 1691.989862]  [<ffffffffa0226fcd>] ? igb_poll+0x6d9/0x9ee [igb]
> [ 1691.995666]  [<ffffffff8103eb92>] ? try_to_wake_up+0x16a/0x17c
> [ 1692.001470]  [<ffffffff8109034f>] ? handle_irq_event+0x40/0x55
> [ 1692.007275]  [<ffffffff8106fc3c>] ? arch_local_irq_save+0x14/0x1d
> [ 1692.013338]  [<ffffffff81276f45>] ? net_rx_action+0xa4/0x1b1
> [ 1692.018971]  [<ffffffff8104ad26>] ? __do_softirq+0xb8/0x176
> [ 1692.024516]  [<ffffffff81333b5c>] ? call_softirq+0x1c/0x30
> [ 1692.029973]  [<ffffffff8100aa57>] ? do_softirq+0x3f/0x84
> [ 1692.035257]  [<ffffffff8104af91>] ? irq_exit+0x3f/0x8f
> [ 1692.040368]  [<ffffffff8100a793>] ? do_IRQ+0x85/0x9e
> [ 1692.045308]  [<ffffffff8132cad3>] ? common_interrupt+0x13/0x13
> [ 1692.051110] <EOI>
> [ 1692.053204]  [<ffffffff81061348>] ? enqueue_hrtimer+0x3f/0x53
> [ 1692.058922]  [<ffffffffa032c417>] ? arch_local_irq_enable+0x7/0x8 
> [processor]
> [ 1692.066021]  [<ffffffffa032cfdf>] ? acpi_idle_enter_bm+0x218/0x250 
> [processor]
> [ 1692.073208]  [<ffffffff8125df49>] ? menu_select+0x169/0x296
> [ 1692.078752]  [<ffffffff8125d059>] ? cpuidle_idle_call+0xf4/0x17e
> [ 1692.084727]  [<ffffffff81008298>] ? cpu_idle+0xa2/0xc4
> [ 1692.089838]  [<ffffffff8169db60>] ? start_kernel+0x3b9/0x3c4
> [ 1692.095469]  [<ffffffff8169d3c6>] ? x86_64_start_kernel+0x102/0x10f
> [ 1692.101703] Code: 4d 02 3c 03 0f 86 59 02 00 00 0f b6 d0 44 39 ea 7f 
> 32 83 c2 03 44 39 ea 0f 8f 45 02 00 00 48 85 db 74 18 48 8b 74 24 10 0f 
> b6 c0 <8b> 96 cc 00 00 00 89 54 05 ff 41 80 4c 24 08 04 80 01 04 41 80
> [ 1692.121051] RIP  [<ffffffff8129fb8d>] ip_options_compile+0x1c1/0x435
> [ 1692.127382]  RSP <ffff88042f203af0>
> [ 1692.130850] CR2: 00000000000000cc
> [ 1692.134470] ---[ end trace 0afda543b32ed72b ]---

It seems that the oops occurred in ip_options_compile() because
rt is NULL.

	8b 96 cc 00 00 00       mov    0xcc(%rsi),%edx
rsi is rt, and 0xcc is the offset of rt->rt_spec_dst. So I think the code
below triggered the oops.

332	if (skb) {
333		memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); <- here
334		opt->is_changed = 1;
335	}

And the call trace seems to be as follows:
  __netif_receive_skb()
    -> br_handle_frame()
         -> NF_HOOK()
              -> br_nf_pre_routing()
                   -> br_parse_ip_options()
                        -> ip_options_compile()

br_parse_ip_options() was introduced at 462fb2a (bridge : Sanitize
skb before it enters the IP stack) but ip_options_compile() or
ip_options_rcv_srr() seems to be called with no rt info.

Thanks.

^ permalink raw reply

* RE: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel
From: Eric Dumazet @ 2011-04-09  6:36 UTC (permalink / raw)
  To: Wei Gu; +Cc: Alexander Duyck, netdev, Kirsher, Jeffrey T
In-Reply-To: <D12839161ADD3A4B8DA63D1A134D084026E48BA6AE@ESGSCCMS0001.eapac.ericsson.se>

Le samedi 09 avril 2011 à 11:27 +0800, Wei Gu a écrit :
> HI Eric,
> If I try to bind the 8 tx&rx queue to different NUMA Node to (core 3,7,11,15,19,23,27,31), looks doesn't help on the rx_missing_error anymore.
> 
> I still think the best performance would be binding NIC to one sock of CPU with it's local memory node.
> I did a lot of combination on 2.6.32 kernel, by bind the eth10 to NODE2/3 could gain 20% more performance compare to NODE0/1.
> So I guess the CPU Socket 2&3 was locally with the eth10.
> 

Ideally, you would need to split memory loads on several nodes, because
you have a workload on a single NIC, located on a given node Nx.


1) Let the buffers where NIC performs DMA be on Nx,
so that DMA is fast.

2) And everything else on other nodes, so that cpus can steal some
memory bandwidth from other nodes, and free Nx memory bandwidth for NIC
use. (Processors only need to fetch first cache line of packets to
perform routing decision)

alloc_skb() would need to use memory from node Ny for "struct sk_buff",
and memory from node Nx for "skb->data" and skb frags
[ netdev_alloc_page() in ixgbe case]

In your case, you have 4 nodes, so Ny would be in a set of 3 nodes.

So commit 564824b0c52c34692d804b would need a little tweak in your
case [ where your cpus need to bring only one cache line from the packet payload ]

Please try following patch :



 include/linux/skbuff.h |   14 +-------------
 net/core/skbuff.c      |   19 +++++++++++++++++++
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d0ae90a..b43626d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1567,19 +1567,7 @@ static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev,
 	return skb;
 }
 
-/**
- *	__netdev_alloc_page - allocate a page for ps-rx on a specific device
- *	@dev: network device to receive on
- *	@gfp_mask: alloc_pages_node mask
- *
- * 	Allocate a new page. dev currently unused.
- *
- * 	%NULL is returned if there is no free memory.
- */
-static inline struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask)
-{
-	return alloc_pages_node(NUMA_NO_NODE, gfp_mask, 0);
-}
+extern struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask);
 
 /**
  *	netdev_alloc_page - allocate a page for ps-rx on a specific device
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7ebeed0..877797e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -259,6 +259,25 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
 }
 EXPORT_SYMBOL(__netdev_alloc_skb);
 
+/**
+ *	__netdev_alloc_page - allocate a page for ps-rx on a specific device
+ *	@dev: network device to receive on
+ *	@gfp_mask: alloc_pages_node mask
+ *
+ * 	Allocate a new page, preferring the NUMA node of @dev's parent device.
+ *
+ * 	%NULL is returned if there is no free memory.
+ */
+struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask)
+{
+	int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : NUMA_NO_NODE;
+	struct page *page;
+
+	page = alloc_pages_node(node, gfp_mask, 0);
+	return page;
+}
+EXPORT_SYMBOL(__netdev_alloc_page);
+
 void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
 		int size)
 {



^ permalink raw reply related

* RE: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel
From: Wei Gu @ 2011-04-09  6:12 UTC (permalink / raw)
  To: Alexander H Duyck; +Cc: Eric Dumazet, netdev, Kirsher, Jeffrey T
In-Reply-To: <1302324057.15899.20.camel@ahduyck-mobl2.amr.corp.intel.com>

Hi Alexander,
The total throughput when receiving 400-byte UDP packets (terminated in the prerouting hook) on 2.6.32 is over 1.5Mpps without packet loss.
I even tried forwarding the received packets back out on the same NIC: I got >1.5Mpps Rx with the same amount of Tx and no rx_missing_error at all. And even with some 68-byte packets I reached 5Mpps+/NIC on the 2.6.32 kernel.

I was expecting to gain even higher performance with this new Linux kernel on the same HW configuration.

Yes, DMAR is off. I can get 1Mpps+, but as I said it is not stable at all (high rx_missing_error rate).

I'm sure the slot for eth10 was x8 Gen2:
[ixgbe: eth10: ixgbe_probe: (PCI Express:5.0Gb/s:Width x8) 00:1b:21:6b:45:cc]

For the memory configuration, I am using the same server that I tested 2.6.32 on. I have a total of 64G * 4 memory, which gives 100% memory bandwidth with the 4 CPU sockets, as recommended by an HP expert (8 DIMMs per processor in the slot cartridge).

Could anything in the Linux kernel affect this memory configuration?

numactl  --hardware
available: 4 nodes (0-3)
node 0 cpus: 0 1 2 3 4 5 6 7 32 33 34 35 36 37 38 39
node 0 size: 65525 MB
node 0 free: 63226 MB
node 1 cpus: 8 9 10 11 12 13 14 15 40 41 42 43 44 45 46 47
node 1 size: 65536 MB
node 1 free: 63292 MB
node 2 cpus: 16 17 18 19 20 21 22 23 48 49 50 51 52 53 54 55
node 2 size: 65536 MB
node 2 free: 63366 MB
node 3 cpus: 24 25 26 27 28 29 30 31 56 57 58 59 60 61 62 63
node 3 size: 65535 MB
node 3 free: 63345 MB
node distances:
node   0   1   2   3
  0:  10  20  20  20
  1:  20  10  20  20
  2:  20  20  10  20
  3:  20  20  20  10

Lspci -vvv
8d:00.0 Ethernet controller: Intel Corporation 82599EB 10-Gigabit Network Connection (rev 01)
        Subsystem: Intel Corporation Ethernet Server Adapter X520-2
        Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr+ Stepping- SERR- FastB2B- DisINTx+
        Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx+
        Latency: 0, Cache Line Size: 64 bytes
        Interrupt: pin A routed to IRQ 50
        Region 0: Memory at f0200000 (64-bit, non-prefetchable) [size=512K]
        Region 2: I/O ports at 8000 [size=32]
        Region 4: Memory at f0284000 (64-bit, non-prefetchable) [size=16K]
        [virtual] Expansion ROM at f0600000 [disabled] [size=512K]
        Capabilities: [40] Power Management version 3
                Flags: PMEClk- DSI+ D1- D2- AuxCurrent=0mA PME(D0+,D1-,D2-,D3hot+,D3cold-)
                Status: D0 NoSoftRst- PME-Enable- DSel=0 DScale=1 PME-
        Capabilities: [50] MSI: Enable- Count=1/1 Maskable+ 64bit+
                Address: 0000000000000000  Data: 0000
                Masking: 00000000  Pending: 00000000
        Capabilities: [70] MSI-X: Enable+ Count=64 Masked-
                Vector table: BAR=4 offset=00000000
                PBA: BAR=4 offset=00002000
        Capabilities: [a0] Express (v2) Endpoint, MSI 00
                DevCap: MaxPayload 512 bytes, PhantFunc 0, Latency L0s <512ns, L1 <64us
                        ExtTag- AttnBtn- AttnInd- PwrInd- RBE+ FLReset+
                DevCtl: Report errors: Correctable+ Non-Fatal+ Fatal+ Unsupported+
                        RlxdOrd+ ExtTag- PhantFunc- AuxPwr- NoSnoop+ FLReset-
                        MaxPayload 256 bytes, MaxReadReq 4096 bytes
                DevSta: CorrErr- UncorrErr- FatalErr+ UnsuppReq+ AuxPwr- TransPend+
                LnkCap: Port #2, Speed 5GT/s, Width x8, ASPM L0s, Latency L0 unlimited, L1 <32us
                        ClockPM- Surprise- LLActRep- BwNot-
                LnkCtl: ASPM Disabled; RCB 64 bytes Disabled- Retrain- CommClk-
                        ExtSynch- ClockPM- AutWidDis- BWInt- AutBWInt-
                LnkSta: Speed 5GT/s, Width x8, TrErr- Train- SlotClk+ DLActive- BWMgmt- ABWMgmt-
                DevCap2: Completion Timeout: Range ABCD, TimeoutDis+
                DevCtl2: Completion Timeout: 50us to 50ms, TimeoutDis-
                LnkCtl2: Target Link Speed: 5GT/s, EnterCompliance- SpeedDis-, Selectable De-emphasis: -6dB
                         Transmit Margin: Normal Operating Range, EnterModifiedCompliance- ComplianceSOS-
                         Compliance De-emphasis: -6dB
                LnkSta2: Current De-emphasis Level: -6dB
        Capabilities: [100] Advanced Error Reporting
                UESta:  DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt- RxOF- MalfTLP- ECRC- UnsupReq+ ACSViol-
                UEMsk:  DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt- RxOF- MalfTLP- ECRC- UnsupReq+ ACSViol-
                UESvrt: DLP+ SDES- TLP+ FCP+ CmpltTO+ CmpltAbrt+ UnxCmplt+ RxOF+ MalfTLP+ ECRC- UnsupReq+ ACSViol-
                CESta:  RxErr- BadTLP- BadDLLP- Rollover- Timeout- NonFatalErr-
                CEMsk:  RxErr+ BadTLP+ BadDLLP+ Rollover+ Timeout+ NonFatalErr+
                AERCap: First Error Pointer: 00, GenCap+ CGenEn- ChkCap+ ChkEn-
        Capabilities: [140] Device Serial Number 00-1b-21-ff-ff-6b-45-18
        Capabilities: [150] Alternative Routing-ID Interpretation (ARI)
                ARICap: MFVC- ACS-, Next Function: 1
                ARICtl: MFVC- ACS-, Function Group: 0
        Capabilities: [160] Single Root I/O Virtualization (SR-IOV)
                IOVCap: Migration-, Interrupt Message Number: 000
                IOVCtl: Enable- Migration- Interrupt- MSE- ARIHierarchy+
                IOVSta: Migration-
                Initial VFs: 64, Total VFs: 64, Number of VFs: 64, Function Dependency Link: 00
                VF offset: 128, stride: 2, Device ID: 10ed
                Supported Page Size: 00000553, System Page Size: 00000001
                Region 0: Memory at 0000000000000000 (64-bit, non-prefetchable)
                Region 3: Memory at 0000000000000000 (64-bit, non-prefetchable)
                VF Migration: offset: 00000000, BIR: 0
        Kernel driver in use: ixgbe
        Kernel modules: ixgbe

Thanks
WeiGu

-----Original Message-----
From: Alexander H Duyck [mailto:alexander.h.duyck@intel.com]
Sent: Saturday, April 09, 2011 12:41 PM
To: Wei Gu
Cc: Eric Dumazet; netdev; Kirsher, Jeffrey T
Subject: RE: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel

On Fri, 2011-04-08 at 20:36 -0700, Wei Gu wrote:
> Hi Alexander, I do agree with you that if only the rx_missing_error
> (rx_no_buffer_count: 0) indicates the memory bandwidth issue. But the
> strange thing is I using the same test configuration on Linux 2.6.32,
> which looks no this problem at all. SO it not a HW setup problem at
> all, only difference in on the Kernel version, that's why I go back to
> you guys for this new Linux 2.6.38, if it will affact this memory
> bandwidth Or BIOS etc things?

What were the numbers you were getting with 2.6.32?  I would be interested in seeing those number just to get an idea of how they compare against the 2.6.38 kernel.

> The follow dump is done, while I was try to receive 290Kpps 400Byte
> pakets from IXIA, and drop them in the prerouting hook. I bind the
> eth10 8 RX queue to CPU sock ID 3 ( core 24-31) on NUMA NODE3

Just to confirm this is with DMAR off?  I saw an earlier email that said you were getting a variable amount that was over 1Mpps and just want to confirm this is with the same config.

> ethtool -i eth10
> driver: ixgbe
> version: 3.2.10-NAPI
> firmware-version: 0.9-3
> bus-info: 0000:8d:00.0
>
> ethtool -S eth10
> NIC statistics:
>      rx_packets: 14222510
>      tx_packets: 109
>      rx_bytes: 5575223920
>      tx_bytes: 17790
>      rx_missed_errors: 15150244
>      rx_no_buffer_count: 0

I trimmed down your stats here pretty significantly.  This isn't an issue with the driver not keeping up.  The problem here is memory and/or bus bandwidth.  Based on the info you provided I am assuming you have a quad socket system.  I'm curious how the memory is laid out.  What is the total memory size, memory per node, and do you have all of the memory channels on each node populated?  One common thing I've seen cause these type of issues is an incorrect memory configuration.

Also if you could send me an lspci -vvv for 8d:00.0 specifically I would appreciate it as I would like to look over the PCIe config just to make sure the slot is a x8 PCIe gen 2.

Thanks,

Alex


^ permalink raw reply

* Re: Kernel panic when using bridge
From: Scot Doyle @ 2011-04-09  4:51 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev
In-Reply-To: <20110408121700.0aad53fe@nehalam>

On 04/08/2011 02:17 PM, Stephen Hemminger wrote:
> Please reproduce with exactly 2.6.39-rc2 there were some bug fixes
> to make sure that header was initialized.

Hi Stephen, here's another panic with 2.6.39-rc2 (git commit 
bb3c90f0de7b34995b5e35cf5dc97a3d428b3761) using default kernel config 
options.

# sysctl -a | grep bridge
net.bridge.bridge-nf-call-arptables = 1
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
net.bridge.bridge-nf-filter-vlan-tagged = 0
net.bridge.bridge-nf-filter-pppoe-tagged = 0

# /etc/network/interfaces
auto lo
iface lo inet loopback
auto br0
iface br0 inet static
     address x.y.z.237
     netmask 255.255.255.224
     gateway x.y.z.225
     bridge_ports    eth3
     bridge_stp    off
     bridge_maxwait    0
     bridge_fd    0
auto br0:1
iface br0:1 inet static
     address 10.0.0.1
     netmask 255.255.255.0
auto br0:2
iface br0:2 inet static
     address 10.0.1.1
     netmask 255.255.255.0

------

[ 1691.681069] BUG: unable to handle kernel NULL pointer dereference at 
00000000000000cc
[ 1691.688879] IP: [<ffffffff8129fb8d>] ip_options_compile+0x1c1/0x435
[ 1691.695126] PGD 0
[ 1691.697131] Oops: 0000 [#1] SMP
[ 1691.700357] last sysfs file: /sys/devices/virtual/misc/kvm/uevent
[ 1691.706418] CPU 0
[ 1691.708241] Modules linked in: kvm_intel kvm bridge stp loop snd_pcm 
snd_timer snd soundcore snd_page_alloc tpm_tis i7core_edac psmouse ghes 
tpm evdev edac_core pcspkr serio_raw processor tpm_bios button dcdbas 
thermal_sys hed power_meter ext2 mbcache dm_mod raid1 md_mod sd_mod 
crc_t10dif usb_storage uas uhci_hcd mpt2sas scsi_transport_sas 
raid_class ehci_hcd igb scsi_mod usbcore dca bnx2 [last unloaded: 
scsi_wait_scan]
[ 1691.745849]
[ 1691.747330] Pid: 0, comm: swapper Not tainted 2.6.39-rc2+ #3 Dell 
Inc. PowerEdge R510/0DPRKF
[ 1691.755752] RIP: 0010:[<ffffffff8129fb8d>]  [<ffffffff8129fb8d>] 
ip_options_compile+0x1c1/0x435
[ 1691.764418] RSP: 0018:ffff88042f203af0  EFLAGS: 00010286
[ 1691.769702] RAX: 0000000000000024 RBX: ffff88041c9fa900 RCX: 
ffff880403466865
[ 1691.776800] RDX: 0000000000000027 RSI: 0000000000000000 RDI: 
ffffffff817e6100
[ 1691.783899] RBP: ffff880403466863 R08: ffffffffa01ade89 R09: 
ffff88042f203c58
[ 1691.790997] R10: ffffe1c4ff103b40 R11: 0000000000000004 R12: 
ffff88041c9fa928
[ 1691.798095] R13: 0000000000000027 R14: ffff88040346684e R15: 
0000000000000027
[ 1691.805194] FS:  0000000000000000(0000) GS:ffff88042f200000(0000) 
knlGS:0000000000000000
[ 1691.813245] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 1691.818960] CR2: 00000000000000cc CR3: 0000000001603000 CR4: 
00000000000006f0
[ 1691.826058] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
0000000000000000
[ 1691.833156] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 
0000000000000400
[ 1691.840254] Process swapper (pid: 0, threadinfo ffffffff81600000, 
task ffffffff8160b020)
[ 1691.848303] Stack:
[ 1691.850300]  ffff88042ec02900 ffff8804051ac740 0000000000000000 
ffffffff817e6100
[ 1691.857693]  0000000000000282 ffffffff810ec848 0000000000000282 
ffff88041c9fa928
[ 1691.865085]  ffff88041c9fa900 ffff8804038e8000 ffff88040346684e 
ffff8804038e8000
[ 1691.872480] Call Trace:
[ 1691.874910] <IRQ>
[ 1691.877005]  [<ffffffff810ec848>] ? __slab_free+0x80/0x14a
[ 1691.882465]  [<ffffffffa01b1e3a>] ? br_parse_ip_options+0x133/0x1a0 
[bridge]
[ 1691.889480]  [<ffffffffa01b2bd8>] ? br_nf_pre_routing+0x348/0x3cb 
[bridge]
[ 1691.896324]  [<ffffffff8119d88f>] ? cpumask_next_and+0x2b/0x3a
[ 1691.902127]  [<ffffffff81298517>] ? nf_iterate+0x41/0x7e
[ 1691.907413]  [<ffffffffa01ade89>] ? NF_HOOK.clone.4+0x56/0x56 [bridge]
[ 1691.913908]  [<ffffffffa01ade89>] ? NF_HOOK.clone.4+0x56/0x56 [bridge]
[ 1691.920402]  [<ffffffff812985c7>] ? nf_hook_slow+0x73/0x114
[ 1691.925947]  [<ffffffffa01ade89>] ? NF_HOOK.clone.4+0x56/0x56 [bridge]
[ 1691.932442]  [<ffffffffa01ade89>] ? NF_HOOK.clone.4+0x56/0x56 [bridge]
[ 1691.938937]  [<ffffffffa01ade6f>] ? NF_HOOK.clone.4+0x3c/0x56 [bridge]
[ 1691.945432]  [<ffffffff810ee373>] ? 
__kmalloc_node_track_caller+0xd4/0x10d
[ 1691.952274]  [<ffffffffa01ae1e5>] ? br_handle_frame+0x195/0x1ac [bridge]
[ 1691.958942]  [<ffffffffa01ae050>] ? 
br_handle_frame_finish+0x1c7/0x1c7 [bridge]
[ 1691.966217]  [<ffffffff812764df>] ? __netif_receive_skb+0x2a7/0x450
[ 1691.972452]  [<ffffffff81276918>] ? netif_receive_skb+0x52/0x58
[ 1691.978340]  [<ffffffff81276e1a>] ? napi_gro_receive+0x1f/0x2f
[ 1691.984143]  [<ffffffff812769ef>] ? napi_skb_finish+0x1c/0x31
[ 1691.989862]  [<ffffffffa0226fcd>] ? igb_poll+0x6d9/0x9ee [igb]
[ 1691.995666]  [<ffffffff8103eb92>] ? try_to_wake_up+0x16a/0x17c
[ 1692.001470]  [<ffffffff8109034f>] ? handle_irq_event+0x40/0x55
[ 1692.007275]  [<ffffffff8106fc3c>] ? arch_local_irq_save+0x14/0x1d
[ 1692.013338]  [<ffffffff81276f45>] ? net_rx_action+0xa4/0x1b1
[ 1692.018971]  [<ffffffff8104ad26>] ? __do_softirq+0xb8/0x176
[ 1692.024516]  [<ffffffff81333b5c>] ? call_softirq+0x1c/0x30
[ 1692.029973]  [<ffffffff8100aa57>] ? do_softirq+0x3f/0x84
[ 1692.035257]  [<ffffffff8104af91>] ? irq_exit+0x3f/0x8f
[ 1692.040368]  [<ffffffff8100a793>] ? do_IRQ+0x85/0x9e
[ 1692.045308]  [<ffffffff8132cad3>] ? common_interrupt+0x13/0x13
[ 1692.051110] <EOI>
[ 1692.053204]  [<ffffffff81061348>] ? enqueue_hrtimer+0x3f/0x53
[ 1692.058922]  [<ffffffffa032c417>] ? arch_local_irq_enable+0x7/0x8 
[processor]
[ 1692.066021]  [<ffffffffa032cfdf>] ? acpi_idle_enter_bm+0x218/0x250 
[processor]
[ 1692.073208]  [<ffffffff8125df49>] ? menu_select+0x169/0x296
[ 1692.078752]  [<ffffffff8125d059>] ? cpuidle_idle_call+0xf4/0x17e
[ 1692.084727]  [<ffffffff81008298>] ? cpu_idle+0xa2/0xc4
[ 1692.089838]  [<ffffffff8169db60>] ? start_kernel+0x3b9/0x3c4
[ 1692.095469]  [<ffffffff8169d3c6>] ? x86_64_start_kernel+0x102/0x10f
[ 1692.101703] Code: 4d 02 3c 03 0f 86 59 02 00 00 0f b6 d0 44 39 ea 7f 
32 83 c2 03 44 39 ea 0f 8f 45 02 00 00 48 85 db 74 18 48 8b 74 24 10 0f 
b6 c0 <8b> 96 cc 00 00 00 89 54 05 ff 41 80 4c 24 08 04 80 01 04 41 80
[ 1692.121051] RIP  [<ffffffff8129fb8d>] ip_options_compile+0x1c1/0x435
[ 1692.127382]  RSP <ffff88042f203af0>
[ 1692.130850] CR2: 00000000000000cc
[ 1692.134470] ---[ end trace 0afda543b32ed72b ]---
[ 1692.139064] BUG: scheduling while atomic: swapper/0/0x10000100
[ 1692.144866] Modules linked in: kvm_intel kvm bridge stp loop snd_pcm 
snd_timer snd soundcore snd_page_alloc tpm_tis i7core_edac psmouse ghes 
tpm evdev edac_core pcspkr serio_raw processor tpm_bios button dcdbas 
thermal_sys hed power_meter ext2 mbcache dm_mod raid1 md_mod sd_mod 
crc_t10dif usb_storage uas uhci_hcd mpt2sas scsi_transport_sas 
raid_class ehci_hcd igb scsi_mod usbcore dca bnx2 [last unloaded: 
scsi_wait_scan]
[ 1692.182294] CPU 0
[ 1692.184119] Modules linked in: kvm_intel kvm bridge stp loop snd_pcm 
snd_timer snd soundcore snd_page_alloc tpm_tis i7core_edac psmouse ghes 
tpm evdev edac_core pcspkr serio_raw processor tpm_bios button dcdbas 
thermal_sys hed power_meter ext2 mbcache dm_mod raid1 md_mod sd_mod 
crc_t10dif usb_storage uas uhci_hcd mpt2sas scsi_transport_sas 
raid_class ehci_hcd igb scsi_mod usbcore dca bnx2 [last unloaded: 
scsi_wait_scan]
[ 1692.221718]
[ 1692.223199] Pid: 0, comm: swapper Tainted: G      D     2.6.39-rc2+ 
#3 Dell Inc. PowerEdge R510/0DPRKF
[ 1692.232487] RIP: 0010:[<ffffffffa032c417>]  [<ffffffffa032c417>] 
arch_local_irq_enable+0x7/0x8 [processor]
[ 1692.242105] RSP: 0018:ffffffff81601eb0  EFLAGS: 00000292
[ 1692.247389] RAX: 000000000003fce5 RBX: ffffffff81061348 RCX: 
00000000000003e8
[ 1692.254489] RDX: 00000000000000c5 RSI: 0000000225c17d03 RDI: 
000000000f93df4d
[ 1692.261588] RBP: ffff880405349800 R08: 00000000fffffffd R09: 
0000000000000000
[ 1692.268689] R10: ffff88042f210ac0 R11: 0000000000000040 R12: 
ffffffff8132cace
[ 1692.275790] R13: ffff88042f20feb0 R14: ffffffff811a2ed2 R15: 
ffff88042f20fdc8
[ 1692.282892] FS:  0000000000000000(0000) GS:ffff88042f200000(0000) 
knlGS:0000000000000000
[ 1692.290943] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 1692.296660] CR2: 00000000000000cc CR3: 0000000001603000 CR4: 
00000000000006f0
[ 1692.303759] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
0000000000000000
[ 1692.310860] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 
0000000000000400
[ 1692.317959] Process swapper (pid: 0, threadinfo ffffffff81600000, 
task ffffffff8160b020)
[ 1692.326010] Stack:
[ 1692.328008]  ffffffffa032cfdf 0000000000011140 ffffffff8125df49 
000000010005525f
[ 1692.335406]  ffff880405349820 ffff8804053498f0 0000000000000002 
ffffffffffffffff
[ 1692.342803]  ffffffff8125d059 0000000000000000 ffffffff81600000 
ffffffff816812d0
[ 1692.350197] Call Trace:
[ 1692.352630]  [<ffffffffa032cfdf>] ? acpi_idle_enter_bm+0x218/0x250 
[processor]
[ 1692.359818]  [<ffffffff8125df49>] ? menu_select+0x169/0x296
[ 1692.365362]  [<ffffffff8125d059>] ? cpuidle_idle_call+0xf4/0x17e
[ 1692.371339]  [<ffffffff81008298>] ? cpu_idle+0xa2/0xc4
[ 1692.376452]  [<ffffffff8169db60>] ? start_kernel+0x3b9/0x3c4
[ 1692.382084]  [<ffffffff8169d3c6>] ? x86_64_start_kernel+0x102/0x10f
[ 1692.388319] Code: 63 1c fb 48 83 c4 38 89 e8 5b 5d 41 5c 41 5d 41 5e 
41 5f c3 0f 09 0f 1f 44 00 00 c3 fa 66 0f 1f 44 00 00 c3 fb 66 0f 1f 44 
00 00 <c3> 48 8b 15 81 20 3f e1 48 8d 42 fd 48 83 f8 01 0f 96 c0 48 ff
[ 1692.407673] Call Trace:
[ 1692.410105]  [<ffffffffa032cfdf>] ? acpi_idle_enter_bm+0x218/0x250 
[processor]
[ 1692.417294]  [<ffffffff8125df49>] ? menu_select+0x169/0x296
[ 1692.422838]  [<ffffffff8125d059>] ? cpuidle_idle_call+0xf4/0x17e
[ 1692.428815]  [<ffffffff81008298>] ? cpu_idle+0xa2/0xc4
[ 1692.433928]  [<ffffffff8169db60>] ? start_kernel+0x3b9/0x3c4
[ 1692.439560]  [<ffffffff8169d3c6>] ? x86_64_start_kernel+0x102/0x10f
^M
^GMessage from[ 1692.446160] BUG: unable to handle kernel NULL pointer 
dereference at 00000000000000e0
[ 1692.455313] IP: [<ffffffffa0226ca8>] igb_poll+0x3b4/0x9ee [igb]
[ 1692.461214] PGD 404c2e067 PUD 402966067 PMD 0
[ 1692.465660] Oops: 0000 [#2] SMP
[ 1692.468887] last sysfs file: /sys/devices/virtual/misc/kvm/uevent
[ 1692.474947] CPU 0
[ 1692.476772] Modules linked in: kvm_intel kvm bridge stp loop snd_pcm 
snd_timer snd soundcore snd_page_alloc tpm_tis i7core_edac psmouse ghes 
tpm evdev edac_core pcspkr serio_raw processor tpm_bios button dcdbas 
thermal_sys hed power_meter ext2 mbcache dm_mod raid1 md_mod sd_mod 
crc_t10dif usb_storage uas uhci_hcd mpt2sas scsi_transport_sas 
raid_class ehci_hcd igb scsi_mod usbcore dca bnx2 [last unloaded: 
scsi_wait_scan]
[ 1692.514362]
[ 1692.515843] Pid: 1740, comm: rsyslogd Tainted: G      D     
2.6.39-rc2+ #3 Dell Inc. PowerEdge R510/0DPRKF
[ 1692.525475] RIP: 0010:[<ffffffffa0226ca8>]  [<ffffffffa0226ca8>] 
igb_poll+0x3b4/0x9ee [igb]
[ 1692.533796] RSP: 0018:ffff880403081b90  EFLAGS: 00010203
[ 1692.539078] RAX: ffff880404f24e58 RBX: ffff880404f25a40 RCX: 
0000000000000000
[ 1692.546176] RDX: 0000000000000040 RSI: 0000000000001043 RDI: 
ffff880404f24e58
[ 1692.553273] RBP: 0000000000000000 R08: 00000000f352249a R09: 
ffff88042f20f470
[ 1692.560373] R10: 0000000000000000 R11: 0000000000000293 R12: 
000000000000000c
[ 1692.567470] R13: ffffc9001267b320 R14: 0000000000000000 R15: 
ffff880403ff3140
[ 1692.574571] FS:  00007f4ce614f700(0000) GS:ffff88042f200000(0000) 
knlGS:0000000000000000
[ 1692.582619] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1692.588334] CR2: 00000000000000e0 CR3: 0000000403c96000 CR4: 
00000000000006f0
[ 1692.595434] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
0000000000000000
[ 1692.602532] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 
0000000000000400
[ 1692.609630] Process rsyslogd (pid: 1740, threadinfo ffff880403080000, 
task ffff880413b8a170)
[ 1692.618025] Stack:
[ 1692.620024]  0000000000000000 ffffffff8105b633 0000000000000082 
ffffffff8103eb92
[ 1692.627419]  ffffffff8132b45a 0000000000000082 0000000000000000 
000010431d887e00
[ 1692.634813]  0000000100000015 ffff880404f24e40 ffff880400000000 
ffff8804038e8740
[ 1692.642208] Call Trace:
[ 1692.644640]  [<ffffffff8105b633>] ? wq_worker_waking_up+0x8/0x20
[ 1692.650615]  [<ffffffff8103eb92>] ? try_to_wake_up+0x16a/0x17c
[ 1692.656418]  [<ffffffff8132b45a>] ? schedule+0x56e/0x585
[ 1692.661703]  [<ffffffff8132c775>] ? _raw_spin_lock_irq+0xd/0x1a
[ 1692.667592]  [<ffffffff81276f45>] ? net_rx_action+0xa4/0x1b1
[ 1692.673221]  [<ffffffff8104ad26>] ? __do_softirq+0xb8/0x176
[ 1692.678765]  [<ffffffff81333b5c>] ? call_softirq+0x1c/0x30
[ 1692.684224]  [<ffffffff8100aa57>] ? do_softirq+0x3f/0x84
[ 1692.689507]  [<ffffffff8104af91>] ? irq_exit+0x3f/0x8f
[ 1692.694618]  [<ffffffff8100a793>] ? do_IRQ+0x85/0x9e
[ 1692.699556]  [<ffffffff8132cad3>] ? common_interrupt+0x13/0x13
[ 1692.705360]  [<ffffffff811439a0>] ? kmsg_poll+0x3a/0x3a
[ 1692.710559]  [<ffffffff81045ba5>] ? spin_unlock_irq.clone.1+0xe/0x10
[ 1692.716882]  [<ffffffff81045e01>] ? do_syslog+0x1e2/0x430
[ 1692.722253]  [<ffffffff811439e6>] ? kmsg_read+0x46/0x50
[ 1692.727451]  [<ffffffff8113aefa>] ? proc_reg_read+0x6f/0x88
[ 1692.732995]  [<ffffffff810f6868>] ? vfs_read+0x9f/0xf2
[ 1692.738106]  [<ffffffff810f6900>] ? sys_read+0x45/0x6b
[ 1692.743218]  [<ffffffff81332952>] ? system_call_fastpath+0x16/0x1b
[ 1692.749364] Code: 89 74 24 3c e9 78 03 00 00 8b 54 24 70 39 54 24 44 
0f 8d 75 03 00 00 ff 44 24 44 0f ae e8 49 8b 6d 00 ff 44 24 40 b9 00 00 
00 00
[ 1692.762725]  8b 85 e0 00 00 00 49 c7 45 00 00 00 00 00 41 8b 77 0c 0f 18
[ 1692.769844] RIP  [<ffffffffa0226ca8>] igb_poll+0x3b4/0x9ee [igb]
[ 1692.775829]  RSP <ffff880403081b90>
[ 1692.779295] CR2: 00000000000000e0
  syslogd@r510-1 [ 1692.782628] ---[ end trace 0afda543b32ed72c ]---
[ 1692.788575] BUG: scheduling while atomic: rsyslogd/1740/0x10000100
at Apr  8 20:34:[ 1692.794754] Modules linked in: kvm_intel04 ...^M
  kernel kvm bridge:[ 1691.697131]  stp loopOops: 0000 [#1]  snd_pcm 
snd_timerSMP
^M^M
^GMessa snd soundcorege from syslogd@ snd_page_alloc tpm_tisr510-1 at 
Apr  8 i7core_edac psmouse 20:34:04 ...^M
  ghes tpm kernel:[ 1691.7 evdev edac_core00357] last sysf pcspkr 
serio_raws file: /sys/dev processor tpm_biosices/virtual/mis button 
dcdbasc/kvm/uevent
^M^M thermal_sys hed
^GMessage from  power_meter ext2syslogd@r510-1 a mbcache dm_modt Apr  8 
20:34:0 raid1 md_mod4 ...^M
  kernel: sd_mod crc_t10dif[ 1691.848303] S usb_storage uastack:
^M^M
^GMess uhci_hcd mpt2sasage from syslogd scsi_transport_sas 
raid_class@r510-1 at Apr   ehci_hcd igb8 20:34:04 ...^M^M scsi_mod usbcore
  kernel:[ 1691. dca bnx2872480] Call Tra [last unloaded: scsi_wait_scan]
ce:
^M^M
^GMessag[ 1692.865388] CPU 0
[ 1692.868564] Modules linked in:e from syslogd@r kvm_intel kvm510-1 at 
Apr  8  bridge stp20:34:04 ...^M
   loop snd_pcmkernel:[ 1691.87 snd_timer snd4910] <IRQ>
^M soundcore snd_page_alloc tpm_tis i7core_edac psmouse ghes tpm evdev 
edac_core pcspkr serio_raw processor tpm_bios button dcdbas thermal_sys 
hed power_meter ext2 mbcache dm_mod raid1 md_mod sd_mod crc_t10dif 
usb_storage uas uhci_hcd mpt2sas scsi_transport_sas raid_class ehci_hcd 
igb scsi_mod usbcore dca bnx2 [last unloaded: scsi_wait_scan]
[ 1692.913092]
[ 1692.914574] Pid: 1740, comm: rsyslogd Tainted: G      D     
2.6.39-rc2+ #3 Dell Inc. PowerEdge R510/0DPRKF
[ 1692.924208] RIP: 0010:[<ffffffff81045ba5>]  [<ffffffff81045ba5>] 
spin_unlock_irq.clone.1+0xe/0x10
[ 1692.933051] RSP: 0018:ffff880403081e30  EFLAGS: 00000286
[ 1692.938336] RAX: 000000000001455c RBX: ffff880403afecc0 RCX: 
ffffffff817518a0
[ 1692.945435] RDX: 0000000000014520 RSI: 0000000000000003 RDI: 
ffffffff81751888
[ 1692.952536] RBP: 0000000000000fff R08: ffffffff811439a0 R09: 
0000000000000000
[ 1692.959636] R10: 0000000000000000 R11: 0000000000000293 R12: 
ffffffff8132cad3
[ 1692.966737] R13: 0000000000000e33 R14: 00007f4ce7983693 R15: 
ffff880403081da8
[ 1692.973837] FS:  00007f4ce614f700(0000) GS:ffff88042f200000(0000) 
knlGS:0000000000000000
[ 1692.981890] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1692.987607] CR2: 00000000000000e0 CR3: 0000000403c96000 CR4: 
00000000000006f0
[ 1692.994706] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
0000000000000000
[ 1693.001805] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 
0000000000000400
[ 1693.008907] Process rsyslogd (pid: 1740, threadinfo ffff880403080000, 
task ffff880413b8a170)
[ 1693.017303] Stack:
[ 1693.019301]  ffffffff81045e01 0000000003afed20 ffffffff00000020 
0000000000000000
[ 1693.026696]  0000000000000001 ffff880403081fd8 0000000000000000 
0002000003081e60
[ 1693.034093]  ffff880404f9b090 0000000100000000 00007f4ce7982860 
0000000000000fff
[ 1693.041490] Call Trace:
[ 1693.043922]  [<ffffffff81045e01>] ? do_syslog+0x1e2/0x430
[ 1693.049295]  [<ffffffff811439e6>] ? kmsg_read+0x46/0x50
[ 1693.054493]  [<ffffffff8113aefa>] ? proc_reg_read+0x6f/0x88
[ 1693.060039]  [<ffffffff810f6868>] ? vfs_read+0x9f/0xf2
[ 1693.065153]  [<ffffffff810f6900>] ? sys_read+0x45/0x6b
[ 1693.070266]  [<ffffffff81332952>] ? system_call_fastpath+0x16/0x1b
[ 1693.076415] Code: 00 00 75 14 c7 05 48 be 72 00 01 00 00 00 c7 05 42 
be 72 00 01 00 00 00 48 83 c4 08 c3 66 ff 05 ea bc 70 00 fb 66 0f 1f 44 
00 00 <c3> c3 48 83 ec 08 48 c7 c2 a8 38 61 81 48 c7 c6 3e 2b 4c 81 48
[ 1693.095772] Call Trace:
[ 1693.098205]  [<ffffffff81045e01>] ? do_syslog+0x1e2/0x430
[ 1693.103579]  [<ffffffff811439e6>] ? kmsg_read+0x46/0x50
[ 1693.108778]  [<ffffffff8113aefa>] ? proc_reg_read+0x6f/0x88
[ 1693.114323]  [<ffffffff810f6868>] ? vfs_read+0x9f/0xf2
[ 1693.119435]  [<ffffffff810f6900>] ? sys_read+0x45/0x6b
[ 1693.124547]  [<ffffffff81332952>] ? system_call_fastpath+0x16/0x1b
[ 1693.130704] Kernel panic - not syncing: Fatal exception in interrupt
[ 1693.137028] Pid: 1740, comm: rsyslogd Tainted: G      D     
2.6.39-rc2+ #3
[ 1693.143870] Call Trace:
[ 1693.146304]  [<ffffffff8132ac78>] ? panic+0x92/0x1a1
[ 1693.151244]  [<ffffffff8132d836>] ? oops_end+0xa9/0xb6
[ 1693.156359]  [<ffffffff8102ca16>] ? no_context+0x1ed/0x1fa
[ 1693.161819]  [<ffffffff8132f623>] ? do_page_fault+0x16b/0x308
[ 1693.167537]  [<ffffffff8132cd95>] ? page_fault+0x25/0x30
[ 1693.172824]  [<ffffffffa0226ca8>] ? igb_poll+0x3b4/0x9ee [igb]
[ 1693.178629]  [<ffffffff8105b633>] ? wq_worker_waking_up+0x8/0x20
[ 1693.184606]  [<ffffffff8103eb92>] ? try_to_wake_up+0x16a/0x17c
[ 1693.190410]  [<ffffffff8132b45a>] ? schedule+0x56e/0x585
[ 1693.195697]  [<ffffffff8132c775>] ? _raw_spin_lock_irq+0xd/0x1a
[ 1693.201588]  [<ffffffff81276f45>] ? net_rx_action+0xa4/0x1b1
[ 1693.207219]  [<ffffffff8104ad26>] ? __do_softirq+0xb8/0x176
[ 1693.212764]  [<ffffffff81333b5c>] ? call_softirq+0x1c/0x30
[ 1693.218222]  [<ffffffff8100aa57>] ? do_softirq+0x3f/0x84
[ 1693.223507]  [<ffffffff8104af91>] ? irq_exit+0x3f/0x8f
[ 1693.228619]  [<ffffffff8100a793>] ? do_IRQ+0x85/0x9e
[ 1693.233560]  [<ffffffff8132cad3>] ? common_interrupt+0x13/0x13
[ 1693.239365]  [<ffffffff811439a0>] ? kmsg_poll+0x3a/0x3a
[ 1693.244564]  [<ffffffff81045ba5>] ? spin_unlock_irq.clone.1+0xe/0x10
[ 1693.250888]  [<ffffffff81045e01>] ? do_syslog+0x1e2/0x430
[ 1693.256259]  [<ffffffff811439e6>] ? kmsg_read+0x46/0x50
[ 1693.261459]  [<ffffffff8113aefa>] ? proc_reg_read+0x6f/0x88
[ 1693.267005]  [<ffffffff810f6868>] ? vfs_read+0x9f/0xf2
[ 1693.272117]  [<ffffffff810f6900>] ? sys_read+0x45/0x6b
[ 1693.277231]  [<ffffffff81332952>] ? system_call_fastpath+0x16/0x1b


^ permalink raw reply

* Re: [net-next-2.6 PATCH] v2 ethtool: add ntuple flow specifier data to network flow classifier
From: Alexander H Duyck @ 2011-04-09  4:44 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: davem@davemloft.net, Kirsher, Jeffrey T, netdev@vger.kernel.org,
	Santwona Behera
In-Reply-To: <1302307986.2871.65.camel@bwh-desktop>

On Fri, 2011-04-08 at 17:13 -0700, Ben Hutchings wrote:
> On Fri, 2011-04-08 at 15:34 -0700, Alexander Duyck wrote:
> > This change is meant to add an ntuple data extensions to the rx network flow
> > classification specifiers.  The idea is to allow ntuple to be displayed via
> > the network flow classification interface.
> [...]
> >  /**
> >   * struct ethtool_rx_flow_spec - specification for RX flow filter
> >   * @flow_type: Type of match to perform, e.g. %TCP_V4_FLOW
> >   * @h_u: Flow fields to match (dependent on @flow_type)
> > + * @h_ext: Additional fields to match
> >   * @m_u: Masks for flow field bits to be ignored
> > + * @m_ext: Masks for additional field bits to be ignored.
> > + *	Note, all additional fields must be ignored unless @flow_type
> > + *	includes the %FLOW_EXT flag.
> [...]
> 
> Sorry I didn't bring this up against v1:
> 
> I think you worked out that these masks are interpreted as bits to be
> matched, rather than bits to be ignored, in the existing implementation
> in niu.  Looking again at niu, I think that you were right about that.
> So please fix the comment while you're updating it.
> 
> Ben.
> 
I completely forgot all about that.  Thanks for catching it.

v3 is updated and submitted.

Alex


^ permalink raw reply

* RE: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel
From: Alexander H Duyck @ 2011-04-09  4:40 UTC (permalink / raw)
  To: Wei Gu; +Cc: Eric Dumazet, netdev, Kirsher, Jeffrey T
In-Reply-To: <D12839161ADD3A4B8DA63D1A134D084026E48BA6AF@ESGSCCMS0001.eapac.ericsson.se>

On Fri, 2011-04-08 at 20:36 -0700, Wei Gu wrote:
> Hi Alexander, I do agree with you that if only the rx_missing_error
> (rx_no_buffer_count: 0) indicates the memory bandwidth issue. But the
> strange thing is I using the same test configuration on Linux 2.6.32,
> which looks no this problem at all. SO it not a HW setup problem at
> all, only difference in on the Kernel version, that's why I go back to
> you guys for this new Linux 2.6.38, if it will affact this memory
> bandwidth Or BIOS etc things?

What were the numbers you were getting with 2.6.32?  I would be
interested in seeing those numbers just to get an idea of how they
compare against the 2.6.38 kernel.

> The follow dump is done, while I was try to receive 290Kpps 400Byte
> pakets from IXIA, and drop them in the prerouting hook. I bind the
> eth10 8 RX queue to CPU sock ID 3 ( core 24-31) on NUMA NODE3

Just to confirm, this is with DMAR off?  I saw an earlier email that said
you were getting a variable amount that was over 1Mpps and just want to
confirm this is with the same config.

> ethtool -i eth10
> driver: ixgbe
> version: 3.2.10-NAPI
> firmware-version: 0.9-3
> bus-info: 0000:8d:00.0
> 
> ethtool -S eth10
> NIC statistics:
>      rx_packets: 14222510
>      tx_packets: 109
>      rx_bytes: 5575223920
>      tx_bytes: 17790
>      rx_missed_errors: 15150244
>      rx_no_buffer_count: 0

I trimmed down your stats here pretty significantly.  This isn't an
issue with the driver not keeping up.  The problem here is memory and/or
bus bandwidth.  Based on the info you provided I am assuming you have a
quad socket system.  I'm curious how the memory is laid out.  What is
the total memory size, memory per node, and do you have all of the
memory channels on each node populated?  One common thing I've seen
cause these types of issues is an incorrect memory configuration.

Also if you could send me an lspci -vvv for 8d:00.0 specifically I would
appreciate it as I would like to look over the PCIe config just to make
sure the slot is a x8 PCIe gen 2.

Thanks,

Alex

^ permalink raw reply

* [net-next-2.6 PATCH] v3 ethtool: add ntuple flow specifier data to network flow classifier
From: Alexander Duyck @ 2011-04-09  4:01 UTC (permalink / raw)
  To: davem, jeffrey.t.kirsher, bhutchings; +Cc: netdev

This change is meant to add ntuple data extensions to the rx network flow
classification specifiers.  The idea is to allow ntuple to be displayed via
the network flow classification interface.

The first patch had some leftover stuff from the original flow extension
flags I had added.  That bit is removed in this patch.

The second had some leftover comments that stated we ignored bits in the
masks when we actually match them.

This work is based on input from Ben Hutchings.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---

 include/linux/ethtool.h |   53 ++++++++++++++++++++++++++++-------------------
 net/socket.c            |   14 ++++++------
 2 files changed, 39 insertions(+), 28 deletions(-)

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index c04d131..c7eff13 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -380,27 +380,42 @@ struct ethtool_usrip4_spec {
 	__u8    proto;
 };
 
+union ethtool_flow_union {
+	struct ethtool_tcpip4_spec		tcp_ip4_spec;
+	struct ethtool_tcpip4_spec		udp_ip4_spec;
+	struct ethtool_tcpip4_spec		sctp_ip4_spec;
+	struct ethtool_ah_espip4_spec		ah_ip4_spec;
+	struct ethtool_ah_espip4_spec		esp_ip4_spec;
+	struct ethtool_usrip4_spec		usr_ip4_spec;
+	struct ethhdr				ether_spec;
+	__u8					hdata[60];
+};
+
+struct ethtool_flow_ext {
+	__be16	vlan_etype;
+	__be16	vlan_tci;
+	__be32	data[2];
+};
+
 /**
  * struct ethtool_rx_flow_spec - specification for RX flow filter
  * @flow_type: Type of match to perform, e.g. %TCP_V4_FLOW
  * @h_u: Flow fields to match (dependent on @flow_type)
- * @m_u: Masks for flow field bits to be ignored
+ * @h_ext: Additional fields to match
+ * @m_u: Masks for flow field bits to be matched
+ * @m_ext: Masks for additional field bits to be matched
+ *	Note, all additional fields must be ignored unless @flow_type
+ *	includes the %FLOW_EXT flag.
  * @ring_cookie: RX ring/queue index to deliver to, or %RX_CLS_FLOW_DISC
  *	if packets should be discarded
  * @location: Index of filter in hardware table
  */
 struct ethtool_rx_flow_spec {
 	__u32		flow_type;
-	union {
-		struct ethtool_tcpip4_spec		tcp_ip4_spec;
-		struct ethtool_tcpip4_spec		udp_ip4_spec;
-		struct ethtool_tcpip4_spec		sctp_ip4_spec;
-		struct ethtool_ah_espip4_spec		ah_ip4_spec;
-		struct ethtool_ah_espip4_spec		esp_ip4_spec;
-		struct ethtool_usrip4_spec		usr_ip4_spec;
-		struct ethhdr				ether_spec;
-		__u8					hdata[72];
-	} h_u, m_u;
+	union ethtool_flow_union h_u;
+	struct ethtool_flow_ext h_ext;
+	union ethtool_flow_union m_u;
+	struct ethtool_flow_ext m_ext;
 	__u64		ring_cookie;
 	__u32		location;
 };
@@ -458,16 +473,10 @@ struct ethtool_rxnfc {
 
 struct compat_ethtool_rx_flow_spec {
 	u32		flow_type;
-	union {
-		struct ethtool_tcpip4_spec		tcp_ip4_spec;
-		struct ethtool_tcpip4_spec		udp_ip4_spec;
-		struct ethtool_tcpip4_spec		sctp_ip4_spec;
-		struct ethtool_ah_espip4_spec		ah_ip4_spec;
-		struct ethtool_ah_espip4_spec		esp_ip4_spec;
-		struct ethtool_usrip4_spec		usr_ip4_spec;
-		struct ethhdr				ether_spec;
-		u8					hdata[72];
-	} h_u, m_u;
+	union ethtool_flow_union h_u;
+	struct ethtool_flow_ext h_ext;
+	union ethtool_flow_union m_u;
+	struct ethtool_flow_ext m_ext;
 	compat_u64	ring_cookie;
 	u32		location;
 };
@@ -1072,6 +1081,8 @@ struct ethtool_ops {
 #define	IPV4_FLOW	0x10	/* hash only */
 #define	IPV6_FLOW	0x11	/* hash only */
 #define	ETHER_FLOW	0x12	/* spec only (ether_spec) */
+/* Flag to enable additional fields in struct ethtool_rx_flow_spec */
+#define	FLOW_EXT	0x80000000
 
 /* L3-L4 network traffic flow hash options */
 #define	RXH_L2DA	(1 << 1)
diff --git a/net/socket.c b/net/socket.c
index 5212447..575c84f 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2643,13 +2643,13 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32)
 		return -EFAULT;
 
 	if (convert_in) {
-		/* We expect there to be holes between fs.m_u and
+		/* We expect there to be holes between fs.m_ext and
 		 * fs.ring_cookie and at the end of fs, but nowhere else.
 		 */
-		BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_u) +
-			     sizeof(compat_rxnfc->fs.m_u) !=
-			     offsetof(struct ethtool_rxnfc, fs.m_u) +
-			     sizeof(rxnfc->fs.m_u));
+		BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_ext) +
+			     sizeof(compat_rxnfc->fs.m_ext) !=
+			     offsetof(struct ethtool_rxnfc, fs.m_ext) +
+			     sizeof(rxnfc->fs.m_ext));
 		BUILD_BUG_ON(
 			offsetof(struct compat_ethtool_rxnfc, fs.location) -
 			offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) !=
@@ -2657,7 +2657,7 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32)
 			offsetof(struct ethtool_rxnfc, fs.ring_cookie));
 
 		if (copy_in_user(rxnfc, compat_rxnfc,
-				 (void *)(&rxnfc->fs.m_u + 1) -
+				 (void *)(&rxnfc->fs.m_ext + 1) -
 				 (void *)rxnfc) ||
 		    copy_in_user(&rxnfc->fs.ring_cookie,
 				 &compat_rxnfc->fs.ring_cookie,
@@ -2674,7 +2674,7 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32)
 
 	if (convert_out) {
 		if (copy_in_user(compat_rxnfc, rxnfc,
-				 (const void *)(&rxnfc->fs.m_u + 1) -
+				 (const void *)(&rxnfc->fs.m_ext + 1) -
 				 (const void *)rxnfc) ||
 		    copy_in_user(&compat_rxnfc->fs.ring_cookie,
 				 &rxnfc->fs.ring_cookie,


^ permalink raw reply related

* RE: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel
From: Wei Gu @ 2011-04-09  3:51 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Eric Dumazet, Alexander Duyck, netdev, Kirsher, Jeffrey T
In-Reply-To: <20110408074902.2bd10e6b@nehalam>

Hi Stephen,
Thanks for your reply.
I am aware that using the local bus and local CPU socket gains about 20% performance; that is why I put eth10 on cores 24-31 (socket 3) and NUMA node 2/3.

Thanks
WeiGu

-----Original Message-----
From: Stephen Hemminger [mailto:shemminger@vyatta.com]
Sent: Friday, April 08, 2011 10:49 PM
To: Wei Gu
Cc: Eric Dumazet; Alexander Duyck; netdev; Kirsher, Jeffrey T
Subject: Re: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel

On Fri, 8 Apr 2011 22:10:50 +0800
Wei Gu <wei.gu@ericsson.com> wrote:

> Hi,
> Got you mean.
> But as I decribed before, I start the eth10 with 8 rx queues and 8 tx
> queues, and then I binding these 8 tx&rx queue each to CPU core 24-32
> (NUMA3), which I think could gain the best performance in my case
> (It's true on Linux 2.6.32) single queue ->single CPU Then I can
> descibe a little bit with packet generator, I config the IXIA to
> continues increase the dest ip address towards the test server, so the
> packet was evenly distributed to each receving queues of the eth10.
> And according the IXIA tools the transmit sharp was really good, no
> too much peaks
>
> What I observed on Linux 2.6.38 during the test, there is no softqd
> was stressed (< 03% on SI for each core(24-31)) while the packet lost
> happens, so we are not really stress the CPU:), It looks like we are
> limited  on some memory bandwidth (DMA) on this release
>
> And with same test case on 2.6.32, no such problem at all. It running
> pretty stable > 2Mpps without rx_missing_error. There is no HW
> limitation on this DL580
>
>
> BTW what is these "swapper"
> +      0.80%          swapper  [ixgbe]                    [k] ixgbe_poll
> +      0.79%             perf  [ixgbe]                    [k] ixgbe_poll
> Why the ixgbe_poll was on swapper/perf?
>
> Thanks
> WeiGu
>
> -----Original Message-----
> From: Eric Dumazet [mailto:eric.dumazet@gmail.com]
> Sent: Friday, April 08, 2011 8:57 PM
> To: Wei Gu
> Cc: Alexander Duyck; netdev; Kirsher, Jeffrey T
> Subject: RE: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel
>
> Le vendredi 08 avril 2011 à 20:19 +0800, Wei Gu a écrit :
> > Hi again,
> > I tried more testing with by disable this CONFIG_DMAR with shipped
> > 2.6.38 ixgbe and Intel released 3.2.10/3.1.15.
> > All these test looks we can get >1Mpps 400bype packtes but not
> > stable at all, there will huge number missing errors with 100% CPU IDLE:
> > ethtool -S eth10 |grep rx_missed_errors
> >
> >         rx_missed_errors: 76832040
> >
> > SUM: 1102212 ETH8: 0  ETH10: 1102212 ETH6: 0 ETH4: 0
> > SUM: 521841 ETH8: 0  ETH10: 521841 ETH6: 0 ETH4: 0
> > SUM: 426776 ETH8: 0  ETH10: 426776 ETH6: 0 ETH4: 0
> > SUM: 927520 ETH8: 0  ETH10: 927520 ETH6: 0 ETH4: 0
> > SUM: 1171995 ETH8: 0  ETH10: 1171995 ETH6: 0 ETH4: 0
> > SUM: 855980 ETH8: 0  ETH10: 855980 ETH6: 0 ETH4: 0
> >
> >
> > Do you know if there is other options in the kernel will cause high
> > rate rx_missed_errors with low CPU usage. (No problem on 2.6.32 with
> > same test case)
> >
> > perf  record:
> > +     69.74%          swapper  [kernel.kallsyms]          [k] poll_idle
> > +     11.62%          swapper  [kernel.kallsyms]          [k] intel_idle
> > +      0.80%          swapper  [ixgbe]                    [k] ixgbe_poll
> > +      0.79%             perf  [ixgbe]                    [k] ixgbe_poll
> > +      0.77%             perf  [kernel.kallsyms]          [k] skb_copy_bits
> > +      0.64%          swapper  [kernel.kallsyms]          [k] skb_copy_bits
> > +      0.48%             perf  [kernel.kallsyms]          [k] __kmalloc_node_track_caller
> > +      0.44%          swapper  [kernel.kallsyms]          [k] __kmalloc_node_track_caller
> > +      0.36%          swapper  [kernel.kallsyms]          [k] kmem_cache_alloc_node
> > +      0.35%          swapper  [kernel.kallsyms]          [k] kfree
> > +      0.35%             perf  [kernel.kallsyms]          [k] kmem_cache_alloc_node
> >
>
>
> Make sure enough cpus serves interrupts, _before_ even starting your stress test.
>
> Then, make sure trafic is distributed to many different queues.
> If a single flow is used, it probably uses a single queue ->single CPU.
>
> Say you have irq affinities set to fffffffffffff  (all cpus able to
> serve IRQ X,Y,Z,T,...)
>
> Then you have a network burst (because you start your packet generator at full rate), spreaded on many queues.
>
> CPU0 takes hard interrupt for queue 0, eth8, and queues NAPI mode.
> CPU0 takes hard interrupt for queue 0, eth10, and queues NAPI mode.
> CPU0 takes hard interrupt for queue 1, eth8, and queues NAPI mode.
> CPU0 takes hard interrupt for queue 1, eth10, and queues NAPI mode.
> CPU0 takes hard interrupt for queue 2, eth8, and queues NAPI mode.
> CPU0 takes hard interrupt for queue 2, eth10, and queues NAPI mode.
> ...
> CPU0 takes hard interrupt for queue X, eth8, and queues NAPI mode.
> ...
>
> Then softirq can start, and only CPU0 is able to handle NAPI for all the queued devices. You are stuck, with CPU0 never leaving ksoftirqd.
>
> NAPI handling is always performed on the CPU that received the hardware interrupt, until we exit NAPI (and rearm interrupt delivery).
> It cannot migrate to an "idle cpu"

For performance, you need to assign each network interrupt to a single CPU. There is no load balancing effect in the IRQ controller.

If you have a multi-socket system, then it is a good idea to make the IRQ's for the NIC's be on the same socket as the bus interface. Multi socket systems are really NUMA and putting IRQ on non-local CPU has measurable impact.



--

^ permalink raw reply

* RE: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel
From: Wei Gu @ 2011-04-09  3:36 UTC (permalink / raw)
  To: Alexander Duyck; +Cc: Eric Dumazet, netdev, Kirsher, Jeffrey T
In-Reply-To: <4D9F3629.9020506@intel.com>

Hi Alexander,
I agree with you that seeing only rx_missed_errors incrementing (with rx_no_buffer_count: 0) indicates a memory bandwidth issue. But the strange thing is that I am using the same test configuration on Linux 2.6.32, which does not show this problem at all. So it is not a HW setup problem; the only difference is the kernel version. That is why I came back to you about the new Linux 2.6.38: could it affect the memory bandwidth, or is it related to BIOS settings or similar?

The following dump was taken while I was trying to receive 290 Kpps of 400-byte packets from the IXIA and drop them in the prerouting hook.
I bound the 8 RX queues of eth10 to CPU socket ID 3 (cores 24-31) on NUMA node 3.

ethtool -i eth10
driver: ixgbe
version: 3.2.10-NAPI
firmware-version: 0.9-3
bus-info: 0000:8d:00.0

ethtool -S eth10
NIC statistics:
     rx_packets: 14222510
     tx_packets: 109
     rx_bytes: 5575223920
     tx_bytes: 17790
     rx_errors: 0
     tx_errors: 0
     rx_dropped: 0
     tx_dropped: 0
     multicast: 0
     collisions: 0
     rx_over_errors: 0
     rx_crc_errors: 0
     rx_frame_errors: 0
     rx_fifo_errors: 0
     rx_missed_errors: 15150244
     tx_aborted_errors: 0
     tx_carrier_errors: 0
     tx_fifo_errors: 0
     tx_heartbeat_errors: 0
     rx_pkts_nic: 14226186
     tx_pkts_nic: 109
     rx_bytes_nic: 11750580400
     tx_bytes_nic: 18642
     lsc_int: 2
     tx_busy: 0
     non_eop_descs: 0
     broadcast: 0
     rx_no_buffer_count: 0
     tx_timeout_count: 0
     tx_restart_queue: 0
     rx_long_length_errors: 0
     rx_short_length_errors: 0
     tx_flow_control_xon: 0
     rx_flow_control_xon: 0
     tx_flow_control_xoff: 0
     rx_flow_control_xoff: 0
     rx_csum_offload_errors: 0
     low_latency_interrupt: 0
     alloc_rx_page_failed: 0
     alloc_rx_buff_failed: 0
     lro_aggregated: 0
     lro_flushed: 0
     lro_recycled: 0
     rx_no_dma_resources: 0
     hw_rsc_aggregated: 0
     hw_rsc_flushed: 0
     rx_flm: 0
     fdir_match: 0
     fdir_miss: 0
     fdir_overflow: 0
     fcoe_bad_fccrc: 0
     fcoe_last_errors: 0
     rx_fcoe_dropped: 0
     rx_fcoe_packets: 0
     rx_fcoe_dwords: 0
     tx_fcoe_packets: 0
     tx_fcoe_dwords: 0
     tx_queue_0_packets: 0
     tx_queue_0_bytes: 0
     tx_queue_1_packets: 10
     tx_queue_1_bytes: 540
     tx_queue_2_packets: 0
     tx_queue_2_bytes: 0
     tx_queue_3_packets: 0
     tx_queue_3_bytes: 0
     tx_queue_4_packets: 30
     tx_queue_4_bytes: 2340
     tx_queue_5_packets: 4
     tx_queue_5_bytes: 1368
     tx_queue_6_packets: 65
     tx_queue_6_bytes: 13542
     tx_queue_7_packets: 0
     tx_queue_7_bytes: 0
     rx_queue_0_packets: 1777898
     rx_queue_0_bytes: 696936016
     rx_queue_1_packets: 1777207
     rx_queue_1_bytes: 696665144
     rx_queue_2_packets: 1778379
     rx_queue_2_bytes: 697124568
     rx_queue_3_packets: 1777891
     rx_queue_3_bytes: 696933272
     rx_queue_4_packets: 1777050
     rx_queue_4_bytes: 696603600
     rx_queue_5_packets: 1777915
     rx_queue_5_bytes: 696942680
     rx_queue_6_packets: 1778737
     rx_queue_6_bytes: 697264904
     rx_queue_7_packets: 1778391
     rx_queue_7_bytes: 697129272

Lspci dump:

00:00.0 Host bridge: Intel Corporation 5520/5500/X58 I/O Hub to ESI Port (rev 22)
00:01.0 PCI bridge: Intel Corporation 5520/5500/X58 I/O Hub PCI Express Root Port 1 (rev 22)
00:02.0 PCI bridge: Intel Corporation 5520/5500/X58 I/O Hub PCI Express Root Port 2 (rev 22)
00:03.0 PCI bridge: Intel Corporation 5520/5500/X58 I/O Hub PCI Express Root Port 3 (rev 22)
00:04.0 PCI bridge: Intel Corporation 5520/X58 I/O Hub PCI Express Root Port 4 (rev 22)
00:05.0 PCI bridge: Intel Corporation 5520/X58 I/O Hub PCI Express Root Port 5 (rev 22)
00:06.0 PCI bridge: Intel Corporation 5520/X58 I/O Hub PCI Express Root Port 6 (rev 22)
00:07.0 PCI bridge: Intel Corporation 5520/5500/X58 I/O Hub PCI Express Root Port 7 (rev 22)
00:08.0 PCI bridge: Intel Corporation 5520/5500/X58 I/O Hub PCI Express Root Port 8 (rev 22)
00:09.0 PCI bridge: Intel Corporation 5520/5500/X58 I/O Hub PCI Express Root Port 9 (rev 22)
00:0a.0 PCI bridge: Intel Corporation 5520/5500/X58 I/O Hub PCI Express Root Port 10 (rev 22)
00:14.0 PIC: Intel Corporation 5520/5500/X58 I/O Hub System Management Registers (rev 22)
00:14.1 PIC: Intel Corporation 5520/5500/X58 I/O Hub GPIO and Scratch Pad Registers (rev 22)
00:14.2 PIC: Intel Corporation 5520/5500/X58 I/O Hub Control Status and RAS Registers (rev 22)
00:1c.0 PCI bridge: Intel Corporation 82801JI (ICH10 Family) PCI Express Root Port 1
00:1c.4 PCI bridge: Intel Corporation 82801JI (ICH10 Family) PCI Express Root Port 5
00:1d.0 USB Controller: Intel Corporation 82801JI (ICH10 Family) USB UHCI Controller #1
00:1d.1 USB Controller: Intel Corporation 82801JI (ICH10 Family) USB UHCI Controller #2
00:1d.2 USB Controller: Intel Corporation 82801JI (ICH10 Family) USB UHCI Controller #3
00:1d.3 USB Controller: Intel Corporation 82801JI (ICH10 Family) USB UHCI Controller #6
00:1d.7 USB Controller: Intel Corporation 82801JI (ICH10 Family) USB2 EHCI Controller #1
00:1e.0 PCI bridge: Intel Corporation 82801 PCI Bridge (rev 90)
00:1f.0 ISA bridge: Intel Corporation 82801JIB (ICH10) LPC Interface Controller
00:1f.2 IDE interface: Intel Corporation 82801JI (ICH10 Family) 4 port SATA IDE Controller #1
01:03.0 VGA compatible controller: ATI Technologies Inc ES1000 (rev 02)
02:00.0 System peripheral: Hewlett-Packard Company iLO3 Slave instrumentation & System support (rev 04)
02:00.2 System peripheral: Hewlett-Packard Company iLO3 Management Processor Support and Messaging (rev 04)
02:00.4 USB Controller: Hewlett-Packard Company Proliant iLO2/iLO3 virtual USB controller (rev 01)
03:00.0 RAID bus controller: Hewlett-Packard Company Smart Array G6 controllers (rev 01)
04:00.0 Ethernet controller: NetXen Incorporated NX3031 Multifunction 1/10-Gigabit Server Adapter (rev 42)
04:00.1 Ethernet controller: NetXen Incorporated NX3031 Multifunction 1/10-Gigabit Server Adapter (rev 42)
04:00.2 Ethernet controller: NetXen Incorporated NX3031 Multifunction 1/10-Gigabit Server Adapter (rev 42)
04:00.3 Ethernet controller: NetXen Incorporated NX3031 Multifunction 1/10-Gigabit Server Adapter (rev 42)
0b:00.0 Ethernet controller: Intel Corporation 82599EB 10-Gigabit Network Connection (rev 01)
0b:00.1 Ethernet controller: Intel Corporation 82599EB 10-Gigabit Network Connection (rev 01)
11:00.0 Ethernet controller: Intel Corporation 82599EB 10-Gigabit Network Connection (rev 01)
11:00.1 Ethernet controller: Intel Corporation 82599EB 10-Gigabit Network Connection (rev 01)
80:00.0 PCI bridge: Intel Corporation 5500 Non-Legacy I/O Hub PCI Express Root Port 0 (rev 22)
80:01.0 PCI bridge: Intel Corporation 5520/5500/X58 I/O Hub PCI Express Root Port 1 (rev 22)
80:02.0 PCI bridge: Intel Corporation 5520/5500/X58 I/O Hub PCI Express Root Port 2 (rev 22)
80:03.0 PCI bridge: Intel Corporation 5520/5500/X58 I/O Hub PCI Express Root Port 3 (rev 22)
80:04.0 PCI bridge: Intel Corporation 5520/X58 I/O Hub PCI Express Root Port 4 (rev 22)
80:05.0 PCI bridge: Intel Corporation 5520/X58 I/O Hub PCI Express Root Port 5 (rev 22)
80:06.0 PCI bridge: Intel Corporation 5520/X58 I/O Hub PCI Express Root Port 6 (rev 22)
80:07.0 PCI bridge: Intel Corporation 5520/5500/X58 I/O Hub PCI Express Root Port 7 (rev 22)
80:08.0 PCI bridge: Intel Corporation 5520/5500/X58 I/O Hub PCI Express Root Port 8 (rev 22)
80:09.0 PCI bridge: Intel Corporation 5520/5500/X58 I/O Hub PCI Express Root Port 9 (rev 22)
80:0a.0 PCI bridge: Intel Corporation 5520/5500/X58 I/O Hub PCI Express Root Port 10 (rev 22)
80:14.0 PIC: Intel Corporation 5520/5500/X58 I/O Hub System Management Registers (rev 22)
80:14.1 PIC: Intel Corporation 5520/5500/X58 I/O Hub GPIO and Scratch Pad Registers (rev 22)
80:14.2 PIC: Intel Corporation 5520/5500/X58 I/O Hub Control Status and RAS Registers (rev 22)
8d:00.0 Ethernet controller: Intel Corporation 82599EB 10-Gigabit Network Connection (rev 01)
8d:00.1 Ethernet controller: Intel Corporation 82599EB 10-Gigabit Network Connection (rev 01)
90:00.0 Ethernet controller: Intel Corporation 82599EB 10-Gigabit Network Connection (rev 01)
90:00.1 Ethernet controller: Intel Corporation 82599EB 10-Gigabit Network Connection (rev 01)

-----Original Message-----
From: Alexander Duyck [mailto:alexander.h.duyck@intel.com]
Sent: Saturday, April 09, 2011 12:22 AM
To: Wei Gu
Cc: Eric Dumazet; netdev; Kirsher, Jeffrey T
Subject: Re: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel

On 4/8/2011 5:19 AM, Wei Gu wrote:
> Hi again,
> I tried more testing with by disable this CONFIG_DMAR with shipped 2.6.38 ixgbe and Intel released 3.2.10/3.1.15.
> All these test looks we can get>1Mpps 400bype packtes but not stable at all, there will huge number missing errors with 100% CPU IDLE:
> ethtool -S eth10 |grep rx_missed_errors
>
>          rx_missed_errors: 76832040
>
> SUM: 1102212 ETH8: 0  ETH10: 1102212 ETH6: 0 ETH4: 0
> SUM: 521841 ETH8: 0  ETH10: 521841 ETH6: 0 ETH4: 0
> SUM: 426776 ETH8: 0  ETH10: 426776 ETH6: 0 ETH4: 0
> SUM: 927520 ETH8: 0  ETH10: 927520 ETH6: 0 ETH4: 0
> SUM: 1171995 ETH8: 0  ETH10: 1171995 ETH6: 0 ETH4: 0
> SUM: 855980 ETH8: 0  ETH10: 855980 ETH6: 0 ETH4: 0
>
>
> Do you know if there is other options in the kernel will cause high
> rate rx_missed_errors with low CPU usage. (No problem on 2.6.32 with
> same test case)
>
> perf  record:
> +     69.74%          swapper  [kernel.kallsyms]          [k] poll_idle
> +     11.62%          swapper  [kernel.kallsyms]          [k] intel_idle
> +      0.80%          swapper  [ixgbe]                    [k] ixgbe_poll
> +      0.79%             perf  [ixgbe]                    [k] ixgbe_poll
> +      0.77%             perf  [kernel.kallsyms]          [k] skb_copy_bits
> +      0.64%          swapper  [kernel.kallsyms]          [k] skb_copy_bits
> +      0.48%             perf  [kernel.kallsyms]          [k] __kmalloc_node_track_caller
> +      0.44%          swapper  [kernel.kallsyms]          [k] __kmalloc_node_track_caller
> +      0.36%          swapper  [kernel.kallsyms]          [k] kmem_cache_alloc_node
> +      0.35%          swapper  [kernel.kallsyms]          [k] kfree
> +      0.35%             perf  [kernel.kallsyms]          [k] kmem_cache_alloc_node

I was wondering if you could dump all of your ethtool stats instead of just the rx_missed_errors as this will provide us with much more info to work with.

I'm mainly interested in seeing if the rx_no_buffer_count is incrementing as well.  If it is not then what you may be seeing is a bus bandwidth issue depending on what slot you are in.

Also if you could provide an lspci dump for the part that would also give us some additional information on your PCIe bus configuration.

Thanks,

Alex

^ permalink raw reply

* RE: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel
From: Wei Gu @ 2011-04-09  3:27 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Alexander Duyck, netdev, Kirsher, Jeffrey T
In-Reply-To: <1302275223.4409.36.camel@edumazet-laptop>

HI Eric,
If I try to bind the 8 tx&rx queues to different NUMA nodes (cores 3,7,11,15,19,23,27,31), it still doesn't seem to help with the rx_missed_errors.

I still think the best performance would come from binding the NIC to one CPU socket with its local memory node.
I tried a lot of combinations on the 2.6.32 kernel; binding eth10 to NODE2/3 gained 20% more performance compared to NODE0/1.
So I guess CPU sockets 2 and 3 are local to eth10.

Thanks
WeiGu

-----Original Message-----
From: Eric Dumazet [mailto:eric.dumazet@gmail.com]
Sent: Friday, April 08, 2011 11:07 PM
To: Wei Gu
Cc: Alexander Duyck; netdev; Kirsher, Jeffrey T
Subject: RE: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel

Le vendredi 08 avril 2011 à 22:10 +0800, Wei Gu a écrit :
> Hi,
> Got you mean.
> But as I decribed before, I start the eth10 with 8 rx queues and 8 tx
> queues, and then I binding these 8 tx&rx queue each to CPU core 24-32
> (NUMA3), which I think could gain the best performance in my case
> (It's true on Linux 2.6.32) single queue ->single CPU

Try with other cpus ? Maybe a mix.

Maybe your thinking is not good, and you chose the cpus that were not the best candidates. This was OK in 2.6.32 because you were lucky.

Using cpus from a single NUMA node is not very good, since only one NUMA node is going to be used, and the other NUMA nodes are idle.

NUMA binding is tricky. Linux tries to use the local node, hoping that all cpus are running and use local memory. In the end, global throughput is better.

But if your workload uses cpus from one single node, then it means you lose part of the memory bandwidth.

> Then I can descibe a little bit with packet generator, I config the
> IXIA to continues increase the dest ip address towards the test
> server, so the packet was evenly distributed to each receving queues
> of the eth10. And according the IXIA tools the transmit sharp was
> really good, no too much peaks
>
> What I observed on Linux 2.6.38 during the test, there is no softqd
> was stressed (< 03% on SI for each core(24-31)) while the packet lost
> happens, so we are not really stress the CPU:), It looks like we are
> limited  on some memory bandwidth (DMA) on this release

That would mean you chose the wrong cpus to handle this load.

>
> And with same test case on 2.6.32, no such problem at all. It running
> pretty stable > 2Mpps without rx_missing_error. There is no HW
> limitation on this DL580
>
>
> BTW what is these "swapper"
> +      0.80%          swapper  [ixgbe]                    [k]
> ixgbe_poll
> +      0.79%             perf  [ixgbe]                    [k]
> ixgbe_poll
> Why the ixgbe_poll was on swapper/perf?
>

softirqs are run on behalf of the currently interrupted thread, unless load is high and processing is handed over to ksoftirqd.

It can be "idle task" or the "perf" task, or another ones...

^ permalink raw reply

* Re: [net-next-2.6 PATCH] v2 ethtool: add ntuple flow specifier data to network flow classifier
From: Ben Hutchings @ 2011-04-09  0:13 UTC (permalink / raw)
  To: Alexander Duyck; +Cc: davem, jeffrey.t.kirsher, netdev, Santwona Behera
In-Reply-To: <20110408223451.27132.31805.stgit@gitlad.jf.intel.com>

On Fri, 2011-04-08 at 15:34 -0700, Alexander Duyck wrote:
> This change is meant to add an ntuple data extensions to the rx network flow
> classification specifiers.  The idea is to allow ntuple to be displayed via
> the network flow classification interface.
[...]
>  /**
>   * struct ethtool_rx_flow_spec - specification for RX flow filter
>   * @flow_type: Type of match to perform, e.g. %TCP_V4_FLOW
>   * @h_u: Flow fields to match (dependent on @flow_type)
> + * @h_ext: Additional fields to match
>   * @m_u: Masks for flow field bits to be ignored
> + * @m_ext: Masks for additional field bits to be ignored.
> + *	Note, all additional fields must be ignored unless @flow_type
> + *	includes the %FLOW_EXT flag.
[...]

Sorry I didn't bring this up against v1:

I think you worked out that these masks are interpreted as bits to be
matched, rather than bits to be ignored, in the existing implementation
in niu.  Looking again at niu, I think that you were right about that.
So please fix the comment while you're updating it.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* [RFC] ethtool: remove phys_id from ethtool_ops
From: Stephen Hemminger @ 2011-04-09  0:08 UTC (permalink / raw)
  To: Ben Hutchings, David Miller; +Cc: netdev

Hold this patch until after the Intel and Qlogic driver
changes are merged into net-next. Patches have been submitted
but still waiting for vendor.

After that, all the upstream kernel drivers will use set_phys_id,
and the old ethtool_ops interface (phys_id) can be removed.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

--- a/include/linux/ethtool.h	2011-04-08 16:59:55.691452941 -0700
+++ b/include/linux/ethtool.h	2011-04-08 17:00:26.135748450 -0700
@@ -767,12 +767,6 @@ bool ethtool_invalid_flags(struct net_de
  *	the indicator accordingly.  Finally, it is called with the argument
  *	%ETHTOOL_ID_INACTIVE and must deactivate the indicator.  Returns a
  *	negative error code or zero.
- * @phys_id: Deprecated in favour of @set_phys_id.
- *	Identify the physical device, e.g. by flashing an LED
- *	attached to it until interrupted by a signal or the given time
- *	(in seconds) elapses.  If the given time is zero, use a default
- *	time limit.  Returns a negative error code or zero.  Being
- *	interrupted by a signal is not an error.
  * @get_ethtool_stats: Return extended statistics about the device.
  *	This is only useful if the device maintains statistics not
  *	included in &struct rtnl_link_stats64.
@@ -858,7 +852,6 @@ struct ethtool_ops {
 	void	(*self_test)(struct net_device *, struct ethtool_test *, u64 *);
 	void	(*get_strings)(struct net_device *, u32 stringset, u8 *);
 	int	(*set_phys_id)(struct net_device *, enum ethtool_phys_id_state);
-	int	(*phys_id)(struct net_device *, u32);
 	void	(*get_ethtool_stats)(struct net_device *,
 				     struct ethtool_stats *, u64 *);
 	int	(*begin)(struct net_device *);
--- a/net/core/ethtool.c	2011-04-08 17:00:33.259817585 -0700
+++ b/net/core/ethtool.c	2011-04-08 17:01:01.620092913 -0700
@@ -1623,7 +1623,7 @@ static int ethtool_phys_id(struct net_de
 	static bool busy;
 	int rc;
 
-	if (!dev->ethtool_ops->set_phys_id && !dev->ethtool_ops->phys_id)
+	if (!dev->ethtool_ops->set_phys_id)
 		return -EOPNOTSUPP;
 
 	if (busy)
@@ -1632,10 +1632,6 @@ static int ethtool_phys_id(struct net_de
 	if (copy_from_user(&id, useraddr, sizeof(id)))
 		return -EFAULT;
 
-	if (!dev->ethtool_ops->set_phys_id)
-		/* Do it the old way */
-		return dev->ethtool_ops->phys_id(dev, id.data);
-
 	rc = dev->ethtool_ops->set_phys_id(dev, ETHTOOL_ID_ACTIVE);
 	if (rc && rc != -EINVAL)
 		return rc;


^ permalink raw reply

* [PATCH net-next-2.6 2/2] niu: Recognise original ethtool class code for AH/ESP flow hashing
From: Ben Hutchings @ 2011-04-08 23:49 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Alexander Duyck, Santwona Behera, Jeff Kirsher

When the RX network flow classification interface was originally
defined for reporting and controlling of flow hashing, AH and ESP were
not given distinct flow class codes (apparently because the Sun
Neptune hardware treats them very similarly).

For flow steering, they must be distinguished, so new and separate
flow class codes were added for AH and ESP.  But for backward-
compatibility, flow hash operations should continue to support the
original class codes.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/niu.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/drivers/net/niu.c b/drivers/net/niu.c
index 681a42c..dedca01 100644
--- a/drivers/net/niu.c
+++ b/drivers/net/niu.c
@@ -7022,6 +7022,7 @@ static int niu_ethflow_to_class(int flow_type, u64 *class)
 	case UDP_V4_FLOW:
 		*class = CLASS_CODE_UDP_IPV4;
 		break;
+	case AH_ESP_V4_FLOW:
 	case AH_V4_FLOW:
 	case ESP_V4_FLOW:
 		*class = CLASS_CODE_AH_ESP_IPV4;
@@ -7035,6 +7036,7 @@ static int niu_ethflow_to_class(int flow_type, u64 *class)
 	case UDP_V6_FLOW:
 		*class = CLASS_CODE_UDP_IPV6;
 		break;
+	case AH_ESP_V6_FLOW:
 	case AH_V6_FLOW:
 	case ESP_V6_FLOW:
 		*class = CLASS_CODE_AH_ESP_IPV6;
-- 
1.7.4


-- 
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply related

* [PATCH net-next-2.6 1/2] gianfar: Clean up implementation of RX network flow classification
From: Ben Hutchings @ 2011-04-08 23:45 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, ppc-dev

This code was cribbed from niu, so gfar_set_hash_opts() begins by
converting the ethtool flow class code into a class code for Sun
Neptune hardware, then does the same thing again for the hardware it's
really dealing with.  It may also return -1 (-EPERM) for some
unhandled ethtool flow class codes.

Remove the useless code and definitions, and fix the error code.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
This isn't even compile-tested, since it can only be built for some
PowerPC SoCs.  Could someone on ppc-dev check that this won't break the
driver?

Ben.

 drivers/net/gianfar.h         |   17 -------------
 drivers/net/gianfar_ethtool.c |   52 +----------------------------------------
 2 files changed, 1 insertions(+), 68 deletions(-)

diff --git a/drivers/net/gianfar.h b/drivers/net/gianfar.h
index ec5d595..57ee3b0 100644
--- a/drivers/net/gianfar.h
+++ b/drivers/net/gianfar.h
@@ -382,23 +382,6 @@ extern const char gfar_driver_version[];
 #define BD_LFLAG(flags) ((flags) << 16)
 #define BD_LENGTH_MASK		0x0000ffff
 
-#define CLASS_CODE_UNRECOG		0x00
-#define CLASS_CODE_DUMMY1		0x01
-#define CLASS_CODE_ETHERTYPE1		0x02
-#define CLASS_CODE_ETHERTYPE2		0x03
-#define CLASS_CODE_USER_PROG1		0x04
-#define CLASS_CODE_USER_PROG2		0x05
-#define CLASS_CODE_USER_PROG3		0x06
-#define CLASS_CODE_USER_PROG4		0x07
-#define CLASS_CODE_TCP_IPV4		0x08
-#define CLASS_CODE_UDP_IPV4		0x09
-#define CLASS_CODE_AH_ESP_IPV4		0x0a
-#define CLASS_CODE_SCTP_IPV4		0x0b
-#define CLASS_CODE_TCP_IPV6		0x0c
-#define CLASS_CODE_UDP_IPV6		0x0d
-#define CLASS_CODE_AH_ESP_IPV6		0x0e
-#define CLASS_CODE_SCTP_IPV6		0x0f
-
 #define FPR_FILER_MASK	0xFFFFFFFF
 #define MAX_FILER_IDX	0xFF
 
diff --git a/drivers/net/gianfar_ethtool.c b/drivers/net/gianfar_ethtool.c
index 3bc8e27..0840590 100644
--- a/drivers/net/gianfar_ethtool.c
+++ b/drivers/net/gianfar_ethtool.c
@@ -645,42 +645,6 @@ static int gfar_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
 }
 #endif
 
-static int gfar_ethflow_to_class(int flow_type, u64 *class)
-{
-	switch (flow_type) {
-	case TCP_V4_FLOW:
-		*class = CLASS_CODE_TCP_IPV4;
-		break;
-	case UDP_V4_FLOW:
-		*class = CLASS_CODE_UDP_IPV4;
-		break;
-	case AH_V4_FLOW:
-	case ESP_V4_FLOW:
-		*class = CLASS_CODE_AH_ESP_IPV4;
-		break;
-	case SCTP_V4_FLOW:
-		*class = CLASS_CODE_SCTP_IPV4;
-		break;
-	case TCP_V6_FLOW:
-		*class = CLASS_CODE_TCP_IPV6;
-		break;
-	case UDP_V6_FLOW:
-		*class = CLASS_CODE_UDP_IPV6;
-		break;
-	case AH_V6_FLOW:
-	case ESP_V6_FLOW:
-		*class = CLASS_CODE_AH_ESP_IPV6;
-		break;
-	case SCTP_V6_FLOW:
-		*class = CLASS_CODE_SCTP_IPV6;
-		break;
-	default:
-		return 0;
-	}
-
-	return 1;
-}
-
 static void ethflow_to_filer_rules (struct gfar_private *priv, u64 ethflow)
 {
 	u32 fcr = 0x0, fpr = FPR_FILER_MASK;
@@ -778,11 +742,6 @@ static int gfar_ethflow_to_filer_table(struct gfar_private *priv, u64 ethflow, u
 	case UDP_V6_FLOW:
 		cmp_rqfpr = RQFPR_IPV6 |RQFPR_UDP;
 		break;
-	case IPV4_FLOW:
-		cmp_rqfpr = RQFPR_IPV4;
-	case IPV6_FLOW:
-		cmp_rqfpr = RQFPR_IPV6;
-		break;
 	default:
 		printk(KERN_ERR "Right now this class is not supported\n");
 		return 0;
@@ -848,18 +807,9 @@ static int gfar_ethflow_to_filer_table(struct gfar_private *priv, u64 ethflow, u
 
 static int gfar_set_hash_opts(struct gfar_private *priv, struct ethtool_rxnfc *cmd)
 {
-	u64 class;
-
-	if (!gfar_ethflow_to_class(cmd->flow_type, &class))
-		return -EINVAL;
-
-	if (class < CLASS_CODE_USER_PROG1 ||
-			class > CLASS_CODE_SCTP_IPV6)
-		return -EINVAL;
-
 	/* write the filer rules here */
 	if (!gfar_ethflow_to_filer_table(priv, cmd->data, cmd->flow_type))
-		return -1;
+		return -EINVAL;
 
 	return 0;
 }
-- 
1.7.4



-- 
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply related

* Re: [ethtool PATCH 2/2] Add support for ESP as a separate protocol from AH
From: Ben Hutchings @ 2011-04-08 23:10 UTC (permalink / raw)
  To: Alexander Duyck; +Cc: jeffrey.t.kirsher, netdev
In-Reply-To: <20110408221239.26893.36898.stgit@gitlad.jf.intel.com>

On Fri, 2011-04-08 at 15:12 -0700, Alexander Duyck wrote:
> This change is meant to split out ESP from AH.  Currently they are present
> as both a combined value, and two separate values.  In order to try and
> support eventually splitting the two out into separate values this change
> makes it so that ESP can be called out separately in ethtool.
[...]

The split between AH and ESP flows is one of several backward-
incompatible changes that Santwona made to RXNFC when adding flow
steering (originally it just dealt with hashing).

1. cxgb4, sfc and earlier versions of niu recognise AH_ESP_V{4,6}_FLOW
class codes for reporting of flow hashing.  niu also allowed it to be
controlled.

2. Later versions of niu recognise the separate AH and ESP flow class
codes.  gianfar also recognises them, but silently ignores any attempt
to control them!

We should make niu accept AH_ESP_V{4,6}_FLOW again for reporting and
control of flow hashing.  ethtool should use class codes
{AH,ESP}_V{4,6}_FLOW for flow steering, but not for flow hashing.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* [PATCH] IPV4:Removed the unnecessary loops and made /proc/net/snmp output more readable
From: Sasikanth V @ 2011-04-08 22:51 UTC (permalink / raw)
  To: David S. Miller, Alexey Kuznetsov, James Morris, Patrick McHardy
  Cc: netdev, Sasikanth V


Signed-off-by: Sasikanth V <sasikanth.v19@gmail.com>
---
 net/ipv4/proc.c |   75 ++++++++++++++++++++----------------------------------
 1 files changed, 28 insertions(+), 47 deletions(-)

diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index b14ec7d..b4eda92 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -265,12 +265,9 @@ static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals,
 	if (count) {
 		seq_printf(seq, "\nIcmpMsg:");
 		for (j = 0; j < count; ++j)
-			seq_printf(seq, " %sType%u",
+			seq_printf(seq, " %sType%u\t%lu",
 				type[j] & 0x100 ? "Out" : "In",
-				type[j] & 0xff);
-		seq_printf(seq, "\nIcmpMsg:");
-		for (j = 0; j < count; ++j)
-			seq_printf(seq, " %lu", vals[j]);
+				type[j] & 0xff, vals[j]);
 	}
 }
 
@@ -305,26 +302,26 @@ static void icmp_put(struct seq_file *seq)
 	int i;
 	struct net *net = seq->private;
 
-	seq_puts(seq, "\nIcmp: InMsgs InErrors");
-	for (i=0; icmpmibmap[i].name != NULL; i++)
-		seq_printf(seq, " In%s", icmpmibmap[i].name);
-	seq_printf(seq, " OutMsgs OutErrors");
-	for (i=0; icmpmibmap[i].name != NULL; i++)
-		seq_printf(seq, " Out%s", icmpmibmap[i].name);
-	seq_printf(seq, "\nIcmp: %lu %lu",
+	seq_puts(seq, "Icmp:\n");
+	seq_printf(seq, "%-32s\t%lu\n%-32s\t%lu\n","InMsgs",
 		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INMSGS),
+		"InErrors",
 		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS));
+
 	for (i=0; icmpmibmap[i].name != NULL; i++)
-		seq_printf(seq, " %lu",
-			snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics,
-				icmpmibmap[i].index));
-	seq_printf(seq, " %lu %lu",
+		seq_printf(seq, "%s%-32s\t%lu\n", "In",icmpmibmap[i].name,
+			   snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics,
+			 	           icmpmibmap[i].index));
+
+	seq_printf(seq, "%-32s\t%lu\n%-32s\t%lu\n","OutMsgs",
 		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
+		"OutErrors",
 		snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
+
 	for (i=0; icmpmibmap[i].name != NULL; i++)
-		seq_printf(seq, " %lu",
-			snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics,
-				icmpmibmap[i].index | 0x100));
+		seq_printf(seq, "%s%-32s\t%lu\n", "Out", icmpmibmap[i].name,
+			   snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics,
+				          icmpmibmap[i].index | 0x100));
 }
 
 /*
@@ -335,18 +332,15 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
 	int i;
 	struct net *net = seq->private;
 
-	seq_puts(seq, "Ip: Forwarding DefaultTTL");
-
-	for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
-		seq_printf(seq, " %s", snmp4_ipstats_list[i].name);
-
-	seq_printf(seq, "\nIp: %d %d",
-		   IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2,
+	seq_puts(seq, "Ip:\n");
+	seq_printf(seq, "%-32s\t%d\n%-32s\t%d\n", "Forwarding",
+		   IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2,"DefaultTTL",
 		   sysctl_ip_default_ttl);
 
 	BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
+
 	for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
-		seq_printf(seq, " %llu",
+		seq_printf(seq, "%-32s\t%llu\n", snmp4_ipstats_list[i].name,
 			   snmp_fold_field64((void __percpu **)net->mib.ip_statistics,
 					     snmp4_ipstats_list[i].entry,
 					     offsetof(struct ipstats_mib, syncp)));
@@ -354,45 +348,32 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
 	icmp_put(seq);	/* RFC 2011 compatibility */
 	icmpmsg_put(seq);
 
-	seq_puts(seq, "\nTcp:");
-	for (i = 0; snmp4_tcp_list[i].name != NULL; i++)
-		seq_printf(seq, " %s", snmp4_tcp_list[i].name);
-
-	seq_puts(seq, "\nTcp:");
+	seq_puts(seq, "Tcp:\n");
 	for (i = 0; snmp4_tcp_list[i].name != NULL; i++) {
 		/* MaxConn field is signed, RFC 2012 */
 		if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
-			seq_printf(seq, " %ld",
+			seq_printf(seq, "%-32s\t%lu\n", snmp4_tcp_list[i].name,
 				   snmp_fold_field((void __percpu **)net->mib.tcp_statistics,
 						   snmp4_tcp_list[i].entry));
 		else
-			seq_printf(seq, " %lu",
+			seq_printf(seq, "%-32s\t%lu\n", snmp4_tcp_list[i].name,
 				   snmp_fold_field((void __percpu **)net->mib.tcp_statistics,
 						   snmp4_tcp_list[i].entry));
 	}
 
-	seq_puts(seq, "\nUdp:");
-	for (i = 0; snmp4_udp_list[i].name != NULL; i++)
-		seq_printf(seq, " %s", snmp4_udp_list[i].name);
-
-	seq_puts(seq, "\nUdp:");
+	seq_puts(seq, "Udp:\n");
 	for (i = 0; snmp4_udp_list[i].name != NULL; i++)
-		seq_printf(seq, " %lu",
+		seq_printf(seq, "%-32s\t%lu\n", snmp4_udp_list[i].name,
 			   snmp_fold_field((void __percpu **)net->mib.udp_statistics,
 					   snmp4_udp_list[i].entry));
 
 	/* the UDP and UDP-Lite MIBs are the same */
-	seq_puts(seq, "\nUdpLite:");
-	for (i = 0; snmp4_udp_list[i].name != NULL; i++)
-		seq_printf(seq, " %s", snmp4_udp_list[i].name);
-
-	seq_puts(seq, "\nUdpLite:");
+	seq_puts(seq, "UdpLite:\n");
 	for (i = 0; snmp4_udp_list[i].name != NULL; i++)
-		seq_printf(seq, " %lu",
+		seq_printf(seq, "%-32s\t%lu\n", snmp4_udp_list[i].name,
 			   snmp_fold_field((void __percpu **)net->mib.udplite_statistics,
 					   snmp4_udp_list[i].entry));
 
-	seq_putc(seq, '\n');
 	return 0;
 }
 
-- 
1.7.3.4


^ permalink raw reply related

* Re: shutdown(2) does not fully shut down socket any more
From: Kees Cook @ 2011-04-08 22:46 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20110408.153130.226769035.davem@davemloft.net>

On Fri, Apr 08, 2011 at 03:31:30PM -0700, David Miller wrote:
> From: Kees Cook <kees.cook@canonical.com>
> Date: Fri, 8 Apr 2011 15:30:04 -0700
> 
> > Ah! Thanks, I didn't notice that; I saw no activity on the bugzilla entry,
> > so I thought it hadn't been seen yet.
> 
> We basically do not use kernel.org bugzilla for bug tracking, all
> work and discussions occur only here on the netdev list.
> 
> So what Stephen Hemminger, Andrew Morton, and others do is simply
> forward the bug reports here to the list so we can work on them.

Makes sense, yeah. It'd be cool if the forwarder added a note to the bug
saying "this has been forwarded [here]". Then the reporter would know where
to go look for status updates.

But, regardless, it's being looked at. Thanks!

-Kees

-- 
Kees Cook
Ubuntu Security Team

^ permalink raw reply

* Re: [ethtool PATCH 1/2] ethtool: fix manpage so that it will display tables again
From: Ben Hutchings @ 2011-04-08 22:45 UTC (permalink / raw)
  To: Alexander Duyck; +Cc: jeffrey.t.kirsher, netdev
In-Reply-To: <20110408221234.26893.58812.stgit@gitlad.jf.intel.com>

On Fri, 2011-04-08 at 15:12 -0700, Alexander Duyck wrote:
> The current ethtool manpage is not displaying tables.  After trying to pass
> the manpage through tbl I repeatedly saw the error:
> tbl:ethtool.8.in:707: unrecognised format `x'
> tbl:ethtool.8.in:707: giving up on this table
> 
> By dropping the 'x' the errors went away and when I built the manpage the
> tables reappeared so I am assuming this is the correct approach.
[...]

I think this must depend somewhat on the version of tbl.  I must admit I
haven't yet tested with a range of versions.

In groff 1.20.1, the 'x' column flag requested an 'expanded' column
which will be as wide as possible (after allowing for the other column
contents) with its contents word-wrapped if necessary.  Without this
flag, tbl wraps some columns poorly and complains about some tables
being too wide (though I don't think they are).  Clearly that is less
bad than having it discard the tables altogether, so I will apply this.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* Re: [net-next-2.6 PATCH 2/2] ethtool: add ntuple flow specifier data to network flow classifier
From: Alexander Duyck @ 2011-04-08 22:35 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: davem@davemloft.net, Kirsher, Jeffrey T, netdev@vger.kernel.org
In-Reply-To: <1302301284.2871.25.camel@bwh-desktop>

On 4/8/2011 3:21 PM, Ben Hutchings wrote:
> On Fri, 2011-04-08 at 15:07 -0700, Alexander Duyck wrote:
>> This change is meant to add ntuple data extensions to the rx network flow
>> classification specifiers.  The idea is to allow ntuple to be displayed via
>> the network flow classification interface.
>
> Thanks for carrying on with this.
>
> [...]
>>   /**
>>    * struct ethtool_rx_flow_spec - specification for RX flow filter
>>    * @flow_type: Type of match to perform, e.g. %TCP_V4_FLOW
>>    * @h_u: Flow fields to match (dependent on @flow_type)
>> + * @h_ext: Additional fields to match
>>    * @m_u: Masks for flow field bits to be ignored
>> + * @m_ext: Masks for additional field bits to be ignored.
>> + *	Note, all additional fields must be ignored unless @flow_type
>> + *	includes the %FLOW_EXT flag.
>>    * @ring_cookie: RX ring/queue index to deliver to, or %RX_CLS_FLOW_DISC
>>    *	if packets should be discarded
>>    * @location: Index of filter in hardware table
>>    */
>>   struct ethtool_rx_flow_spec {
>>   	__u32		flow_type;
>> -	union {
>> -		struct ethtool_tcpip4_spec		tcp_ip4_spec;
>> -		struct ethtool_tcpip4_spec		udp_ip4_spec;
>> -		struct ethtool_tcpip4_spec		sctp_ip4_spec;
>> -		struct ethtool_ah_espip4_spec		ah_ip4_spec;
>> -		struct ethtool_ah_espip4_spec		esp_ip4_spec;
>> -		struct ethtool_usrip4_spec		usr_ip4_spec;
>> -		struct ethhdr				ether_spec;
>> -		__u8					hdata[72];
>> -	} h_u, m_u;
>> +	union ethtool_flow_union h_u;
>> +	struct ethtool_flow_ext h_ext;
>> +	union ethtool_flow_union m_u;
>> +	struct ethtool_flow_ext m_ext;
>> +	__u32		flow_type_ext;
> [...]
>
> You can't add flow_type_ext here.  I assume this is an oversight, since
> it isn't mentioned anywhere else.
>
> Ben.
>

Yeah, I forgot and left it in there.  It will be removed and resubmitted.

Thanks,

Alex

^ permalink raw reply

* [net-next-2.6 PATCH] v2 ethtool: add ntuple flow specifier data to network flow classifier
From: Alexander Duyck @ 2011-04-08 22:34 UTC (permalink / raw)
  To: davem, jeffrey.t.kirsher, bhutchings; +Cc: netdev

This change is meant to add ntuple data extensions to the rx network flow
classification specifiers.  The idea is to allow ntuple to be displayed via
the network flow classification interface.

The first patch had some left over stuff from the original flow extension
flags I had added.  That bit is removed in this patch.

This work is based on input from Ben Hutchings.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---

 include/linux/ethtool.h |   51 +++++++++++++++++++++++++++++------------------
 net/socket.c            |   14 ++++++-------
 2 files changed, 38 insertions(+), 27 deletions(-)

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index c04d131..68a7310 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -380,27 +380,42 @@ struct ethtool_usrip4_spec {
 	__u8    proto;
 };
 
+union ethtool_flow_union {
+	struct ethtool_tcpip4_spec		tcp_ip4_spec;
+	struct ethtool_tcpip4_spec		udp_ip4_spec;
+	struct ethtool_tcpip4_spec		sctp_ip4_spec;
+	struct ethtool_ah_espip4_spec		ah_ip4_spec;
+	struct ethtool_ah_espip4_spec		esp_ip4_spec;
+	struct ethtool_usrip4_spec		usr_ip4_spec;
+	struct ethhdr				ether_spec;
+	__u8					hdata[60];
+};
+
+struct ethtool_flow_ext {
+	__be16	vlan_etype;
+	__be16	vlan_tci;
+	__be32	data[2];
+};
+
 /**
  * struct ethtool_rx_flow_spec - specification for RX flow filter
  * @flow_type: Type of match to perform, e.g. %TCP_V4_FLOW
  * @h_u: Flow fields to match (dependent on @flow_type)
+ * @h_ext: Additional fields to match
  * @m_u: Masks for flow field bits to be ignored
+ * @m_ext: Masks for additional field bits to be ignored.
+ *	Note, all additional fields must be ignored unless @flow_type
+ *	includes the %FLOW_EXT flag.
  * @ring_cookie: RX ring/queue index to deliver to, or %RX_CLS_FLOW_DISC
  *	if packets should be discarded
  * @location: Index of filter in hardware table
  */
 struct ethtool_rx_flow_spec {
 	__u32		flow_type;
-	union {
-		struct ethtool_tcpip4_spec		tcp_ip4_spec;
-		struct ethtool_tcpip4_spec		udp_ip4_spec;
-		struct ethtool_tcpip4_spec		sctp_ip4_spec;
-		struct ethtool_ah_espip4_spec		ah_ip4_spec;
-		struct ethtool_ah_espip4_spec		esp_ip4_spec;
-		struct ethtool_usrip4_spec		usr_ip4_spec;
-		struct ethhdr				ether_spec;
-		__u8					hdata[72];
-	} h_u, m_u;
+	union ethtool_flow_union h_u;
+	struct ethtool_flow_ext h_ext;
+	union ethtool_flow_union m_u;
+	struct ethtool_flow_ext m_ext;
 	__u64		ring_cookie;
 	__u32		location;
 };
@@ -458,16 +473,10 @@ struct ethtool_rxnfc {
 
 struct compat_ethtool_rx_flow_spec {
 	u32		flow_type;
-	union {
-		struct ethtool_tcpip4_spec		tcp_ip4_spec;
-		struct ethtool_tcpip4_spec		udp_ip4_spec;
-		struct ethtool_tcpip4_spec		sctp_ip4_spec;
-		struct ethtool_ah_espip4_spec		ah_ip4_spec;
-		struct ethtool_ah_espip4_spec		esp_ip4_spec;
-		struct ethtool_usrip4_spec		usr_ip4_spec;
-		struct ethhdr				ether_spec;
-		u8					hdata[72];
-	} h_u, m_u;
+	union ethtool_flow_union h_u;
+	struct ethtool_flow_ext h_ext;
+	union ethtool_flow_union m_u;
+	struct ethtool_flow_ext m_ext;
 	compat_u64	ring_cookie;
 	u32		location;
 };
@@ -1072,6 +1081,8 @@ struct ethtool_ops {
 #define	IPV4_FLOW	0x10	/* hash only */
 #define	IPV6_FLOW	0x11	/* hash only */
 #define	ETHER_FLOW	0x12	/* spec only (ether_spec) */
+/* Flag to enable additional fields in struct ethtool_rx_flow_spec */
+#define	FLOW_EXT	0x80000000
 
 /* L3-L4 network traffic flow hash options */
 #define	RXH_L2DA	(1 << 1)
diff --git a/net/socket.c b/net/socket.c
index 5212447..575c84f 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2643,13 +2643,13 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32)
 		return -EFAULT;
 
 	if (convert_in) {
-		/* We expect there to be holes between fs.m_u and
+		/* We expect there to be holes between fs.m_ext and
 		 * fs.ring_cookie and at the end of fs, but nowhere else.
 		 */
-		BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_u) +
-			     sizeof(compat_rxnfc->fs.m_u) !=
-			     offsetof(struct ethtool_rxnfc, fs.m_u) +
-			     sizeof(rxnfc->fs.m_u));
+		BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_ext) +
+			     sizeof(compat_rxnfc->fs.m_ext) !=
+			     offsetof(struct ethtool_rxnfc, fs.m_ext) +
+			     sizeof(rxnfc->fs.m_ext));
 		BUILD_BUG_ON(
 			offsetof(struct compat_ethtool_rxnfc, fs.location) -
 			offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) !=
@@ -2657,7 +2657,7 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32)
 			offsetof(struct ethtool_rxnfc, fs.ring_cookie));
 
 		if (copy_in_user(rxnfc, compat_rxnfc,
-				 (void *)(&rxnfc->fs.m_u + 1) -
+				 (void *)(&rxnfc->fs.m_ext + 1) -
 				 (void *)rxnfc) ||
 		    copy_in_user(&rxnfc->fs.ring_cookie,
 				 &compat_rxnfc->fs.ring_cookie,
@@ -2674,7 +2674,7 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32)
 
 	if (convert_out) {
 		if (copy_in_user(compat_rxnfc, rxnfc,
-				 (const void *)(&rxnfc->fs.m_u + 1) -
+				 (const void *)(&rxnfc->fs.m_ext + 1) -
 				 (const void *)rxnfc) ||
 		    copy_in_user(&compat_rxnfc->fs.ring_cookie,
 				 &rxnfc->fs.ring_cookie,


^ permalink raw reply related

* Re: shutdown(2) does not fully shut down socket any more
From: David Miller @ 2011-04-08 22:31 UTC (permalink / raw)
  To: kees.cook; +Cc: netdev
In-Reply-To: <20110408223004.GY4050@outflux.net>

From: Kees Cook <kees.cook@canonical.com>
Date: Fri, 8 Apr 2011 15:30:04 -0700

> Ah! Thanks, I didn't notice that; I saw no activity on the bugzilla entry,
> so I thought it hadn't been seen yet.

We basically do not use kernel.org bugzilla for bug tracking, all
work and discussions occur only here on the netdev list.

So what Stephen Hemminger, Andrew Morton, and others do is simply
forward the bug reports here to the list so we can work on them.

^ permalink raw reply

* Re: shutdown(2) does not fully shut down socket any more
From: Kees Cook @ 2011-04-08 22:30 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20110408.152427.193714809.davem@davemloft.net>

Hi David,

On Fri, Apr 08, 2011 at 03:24:27PM -0700, David Miller wrote:
> Eric Dumazet will be working on fixing this, see:
> 
> http://marc.info/?l=linux-netdev&m=130176733401613&w=2
> 
> And Stephen Hemminger already forwarded that bugzilla entry here
> a few days ago.

Ah! Thanks, I didn't notice that; I saw no activity on the bugzilla entry,
so I thought it hadn't been seen yet.

-Kees

-- 
Kees Cook
Ubuntu Security Team

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox