Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 04/10] netfilter: regard users as refcount for l4proto's per-net data
From: Gao feng @ 2012-06-14 10:07 UTC (permalink / raw)
  To: pablo; +Cc: netdev, netfilter-devel, Gao feng
In-Reply-To: <1339668445-23848-1-git-send-email-gaofeng@cn.fujitsu.com>

Now, nf_proto_net's users is confusing.
we should regard it as the refcount for l4proto's per-net data,
because maybe there are two l4protos use the same per-net data.

so increment pn->users when nf_conntrack_l4proto_register
success, and decrement it for nf_conntrack_l4_unregister case.

because nf_conntrack_l3proto_ipv[4|6] don't use the same per-net
data,so we don't need to add a refcnt for their per-net data.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
---
 net/netfilter/nf_conntrack_proto.c |   70 ++++++++++++++++++++++-------------
 1 files changed, 44 insertions(+), 26 deletions(-)

diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index c9df1b4..63f9430 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -39,16 +39,13 @@ static int
 nf_ct_register_sysctl(struct net *net,
 		      struct ctl_table_header **header,
 		      const char *path,
-		      struct ctl_table *table,
-		      unsigned int *users)
+		      struct ctl_table *table)
 {
 	if (*header == NULL) {
 		*header = register_net_sysctl(net, path, table);
 		if (*header == NULL)
 			return -ENOMEM;
 	}
-	if (users != NULL)
-		(*users)++;
 
 	return 0;
 }
@@ -58,7 +55,7 @@ nf_ct_unregister_sysctl(struct ctl_table_header **header,
 			struct ctl_table **table,
 			unsigned int *users)
 {
-	if (users != NULL && --*users > 0)
+	if (users != NULL && *users > 0)
 		return;
 
 	unregister_net_sysctl_table(*header);
@@ -191,8 +188,7 @@ static int nf_ct_l3proto_register_sysctl(struct net *net,
 		err = nf_ct_register_sysctl(net,
 					    &in->ctl_table_header,
 					    l3proto->ctl_table_path,
-					    in->ctl_table,
-					    NULL);
+					    in->ctl_table);
 
 		if (err < 0) {
 			kfree(in->ctl_table);
@@ -330,20 +326,17 @@ static struct nf_proto_net *nf_ct_l4proto_net(struct net *net,
 
 static
 int nf_ct_l4proto_register_sysctl(struct net *net,
+				  struct nf_proto_net *pn,
 				  struct nf_conntrack_l4proto *l4proto)
 {
 	int err = 0;
-	struct nf_proto_net *pn = nf_ct_l4proto_net(net, l4proto);
-	if (pn == NULL)
-		return 0;
 
 #ifdef CONFIG_SYSCTL
 	if (pn->ctl_table != NULL) {
 		err = nf_ct_register_sysctl(net,
 					    &pn->ctl_table_header,
 					    "net/netfilter",
-					    pn->ctl_table,
-					    &pn->users);
+					    pn->ctl_table);
 		if (err < 0) {
 			if (!pn->users) {
 				kfree(pn->ctl_table);
@@ -357,8 +350,7 @@ int nf_ct_l4proto_register_sysctl(struct net *net,
 		err = nf_ct_register_sysctl(net,
 					    &pn->ctl_compat_header,
 					    "net/ipv4/netfilter",
-					    pn->ctl_compat_table,
-					    NULL);
+					    pn->ctl_compat_table);
 		if (err == 0)
 			goto out;
 		nf_ct_kfree_compat_sysctl_table(pn);
@@ -374,11 +366,9 @@ out:
 
 static
 void nf_ct_l4proto_unregister_sysctl(struct net *net,
+				     struct nf_proto_net *pn,
 				     struct nf_conntrack_l4proto *l4proto)
 {
-	struct nf_proto_net *pn = nf_ct_l4proto_net(net, l4proto);
-	if (pn == NULL)
-		return;
 #ifdef CONFIG_SYSCTL
 	if (pn->ctl_table_header != NULL)
 		nf_ct_unregister_sysctl(&pn->ctl_table_header,
@@ -391,8 +381,6 @@ void nf_ct_l4proto_unregister_sysctl(struct net *net,
 					&pn->ctl_compat_table,
 					NULL);
 #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
-#else
-	pn->users--;
 #endif /* CONFIG_SYSCTL */
 }
 
@@ -458,22 +446,33 @@ int nf_conntrack_l4proto_register(struct net *net,
 				  struct nf_conntrack_l4proto *l4proto)
 {
 	int ret = 0;
+
+	struct nf_proto_net *pn = NULL;
+
 	if (l4proto->init_net) {
 		ret = l4proto->init_net(net, l4proto->l3proto);
 		if (ret < 0)
-			return ret;
+			goto out;
 	}
 
-	ret = nf_ct_l4proto_register_sysctl(net, l4proto);
+	pn = nf_ct_l4proto_net(net, l4proto);
+	if (pn == NULL)
+		goto out;
+
+	ret = nf_ct_l4proto_register_sysctl(net, pn, l4proto);
 	if (ret < 0)
-		return ret;
+		goto out;
 
 	if (net == &init_net) {
 		ret = nf_conntrack_l4proto_register_net(l4proto);
-		if (ret < 0)
-			nf_ct_l4proto_unregister_sysctl(net, l4proto);
+		if (ret < 0) {
+			nf_ct_l4proto_unregister_sysctl(net, pn, l4proto);
+			goto out;
+		}
 	}
-
+	/* increase the nf_proto_net's refcnt */
+	pn->users++;
+out:
 	return ret;
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_register);
@@ -498,10 +497,18 @@ nf_conntrack_l4proto_unregister_net(struct nf_conntrack_l4proto *l4proto)
 void nf_conntrack_l4proto_unregister(struct net *net,
 				     struct nf_conntrack_l4proto *l4proto)
 {
+	struct nf_proto_net *pn = NULL;
 	if (net == &init_net)
 		nf_conntrack_l4proto_unregister_net(l4proto);
 
-	nf_ct_l4proto_unregister_sysctl(net, l4proto);
+	pn = nf_ct_l4proto_net(net, l4proto);
+	if (pn == NULL)
+		return;
+
+	/* decrease the nf_proto_net's refcnt */
+	pn->users--;
+	nf_ct_l4proto_unregister_sysctl(net, pn, l4proto);
+
 	/* Remove all contrack entries for this protocol */
 	rtnl_lock();
 	nf_ct_iterate_cleanup(net, kill_l4proto, l4proto);
@@ -513,11 +520,14 @@ int nf_conntrack_proto_init(struct net *net)
 {
 	unsigned int i;
 	int err;
+	struct nf_proto_net *pn = nf_ct_l4proto_net(net,
+						    &nf_conntrack_l4proto_generic);
 	err = nf_conntrack_l4proto_generic.init_net(net,
 						    nf_conntrack_l4proto_generic.l3proto);
 	if (err < 0)
 		return err;
 	err = nf_ct_l4proto_register_sysctl(net,
+					    pn,
 					    &nf_conntrack_l4proto_generic);
 	if (err < 0)
 		return err;
@@ -527,13 +537,21 @@ int nf_conntrack_proto_init(struct net *net)
 			rcu_assign_pointer(nf_ct_l3protos[i],
 					   &nf_conntrack_l3proto_generic);
 	}
+	/* increase generic proto's nf_proto_net refcnt */
+	pn->users++;
+
 	return 0;
 }
 
 void nf_conntrack_proto_fini(struct net *net)
 {
 	unsigned int i;
+	struct nf_proto_net *pn = nf_ct_l4proto_net(net,
+						    &nf_conntrack_l4proto_generic);
+	/* decrease generic proto's nf_proto_net refcnt */
+	pn->users--;
 	nf_ct_l4proto_unregister_sysctl(net,
+					pn,
 					&nf_conntrack_l4proto_generic);
 	if (net == &init_net) {
 		/* free l3proto protocol tables */
-- 
1.7.7.6

^ permalink raw reply related

* [PATCH 05/10] netfilter: merge tcpv[4,6]_net_init into tcp_net_init
From: Gao feng @ 2012-06-14 10:07 UTC (permalink / raw)
  To: pablo; +Cc: netdev, netfilter-devel, Gao feng
In-Reply-To: <1339668445-23848-1-git-send-email-gaofeng@cn.fujitsu.com>

merge tcpv4_net_init and tcpv6_net_init into tcp_net_init to
reduce the redundancy codes.

and use nf_proto_net.users to identify if it's the first time
we use the nf_proto_net. when it's the first time,we will
initialized it.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
---
 net/netfilter/nf_conntrack_proto_tcp.c |   57 ++++++++------------------------
 1 files changed, 14 insertions(+), 43 deletions(-)

diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 6db9d3c..e3d5427 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1593,18 +1593,14 @@ static int tcp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn)
 	return 0;
 }
 
-static int tcpv4_init_net(struct net *net, u_int16_t proto)
+static int tcp_init_net(struct net *net, u_int16_t proto)
 {
-	int i;
 	int ret = 0;
 	struct nf_tcp_net *tn = tcp_pernet(net);
 	struct nf_proto_net *pn = (struct nf_proto_net *)tn;
 
-#ifdef CONFIG_SYSCTL
-	if (!pn->ctl_table) {
-#else
-	if (!pn->users++) {
-#endif
+	if (!pn->users) {
+		int i = 0;
 		for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++)
 			tn->timeouts[i] = tcp_timeouts[i];
 
@@ -1613,45 +1609,20 @@ static int tcpv4_init_net(struct net *net, u_int16_t proto)
 		tn->tcp_max_retrans = nf_ct_tcp_max_retrans;
 	}
 
-	ret = tcp_kmemdup_compat_sysctl_table(pn);
+	if (proto == AF_INET) {
+		ret = tcp_kmemdup_compat_sysctl_table(pn);
+		if (ret < 0)
+			return ret;
 
-	if (ret < 0)
-		return ret;
+		ret = tcp_kmemdup_sysctl_table(pn);
+		if (ret < 0)
+			nf_ct_kfree_compat_sysctl_table(pn);
+	} else
+		ret = tcp_kmemdup_sysctl_table(pn);
 
-	ret = tcp_kmemdup_sysctl_table(pn);
-
-#ifdef CONFIG_SYSCTL
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-	if (ret < 0) {
-		kfree(pn->ctl_compat_table);
-		pn->ctl_compat_table = NULL;
-	}
-#endif
-#endif
 	return ret;
 }
 
-static int tcpv6_init_net(struct net *net, u_int16_t proto)
-{
-	int i;
-	struct nf_tcp_net *tn = tcp_pernet(net);
-	struct nf_proto_net *pn = (struct nf_proto_net *)tn;
-
-#ifdef CONFIG_SYSCTL
-	if (!pn->ctl_table) {
-#else
-	if (!pn->users++) {
-#endif
-		for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++)
-			tn->timeouts[i] = tcp_timeouts[i];
-		tn->tcp_loose = nf_ct_tcp_loose;
-		tn->tcp_be_liberal = nf_ct_tcp_be_liberal;
-		tn->tcp_max_retrans = nf_ct_tcp_max_retrans;
-	}
-
-	return tcp_kmemdup_sysctl_table(pn);
-}
-
 struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly =
 {
 	.l3proto		= PF_INET,
@@ -1684,7 +1655,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly =
 		.nla_policy	= tcp_timeout_nla_policy,
 	},
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-	.init_net		= tcpv4_init_net,
+	.init_net		= tcp_init_net,
 };
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp4);
 
@@ -1720,6 +1691,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly =
 		.nla_policy	= tcp_timeout_nla_policy,
 	},
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-	.init_net		= tcpv6_init_net,
+	.init_net		= tcp_init_net,
 };
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp6);
-- 
1.7.7.6

^ permalink raw reply related

* Re: PPPoE performance regression
From: David Woodhouse @ 2012-06-14 10:35 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: Nathan Williams, Karl Hiramoto, David S. Miller, netdev,
	John Crispin, Benjamin LaHaise
In-Reply-To: <20120614061809.GA10453@drongo>

[-- Attachment #1: Type: text/plain, Size: 1065 bytes --]

On Thu, 2012-06-14 at 16:18 +1000, Paul Mackerras wrote:
> Umm, how does ppp_output_wakeup() actually get called?

In fact I'm thinking of eliminating ppp_output_wakeup() in the general
case.

The idea (and it is *just* an idea so far) is to introduce
ppp_sent_queue(), ppp_completed_queue() and ppp_reset_queue()¹ functions
which take a ppp_chan and map onto the corresponding netdev_* functions
for BQL.

Having done that, we should be able to trigger the wakeup automatically
from the ppp_completed_queue() function, and there's no need for channel
drivers to call ppp_output_wakeup() directly. Not only do we get proper
holistic queue length management, we also move the flow control into PPP
and get rid of the horrid dependency on internal PPP locking that's
documented in commit 9d02daf75², and which we'd have to address on the
PPPoX side too.

And the overhead that Ben is concerned about should be fairly minimal.

-- 
dwmw2

¹ For ppp_reset_queue in the mlppp case it gets moderately non-trivial.
² Look for 'downl'. Ick.

[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 6171 bytes --]

^ permalink raw reply

* Re: Regression on TX throughput when using bonding
From: David Miller @ 2012-06-14 10:31 UTC (permalink / raw)
  To: eric.dumazet; +Cc: jhautbois, netdev
In-Reply-To: <1339668471.22704.714.camel@edumazet-glaptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 14 Jun 2012 12:07:51 +0200

> We should have a way to properly park packets in Qdiscs, and only do the
> orphaning once skb given to real device for 'immediate or so'
> transmission.

Ok.

^ permalink raw reply

* [net-next 6/6] ixgbe: Check PTP Rx timestamps via BPF filter
From: Jeff Kirsher @ 2012-06-14 10:18 UTC (permalink / raw)
  To: davem; +Cc: Jacob Keller, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1339669089-27955-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Jacob Keller <jacob.e.keller@intel.com>

This patch fixes a potential Rx timestamp deadlock that causes the Rx
timestamping to stall indefinitely. The issue could occur when a PTP packet is
timestamped by hardware but never reaches the Rx queue. In order to prevent a
permanent loss of timestamping, the RXSTMP(L/H) registers have to be read to
unlock them. (This used to only occur when a packet that was timestamped
reached the software.) However the registers can't be read early otherwise
there is no way to correlate them to the packet.

This patch introduces a filter function which can be used to determine if a
packet should have been timestamped. Supplied with the filter setup by the
hwtstamp ioctl, check to make sure the PTP protocol and message type match the
expected values. If so, then read the timestamp registers (to free them.) At
this point check the descriptor bit, if the bit is set then we know this
packet correlates to the timestamp stored in the RXTSTAMP registers.
Otherwise, assume that packet was dropped by the hardware, and ignore this
timestamp value. However, we have at least unlocked the rxtstamp registers for
future timestamping.

Due to the way the driver handles skb data, it cannot be directly accessed. In
order to work around this, a copy of the skb data into a linear buffer is
made. From this buffer it becomes possible to read the data correctly

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Richard Cochran <richardcochran@gmail.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h      |    2 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |    3 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c  |  113 ++++++++++++++++++++++---
 3 files changed, 104 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 3ef3c52..41f9f6e 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -561,6 +561,7 @@ struct ixgbe_adapter {
 	spinlock_t tmreg_lock;
 	struct cyclecounter cc;
 	struct timecounter tc;
+	int rx_hwtstamp_filter;
 	u32 base_incval;
 	u32 cycle_speed;
 #endif /* CONFIG_IXGBE_PTP */
@@ -718,6 +719,7 @@ extern void ixgbe_ptp_overflow_check(struct ixgbe_adapter *adapter);
 extern void ixgbe_ptp_tx_hwtstamp(struct ixgbe_q_vector *q_vector,
 				  struct sk_buff *skb);
 extern void ixgbe_ptp_rx_hwtstamp(struct ixgbe_q_vector *q_vector,
+				  union ixgbe_adv_rx_desc *rx_desc,
 				  struct sk_buff *skb);
 extern int ixgbe_ptp_hwtstamp_ioctl(struct ixgbe_adapter *adapter,
 				    struct ifreq *ifr, int cmd);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 1675b66..b0ddfd4 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1397,8 +1397,7 @@ static void ixgbe_process_skb_fields(struct ixgbe_ring *rx_ring,
 	ixgbe_rx_checksum(rx_ring, rx_desc, skb);
 
 #ifdef CONFIG_IXGBE_PTP
-	if (ixgbe_test_staterr(rx_desc, IXGBE_RXDADV_STAT_TS))
-		ixgbe_ptp_rx_hwtstamp(rx_ring->q_vector, skb);
+	ixgbe_ptp_rx_hwtstamp(rx_ring->q_vector, rx_desc, skb);
 #endif
 
 	if ((dev->features & NETIF_F_HW_VLAN_RX) &&
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
index 5ed8cff..cb7d1b2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
@@ -26,6 +26,7 @@
 *******************************************************************************/
 #include "ixgbe.h"
 #include <linux/export.h>
+#include <linux/ptp_classify.h>
 
 /*
  * The 82599 and the X540 do not have true 64bit nanosecond scale
@@ -100,6 +101,10 @@
 #define NSECS_PER_SEC 1000000000ULL
 #endif
 
+static struct sock_filter ptp_filter[] = {
+	PTP_FILTER
+};
+
 /**
  * ixgbe_ptp_read - read raw cycle counter (to be used by time counter)
  * @cc - the cyclecounter structure
@@ -426,6 +431,68 @@ void ixgbe_ptp_overflow_check(struct ixgbe_adapter *adapter)
 }
 
 /**
+ * ixgbe_ptp_match - determine if this skb matches a ptp packet
+ * @skb: pointer to the skb
+ * @hwtstamp: pointer to the hwtstamp_config to check
+ *
+ * Determine whether the skb should have been timestamped, assuming the
+ * hwtstamp was set via the hwtstamp ioctl. Returns non-zero when the packet
+ * should have a timestamp waiting in the registers, and 0 otherwise.
+ *
+ * V1 packets have to check the version type to determine whether they are
+ * correct. However, we can't directly access the data because it might be
+ * fragmented in the SKB, in paged memory. In order to work around this, we
+ * use skb_copy_bits which will properly copy the data whether it is in the
+ * paged memory fragments or not. We have to copy the IP header as well as the
+ * message type.
+ */
+static int ixgbe_ptp_match(struct sk_buff *skb, int rx_filter)
+{
+	struct iphdr iph;
+	u8 msgtype;
+	unsigned int type, offset;
+
+	if (rx_filter == HWTSTAMP_FILTER_NONE)
+		return 0;
+
+	type = sk_run_filter(skb, ptp_filter);
+
+	if (likely(rx_filter == HWTSTAMP_FILTER_PTP_V2_EVENT))
+		return type & PTP_CLASS_V2;
+
+	/* For the remaining cases actually check message type */
+	switch (type) {
+	case PTP_CLASS_V1_IPV4:
+		skb_copy_bits(skb, OFF_IHL, &iph, sizeof(iph));
+		offset = ETH_HLEN + (iph.ihl << 2) + UDP_HLEN + OFF_PTP_CONTROL;
+		break;
+	case PTP_CLASS_V1_IPV6:
+		offset = OFF_PTP6 + OFF_PTP_CONTROL;
+		break;
+	default:
+		/* other cases invalid or handled above */
+		return 0;
+	}
+
+	/* Make sure our buffer is long enough */
+	if (skb->len < offset)
+		return 0;
+
+	skb_copy_bits(skb, offset, &msgtype, sizeof(msgtype));
+
+	switch (rx_filter) {
+	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
+		return (msgtype == IXGBE_RXMTRL_V1_SYNC_MSG);
+		break;
+	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
+		return (msgtype == IXGBE_RXMTRL_V1_DELAY_REQ_MSG);
+		break;
+	default:
+		return 0;
+	}
+}
+
+/**
  * ixgbe_ptp_tx_hwtstamp - utility function which checks for TX time stamp
  * @q_vector: structure containing interrupt and ring information
  * @skb: particular skb to send timestamp with
@@ -474,6 +541,7 @@ void ixgbe_ptp_tx_hwtstamp(struct ixgbe_q_vector *q_vector,
 /**
  * ixgbe_ptp_rx_hwtstamp - utility function which checks for RX time stamp
  * @q_vector: structure containing interrupt and ring information
+ * @rx_desc: the rx descriptor
  * @skb: particular skb to send timestamp with
  *
  * if the timestamp is valid, we convert it into the timecounter ns
@@ -481,6 +549,7 @@ void ixgbe_ptp_tx_hwtstamp(struct ixgbe_q_vector *q_vector,
  * is passed up the network stack
  */
 void ixgbe_ptp_rx_hwtstamp(struct ixgbe_q_vector *q_vector,
+			   union ixgbe_adv_rx_desc *rx_desc,
 			   struct sk_buff *skb)
 {
 	struct ixgbe_adapter *adapter;
@@ -498,21 +567,33 @@ void ixgbe_ptp_rx_hwtstamp(struct ixgbe_q_vector *q_vector,
 	hw = &adapter->hw;
 
 	tsyncrxctl = IXGBE_READ_REG(hw, IXGBE_TSYNCRXCTL);
+
+	/* Check if we have a valid timestamp and make sure the skb should
+	 * have been timestamped */
+	if (likely(!(tsyncrxctl & IXGBE_TSYNCRXCTL_VALID) ||
+		   !ixgbe_ptp_match(skb, adapter->rx_hwtstamp_filter)))
+		return;
+
+	/*
+	 * Always read the registers, in order to clear a possible fault
+	 * because of stagnant RX timestamp values for a packet that never
+	 * reached the queue.
+	 */
 	regval |= (u64)IXGBE_READ_REG(hw, IXGBE_RXSTMPL);
 	regval |= (u64)IXGBE_READ_REG(hw, IXGBE_RXSTMPH) << 32;
 
 	/*
-	 * If this bit is set, then the RX registers contain the time stamp. No
-	 * other packet will be time stamped until we read these registers, so
-	 * read the registers to make them available again. Because only one
-	 * packet can be time stamped at a time, we know that the register
-	 * values must belong to this one here and therefore we don't need to
-	 * compare any of the additional attributes stored for it.
+	 * If the timestamp bit is set in the packet's descriptor, we know the
+	 * timestamp belongs to this packet. No other packet can be
+	 * timestamped until the registers for timestamping have been read.
+	 * Therefor only one packet with this bit can be in the queue at a
+	 * time, and the rx timestamp values that were in the registers belong
+	 * to this packet.
 	 *
 	 * If nothing went wrong, then it should have a skb_shared_tx that we
 	 * can turn into a skb_shared_hwtstamps.
 	 */
-	if (!(tsyncrxctl & IXGBE_TSYNCRXCTL_VALID))
+	if (unlikely(!ixgbe_test_staterr(rx_desc, IXGBE_RXDADV_STAT_TS)))
 		return;
 
 	spin_lock_irqsave(&adapter->tmreg_lock, flags);
@@ -598,19 +679,20 @@ int ixgbe_ptp_hwtstamp_ioctl(struct ixgbe_adapter *adapter,
 	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
 	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
 		tsync_rx_ctl |= IXGBE_TSYNCRXCTL_TYPE_EVENT_V2;
-		config.rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;
 		is_l2 = true;
 		is_l4 = true;
+		config.rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;
 		break;
 	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
 	case HWTSTAMP_FILTER_ALL:
 	default:
 		/*
-		 * register RXMTRL must be set, therefore it is not
-		 * possible to time stamp both V1 Sync and Delay_Req messages
-		 * and hardware does not support timestamping all packets
-		 * => return error
+		 * register RXMTRL must be set in order to do V1 packets,
+		 * therefore it is not possible to time stamp both V1 Sync and
+		 * Delay_Req messages and hardware does not support
+		 * timestamping all packets => return error
 		 */
+		config.rx_filter = HWTSTAMP_FILTER_NONE;
 		return -ERANGE;
 	}
 
@@ -620,6 +702,9 @@ int ixgbe_ptp_hwtstamp_ioctl(struct ixgbe_adapter *adapter,
 		return 0;
 	}
 
+	/* Store filter value for later use */
+	adapter->rx_hwtstamp_filter = config.rx_filter;
+
 	/* define ethertype filter for timestamped packets */
 	if (is_l2)
 		IXGBE_WRITE_REG(hw, IXGBE_ETQF(3),
@@ -855,6 +940,10 @@ void ixgbe_ptp_init(struct ixgbe_adapter *adapter)
 		return;
 	}
 
+	/* initialize the ptp filter */
+	if (ptp_filter_init(ptp_filter, ARRAY_SIZE(ptp_filter)))
+		e_dev_warn("ptp_filter_init failed\n");
+
 	spin_lock_init(&adapter->tmreg_lock);
 
 	ixgbe_ptp_start_cyclecounter(adapter);
-- 
1.7.10.2

^ permalink raw reply related

* [net-next 5/6] ixgbe: PTP Fix hwtstamp mode settings
From: Jeff Kirsher @ 2012-06-14 10:18 UTC (permalink / raw)
  To: davem; +Cc: Jacob Keller, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1339669089-27955-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Jacob Keller <jacob.e.keller@intel.com>

When enabling the hwtstamp mode for Rx timestamping the V2 ptp event type
specific modes (Delay Request and Sync) have been rolled into the V2 all event
packet modes, in order to more accurately represent what hardware is doing.
Hardware always timestamps the Path delay packets when a V2 mode is selected,
regardless of what type was selected (in order to always support Path delay
mode). However this means the user selected modes of timestamping only Sync or
Delay Request is not truly supported. This patch correctly sets the mode for
the hwtstamp config and returns to the user that all V2 event packets will be
timestamped.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c |   23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
index 174f41f..5ed8cff 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
@@ -540,6 +540,11 @@ void ixgbe_ptp_rx_hwtstamp(struct ixgbe_q_vector *q_vector,
  * type has to be specified. Matching the kind of event packet is
  * not supported, with the exception of "all V2 events regardless of
  * level 2 or 4".
+ *
+ * Since hardware always timestamps Path delay packets when timestamping V2
+ * packets, regardless of the type specified in the register, only use V2
+ * Event mode. This more accurately tells the user what the hardware is going
+ * to do anyways.
  */
 int ixgbe_ptp_hwtstamp_ioctl(struct ixgbe_adapter *adapter,
 			     struct ifreq *ifr, int cmd)
@@ -583,27 +588,15 @@ int ixgbe_ptp_hwtstamp_ioctl(struct ixgbe_adapter *adapter,
 		tsync_rx_mtrl = IXGBE_RXMTRL_V1_DELAY_REQ_MSG;
 		is_l4 = true;
 		break;
+	case HWTSTAMP_FILTER_PTP_V2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
 	case HWTSTAMP_FILTER_PTP_V2_SYNC:
 	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
 	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
-		tsync_rx_ctl |= IXGBE_TSYNCRXCTL_TYPE_L2_L4_V2;
-		tsync_rx_mtrl = IXGBE_RXMTRL_V2_SYNC_MSG;
-		is_l2 = true;
-		is_l4 = true;
-		config.rx_filter = HWTSTAMP_FILTER_SOME;
-		break;
 	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
 	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
 	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
-		tsync_rx_ctl |= IXGBE_TSYNCRXCTL_TYPE_L2_L4_V2;
-		tsync_rx_mtrl = IXGBE_RXMTRL_V2_DELAY_REQ_MSG;
-		is_l2 = true;
-		is_l4 = true;
-		config.rx_filter = HWTSTAMP_FILTER_SOME;
-		break;
-	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
-	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
-	case HWTSTAMP_FILTER_PTP_V2_EVENT:
 		tsync_rx_ctl |= IXGBE_TSYNCRXCTL_TYPE_EVENT_V2;
 		config.rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;
 		is_l2 = true;
-- 
1.7.10.2

^ permalink raw reply related

* [net-next 4/6] ixgbe: ptp code cleanup
From: Jeff Kirsher @ 2012-06-14 10:18 UTC (permalink / raw)
  To: davem; +Cc: Jacob Keller, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1339669089-27955-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Jacob Keller <jacob.e.keller@intel.com>

This patch fixes two minor nits from Richard Cochran. The first is a case of
ambitious line wrapping that wasn't necessary. The second is to re-order the
flag checks for PPS support. Previously, the hardware test was done first, and
the interrupt flag test was done second. Now, test the interrupt flag and use
the unlikely macro.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |    8 +++-----
 drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c  |   13 +++++++------
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 17ad6a3..1675b66 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -790,12 +790,10 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
 		total_packets += tx_buffer->gso_segs;
 
 #ifdef CONFIG_IXGBE_PTP
-		if (unlikely(tx_buffer->tx_flags &
-			     IXGBE_TX_FLAGS_TSTAMP))
-			ixgbe_ptp_tx_hwtstamp(q_vector,
-					      tx_buffer->skb);
-
+		if (unlikely(tx_buffer->tx_flags & IXGBE_TX_FLAGS_TSTAMP))
+			ixgbe_ptp_tx_hwtstamp(q_vector, tx_buffer->skb);
 #endif
+
 		/* free the skb */
 		dev_kfree_skb_any(tx_buffer->skb);
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
index ddc6a4d..174f41f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
@@ -307,13 +307,14 @@ void ixgbe_ptp_check_pps_event(struct ixgbe_adapter *adapter, u32 eicr)
 	    !(adapter->flags2 & IXGBE_FLAG2_PTP_PPS_ENABLED))
 		return;
 
-	switch (hw->mac.type) {
-	case ixgbe_mac_X540:
-		if (eicr & IXGBE_EICR_TIMESYNC)
+	if (unlikely(eicr & IXGBE_EICR_TIMESYNC)) {
+		switch (hw->mac.type) {
+		case ixgbe_mac_X540:
 			ptp_clock_event(adapter->ptp_clock, &event);
-		break;
-	default:
-		break;
+			break;
+		default:
+			break;
+		}
 	}
 }
 
-- 
1.7.10.2

^ permalink raw reply related

* [net-next 3/6] ixgbe: do not compile ixgbe_sysfs.c when CONFIG_IXGBE_HWMON is not set
From: Jeff Kirsher @ 2012-06-14 10:18 UTC (permalink / raw)
  To: davem; +Cc: Emil Tantilov, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1339669089-27955-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Emil Tantilov <emil.s.tantilov@intel.com>

ixgbe_sysfs.c is only needed when CONFIG_IXGBE_HWMON is configured in the
kernel.

Signed-off-by: Emil Tantilov <emil.s.tantilov@intel.com>
Acked-by: Don Skidmore <Donald.c.skidmore@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/Makefile      |    4 ++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_sysfs.c |    2 --
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/Makefile b/drivers/net/ethernet/intel/ixgbe/Makefile
index 0bdf06b..5fd5d04 100644
--- a/drivers/net/ethernet/intel/ixgbe/Makefile
+++ b/drivers/net/ethernet/intel/ixgbe/Makefile
@@ -34,11 +34,11 @@ obj-$(CONFIG_IXGBE) += ixgbe.o
 
 ixgbe-objs := ixgbe_main.o ixgbe_common.o ixgbe_ethtool.o \
               ixgbe_82599.o ixgbe_82598.o ixgbe_phy.o ixgbe_sriov.o \
-              ixgbe_mbx.o ixgbe_x540.o ixgbe_sysfs.o ixgbe_lib.o
+              ixgbe_mbx.o ixgbe_x540.o ixgbe_lib.o
 
 ixgbe-$(CONFIG_IXGBE_DCB) +=  ixgbe_dcb.o ixgbe_dcb_82598.o \
                               ixgbe_dcb_82599.o ixgbe_dcb_nl.o
 
 ixgbe-$(CONFIG_IXGBE_PTP) += ixgbe_ptp.o
-
+ixgbe-$(CONFIG_IXGBE_HWMON) += ixgbe_sysfs.o
 ixgbe-$(CONFIG_FCOE:m=y) += ixgbe_fcoe.o
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sysfs.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sysfs.c
index 1d80b1c..2334fce 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sysfs.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sysfs.c
@@ -37,7 +37,6 @@
 #include <linux/netdevice.h>
 #include <linux/hwmon.h>
 
-#ifdef CONFIG_IXGBE_HWMON
 /* hwmon callback functions */
 static ssize_t ixgbe_hwmon_show_location(struct device *dev,
 					 struct device_attribute *attr,
@@ -241,5 +240,4 @@ err:
 exit:
 	return rc;
 }
-#endif /* CONFIG_IXGBE_HWMON */
 
-- 
1.7.10.2

^ permalink raw reply related

* [net-next 2/6] ixgbe: align flow control DV macros with datasheet
From: Jeff Kirsher @ 2012-06-14 10:18 UTC (permalink / raw)
  To: davem; +Cc: John Fastabend, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1339669089-27955-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: John Fastabend <john.r.fastabend@intel.com>

The flow control DV macros are used to calculate the flow control
high and low thresholds. This patch annotates these macros slightly
better and fixes the issues below.

The macro variables are renamed LINK to _max_frame_link and TC to
_max_frame_tc. This was to avoid confusion and make them more
readable. It was found that people auditing the code read TC to be
'traffic class' in the 802.1Q definition instead of the max frame
size of the tc. Hopefully it is clear now.

This audit also found the following real deviations from the
theoretical values. Fixed in this patch.

  * I multiplied the DV calculations by (36/25) which always
    evaluates to 1. This does not match the intended theoretical
    value of 1.44.

  * IXGBE_BT2KB added 1023 to account for rounding however this
    really should be 8 * 1023 - 1 to account for division by 8k.

  * x2 multiplication of max frame in DV calculations to account
    for updated hardware recommendations.

With this patch the DV values are inline with the recommendations
in the 82599 and 82598 data sheets. Its worth noting I did not
see any dropped frames with flow control on in my experiments without
this patch. However aligning with the hardware specs and
recommendations seems like a good idea here to account for worst
case scenarios.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Tested-by: Ross Brattain <ross.b.brattain@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_type.h |   37 +++++++++++++++----------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
index 204848d..1085c07 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
@@ -2419,7 +2419,7 @@ typedef u32 ixgbe_physical_layer;
  */
 
 /* BitTimes (BT) conversion */
-#define IXGBE_BT2KB(BT) ((BT + 1023) / (8 * 1024))
+#define IXGBE_BT2KB(BT) ((BT + (8 * 1024 - 1)) / (8 * 1024))
 #define IXGBE_B2BT(BT) (BT * 8)
 
 /* Calculate Delay to respond to PFC */
@@ -2450,24 +2450,31 @@ typedef u32 ixgbe_physical_layer;
 #define IXGBE_PCI_DELAY	10000
 
 /* Calculate X540 delay value in bit times */
-#define IXGBE_FILL_RATE (36 / 25)
-
-#define IXGBE_DV_X540(LINK, TC) (IXGBE_FILL_RATE * \
-				 (IXGBE_B2BT(LINK) + IXGBE_PFC_D + \
-				 (2 * IXGBE_CABLE_DC) + \
-				 (2 * IXGBE_ID_X540) + \
-				 IXGBE_HD + IXGBE_B2BT(TC)))
+#define IXGBE_DV_X540(_max_frame_link, _max_frame_tc) \
+			((36 * \
+			  (IXGBE_B2BT(_max_frame_link) + \
+			   IXGBE_PFC_D + \
+			   (2 * IXGBE_CABLE_DC) + \
+			   (2 * IXGBE_ID_X540) + \
+			   IXGBE_HD) / 25 + 1) + \
+			 2 * IXGBE_B2BT(_max_frame_tc))
 
 /* Calculate 82599, 82598 delay value in bit times */
-#define IXGBE_DV(LINK, TC) (IXGBE_FILL_RATE * \
-			    (IXGBE_B2BT(LINK) + IXGBE_PFC_D + \
-			    (2 * IXGBE_CABLE_DC) + (2 * IXGBE_ID) + \
-			    IXGBE_HD + IXGBE_B2BT(TC)))
+#define IXGBE_DV(_max_frame_link, _max_frame_tc) \
+			((36 * \
+			  (IXGBE_B2BT(_max_frame_link) + \
+			   IXGBE_PFC_D + \
+			   (2 * IXGBE_CABLE_DC) + \
+			   (2 * IXGBE_ID) + \
+			   IXGBE_HD) / 25 + 1) + \
+			 2 * IXGBE_B2BT(_max_frame_tc))
 
 /* Calculate low threshold delay values */
-#define IXGBE_LOW_DV_X540(TC) (2 * IXGBE_B2BT(TC) + \
-			       (IXGBE_FILL_RATE * IXGBE_PCI_DELAY))
-#define IXGBE_LOW_DV(TC)      (2 * IXGBE_LOW_DV_X540(TC))
+#define IXGBE_LOW_DV_X540(_max_frame_tc) \
+			(2 * IXGBE_B2BT(_max_frame_tc) + \
+			(36 * IXGBE_PCI_DELAY / 25) + 1)
+#define IXGBE_LOW_DV(_max_frame_tc) \
+			(2 * IXGBE_LOW_DV_X540(_max_frame_tc))
 
 /* Software ATR hash keys */
 #define IXGBE_ATR_BUCKET_HASH_KEY    0x3DAD14E2
-- 
1.7.10.2

^ permalink raw reply related

* [net-next 1/6] e1000e: use more informative logging macros when netdev not yet registered
From: Jeff Kirsher @ 2012-06-14 10:18 UTC (permalink / raw)
  To: davem; +Cc: Bruce Allan, netdev, gospo, sassmann, Jeff Kirsher
In-Reply-To: <1339669089-27955-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Bruce Allan <bruce.w.allan@intel.com>

Based on a report from Ethan Zhao, before calling register_netdev() the
driver should be using logging macros that do not display the potentially
confusing "(unregistered net_device)" yet still display the useful driver
name and PCI bus/device/function.

Reported-by: Ethan Zhao <ethan.kernel@gmail.com>
Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/e1000e/netdev.c |   11 ++++---
 drivers/net/ethernet/intel/e1000e/param.c  |   43 +++++++++++++++++-----------
 2 files changed, 34 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 31d37a2..ba86b3f 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -6238,7 +6238,8 @@ static int __devinit e1000_probe(struct pci_dev *pdev,
 	}
 
 	if (hw->phy.ops.check_reset_block && hw->phy.ops.check_reset_block(hw))
-		e_info("PHY reset is blocked due to SOL/IDER session.\n");
+		dev_info(&pdev->dev,
+			 "PHY reset is blocked due to SOL/IDER session.\n");
 
 	/* Set initial default active device features */
 	netdev->features = (NETIF_F_SG |
@@ -6288,7 +6289,7 @@ static int __devinit e1000_probe(struct pci_dev *pdev,
 		if (e1000_validate_nvm_checksum(&adapter->hw) >= 0)
 			break;
 		if (i == 2) {
-			e_err("The NVM Checksum Is Not Valid\n");
+			dev_err(&pdev->dev, "The NVM Checksum Is Not Valid\n");
 			err = -EIO;
 			goto err_eeprom;
 		}
@@ -6298,13 +6299,15 @@ static int __devinit e1000_probe(struct pci_dev *pdev,
 
 	/* copy the MAC address */
 	if (e1000e_read_mac_addr(&adapter->hw))
-		e_err("NVM Read Error while reading MAC address\n");
+		dev_err(&pdev->dev,
+			"NVM Read Error while reading MAC address\n");
 
 	memcpy(netdev->dev_addr, adapter->hw.mac.addr, netdev->addr_len);
 	memcpy(netdev->perm_addr, adapter->hw.mac.addr, netdev->addr_len);
 
 	if (!is_valid_ether_addr(netdev->perm_addr)) {
-		e_err("Invalid MAC Address: %pM\n", netdev->perm_addr);
+		dev_err(&pdev->dev, "Invalid MAC Address: %pM\n",
+			netdev->perm_addr);
 		err = -EIO;
 		goto err_eeprom;
 	}
diff --git a/drivers/net/ethernet/intel/e1000e/param.c b/drivers/net/ethernet/intel/e1000e/param.c
index 55cc156..dfbfa7f 100644
--- a/drivers/net/ethernet/intel/e1000e/param.c
+++ b/drivers/net/ethernet/intel/e1000e/param.c
@@ -199,16 +199,19 @@ static int __devinit e1000_validate_option(unsigned int *value,
 	case enable_option:
 		switch (*value) {
 		case OPTION_ENABLED:
-			e_info("%s Enabled\n", opt->name);
+			dev_info(&adapter->pdev->dev, "%s Enabled\n",
+				 opt->name);
 			return 0;
 		case OPTION_DISABLED:
-			e_info("%s Disabled\n", opt->name);
+			dev_info(&adapter->pdev->dev, "%s Disabled\n",
+				 opt->name);
 			return 0;
 		}
 		break;
 	case range_option:
 		if (*value >= opt->arg.r.min && *value <= opt->arg.r.max) {
-			e_info("%s set to %i\n", opt->name, *value);
+			dev_info(&adapter->pdev->dev, "%s set to %i\n",
+				 opt->name, *value);
 			return 0;
 		}
 		break;
@@ -220,7 +223,8 @@ static int __devinit e1000_validate_option(unsigned int *value,
 			ent = &opt->arg.l.p[i];
 			if (*value == ent->i) {
 				if (ent->str[0] != '\0')
-					e_info("%s\n", ent->str);
+					dev_info(&adapter->pdev->dev, "%s\n",
+						 ent->str);
 				return 0;
 			}
 		}
@@ -230,8 +234,8 @@ static int __devinit e1000_validate_option(unsigned int *value,
 		BUG();
 	}
 
-	e_info("Invalid %s value specified (%i) %s\n", opt->name, *value,
-	       opt->err);
+	dev_info(&adapter->pdev->dev, "Invalid %s value specified (%i) %s\n",
+		 opt->name, *value, opt->err);
 	*value = opt->def;
 	return -1;
 }
@@ -251,8 +255,10 @@ void __devinit e1000e_check_options(struct e1000_adapter *adapter)
 	int bd = adapter->bd_number;
 
 	if (bd >= E1000_MAX_NIC) {
-		e_notice("Warning: no configuration for board #%i\n", bd);
-		e_notice("Using defaults for all values\n");
+		dev_notice(&adapter->pdev->dev,
+			   "Warning: no configuration for board #%i\n", bd);
+		dev_notice(&adapter->pdev->dev,
+			   "Using defaults for all values\n");
 	}
 
 	{ /* Transmit Interrupt Delay */
@@ -366,27 +372,32 @@ void __devinit e1000e_check_options(struct e1000_adapter *adapter)
 			 * default values
 			 */
 			if (adapter->itr > 4)
-				e_info("%s set to default %d\n", opt.name,
-				       adapter->itr);
+				dev_info(&adapter->pdev->dev,
+					 "%s set to default %d\n", opt.name,
+					 adapter->itr);
 		}
 
 		adapter->itr_setting = adapter->itr;
 		switch (adapter->itr) {
 		case 0:
-			e_info("%s turned off\n", opt.name);
+			dev_info(&adapter->pdev->dev, "%s turned off\n",
+				 opt.name);
 			break;
 		case 1:
-			e_info("%s set to dynamic mode\n", opt.name);
+			dev_info(&adapter->pdev->dev,
+				 "%s set to dynamic mode\n", opt.name);
 			adapter->itr = 20000;
 			break;
 		case 3:
-			e_info("%s set to dynamic conservative mode\n",
-			       opt.name);
+			dev_info(&adapter->pdev->dev,
+				 "%s set to dynamic conservative mode\n",
+				 opt.name);
 			adapter->itr = 20000;
 			break;
 		case 4:
-			e_info("%s set to simplified (2000-8000 ints) mode\n",
-			       opt.name);
+			dev_info(&adapter->pdev->dev,
+				 "%s set to simplified (2000-8000 ints) mode\n",
+				 opt.name);
 			break;
 		default:
 			/*
-- 
1.7.10.2

^ permalink raw reply related

* [net-next 0/6][pull request] Intel Wired LAN Driver Updates
From: Jeff Kirsher @ 2012-06-14 10:18 UTC (permalink / raw)
  To: davem; +Cc: Jeff Kirsher, netdev, gospo, sassmann

This series contains updates to e1000 and ixgbe.

The following are changes since commit 0450243096de90ff51c3a6c605410c5e28d79f8d:
  bonding: drop_monitor aware
and are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/net-next master

Bruce Allan (1):
  e1000e: use more informative logging macros when netdev not yet
    registered

Emil Tantilov (1):
  ixgbe: do not compile ixgbe_sysfs.c when CONFIG_IXGBE_HWMON is not
    set

Jacob Keller (3):
  ixgbe: ptp code cleanup
  ixgbe: PTP Fix hwtstamp mode settings
  ixgbe: Check PTP Rx timestamps via BPF filter

John Fastabend (1):
  ixgbe: align flow control DV macros with datasheet

 drivers/net/ethernet/intel/e1000e/netdev.c     |   11 +-
 drivers/net/ethernet/intel/e1000e/param.c      |   43 ++++---
 drivers/net/ethernet/intel/ixgbe/Makefile      |    4 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe.h       |    2 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  |   11 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c   |  149 ++++++++++++++++++------
 drivers/net/ethernet/intel/ixgbe/ixgbe_sysfs.c |    2 -
 drivers/net/ethernet/intel/ixgbe/ixgbe_type.h  |   37 +++---
 8 files changed, 180 insertions(+), 79 deletions(-)

-- 
1.7.10.2

^ permalink raw reply

* Re: Regression on TX throughput when using bonding
From: Jean-Michel Hautbois @ 2012-06-14 10:15 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <1339667417.22704.707.camel@edumazet-glaptop>

2012/6/14 Eric Dumazet <eric.dumazet@gmail.com>:
> On Thu, 2012-06-14 at 11:22 +0200, Eric Dumazet wrote:
>
>> So you are saying that if you make skb_orphan_try() doing nothing, it
>> solves your problem ?
>
> It probably does, if your application does an UDP flood, trying to send
> more than the link bandwidth. I guess only benchmarks workloads ever try
> to do that.
>
> bonding has no way to give congestion back, it has no Qdisc by default.
>
> We probably can defer the skb_orphan_try() for bonding master, a bit
> like the IFF_XMIT_DST_RELEASE
>
>  drivers/net/bonding/bond_main.c |    2 +-
>  include/linux/if.h              |    3 +++
>  net/core/dev.c                  |    5 +++--
>  3 files changed, 7 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index 2ee8cf9..1b1e9c8 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -4343,7 +4343,7 @@ static void bond_setup(struct net_device *bond_dev)
>        bond_dev->tx_queue_len = 0;
>        bond_dev->flags |= IFF_MASTER|IFF_MULTICAST;
>        bond_dev->priv_flags |= IFF_BONDING;
> -       bond_dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
> +       bond_dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING | IFF_XMIT_ORPHAN);
>
>        /* At first, we block adding VLANs. That's the only way to
>         * prevent problems that occur when adding VLANs over an
> diff --git a/include/linux/if.h b/include/linux/if.h
> index f995c66..a788e7b 100644
> --- a/include/linux/if.h
> +++ b/include/linux/if.h
> @@ -81,6 +81,9 @@
>  #define IFF_UNICAST_FLT        0x20000         /* Supports unicast filtering   */
>  #define IFF_TEAM_PORT  0x40000         /* device used as team port */
>  #define IFF_SUPP_NOFCS 0x80000         /* device supports sending custom FCS */
> +#define IFF_XMIT_ORPHAN        0x100000        /* dev_hard_start_xmit() is allowed to
> +                                        * orphan skb
> +                                        */
>
>
>  #define IF_GET_IFACE   0x0001          /* for querying only */
> diff --git a/net/core/dev.c b/net/core/dev.c
> index cd09819..3435463 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2193,7 +2193,8 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
>                if (!list_empty(&ptype_all))
>                        dev_queue_xmit_nit(skb, dev);
>
> -               skb_orphan_try(skb);
> +               if (dev->priv_flags & IFF_XMIT_ORPHAN)
> +                       skb_orphan_try(skb);
>
>                features = netif_skb_features(skb);
>
> @@ -5929,7 +5930,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
>        INIT_LIST_HEAD(&dev->napi_list);
>        INIT_LIST_HEAD(&dev->unreg_list);
>        INIT_LIST_HEAD(&dev->link_watch_list);
> -       dev->priv_flags = IFF_XMIT_DST_RELEASE;
> +       dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_ORPHAN;
>        setup(dev);
>
>        dev->num_tx_queues = txqs;
>
>

It works

^ permalink raw reply

* Re: Regression on TX throughput when using bonding
From: Eric Dumazet @ 2012-06-14 10:07 UTC (permalink / raw)
  To: David Miller; +Cc: jhautbois, netdev
In-Reply-To: <20120614.030021.2291563831943273331.davem@davemloft.net>

On Thu, 2012-06-14 at 03:00 -0700, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Thu, 14 Jun 2012 11:50:17 +0200
> 
> > On Thu, 2012-06-14 at 11:22 +0200, Eric Dumazet wrote:
> > 
> >> So you are saying that if you make skb_orphan_try() doing nothing, it
> >> solves your problem ?
> > 
> > It probably does, if your application does an UDP flood, trying to send
> > more than the link bandwidth. I guess only benchmarks workloads ever try
> > to do that.
> 
> Eric, I just want to point out that back when this early orphaning
> idea were being proposed I warned about this, and specifically I
> mentioned that, for datagram sockets, the socket send buffer limits
> are what provide proper rate control and fairness.

If I remember well, the argument was that if workload was using thousand
of sockets, the per socket limitation of in-flight packet would not save
you anyway. We would drop packets.


> It also, therefore, protects the system from one datagram spammer
> being able to essentially take over the network interface and blocking
> out all other users.
> 
> Early orphaning breaks this completely.
> 
> I guess we decided that moving an atomic operation earlier is worth
> all of this?

It was, but with BQL, we should have far less packets in TX rings, so it
might be different today (on BQL enabled NICS only)

> 
> Now we are so addicted to the increased performance from early
> orphaning that I fear we'll never be allowed back into that sane
> state of affairs ever again.


bonding (or other virtual devices) is special in the sense the
dev_hard_start_xmit() is called twice.

We should have a way to properly park packets in Qdiscs, and only do the
orphaning once skb given to real device for 'immediate or so'
transmission.

The pppoe thread is only another manifestation of the same problem.

^ permalink raw reply

* [PATCH 10/10] netfilter: nf_conntrack_l4proto_dccp[4,6] cleanup
From: Gao feng @ 2012-06-14 10:07 UTC (permalink / raw)
  To: pablo; +Cc: netdev, netfilter-devel, Gao feng
In-Reply-To: <1339668445-23848-1-git-send-email-gaofeng@cn.fujitsu.com>

some cleanup of nf_conntrack_l4proto_dccp[4,6],
make codes more clearer and ready for moving the
sysctl code to nf_conntrack_proto_*_sysctl.c to
reduce the ifdef pollution.

and use nf_proto_net.users to identify if it's the first time
we use the nf_proto_net. when it's the first time,we will
initialized it.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
---
 net/netfilter/nf_conntrack_proto_dccp.c |   51 +++++++++++++++++-------------
 1 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index 52da8f0..ef044ef 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -815,16 +815,38 @@ static struct ctl_table dccp_sysctl_table[] = {
 };
 #endif /* CONFIG_SYSCTL */
 
+static int dccp_kmemdup_sysctl_table(struct nf_proto_net *pn)
+{
+#ifdef CONFIG_SYSCTL
+	struct dccp_net *dn = (struct dccp_net *)pn;
+
+	if (pn->ctl_table)
+		return 0;
+
+	pn->ctl_table = kmemdup(dccp_sysctl_table,
+				sizeof(dccp_sysctl_table),
+				GFP_KERNEL);
+	if (!pn->ctl_table)
+		return -ENOMEM;
+
+	pn->ctl_table[0].data = &dn->dccp_timeout[CT_DCCP_REQUEST];
+	pn->ctl_table[1].data = &dn->dccp_timeout[CT_DCCP_RESPOND];
+	pn->ctl_table[2].data = &dn->dccp_timeout[CT_DCCP_PARTOPEN];
+	pn->ctl_table[3].data = &dn->dccp_timeout[CT_DCCP_OPEN];
+	pn->ctl_table[4].data = &dn->dccp_timeout[CT_DCCP_CLOSEREQ];
+	pn->ctl_table[5].data = &dn->dccp_timeout[CT_DCCP_CLOSING];
+	pn->ctl_table[6].data = &dn->dccp_timeout[CT_DCCP_TIMEWAIT];
+	pn->ctl_table[7].data = &dn->dccp_loose;
+#endif
+	return 0;
+}
+
 static int dccp_init_net(struct net *net, u_int16_t proto)
 {
 	struct dccp_net *dn = dccp_pernet(net);
 	struct nf_proto_net *pn = (struct nf_proto_net *)dn;
 
-#ifdef CONFIG_SYSCTL
-	if (!pn->ctl_table) {
-#else
-	if (!pn->users++) {
-#endif
+	if (!pn->users) {
 		/* default values */
 		dn->dccp_loose = 1;
 		dn->dccp_timeout[CT_DCCP_REQUEST]	= 2 * DCCP_MSL;
@@ -834,24 +856,9 @@ static int dccp_init_net(struct net *net, u_int16_t proto)
 		dn->dccp_timeout[CT_DCCP_CLOSEREQ]	= 64 * HZ;
 		dn->dccp_timeout[CT_DCCP_CLOSING]	= 64 * HZ;
 		dn->dccp_timeout[CT_DCCP_TIMEWAIT]	= 2 * DCCP_MSL;
-#ifdef CONFIG_SYSCTL
-		pn->ctl_table = kmemdup(dccp_sysctl_table,
-					sizeof(dccp_sysctl_table),
-					GFP_KERNEL);
-		if (!pn->ctl_table)
-			return -ENOMEM;
-
-		pn->ctl_table[0].data = &dn->dccp_timeout[CT_DCCP_REQUEST];
-		pn->ctl_table[1].data = &dn->dccp_timeout[CT_DCCP_RESPOND];
-		pn->ctl_table[2].data = &dn->dccp_timeout[CT_DCCP_PARTOPEN];
-		pn->ctl_table[3].data = &dn->dccp_timeout[CT_DCCP_OPEN];
-		pn->ctl_table[4].data = &dn->dccp_timeout[CT_DCCP_CLOSEREQ];
-		pn->ctl_table[5].data = &dn->dccp_timeout[CT_DCCP_CLOSING];
-		pn->ctl_table[6].data = &dn->dccp_timeout[CT_DCCP_TIMEWAIT];
-		pn->ctl_table[7].data = &dn->dccp_loose;
-#endif
 	}
-	return 0;
+
+	return dccp_kmemdup_sysctl_table(pn);
 }
 
 static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = {
-- 
1.7.7.6


^ permalink raw reply related

* [PATCH 09/10] netfilter: nf_conntrack_l4proto_generic cleanup
From: Gao feng @ 2012-06-14 10:07 UTC (permalink / raw)
  To: pablo; +Cc: netdev, netfilter-devel, Gao feng
In-Reply-To: <1339668445-23848-1-git-send-email-gaofeng@cn.fujitsu.com>

some cleanup of nf_conntrack_l4proto_generic,
split the code to make it more clearer.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
---
 net/netfilter/nf_conntrack_proto_generic.c |   41 ++++++++++++++++++++++-----
 1 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index d1ed7b4..5e3c0ea 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -135,34 +135,59 @@ static struct ctl_table generic_compat_sysctl_table[] = {
 #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
 #endif /* CONFIG_SYSCTL */
 
-static int generic_init_net(struct net *net, u_int16_t proto)
+static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn)
 {
-	struct nf_generic_net *gn = generic_pernet(net);
-	struct nf_proto_net *pn = (struct nf_proto_net *)gn;
-	gn->timeout = nf_ct_generic_timeout;
 #ifdef CONFIG_SYSCTL
+	struct nf_generic_net *gn = (struct nf_generic_net *)pn;
+
 	pn->ctl_table = kmemdup(generic_sysctl_table,
 				sizeof(generic_sysctl_table),
 				GFP_KERNEL);
 	if (!pn->ctl_table)
 		return -ENOMEM;
+
 	pn->ctl_table[0].data = &gn->timeout;
+#endif
+	return 0;
+}
 
+static int generic_kmemdup_compat_sysctl_table(struct nf_proto_net *pn)
+{
+#ifdef CONFIG_SYSCTL
 #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+	struct nf_generic_net *gn = (struct nf_generic_net *)pn;
+
 	pn->ctl_compat_table = kmemdup(generic_compat_sysctl_table,
 				       sizeof(generic_compat_sysctl_table),
 				       GFP_KERNEL);
-	if (!pn->ctl_compat_table) {
-		kfree(pn->ctl_table);
-		pn->ctl_table = NULL;
+	if (!pn->ctl_compat_table)
 		return -ENOMEM;
-	}
+
 	pn->ctl_compat_table[0].data = &gn->timeout;
 #endif
 #endif
 	return 0;
 }
 
+static int generic_init_net(struct net *net, u_int16_t proto)
+{
+	int ret = 0;
+	struct nf_generic_net *gn = generic_pernet(net);
+	struct nf_proto_net *pn = (struct nf_proto_net *)gn;
+
+	gn->timeout = nf_ct_generic_timeout;
+
+	ret = generic_kmemdup_compat_sysctl_table(pn);
+	if (ret < 0)
+		return ret;
+
+	ret = generic_kmemdup_sysctl_table(pn);
+	if (ret < 0)
+		nf_ct_kfree_compat_sysctl_table(pn);
+
+	return ret;
+}
+
 struct nf_conntrack_l4proto nf_conntrack_l4proto_generic __read_mostly =
 {
 	.l3proto		= PF_UNSPEC,
-- 
1.7.7.6


^ permalink raw reply related

* [PATCH 08/10] netfilter: merge sctpv[4,6]_net_init into sctp_net_init
From: Gao feng @ 2012-06-14 10:07 UTC (permalink / raw)
  To: pablo; +Cc: netdev, netfilter-devel, Gao feng
In-Reply-To: <1339668445-23848-1-git-send-email-gaofeng@cn.fujitsu.com>

merge sctpv4_net_init and sctpv6_net_init into sctp_net_init to
reduce the redundancy codes.

and use nf_proto_net.users to identify if it's the first time
we use the nf_proto_net. when it's the first time,we will
initialized it.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
---
 net/netfilter/nf_conntrack_proto_sctp.c |   60 ++++++++++--------------------
 1 files changed, 20 insertions(+), 40 deletions(-)

diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 1e7836c..49ea3c0 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -707,23 +707,11 @@ static struct ctl_table sctp_compat_sysctl_table[] = {
 #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
 #endif
 
-static void sctp_init_net_data(struct sctp_net *sn)
-{
-	int i;
-#ifdef CONFIG_SYSCTL
-	if (!sn->pn.ctl_table) {
-#else
-	if (!sn->pn.users++) {
-#endif
-		for (i = 0; i < SCTP_CONNTRACK_MAX; i++)
-			sn->timeouts[i] = sctp_timeouts[i];
-	}
-}
-
 static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn)
 {
 #ifdef CONFIG_SYSCTL
 	struct sctp_net *sn = (struct sctp_net *)pn;
+
 	if (pn->ctl_table)
 		return 0;
 
@@ -749,6 +737,7 @@ static int sctp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn)
 #ifdef CONFIG_SYSCTL
 #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
 	struct sctp_net *sn = (struct sctp_net *)pn;
+
 	pn->ctl_compat_table = kmemdup(sctp_compat_sysctl_table,
 				       sizeof(sctp_compat_sysctl_table),
 				       GFP_KERNEL);
@@ -767,41 +756,32 @@ static int sctp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn)
 	return 0;
 }
 
-static int sctpv4_init_net(struct net *net, u_int16_t proto)
+static int sctp_init_net(struct net *net, u_int16_t proto)
 {
-	int ret;
+	int ret = 0;
 	struct sctp_net *sn = sctp_pernet(net);
 	struct nf_proto_net *pn = (struct nf_proto_net *)sn;
 
-	sctp_init_net_data(sn);
-
-	ret = sctp_kmemdup_compat_sysctl_table(pn);
-	if (ret < 0)
-		return ret;
+	if (!sn->pn.users) {
+		int i = 0;
+		for (i = 0; i < SCTP_CONNTRACK_MAX; i++)
+			sn->timeouts[i] = sctp_timeouts[i];
+	}
 
-	ret = sctp_kmemdup_sysctl_table(pn);
+	if (proto == AF_INET) {
+		ret = sctp_kmemdup_compat_sysctl_table(pn);
+		if (ret < 0)
+			return ret;
 
-#ifdef CONFIG_SYSCTL
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-	if (ret < 0) {
+		ret = sctp_kmemdup_sysctl_table(pn);
+		if (ret < 0)
+			nf_ct_kfree_compat_sysctl_table(pn);
+	} else
+		ret = sctp_kmemdup_sysctl_table(pn);
 
-		kfree(pn->ctl_compat_table);
-		pn->ctl_compat_table = NULL;
-	}
-#endif
-#endif
 	return ret;
 }
 
-static int sctpv6_init_net(struct net *net, u_int16_t proto)
-{
-	struct sctp_net *sn = sctp_pernet(net);
-	struct nf_proto_net *pn = (struct nf_proto_net *)sn;
-
-	sctp_init_net_data(sn);
-	return sctp_kmemdup_sysctl_table(pn);
-}
-
 static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
 	.l3proto		= PF_INET,
 	.l4proto 		= IPPROTO_SCTP,
@@ -833,7 +813,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = {
 	},
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
 	.net_id			= &sctp_net_id,
-	.init_net		= sctpv4_init_net,
+	.init_net		= sctp_init_net,
 };
 
 static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
@@ -867,7 +847,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = {
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
 #endif
 	.net_id			= &sctp_net_id,
-	.init_net		= sctpv6_init_net,
+	.init_net		= sctp_init_net,
 };
 
 static int sctp_net_init(struct net *net)
-- 
1.7.7.6


^ permalink raw reply related

* [PATCH 07/10] netfilter: nf_conntrack_l4proto_udplite[4,6] cleanup
From: Gao feng @ 2012-06-14 10:07 UTC (permalink / raw)
  To: pablo; +Cc: netdev, netfilter-devel, Gao feng
In-Reply-To: <1339668445-23848-1-git-send-email-gaofeng@cn.fujitsu.com>

some cleanup for nf_conntrack_l4proto_udplite[4,6],
make codes more clearer and ready for moving the
sysctl code to nf_conntrack_proto_*_sysctl.c to
reduce the ifdef pollution.

and use nf_proto_net.users to identify if it's the first time
we use the nf_proto_net. when it's the first time,we will
initialized it.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
---
 net/netfilter/nf_conntrack_proto_udplite.c |   42 +++++++++++++++++----------
 1 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
index d33e511..f6a789a 100644
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ b/net/netfilter/nf_conntrack_proto_udplite.c
@@ -234,29 +234,39 @@ static struct ctl_table udplite_sysctl_table[] = {
 };
 #endif /* CONFIG_SYSCTL */
 
+static int udplite_kmemdup_sysctl_table(struct nf_proto_net *pn)
+{
+#ifdef CONFIG_SYSCTL
+	struct udplite_net *un = (struct udplite_net *)pn;
+
+	if (pn->ctl_table)
+		return 0;
+
+	pn->ctl_table = kmemdup(udplite_sysctl_table,
+				sizeof(udplite_sysctl_table),
+				GFP_KERNEL);
+	if (!pn->ctl_table)
+		return -ENOMEM;
+
+	pn->ctl_table[0].data = &un->timeouts[UDPLITE_CT_UNREPLIED];
+	pn->ctl_table[1].data = &un->timeouts[UDPLITE_CT_REPLIED];
+#endif
+	return 0;
+}
+
+
 static int udplite_init_net(struct net *net, u_int16_t proto)
 {
-	int i;
 	struct udplite_net *un = udplite_pernet(net);
 	struct nf_proto_net *pn = (struct nf_proto_net *)un;
-#ifdef CONFIG_SYSCTL
-	if (!pn->ctl_table) {
-#else
-	if (!pn->users++) {
-#endif
+
+	if (!pn->users) {
+		int i;
 		for (i = 0 ; i < UDPLITE_CT_MAX; i++)
 			un->timeouts[i] = udplite_timeouts[i];
-#ifdef CONFIG_SYSCTL
-		pn->ctl_table = kmemdup(udplite_sysctl_table,
-					sizeof(udplite_sysctl_table),
-					GFP_KERNEL);
-		if (!pn->ctl_table)
-			return -ENOMEM;
-		pn->ctl_table[0].data = &un->timeouts[UDPLITE_CT_UNREPLIED];
-		pn->ctl_table[1].data = &un->timeouts[UDPLITE_CT_REPLIED];
-#endif
 	}
-	return 0;
+
+	return udplite_kmemdup_sysctl_table(pn);
 }
 
 static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
-- 
1.7.7.6


^ permalink raw reply related

* [PATCH 06/10] merge udpv[4,6]_net_init into udp_net_init
From: Gao feng @ 2012-06-14 10:07 UTC (permalink / raw)
  To: pablo; +Cc: netdev, netfilter-devel, Gao feng
In-Reply-To: <1339668445-23848-1-git-send-email-gaofeng@cn.fujitsu.com>

merge udpv4_net_init and udpv6_net_init into udp_net_init to
reduce the redundancy codes.

and use nf_proto_net.users to identify if it's the first time
we use the nf_proto_net. when it's the first time,we will
initialized it.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
---
 net/netfilter/nf_conntrack_proto_udp.c |   60 ++++++++++++--------------------
 1 files changed, 22 insertions(+), 38 deletions(-)

diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 2b978e6..480126e 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -239,13 +239,16 @@ static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn)
 {
 #ifdef CONFIG_SYSCTL
 	struct nf_udp_net *un = (struct nf_udp_net *)pn;
+
 	if (pn->ctl_table)
 		return 0;
+
 	pn->ctl_table = kmemdup(udp_sysctl_table,
 				sizeof(udp_sysctl_table),
 				GFP_KERNEL);
 	if (!pn->ctl_table)
 		return -ENOMEM;
+
 	pn->ctl_table[0].data = &un->timeouts[UDP_CT_UNREPLIED];
 	pn->ctl_table[1].data = &un->timeouts[UDP_CT_REPLIED];
 #endif
@@ -257,6 +260,7 @@ static int udp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn)
 #ifdef CONFIG_SYSCTL
 #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
 	struct nf_udp_net *un = (struct nf_udp_net *)pn;
+
 	pn->ctl_compat_table = kmemdup(udp_compat_sysctl_table,
 				       sizeof(udp_compat_sysctl_table),
 				       GFP_KERNEL);
@@ -270,52 +274,32 @@ static int udp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn)
 	return 0;
 }
 
-static void udp_init_net_data(struct nf_udp_net *un)
+static int udp_init_net(struct net *net, u_int16_t proto)
 {
-	int i;
-#ifdef CONFIG_SYSCTL
-	if (!un->pn.ctl_table) {
-#else
-	if (!un->pn.users++) {
-#endif
+	int ret = 0;
+	struct nf_udp_net *un = udp_pernet(net);
+	struct nf_proto_net *pn = (struct nf_proto_net *)un;
+
+	if (pn->users) {
+		int i = 0;
 		for (i = 0; i < UDP_CT_MAX; i++)
 			un->timeouts[i] = udp_timeouts[i];
 	}
-}
 
-static int udpv4_init_net(struct net *net, u_int16_t proto)
-{
-	int ret;
-	struct nf_udp_net *un = udp_pernet(net);
-	struct nf_proto_net *pn = (struct nf_proto_net *)un;
+	if (proto == AF_INET) {
+		ret = udp_kmemdup_compat_sysctl_table(pn);
+		if (ret < 0)
+			return ret;
 
-	udp_init_net_data(un);
+		ret = udp_kmemdup_sysctl_table(pn);
+		if (ret < 0)
+			nf_ct_kfree_compat_sysctl_table(pn);
+	} else
+		ret = udp_kmemdup_sysctl_table(pn);
 
-	ret = udp_kmemdup_compat_sysctl_table(pn);
-	if (ret < 0)
-		return ret;
-
-	ret = udp_kmemdup_sysctl_table(pn);
-#ifdef CONFIG_SYSCTL
-#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
-	if (ret < 0) {
-		kfree(pn->ctl_compat_table);
-		pn->ctl_compat_table = NULL;
-	}
-#endif
-#endif
 	return ret;
 }
 
-static int udpv6_init_net(struct net *net, u_int16_t proto)
-{
-	struct nf_udp_net *un = udp_pernet(net);
-	struct nf_proto_net *pn = (struct nf_proto_net *)un;
-
-	udp_init_net_data(un);
-	return udp_kmemdup_sysctl_table(pn);
-}
-
 struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly =
 {
 	.l3proto		= PF_INET,
@@ -343,7 +327,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly =
 		.nla_policy	= udp_timeout_nla_policy,
 	},
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-	.init_net		= udpv4_init_net,
+	.init_net		= udp_init_net,
 };
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4);
 
@@ -374,6 +358,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly =
 		.nla_policy	= udp_timeout_nla_policy,
 	},
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-	.init_net		= udpv6_init_net,
+	.init_net		= udp_init_net,
 };
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6);
-- 
1.7.7.6


^ permalink raw reply related

* [PATCH 03/10] netfilter: add nf_ct_kfree_compat_sysctl_table to make codes clear
From: Gao feng @ 2012-06-14 10:07 UTC (permalink / raw)
  To: pablo; +Cc: netdev, netfilter-devel, Gao feng
In-Reply-To: <1339668445-23848-1-git-send-email-gaofeng@cn.fujitsu.com>

add function nf_ct_kfree_compat_sysctl_table to kfree l4proto's
compat sysctl table and set the sysctl table point to NULL.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
---
 include/net/netfilter/nf_conntrack_l4proto.h |    8 ++++++++
 net/netfilter/nf_conntrack_proto.c           |    4 +---
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index 5dd60f2..889b717 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -132,6 +132,14 @@ extern int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
 extern int nf_ct_port_nlattr_tuple_size(void);
 extern const struct nla_policy nf_ct_port_nla_policy[];
 
+static inline void nf_ct_kfree_compat_sysctl_table(struct nf_proto_net *pn)
+{
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
+	kfree(pn->ctl_compat_table);
+	pn->ctl_compat_table = NULL;
+#endif
+}
+
 #ifdef CONFIG_SYSCTL
 #ifdef DEBUG_INVALID_PACKETS
 #define LOG_INVALID(net, proto)				\
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 8fc0332..c9df1b4 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -361,9 +361,7 @@ int nf_ct_l4proto_register_sysctl(struct net *net,
 					    NULL);
 		if (err == 0)
 			goto out;
-
-		kfree(pn->ctl_compat_table);
-		pn->ctl_compat_table = NULL;
+		nf_ct_kfree_compat_sysctl_table(pn);
 		nf_ct_unregister_sysctl(&pn->ctl_table_header,
 					&pn->ctl_table,
 					&pn->users);
-- 
1.7.7.6


^ permalink raw reply related

* [PATCH 02/10] netfilter: add parameter proto for l4proto.init_net
From: Gao feng @ 2012-06-14 10:07 UTC (permalink / raw)
  To: pablo; +Cc: netdev, netfilter-devel, Gao feng
In-Reply-To: <1339668445-23848-1-git-send-email-gaofeng@cn.fujitsu.com>

there are redundancy codes in l4proto's init_net functions.
we can use one init_net function and l3proto to impletment
the same thing.

So we should add l3proto as a parameter for init_net function.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
---
 include/net/netfilter/nf_conntrack_l4proto.h   |    2 +-
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c   |    2 +-
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c |    2 +-
 net/netfilter/nf_conntrack_proto.c             |    6 ++++--
 net/netfilter/nf_conntrack_proto_dccp.c        |    2 +-
 net/netfilter/nf_conntrack_proto_generic.c     |    2 +-
 net/netfilter/nf_conntrack_proto_gre.c         |    2 +-
 net/netfilter/nf_conntrack_proto_sctp.c        |    4 ++--
 net/netfilter/nf_conntrack_proto_tcp.c         |    4 ++--
 net/netfilter/nf_conntrack_proto_udp.c         |    4 ++--
 net/netfilter/nf_conntrack_proto_udplite.c     |    2 +-
 11 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index 81c52b5..5dd60f2 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -97,7 +97,7 @@ struct nf_conntrack_l4proto {
 #endif
 	int	*net_id;
 	/* Init l4proto pernet data */
-	int (*init_net)(struct net *net);
+	int (*init_net)(struct net *net, u_int16_t proto);
 
 	/* Protocol name */
 	const char *name;
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 041923c..76f7a2f 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -337,7 +337,7 @@ static struct ctl_table icmp_compat_sysctl_table[] = {
 #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
 #endif /* CONFIG_SYSCTL */
 
-static int icmp_init_net(struct net *net)
+static int icmp_init_net(struct net *net, u_int16_t proto)
 {
 	struct nf_icmp_net *in = icmp_pernet(net);
 	struct nf_proto_net *pn = (struct nf_proto_net *)in;
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 63ed012..807ae09 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -333,7 +333,7 @@ static struct ctl_table icmpv6_sysctl_table[] = {
 };
 #endif /* CONFIG_SYSCTL */
 
-static int icmpv6_init_net(struct net *net)
+static int icmpv6_init_net(struct net *net, u_int16_t proto)
 {
 	struct nf_icmp_net *in = icmpv6_pernet(net);
 	struct nf_proto_net *pn = (struct nf_proto_net *)in;
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index a434dd7..8fc0332 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -193,6 +193,7 @@ static int nf_ct_l3proto_register_sysctl(struct net *net,
 					    l3proto->ctl_table_path,
 					    in->ctl_table,
 					    NULL);
+
 		if (err < 0) {
 			kfree(in->ctl_table);
 			in->ctl_table = NULL;
@@ -460,7 +461,7 @@ int nf_conntrack_l4proto_register(struct net *net,
 {
 	int ret = 0;
 	if (l4proto->init_net) {
-		ret = l4proto->init_net(net);
+		ret = l4proto->init_net(net, l4proto->l3proto);
 		if (ret < 0)
 			return ret;
 	}
@@ -514,7 +515,8 @@ int nf_conntrack_proto_init(struct net *net)
 {
 	unsigned int i;
 	int err;
-	err = nf_conntrack_l4proto_generic.init_net(net);
+	err = nf_conntrack_l4proto_generic.init_net(net,
+						    nf_conntrack_l4proto_generic.l3proto);
 	if (err < 0)
 		return err;
 	err = nf_ct_l4proto_register_sysctl(net,
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index c33f76a..52da8f0 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -815,7 +815,7 @@ static struct ctl_table dccp_sysctl_table[] = {
 };
 #endif /* CONFIG_SYSCTL */
 
-static int dccp_init_net(struct net *net)
+static int dccp_init_net(struct net *net, u_int16_t proto)
 {
 	struct dccp_net *dn = dccp_pernet(net);
 	struct nf_proto_net *pn = (struct nf_proto_net *)dn;
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index bb0e74f..d1ed7b4 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -135,7 +135,7 @@ static struct ctl_table generic_compat_sysctl_table[] = {
 #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
 #endif /* CONFIG_SYSCTL */
 
-static int generic_init_net(struct net *net)
+static int generic_init_net(struct net *net, u_int16_t proto)
 {
 	struct nf_generic_net *gn = generic_pernet(net);
 	struct nf_proto_net *pn = (struct nf_proto_net *)gn;
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 25ba5a2..851b93b 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -348,7 +348,7 @@ gre_timeout_nla_policy[CTA_TIMEOUT_GRE_MAX+1] = {
 };
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
 
-static int gre_init_net(struct net *net)
+static int gre_init_net(struct net *net, u_int16_t proto)
 {
 	struct netns_proto_gre *net_gre = gre_pernet(net);
 	int i;
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 8fb0582..1e7836c 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -767,7 +767,7 @@ static int sctp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn)
 	return 0;
 }
 
-static int sctpv4_init_net(struct net *net)
+static int sctpv4_init_net(struct net *net, u_int16_t proto)
 {
 	int ret;
 	struct sctp_net *sn = sctp_pernet(net);
@@ -793,7 +793,7 @@ static int sctpv4_init_net(struct net *net)
 	return ret;
 }
 
-static int sctpv6_init_net(struct net *net)
+static int sctpv6_init_net(struct net *net, u_int16_t proto)
 {
 	struct sctp_net *sn = sctp_pernet(net);
 	struct nf_proto_net *pn = (struct nf_proto_net *)sn;
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 99caa13..6db9d3c 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1593,7 +1593,7 @@ static int tcp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn)
 	return 0;
 }
 
-static int tcpv4_init_net(struct net *net)
+static int tcpv4_init_net(struct net *net, u_int16_t proto)
 {
 	int i;
 	int ret = 0;
@@ -1631,7 +1631,7 @@ static int tcpv4_init_net(struct net *net)
 	return ret;
 }
 
-static int tcpv6_init_net(struct net *net)
+static int tcpv6_init_net(struct net *net, u_int16_t proto)
 {
 	int i;
 	struct nf_tcp_net *tn = tcp_pernet(net);
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index a83cf93..2b978e6 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -283,7 +283,7 @@ static void udp_init_net_data(struct nf_udp_net *un)
 	}
 }
 
-static int udpv4_init_net(struct net *net)
+static int udpv4_init_net(struct net *net, u_int16_t proto)
 {
 	int ret;
 	struct nf_udp_net *un = udp_pernet(net);
@@ -307,7 +307,7 @@ static int udpv4_init_net(struct net *net)
 	return ret;
 }
 
-static int udpv6_init_net(struct net *net)
+static int udpv6_init_net(struct net *net, u_int16_t proto)
 {
 	struct nf_udp_net *un = udp_pernet(net);
 	struct nf_proto_net *pn = (struct nf_proto_net *)un;
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
index b32e700..d33e511 100644
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ b/net/netfilter/nf_conntrack_proto_udplite.c
@@ -234,7 +234,7 @@ static struct ctl_table udplite_sysctl_table[] = {
 };
 #endif /* CONFIG_SYSCTL */
 
-static int udplite_init_net(struct net *net)
+static int udplite_init_net(struct net *net, u_int16_t proto)
 {
 	int i;
 	struct udplite_net *un = udplite_pernet(net);
-- 
1.7.7.6


^ permalink raw reply related

* [PATCH 01/10] netfilter: fix problem with proto register
From: Gao feng @ 2012-06-14 10:07 UTC (permalink / raw)
  To: pablo; +Cc: netdev, netfilter-devel, Gao feng

commit 2c352f444ccfa966a1aa4fd8e9ee29381c467448
(netfilter: nf_conntrack: prepare namespace support for
l4 protocol trackers) register proto before register sysctl.

it changes the behavior that when register sysctl failed, the
proto should not be registered too.

so change to register sysctl before register protos.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
---
 net/netfilter/nf_conntrack_proto.c |   37 ++++++++++++++++++++++-------------
 1 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 1ea9194..a434dd7 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -253,18 +253,23 @@ int nf_conntrack_l3proto_register(struct net *net,
 {
 	int ret = 0;
 
-	if (net == &init_net)
-		ret = nf_conntrack_l3proto_register_net(proto);
+	if (proto->init_net) {
+		ret = proto->init_net(net);
+		if (ret < 0)
+			return ret;
+	}
 
+	ret = nf_ct_l3proto_register_sysctl(net, proto);
 	if (ret < 0)
 		return ret;
 
-	if (proto->init_net) {
-		ret = proto->init_net(net);
+	if (net == &init_net) {
+		ret = nf_conntrack_l3proto_register_net(proto);
 		if (ret < 0)
-			return ret;
+			nf_ct_l3proto_unregister_sysctl(net, proto);
 	}
-	return nf_ct_l3proto_register_sysctl(net, proto);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_register);
 
@@ -454,19 +459,23 @@ int nf_conntrack_l4proto_register(struct net *net,
 				  struct nf_conntrack_l4proto *l4proto)
 {
 	int ret = 0;
-	if (net == &init_net)
-		ret = nf_conntrack_l4proto_register_net(l4proto);
-
-	if (ret < 0)
-		return ret;
-
-	if (l4proto->init_net)
+	if (l4proto->init_net) {
 		ret = l4proto->init_net(net);
+		if (ret < 0)
+			return ret;
+	}
 
+	ret = nf_ct_l4proto_register_sysctl(net, l4proto);
 	if (ret < 0)
 		return ret;
 
-	return nf_ct_l4proto_register_sysctl(net, l4proto);
+	if (net == &init_net) {
+		ret = nf_conntrack_l4proto_register_net(l4proto);
+		if (ret < 0)
+			nf_ct_l4proto_unregister_sysctl(net, l4proto);
+	}
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_register);
 
-- 
1.7.7.6


^ permalink raw reply related

* Re: Regression on TX throughput when using bonding
From: David Miller @ 2012-06-14 10:00 UTC (permalink / raw)
  To: eric.dumazet; +Cc: jhautbois, netdev
In-Reply-To: <1339667417.22704.707.camel@edumazet-glaptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 14 Jun 2012 11:50:17 +0200

> On Thu, 2012-06-14 at 11:22 +0200, Eric Dumazet wrote:
> 
>> So you are saying that if you make skb_orphan_try() doing nothing, it
>> solves your problem ?
> 
> It probably does, if your application does an UDP flood, trying to send
> more than the link bandwidth. I guess only benchmarks workloads ever try
> to do that.

Eric, I just want to point out that back when this early orphaning
idea were being proposed I warned about this, and specifically I
mentioned that, for datagram sockets, the socket send buffer limits
are what provide proper rate control and fairness.

It also, therefore, protects the system from one datagram spammer
being able to essentially take over the network interface and blocking
out all other users.

Early orphaning breaks this completely.

I guess we decided that moving an atomic operation earlier is worth
all of this?

Now we are so addicted to the increased performance from early
orphaning that I fear we'll never be allowed back into that sane
state of affairs ever again.

^ permalink raw reply

* Re: Regression on TX throughput when using bonding
From: Eric Dumazet @ 2012-06-14  9:50 UTC (permalink / raw)
  To: Jean-Michel Hautbois; +Cc: netdev
In-Reply-To: <1339665719.22704.692.camel@edumazet-glaptop>

On Thu, 2012-06-14 at 11:22 +0200, Eric Dumazet wrote:

> So you are saying that if you make skb_orphan_try() doing nothing, it
> solves your problem ?

It probably does, if your application does an UDP flood, trying to send
more than the link bandwidth. I guess only benchmarks workloads ever try
to do that.

bonding has no way to give congestion back, it has no Qdisc by default.

We probably can defer the skb_orphan_try() for bonding master, a bit
like the IFF_XMIT_DST_RELEASE 

 drivers/net/bonding/bond_main.c |    2 +-
 include/linux/if.h              |    3 +++
 net/core/dev.c                  |    5 +++--
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 2ee8cf9..1b1e9c8 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4343,7 +4343,7 @@ static void bond_setup(struct net_device *bond_dev)
 	bond_dev->tx_queue_len = 0;
 	bond_dev->flags |= IFF_MASTER|IFF_MULTICAST;
 	bond_dev->priv_flags |= IFF_BONDING;
-	bond_dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
+	bond_dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING | IFF_XMIT_ORPHAN);
 
 	/* At first, we block adding VLANs. That's the only way to
 	 * prevent problems that occur when adding VLANs over an
diff --git a/include/linux/if.h b/include/linux/if.h
index f995c66..a788e7b 100644
--- a/include/linux/if.h
+++ b/include/linux/if.h
@@ -81,6 +81,9 @@
 #define IFF_UNICAST_FLT	0x20000		/* Supports unicast filtering	*/
 #define IFF_TEAM_PORT	0x40000		/* device used as team port */
 #define IFF_SUPP_NOFCS	0x80000		/* device supports sending custom FCS */
+#define IFF_XMIT_ORPHAN	0x100000	/* dev_hard_start_xmit() is allowed to
+					 * orphan skb
+					 */
 
 
 #define IF_GET_IFACE	0x0001		/* for querying only */
diff --git a/net/core/dev.c b/net/core/dev.c
index cd09819..3435463 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2193,7 +2193,8 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 		if (!list_empty(&ptype_all))
 			dev_queue_xmit_nit(skb, dev);
 
-		skb_orphan_try(skb);
+		if (dev->priv_flags & IFF_XMIT_ORPHAN)
+			skb_orphan_try(skb);
 
 		features = netif_skb_features(skb);
 
@@ -5929,7 +5930,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	INIT_LIST_HEAD(&dev->napi_list);
 	INIT_LIST_HEAD(&dev->unreg_list);
 	INIT_LIST_HEAD(&dev->link_watch_list);
-	dev->priv_flags = IFF_XMIT_DST_RELEASE;
+	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_ORPHAN;
 	setup(dev);
 
 	dev->num_tx_queues = txqs;

^ permalink raw reply related

* Re: Regression on TX throughput when using bonding
From: Jean-Michel Hautbois @ 2012-06-14  9:40 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <1339665719.22704.692.camel@edumazet-glaptop>

2012/6/14 Eric Dumazet <eric.dumazet@gmail.com>:
> On Thu, 2012-06-14 at 10:58 +0200, Jean-Michel Hautbois wrote:
>> Hi all,
>>
>> I have bisected a regression which concerns TX throughput when using bonding.
>> I tested only with 10Gbps cards, as it appears when bandwidth need is
>> over 1Gbps on my machine.
>> I send UDP multicast packets over bonding and observe the tc result.
>>
>> When KO :
>> $>tc -s -d qdisc show dev eth1
>> qdisc pfifo_fast 0: root refcnt 2 bands 3 priomap  1 2 2 2 1 2 0 0 1 1
>> 1 1 1 1 1 1
>>  Sent 1106527591 bytes 273802 pkt (dropped 306419, overlimits 0 requeues 223)
>>  backlog 0b 0p requeues 223
>>
>> Ok course, when OK, dropped is 0.
>> $>tc -s -d qdisc show dev eth1
>> qdisc pfifo_fast 0: root refcnt 2 bands 3 priomap  1 2 2 2 1 2 0 0 1 1
>> 1 1 1 1 1 1
>>  Sent 1648662087 bytes 408009 pkt (dropped 0, overlimits 0 requeues 0)
>>  backlog 0b 0p requeues 0
>>
>>
>> Here is the incriminated commit:
>>
>> fc6055a5ba31e2c14e36e8939f9bf2b6d586a7f5 is the first bad commit
>> commit fc6055a5ba31e2c14e36e8939f9bf2b6d586a7f5
>
>>     net: Introduce skb_orphan_try()
>
> So you are saying that if you make skb_orphan_try() doing nothing, it
> solves your problem ?
>
> diff --git a/net/core/dev.c b/net/core/dev.c
> index cd09819..6df40dd 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2096,6 +2096,7 @@ static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
>  */
>  static inline void skb_orphan_try(struct sk_buff *skb)
>  {
> +#if 0
>        struct sock *sk = skb->sk;
>
>        if (sk && !skb_shinfo(skb)->tx_flags) {
> @@ -2106,6 +2107,7 @@ static inline void skb_orphan_try(struct sk_buff *skb)
>                        skb->rxhash = sk->sk_hash;
>                skb_orphan(skb);
>        }
> +#endif
>  }
>
>  static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
>

Yes :

[09:44:46]root@debian:~# uname -r
3.2.0
[09:44:50]root@debian:~# tc -s -d qdisc show dev eth1
qdisc mq 0: root
 Sent 11786861758 bytes 2916943 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0

JM

^ permalink raw reply

* Re: [PATCH 2/5] drivers/net/ethernet/dec/tulip: Use standard __set_bit_le() function
From: Akinobu Mita @ 2012-06-14  9:36 UTC (permalink / raw)
  To: Takuya Yoshikawa
  Cc: Grant Grundler, Takuya Yoshikawa, akpm, bhutchings, grundler,
	arnd, benh, avi, mtosatti, linux-net-drivers, netdev,
	linux-kernel, linux-arch, kvm
In-Reply-To: <20120614072831.63845f6ddf024ece28e49f5e@gmail.com>

2012/6/14 Takuya Yoshikawa <takuya.yoshikawa@gmail.com>:
> On Wed, 13 Jun 2012 08:21:18 -0700
> Grant Grundler <grantgrundler@gmail.com> wrote:
>
>> >> >> Should this hash_table be converted from u16 hash_table[32] to
>> >> >> DECLARE_BITMAP(hash_table, 16 * 32) to ensure that it is aligned
>> >> >> on long-word boundary?
>> >> >
>> >> > I think hash_table is already long-word aligned because it is placed
>> >> > right after a pointer.
>> >>
>> >> I recommend converting to proper bitmap.  Because such an implicit
>> >> assumption is easily broken by someone touching this function.
>> >
>> > Do you mean something like:
>> >        DECLARE_BITMAP(__hash_table, 16 * 32);
>> >        u16 *hash_table = (u16 *)__hash_table;
>> > ?
>> >
>> > Grant, what do you think about this?
>>
>> Hi Takuya,
>> two thoughts:
>> 1) while I agree with Akinobu and thank him for pointing out a
>> _potential_ alignment problem, this is a separate issue and your
>> existing patch should go in anyway. There are probably other drivers
>> with _potential_ alignment issues. Akinobu could get credit for
>> finding them by submitting patches after reviewing calls to set_bit
>> and set_bit_le() - similar to what you are doing now.
>
> I prefer approach 1.
>
> hash_table is local in build_setup_frame_hash(), so if further
> improvement is also required, we can do that locally there later.

This potential alignment problem is introduced by this patch.  Because
the original set_bit_le() in tulip driver can handle unaligned bitmap.
This is why I recommended it should be fixed in this patch.

But please just ignore me if I'm too much paranoid.  And I'll handle
this issue if no one wants to do it.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox