Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 1/2] bnx2x: add CSUM and TSO support for encapsulation protocols
From: Dmitry Kravkov @ 2013-03-18 16:51 UTC (permalink / raw)
  To: davem, netdev; +Cc: Dmitry Kravkov, Eilon Greenstein

The patch utilizes FW offload capabilities for
encapsulation protocols.

Signed-off-by: Dmitry Kravkov <dmitry@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x.h      |   29 ++--
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c  |  196 ++++++++++++++++++++--
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c |    7 +
 3 files changed, 204 insertions(+), 28 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
index 9e8d195..a4729c7 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
@@ -612,9 +612,10 @@ struct bnx2x_fastpath {
  * START_BD		- describes packed
  * START_BD(splitted)	- includes unpaged data segment for GSO
  * PARSING_BD		- for TSO and CSUM data
+ * PARSING_BD2		- for encapsulation data
  * Frag BDs		- decribes pages for frags
  */
-#define BDS_PER_TX_PKT		3
+#define BDS_PER_TX_PKT		4
 #define MAX_BDS_PER_TX_PKT	(MAX_SKB_FRAGS + BDS_PER_TX_PKT)
 /* max BDs per tx packet including next pages */
 #define MAX_DESC_PER_TX_PKT	(MAX_BDS_PER_TX_PKT + \
@@ -731,16 +732,22 @@ struct bnx2x_fastpath {
 
 #define pbd_tcp_flags(tcp_hdr)	(ntohl(tcp_flag_word(tcp_hdr))>>16 & 0xff)
 
-#define XMIT_PLAIN			0
-#define XMIT_CSUM_V4			0x1
-#define XMIT_CSUM_V6			0x2
-#define XMIT_CSUM_TCP			0x4
-#define XMIT_GSO_V4			0x8
-#define XMIT_GSO_V6			0x10
-
-#define XMIT_CSUM			(XMIT_CSUM_V4 | XMIT_CSUM_V6)
-#define XMIT_GSO			(XMIT_GSO_V4 | XMIT_GSO_V6)
-
+#define XMIT_PLAIN		0
+#define XMIT_CSUM_V4		(1 << 0)
+#define XMIT_CSUM_V6		(1 << 1)
+#define XMIT_CSUM_TCP		(1 << 2)
+#define XMIT_GSO_V4		(1 << 3)
+#define XMIT_GSO_V6		(1 << 4)
+#define XMIT_CSUM_ENC_V4	(1 << 5)
+#define XMIT_CSUM_ENC_V6	(1 << 6)
+#define XMIT_GSO_ENC_V4		(1 << 7)
+#define XMIT_GSO_ENC_V6		(1 << 8)
+
+#define XMIT_CSUM_ENC		(XMIT_CSUM_ENC_V4 | XMIT_CSUM_ENC_V6)
+#define XMIT_GSO_ENC		(XMIT_GSO_ENC_V4 | XMIT_GSO_ENC_V6)
+
+#define XMIT_CSUM		(XMIT_CSUM_V4 | XMIT_CSUM_V6 | XMIT_CSUM_ENC)
+#define XMIT_GSO		(XMIT_GSO_V4 | XMIT_GSO_V6 | XMIT_GSO_ENC)
 
 /* stuff added to make the code fit 80Col */
 #define CQE_TYPE(cqe_fp_flags)	 ((cqe_fp_flags) & ETH_FAST_PATH_RX_CQE_TYPE)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 9f7a379..8091de7 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -3148,27 +3148,44 @@ static __le16 bnx2x_csum_fix(unsigned char *t_header, u16 csum, s8 fix)
 static u32 bnx2x_xmit_type(struct bnx2x *bp, struct sk_buff *skb)
 {
 	u32 rc;
+	__u8 prot = 0;
+	__be16 protocol;
 
 	if (skb->ip_summed != CHECKSUM_PARTIAL)
-		rc = XMIT_PLAIN;
+		return XMIT_PLAIN;
 
-	else {
-		if (vlan_get_protocol(skb) == htons(ETH_P_IPV6)) {
-			rc = XMIT_CSUM_V6;
-			if (ipv6_hdr(skb)->nexthdr == IPPROTO_TCP)
-				rc |= XMIT_CSUM_TCP;
+	protocol = vlan_get_protocol(skb);
+	if (protocol == htons(ETH_P_IPV6)) {
+		rc = XMIT_CSUM_V6;
+		prot = ipv6_hdr(skb)->nexthdr;
+	} else {
+		rc = XMIT_CSUM_V4;
+		prot = ip_hdr(skb)->protocol;
+	}
 
+	if (!CHIP_IS_E1x(bp) && skb->encapsulation) {
+		if (inner_ip_hdr(skb)->version == 6) {
+			rc |= XMIT_CSUM_ENC_V6;
+			if (inner_ipv6_hdr(skb)->nexthdr == IPPROTO_TCP)
+				rc |= XMIT_CSUM_TCP;
 		} else {
-			rc = XMIT_CSUM_V4;
-			if (ip_hdr(skb)->protocol == IPPROTO_TCP)
+			rc |= XMIT_CSUM_ENC_V4;
+			if (inner_ip_hdr(skb)->protocol == IPPROTO_TCP)
 				rc |= XMIT_CSUM_TCP;
 		}
 	}
+	if (prot == IPPROTO_TCP)
+		rc |= XMIT_CSUM_TCP;
 
-	if (skb_is_gso_v6(skb))
-		rc |= XMIT_GSO_V6 | XMIT_CSUM_TCP | XMIT_CSUM_V6;
-	else if (skb_is_gso(skb))
-		rc |= XMIT_GSO_V4 | XMIT_CSUM_V4 | XMIT_CSUM_TCP;
+	if (skb_is_gso_v6(skb)) {
+		rc |= (XMIT_GSO_V6 | XMIT_CSUM_TCP | XMIT_CSUM_V6);
+		if (rc & XMIT_CSUM_ENC)
+			rc |= XMIT_GSO_ENC_V6;
+	} else if (skb_is_gso(skb)) {
+		rc |= (XMIT_GSO_V4 | XMIT_CSUM_V4 | XMIT_CSUM_TCP);
+		if (rc & XMIT_CSUM_ENC)
+			rc |= XMIT_GSO_ENC_V4;
+	}
 
 	return rc;
 }
@@ -3256,11 +3273,20 @@ exit_lbl:
 static void bnx2x_set_pbd_gso_e2(struct sk_buff *skb, u32 *parsing_data,
 				 u32 xmit_type)
 {
+	struct ipv6hdr *ipv6;
+
 	*parsing_data |= (skb_shinfo(skb)->gso_size <<
 			      ETH_TX_PARSE_BD_E2_LSO_MSS_SHIFT) &
 			      ETH_TX_PARSE_BD_E2_LSO_MSS;
-	if ((xmit_type & XMIT_GSO_V6) &&
-	    (ipv6_hdr(skb)->nexthdr == NEXTHDR_IPV6))
+
+	if (xmit_type & XMIT_GSO_ENC_V6)
+		ipv6 = inner_ipv6_hdr(skb);
+	else if (xmit_type & XMIT_GSO_V6)
+		ipv6 = ipv6_hdr(skb);
+	else
+		ipv6 = NULL;
+
+	if (ipv6 && ipv6->nexthdr == NEXTHDR_IPV6)
 		*parsing_data |= ETH_TX_PARSE_BD_E2_IPV6_WITH_EXT_HDR;
 }
 
@@ -3297,6 +3323,40 @@ static void bnx2x_set_pbd_gso(struct sk_buff *skb,
 }
 
 /**
+ * bnx2x_set_pbd_csum_enc - update PBD with checksum and return header length
+ *
+ * @bp:			driver handle
+ * @skb:		packet skb
+ * @parsing_data:	data to be updated
+ * @xmit_type:		xmit flags
+ *
+ * 57712/578xx related, when skb has encapsulation
+ */
+static u8 bnx2x_set_pbd_csum_enc(struct bnx2x *bp, struct sk_buff *skb,
+				 u32 *parsing_data, u32 xmit_type)
+{
+	*parsing_data |=
+		((((u8 *)skb_inner_transport_header(skb) - skb->data) >> 1) <<
+		ETH_TX_PARSE_BD_E2_L4_HDR_START_OFFSET_W_SHIFT) &
+		ETH_TX_PARSE_BD_E2_L4_HDR_START_OFFSET_W;
+
+	if (xmit_type & XMIT_CSUM_TCP) {
+		*parsing_data |= ((inner_tcp_hdrlen(skb) / 4) <<
+			ETH_TX_PARSE_BD_E2_TCP_HDR_LENGTH_DW_SHIFT) &
+			ETH_TX_PARSE_BD_E2_TCP_HDR_LENGTH_DW;
+
+		return skb_inner_transport_header(skb) +
+			inner_tcp_hdrlen(skb) - skb->data;
+	}
+
+	/* We support checksum offload for TCP and UDP only.
+	 * No need to pass the UDP header length - it's a constant.
+	 */
+	return skb_inner_transport_header(skb) +
+		sizeof(struct udphdr) - skb->data;
+}
+
+/**
  * bnx2x_set_pbd_csum_e2 - update PBD with checksum and return header length
  *
  * @bp:			driver handle
@@ -3327,13 +3387,14 @@ static u8 bnx2x_set_pbd_csum_e2(struct bnx2x *bp, struct sk_buff *skb,
 	return skb_transport_header(skb) + sizeof(struct udphdr) - skb->data;
 }
 
+/* set FW indication according to inner or outer protocols if tunneled */
 static void bnx2x_set_sbd_csum(struct bnx2x *bp, struct sk_buff *skb,
 			       struct eth_tx_start_bd *tx_start_bd,
 			       u32 xmit_type)
 {
 	tx_start_bd->bd_flags.as_bitfield |= ETH_TX_BD_FLAGS_L4_CSUM;
 
-	if (xmit_type & XMIT_CSUM_V6)
+	if (xmit_type & (XMIT_CSUM_ENC_V6 | XMIT_CSUM_V6))
 		tx_start_bd->bd_flags.as_bitfield |= ETH_TX_BD_FLAGS_IPV6;
 
 	if (!(xmit_type & XMIT_CSUM_TCP))
@@ -3396,6 +3457,72 @@ static u8 bnx2x_set_pbd_csum(struct bnx2x *bp, struct sk_buff *skb,
 	return hlen;
 }
 
+static void bnx2x_update_pbds_gso_enc(struct sk_buff *skb,
+				      struct eth_tx_parse_bd_e2 *pbd_e2,
+				      struct eth_tx_parse_2nd_bd *pbd2,
+				      u16 *global_data,
+				      u32 xmit_type)
+{
+	u16 inner_hlen_w = 0;
+	u8 outerip_off, outerip_len = 0;
+
+	/* IP len */
+	inner_hlen_w = (skb_inner_transport_header(skb) -
+			skb_inner_network_header(skb)) >> 1;
+
+	/* transport len */
+	if (xmit_type & XMIT_CSUM_TCP)
+		inner_hlen_w += inner_tcp_hdrlen(skb) >> 1;
+	else
+		inner_hlen_w += sizeof(struct udphdr) >> 1;
+
+	pbd2->fw_ip_hdr_to_payload_w = inner_hlen_w;
+
+	if (xmit_type & XMIT_CSUM_ENC_V4) {
+		struct iphdr *iph = inner_ip_hdr(skb);
+
+		pbd2->fw_ip_csum_wo_len_flags_frag =
+			bswab16(csum_fold((~iph->check) -
+					  iph->tot_len - iph->frag_off));
+	} else {
+		pbd2->fw_ip_hdr_to_payload_w =
+			inner_hlen_w - ((sizeof(struct ipv6hdr)) >> 1);
+	}
+
+	pbd2->tcp_send_seq = bswab32(inner_tcp_hdr(skb)->seq);
+
+	pbd2->tcp_flags = pbd_tcp_flags(inner_tcp_hdr(skb));
+
+	if (xmit_type & XMIT_GSO_V4) {
+		pbd2->hw_ip_id = bswab16(ip_hdr(skb)->id);
+
+		pbd_e2->data.tunnel_data.pseudo_csum =
+			bswab16(~csum_tcpudp_magic(
+					inner_ip_hdr(skb)->saddr,
+					inner_ip_hdr(skb)->daddr,
+					0, IPPROTO_TCP, 0));
+
+		outerip_len = ip_hdr(skb)->ihl << 1;
+	} else {
+		pbd_e2->data.tunnel_data.pseudo_csum =
+			bswab16(~csum_ipv6_magic(
+					&inner_ipv6_hdr(skb)->saddr,
+					&inner_ipv6_hdr(skb)->daddr,
+					0, IPPROTO_TCP, 0));
+	}
+
+	outerip_off = (skb_network_header(skb) - skb->data) >> 1;
+
+	*global_data |=
+		outerip_off |
+		(!!(xmit_type & XMIT_CSUM_V6) <<
+			ETH_TX_PARSE_2ND_BD_IP_HDR_TYPE_OUTER_SHIFT) |
+		(outerip_len <<
+			ETH_TX_PARSE_2ND_BD_IP_HDR_LEN_OUTER_W_SHIFT) |
+		((skb->protocol == cpu_to_be16(ETH_P_8021Q)) <<
+			ETH_TX_PARSE_2ND_BD_LLC_SNAP_EN_SHIFT);
+}
+
 /* called with netif_tx_lock
  * bnx2x_tx_int() runs without netif_tx_lock unless it needs to call
  * netif_wake_queue()
@@ -3411,6 +3538,7 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct eth_tx_bd *tx_data_bd, *total_pkt_bd = NULL;
 	struct eth_tx_parse_bd_e1x *pbd_e1x = NULL;
 	struct eth_tx_parse_bd_e2 *pbd_e2 = NULL;
+	struct eth_tx_parse_2nd_bd *pbd2 = NULL;
 	u32 pbd_e2_parsing_data = 0;
 	u16 pkt_prod, bd_prod;
 	int nbd, txq_index;
@@ -3567,12 +3695,46 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (!CHIP_IS_E1x(bp)) {
 		pbd_e2 = &txdata->tx_desc_ring[bd_prod].parse_bd_e2;
 		memset(pbd_e2, 0, sizeof(struct eth_tx_parse_bd_e2));
-		/* Set PBD in checksum offload case */
-		if (xmit_type & XMIT_CSUM)
+
+		if (xmit_type & XMIT_CSUM_ENC) {
+			u16 global_data = 0;
+
+			/* Set PBD in enc checksum offload case */
+			hlen = bnx2x_set_pbd_csum_enc(bp, skb,
+						      &pbd_e2_parsing_data,
+						      xmit_type);
+
+			/* turn on 2nd parsing and get a BD */
+			bd_prod = TX_BD(NEXT_TX_IDX(bd_prod));
+
+			pbd2 = &txdata->tx_desc_ring[bd_prod].parse_2nd_bd;
+
+			memset(pbd2, 0, sizeof(*pbd2));
+
+			pbd_e2->data.tunnel_data.ip_hdr_start_inner_w =
+				(skb_inner_network_header(skb) -
+				 skb->data) >> 1;
+
+			if (xmit_type & XMIT_GSO_ENC)
+				bnx2x_update_pbds_gso_enc(skb, pbd_e2, pbd2,
+							  &global_data,
+							  xmit_type);
+
+			pbd2->global_data = cpu_to_le16(global_data);
+
+			/* add addition parse BD indication to start BD */
+			SET_FLAG(tx_start_bd->general_data,
+				 ETH_TX_START_BD_PARSE_NBDS, 1);
+			/* set encapsulation flag in start BD */
+			SET_FLAG(tx_start_bd->general_data,
+				 ETH_TX_START_BD_TUNNEL_EXIST, 1);
+			nbd++;
+		} else if (xmit_type & XMIT_CSUM) {
 			/* Set PBD in checksum offload case w/o encapsulation */
 			hlen = bnx2x_set_pbd_csum_e2(bp, skb,
 						     &pbd_e2_parsing_data,
 						     xmit_type);
+		}
 
 		/* Add the macs to the parsing BD this is a vf */
 		if (IS_VF(bp)) {
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 04d123f..4902d1e 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -11965,6 +11965,13 @@ static int bnx2x_init_dev(struct bnx2x *bp, struct pci_dev *pdev,
 		NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6 |
 		NETIF_F_RXCSUM | NETIF_F_LRO | NETIF_F_GRO |
 		NETIF_F_RXHASH | NETIF_F_HW_VLAN_TX;
+	if (!CHIP_IS_E1x(bp)) {
+		dev->hw_features |= NETIF_F_GSO_GRE;
+		dev->hw_enc_features =
+			NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_SG |
+			NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6 |
+			NETIF_F_GSO_GRE;
+	}
 
 	dev->vlan_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
 		NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6 | NETIF_F_HIGHDMA;
-- 
1.7.7.2

^ permalink raw reply related

* [PATCH] libertas: drop maintainership
From: Dan Williams @ 2013-03-18 16:48 UTC (permalink / raw)
  To: netdev; +Cc: linux-wireless, Daniel Drake, Bing Zhao

Would be better maintained by somebody who actually has time for it.

Signed-off-by: Dan Williams <dcbw@redhat.com>
---
Daniel?  Bing? :)

I'm quite happy to ACK somebody else picking libertas up, and I'm also
happy to investigate how to send my hardware cache to the new
maintainer.  (usb8388 dev modules, some CF8385s, some sd8686s, etc).
Also happy to forward along the libertas-dev moderator password.

diff --git a/MAINTAINERS b/MAINTAINERS
index c08411b..2146279 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5042,12 +5042,6 @@ L:	netdev@vger.kernel.org
 S:	Maintained
 F:	drivers/net/ethernet/marvell/sk*

-MARVELL LIBERTAS WIRELESS DRIVER
-M:	Dan Williams <dcbw@redhat.com>
-L:	libertas-dev@lists.infradead.org
-S:	Maintained
-F:	drivers/net/wireless/libertas/
-
 MARVELL MV643XX ETHERNET DRIVER
 M:	Lennert Buytenhek <buytenh@wantstofly.org>
 L:	netdev@vger.kernel.org

^ permalink raw reply related

* Re: [PATCH nf-next] netfilter: nfnetlink_queue: zero copy support
From: Pablo Neira Ayuso @ 2013-03-18 15:36 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Florian Westphal, netdev, Netfilter Developer Mailing List
In-Reply-To: <1363614679.29475.130.camel@edumazet-glaptop>

On Mon, Mar 18, 2013 at 06:51:19AM -0700, Eric Dumazet wrote:
> On Mon, 2013-03-18 at 10:24 +0100, Florian Westphal wrote:
> > Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > >
> > > -GRO/GSO packets are segmented in nf_queue()
> > > and checksummed in nfqnl_build_packet_message().
> > > Proper support for GSO/GRO packets (no segmentation,
> > > and no checksumming) needs application cooperation, if we
> > > want no regressions.
> > 
> > Since ipqueue is gone we might be able to push the segmentation
> > down to nfnetlink_queue.  Then new userspace applications
> > could indicate a 'I won't verify checksums and will handle huge
> > packets'.
> > 
> > Are you working on something like this?
> 
> I validated that it was only an API concern, by commenting out the code,
> and got 20Gbps (link speed) using the sample program (using a bigger
> buffer to receive the skbs and removing the printf() for each packet)
> 
> Pablo followed the experiments and I believe he has an idea of the
> needed API.

Will take over this. Florian, ping me if interested in helping.

Thanks a lot for the patch and ideas Eric!

^ permalink raw reply

* Re: [PATCH] sfc: make local functions static
From: Ben Hutchings @ 2013-03-18 15:33 UTC (permalink / raw)
  To: David Miller; +Cc: stephen, netdev
In-Reply-To: <20130317.142653.1414523995890573030.davem@davemloft.net>

On Sun, 2013-03-17 at 14:26 -0400, David Miller wrote:
> From: Stephen Hemminger <stephen@networkplumber.org>
> Date: Sat, 16 Mar 2013 09:57:51 -0700
> 
> > Trivial sparse detected functions that should be static.
> > 
> > Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> 
> Applied, thanks Stephen.

Thanks.

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* Re: sctp: hang in sctp_remaddr_seq_show
From: Vlad Yasevich @ 2013-03-18 15:31 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Neil Horman, Sasha Levin, sri, David S. Miller, linux-sctp,
	netdev, Dave Jones, linux-kernel@vger.kernel.org
In-Reply-To: <1363620340.29475.132.camel@edumazet-glaptop>

On 03/18/2013 11:25 AM, Eric Dumazet wrote:
> On Mon, 2013-03-18 at 07:04 -0400, Neil Horman wrote:
>
>> I'm not sure why the process would never get back to the schedule, but looking
>> at the sctp_remaddr_seq_show function, I think that we should convert this
>> sequence:
>> 	sctp_local_bh_disable();
>> 	read_lock(&head->lock);
>> 	rcu_read_lock();
>>
>> to this:
>> 	read_lock(&head->lock);
>> 	rcu_read_lock_bh();
>>
>> Neil
>
> I dont think so.
>
> BH needs to be disabled before read_lock(&head->lock);
>
> or else, write_lock() could deadlock (assuming it can be called from BH)
>
>

If anything, this should probably be done like this:

rcu_read_lock();
read_lock_bh(&head->lock)
...

read_unlock_bh(&head->lock)
rcu_read_unlock();

-vlad

^ permalink raw reply

* Re: sctp: hang in sctp_remaddr_seq_show
From: Eric Dumazet @ 2013-03-18 15:25 UTC (permalink / raw)
  To: Neil Horman
  Cc: Sasha Levin, vyasevich, sri, David S. Miller, linux-sctp, netdev,
	Dave Jones, linux-kernel@vger.kernel.org
In-Reply-To: <20130318110415.GA9478@hmsreliant.think-freely.org>

On Mon, 2013-03-18 at 07:04 -0400, Neil Horman wrote:

> I'm not sure why the process would never get back to the schedule, but looking
> at the sctp_remaddr_seq_show function, I think that we should convert this
> sequence:
> 	sctp_local_bh_disable();
> 	read_lock(&head->lock);
> 	rcu_read_lock();
> 
> to this:
> 	read_lock(&head->lock);
> 	rcu_read_lock_bh();
> 
> Neil

I dont think so.

BH needs to be disabled before read_lock(&head->lock);

or else, write_lock() could deadlock (assuming it can be called from BH)

^ permalink raw reply

* [PATCH V3 4/7] PHYLIB: queue work on any cpu
From: Viresh Kumar @ 2013-03-18 15:23 UTC (permalink / raw)
  To: pjt, paul.mckenney, tglx, tj, suresh.b.siddha, venki, mingo,
	peterz, rostedt
  Cc: linaro-kernel, robin.randhawa, Steve.Bannister, Liviu.Dudau,
	charles.garcia-tobin, Arvind.Chauhan, linux-rt-users,
	linux-kernel, Viresh Kumar, David S. Miller, netdev
In-Reply-To: <cover.1363617402.git.viresh.kumar@linaro.org>

Phylib uses workqueues for multiple purposes. There is no real dependency of
scheduling these on the cpu which scheduled them.

On an idle system, it is observed that an idle cpu wakes up many times just to
service this work. It would be better if we can schedule it on a cpu which isn't
idle, to save power.

By idle cpu (from scheduler's perspective) we mean:
- Current task is idle task
- nr_running == 0
- wake_list is empty

This patch replaces the schedule_work() and schedule_delayed_work() routines
with their queue_[delayed_]work_on_any_cpu() siblings with system_wq as
parameter.

These routines would look for the closest (via scheduling domains) non-idle cpu
(non-idle from the scheduler's perspective). If the current cpu is not idle or
all cpus are idle, work will be scheduled on the local cpu.

Cc: "David S. Miller" <davem@davemloft.net>
Cc: netdev@vger.kernel.org
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/net/phy/phy.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 298b4c2..a517706 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -439,7 +439,7 @@ void phy_start_machine(struct phy_device *phydev,
 {
 	phydev->adjust_state = handler;
 
-	schedule_delayed_work(&phydev->state_queue, HZ);
+	queue_delayed_work_on_any_cpu(system_wq, &phydev->state_queue, HZ);
 }
 
 /**
@@ -527,7 +527,7 @@ static irqreturn_t phy_interrupt(int irq, void *phy_dat)
 	disable_irq_nosync(irq);
 	atomic_inc(&phydev->irq_disable);
 
-	schedule_work(&phydev->phy_queue);
+	queue_work_on_any_cpu(system_wq, &phydev->phy_queue);
 
 	return IRQ_HANDLED;
 }
@@ -682,7 +682,7 @@ static void phy_change(struct work_struct *work)
 
 	/* reschedule state queue work to run as soon as possible */
 	cancel_delayed_work_sync(&phydev->state_queue);
-	schedule_delayed_work(&phydev->state_queue, 0);
+	queue_delayed_work_on_any_cpu(system_wq, &phydev->state_queue, 0);
 
 	return;
 
@@ -966,7 +966,8 @@ void phy_state_machine(struct work_struct *work)
 	if (err < 0)
 		phy_error(phydev);
 
-	schedule_delayed_work(&phydev->state_queue, PHY_STATE_TIME * HZ);
+	queue_delayed_work_on_any_cpu(system_wq, &phydev->state_queue,
+			PHY_STATE_TIME * HZ);
 }
 
 static inline void mmd_phy_indirect(struct mii_bus *bus, int prtad, int devad,
-- 
1.7.12.rc2.18.g61b472e


^ permalink raw reply related

* Re: [PATCH 2/4] xen-netfront: drop skb when skb->len > 65535
From: Wei Liu @ 2013-03-18 15:10 UTC (permalink / raw)
  To: Ian Campbell
  Cc: wei.liu2, netdev@vger.kernel.org, xen-devel@lists.xen.org,
	konrad.wilk@oracle.com, annie.li@oracle.com
In-Reply-To: <1363619244.2963.11.camel@zakaz.uk.xensource.com>

On Mon, 2013-03-18 at 15:07 +0000, Ian Campbell wrote:
> On Mon, 2013-03-18 at 15:04 +0000, Wei Liu wrote:
> > On Mon, 2013-03-18 at 14:54 +0000, Ian Campbell wrote:
> > > On Mon, 2013-03-18 at 14:40 +0000, Wei Liu wrote:
> > > > On Mon, 2013-03-18 at 11:42 +0000, Ian Campbell wrote:
> > > > > On Mon, 2013-03-18 at 10:35 +0000, Wei Liu wrote:
> > > > > > The `size' field of Xen network wire format is uint16_t, anything bigger than
> > > > > > 65535 will cause overflow.
> > > > > > 
> > > > > > Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> > > > > > ---
> > > > > >  drivers/net/xen-netfront.c |   12 ++++++++++++
> > > > > >  1 file changed, 12 insertions(+)
> > > > > > 
> > > > > > diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
> > > > > > index 5527663..8c3d065 100644
> > > > > > --- a/drivers/net/xen-netfront.c
> > > > > > +++ b/drivers/net/xen-netfront.c
> > > > > > @@ -547,6 +547,18 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
> > > > > >  	unsigned int len = skb_headlen(skb);
> > > > > >  	unsigned long flags;
> > > > > >  
> > > > > > +	/*
> > > > > > +	 * wire format of xen_netif_tx_request only supports skb->len
> > > > > > +	 * < 64K, because size field in xen_netif_tx_request is
> > > > > > +	 * uint16_t.
> > > > > 
> > > > > Is there some field we can set e.g. in struct ethernet_device which
> > > > > would stop this from happening?
> > > > > 
> > > > 
> > > > struct ethernet_device? I could not find it.
> > > > 
> > > > And for struct net_device,
> > > 
> > > I meant struct net_device.
> > > 
> > > >  there is no field for this AFAICT.
> > > 
> > > Interesting. Are hardware devices expected to cope with arbitrary sized
> > > GSO skbs then I wonder.
> > > 
> > 
> > No idea. But there is a macro called GSO_MAX_SIZE (65536) in struct
> > net_device. :-)
> 
> But aren't we seeing skb's bigger than that?
> 

Yes, skb->len = 65538.

> Maybe this is just a historical bug in some older guests?
> 

I saw this with latest upstream kernel.


Wei.

> Ian.
> 
> 

^ permalink raw reply

* Re: [PATCH 2/4] xen-netfront: drop skb when skb->len > 65535
From: Ian Campbell @ 2013-03-18 15:07 UTC (permalink / raw)
  To: Wei Liu
  Cc: netdev@vger.kernel.org, xen-devel@lists.xen.org,
	konrad.wilk@oracle.com, annie.li@oracle.com
In-Reply-To: <1363619098.29093.205.camel@zion.uk.xensource.com>

On Mon, 2013-03-18 at 15:04 +0000, Wei Liu wrote:
> On Mon, 2013-03-18 at 14:54 +0000, Ian Campbell wrote:
> > On Mon, 2013-03-18 at 14:40 +0000, Wei Liu wrote:
> > > On Mon, 2013-03-18 at 11:42 +0000, Ian Campbell wrote:
> > > > On Mon, 2013-03-18 at 10:35 +0000, Wei Liu wrote:
> > > > > The `size' field of Xen network wire format is uint16_t, anything bigger than
> > > > > 65535 will cause overflow.
> > > > > 
> > > > > Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> > > > > ---
> > > > >  drivers/net/xen-netfront.c |   12 ++++++++++++
> > > > >  1 file changed, 12 insertions(+)
> > > > > 
> > > > > diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
> > > > > index 5527663..8c3d065 100644
> > > > > --- a/drivers/net/xen-netfront.c
> > > > > +++ b/drivers/net/xen-netfront.c
> > > > > @@ -547,6 +547,18 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
> > > > >  	unsigned int len = skb_headlen(skb);
> > > > >  	unsigned long flags;
> > > > >  
> > > > > +	/*
> > > > > +	 * wire format of xen_netif_tx_request only supports skb->len
> > > > > +	 * < 64K, because size field in xen_netif_tx_request is
> > > > > +	 * uint16_t.
> > > > 
> > > > Is there some field we can set e.g. in struct ethernet_device which
> > > > would stop this from happening?
> > > > 
> > > 
> > > struct ethernet_device? I could not find it.
> > > 
> > > And for struct net_device,
> > 
> > I meant struct net_device.
> > 
> > >  there is no field for this AFAICT.
> > 
> > Interesting. Are hardware devices expected to cope with arbitrary sized
> > GSO skbs then I wonder.
> > 
> 
> No idea. But there is a macro called GSO_MAX_SIZE (65536) in struct
> net_device. :-)

But aren't we seeing skb's bigger than that?

Maybe this is just a historical bug in some older guests?

Ian.

^ permalink raw reply

* Re: [PATCH 2/4] xen-netfront: drop skb when skb->len > 65535
From: Wei Liu @ 2013-03-18 15:04 UTC (permalink / raw)
  To: Ian Campbell
  Cc: wei.liu2, netdev@vger.kernel.org, xen-devel@lists.xen.org,
	konrad.wilk@oracle.com, annie.li@oracle.com
In-Reply-To: <1363618459.2963.10.camel@zakaz.uk.xensource.com>

On Mon, 2013-03-18 at 14:54 +0000, Ian Campbell wrote:
> On Mon, 2013-03-18 at 14:40 +0000, Wei Liu wrote:
> > On Mon, 2013-03-18 at 11:42 +0000, Ian Campbell wrote:
> > > On Mon, 2013-03-18 at 10:35 +0000, Wei Liu wrote:
> > > > The `size' field of Xen network wire format is uint16_t, anything bigger than
> > > > 65535 will cause overflow.
> > > > 
> > > > Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> > > > ---
> > > >  drivers/net/xen-netfront.c |   12 ++++++++++++
> > > >  1 file changed, 12 insertions(+)
> > > > 
> > > > diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
> > > > index 5527663..8c3d065 100644
> > > > --- a/drivers/net/xen-netfront.c
> > > > +++ b/drivers/net/xen-netfront.c
> > > > @@ -547,6 +547,18 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
> > > >  	unsigned int len = skb_headlen(skb);
> > > >  	unsigned long flags;
> > > >  
> > > > +	/*
> > > > +	 * wire format of xen_netif_tx_request only supports skb->len
> > > > +	 * < 64K, because size field in xen_netif_tx_request is
> > > > +	 * uint16_t.
> > > 
> > > Is there some field we can set e.g. in struct ethernet_device which
> > > would stop this from happening?
> > > 
> > 
> > struct ethernet_device? I could not find it.
> > 
> > And for struct net_device,
> 
> I meant struct net_device.
> 
> >  there is no field for this AFAICT.
> 
> Interesting. Are hardware devices expected to cope with arbitrary sized
> GSO skbs then I wonder.
> 

No idea. But there is a macro called GSO_MAX_SIZE (65536) in struct
net_device. :-)


Wei.

> Ian.
> 

^ permalink raw reply

* Re: [PATCH 2/4] xen-netfront: drop skb when skb->len > 65535
From: Ian Campbell @ 2013-03-18 14:54 UTC (permalink / raw)
  To: Wei Liu
  Cc: netdev@vger.kernel.org, xen-devel@lists.xen.org,
	konrad.wilk@oracle.com, annie.li@oracle.com
In-Reply-To: <1363617642.29093.203.camel@zion.uk.xensource.com>

On Mon, 2013-03-18 at 14:40 +0000, Wei Liu wrote:
> On Mon, 2013-03-18 at 11:42 +0000, Ian Campbell wrote:
> > On Mon, 2013-03-18 at 10:35 +0000, Wei Liu wrote:
> > > The `size' field of Xen network wire format is uint16_t, anything bigger than
> > > 65535 will cause overflow.
> > > 
> > > Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> > > ---
> > >  drivers/net/xen-netfront.c |   12 ++++++++++++
> > >  1 file changed, 12 insertions(+)
> > > 
> > > diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
> > > index 5527663..8c3d065 100644
> > > --- a/drivers/net/xen-netfront.c
> > > +++ b/drivers/net/xen-netfront.c
> > > @@ -547,6 +547,18 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
> > >  	unsigned int len = skb_headlen(skb);
> > >  	unsigned long flags;
> > >  
> > > +	/*
> > > +	 * wire format of xen_netif_tx_request only supports skb->len
> > > +	 * < 64K, because size field in xen_netif_tx_request is
> > > +	 * uint16_t.
> > 
> > Is there some field we can set e.g. in struct ethernet_device which
> > would stop this from happening?
> > 
> 
> struct ethernet_device? I could not find it.
> 
> And for struct net_device,

I meant struct net_device.

>  there is no field for this AFAICT.

Interesting. Are hardware devices expected to cope with arbitrary sized
GSO skbs then I wonder.

Ian.

^ permalink raw reply

* Re: [PATCH 2/4] xen-netfront: drop skb when skb->len > 65535
From: Wei Liu @ 2013-03-18 14:40 UTC (permalink / raw)
  To: Ian Campbell
  Cc: wei.liu2, netdev@vger.kernel.org, xen-devel@lists.xen.org,
	konrad.wilk@oracle.com, annie.li@oracle.com
In-Reply-To: <1363606970.30193.22.camel@zakaz.uk.xensource.com>

On Mon, 2013-03-18 at 11:42 +0000, Ian Campbell wrote:
> On Mon, 2013-03-18 at 10:35 +0000, Wei Liu wrote:
> > The `size' field of Xen network wire format is uint16_t, anything bigger than
> > 65535 will cause overflow.
> > 
> > Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> > ---
> >  drivers/net/xen-netfront.c |   12 ++++++++++++
> >  1 file changed, 12 insertions(+)
> > 
> > diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
> > index 5527663..8c3d065 100644
> > --- a/drivers/net/xen-netfront.c
> > +++ b/drivers/net/xen-netfront.c
> > @@ -547,6 +547,18 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
> >  	unsigned int len = skb_headlen(skb);
> >  	unsigned long flags;
> >  
> > +	/*
> > +	 * wire format of xen_netif_tx_request only supports skb->len
> > +	 * < 64K, because size field in xen_netif_tx_request is
> > +	 * uint16_t.
> 
> Is there some field we can set e.g. in struct ethernet_device which
> would stop this from happening?
> 

struct ethernet_device? I could not find it.

And for struct net_device, there is no field for this AFAICT.


Wei.

> 
> > +	 */
> > +	if (unlikely(skb->len > (uint16_t)(~0))) {
> > +		net_alert_ratelimited(
> > +			"xennet: skb->len = %d, too big for wire format\n",
> > +			skb->len);
> > +		goto drop;
> > +	}
> > +
> >  	slots = DIV_ROUND_UP(offset + len, PAGE_SIZE) +
> >  		xennet_count_skb_frag_slots(skb);
> >  	if (unlikely(slots > MAX_SKB_FRAGS + 1)) {
> 
> 

^ permalink raw reply

* Re: [Xen-devel] [PATCH 2/4] xen-netfront: drop skb when skb->len > 65535
From: Wei Liu @ 2013-03-18 14:19 UTC (permalink / raw)
  To: David Vrabel
  Cc: wei.liu2, Ian Campbell, netdev@vger.kernel.org,
	xen-devel@lists.xen.org, annie.li@oracle.com,
	konrad.wilk@oracle.com
In-Reply-To: <51471DE0.9060506@citrix.com>

On Mon, 2013-03-18 at 14:00 +0000, David Vrabel wrote:
> On 18/03/13 13:48, Ian Campbell wrote:
> > On Mon, 2013-03-18 at 13:46 +0000, David Vrabel wrote:
> >> On 18/03/13 10:35, Wei Liu wrote:
> >>> The `size' field of Xen network wire format is uint16_t, anything bigger than
> >>> 65535 will cause overflow.
> >>
> >> The backend needs to be able to handle these bad packets without
> >> disconnecting the VIF -- we can't fix all the frontend drivers.
> > 
> > Agreed, although that doesn't imply that we shouldn't fix the frontend
> > where we can -- such as upstream as Wei does here.
> 
> Yes, frontends should be fixed where possible.
> 
> This is what I came up with for the backend.  I don't have time to look
> into it further but, Wei, feel free to use it as a starting point.
> 

Thanks for this patch.

I haven't gone through XSA-39 discussion, this is why I didn't come up
with a fix for backend -- I need to make sure dropping packet like this
won't re-exhibit the security hole.


Wei.

> David
> 
> diff --git a/drivers/net/xen-netback/netback.c
> b/drivers/net/xen-netback/netback.c
> index cd49ba9..18e2671 100644
> --- a/drivers/net/xen-netback/netback.c
> +++ b/drivers/net/xen-netback/netback.c
> @@ -899,10 +899,11 @@ static void netbk_fatal_tx_err(struct xenvif *vif)
>  static int netbk_count_requests(struct xenvif *vif,
>  				struct xen_netif_tx_request *first,
>  				struct xen_netif_tx_request *txp,
> -				int work_to_do)
> +				int work_to_do, int idx)
>  {
>  	RING_IDX cons = vif->tx.req_cons;
>  	int frags = 0;
> +	bool drop = false;
> 
>  	if (!(first->flags & XEN_NETTXF_more_data))
>  		return 0;
> @@ -922,10 +923,20 @@ static int netbk_count_requests(struct xenvif *vif,
> 
>  		memcpy(txp, RING_GET_REQUEST(&vif->tx, cons + frags),
>  		       sizeof(*txp));
> -		if (txp->size > first->size) {
> -			netdev_err(vif->dev, "Frag is bigger than frame.\n");
> -			netbk_fatal_tx_err(vif);
> -			return -EIO;
> +
> +		/*
> +		 * If the guest submitted a frame >= 64 KiB then
> +		 * first->size overflowed and following frags will
> +		 * appear to be larger than the frame.
> +		 *
> +		 * This cannot be a fatal error as there are buggy
> +		 * frontends that do this.
> +		 *
> +		 * Consume all the frags and drop the packet.
> +		 */
> +		if (!drop && txp->size > first->size) {
> +			netdev_dbg(vif->dev, "Frag is bigger than frame.\n");
> +			drop = true;
>  		}
> 
>  		first->size -= txp->size;
> @@ -938,6 +949,12 @@ static int netbk_count_requests(struct xenvif *vif,
>  			return -EINVAL;
>  		}
>  	} while ((txp++)->flags & XEN_NETTXF_more_data);
> +
> +	if (drop) {
> +		netbk_tx_err(vif, txp, idx + frags);
> +		return -EIO;
> +	}
> +
>  	return frags;
>  }
> 
> @@ -1327,7 +1344,7 @@ static unsigned xen_netbk_tx_build_gops(struct
> xen_netbk *netbk)
>  				continue;
>  		}
> 
> -		ret = netbk_count_requests(vif, &txreq, txfrags, work_to_do);
> +		ret = netbk_count_requests(vif, &txreq, txfrags, work_to_do, idx);
>  		if (unlikely(ret < 0))
>  			continue;
> 

^ permalink raw reply

* Re: [Xen-devel] [PATCH 2/4] xen-netfront: drop skb when skb->len > 65535
From: David Vrabel @ 2013-03-18 14:00 UTC (permalink / raw)
  To: Ian Campbell
  Cc: Wei Liu, netdev@vger.kernel.org, xen-devel@lists.xen.org,
	annie.li@oracle.com, konrad.wilk@oracle.com
In-Reply-To: <1363614500.30193.47.camel@zakaz.uk.xensource.com>

On 18/03/13 13:48, Ian Campbell wrote:
> On Mon, 2013-03-18 at 13:46 +0000, David Vrabel wrote:
>> On 18/03/13 10:35, Wei Liu wrote:
>>> The `size' field of Xen network wire format is uint16_t, anything bigger than
>>> 65535 will cause overflow.
>>
>> The backend needs to be able to handle these bad packets without
>> disconnecting the VIF -- we can't fix all the frontend drivers.
> 
> Agreed, although that doesn't imply that we shouldn't fix the frontend
> where we can -- such as upstream as Wei does here.

Yes, frontends should be fixed where possible.

This is what I came up with for the backend.  I don't have time to look
into it further but, Wei, feel free to use it as a starting point.

David

diff --git a/drivers/net/xen-netback/netback.c
b/drivers/net/xen-netback/netback.c
index cd49ba9..18e2671 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -899,10 +899,11 @@ static void netbk_fatal_tx_err(struct xenvif *vif)
 static int netbk_count_requests(struct xenvif *vif,
 				struct xen_netif_tx_request *first,
 				struct xen_netif_tx_request *txp,
-				int work_to_do)
+				int work_to_do, int idx)
 {
 	RING_IDX cons = vif->tx.req_cons;
 	int frags = 0;
+	bool drop = false;

 	if (!(first->flags & XEN_NETTXF_more_data))
 		return 0;
@@ -922,10 +923,20 @@ static int netbk_count_requests(struct xenvif *vif,

 		memcpy(txp, RING_GET_REQUEST(&vif->tx, cons + frags),
 		       sizeof(*txp));
-		if (txp->size > first->size) {
-			netdev_err(vif->dev, "Frag is bigger than frame.\n");
-			netbk_fatal_tx_err(vif);
-			return -EIO;
+
+		/*
+		 * If the guest submitted a frame >= 64 KiB then
+		 * first->size overflowed and following frags will
+		 * appear to be larger than the frame.
+		 *
+		 * This cannot be a fatal error as there are buggy
+		 * frontends that do this.
+		 *
+		 * Consume all the frags and drop the packet.
+		 */
+		if (!drop && txp->size > first->size) {
+			netdev_dbg(vif->dev, "Frag is bigger than frame.\n");
+			drop = true;
 		}

 		first->size -= txp->size;
@@ -938,6 +949,12 @@ static int netbk_count_requests(struct xenvif *vif,
 			return -EINVAL;
 		}
 	} while ((txp++)->flags & XEN_NETTXF_more_data);
+
+	if (drop) {
+		netbk_tx_err(vif, txp, idx + frags);
+		return -EIO;
+	}
+
 	return frags;
 }

@@ -1327,7 +1344,7 @@ static unsigned xen_netbk_tx_build_gops(struct
xen_netbk *netbk)
 				continue;
 		}

-		ret = netbk_count_requests(vif, &txreq, txfrags, work_to_do);
+		ret = netbk_count_requests(vif, &txreq, txfrags, work_to_do, idx);
 		if (unlikely(ret < 0))
 			continue;

^ permalink raw reply related

* Re: [PATCH regression/bisected] Revert "brcmsmac: support 4313iPA"
From: David Herrmann @ 2013-03-18 13:58 UTC (permalink / raw)
  To: Piotr Haber
  Cc: linux-wireless, linux-kernel, Arend van Spriel, John W. Linville,
	brcm80211-dev-list, netdev, Pieter-Paul Giesberts
In-Reply-To: <51471B85.5060404@broadcom.com>

Hi Piotr

On Mon, Mar 18, 2013 at 2:49 PM, Piotr Haber <phaber@broadcom.com> wrote:
> On 03/18/13 11:45, David Herrmann wrote:
>> This reverts commit b6fc28a158076ca2764edc9a6d1e1402f56e1c0c. It breaks
>> wireless AP reconnection on: (14e4:4727)
>>   Broadcom Corporation BCM4313 802.11b/g/n Wireless LAN Controller
>>
>> Any attempt to reconnect to an AP results in timeouts no matter how near to the
>> AP I am:
>>  00:10:40 $nb kernel: wlan0: authenticate with 00:18:39:0a:8e:23
>>  00:10:40 $nb kernel: wlan0: direct probe to 00:18:39:0a:8e:23 (try 1/3)
>>  00:10:40 $nb kernel: wlan0: direct probe to 00:18:39:0a:8e:23 (try 2/3)
>>  00:10:41 $nb kernel: wlan0: direct probe to 00:18:39:0a:8e:23 (try 3/3)
>>  00:10:41 $nb kernel: wlan0: authentication with 00:18:39:0a:8e:23 timed out
>> ---
>> Hi
>>
>> I tried coming up with a fix instead of reverting this commit, but the commit is
>> way too big for me to understand what's going on. Sorry.
>>
>> With linux-3.8 connecting to an AP broke on my machine. I could connect to an AP
>> one time, but any further attempt resulted in:
>>  00:10:40 $nb kernel: wlan0: authenticate with 00:18:39:0a:8e:23
>>  00:10:40 $nb kernel: wlan0: direct probe to 00:18:39:0a:8e:23 (try 1/3)
>>  00:10:40 $nb kernel: wlan0: direct probe to 00:18:39:0a:8e:23 (try 2/3)
>>  00:10:41 $nb kernel: wlan0: direct probe to 00:18:39:0a:8e:23 (try 3/3)
>>  00:10:41 $nb kernel: wlan0: authentication with 00:18:39:0a:8e:23 timed out
>>
>> Even sitting right next to the AP didn't help so I started bisecting and it
>> turned out to be:
>>   "brcmsmac: support 4313iPA" b6fc28a158076ca2764edc9a6d1e1402f56e1c0c
>> Please revert it.
>>
>> Thanks
>> David
>>
> Hi,
> unfortunately this is not the first report of this patch breaking 4313 for some users.
> I'm pretty confident that it is hardware revision related as we have 4313ePA and iPA boards running
> successfully in our test setup.
> Could you aid us in the effort to find the problem by supplying the contents of this debugfs file:
> <debugfs_mount>/brcmsmac/bcma0:0/hardware

Hi

$ cat /sys/kernel/debug/brcmsmac/bcma0\:0/hardware
board vendor: 185f
board type: 51a
board revision: 1408
board flags: 8402a01
board flags2: 880
firmware revision: 262032b

I can also try partial reverts of that commit, but I really don't know
which parts might be important.

Thanks
David

^ permalink raw reply

* Re: [PATCH nf-next] netfilter: nfnetlink_queue: zero copy support
From: Eric Dumazet @ 2013-03-18 13:51 UTC (permalink / raw)
  To: Florian Westphal
  Cc: Pablo Neira Ayuso, netdev, Netfilter Developer Mailing List
In-Reply-To: <20130318092444.GG7938@breakpoint.cc>

On Mon, 2013-03-18 at 10:24 +0100, Florian Westphal wrote:
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> > -GRO/GSO packets are segmented in nf_queue()
> > and checksummed in nfqnl_build_packet_message().
> > Proper support for GSO/GRO packets (no segmentation,
> > and no checksumming) needs application cooperation, if we
> > want no regressions.
> 
> Since ipqueue is gone we might be able to push the segmentation
> down to nfnetlink_queue.  Then new userspace applications
> could indicate a 'I won't verify checksums and will handle huge
> packets'.
> 
> Are you working on something like this?

I validated that it was only an API concern, by commenting out the code,
and got 20Gbps (link speed) using the sample program (using a bigger
buffer to receive the skbs and removing the printf() for each packet)

Pablo followed the experiments and I believe he has an idea of the
needed API. Anyway, after one week in NFWS, I won't have the time to do
it, I have a huge backlog...

Note that it's not zero copy:

Before the patch we had 2 copies. (kernel->kernel done in softirq
context, and kernel->user in process context)

After the patch we have the copy from kernel to user land, done
in process context.

^ permalink raw reply

* Re: [PATCH regression/bisected] Revert "brcmsmac: support 4313iPA"
From: Piotr Haber @ 2013-03-18 13:49 UTC (permalink / raw)
  To: David Herrmann
  Cc: linux-wireless, linux-kernel, Arend van Spriel, John W. Linville,
	brcm80211-dev-list, netdev, Pieter-Paul Giesberts
In-Reply-To: <1363603503-2378-1-git-send-email-dh.herrmann@gmail.com>

On 03/18/13 11:45, David Herrmann wrote:
> This reverts commit b6fc28a158076ca2764edc9a6d1e1402f56e1c0c. It breaks
> wireless AP reconnection on: (14e4:4727)
>   Broadcom Corporation BCM4313 802.11b/g/n Wireless LAN Controller
> 
> Any attempt to reconnect to an AP results in timeouts no matter how near to the
> AP I am:
>  00:10:40 $nb kernel: wlan0: authenticate with 00:18:39:0a:8e:23
>  00:10:40 $nb kernel: wlan0: direct probe to 00:18:39:0a:8e:23 (try 1/3)
>  00:10:40 $nb kernel: wlan0: direct probe to 00:18:39:0a:8e:23 (try 2/3)
>  00:10:41 $nb kernel: wlan0: direct probe to 00:18:39:0a:8e:23 (try 3/3)
>  00:10:41 $nb kernel: wlan0: authentication with 00:18:39:0a:8e:23 timed out
> ---
> Hi
> 
> I tried coming up with a fix instead of reverting this commit, but the commit is
> way too big for me to understand what's going on. Sorry.
> 
> With linux-3.8 connecting to an AP broke on my machine. I could connect to an AP
> one time, but any further attempt resulted in:
>  00:10:40 $nb kernel: wlan0: authenticate with 00:18:39:0a:8e:23
>  00:10:40 $nb kernel: wlan0: direct probe to 00:18:39:0a:8e:23 (try 1/3)
>  00:10:40 $nb kernel: wlan0: direct probe to 00:18:39:0a:8e:23 (try 2/3)
>  00:10:41 $nb kernel: wlan0: direct probe to 00:18:39:0a:8e:23 (try 3/3)
>  00:10:41 $nb kernel: wlan0: authentication with 00:18:39:0a:8e:23 timed out
> 
> Even sitting right next to the AP didn't help so I started bisecting and it
> turned out to be:
>   "brcmsmac: support 4313iPA" b6fc28a158076ca2764edc9a6d1e1402f56e1c0c
> Please revert it.
> 
> Thanks
> David
> 
Hi,
unfortunately this is not the first report of this patch breaking 4313 for some users.
I'm pretty confident that it is hardware revision related as we have 4313ePA and iPA boards running
successfully in our test setup.
Could you aid us in the effort to find the problem by supplying the contents of this debugfs file:
<debugfs_mount>/brcmsmac/bcma0:0/hardware

Piotr

^ permalink raw reply

* Re: [Xen-devel] [PATCH 2/4] xen-netfront: drop skb when skb->len > 65535
From: Ian Campbell @ 2013-03-18 13:48 UTC (permalink / raw)
  To: David Vrabel
  Cc: Wei Liu, netdev@vger.kernel.org, xen-devel@lists.xen.org,
	annie.li@oracle.com, konrad.wilk@oracle.com
In-Reply-To: <51471AAC.2050509@citrix.com>

On Mon, 2013-03-18 at 13:46 +0000, David Vrabel wrote:
> On 18/03/13 10:35, Wei Liu wrote:
> > The `size' field of Xen network wire format is uint16_t, anything bigger than
> > 65535 will cause overflow.
> 
> The backend needs to be able to handle these bad packets without
> disconnecting the VIF -- we can't fix all the frontend drivers.

Agreed, although that doesn't imply that we shouldn't fix the frontend
where we can -- such as upstream as Wei does here.

Ian.

> 
> David
> 
> > Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> > ---
> >  drivers/net/xen-netfront.c |   12 ++++++++++++
> >  1 file changed, 12 insertions(+)
> > 
> > diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
> > index 5527663..8c3d065 100644
> > --- a/drivers/net/xen-netfront.c
> > +++ b/drivers/net/xen-netfront.c
> > @@ -547,6 +547,18 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
> >  	unsigned int len = skb_headlen(skb);
> >  	unsigned long flags;
> >  
> > +	/*
> > +	 * wire format of xen_netif_tx_request only supports skb->len
> > +	 * < 64K, because size field in xen_netif_tx_request is
> > +	 * uint16_t.
> > +	 */
> > +	if (unlikely(skb->len > (uint16_t)(~0))) {
> > +		net_alert_ratelimited(
> > +			"xennet: skb->len = %d, too big for wire format\n",
> > +			skb->len);
> > +		goto drop;
> > +	}
> > +
> >  	slots = DIV_ROUND_UP(offset + len, PAGE_SIZE) +
> >  		xennet_count_skb_frag_slots(skb);
> >  	if (unlikely(slots > MAX_SKB_FRAGS + 1)) {
> 

^ permalink raw reply

* Re: [Xen-devel] [PATCH 2/4] xen-netfront: drop skb when skb->len > 65535
From: David Vrabel @ 2013-03-18 13:46 UTC (permalink / raw)
  To: Wei Liu; +Cc: netdev, xen-devel, annie.li, ian.campbell, konrad.wilk
In-Reply-To: <1363602955-24790-3-git-send-email-wei.liu2@citrix.com>

On 18/03/13 10:35, Wei Liu wrote:
> The `size' field of Xen network wire format is uint16_t, anything bigger than
> 65535 will cause overflow.

The backend needs to be able to handle these bad packets without
disconnecting the VIF -- we can't fix all the frontend drivers.

David

> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> ---
>  drivers/net/xen-netfront.c |   12 ++++++++++++
>  1 file changed, 12 insertions(+)
> 
> diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
> index 5527663..8c3d065 100644
> --- a/drivers/net/xen-netfront.c
> +++ b/drivers/net/xen-netfront.c
> @@ -547,6 +547,18 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
>  	unsigned int len = skb_headlen(skb);
>  	unsigned long flags;
>  
> +	/*
> +	 * wire format of xen_netif_tx_request only supports skb->len
> +	 * < 64K, because size field in xen_netif_tx_request is
> +	 * uint16_t.
> +	 */
> +	if (unlikely(skb->len > (uint16_t)(~0))) {
> +		net_alert_ratelimited(
> +			"xennet: skb->len = %d, too big for wire format\n",
> +			skb->len);
> +		goto drop;
> +	}
> +
>  	slots = DIV_ROUND_UP(offset + len, PAGE_SIZE) +
>  		xennet_count_skb_frag_slots(skb);
>  	if (unlikely(slots > MAX_SKB_FRAGS + 1)) {

^ permalink raw reply

* Re: [PATCH 2/4] xen-netfront: drop skb when skb->len > 65535
From: Konrad Rzeszutek Wilk @ 2013-03-18 13:44 UTC (permalink / raw)
  To: Wei Liu; +Cc: netdev, xen-devel, ian.campbell, annie.li
In-Reply-To: <1363602955-24790-3-git-send-email-wei.liu2@citrix.com>

On Mon, Mar 18, 2013 at 10:35:53AM +0000, Wei Liu wrote:
> The `size' field of Xen network wire format is uint16_t, anything bigger than
> 65535 will cause overflow.

Should this also copy stable@vger.kernel.org?
> 
> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> ---
>  drivers/net/xen-netfront.c |   12 ++++++++++++
>  1 file changed, 12 insertions(+)
> 
> diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
> index 5527663..8c3d065 100644
> --- a/drivers/net/xen-netfront.c
> +++ b/drivers/net/xen-netfront.c
> @@ -547,6 +547,18 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
>  	unsigned int len = skb_headlen(skb);
>  	unsigned long flags;
>  
> +	/*
> +	 * wire format of xen_netif_tx_request only supports skb->len
> +	 * < 64K, because size field in xen_netif_tx_request is
> +	 * uint16_t.
> +	 */
> +	if (unlikely(skb->len > (uint16_t)(~0))) {
> +		net_alert_ratelimited(
> +			"xennet: skb->len = %d, too big for wire format\n",
> +			skb->len);
> +		goto drop;
> +	}
> +
>  	slots = DIV_ROUND_UP(offset + len, PAGE_SIZE) +
>  		xennet_count_skb_frag_slots(skb);
>  	if (unlikely(slots > MAX_SKB_FRAGS + 1)) {
> -- 
> 1.7.10.4
> 

^ permalink raw reply

* Re: [PATCH 4/4] xen-netback: coalesce slots before copying
From: James Harper @ 2013-03-18 13:27 UTC (permalink / raw)
  To: James Harper, Wei Liu, netdev@vger.kernel.org,
	xen-devel@lists.xen.org
  Cc: annie.li@oracle.com, ian.campbell@citrix.com,
	konrad.wilk@oracle.com
In-Reply-To: <6035A0D088A63A46850C3988ED045A4B387EC758@BITCOM1.int.sbss.com.au>

> >
> > This patch tries to coalesce tx requests when constructing grant copy
> > structures. It enables netback to deal with situation when frontend's
> > MAX_SKB_FRAGS is larger than backend's MAX_SKB_FRAGS.
> >
> > It defines max_skb_slots, which is an estimation of the maximum number of
> > slots
> > a guest can send, anything bigger than that is considered malicious. Now it
> is
> > set to 20, which should be enough to accommodate Linux (16 to 19) and
> > possibly
> > Windows (19?).
> >
> > +/*
> > + * This is an estimation of the maximum possible frags a SKB might
> > + * have, anything larger than this is considered malicious. Typically
> > + * Linux has 16 to 19, Windows has 19(?).
> > + */
> 
> Could you remove the "Windows has 19(?)" comment? I don't think it's
> helpful, even with the "(?)"... I just checked and windows 2008R2 gives
> GPLPV a maximum of 20 buffers in all the testing I've done, and that's after
> the header is coalesced so it's probably more than that. I'm pretty sure I
> tested windows 2003 quite a while back and I could coax it into giving
> ridiculous numbers of buffers when using iperf with tiny buffers.
> 
> Maybe "Windows has >19" if you need to put a number on it?
> 

Actually it turns out GPLPV just stops counting at 20. If I keep counting I can sometimes see over 1000 buffers per GSO packet under Windows using "iperf -l50", so windows will quite happily send 1000's of buffers and I don't have any evidence that it wouldn't cope with a similar number on receive. fwiw.

(of course coalescing vs using 1000 ring slots is an obvious choice...)

James

^ permalink raw reply

* Re: [PATCH] drivers/isdn: beautify code, delete 'break' after 'return'
From: Jiri Kosina @ 2013-03-18 13:25 UTC (permalink / raw)
  To: Chen Gang; +Cc: isdn, Jiri Slaby, Greg KH, alan, netdev
In-Reply-To: <514281B9.7000105@asianux.com>

On Fri, 15 Mar 2013, Chen Gang wrote:

> Hello Maintainers:
> 
>   is this patch ok?

This one is now applied.

-- 
Jiri Kosina
SUSE Labs

^ permalink raw reply

* [PATCH 5/5] ipvs: fix some sparse warnings
From: Simon Horman @ 2013-03-18 13:15 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov, Simon Horman
In-Reply-To: <1363612543-9787-1-git-send-email-horms@verge.net.au>

From: Julian Anastasov <ja@ssi.bg>

	Add missing __percpu annotations and make
ip_vs_net_id static.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h             |    2 +-
 net/netfilter/ipvs/ip_vs_core.c |    8 +-------
 net/netfilter/ipvs/ip_vs_est.c  |    2 +-
 3 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index fce8e6b..bee87ba 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -459,7 +459,7 @@ struct ip_vs_estimator {
 struct ip_vs_stats {
 	struct ip_vs_stats_user	ustats;		/* statistics */
 	struct ip_vs_estimator	est;		/* estimator */
-	struct ip_vs_cpu_stats	*cpustats;	/* per cpu counters */
+	struct ip_vs_cpu_stats __percpu	*cpustats;	/* per cpu counters */
 	spinlock_t		lock;		/* spin lock */
 	struct ip_vs_stats_user	ustats0;	/* reset values */
 };
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 61f49d2..2aef23e 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -69,10 +69,7 @@ EXPORT_SYMBOL(ip_vs_conn_put);
 EXPORT_SYMBOL(ip_vs_get_debug_level);
 #endif
 
-int ip_vs_net_id __read_mostly;
-#ifdef IP_VS_GENERIC_NETNS
-EXPORT_SYMBOL(ip_vs_net_id);
-#endif
+static int ip_vs_net_id __read_mostly;
 /* netns cnt used for uniqueness */
 static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);
 
@@ -1181,9 +1178,6 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
 						iph.len)))) {
 #ifdef CONFIG_IP_VS_IPV6
 				if (af == AF_INET6) {
-					struct net *net =
-						dev_net(skb_dst(skb)->dev);
-
 					if (!skb->dev)
 						skb->dev = net->loopback_dev;
 					icmpv6_send(skb,
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index 0fac601..6bee6d0 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -56,7 +56,7 @@
  * Make a summary from each cpu
  */
 static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum,
-				 struct ip_vs_cpu_stats *stats)
+				 struct ip_vs_cpu_stats __percpu *stats)
 {
 	int i;
 
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 1/5] ipvs: add backup_only flag to avoid loops
From: Simon Horman @ 2013-03-18 13:15 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov, Simon Horman
In-Reply-To: <1363612543-9787-1-git-send-email-horms@verge.net.au>

From: Julian Anastasov <ja@ssi.bg>

	Dmitry Akindinov is reporting a problem where
SYNs are looping between the master and backup server
when the backup server is used as real server in DR mode
and has IPVS rules to function as director.

	Even when the backup function is enabled we
continue to forward traffic and schedule new connections
when the current master is using the backup server as
real server. While this is not a problem for NAT, for
DR and TUN method the backup server can not determine
if a request comes from client or from director.

	To avoid such loops add new sysctl flag
backup_only. It can be needed for DR/TUN setups that
do not need backup and director function at the
same time. When the backup function is enabled we
stop any forwarding and pass the traffic to the local
stack (real server mode). The flag disables the
director function when the backup function is enabled.

	For setups that enable backup function for
some virtual services and director function for
other virtual services there should be another more
complex solution to support DR/TUN mode, maybe to
assign per-virtual service syncid value, so that
we can differentiate the requests.

Reported-by: Dmitry Akindinov <dimak@stalker.com>
Tested-by: German Myzovsky <lawyer@sipnet.ru>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 Documentation/networking/ipvs-sysctl.txt |    7 +++++++
 include/net/ip_vs.h                      |   12 ++++++++++++
 net/netfilter/ipvs/ip_vs_core.c          |   12 ++++++++----
 net/netfilter/ipvs/ip_vs_ctl.c           |    7 +++++++
 4 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/Documentation/networking/ipvs-sysctl.txt b/Documentation/networking/ipvs-sysctl.txt
index f2a2488..9573d0c 100644
--- a/Documentation/networking/ipvs-sysctl.txt
+++ b/Documentation/networking/ipvs-sysctl.txt
@@ -15,6 +15,13 @@ amemthresh - INTEGER
         enabled and the variable is automatically set to 2, otherwise
         the strategy is disabled and the variable is  set  to 1.
 
+backup_only - BOOLEAN
+	0 - disabled (default)
+	not 0 - enabled
+
+	If set, disable the director function while the server is
+	in backup mode to avoid packet loops for DR/TUN methods.
+
 conntrack - BOOLEAN
 	0 - disabled (default)
 	not 0 - enabled
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 68c69d5..fce8e6b 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -976,6 +976,7 @@ struct netns_ipvs {
 	int			sysctl_sync_retries;
 	int			sysctl_nat_icmp_send;
 	int			sysctl_pmtu_disc;
+	int			sysctl_backup_only;
 
 	/* ip_vs_lblc */
 	int			sysctl_lblc_expiration;
@@ -1067,6 +1068,12 @@ static inline int sysctl_pmtu_disc(struct netns_ipvs *ipvs)
 	return ipvs->sysctl_pmtu_disc;
 }
 
+static inline int sysctl_backup_only(struct netns_ipvs *ipvs)
+{
+	return ipvs->sync_state & IP_VS_STATE_BACKUP &&
+	       ipvs->sysctl_backup_only;
+}
+
 #else
 
 static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs)
@@ -1114,6 +1121,11 @@ static inline int sysctl_pmtu_disc(struct netns_ipvs *ipvs)
 	return 1;
 }
 
+static inline int sysctl_backup_only(struct netns_ipvs *ipvs)
+{
+	return 0;
+}
+
 #endif
 
 /*
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 47edf5a..18b4bc5 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1577,7 +1577,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	}
 	/* ipvs enabled in this netns ? */
 	net = skb_net(skb);
-	if (!net_ipvs(net)->enable)
+	ipvs = net_ipvs(net);
+	if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
 		return NF_ACCEPT;
 
 	ip_vs_fill_iph_skb(af, skb, &iph);
@@ -1654,7 +1655,6 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	}
 
 	IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
-	ipvs = net_ipvs(net);
 	/* Check the server status */
 	if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
 		/* the destination server is not available */
@@ -1815,13 +1815,15 @@ ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
 {
 	int r;
 	struct net *net;
+	struct netns_ipvs *ipvs;
 
 	if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
 		return NF_ACCEPT;
 
 	/* ipvs enabled in this netns ? */
 	net = skb_net(skb);
-	if (!net_ipvs(net)->enable)
+	ipvs = net_ipvs(net);
+	if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
 		return NF_ACCEPT;
 
 	return ip_vs_in_icmp(skb, &r, hooknum);
@@ -1835,6 +1837,7 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
 {
 	int r;
 	struct net *net;
+	struct netns_ipvs *ipvs;
 	struct ip_vs_iphdr iphdr;
 
 	ip_vs_fill_iph_skb(AF_INET6, skb, &iphdr);
@@ -1843,7 +1846,8 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
 
 	/* ipvs enabled in this netns ? */
 	net = skb_net(skb);
-	if (!net_ipvs(net)->enable)
+	ipvs = net_ipvs(net);
+	if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
 		return NF_ACCEPT;
 
 	return ip_vs_in_icmp_v6(skb, &r, hooknum, &iphdr);
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index c68198b..9e2d1cc 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1808,6 +1808,12 @@ static struct ctl_table vs_vars[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "backup_only",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 #ifdef CONFIG_IP_VS_DEBUG
 	{
 		.procname	= "debug_level",
@@ -3741,6 +3747,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
 	tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
 	ipvs->sysctl_pmtu_disc = 1;
 	tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
+	tbl[idx++].data = &ipvs->sysctl_backup_only;
 
 
 	ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 2/5] ipvs: remove extra rcu lock
From: Simon Horman @ 2013-03-18 13:15 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov, Simon Horman
In-Reply-To: <1363612543-9787-1-git-send-email-horms@verge.net.au>

From: Julian Anastasov <ja@ssi.bg>

	In 3.7 we added code that uses ipv4_update_pmtu
but after commit c5ae7d4192 (ipv4: must use rcu protection
while calling fib_lookup) the RCU lock is not needed.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_core.c |    2 --
 1 file changed, 2 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 18b4bc5..61f49d2 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1394,10 +1394,8 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
 			skb_reset_network_header(skb);
 			IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n",
 				&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu);
-			rcu_read_lock();
 			ipv4_update_pmtu(skb, dev_net(skb->dev),
 					 mtu, 0, 0, 0, 0);
-			rcu_read_unlock();
 			/* Client uses PMTUD? */
 			if (!(cih->frag_off & htons(IP_DF)))
 				goto ignore_ipip;
-- 
1.7.10.4

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox