Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v2 1/2] net: core: introduce netif_skb_dev_features
From: Florian Westphal @ 2014-02-13 22:09 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal

Will be used by upcoming ipv4 forward path change that needs to
determine feature mask using skb->dst->dev instead of skb->dev.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
Changes since v1:
 no more 'illegal_highdma' discards 'const' qualifier from pointer target type'

 include/linux/netdevice.h |  7 ++++++-
 net/core/dev.c            | 22 ++++++++++++----------
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 440a02e..21d4e6b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3068,7 +3068,12 @@ void netdev_change_features(struct net_device *dev);
 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
 					struct net_device *dev);
 
-netdev_features_t netif_skb_features(struct sk_buff *skb);
+netdev_features_t netif_skb_dev_features(struct sk_buff *skb,
+					 const struct net_device *dev);
+static inline netdev_features_t netif_skb_features(struct sk_buff *skb)
+{
+	return netif_skb_dev_features(skb, skb->dev);
+}
 
 static inline bool net_gso_ok(netdev_features_t features, int gso_type)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index 3721db7..afc4e47 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2420,7 +2420,7 @@ EXPORT_SYMBOL(netdev_rx_csum_fault);
  * 2. No high memory really exists on this machine.
  */
 
-static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
+static int illegal_highdma(const struct net_device *dev, struct sk_buff *skb)
 {
 #ifdef CONFIG_HIGHMEM
 	int i;
@@ -2495,34 +2495,36 @@ static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
 }
 
 static netdev_features_t harmonize_features(struct sk_buff *skb,
-	netdev_features_t features)
+					    const struct net_device *dev,
+					    netdev_features_t features)
 {
 	if (skb->ip_summed != CHECKSUM_NONE &&
 	    !can_checksum_protocol(features, skb_network_protocol(skb))) {
 		features &= ~NETIF_F_ALL_CSUM;
-	} else if (illegal_highdma(skb->dev, skb)) {
+	} else if (illegal_highdma(dev, skb)) {
 		features &= ~NETIF_F_SG;
 	}
 
 	return features;
 }
 
-netdev_features_t netif_skb_features(struct sk_buff *skb)
+netdev_features_t netif_skb_dev_features(struct sk_buff *skb,
+					 const struct net_device *dev)
 {
 	__be16 protocol = skb->protocol;
-	netdev_features_t features = skb->dev->features;
+	netdev_features_t features = dev->features;
 
-	if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
+	if (skb_shinfo(skb)->gso_segs > dev->gso_max_segs)
 		features &= ~NETIF_F_GSO_MASK;
 
 	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
 		protocol = veh->h_vlan_encapsulated_proto;
 	} else if (!vlan_tx_tag_present(skb)) {
-		return harmonize_features(skb, features);
+		return harmonize_features(skb, dev, features);
 	}
 
-	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
+	features &= (dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
 					       NETIF_F_HW_VLAN_STAG_TX);
 
 	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
@@ -2530,9 +2532,9 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
 				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
 				NETIF_F_HW_VLAN_STAG_TX;
 
-	return harmonize_features(skb, features);
+	return harmonize_features(skb, dev, features);
 }
-EXPORT_SYMBOL(netif_skb_features);
+EXPORT_SYMBOL(netif_skb_dev_features);
 
 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 			struct netdev_queue *txq)
-- 
1.8.1.5

^ permalink raw reply related

* [PATCH v6 2/2] net: ip, ipv6: handle gso skbs in forwarding path
From: Florian Westphal @ 2014-02-13 22:09 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal
In-Reply-To: <1392329352-31606-1-git-send-email-fw@strlen.de>

Marcelo Ricardo Leitner reported problems when the forwarding link path
has a lower mtu than the incoming one if the inbound interface supports GRO.

Given:
Host <mtu1500> R1 <mtu1200> R2

Host sends tcp stream which is routed via R1 and R2.  R1 performs GRO.

In this case, the kernel will fail to send ICMP fragmentation needed
messages (or pkt too big for ipv6), as GSO packets currently bypass dstmtu
checks in forward path. Instead, Linux tries to send out packets exceeding
the mtu.

When locking route MTU on Host (i.e., no ipv4 DF bit set), R1 does
not fragment the packets when forwarding, and again tries to send out
packets exceeding R1-R2 link mtu.

This alters the forwarding dstmtu checks to take the individual gso
segment lengths into account.

For ipv6, we send out pkt too big error for gso if the individual
segments are too big.

For ipv4, we either send icmp fragmentation needed, or, if the DF bit
is not set, perform software segmentation and let the output path
create fragments when the packet is leaving the machine.
It is not 100% correct as the error message will contain the headers of
the GRO skb instead of the original/segmented one, but it seems to
work fine in my (limited) tests.

Eric Dumazet suggested to simply shrink mss via ->gso_size to avoid
sofware segmentation.

However it turns out that skb_segment() assumes skb nr_frags is related
to mss size so we would BUG there.  I don't want to mess with it considering
Herbert and Eric disagree on what the correct behavior should be.

Hannes Frederic Sowa notes that when we would shrink gso_size
skb_segment would then also need to deal with the case where
SKB_MAX_FRAGS would be exceeded.

This uses sofware segmentation in the forward path when we hit ipv4
non-DF packets and the outgoing link mtu is too small.  Its not perfect,
but given the lack of bug reports wrt. GRO fwd being broken this is a
rare case anyway.  Also its not like this could not be improved later
once the dust settles.

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Reported-by: Marcelo Ricardo Leitner <mleitner@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
Changes since v5; propagated Herberts Ack from v5 to changelog.
[ resend is due to error in earlier patch in series ]

Changes since V4:
 - use new netif_skb_dev_features instead of netif_skb_features
   as we need dst->dev and not skb->dev (spotted by
   Hannes Frederic Sowa)

Changes since V3:
 - use ip_dst_mtu_maybe_forward instead of dst_mtu
 - add comment wrt. DF bit not being set

Changes since V2:
 - make this thing apply to current -net tree
 - kill unused variables in ip_forward/ip6_output

Changes since V1:
 suggestions from Eric Dumazet:
  - skip more expensive computation for small packets in fwd path
  - use netif_skb_features() feature mask and remove GSO flags
    instead of using 0 feature set.

 include/linux/skbuff.h | 17 ++++++++++++
 net/ipv4/ip_forward.c  | 71 ++++++++++++++++++++++++++++++++++++++++++++++++--
 net/ipv6/ip6_output.c  | 17 ++++++++++--
 3 files changed, 101 insertions(+), 4 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f589c9a..3ebbbe7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2916,5 +2916,22 @@ static inline bool skb_head_is_locked(const struct sk_buff *skb)
 {
 	return !skb->head_frag || skb_cloned(skb);
 }
+
+/**
+ * skb_gso_network_seglen - Return length of individual segments of a gso packet
+ *
+ * @skb: GSO skb
+ *
+ * skb_gso_network_seglen is used to determine the real size of the
+ * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP).
+ *
+ * The MAC/L2 header is not accounted for.
+ */
+static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
+{
+	unsigned int hdr_len = skb_transport_header(skb) -
+			       skb_network_header(skb);
+	return hdr_len + skb_gso_transport_seglen(skb);
+}
 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_SKBUFF_H */
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index e9f1217..f3869c1 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -39,6 +39,71 @@
 #include <net/route.h>
 #include <net/xfrm.h>
 
+static bool ip_may_fragment(const struct sk_buff *skb)
+{
+	return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) ||
+	       !skb->local_df;
+}
+
+static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+{
+	if (skb->len <= mtu || skb->local_df)
+		return false;
+
+	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
+		return false;
+
+	return true;
+}
+
+static bool ip_gso_exceeds_dst_mtu(const struct sk_buff *skb)
+{
+	unsigned int mtu;
+
+	if (skb->local_df || !skb_is_gso(skb))
+		return false;
+
+	mtu = ip_dst_mtu_maybe_forward(skb_dst(skb), true);
+
+	/* if seglen > mtu, do software segmentation for IP fragmentation on
+	 * output.  DF bit cannot be set since ip_forward would have sent
+	 * icmp error.
+	 */
+	return skb_gso_network_seglen(skb) > mtu;
+}
+
+/* called if GSO skb needs to be fragmented on forward */
+static int ip_forward_finish_gso(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	netdev_features_t features;
+	struct sk_buff *segs;
+	int ret = 0;
+
+	features = netif_skb_dev_features(skb, dst->dev);
+	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+	if (IS_ERR(segs)) {
+		kfree_skb(skb);
+		return -ENOMEM;
+	}
+
+	consume_skb(skb);
+
+	do {
+		struct sk_buff *nskb = segs->next;
+		int err;
+
+		segs->next = NULL;
+		err = dst_output(segs);
+
+		if (err && ret == 0)
+			ret = err;
+		segs = nskb;
+	} while (segs);
+
+	return ret;
+}
+
 static int ip_forward_finish(struct sk_buff *skb)
 {
 	struct ip_options *opt	= &(IPCB(skb)->opt);
@@ -49,6 +114,9 @@ static int ip_forward_finish(struct sk_buff *skb)
 	if (unlikely(opt->optlen))
 		ip_forward_options(skb);
 
+	if (ip_gso_exceeds_dst_mtu(skb))
+		return ip_forward_finish_gso(skb);
+
 	return dst_output(skb);
 }
 
@@ -91,8 +159,7 @@ int ip_forward(struct sk_buff *skb)
 
 	IPCB(skb)->flags |= IPSKB_FORWARDED;
 	mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
-	if (unlikely(skb->len > mtu && !skb_is_gso(skb) &&
-		     (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
+	if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, mtu)) {
 		IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
 			  htonl(mtu));
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index ef02b26..070a2fa 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -342,6 +342,20 @@ static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
 	return mtu;
 }
 
+static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
+{
+	if (skb->len <= mtu || skb->local_df)
+		return false;
+
+	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
+		return true;
+
+	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
+		return false;
+
+	return true;
+}
+
 int ip6_forward(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
@@ -466,8 +480,7 @@ int ip6_forward(struct sk_buff *skb)
 	if (mtu < IPV6_MIN_MTU)
 		mtu = IPV6_MIN_MTU;
 
-	if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
-	    (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
+	if (ip6_pkt_too_big(skb, mtu)) {
 		/* Again, force OUTPUT device used as source address */
 		skb->dev = dst->dev;
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
-- 
1.8.1.5

^ permalink raw reply related

* Re: [PATCH] macvlan: unregister net device when netdev_upper_dev_link() fails
From: David Miller @ 2014-02-13 22:13 UTC (permalink / raw)
  To: xiyou.wangcong; +Cc: netdev, kaber, cwang
In-Reply-To: <1392162690-6647-2-git-send-email-xiyou.wangcong@gmail.com>

From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Tue, 11 Feb 2014 15:51:29 -0800

> From: Cong Wang <cwang@twopensource.com>
> 
> rtnl_newlink() doesn't unregister it for us on failure.
> 
> Cc: Patrick McHardy <kaber@trash.net>
> Cc: David S. Miller <davem@davemloft.net>
> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
> Signed-off-by: Cong Wang <cwang@twopensource.com>

Applied.

^ permalink raw reply

* Re: [PATCH] net: correct error path in rtnl_newlink()
From: David Miller @ 2014-02-13 22:13 UTC (permalink / raw)
  To: xiyou.wangcong; +Cc: netdev, eric.dumazet, cwang
In-Reply-To: <1392162690-6647-3-git-send-email-xiyou.wangcong@gmail.com>

From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Tue, 11 Feb 2014 15:51:30 -0800

> From: Cong Wang <cwang@twopensource.com>
> 
> I saw the following BUG when ->newlink() fails in rtnl_newlink():
> 
> [   40.240058] kernel BUG at net/core/dev.c:6438!
> 
> this is due to free_netdev() is not supposed to be called before
> netdev is completely unregistered, therefore it is not correct
> to call free_netdev() here, at least for ops->newlink!=NULL case,
> many drivers call it in ->destructor so that rtnl_unlock() will
> take care of it, we probably don't need to do anything here.
> 
> Cc: David S. Miller <davem@davemloft.net>
> Cc: Eric Dumazet <eric.dumazet@gmail.com>
> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
> Signed-off-by: Cong Wang <cwang@twopensource.com>

Applied.

^ permalink raw reply

* Re: [PATCH v3 0/2] sctp: fix a problem with net_namespace
From: David Miller @ 2014-02-13 22:13 UTC (permalink / raw)
  To: wangweidong1; +Cc: nhorman, vyasevich, dborkman, sergei.shtylyov, netdev
In-Reply-To: <1392169484-8256-1-git-send-email-wangweidong1@huawei.com>

From: Wang Weidong <wangweidong1@huawei.com>
Date: Wed, 12 Feb 2014 09:44:42 +0800

> fix a problem with net_namespace, and optimize
> the sctp_sysctl_net_register.
> 
> v2 -> v3:
>   -patch2: add empty line after declaration as potined out by Sergei.
> 
> v1 -> v2:
>   -patch1: add Neil's ACK.
> 
> Wang Weidong (2):
>   sctp: fix a missed .data initialization
>   sctp: optimize the sctp_sysctl_net_register

Series applied, thanks.

^ permalink raw reply

* Re: [PATCH net] bonding: Fix deadlock in bonding driver when using netpoll
From: David Miller @ 2014-02-13 22:13 UTC (permalink / raw)
  To: dingtianhong; +Cc: fubar, vfalico, andy, netdev
In-Reply-To: <52FAF350.80005@huawei.com>

From: Ding Tianhong <dingtianhong@huawei.com>
Date: Wed, 12 Feb 2014 12:06:40 +0800

> The bonding driver take write locks and spin locks that are shared
> by the tx path in enslave processing and notification processing,
> If the netconsole is in use, the bonding can call printk which puts
> us in the netpoll tx path, if the netconsole is attached to the bonding
> driver, result in deadlock.
> 
> So add protection for these place, by checking the netpoll_block_tx
> state, we can defer the sending of the netconsole frames until a later
> time using the retransmit feature of netpoll_send_skb that is triggered
> on the return code NETDEV_TX_BUSY.
> 
> Cc: Jay Vosburgh <fubar@us.ibm.com>
> Cc: Veaceslav Falico <vfalico@redhat.com>
> Cc: Andy Gospodarek <andy@greyhouse.net>
> Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>

Applied.

^ permalink raw reply

* Re: [PATCH v2 1/2] net: core: introduce netif_skb_dev_features
From: David Miller @ 2014-02-13 22:17 UTC (permalink / raw)
  To: fw; +Cc: netdev
In-Reply-To: <1392329352-31606-1-git-send-email-fw@strlen.de>

From: Florian Westphal <fw@strlen.de>
Date: Thu, 13 Feb 2014 23:09:11 +0100

> Will be used by upcoming ipv4 forward path change that needs to
> determine feature mask using skb->dst->dev instead of skb->dev.
> 
> Signed-off-by: Florian Westphal <fw@strlen.de>

Applied and queued up for -stable.

^ permalink raw reply

* Re: [PATCH v6 2/2] net: ip, ipv6: handle gso skbs in forwarding path
From: David Miller @ 2014-02-13 22:17 UTC (permalink / raw)
  To: fw; +Cc: netdev
In-Reply-To: <1392329352-31606-2-git-send-email-fw@strlen.de>

From: Florian Westphal <fw@strlen.de>
Date: Thu, 13 Feb 2014 23:09:12 +0100

> Marcelo Ricardo Leitner reported problems when the forwarding link path
> has a lower mtu than the incoming one if the inbound interface supports GRO.
 ...
> However it turns out that skb_segment() assumes skb nr_frags is related
> to mss size so we would BUG there.  I don't want to mess with it considering
> Herbert and Eric disagree on what the correct behavior should be.
> 
> Hannes Frederic Sowa notes that when we would shrink gso_size
> skb_segment would then also need to deal with the case where
> SKB_MAX_FRAGS would be exceeded.
> 
> This uses sofware segmentation in the forward path when we hit ipv4
> non-DF packets and the outgoing link mtu is too small.  Its not perfect,
> but given the lack of bug reports wrt. GRO fwd being broken this is a
> rare case anyway.  Also its not like this could not be improved later
> once the dust settles.
> 
> Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
> Reported-by: Marcelo Ricardo Leitner <mleitner@redhat.com>
> Signed-off-by: Florian Westphal <fw@strlen.de>

Also applied and queued up for -stable, thanks.

^ permalink raw reply

* Re: [RFC PATCH v2 tip 0/7] 64-bit BPF insn set and tracing filters
From: Daniel Borkmann @ 2014-02-13 22:22 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Ingo Molnar, David S. Miller, Steven Rostedt, Peter Zijlstra,
	H. Peter Anvin, Thomas Gleixner, Masami Hiramatsu, Tom Zanussi,
	Jovi Zhangwei, Eric Dumazet, Linus Torvalds, Andrew Morton,
	Frederic Weisbecker, Arnaldo Carvalho de Melo, Pekka Enberg,
	Arjan van de Ven, Christoph Hellwig, linux-kernel, netdev
In-Reply-To: <52FD2908.8000009@redhat.com>

On 02/13/2014 09:20 PM, Daniel Borkmann wrote:
> On 02/07/2014 02:20 AM, Alexei Starovoitov wrote:
> ...
>> Hi Daniel,
>
> Thanks for your answer and sorry for the late reply.
>
>> Thank you for taking a look. Good questions. I had the same concerns.
>> Old BPF was carefully extended in specific places.
>> End result may look big at first glance, but every extension has specific
>> reason behind it. I tried to explain the reasoning in Documentation/bpf_jit.txt
>>
>> I'm planning to write an on-the-fly converter from old BPF to BPF64
>> when BPF64 manages to demonstrate that it is equally safe.
>> It is straight forward to convert. Encoding is very similar.
>> Core concepts are the same.
>> Try diff include/uapi/linux/filter.h include/linux/bpf.h
>> to see how much is reused.
>>
>> I believe that old BPF outlived itself and BPF64 should
>> replace it in all current use cases plus a lot more.
>> It just cannot happen at once.
>> BPF64 can come in. bpf32->bpf64 converter functioning.
>> JIT from bpf64->aarch64 and may be sparc64 needs to be in place.
>> Then old bpf can fade away.
>
> Do you see a possibility to integrate your work step by step? That is,
> to first integrate the interpreter part only; meaning, to detect "old"
> BPF programs e.g. coming from SO_ATTACH_FILTER et al and run them in
> compatibility mode while extended BPF is fully integrated and replaces
> the old engine in net/core/filter.c. Maybe, "old" programs can be
> transformed transparently to the new representation and then would be
> good to execute in eBPF. If possible, in such a way that in the first
> step JIT compilers won't need any upgrades. Once that is resolved,
> JIT compilers could successively migrate, arch by arch, to compile the
> new code? And last but not least the existing tools as well for handling
> eBPF. I think, if possible, that would be great. Also, I unfortunately
> haven't looked into your code too deeply yet due to time constraints,
> but I'm wondering e.g. for accessing some skb fields we currently use
> the "hack" to "overload" load instructions with negative arguments. Do
> we have a sort of "meta" instruction that is extendible in eBPF to avoid
> such things in future?
>
>>> First of all, I think it's very interesting work ! I'm just a bit concerned
>>> that this _huge_ patchset with 64 bit BPF, or however we call it, will line
>>
>> Huge?
>> kernel is only 2k
>> the rest is 6k of userspace LLVM backend where most of it is llvm's
>> boilerplate code. GCC backend for BPF is 3k.
>> The goal is to have both GCC and LLVM backends to be upstreamed
>> when kernel pieces are agreed upon.
>> For comparison existing tools/net/bpf* is 2.5k
>> but here with 6k we get optimizing compiler from C and assembler.
>>
>>> up in one row next to the BPF code we currently have and next to new
>>> nftables
>>> engine and we will end up with three such engines which do quite similar
>>> things and are all exposed to user space thus they need to be maintained
>>> _forever_, adding up legacy even more. What would be the long-term future
>>> use
>>> cases where the 64 bit engine comes into place compared to the current BPF
>>> engine? What are the concrete killer features? I didn't went through your
>>
>> killer features vs old bpf are:
>> - zero-cost function calls
>> - 32-bit vs 64-bit
>> - optimizing compiler that can compile C into BPF64
>>
>> Why call kernel function from BPF?
>> So that BPF instruction set has to be extended only once and JITs are
>> written only once.
>> Over the years many extensions crept into old BPF as 'negative offsets'.
>> but JITs don't support all of them and assume bpf input as 'skb' only.
>> seccomp is using old bpf, but, because of these limitations, cannot use JIT.
>> BPF64 allows seccomp to be JITed, since bpf input is generalized
>> as 'struct bpf_context'.
>> New 'negative offset' extension for old bpf would mean implementing it in
>> JITs of all architectures? Painful, but doable. We can do better.

I'm very curious, do you also have any performance numbers, e.g. for
networking by taking JIT'ed/non-JIT'ed BPF filters and compare them against
JIT'ed/non-JIT'ed eBPF filters to see how many pps we gain or loose e.g.
for a scenario with a middle box running cls_bpf .. or some other macro/
micro benchmark just to get a picture where both stand in terms of
performance? Who knows, maybe it would outperform nftables engine as
well? ;-) How would that look on a 32bit arch with eBPF that is 64bit?

>> Fixed instruction set that allows zero-overhead calls into kernel functions
>> is much more flexible and extendable in a clean way.
>> Take a look at kernel/trace/bpf_trace_callbacks.c
>> It is a customization of generic BPF64 core for 'tracing filters'.
>> The set of functions for networking and definition of 'bpf_context'
>> will be different.
>> So BPF64 for tracing need X extensions, BPF64 for networking needs Y
>> extensions, but core framework stays the same and JIT stays the same.
>>
>> How to do zero-overhead call?
>> Map BPF registers to native registers one to one
>> and have compatible calling convention between BPF and native.
>> Then BPF asm code:
>> mov R1, 1
>> mov R2, 2
>> call foo
>> will be JITed into x86-64:
>> mov rdi, 1
>> mov rsi, 2
>> call foo
>> That makes BPF64 calls into kernel as fast as possible.
>> Especially for networking we don't want overhead of FFI mechanisms.
>>
>> That's why A and X regs and lack of callee-saved regs make old BPF
>> impractical to support generic function calls.
>>
>> BPF64 defines R1-R5 as function arguments and R6-R9 as
>> callee-saved, so kernel can natively call into JIT-ed BPF and back
>> with no extra argument shuffling.
>> gcc/llvm backends know that R6-R9 will be preserved while BPF is
>> calling into kernel functions and can make proper optimizations.
>> R6-R9 map to rbx-r15 on x86-64. On aarch64 we have
>> even more freedom of mapping.
>>
>>> code
>>> in detail, but although we might/could have _some_ performance benefits but
>>> at
>>> the _huge_ cost of adding complexity. The current BPF I find okay to debug
>>> and
>>> to follow, but how would be debug'ability of 64 bit programs end up, as you
>>> mention, it becomes "unbearably complex"?
>>
>> "unbearably complex" was the reference to x86 static analyzer :)
>> It's difficult to reconstruct and verify control and data flow of x86 asm code.
>> Binary compilers do that (like transmeta and others), but that's not suitable
>> for kernel.
>>
>> Both old bpf asm and bpf64 asm code I find equivalent in readability.
>>
>> clang dropmon.c ...|llc -filetype=asm
>> will produce the following bpf64 asm code:
>>          mov     r6, r1
>>          ldd     r1, 8(r6)
>>          std     -8(r10), r1
>>          mov     r7, 0
>>          mov     r3, r10
>>          addi    r3, -8
>>          mov     r1, r6
>>          mov     r2, r7
>>          call    bpf_table_lookup
>>          jeqi    r0, 0 goto .LBB0_2
>>
>> which corresponds to C:
>> void dropmon(struct bpf_context *ctx)
>> {       void *loc;
>>          uint64_t *drop_cnt;
>>          loc = (void *)ctx->arg2;
>>          drop_cnt = bpf_table_lookup(ctx, 0, &loc);
>>          if (drop_cnt) ...
>>
>> I think restricted C is easier to program and debug.
>> Which is another killer feature of bpf64.
>>
>> Interesting use case would be if some kernel subsystem
>> decides to generate BPF64 insns on the fly and JIT them.
>> Sort of self-modifieable kernel code.
>> It's certainly easier to generate BPF64 binary with macroses
>> from linux/bpf.h instead of x86 binary...
>> I may be dreaming here :)
>>
>>> Did you instead consider to
>>> replace
>>> the current BPF engine instead, and add a sort of built-in compatibility
>>> mode for current BPF programs? I think that this would be the way better
>>> option to go with instead of adding a new engine next to the other. For
>>> maintainability, trying to replace the old one might be harder to do on the
>>> short term but better to maintain on the long run for everyone, no?
>>
>> Exactly. I think on-the-fly converter from bpf32->bpf64 is this built-in
>> compatibility layer. I completely agree that replacing bpf32 is hard
>> short term, since it will raise too many concerns about
>> stability/safety, but long term it's a way to go.
>
> Yes, I agree.
>
>> I'm open to all suggestions on how to make it more generic, useful,
>> faster.
>>
>> Thank you for feedback.
>
> Thank you, must have been really fun to implement this. :)
>
>> Regards,
>> Alexei
>>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: RTL8153 fails to get link after applying c7de7dec2 to 3.8 kernel
From: Grant Grundler @ 2014-02-13 22:22 UTC (permalink / raw)
  To: hayeswang; +Cc: Inki Yoo, netdev, David Miller
In-Reply-To: <FD21003FF7E540DCA7F8F39A2BC046B4@realtek.com.tw>

On Wed, Feb 12, 2014 at 6:55 PM, hayeswang <hayeswang@realtek.com> wrote:
...
> According to the information from our FAE about the dangles for samsung,
> the default values of some device power are disabled or off. I don't know
> the history about this. However, it would cause the dangle no link, because
> the ecm driver wouldn't enable them. That is, the dangle couldn't use the
> ECM mode.

That makes sense. How can I check that?

> I think you should create a udev rule to change the configuration
> to 1 when the dangle is plugged. Then, it could load the vendor mode driver.

Which "configuration" are you referring to?
Something like this? /sys/devices/12110000.usb/usb3/3-2/bConfigurationValue

I'm happy to try any command line you give me.

If that command works, I can write the udev rule for future testing
with these devices.

> I don't think so. The reason is as above. Change the configuration of the
> device to 1 (vendor mode) is what you have to do.

Ok...just need one more bit of guidance from you and this will be
resolved I think.

cheers,
grant

^ permalink raw reply

* [PATCH net-next] tcp: add pacing_rate information into tcp_info
From: Eric Dumazet @ 2014-02-13 22:27 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

From: Eric Dumazet <edumazet@google.com>

Add two new fields to struct tcp_info, to report sk_pacing_rate
and sk_max_pacing_rate to monitoring applications, as ss from iproute2.

User exported fields are 64bit, even if kernel is currently using 32bit
fields.

lpaa5:~# ss -i 
..
	 skmem:(r0,rb357120,t0,tb2097152,f1584,w1980880,o0,bl0) ts sack cubic
wscale:6,6 rto:400 rtt:0.875/0.75 mss:1448 cwnd:1 ssthresh:12 send
13.2Mbps pacing_rate 3336.2Mbps unacked:15 retrans:1/5448 lost:15
rcv_space:29200

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/uapi/linux/tcp.h |    3 +++
 net/ipv4/tcp.c           |    5 +++++
 2 files changed, 8 insertions(+)

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 377f1e59411d..3b9718328d8b 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -186,6 +186,9 @@ struct tcp_info {
 	__u32	tcpi_rcv_space;
 
 	__u32	tcpi_total_retrans;
+
+	__u64	tcpi_pacing_rate;
+	__u64	tcpi_max_pacing_rate;
 };
 
 /* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9f3a2db9109e..bed379c7abcd 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2794,6 +2794,11 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
 	info->tcpi_rcv_space = tp->rcvq_space.space;
 
 	info->tcpi_total_retrans = tp->total_retrans;
+
+	info->tcpi_pacing_rate = sk->sk_pacing_rate != ~0U ?
+					sk->sk_pacing_rate : ~0ULL;
+	info->tcpi_max_pacing_rate = sk->sk_max_pacing_rate != ~0U ?
+					sk->sk_max_pacing_rate : ~0ULL;
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
 

^ permalink raw reply related

* [PATCH net-next v2 00/14] tipc: clean up media and bearer layer
From: Jon Maloy @ 2014-02-13 22:27 UTC (permalink / raw)
  To: davem
  Cc: netdev, Paul Gortmaker, erik.hugne, ying.xue, maloy,
	tipc-discussion, Jon Maloy

This commit series aims at facilitating future changes to the
locking policy around nodes, links and bearers.

Currently, we have a big read/write lock (net_lock) that is used for
serializing all changes to the node, link and bearer lists, as well
as to their mutual pointers and references.

But, in order to allow for concurrent access to the contents of these
structures, net_lock is only used in read mode by the data path code,
and hence a finer granular locking policy must be applied inside the
scope of net_lock: a spinlock (node_lock) for each node structure,
and another one (bearer_lock) for protection of bearer structures.

This locking policy has proved hard to maintain. We have several 
times encountered contention problems between node_lock and 
bearer_lock, and with the advent of the RCU locking mechanism we
feel it is anyway obsolete and ripe for improvements.

We now plan to replace net_lock with an RCU lock, as well as
getting rid of bearer_lock altogether. This will both reduce data
path overhead and make the code more manageable, while reducing the
risk of future lock contention problems.

Prior to these changes, we need to do some necessary cleanup and
code consolidation. This is what we do with this commit series,
before we finally remove bearer_lock. In a later series we will
replace net_lock with an RCU lock.

v2:
 - Re-inserted a removed kerneldoc entry in commit#5, based on 
   feedback from D. Miller.

Jon Maloy (9):
  tipc: stricter behavior of message reassembly function
  tipc: change reception of tunnelled duplicate packets
  tipc: change reception of tunnelled failover packets
  tipc: change signature of tunnelling reception function
  tipc: more cleanup of tunnelling reception function
  tipc: rename stack variables in function tipc_link_tunnel_rcv
  tipc: changes to general packet reception algorithm
  tipc: delay delete of link when failover is needed
  tipc: add node_lock protection to link lookup function

Ying Xue (5):
  tipc: move code for resetting links from bearer.c to link.c
  tipc: move code for deleting links from bearer.c to link.c
  tipc: redefine 'started' flag in struct link to bitmap
  tipc: remove 'links' list from tipc_bearer struct
  tipc: remove bearer_lock from tipc_bearer struct

 net/tipc/bcast.c  |    7 +-
 net/tipc/bearer.c |   42 ++----
 net/tipc/bearer.h |    7 +-
 net/tipc/core.c   |    2 +-
 net/tipc/link.c   |  432 ++++++++++++++++++++++++++++++-----------------------
 net/tipc/link.h   |   34 ++---
 net/tipc/node.c   |    8 +-
 7 files changed, 283 insertions(+), 249 deletions(-)

-- 
1.7.9.5

^ permalink raw reply

* Re: [PATCH 2/3] net: UDP gro_receive accept csum=0
From: Tom Herbert @ 2014-02-13 22:27 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: David Miller, Linux Netdev List, Joseph Gasparakis, Jerry Chu,
	Eric Dumazet
In-Reply-To: <52FC7D0C.1020601@mellanox.com>

> Hi Tom,
>
> Considering the patch just "as is" vs. the current code, its OK.
>
> However, as skbs have only one indicator for the status of the checksum
> checks done by the receiving hardware, the basic assertion I thought we
> needed here is to reject skb if either it has the udp mark set or the
> encapsulation field is false, this is according to the conventions set by
> these two commits
>
> 0afb166 vxlan: Add capability of Rx checksum offload for inner packet
> 6a674e9 net: Add support for hardware-offloaded encapsulation
>
> B/c after finalizing the GRO work and decapsulating, skb injected up into
> the TCP stack with ip_summed equals to CHECKSUM_NONE are rejected. If this
> assumption is wrong, maybe we can remove testing the ip_summed field here
> altogether?
>
If I'm interpreting the semantics correctly, when skb->encapsulation
is set the ip_summed is valid for both the inner and outer header
(e.g. CHECKSUM_COMPLETE is always assumed okay for both layers). If
skb->encapsulation is not set then ip_summed is only valid for outer
header. So then the patch is broken in the case that encap is not set,
ip_summed is CHECKSUM_UNNECESSARY, csum == 0, and we need to validate
the inner checksum.

But even worse, is there a fundamental issue where udp4_csum_init is
able to change ip_summed to be CHECKSUM_UNNECESSARY (either check == 0
or ip_summed == CHECKSUM_UNNECESSARY) regardless of
skb->encapsulation, sending the packet into encap_rcv which could
ultimately incorrectly apply ip_summed on the inner TCP/UDP packet?

Tom

> Or.
>

^ permalink raw reply

* Re: [net-next 00/15] Intel Wired LAN Driver Updates
From: David Miller @ 2014-02-13 22:27 UTC (permalink / raw)
  To: aaron.f.brown; +Cc: netdev, gospo, sassmann
In-Reply-To: <1392292133-32120-1-git-send-email-aaron.f.brown@intel.com>

From: Aaron Brown <aaron.f.brown@intel.com>
Date: Thu, 13 Feb 2014 03:48:38 -0800

> This series contains updates to i40e and i40evf, primarily reset
> handling / refactoring along with a fair amount of minor cleanup.
> 
> Jesse fixes some spelling, bumps the version and other trivial fixes.
> Akeem sets a bit that is needed before shutdown in the case of 
> tx_timeout recovery failure.  Mitch refactors reset handling along 
> with a whole bunch of clean up.

Series applied, thanks.

^ permalink raw reply

* Re: [PATCH net-next] net: remove unnecessary return's
From: Julia Lawall @ 2014-02-13 22:28 UTC (permalink / raw)
  To: Dave Jones
  Cc: Julia Lawall, Joe Perches, Stephen Hemminger, David Miller,
	netdev
In-Reply-To: <20140213220021.GA1174@redhat.com>



On Thu, 13 Feb 2014, Dave Jones wrote:

> On Thu, Feb 13, 2014 at 10:55:23PM +0100, Julia Lawall wrote:
> 
>  > The patch below converts label: return; } to label: ; }.  I have only 
>  > scanned through the patches, not patched the code and looked at the 
>  > results, so I am not sure that it is completely correct.  On the other 
>  > hand, I'm also not sure that label: ; } is better than label: return; }, 
>  > either.  If people think it is, then I can cheack the results in more 
>  > detail.
> 
> Why not delete the label, and just replace the goto with a return if
> the label is at the end of the function ?

Here is an example.  Perhaps the uniformity of the if ... goto pattern is 
valuable, though?

julia

diff -u -p a/drivers/mfd/ab3100-core.c b/drivers/mfd/ab3100-core.c
--- a/drivers/mfd/ab3100-core.c
+++ b/drivers/mfd/ab3100-core.c
@@ -586,7 +586,7 @@ static void ab3100_setup_debugfs(struct
 
 	ab3100_dir = debugfs_create_dir("ab3100", NULL);
 	if (!ab3100_dir)
-		goto exit_no_debugfs;
+		return;
 
 	ab3100_reg_file = debugfs_create_file("registers",
 				S_IRUGO, ab3100_dir, ab3100,
@@ -623,7 +623,6 @@ static void ab3100_setup_debugfs(struct
 	debugfs_remove(ab3100_reg_file);
  exit_destroy_dir:
 	debugfs_remove(ab3100_dir);
- exit_no_debugfs:
 	return;
 }
 static inline void ab3100_remove_debugfs(void)

^ permalink raw reply

* [PATCH net-next v2 03/14] tipc: move code for deleting links from bearer.c to link.c
From: Jon Maloy @ 2014-02-13 22:29 UTC (permalink / raw)
  To: davem; +Cc: Jon Maloy, netdev, tipc-discussion
In-Reply-To: <1392330558-19048-1-git-send-email-jon.maloy@ericsson.com>

From: Ying Xue <ying.xue@windriver.com>

We break out the code for deleting attached links in the
function bearer_disable(), and define a new function named
tipc_link_delete_list() to do this job.

This commit incurs no functional changes, but makes the code of
function bearer_disable() cleaner. It is also a preparation
for a more important change to the bearer code, in a subsequent
commit in this series.

Signed-off-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
---
 net/tipc/bearer.c |    6 +-----
 net/tipc/link.c   |    9 +++++++++
 net/tipc/link.h   |    1 +
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index a3bdf5c..a5be053 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -366,16 +366,12 @@ static int tipc_reset_bearer(struct tipc_bearer *b_ptr)
  */
 static void bearer_disable(struct tipc_bearer *b_ptr)
 {
-	struct tipc_link *l_ptr;
-	struct tipc_link *temp_l_ptr;
 	struct tipc_link_req *temp_req;
 
 	pr_info("Disabling bearer <%s>\n", b_ptr->name);
 	spin_lock_bh(&b_ptr->lock);
 	b_ptr->media->disable_media(b_ptr);
-	list_for_each_entry_safe(l_ptr, temp_l_ptr, &b_ptr->links, link_list) {
-		tipc_link_delete(l_ptr);
-	}
+	tipc_link_delete_list(b_ptr);
 	temp_req = b_ptr->link_req;
 	b_ptr->link_req = NULL;
 	spin_unlock_bh(&b_ptr->lock);
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 3ff34e8..424e9f3 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -313,6 +313,15 @@ void tipc_link_delete(struct tipc_link *l_ptr)
 	kfree(l_ptr);
 }
 
+void tipc_link_delete_list(struct tipc_bearer *b_ptr)
+{
+	struct tipc_link *l_ptr;
+	struct tipc_link *temp_l_ptr;
+
+	list_for_each_entry_safe(l_ptr, temp_l_ptr, &b_ptr->links, link_list) {
+		tipc_link_delete(l_ptr);
+	}
+}
 
 /**
  * link_schedule_port - schedule port for deferred sending
diff --git a/net/tipc/link.h b/net/tipc/link.h
index 73beecb..994ebd1 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -219,6 +219,7 @@ void tipc_link_delete(struct tipc_link *l_ptr);
 void tipc_link_failover_send_queue(struct tipc_link *l_ptr);
 void tipc_link_dup_send_queue(struct tipc_link *l_ptr,
 			      struct tipc_link *dest);
+void tipc_link_delete_list(struct tipc_bearer *b_ptr);
 void tipc_link_reset_fragments(struct tipc_link *l_ptr);
 int tipc_link_is_up(struct tipc_link *l_ptr);
 int tipc_link_is_active(struct tipc_link *l_ptr);
-- 
1.7.9.5


------------------------------------------------------------------------------
Android apps run on BlackBerry 10
Introducing the new BlackBerry 10.2.1 Runtime for Android apps.
Now with support for Jelly Bean, Bluetooth, Mapview and more.
Get your Android app in front of a whole new audience.  Start now.
http://pubads.g.doubleclick.net/gampad/clk?id=124407151&iu=/4140/ostg.clktrk

^ permalink raw reply related

* [PATCH net-next v2 04/14] tipc: redefine 'started' flag in struct link to bitmap
From: Jon Maloy @ 2014-02-13 22:29 UTC (permalink / raw)
  To: davem; +Cc: Jon Maloy, netdev, tipc-discussion
In-Reply-To: <1392330558-19048-1-git-send-email-jon.maloy@ericsson.com>

From: Ying Xue <ying.xue@windriver.com>

Currently, the 'started' field in struct tipc_link represents only a
binary state, 'started' or 'not started'. We need it to represent
more link execution states in the coming commits in this series.
Hence, we rename the field to 'flags', and define the current
started/non-started state to be represented by the LSB bit of
that field.

Signed-off-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
---
 net/tipc/link.c |    4 ++--
 net/tipc/link.h |   22 +++++++++++-----------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/net/tipc/link.c b/net/tipc/link.c
index 424e9f3..2070d03 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -500,7 +500,7 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event)
 	struct tipc_link *other;
 	u32 cont_intv = l_ptr->continuity_interval;
 
-	if (!l_ptr->started && (event != STARTING_EVT))
+	if (!(l_ptr->flags & LINK_STARTED) && (event != STARTING_EVT))
 		return;		/* Not yet. */
 
 	/* Check whether changeover is going on */
@@ -626,7 +626,7 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event)
 			link_set_timer(l_ptr, cont_intv);
 			break;
 		case STARTING_EVT:
-			l_ptr->started = 1;
+			l_ptr->flags |= LINK_STARTED;
 			/* fall through */
 		case TIMEOUT_EVT:
 			tipc_link_send_proto_msg(l_ptr, RESET_MSG, 0, 0, 0, 0, 0);
diff --git a/net/tipc/link.h b/net/tipc/link.h
index 994ebd1..a900e74 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -1,7 +1,7 @@
 /*
  * net/tipc/link.h: Include file for TIPC link code
  *
- * Copyright (c) 1995-2006, Ericsson AB
+ * Copyright (c) 1995-2006, 2013, Ericsson AB
  * Copyright (c) 2004-2005, 2010-2011, Wind River Systems
  * All rights reserved.
  *
@@ -40,27 +40,27 @@
 #include "msg.h"
 #include "node.h"
 
-/*
- * Link reassembly status codes
+/* Link reassembly status codes
  */
 #define LINK_REASM_ERROR	-1
 #define LINK_REASM_COMPLETE	1
 
-/*
- * Out-of-range value for link sequence numbers
+/* Out-of-range value for link sequence numbers
  */
 #define INVALID_LINK_SEQ 0x10000
 
-/*
- * Link states
+/* Link working states
  */
 #define WORKING_WORKING 560810u
 #define WORKING_UNKNOWN 560811u
 #define RESET_UNKNOWN   560812u
 #define RESET_RESET     560813u
 
-/*
- * Starting value for maximum packet size negotiation on unicast links
+/* Link endpoint execution states
+ */
+#define LINK_STARTED    0x0001
+
+/* Starting value for maximum packet size negotiation on unicast links
  * (unless bearer MTU is less)
  */
 #define MAX_PKT_DEFAULT 1500
@@ -103,7 +103,7 @@ struct tipc_stats {
  * @timer: link timer
  * @owner: pointer to peer node
  * @link_list: adjacent links in bearer's list of links
- * @started: indicates if link has been started
+ * @flags: execution state flags for link endpoint instance
  * @checkpoint: reference point for triggering link continuity checking
  * @peer_session: link session # being used by peer end of link
  * @peer_bearer_id: bearer id used by link's peer endpoint
@@ -152,7 +152,7 @@ struct tipc_link {
 	struct list_head link_list;
 
 	/* Management and link supervision data */
-	int started;
+	unsigned int flags;
 	u32 checkpoint;
 	u32 peer_session;
 	u32 peer_bearer_id;
-- 
1.7.9.5


------------------------------------------------------------------------------
Android apps run on BlackBerry 10
Introducing the new BlackBerry 10.2.1 Runtime for Android apps.
Now with support for Jelly Bean, Bluetooth, Mapview and more.
Get your Android app in front of a whole new audience.  Start now.
http://pubads.g.doubleclick.net/gampad/clk?id=124407151&iu=/4140/ostg.clktrk

^ permalink raw reply related

* [PATCH net-next v2 06/14] tipc: change reception of tunnelled duplicate packets
From: Jon Maloy @ 2014-02-13 22:29 UTC (permalink / raw)
  To: davem; +Cc: Jon Maloy, netdev, tipc-discussion
In-Reply-To: <1392330558-19048-1-git-send-email-jon.maloy@ericsson.com>

When a second link to a destination comes up, some sender sockets will
steer their subsequent traffic through the new link. In order to
guarantee preserved packet order and cardinality for those sockets, we
tunnel a duplicate of the old link's send queue through the new link
before we open it for regular traffic. The last arriving packet copy,
on whichever link, will be dropped at the receiving end based on the
original sequence number, to ensure that only one copy is delivered to
the end receiver.

In this commit, we change the algorithm for receiving DUPLICATE_MSG
packets, at the same time delegating it to a new subfunction,
tipc_link_dup_rcv(). Instead of returning an extracted inner packet to
the packet reception loop in tipc_rcv(), we just add it to the receiving
(new) link's deferred packet queue. The packet will then be processed by
that link when it receives its first non-tunneled packet, i.e., at
latest when the changeover procedure is finished.

Because tipc_link_tunnel_rcv()/tipc_link_dup_rcv() now is consuming all
packets of type DUPLICATE_MSG, the calling tipc_rcv() function can omit
testing for this. This in turn means that the current conditional jump
to the label 'protocol_check' becomes redundant, and we can remove that
label.

Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Ying Xue <ying.xue@windriver.com>
---
 net/tipc/link.c |   53 ++++++++++++++++++++++++++++++++---------------------
 1 file changed, 32 insertions(+), 21 deletions(-)

diff --git a/net/tipc/link.c b/net/tipc/link.c
index e7e44ab..f227a38 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -1437,7 +1437,6 @@ void tipc_rcv(struct sk_buff *head, struct tipc_bearer *b_ptr)
 		u32 seq_no;
 		u32 ackd;
 		u32 released = 0;
-		int type;
 
 		head = head->next;
 		buf->next = NULL;
@@ -1525,7 +1524,6 @@ void tipc_rcv(struct sk_buff *head, struct tipc_bearer *b_ptr)
 		}
 
 		/* Now (finally!) process the incoming message */
-protocol_check:
 		if (unlikely(!link_working_working(l_ptr))) {
 			if (msg_user(msg) == LINK_PROTOCOL) {
 				link_recv_proto_msg(l_ptr, buf);
@@ -1599,15 +1597,11 @@ deliver:
 			tipc_node_unlock(n_ptr);
 			continue;
 		case CHANGEOVER_PROTOCOL:
-			type = msg_type(msg);
-			if (tipc_link_tunnel_rcv(&l_ptr, &buf)) {
-				msg = buf_msg(buf);
-				seq_no = msg_seqno(msg);
-				if (type == ORIGINAL_MSG)
-					goto deliver;
-				goto protocol_check;
-			}
-			break;
+			if (!tipc_link_tunnel_rcv(&l_ptr, &buf))
+				break;
+			msg = buf_msg(buf);
+			seq_no = msg_seqno(msg);
+			goto deliver;
 		default:
 			kfree_skb(buf);
 			buf = NULL;
@@ -2107,7 +2101,30 @@ static struct sk_buff *buf_extract(struct sk_buff *skb, u32 from_pos)
 	return eb;
 }
 
-/*  tipc_link_tunnel_rcv(): Receive a tunneled packet, sent
+
+
+/* tipc_link_dup_rcv(): Receive a tunnelled DUPLICATE_MSG packet.
+ * Owner node is locked.
+ */
+static void tipc_link_dup_rcv(struct tipc_link *l_ptr,
+			      struct sk_buff *t_buf)
+{
+	struct sk_buff *buf;
+
+	if (!tipc_link_is_up(l_ptr))
+		return;
+
+	buf = buf_extract(t_buf, INT_H_SIZE);
+	if (buf == NULL) {
+		pr_warn("%sfailed to extract inner dup pkt\n", link_co_err);
+		return;
+	}
+
+	/* Add buffer to deferred queue, if applicable: */
+	link_handle_out_of_seq_msg(l_ptr, buf);
+}
+
+/*  tipc_link_tunnel_rcv(): Receive a tunnelled packet, sent
  *  via other link as result of a failover (ORIGINAL_MSG) or
  *  a new active link (DUPLICATE_MSG). Failover packets are
  *  returned to the active link for delivery upwards.
@@ -2126,6 +2143,7 @@ static int tipc_link_tunnel_rcv(struct tipc_link **l_ptr,
 
 	if (bearer_id >= MAX_BEARERS)
 		goto exit;
+
 	dest_link = (*l_ptr)->owner->links[bearer_id];
 	if (!dest_link)
 		goto exit;
@@ -2138,15 +2156,8 @@ static int tipc_link_tunnel_rcv(struct tipc_link **l_ptr,
 	msg = msg_get_wrapped(tunnel_msg);
 
 	if (msg_typ == DUPLICATE_MSG) {
-		if (less(msg_seqno(msg), mod(dest_link->next_in_no)))
-			goto exit;
-		*buf = buf_extract(tunnel_buf, INT_H_SIZE);
-		if (*buf == NULL) {
-			pr_warn("%sduplicate msg dropped\n", link_co_err);
-			goto exit;
-		}
-		kfree_skb(tunnel_buf);
-		return 1;
+		tipc_link_dup_rcv(dest_link, tunnel_buf);
+		goto exit;
 	}
 
 	/* First original message ?: */
-- 
1.7.9.5


------------------------------------------------------------------------------
Android apps run on BlackBerry 10
Introducing the new BlackBerry 10.2.1 Runtime for Android apps.
Now with support for Jelly Bean, Bluetooth, Mapview and more.
Get your Android app in front of a whole new audience.  Start now.
http://pubads.g.doubleclick.net/gampad/clk?id=124407151&iu=/4140/ostg.clktrk

^ permalink raw reply related

* [PATCH net-next v2 08/14] tipc: change signature of tunnelling reception function
From: Jon Maloy @ 2014-02-13 22:29 UTC (permalink / raw)
  To: davem; +Cc: Jon Maloy, netdev, tipc-discussion
In-Reply-To: <1392330558-19048-1-git-send-email-jon.maloy@ericsson.com>

After the earlier commits in this series related to the function
tipc_link_tunnel_rcv(), we can now go further and simplify its
signature.

The function now consumes all DUPLICATE packets, and only returns such
ORIGINAL packets that are ready for immediate delivery, i.e., no
more link level protocol processing needs to be done by the caller.
As a consequence, the the caller, tipc_rcv(), does not access the link
pointer after call return, and it becomes unnecessary to pass a link
pointer reference in the call. Instead, we now only pass it the tunnel
link's owner node, which is sufficient to find the destination link for
the tunnelled packet.

Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Reviewed-by: Ying Xue <ying.xue@windriver.com>
---
 net/tipc/link.c |   14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/net/tipc/link.c b/net/tipc/link.c
index 26a54f4..f9f9068 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -78,7 +78,7 @@ static const char *link_unk_evt = "Unknown link event ";
 static void link_handle_out_of_seq_msg(struct tipc_link *l_ptr,
 				       struct sk_buff *buf);
 static void link_recv_proto_msg(struct tipc_link *l_ptr, struct sk_buff *buf);
-static int  tipc_link_tunnel_rcv(struct tipc_link **l_ptr,
+static int  tipc_link_tunnel_rcv(struct tipc_node *n_ptr,
 				 struct sk_buff **buf);
 static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tolerance);
 static int  link_send_sections_long(struct tipc_port *sender,
@@ -1597,7 +1597,7 @@ deliver:
 			tipc_node_unlock(n_ptr);
 			continue;
 		case CHANGEOVER_PROTOCOL:
-			if (!tipc_link_tunnel_rcv(&l_ptr, &buf))
+			if (!tipc_link_tunnel_rcv(n_ptr, &buf))
 				break;
 			msg = buf_msg(buf);
 			seq_no = msg_seqno(msg);
@@ -2174,7 +2174,7 @@ exit:
  *  returned to the active link for delivery upwards.
  *  Owner node is locked.
  */
-static int tipc_link_tunnel_rcv(struct tipc_link **l_ptr,
+static int tipc_link_tunnel_rcv(struct tipc_node *n_ptr,
 				struct sk_buff **buf)
 {
 	struct sk_buff *tunnel_buf = *buf;
@@ -2186,15 +2186,9 @@ static int tipc_link_tunnel_rcv(struct tipc_link **l_ptr,
 	if (bearer_id >= MAX_BEARERS)
 		goto exit;
 
-	dest_link = (*l_ptr)->owner->links[bearer_id];
+	dest_link = n_ptr->links[bearer_id];
 	if (!dest_link)
 		goto exit;
-	if (dest_link == *l_ptr) {
-		pr_err("Unexpected changeover message on link <%s>\n",
-		       (*l_ptr)->name);
-		goto exit;
-	}
-	*l_ptr = dest_link;
 
 	if (msg_typ == DUPLICATE_MSG) {
 		tipc_link_dup_rcv(dest_link, tunnel_buf);
-- 
1.7.9.5


------------------------------------------------------------------------------
Android apps run on BlackBerry 10
Introducing the new BlackBerry 10.2.1 Runtime for Android apps.
Now with support for Jelly Bean, Bluetooth, Mapview and more.
Get your Android app in front of a whole new audience.  Start now.
http://pubads.g.doubleclick.net/gampad/clk?id=124407151&iu=/4140/ostg.clktrk

^ permalink raw reply related

* [PATCH net-next v2 13/14] tipc: remove bearer_lock from tipc_bearer struct
From: Jon Maloy @ 2014-02-13 22:29 UTC (permalink / raw)
  To: davem; +Cc: Jon Maloy, netdev, tipc-discussion
In-Reply-To: <1392330558-19048-1-git-send-email-jon.maloy@ericsson.com>

From: Ying Xue <ying.xue@windriver.com>

After the earlier commits ("tipc: remove 'links' list from
tipc_bearer struct") and ("tipc: introduce new spinlock to protect
struct link_req"), there is no longer any need to protect struct
link_req or or any link list by use of bearer_lock. Furthermore,
we have eliminated the need for using bearer_lock during downcalls
(send) from the link to the bearer, since we have ensured that
bearers always have a longer life cycle that their associated links,
and always contain valid data.

So, the only need now for a lock protecting bearers is for guaranteeing
consistency of the bearer list itself. For this, it is sufficient, at
least for the time being, to continue applying 'net_lock´ in write mode.

By removing bearer_lock we also pre-empt introduction of issue b) descibed
in the previous commit "tipc: remove 'links' list from tipc_bearer struct":

"b) When the outer protection from net_lock is gone, taking
    bearer_lock and node_lock in opposite order of method 1) and 2)
    will become an obvious deadlock hazard".

Therefore, we now eliminate the bearer_lock spinlock.

Signed-off-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
---
 net/tipc/bcast.c  |    1 -
 net/tipc/bearer.c |   16 +++-------------
 net/tipc/bearer.h |    5 +----
 3 files changed, 4 insertions(+), 18 deletions(-)

diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index af35f76..06a639c 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -785,7 +785,6 @@ void tipc_bclink_init(void)
 	bcl->owner = &bclink->node;
 	bcl->max_pkt = MAX_PKT_DEFAULT_MCAST;
 	tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT);
-	spin_lock_init(&bcbearer->bearer.lock);
 	bcl->b_ptr = &bcbearer->bearer;
 	bcl->state = WORKING_WORKING;
 	strlcpy(bcl->name, tipc_bclink_name, TIPC_MAX_LINK_NAME);
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 60caa45..242cddd 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -327,7 +327,6 @@ restart:
 	b_ptr->net_plane = bearer_id + 'A';
 	b_ptr->active = 1;
 	b_ptr->priority = priority;
-	spin_lock_init(&b_ptr->lock);
 
 	res = tipc_disc_create(b_ptr, &b_ptr->bcast_addr, disc_domain);
 	if (res) {
@@ -351,9 +350,7 @@ static int tipc_reset_bearer(struct tipc_bearer *b_ptr)
 {
 	read_lock_bh(&tipc_net_lock);
 	pr_info("Resetting bearer <%s>\n", b_ptr->name);
-	spin_lock_bh(&b_ptr->lock);
 	tipc_link_reset_list(b_ptr->identity);
-	spin_unlock_bh(&b_ptr->lock);
 	read_unlock_bh(&tipc_net_lock);
 	return 0;
 }
@@ -365,19 +362,12 @@ static int tipc_reset_bearer(struct tipc_bearer *b_ptr)
  */
 static void bearer_disable(struct tipc_bearer *b_ptr, bool shutting_down)
 {
-	struct tipc_link_req *temp_req;
-
 	pr_info("Disabling bearer <%s>\n", b_ptr->name);
-	spin_lock_bh(&b_ptr->lock);
 	b_ptr->media->disable_media(b_ptr);
-	tipc_link_delete_list(b_ptr->identity, shutting_down);
-	temp_req = b_ptr->link_req;
-	b_ptr->link_req = NULL;
-	spin_unlock_bh(&b_ptr->lock);
-
-	if (temp_req)
-		tipc_disc_delete(temp_req);
 
+	tipc_link_delete_list(b_ptr->identity, shutting_down);
+	if (b_ptr->link_req)
+		tipc_disc_delete(b_ptr->link_req);
 	memset(b_ptr, 0, sizeof(struct tipc_bearer));
 }
 
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index 647cb1d..425dd81 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -107,10 +107,8 @@ struct tipc_media {
 
 /**
  * struct tipc_bearer - Generic TIPC bearer structure
- * @dev: ptr to associated network device
- * @usr_handle: pointer to additional media-specific information about bearer
+ * @media_ptr: pointer to additional media-specific information about bearer
  * @mtu: max packet size bearer can support
- * @lock: spinlock for controlling access to bearer
  * @addr: media-specific address associated with bearer
  * @name: bearer name (format = media:interface)
  * @media: ptr to media structure associated with bearer
@@ -133,7 +131,6 @@ struct tipc_bearer {
 	u32 mtu;				/* initalized by media */
 	struct tipc_media_addr addr;		/* initalized by media */
 	char name[TIPC_MAX_BEARER_NAME];
-	spinlock_t lock;
 	struct tipc_media *media;
 	struct tipc_media_addr bcast_addr;
 	u32 priority;
-- 
1.7.9.5


------------------------------------------------------------------------------
Android apps run on BlackBerry 10
Introducing the new BlackBerry 10.2.1 Runtime for Android apps.
Now with support for Jelly Bean, Bluetooth, Mapview and more.
Get your Android app in front of a whole new audience.  Start now.
http://pubads.g.doubleclick.net/gampad/clk?id=124407151&iu=/4140/ostg.clktrk
_______________________________________________
tipc-discussion mailing list
tipc-discussion@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/tipc-discussion

^ permalink raw reply related

* [PATCH net-next v2 00/14] tipc: clean up media and bearer layer
From: Jon Maloy @ 2014-02-13 22:29 UTC (permalink / raw)
  To: davem
  Cc: netdev, Paul Gortmaker, erik.hugne, ying.xue, maloy,
	tipc-discussion, Jon Maloy

This commit series aims at facilitating future changes to the
locking policy around nodes, links and bearers.

Currently, we have a big read/write lock (net_lock) that is used for
serializing all changes to the node, link and bearer lists, as well
as to their mutual pointers and references.

But, in order to allow for concurrent access to the contents of these
structures, net_lock is only used in read mode by the data path code,
and hence a finer granular locking policy must be applied inside the
scope of net_lock: a spinlock (node_lock) for each node structure,
and another one (bearer_lock) for protection of bearer structures.

This locking policy has proved hard to maintain. We have several 
times encountered contention problems between node_lock and 
bearer_lock, and with the advent of the RCU locking mechanism we
feel it is anyway obsolete and ripe for improvements.

We now plan to replace net_lock with an RCU lock, as well as
getting rid of bearer_lock altogether. This will both reduce data
path overhead and make the code more manageable, while reducing the
risk of future lock contention problems.

Prior to these changes, we need to do some necessary cleanup and
code consolidation. This is what we do with this commit series,
before we finally remove bearer_lock. In a later series we will
replace net_lock with an RCU lock.

v2:
 - Re-inserted a removed kerneldoc entry in commit#5, based on 
   feedback from D. Miller.

Jon Maloy (9):
  tipc: stricter behavior of message reassembly function
  tipc: change reception of tunnelled duplicate packets
  tipc: change reception of tunnelled failover packets
  tipc: change signature of tunnelling reception function
  tipc: more cleanup of tunnelling reception function
  tipc: rename stack variables in function tipc_link_tunnel_rcv
  tipc: changes to general packet reception algorithm
  tipc: delay delete of link when failover is needed
  tipc: add node_lock protection to link lookup function

Ying Xue (5):
  tipc: move code for resetting links from bearer.c to link.c
  tipc: move code for deleting links from bearer.c to link.c
  tipc: redefine 'started' flag in struct link to bitmap
  tipc: remove 'links' list from tipc_bearer struct
  tipc: remove bearer_lock from tipc_bearer struct

 net/tipc/bcast.c  |    7 +-
 net/tipc/bearer.c |   42 ++----
 net/tipc/bearer.h |    7 +-
 net/tipc/core.c   |    2 +-
 net/tipc/link.c   |  432 ++++++++++++++++++++++++++++++-----------------------
 net/tipc/link.h   |   34 ++---
 net/tipc/node.c   |    8 +-
 7 files changed, 283 insertions(+), 249 deletions(-)

-- 
1.7.9.5

^ permalink raw reply

* [PATCH net-next v2 01/14] tipc: stricter behavior of message reassembly function
From: Jon Maloy @ 2014-02-13 22:29 UTC (permalink / raw)
  To: davem
  Cc: netdev, Paul Gortmaker, erik.hugne, ying.xue, maloy,
	tipc-discussion, Jon Maloy
In-Reply-To: <1392330558-19048-1-git-send-email-jon.maloy@ericsson.com>

The function tipc_link_recv_fragment(struct sk_buff **buf) currently
leaves the value of the input buffer pointer undefined when it returns,
except when the return code indicates that the reassembly is complete.
This despite the fact that it always consumes the input buffer.

Here, we enforce a stricter behavior by this function, ensuring that
the returned buffer pointer is non-NULL if and only if the reassembly
is complete. This makes it possible to test for the buffer pointer as
criteria for successful reassembly.

We also rename the function to tipc_link_frag_rcv(), which is both
shorter and more in line with common naming practice in the network
subsystem.

Apart from the new name, these changes have no impact on current
users of the function, but makes it more practical for use in some
planned future commits.

Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Reviewed-by: Ying Xue <ying.xue@windriver.com>
---
 net/tipc/bcast.c |    6 +++---
 net/tipc/link.c  |   16 +++++++++-------
 net/tipc/link.h  |    6 +++---
 3 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index bf860d9..af35f76 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -481,9 +481,9 @@ receive:
 			tipc_link_recv_bundle(buf);
 		} else if (msg_user(msg) == MSG_FRAGMENTER) {
 			int ret;
-			ret = tipc_link_recv_fragment(&node->bclink.reasm_head,
-						      &node->bclink.reasm_tail,
-						      &buf);
+			ret = tipc_link_frag_rcv(&node->bclink.reasm_head,
+						 &node->bclink.reasm_tail,
+						 &buf);
 			if (ret == LINK_REASM_ERROR)
 				goto unlock;
 			spin_lock_bh(&bc_lock);
diff --git a/net/tipc/link.c b/net/tipc/link.c
index d4b5de4..17fbd15 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -1584,9 +1584,9 @@ deliver:
 			continue;
 		case MSG_FRAGMENTER:
 			l_ptr->stats.recv_fragments++;
-			ret = tipc_link_recv_fragment(&l_ptr->reasm_head,
-						      &l_ptr->reasm_tail,
-						      &buf);
+			ret = tipc_link_frag_rcv(&l_ptr->reasm_head,
+						 &l_ptr->reasm_tail,
+						 &buf);
 			if (ret == LINK_REASM_COMPLETE) {
 				l_ptr->stats.recv_fragmented++;
 				msg = buf_msg(buf);
@@ -2277,12 +2277,11 @@ static int link_send_long_buf(struct tipc_link *l_ptr, struct sk_buff *buf)
 	return dsz;
 }
 
-/*
- * tipc_link_recv_fragment(): Called with node lock on. Returns
+/* tipc_link_frag_rcv(): Called with node lock on. Returns
  * the reassembled buffer if message is complete.
  */
-int tipc_link_recv_fragment(struct sk_buff **head, struct sk_buff **tail,
-			    struct sk_buff **fbuf)
+int tipc_link_frag_rcv(struct sk_buff **head, struct sk_buff **tail,
+		       struct sk_buff **fbuf)
 {
 	struct sk_buff *frag = *fbuf;
 	struct tipc_msg *msg = buf_msg(frag);
@@ -2296,6 +2295,7 @@ int tipc_link_recv_fragment(struct sk_buff **head, struct sk_buff **tail,
 			goto out_free;
 		*head = frag;
 		skb_frag_list_init(*head);
+		*fbuf = NULL;
 		return 0;
 	} else if (*head &&
 		   skb_try_coalesce(*head, frag, &headstolen, &delta)) {
@@ -2315,10 +2315,12 @@ int tipc_link_recv_fragment(struct sk_buff **head, struct sk_buff **tail,
 		*tail = *head = NULL;
 		return LINK_REASM_COMPLETE;
 	}
+	*fbuf = NULL;
 	return 0;
 out_free:
 	pr_warn_ratelimited("Link unable to reassemble fragmented message\n");
 	kfree_skb(*fbuf);
+	*fbuf = NULL;
 	return LINK_REASM_ERROR;
 }
 
diff --git a/net/tipc/link.h b/net/tipc/link.h
index 3b6aa65..8addc5e 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -239,9 +239,9 @@ int tipc_link_send_sections_fast(struct tipc_port *sender,
 				 struct iovec const *msg_sect,
 				 unsigned int len, u32 destnode);
 void tipc_link_recv_bundle(struct sk_buff *buf);
-int  tipc_link_recv_fragment(struct sk_buff **reasm_head,
-			     struct sk_buff **reasm_tail,
-			     struct sk_buff **fbuf);
+int  tipc_link_frag_rcv(struct sk_buff **reasm_head,
+			struct sk_buff **reasm_tail,
+			struct sk_buff **fbuf);
 void tipc_link_send_proto_msg(struct tipc_link *l_ptr, u32 msg_typ, int prob,
 			      u32 gap, u32 tolerance, u32 priority,
 			      u32 acked_mtu);
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH net-next v2 02/14] tipc: move code for resetting links from bearer.c to link.c
From: Jon Maloy @ 2014-02-13 22:29 UTC (permalink / raw)
  To: davem
  Cc: netdev, Paul Gortmaker, erik.hugne, ying.xue, maloy,
	tipc-discussion, Jon Maloy
In-Reply-To: <1392330558-19048-1-git-send-email-jon.maloy@ericsson.com>

From: Ying Xue <ying.xue@windriver.com>

We break out the code for resetting attached links in the
function tipc_reset_bearer(), and define a new function named
tipc_link_reset_list() to do this job.

This commit incurs no functional changes, but makes the code
of function tipc_reset_bearer() cleaner. It is also a preparation
for a more important change to the bearer code, in a subsequent
commit in this series.

Signed-off-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
---
 net/tipc/bearer.c |   11 +----------
 net/tipc/link.c   |   12 ++++++++++++
 net/tipc/link.h   |    1 +
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index a38c899..a3bdf5c 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -350,19 +350,10 @@ exit:
  */
 static int tipc_reset_bearer(struct tipc_bearer *b_ptr)
 {
-	struct tipc_link *l_ptr;
-	struct tipc_link *temp_l_ptr;
-
 	read_lock_bh(&tipc_net_lock);
 	pr_info("Resetting bearer <%s>\n", b_ptr->name);
 	spin_lock_bh(&b_ptr->lock);
-	list_for_each_entry_safe(l_ptr, temp_l_ptr, &b_ptr->links, link_list) {
-		struct tipc_node *n_ptr = l_ptr->owner;
-
-		spin_lock_bh(&n_ptr->lock);
-		tipc_link_reset(l_ptr);
-		spin_unlock_bh(&n_ptr->lock);
-	}
+	tipc_link_reset_list(b_ptr);
 	spin_unlock_bh(&b_ptr->lock);
 	read_unlock_bh(&tipc_net_lock);
 	return 0;
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 17fbd15..3ff34e8 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -461,6 +461,18 @@ void tipc_link_reset(struct tipc_link *l_ptr)
 	link_reset_statistics(l_ptr);
 }
 
+void tipc_link_reset_list(struct tipc_bearer *b_ptr)
+{
+	struct tipc_link *l_ptr;
+
+	list_for_each_entry(l_ptr, &b_ptr->links, link_list) {
+		struct tipc_node *n_ptr = l_ptr->owner;
+
+		spin_lock_bh(&n_ptr->lock);
+		tipc_link_reset(l_ptr);
+		spin_unlock_bh(&n_ptr->lock);
+	}
+}
 
 static void link_activate(struct tipc_link *l_ptr)
 {
diff --git a/net/tipc/link.h b/net/tipc/link.h
index 8addc5e..73beecb 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -231,6 +231,7 @@ struct sk_buff *tipc_link_cmd_show_stats(const void *req_tlv_area,
 struct sk_buff *tipc_link_cmd_reset_stats(const void *req_tlv_area,
 					  int req_tlv_space);
 void tipc_link_reset(struct tipc_link *l_ptr);
+void tipc_link_reset_list(struct tipc_bearer *b_ptr);
 int tipc_link_send(struct sk_buff *buf, u32 dest, u32 selector);
 void tipc_link_send_names(struct list_head *message_list, u32 dest);
 int tipc_link_send_buf(struct tipc_link *l_ptr, struct sk_buff *buf);
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH net-next v2 05/14] tipc: remove 'links' list from tipc_bearer struct
From: Jon Maloy @ 2014-02-13 22:29 UTC (permalink / raw)
  To: davem
  Cc: netdev, Paul Gortmaker, erik.hugne, ying.xue, maloy,
	tipc-discussion, Jon Maloy
In-Reply-To: <1392330558-19048-1-git-send-email-jon.maloy@ericsson.com>

From: Ying Xue <ying.xue@windriver.com>

In our ongoing effort to simplify the TIPC locking structure,
we see a need to remove the linked list for tipc_links
in the bearer. This can be explained as follows.

Currently, we have three different ways to access a link,
via three different lists/tables:

1: Via a node hash table:
   Used by the time-critical outgoing/incoming data paths.
   (e.g. link_send_sections_fast() and tipc_recv_msg() ):

grab net_lock(read)
   find node from node hash table
   grab node_lock
       select link
       grab bearer_lock
          send_msg()
       release bearer_lock
   release node lock
release net_lock

2: Via a global linked list for nodes:
   Used by configuration commands (link_cmd_set_value())

grab net_lock(read)
   find node and link from global node list (using link name)
   grab node_lock
       update link
   release node lock
release net_lock

(Same locking order as above. No problem.)

3: Via the bearer's linked link list:
   Used by notifications from interface (e.g. tipc_disable_bearer() )

grab net_lock(write)
   grab bearer_lock
      get link ptr from bearer's link list
      get node from link
      grab node_lock
         delete link
      release node lock
   release bearer_lock
release net_lock

(Different order from above, but works because we grab the
outer net_lock in write mode first, excluding all other access.)

The first major goal in our simplification effort is to get rid
of the "big" net_lock, replacing it with rcu-locks when accessing
the node list and node hash array. This will come in a later patch
series.

But to get there we first need to rewrite access methods ##2 and 3,
since removal of net_lock would introduce three major problems:

a) In access method #2, we access the link before taking the
   protecting node_lock. This will not work once net_lock is gone,
   so we will have to change the access order. We will deal with
   this in a later commit in this series, "tipc: add node lock
   protection to link found by link_find_link()".

b) When the outer protection from net_lock is gone, taking
   bearer_lock and node_lock in opposite order of method 1) and 2)
   will become an obvious deadlock hazard. This is fixed in the
   commit ("tipc: remove bearer_lock from tipc_bearer struct")
   later in this series.

c) Similar to what is described in problem a), access method #3
   starts with using a link pointer that is unprotected by node_lock,
   in order to via that pointer find the correct node struct and
   lock it. Before we remove net_lock, this access order must be
   altered. This is what we do with this commit.

We can avoid introducing problem problem c) by even here using the
global node list to find the node, before accessing its links. When
we loop though the node list we use the own bearer identity as search
criteria, thus easily finding the links that are associated to the
resetting/disabling bearer. It should be noted that although this
method is somewhat slower than the current list traversal, it is in
no way time critical. This is only about resetting or deleting links,
something that must be considered relatively infrequent events.

As a bonus, we can get rid of the mutual pointers between links and
bearers. After this commit, pointer dependency go in one direction
only: from the link to the bearer.

This commit pre-empts introduction of problem c) as described above.

Signed-off-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
---
 net/tipc/bearer.c |    5 ++--
 net/tipc/bearer.h |    2 --
 net/tipc/core.c   |    2 +-
 net/tipc/link.c   |   67 +++++++++++++++++++----------------------------------
 net/tipc/link.h   |    8 +++----
 5 files changed, 30 insertions(+), 54 deletions(-)

diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index a5be053..4931eea 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -327,7 +327,6 @@ restart:
 	b_ptr->net_plane = bearer_id + 'A';
 	b_ptr->active = 1;
 	b_ptr->priority = priority;
-	INIT_LIST_HEAD(&b_ptr->links);
 	spin_lock_init(&b_ptr->lock);
 
 	res = tipc_disc_create(b_ptr, &b_ptr->bcast_addr, disc_domain);
@@ -353,7 +352,7 @@ static int tipc_reset_bearer(struct tipc_bearer *b_ptr)
 	read_lock_bh(&tipc_net_lock);
 	pr_info("Resetting bearer <%s>\n", b_ptr->name);
 	spin_lock_bh(&b_ptr->lock);
-	tipc_link_reset_list(b_ptr);
+	tipc_link_reset_list(b_ptr->identity);
 	spin_unlock_bh(&b_ptr->lock);
 	read_unlock_bh(&tipc_net_lock);
 	return 0;
@@ -371,7 +370,7 @@ static void bearer_disable(struct tipc_bearer *b_ptr)
 	pr_info("Disabling bearer <%s>\n", b_ptr->name);
 	spin_lock_bh(&b_ptr->lock);
 	b_ptr->media->disable_media(b_ptr);
-	tipc_link_delete_list(b_ptr);
+	tipc_link_delete_list(b_ptr->identity);
 	temp_req = b_ptr->link_req;
 	b_ptr->link_req = NULL;
 	spin_unlock_bh(&b_ptr->lock);
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index 4f5db9a..647cb1d 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -120,7 +120,6 @@ struct tipc_media {
  * @tolerance: default link tolerance for bearer
  * @identity: array index of this bearer within TIPC bearer array
  * @link_req: ptr to (optional) structure making periodic link setup requests
- * @links: list of non-congested links associated with bearer
  * @active: non-zero if bearer structure is represents a bearer
  * @net_plane: network plane ('A' through 'H') currently associated with bearer
  * @nodes: indicates which nodes in cluster can be reached through bearer
@@ -142,7 +141,6 @@ struct tipc_bearer {
 	u32 tolerance;
 	u32 identity;
 	struct tipc_link_req *link_req;
-	struct list_head links;
 	int active;
 	char net_plane;
 	struct tipc_node_map nodes;
diff --git a/net/tipc/core.c b/net/tipc/core.c
index f9e88d8..3f76b98 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -1,7 +1,7 @@
 /*
  * net/tipc/core.c: TIPC module code
  *
- * Copyright (c) 2003-2006, Ericsson AB
+ * Copyright (c) 2003-2006, 2013, Ericsson AB
  * Copyright (c) 2005-2006, 2010-2013, Wind River Systems
  * All rights reserved.
  *
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 2070d03..e7e44ab 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -147,11 +147,6 @@ int tipc_link_is_active(struct tipc_link *l_ptr)
 /**
  * link_timeout - handle expiration of link timer
  * @l_ptr: pointer to link
- *
- * This routine must not grab "tipc_net_lock" to avoid a potential deadlock conflict
- * with tipc_link_delete().  (There is no risk that the node will be deleted by
- * another thread because tipc_link_delete() always cancels the link timer before
- * tipc_node_delete() is called.)
  */
 static void link_timeout(struct tipc_link *l_ptr)
 {
@@ -213,8 +208,8 @@ static void link_set_timer(struct tipc_link *l_ptr, u32 time)
  * Returns pointer to link.
  */
 struct tipc_link *tipc_link_create(struct tipc_node *n_ptr,
-			      struct tipc_bearer *b_ptr,
-			      const struct tipc_media_addr *media_addr)
+				   struct tipc_bearer *b_ptr,
+				   const struct tipc_media_addr *media_addr)
 {
 	struct tipc_link *l_ptr;
 	struct tipc_msg *msg;
@@ -279,47 +274,32 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr,
 
 	k_init_timer(&l_ptr->timer, (Handler)link_timeout,
 		     (unsigned long)l_ptr);
-	list_add_tail(&l_ptr->link_list, &b_ptr->links);
 
 	link_state_event(l_ptr, STARTING_EVT);
 
 	return l_ptr;
 }
 
-/**
- * tipc_link_delete - delete a link
- * @l_ptr: pointer to link
- *
- * Note: 'tipc_net_lock' is write_locked, bearer is locked.
- * This routine must not grab the node lock until after link timer cancellation
- * to avoid a potential deadlock situation.
- */
-void tipc_link_delete(struct tipc_link *l_ptr)
-{
-	if (!l_ptr) {
-		pr_err("Attempt to delete non-existent link\n");
-		return;
-	}
-
-	k_cancel_timer(&l_ptr->timer);
 
-	tipc_node_lock(l_ptr->owner);
-	tipc_link_reset(l_ptr);
-	tipc_node_detach_link(l_ptr->owner, l_ptr);
-	tipc_link_purge_queues(l_ptr);
-	list_del_init(&l_ptr->link_list);
-	tipc_node_unlock(l_ptr->owner);
-	k_term_timer(&l_ptr->timer);
-	kfree(l_ptr);
-}
-
-void tipc_link_delete_list(struct tipc_bearer *b_ptr)
+void tipc_link_delete_list(unsigned int bearer_id)
 {
 	struct tipc_link *l_ptr;
-	struct tipc_link *temp_l_ptr;
+	struct tipc_node *n_ptr;
 
-	list_for_each_entry_safe(l_ptr, temp_l_ptr, &b_ptr->links, link_list) {
-		tipc_link_delete(l_ptr);
+	list_for_each_entry(n_ptr, &tipc_node_list, list) {
+		spin_lock_bh(&n_ptr->lock);
+		l_ptr = n_ptr->links[bearer_id];
+		if (l_ptr) {
+			tipc_link_reset(l_ptr);
+			tipc_node_detach_link(n_ptr, l_ptr);
+			spin_unlock_bh(&n_ptr->lock);
+
+			/* Nobody else can access this link now: */
+			del_timer_sync(&l_ptr->timer);
+			kfree(l_ptr);
+			continue;
+		}
+		spin_unlock_bh(&n_ptr->lock);
 	}
 }
 
@@ -470,15 +450,16 @@ void tipc_link_reset(struct tipc_link *l_ptr)
 	link_reset_statistics(l_ptr);
 }
 
-void tipc_link_reset_list(struct tipc_bearer *b_ptr)
+void tipc_link_reset_list(unsigned int bearer_id)
 {
 	struct tipc_link *l_ptr;
+	struct tipc_node *n_ptr;
 
-	list_for_each_entry(l_ptr, &b_ptr->links, link_list) {
-		struct tipc_node *n_ptr = l_ptr->owner;
-
+	list_for_each_entry(n_ptr, &tipc_node_list, list) {
 		spin_lock_bh(&n_ptr->lock);
-		tipc_link_reset(l_ptr);
+		l_ptr = n_ptr->links[bearer_id];
+		if (l_ptr)
+			tipc_link_reset(l_ptr);
 		spin_unlock_bh(&n_ptr->lock);
 	}
 }
diff --git a/net/tipc/link.h b/net/tipc/link.h
index a900e74..3340fc1 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -59,6 +59,7 @@
 /* Link endpoint execution states
  */
 #define LINK_STARTED    0x0001
+#define LINK_STOPPED    0x0002
 
 /* Starting value for maximum packet size negotiation on unicast links
  * (unless bearer MTU is less)
@@ -102,7 +103,6 @@ struct tipc_stats {
  * @media_addr: media address to use when sending messages over link
  * @timer: link timer
  * @owner: pointer to peer node
- * @link_list: adjacent links in bearer's list of links
  * @flags: execution state flags for link endpoint instance
  * @checkpoint: reference point for triggering link continuity checking
  * @peer_session: link session # being used by peer end of link
@@ -149,7 +149,6 @@ struct tipc_link {
 	struct tipc_media_addr media_addr;
 	struct timer_list timer;
 	struct tipc_node *owner;
-	struct list_head link_list;
 
 	/* Management and link supervision data */
 	unsigned int flags;
@@ -215,11 +214,10 @@ struct tipc_port;
 struct tipc_link *tipc_link_create(struct tipc_node *n_ptr,
 			      struct tipc_bearer *b_ptr,
 			      const struct tipc_media_addr *media_addr);
-void tipc_link_delete(struct tipc_link *l_ptr);
+void tipc_link_delete_list(unsigned int bearer_id);
 void tipc_link_failover_send_queue(struct tipc_link *l_ptr);
 void tipc_link_dup_send_queue(struct tipc_link *l_ptr,
 			      struct tipc_link *dest);
-void tipc_link_delete_list(struct tipc_bearer *b_ptr);
 void tipc_link_reset_fragments(struct tipc_link *l_ptr);
 int tipc_link_is_up(struct tipc_link *l_ptr);
 int tipc_link_is_active(struct tipc_link *l_ptr);
@@ -232,7 +230,7 @@ struct sk_buff *tipc_link_cmd_show_stats(const void *req_tlv_area,
 struct sk_buff *tipc_link_cmd_reset_stats(const void *req_tlv_area,
 					  int req_tlv_space);
 void tipc_link_reset(struct tipc_link *l_ptr);
-void tipc_link_reset_list(struct tipc_bearer *b_ptr);
+void tipc_link_reset_list(unsigned int bearer_id);
 int tipc_link_send(struct sk_buff *buf, u32 dest, u32 selector);
 void tipc_link_send_names(struct list_head *message_list, u32 dest);
 int tipc_link_send_buf(struct tipc_link *l_ptr, struct sk_buff *buf);
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH net-next v2 07/14] tipc: change reception of tunnelled failover packets
From: Jon Maloy @ 2014-02-13 22:29 UTC (permalink / raw)
  To: davem
  Cc: netdev, Paul Gortmaker, erik.hugne, ying.xue, maloy,
	tipc-discussion, Jon Maloy
In-Reply-To: <1392330558-19048-1-git-send-email-jon.maloy@ericsson.com>

When a link is reset, and there is a redundant link available, all
sender sockets will steer their subsequent traffic through the
remaining link. In order to guarantee preserved packet order and
cardinality during the transition, we tunnel the failing link's send
queue through the remaining link before we allow any sockets to use it.

In this commit, we change the algorithm for receiving failover
("ORIGINAL_MSG") packets in tipc_link_tunnel_rcv(), at the same time
delegating it to a new subfuncton, tipc_link_failover_rcv(). Instead
of directly returning an extracted inner packet to the packet reception
loop in tipc_rcv(), we first check if it is a message fragment, in which
case we append it to the reset link's fragment chain. If the fragment
chain is complete, we return the whole chain instead of the individual
buffer, eliminating any need for the tipc_rcv() loop to do reassembly of
tunneled packets.

This change makes it possible to further simplify tipc_link_tunnel_rcv(),
as well as the calling tipc_rcv() loop. We will do that in later
commits. It also makes it possible to identify a single spot in the code
where we can tell that a failover procedure is finished, something that
is useful when we are deleting links after a failover. This will also
be done in a later commit.

Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Reviewed-by: Ying Xue <ying.xue@windriver.com>
---
 net/tipc/link.c |   75 ++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 47 insertions(+), 28 deletions(-)

diff --git a/net/tipc/link.c b/net/tipc/link.c
index f227a38..26a54f4 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -2124,6 +2124,50 @@ static void tipc_link_dup_rcv(struct tipc_link *l_ptr,
 	link_handle_out_of_seq_msg(l_ptr, buf);
 }
 
+/*  tipc_link_failover_rcv(): Receive a tunnelled ORIGINAL_MSG packet
+ *  Owner node is locked.
+ */
+static struct sk_buff *tipc_link_failover_rcv(struct tipc_link *l_ptr,
+					      struct sk_buff *t_buf)
+{
+	struct tipc_msg *t_msg = buf_msg(t_buf);
+	struct sk_buff *buf = NULL;
+	struct tipc_msg *msg;
+
+	if (tipc_link_is_up(l_ptr))
+		tipc_link_reset(l_ptr);
+
+	/* First failover packet? */
+	if (l_ptr->exp_msg_count == START_CHANGEOVER)
+		l_ptr->exp_msg_count = msg_msgcnt(t_msg);
+
+	/* Should there be an inner packet? */
+	if (l_ptr->exp_msg_count) {
+		l_ptr->exp_msg_count--;
+		buf = buf_extract(t_buf, INT_H_SIZE);
+		if (buf == NULL) {
+			pr_warn("%sno inner failover pkt\n", link_co_err);
+			goto exit;
+		}
+		msg = buf_msg(buf);
+
+		if (less(msg_seqno(msg), l_ptr->reset_checkpoint)) {
+			kfree_skb(buf);
+			buf = NULL;
+			goto exit;
+		}
+		if (msg_user(msg) == MSG_FRAGMENTER) {
+			l_ptr->stats.recv_fragments++;
+			tipc_link_frag_rcv(&l_ptr->reasm_head,
+					   &l_ptr->reasm_tail,
+					   &buf);
+		}
+	}
+
+exit:
+	return buf;
+}
+
 /*  tipc_link_tunnel_rcv(): Receive a tunnelled packet, sent
  *  via other link as result of a failover (ORIGINAL_MSG) or
  *  a new active link (DUPLICATE_MSG). Failover packets are
@@ -2135,10 +2179,8 @@ static int tipc_link_tunnel_rcv(struct tipc_link **l_ptr,
 {
 	struct sk_buff *tunnel_buf = *buf;
 	struct tipc_link *dest_link;
-	struct tipc_msg *msg;
 	struct tipc_msg *tunnel_msg = buf_msg(tunnel_buf);
 	u32 msg_typ = msg_type(tunnel_msg);
-	u32 msg_count = msg_msgcnt(tunnel_msg);
 	u32 bearer_id = msg_bearer_id(tunnel_msg);
 
 	if (bearer_id >= MAX_BEARERS)
@@ -2153,42 +2195,19 @@ static int tipc_link_tunnel_rcv(struct tipc_link **l_ptr,
 		goto exit;
 	}
 	*l_ptr = dest_link;
-	msg = msg_get_wrapped(tunnel_msg);
 
 	if (msg_typ == DUPLICATE_MSG) {
 		tipc_link_dup_rcv(dest_link, tunnel_buf);
 		goto exit;
 	}
 
-	/* First original message ?: */
-	if (tipc_link_is_up(dest_link)) {
-		pr_info("%s<%s>, changeover initiated by peer\n", link_rst_msg,
-			dest_link->name);
-		tipc_link_reset(dest_link);
-		dest_link->exp_msg_count = msg_count;
-		if (!msg_count)
-			goto exit;
-	} else if (dest_link->exp_msg_count == START_CHANGEOVER) {
-		dest_link->exp_msg_count = msg_count;
-		if (!msg_count)
-			goto exit;
-	}
+	if (msg_type(tunnel_msg) == ORIGINAL_MSG) {
+		*buf = tipc_link_failover_rcv(dest_link, tunnel_buf);
 
-	/* Receive original message */
-	if (dest_link->exp_msg_count == 0) {
-		pr_warn("%sgot too many tunnelled messages\n", link_co_err);
-		goto exit;
-	}
-	dest_link->exp_msg_count--;
-	if (less(msg_seqno(msg), dest_link->reset_checkpoint)) {
-		goto exit;
-	} else {
-		*buf = buf_extract(tunnel_buf, INT_H_SIZE);
+		/* Do we have a buffer/buffer chain to return? */
 		if (*buf != NULL) {
 			kfree_skb(tunnel_buf);
 			return 1;
-		} else {
-			pr_warn("%soriginal msg dropped\n", link_co_err);
 		}
 	}
 exit:
-- 
1.7.9.5

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox