Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next] myri10ge: fix most sparse warnings
From: Andrew Gallatin @ 2012-12-04 20:17 UTC (permalink / raw)
  To: davem; +Cc: netdev, Andrew Gallatin

- convert remaining htonl/ntohl +__raw_read/__raw_writel to
  swab32 + readl/writel
- add missing __iomem qualifier in myri10ge_open()
- fix  dubious: x & !y warning by switching from logical to bitwise not

The swab32 conversion fixes a bug in myri10ge_led() where
big-endian machines would write the wrong pattern.

The only remaining warning (lock context imbalance) is due to
the use of __netif_tx_trylock(), and cannot easily be fixed.

Signed-off-by: Andrew Gallatin <gallatin@myri.com>
---
 drivers/net/ethernet/myricom/myri10ge/myri10ge.c |   10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
index a40234e..f8408d6 100644
--- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
+++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
@@ -1885,7 +1885,7 @@ static int myri10ge_led(struct myri10ge_priv *mgp, int on)
 	}
 	if (!on)
 		pattern = swab32(readl(mgp->sram + pattern_off + 4));
-	writel(htonl(pattern), mgp->sram + pattern_off);
+	writel(swab32(pattern), mgp->sram + pattern_off);
 	return 0;
 }
 
@@ -2294,7 +2294,7 @@ static int myri10ge_open(struct net_device *dev)
 	struct myri10ge_priv *mgp = netdev_priv(dev);
 	struct myri10ge_cmd cmd;
 	int i, status, big_pow2, slice;
-	u8 *itable;
+	u8 __iomem *itable;
 
 	if (mgp->running != MYRI10GE_ETH_STOPPED)
 		return -EBUSY;
@@ -2757,7 +2757,7 @@ again:
 					flags_next |= next_is_first *
 					    MXGEFW_FLAGS_FIRST;
 					rdma_count |= -(chop | next_is_first);
-					rdma_count += chop & !next_is_first;
+					rdma_count += chop & ~next_is_first;
 				} else if (likely(cum_len_next >= 0)) {	/* header ends */
 					int small;
 
@@ -3836,9 +3836,9 @@ static int myri10ge_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		goto abort_with_mtrr;
 	}
 	hdr_offset =
-	    ntohl(__raw_readl(mgp->sram + MCP_HEADER_PTR_OFFSET)) & 0xffffc;
+	    swab32(readl(mgp->sram + MCP_HEADER_PTR_OFFSET)) & 0xffffc;
 	ss_offset = hdr_offset + offsetof(struct mcp_gen_header, string_specs);
-	mgp->sram_size = ntohl(__raw_readl(mgp->sram + ss_offset));
+	mgp->sram_size = swab32(readl(mgp->sram + ss_offset));
 	if (mgp->sram_size > mgp->board_span ||
 	    mgp->sram_size <= MYRI10GE_FW_OFFSET) {
 		dev_err(&pdev->dev,
-- 
1.7.9.5

^ permalink raw reply related

* [Patch 1/1] net/phy: Add interrupt support for dp83640 phy.
From: Stephan Gatzka @ 2012-12-04 20:21 UTC (permalink / raw)
  To: netdev, davem, richardcochran; +Cc: Stephan Gatzka

Added functions for ack_interrupt and config_intr. Tested on an mpc5200b
powerpc board.

Signed-off-by: Stephan Gatzka <stephan.gatzka@gmail.com>
---
 drivers/net/phy/dp83640.c |   78 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 77 insertions(+), 1 deletion(-)

diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c
index 24e05c4..7490b6c 100644
--- a/drivers/net/phy/dp83640.c
+++ b/drivers/net/phy/dp83640.c
@@ -48,6 +48,21 @@
 #define CAL_TRIGGER	7
 #define PER_TRIGGER	6
 
+#define MII_DP83640_MICR 0x11
+#define MII_DP83640_MISR 0x12
+
+#define MII_DP83640_MICR_OE 0x1
+#define MII_DP83640_MICR_IE 0x2
+
+#define MII_DP83640_MISR_RHF_INT_EN 0x01
+#define MII_DP83640_MISR_FHF_INT_EN 0x02
+#define MII_DP83640_MISR_ANC_INT_EN 0x04
+#define MII_DP83640_MISR_DUP_INT_EN 0x08
+#define MII_DP83640_MISR_SPD_INT_EN 0x10
+#define MII_DP83640_MISR_LINK_INT_EN 0x20
+#define MII_DP83640_MISR_ED_INT_EN 0x40
+#define MII_DP83640_MISR_LQ_INT_EN 0x80
+
 /* phyter seems to miss the mark by 16 ns */
 #define ADJTIME_FIX	16
 
@@ -1043,6 +1058,65 @@ static void dp83640_remove(struct phy_device *phydev)
 	kfree(dp83640);
 }
 
+static int dp83640_ack_interrupt(struct phy_device *phydev)
+{
+	int err = phy_read(phydev, MII_DP83640_MISR);
+
+	if (err < 0)
+		return err;
+
+	return 0;
+}
+
+static int dp83640_config_intr(struct phy_device *phydev)
+{
+	int micr;
+	int misr;
+	int err;
+
+	if (phydev->interrupts == PHY_INTERRUPT_ENABLED) {
+		misr = phy_read(phydev, MII_DP83640_MISR);
+		if (misr < 0)
+			return misr;
+		misr |=
+			(MII_DP83640_MISR_ANC_INT_EN |
+			MII_DP83640_MISR_DUP_INT_EN |
+			MII_DP83640_MISR_SPD_INT_EN |
+			MII_DP83640_MISR_LINK_INT_EN);
+		err = phy_write(phydev, MII_DP83640_MISR, misr);
+		if (err < 0)
+			return err;
+
+		micr = phy_read(phydev, MII_DP83640_MICR);
+		if (micr < 0)
+			return micr;
+		micr |=
+			(MII_DP83640_MICR_OE |
+			MII_DP83640_MICR_IE);
+		return phy_write(phydev, MII_DP83640_MICR, micr);
+	} else {
+		micr = phy_read(phydev, MII_DP83640_MICR);
+		if (micr < 0)
+			return micr;
+		micr &=
+			~(MII_DP83640_MICR_OE |
+			MII_DP83640_MICR_IE);
+		err = phy_write(phydev, MII_DP83640_MICR, micr);
+		if (err < 0)
+			return err;
+
+		misr = phy_read(phydev, MII_DP83640_MISR);
+		if (misr < 0)
+			return misr;
+		misr &=
+			~(MII_DP83640_MISR_ANC_INT_EN |
+			MII_DP83640_MISR_DUP_INT_EN |
+			MII_DP83640_MISR_SPD_INT_EN |
+			MII_DP83640_MISR_LINK_INT_EN);
+		return phy_write(phydev, MII_DP83640_MISR, misr);
+	}
+}
+
 static int dp83640_hwtstamp(struct phy_device *phydev, struct ifreq *ifr)
 {
 	struct dp83640_private *dp83640 = phydev->priv;
@@ -1253,11 +1327,13 @@ static struct phy_driver dp83640_driver = {
 	.phy_id_mask	= 0xfffffff0,
 	.name		= "NatSemi DP83640",
 	.features	= PHY_BASIC_FEATURES,
-	.flags		= 0,
+	.flags		= PHY_HAS_INTERRUPT,
 	.probe		= dp83640_probe,
 	.remove		= dp83640_remove,
 	.config_aneg	= genphy_config_aneg,
 	.read_status	= genphy_read_status,
+	.ack_interrupt  = dp83640_ack_interrupt,
+	.config_intr    = dp83640_config_intr,
 	.ts_info	= dp83640_ts_info,
 	.hwtstamp	= dp83640_hwtstamp,
 	.rxtstamp	= dp83640_rxtstamp,
-- 
1.7.9.5

^ permalink raw reply related

* WARNING: drivers/net/ethernet/broadcom/tg3.o(.text+0x16006): Section mismatch in reference from the function tg3_get_invariants() to the variable .devinit.rodata:tg3_write_reorder_chipsets
From: kbuild test robot @ 2012-12-04 20:24 UTC (permalink / raw)
  To: Bill Pemberton; +Cc: netdev, Greg Kroah-Hartman

tree:   git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git master
head:   193c1e478cc496844fcbef402a10976c95a634ff
commit: 229b1ad1cb1a7159658f466c736bc3898dabb876 tg3: remove __dev* attributes
date:   25 hours ago
config: make ARCH=x86_64 allmodconfig

All warnings:

WARNING: drivers/net/ethernet/broadcom/tg3.o(.text+0x16006): Section mismatch in reference from the function tg3_get_invariants() to the variable .devinit.rodata:tg3_write_reorder_chipsets
The function tg3_get_invariants() references
the variable __devinitconst tg3_write_reorder_chipsets.
This is often because tg3_get_invariants lacks a __devinitconst
annotation or the annotation of tg3_write_reorder_chipsets is wrong.
--
WARNING: drivers/net/ethernet/broadcom/tg3.o(.text+0x231db): Section mismatch in reference from the function tg3_init_one() to the variable .devinit.rodata:tg3_dma_wait_state_chipsets
The function tg3_init_one() references
the variable __devinitconst tg3_dma_wait_state_chipsets.
This is often because tg3_init_one lacks a __devinitconst
annotation or the annotation of tg3_dma_wait_state_chipsets is wrong.

---
0-DAY kernel build testing backend         Open Source Technology Center
Fengguang Wu, Yuanhan Liu                              Intel Corporation

^ permalink raw reply

* Re: WARNING: drivers/net/ethernet/broadcom/tg3.o(.text+0x16006): Section mismatch in reference from the function tg3_get_invariants() to the variable .devinit.rodata:tg3_write_reorder_chipsets
From: David Miller @ 2012-12-04 20:28 UTC (permalink / raw)
  To: fengguang.wu; +Cc: wfp5p, netdev, gregkh
In-Reply-To: <50be5bf5./vbd9l3rQvm4EYDg%fengguang.wu@intel.com>

From: kbuild test robot <fengguang.wu@intel.com>
Date: Wed, 05 Dec 2012 04:24:21 +0800

> tree:   git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git master
> head:   193c1e478cc496844fcbef402a10976c95a634ff
> commit: 229b1ad1cb1a7159658f466c736bc3898dabb876 tg3: remove __dev* attributes
> date:   25 hours ago
> config: make ARCH=x86_64 allmodconfig
> 
> All warnings:

This is a result of the __dev* macro removal from the networking
drivers.  I think it will be resolved when Greg KH's tree gets merged.

The issue is that the PCI device ID macros in linux/pci.h still
use the __dev* tags.

Greg, please confirm.

^ permalink raw reply

* Re: WARNING: drivers/net/ethernet/broadcom/tg3.o(.text+0x16006): Section mismatch in reference from the function tg3_get_invariants() to the variable .devinit.rodata:tg3_write_reorder_chipsets
From: Greg KH @ 2012-12-04 20:30 UTC (permalink / raw)
  To: David Miller; +Cc: fengguang.wu, wfp5p, netdev
In-Reply-To: <20121204.152825.1014358628158241121.davem@davemloft.net>

On Tue, Dec 04, 2012 at 03:28:25PM -0500, David Miller wrote:
> From: kbuild test robot <fengguang.wu@intel.com>
> Date: Wed, 05 Dec 2012 04:24:21 +0800
> 
> > tree:   git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git master
> > head:   193c1e478cc496844fcbef402a10976c95a634ff
> > commit: 229b1ad1cb1a7159658f466c736bc3898dabb876 tg3: remove __dev* attributes
> > date:   25 hours ago
> > config: make ARCH=x86_64 allmodconfig
> > 
> > All warnings:
> 
> This is a result of the __dev* macro removal from the networking
> drivers.  I think it will be resolved when Greg KH's tree gets merged.
> 
> The issue is that the PCI device ID macros in linux/pci.h still
> use the __dev* tags.
> 
> Greg, please confirm.

Yes, that is correct.  Fengguang, you can stop reporting these kinds of
warnings, you are going to get a bunch of them from now until 3.8-rc1 is
out.

thanks,

greg k-h

^ permalink raw reply

* Re: [GIT PULL] Remove __dev* markings from the networking drivers
From: Greg KH @ 2012-12-04 20:30 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, wfp5p
In-Reply-To: <20121204.131726.103739824056368439.davem@davemloft.net>

On Tue, Dec 04, 2012 at 01:17:26PM -0500, David Miller wrote:
> From: David Miller <davem@davemloft.net>
> Date: Mon, 03 Dec 2012 15:36:23 -0500 (EST)
> 
> > From: Greg KH <gregkh@linuxfoundation.org>
> > Date: Mon, 3 Dec 2012 12:27:43 -0800
> > 
> >> Here's a pull request for all of Bill's __dev* removal patches for the
> >> networking drivers, based on net-next.  I'll send out a separate one for
> >> the wireless drivers in case those want to go through John's tree
> >> 
> >> If I should rework these in any way, please let me know.
> > 
> > Pulled, thanks Greg.
> 
> I had a reason to go look in detail at the changes in here.
> 
> It seemse the function declarations were not properly reformatted
> after the __dev* tags were removed.  You can't just search and replace
> this kind of stuff.  The result looks terrible.
> 
> Greg please check for things like this next time you send me changes
> written by someone else.

Ick, sorry about that.  Want me to fix them all back up?  It's the least
I could do.

greg k-h

^ permalink raw reply

* Re: [PATCH net-next 3/7] ipv6: improve ipv6_find_hdr() to skip empty routing headers
From: Ansis Atteka @ 2012-12-04 20:47 UTC (permalink / raw)
  To: Jesse Gross; +Cc: Pablo Neira Ayuso, David Miller, netdev, dev
In-Reply-To: <CAEP_g=_89wPzKi-F9VQaKxM5eAUhSx3KEQhOSJQgTNtfiFQTEw@mail.gmail.com>

On Tue, Dec 4, 2012 at 10:15 AM, Jesse Gross <jesse@nicira.com> wrote:
> On Mon, Dec 3, 2012 at 10:06 AM, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
>> On Mon, Dec 03, 2012 at 09:28:55AM -0800, Jesse Gross wrote:
>>> On Mon, Dec 3, 2012 at 6:04 AM, Pablo Neira Ayuso <pablo@netfilter.org> wrote:
>>> > On Thu, Nov 29, 2012 at 10:35:45AM -0800, Jesse Gross wrote:
>>> >> @@ -159,9 +162,10 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
>>> >>       }
>>> >>       len = skb->len - start;
>>> >>
>>> >> -     while (nexthdr != target) {
>>> >
>>> > If the offset is set as parameter via ipv6_find_hdr, we now are always
>>> > entering the loop even if we found the target header we're looking
>>> > for, before that didn't happen.
>>> >
>>> > Something seems wrong here to me.
>>>
>>> If the target header is a routing header then you might still need to
>>> continue searching because the first one that you see could be empty.
>>
>> OK, but if it's not a routing header what we're searching for (which
>> seems to be the case of netfilter/IPVS) we waste way more cycles on
>> copying the IPv6 header again and with way more things that are
>> completely useless.
>
> We could add a check to short circuit this but it seems like a
> premature optimization to me.
>
> Ansis, can you comment?

Pablo, I think that the only concern you have here is about
optimizations, right?

We could either:
1. add an "if" statement that terminates loop early; or
2. switch back to "while () {}" loop and change condition from
"nexthdr != target" to something like "nexthdr != target || (nexthdr
== NEXTHDR_ROUTING && flags && (*flags & IP6_FH_F_SKIP_RH))".

This function seemed like a good candidate to extend it for this. I
think that iptables could make use of it too (to figure out whether L4
checksums need to be updated in presence of routing headers and NAT).

Thanks,
Ansis

^ permalink raw reply

* Question about locking in ndisc.c
From: Paul Marks @ 2012-12-04 20:52 UTC (permalink / raw)
  To: netdev

I've been reading through net/ipv6/ndisc.c recently (trying to
determine if it'd be feasible to implement RFC6724 Rule 5.5), and I'm
confused by the lack of visible locking in the
ndisc_router_discovery() function.

http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=blob;f=net/ipv6/ndisc.c;h=2edce30ef7338cdf1916eef98b353c7dbbc7bcda;hb=HEAD#l1048

There are values written all over the place, for example:
- in6_dev->if_flags
- neigh->flags
- rt->rt6i_flags
- in6_dev->nd_parms->...
- in6_dev->tstamp

But I've looked through ndisc.c and icmp.c, and I can't find anything
that would protect this data from concurrent access.  Is there
something I'm missing?

^ permalink raw reply

* Re: [GIT PULL] Remove __dev* markings from the networking drivers
From: David Miller @ 2012-12-04 21:02 UTC (permalink / raw)
  To: gregkh; +Cc: netdev, wfp5p
In-Reply-To: <20121204203052.GB32674@kroah.com>

From: Greg KH <gregkh@linuxfoundation.org>
Date: Tue, 4 Dec 2012 12:30:52 -0800

> On Tue, Dec 04, 2012 at 01:17:26PM -0500, David Miller wrote:
>> It seemse the function declarations were not properly reformatted
>> after the __dev* tags were removed.  You can't just search and replace
>> this kind of stuff.  The result looks terrible.
>> 
>> Greg please check for things like this next time you send me changes
>> written by someone else.
> 
> Ick, sorry about that.  Want me to fix them all back up?  It's the least
> I could do.

If you could do that, I'd really appreciate it.

Thanks Greg.

^ permalink raw reply

* ip_rt_min_pmtu
From: Christopher Schramm @ 2012-12-04 21:04 UTC (permalink / raw)
  To: netdev

Hi,

I'm looking into an interesting detail of the Linux IPv4 implementation 
I stumbled upon during a University course.

In route.c there's a value ip_rt_min_pmtu, defined as 512 + 20 + 20, 
that tells Linux a minimum PMTU to use, even if e. g. an ICMP message 
tells it to set a smaller one.

Of course, this is not a problem in real world, but not 
standard-compliant, since RFC 791 defines a minimum MTU of 68 for IPv4. 
So I wonder what's the reason for the restriction.

I looked into it and found that it appeared in Linux 2.3.15 with the 
following ID in route.c:

v 1.71 1999/08/20 11:05:58 davem

While it was not present in Linux 2.3.14 with:

v 1.69 1999/06/09 10:11:02 davem

I couldn't find any related discussion or patch on the LKML around that 
dates, so I'm asking you for any hints to find out the reason for 
implementing this lower bound.

What I've found on the LKML is a topic around February 15th, 2001, 
titled "MTU and 2.4.x kernel", where Alexey Kuznetsov points out that 
the handling of "DF on syn frames" is broken for MTUs smaller than 128 
and "Preventing DoSes requires to block pmtu discovery at 576 or at 
least 552".

Does anybody know the actual reason for the change in 2.3.15? I first 
thought it's the common misinterpretation that 576 would be the lower 
bound for MTUs in IPv4, but I wonder why it was put in place as a patch 
years after the IPv4 implementation was already done. There seems to 
have been some clear reason for it. I also wonder why it has never been 
removed up to today if it's really nothing more than a mistake.

Would be great if someone could help me shed some light on this.

Regards

^ permalink raw reply

* WARNING: drivers/net/ethernet/dlink/sundance.o(.text+0x2e87): Section mismatch in reference from the function sundance_probe1() to the variable .devinit.rodata:sundance_pci_tbl
From: kbuild test robot @ 2012-12-04 22:23 UTC (permalink / raw)
  To: Bill Pemberton; +Cc: netdev, Greg Kroah-Hartman

tree:   git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git master
head:   193c1e478cc496844fcbef402a10976c95a634ff
commit: 64bc40de134bb5c7826ff384016f654219ed3956 dlink: remove __dev* attributes
date:   27 hours ago
config: make ARCH=x86_64 allmodconfig

All warnings:

WARNING: drivers/net/ethernet/dlink/sundance.o(.text+0x2e87): Section mismatch in reference from the function sundance_probe1() to the variable .devinit.rodata:sundance_pci_tbl
The function sundance_probe1() references
the variable __devinitconst sundance_pci_tbl.
This is often because sundance_probe1 lacks a __devinitconst
annotation or the annotation of sundance_pci_tbl is wrong.

---
0-DAY kernel build testing backend         Open Source Technology Center
Fengguang Wu, Yuanhan Liu                              Intel Corporation

^ permalink raw reply

* WARNING: drivers/net/ethernet/dec/tulip/tulip.o(.text+0x6f9b): Section mismatch in reference from the function tulip_init_one() to the variable .devinit.rodata:early_486_chipsets
From: kbuild test robot @ 2012-12-05  0:10 UTC (permalink / raw)
  To: Bill Pemberton; +Cc: netdev, Greg Kroah-Hartman

tree:   git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git master
head:   193c1e478cc496844fcbef402a10976c95a634ff
commit: 779c1a85813a4622cc3bb3d25ce10b523bd055ba tulip: remove __dev* attributes
date:   29 hours ago
config: make ARCH=x86_64 allmodconfig

All warnings:

WARNING: drivers/net/ethernet/dec/tulip/tulip.o(.text+0x6f9b): Section mismatch in reference from the function tulip_init_one() to the variable .devinit.rodata:early_486_chipsets
The function tulip_init_one() references
the variable __devinitconst early_486_chipsets.
This is often because tulip_init_one lacks a __devinitconst
annotation or the annotation of early_486_chipsets is wrong.

---
0-DAY kernel build testing backend         Open Source Technology Center
Fengguang Wu, Yuanhan Liu                              Intel Corporation

^ permalink raw reply

* NETDEV WATCHDOG: eth1 (r8169): transmit queue 0 timed out
From: Dave Jones @ 2012-12-05  1:19 UTC (permalink / raw)
  To: netdev

We continue to see warnings like this reported against the Fedora kernel
for a number of different NICs. I just hit this one myself for the first time
on that hardware iirc.

Anything else I can provide ?

	Dave

[22347.960097] ------------[ cut here ]------------
[22347.965388] WARNING: at net/sched/sch_generic.c:255 dev_watchdog+0x258/0x270()
[22347.973764] Hardware name:         
[22347.977748] NETDEV WATCHDOG: eth1 (r8169): transmit queue 0 timed out
[22347.985212] Modules linked in: ipt_MASQUERADE iptable_nat nf_nat_ipv4 nf_nat xt_LOG xt_limit ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 nf_conntrack_ipv4 nf_defrag_ipv4 ip6table_filter xt_conntrack ip6_tables nf_conntrack microcode pcspkr r8169 mii nfsd auth_rpcgss nfs_acl lockd sunrpc i915 video backlight i2c_algo_bit drm_kms_helper drm
[22348.020948] Pid: 0, comm: swapper/3 Not tainted 3.7.0-rc8+ #5
[22348.027483] Call Trace:
[22348.030420]  <IRQ>  [<ffffffff8104928f>] warn_slowpath_common+0x7f/0xc0
[22348.038029]  [<ffffffff81049386>] warn_slowpath_fmt+0x46/0x50
[22348.044710]  [<ffffffff814d35a8>] dev_watchdog+0x258/0x270
[22348.051096]  [<ffffffff814d3350>] ? dev_deactivate_queue.constprop.30+0x80/0x80
[22348.059408]  [<ffffffff8105afea>] call_timer_fn+0x8a/0x330
[22348.065791]  [<ffffffff8105af65>] ? call_timer_fn+0x5/0x330
[22348.072276]  [<ffffffff814d3350>] ? dev_deactivate_queue.constprop.30+0x80/0x80
[22348.080721]  [<ffffffff814d3350>] ? dev_deactivate_queue.constprop.30+0x80/0x80
[22348.089032]  [<ffffffff8105b4d4>] run_timer_softirq+0x244/0x380
[22348.095913]  [<ffffffff81052a00>] __do_softirq+0xe0/0x3c0
[22348.102204]  [<ffffffff810a71d4>] ? tick_program_event+0x24/0x30
[22348.109045]  [<ffffffff815e36cc>] call_softirq+0x1c/0x26
[22348.115236]  [<ffffffff810043cd>] do_softirq+0x8d/0xc0
[22348.121226]  [<ffffffff81052eb5>] irq_exit+0xd5/0xe0
[22348.126882]  [<ffffffff815e381b>] smp_apic_timer_interrupt+0x6b/0x98
[22348.134253]  [<ffffffff815e302f>] apic_timer_interrupt+0x6f/0x80
[22348.141293]  <EOI>  [<ffffffff8108bad5>] ? sched_clock_local+0x25/0xa0
[22348.148819]  [<ffffffff81343d3d>] ? intel_idle+0xfd/0x160
[22348.155110]  [<ffffffff81343d36>] ? intel_idle+0xf6/0x160
[22348.161399]  [<ffffffff81474dc9>] cpuidle_enter+0x19/0x20
[22348.167562]  [<ffffffff81475462>] cpuidle_idle_call+0xa2/0x5f0
[22348.174346]  [<ffffffff8100c59a>] cpu_idle+0xca/0x160
[22348.180235]  [<ffffffff81ceead4>] start_secondary+0x251/0x258
[22348.186776] ---[ end trace 62a349a839e1b608 ]---
[22348.261360] r8169 0000:05:00.0 eth1: link up

^ permalink raw reply

* WARNING: drivers/net/ethernet/broadcom/tg3.o(.text+0x1fe93): Section mismatch in reference from the function tg3_test_dma() to the variable .devinit.rodata:tg3_dma_wait_state_chipsets
From: kbuild test robot @ 2012-12-05  1:21 UTC (permalink / raw)
  To: Bill Pemberton; +Cc: netdev, Greg Kroah-Hartman

tree:   git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git master
head:   193c1e478cc496844fcbef402a10976c95a634ff
commit: 229b1ad1cb1a7159658f466c736bc3898dabb876 tg3: remove __dev* attributes
date:   30 hours ago
config: make ARCH=i386 allmodconfig

All warnings:

WARNING: drivers/net/ethernet/broadcom/tg3.o(.text+0x1fe93): Section mismatch in reference from the function tg3_test_dma() to the variable .devinit.rodata:tg3_dma_wait_state_chipsets
The function tg3_test_dma() references
the variable __devinitconst tg3_dma_wait_state_chipsets.
This is often because tg3_test_dma lacks a __devinitconst
annotation or the annotation of tg3_dma_wait_state_chipsets is wrong.

---
0-DAY kernel build testing backend         Open Source Technology Center
Fengguang Wu, Yuanhan Liu                              Intel Corporation

^ permalink raw reply

* Re: [Suggestion] net/atm : for sprintf, need check the total write length whether larger than a page.
From: Chen Gang @ 2012-12-05  1:26 UTC (permalink / raw)
  To: Chas Williams (CONTRACTOR); +Cc: David Miller, netdev
In-Reply-To: <201212040346.qB43kVSQ018028@thirdoffive.cmf.nrl.navy.mil>


  firstly, sorry for reply late (yesterday I have an annual leave for
some personal things).

  and thank you for your reply, too.


于 2012年12月04日 11:46, Chas Williams (CONTRACTOR) 写道:
>  static ssize_t show_address(struct device *cdev,
>  			    struct device_attribute *attr, char *buf)
>  {
> -	char *pos = buf;
>  	struct atm_dev *adev = to_atm_dev(cdev);
> -	int i;
> -
> -	for (i = 0; i < (ESI_LEN - 1); i++)
> -		pos += sprintf(pos, "%02x:", adev->esi[i]);
> -	pos += sprintf(pos, "%02x\n", adev->esi[i]);
>  
> -	return pos - buf;
> +	return scnprintf(buf, PAGE_SIZE, "%pM\n", adev->esi);
>  }
>  

  "%p" seems print a pointer, not contents of pointer (is it correct ?)

  will it change the original display format to outside ?


>  static ssize_t show_atmaddress(struct device *cdev,
>  			       struct device_attribute *attr, char *buf)
>  {
>  	unsigned long flags;
> -	char *pos = buf;
>  	struct atm_dev *adev = to_atm_dev(cdev);
>  	struct atm_dev_addr *aaddr;
>  	int bin[] = { 1, 2, 10, 6, 1 }, *fmt = bin;
> -	int i, j;
> +	int i, j, count = 0;
>  
>  	spin_lock_irqsave(&adev->lock, flags);
>  	list_for_each_entry(aaddr, &adev->local, entry) {
>  		for (i = 0, j = 0; i < ATM_ESA_LEN; ++i, ++j) {
>  			if (j == *fmt) {
> -				pos += sprintf(pos, ".");
> +				count += scnprintf(buf + count,
> +						   PAGE_SIZE - count, ".");
>  				++fmt;
>  				j = 0;
>  			}
> -			pos += sprintf(pos, "%02x",
> -				       aaddr->addr.sas_addr.prv[i]);
> +			count += scnprintf(buf + count,
> +					   PAGE_SIZE - count, "%02x",
> +					   aaddr->addr.sas_addr.prv[i]);
>  		}
> -		pos += sprintf(pos, "\n");
> +		count += scnprintf(buf + count, PAGE_SIZE - count, "\n");
>  	}
>  	spin_unlock_irqrestore(&adev->lock, flags);
>  
> -	return pos - buf;
> +	return count;
>  }
>  

  need we judge whether count >= PAGE_SIZE ?


  these are my suggestions, maybe not correct.

  :-)

  thanks.

-- 
Chen Gang

Asianux Corporation

^ permalink raw reply

* Re: [Suggestion] net/atm :  for sprintf, need check the total write length whether larger than a page.
From: Chen Gang @ 2012-12-05  1:28 UTC (permalink / raw)
  To: chas williams - CONTRACTOR
  Cc: David.Woodhouse, David Miller, krzysiek, Joe Perches, edumazet,
	netdev
In-Reply-To: <20121203104825.78123ecc@thirdoffive.cmf.nrl.navy.mil>

于 2012年12月03日 23:48, chas williams - CONTRACTOR 写道:
> yes this seems like it should be done.  maybe this week i will try to
> put something together unless you already have a patch somewhere.
> 

  thank you very much.


  and also sorry for replying late (yesterday, I have an annual leave
for some personal things)

^ permalink raw reply

* Re: [PATCH net-next v2] bridge: export multicast database via netlink
From: Cong Wang @ 2012-12-05  2:07 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, bridge, herbert, shemminger, tgraf, brouer
In-Reply-To: <20121204.132505.1229048595483168009.davem@davemloft.net>

On Tue, 2012-12-04 at 13:25 -0500, David Miller wrote:
> From: Cong Wang <amwang@redhat.com>
> Date: Mon,  3 Dec 2012 21:03:43 +0800
> 
> > TODO: remove debugging printk's
> 
> Can you please take care of this so we can consider this patch
> seriously for inclusion?
> 

Sure. I planned to send V3 when I wrote this TODO. ;) And it seems
Thomas and Stephen are ok with it now, I will remove these printk's.

Thanks.

^ permalink raw reply

* Re: [PATCH net-next v2] bridge: export multicast database via netlink
From: Cong Wang @ 2012-12-05  2:08 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: netdev, bridge, Herbert Xu, Jesper Dangaard Brouer, Thomas Graf,
	David S. Miller
In-Reply-To: <20121204085320.3e08986a@nehalam.linuxnetplumber.net>

On Tue, 2012-12-04 at 08:53 -0800, Stephen Hemminger wrote:
> On Mon,  3 Dec 2012 21:03:43 +0800
> Cong Wang <amwang@redhat.com> wrote:
> 
> > V2: drop patch 1/2, export ifindex directly
> >     Redesign netlink attributes
> >     Improve netlink seq check
> >     Handle IPv6 addr as well
> > 
> > TODO: remove debugging printk's
> > 
> > This patch exports bridge multicast database via netlink
> > message type RTM_GETMDB. Similar to fdb, but currently bridge-specific.
> > We may need to support modify multicast database too (RTM_{ADD,DEL}MDB).
> > 
> > Cc: Herbert Xu <herbert@gondor.apana.org.au>
> > Cc: Stephen Hemminger <shemminger@vyatta.com>
> > Cc: "David S. Miller" <davem@davemloft.net>
> > Cc: Thomas Graf <tgraf@suug.ch>
> > Cc: Jesper Dangaard Brouer <brouer@redhat.com>
> > Signed-off-by: Cong Wang <amwang@redhat.com>
> >     
> 
> Minor nit reported by checkpatch was the messages should be using the api
> which provides the most info in the log to identify.
> 
> WARNING: Prefer netdev_info(netdev, ... then dev_info(dev, ... then pr_info(...  to printk(KERN_INFO ...
> #190: FILE: net/bridge/br_mdb.c:28:
> +		printk(KERN_INFO "no router on bridge\n")
> 
> There is a set of macro's already for use in bridging code:
>                 br_info(br, "no router on bridge\n");
> 

Never mind, removing these printk's is in my TODO list. :)

^ permalink raw reply

* [PATCH net-next v3] bridge: export multicast database via netlink
From: Cong Wang @ 2012-12-05  2:50 UTC (permalink / raw)
  To: netdev
  Cc: Cong Wang, bridge, Herbert Xu, Jesper Dangaard Brouer,
	Thomas Graf, Stephen Hemminger, David S. Miller

V3: drop debugging printk's
    update selinux perm table as well

V2: drop patch 1/2, export ifindex directly
    Redesign netlink attributes
    Improve netlink seq check
    Handle IPv6 addr as well

This patch exports bridge multicast database via netlink
message type RTM_GETMDB. Similar to fdb, but currently bridge-specific.
We may need to support modify multicast database too (RTM_{ADD,DEL}MDB).

Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Cong Wang <amwang@redhat.com>
    
---
 include/uapi/linux/if_bridge.h |   55 ++++++++++++++
 include/uapi/linux/rtnetlink.h |    3 +
 net/bridge/Makefile            |    2 +-
 net/bridge/br_mdb.c            |  161 ++++++++++++++++++++++++++++++++++++++++
 net/bridge/br_multicast.c      |    2 +
 net/bridge/br_private.h        |    2 +
 security/selinux/nlmsgtab.c    |    1 +
 7 files changed, 225 insertions(+), 1 deletions(-)

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index b388579..9a0f6ff 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -116,4 +116,59 @@ enum {
 	__IFLA_BRIDGE_MAX,
 };
 #define IFLA_BRIDGE_MAX (__IFLA_BRIDGE_MAX - 1)
+
+/* Bridge multicast database attributes
+ * [MDBA_MDB] = {
+ *     [MDBA_MDB_ENTRY] = {
+ *         [MDBA_MDB_ENTRY_INFO]
+ *     }
+ * }
+ * [MDBA_ROUTER] = {
+ *    [MDBA_ROUTER_PORT]
+ * }
+ */
+enum {
+	MDBA_UNSPEC,
+	MDBA_MDB,
+	MDBA_ROUTER,
+	__MDBA_MAX,
+};
+#define MDBA_MAX (__MDBA_MAX - 1)
+
+enum {
+	MDBA_MDB_UNSPEC,
+	MDBA_MDB_ENTRY,
+	__MDBA_MDB_MAX,
+};
+#define MDBA_MDB_MAX (__MDBA_MDB_MAX - 1)
+
+enum {
+	MDBA_MDB_ENTRY_UNSPEC,
+	MDBA_MDB_ENTRY_INFO,
+	__MDBA_MDB_ENTRY_MAX,
+};
+#define MDBA_MDB_ENTRY_MAX (__MDBA_MDB_ENTRY_MAX - 1)
+
+enum {
+	MDBA_ROUTER_UNSPEC,
+	MDBA_ROUTER_PORT,
+	__MDBA_ROUTER_MAX,
+};
+#define MDBA_ROUTER_MAX (__MDBA_ROUTER_MAX - 1)
+
+struct br_port_msg {
+	__u32 ifindex;
+};
+
+struct br_mdb_entry {
+	__u32 ifindex;
+	struct {
+		union {
+			__be32	ip4;
+			struct in6_addr ip6;
+		} u;
+		__be16		proto;
+	} addr;
+};
+
 #endif /* _UAPI_LINUX_IF_BRIDGE_H */
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 33d29ce..354a1e7 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -125,6 +125,9 @@ enum {
 	RTM_GETNETCONF = 82,
 #define RTM_GETNETCONF RTM_GETNETCONF
 
+	RTM_GETMDB = 86,
+#define RTM_GETMDB RTM_GETMDB
+
 	__RTM_MAX,
 #define RTM_MAX		(((__RTM_MAX + 3) & ~3) - 1)
 };
diff --git a/net/bridge/Makefile b/net/bridge/Makefile
index d0359ea..e859098 100644
--- a/net/bridge/Makefile
+++ b/net/bridge/Makefile
@@ -12,6 +12,6 @@ bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o
 
 bridge-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o
 
-bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o
+bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o
 
 obj-$(CONFIG_BRIDGE_NF_EBTABLES) += netfilter/
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
new file mode 100644
index 0000000..f0f4dcf
--- /dev/null
+++ b/net/bridge/br_mdb.c
@@ -0,0 +1,161 @@
+#include <linux/err.h>
+#include <linux/if_ether.h>
+#include <linux/igmp.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/mld.h>
+#include <net/addrconf.h>
+#include <net/ip6_checksum.h>
+#endif
+
+#include "br_private.h"
+
+static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
+			       struct net_device *dev)
+{
+	struct net_bridge *br = netdev_priv(dev);
+	struct net_bridge_port *p;
+	struct hlist_node *n;
+	struct nlattr *nest;
+
+	if (!br->multicast_router || hlist_empty(&br->router_list))
+		return 0;
+
+	nest = nla_nest_start(skb, MDBA_ROUTER);
+	if (nest == NULL)
+		return -EMSGSIZE;
+
+	hlist_for_each_entry_rcu(p, n, &br->router_list, rlist) {
+		if (p && nla_put_u32(skb, MDBA_ROUTER_PORT, p->dev->ifindex))
+			goto fail;
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+fail:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
+			    struct net_device *dev)
+{
+	struct net_bridge *br = netdev_priv(dev);
+	struct net_bridge_mdb_htable *mdb;
+	struct nlattr *nest, *nest2;
+	int i, err = 0;
+	int idx = 0, s_idx = cb->args[1];
+
+	if (br->multicast_disabled)
+		return 0;
+
+	mdb = rcu_dereference(br->mdb);
+	if (!mdb)
+		return 0;
+
+	nest = nla_nest_start(skb, MDBA_MDB);
+	if (nest == NULL)
+		return -EMSGSIZE;
+
+	for (i = 0; i < mdb->max; i++) {
+		struct hlist_node *h;
+		struct net_bridge_mdb_entry *mp;
+		struct net_bridge_port_group *p, **pp;
+		struct net_bridge_port *port;
+
+		hlist_for_each_entry_rcu(mp, h, &mdb->mhash[i], hlist[mdb->ver]) {
+			if (idx < s_idx)
+				goto skip;
+
+			nest2 = nla_nest_start(skb, MDBA_MDB_ENTRY);
+			if (nest2 == NULL) {
+				err = -EMSGSIZE;
+				goto out;
+			}
+
+			for (pp = &mp->ports;
+			     (p = rcu_dereference(*pp)) != NULL;
+			      pp = &p->next) {
+				port = p->port;
+				if (port) {
+					struct br_mdb_entry e;
+					e.ifindex = port->dev->ifindex;
+					e.addr.u.ip4 = p->addr.u.ip4;
+#if IS_ENABLED(CONFIG_IPV6)
+					e.addr.u.ip6 = p->addr.u.ip6;
+#endif
+					e.addr.proto = p->addr.proto;
+					if (nla_put(skb, MDBA_MDB_ENTRY_INFO, sizeof(e), &e)) {
+						nla_nest_cancel(skb, nest2);
+						err = -EMSGSIZE;
+						goto out;
+					}
+				}
+			}
+			nla_nest_end(skb, nest2);
+		skip:
+			idx++;
+		}
+	}
+
+out:
+	cb->args[1] = idx;
+	cb->args[2] = mdb->seq;
+	nla_nest_end(skb, nest);
+	return err;
+}
+
+static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net_device *dev;
+	struct net *net = sock_net(skb->sk);
+	struct nlmsghdr *nlh;
+	int idx = 0, s_idx;
+
+	s_idx = cb->args[0];
+
+	rcu_read_lock();
+
+	for_each_netdev_rcu(net, dev) {
+		if (dev->priv_flags & IFF_EBRIDGE) {
+			struct br_port_msg *bpm;
+
+			if (idx < s_idx)
+				goto cont;
+
+			nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+					cb->nlh->nlmsg_seq, RTM_GETMDB,
+					sizeof(*bpm), NLM_F_MULTI);
+			if (nlh == NULL)
+				break;
+
+			bpm = nlmsg_data(nlh);
+			bpm->ifindex = dev->ifindex;
+			if (br_mdb_fill_info(skb, cb, dev) < 0)
+				goto out;
+			if (br_rports_fill_info(skb, cb, dev) < 0)
+				goto out;
+
+			nlmsg_end(skb, nlh);
+		cont:
+			idx++;
+		}
+	}
+
+out:
+	cb->seq = cb->args[2];
+	rcu_read_unlock();
+	cb->args[0] = idx;
+	return skb->len;
+}
+
+void br_mdb_init(void)
+{
+	rtnl_register(PF_BRIDGE, RTM_GETMDB, NULL, br_mdb_dump, NULL);
+}
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 2417434..d53e4f4 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -322,6 +322,7 @@ static int br_mdb_rehash(struct net_bridge_mdb_htable __rcu **mdbp, int max,
 
 	mdb->size = old ? old->size : 0;
 	mdb->ver = old ? old->ver ^ 1 : 0;
+	mdb->seq = old ? (old->seq + 1): 0;
 
 	if (!old || elasticity)
 		get_random_bytes(&mdb->secret, sizeof(mdb->secret));
@@ -1584,6 +1585,7 @@ void br_multicast_init(struct net_bridge *br)
 		    br_multicast_querier_expired, (unsigned long)br);
 	setup_timer(&br->multicast_query_timer, br_multicast_query_expired,
 		    (unsigned long)br);
+	br_mdb_init();
 }
 
 void br_multicast_open(struct net_bridge *br)
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index eb9cd42..6484069 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -105,6 +105,7 @@ struct net_bridge_mdb_htable
 	u32				max;
 	u32				secret;
 	u32				ver;
+	u32				seq;
 };
 
 struct net_bridge_port
@@ -432,6 +433,7 @@ extern int br_multicast_set_port_router(struct net_bridge_port *p,
 extern int br_multicast_toggle(struct net_bridge *br, unsigned long val);
 extern int br_multicast_set_querier(struct net_bridge *br, unsigned long val);
 extern int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val);
+extern void br_mdb_init(void);
 
 static inline bool br_multicast_is_router(struct net_bridge *br)
 {
diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c
index d309e7f..163aaa7 100644
--- a/security/selinux/nlmsgtab.c
+++ b/security/selinux/nlmsgtab.c
@@ -67,6 +67,7 @@ static struct nlmsg_perm nlmsg_route_perms[] =
 	{ RTM_GETADDRLABEL,	NETLINK_ROUTE_SOCKET__NLMSG_READ  },
 	{ RTM_GETDCB,		NETLINK_ROUTE_SOCKET__NLMSG_READ  },
 	{ RTM_SETDCB,		NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+	{ RTM_GETMDB,		NETLINK_ROUTE_SOCKET__NLMSG_READ  },
 };
 
 static struct nlmsg_perm nlmsg_tcpdiag_perms[] =

^ permalink raw reply related

* [RFC PATCH net-next 0/3 V4] net-tcp: TCP/IP stack bypass for loopback connections
From: Weiping Pan @ 2012-12-05  2:54 UTC (permalink / raw)
  To: netdev; +Cc: brutus, Weiping Pan
In-Reply-To: <CAEkNxbGTwGEBMCpSdib_paaxs0ekc52HWNo2Vai0nNSrZ1Zkng@mail.gmail.com>

1 patch overview
[PATCH 1/3] is the original V3 patch from  Bruce(brutus@google.com),
I just rebase it on top of net-next
commit 03f52a0a5542(ip6mr: Add sizeof verification to MRT6_ASSERT and
MT6_PIM).
http://patchwork.ozlabs.org/patch/184523/

[PATCH 2/3] is to fix the bug in tcp_close() that triggered by [PATCH 1/3],
since for tcp friends data skb, it has no tcp header, and its transport_header
is NULL,
so it will panic if we deference tcp_hdr(skb) in tcp_close().

[PATCH 3/3] is to fix the problem raised by Eric(eric.dumazet@gmail.com)
http://www.spinics.net/lists/netdev/msg210750.html

The sock pointed by request_sock->friend may be freed since it does not have a
lock to protect it.
I just delete request_sock->friend since I think it is useless.

For sk_buff->friend, it has the same problem, and I use
"atomic_add(skb->truesize, &sk->sk_wmem_alloc)" to guarantee that the sock can
not be freed before the skb is freed.

Then for 3-way handshake with tcp friends enabled,
SYN->friend is NULL, SYN/ACK->friend is set in tcp_make_synack(),
and ACK->friend is set in tcp_send_ack().

For normal data and FIN skbs, their friend pointer is NULL.

2 performance analysis
In short, TCP_RR increases by 5 or 6 times, TCP_CRR keeps the same,
TCP_SENDFILE and TCP_MAERTS are not stable, sometimes they increase while
sometimes decrease, so we can regard them as no increase.
For TCP_STREAM, it depends on the message size, if it is bigger than 8192, it
increases else decreases.

Intel(R) Xeon(R) E5506, 2 sockets, 8 cores, 2.13GHz
Memory 4GB
--------------------------------------------------------------------------
TCP friends performance results start

BASE means normal tcp with friends DISABLED.
AF_UNIX means sockets for local interprocess communication, for reference.
FRIENDS means tcp with friends ENABLED.
I set -s 51882 -m 16384 -M 87380 for all the three kinds of sockets by default.
The first percentage number is FRIENDS/BASE.
The second percentage number is FRIENDS/AF_UNIX.
We set -i 10,2 -I 95,20 to stabilize the statistics.

      BASE    AF_UNIX    FRIENDS               TCP_STREAM
  21741.94   30653.90   17115.66   78%   55%

      BASE    AF_UNIX    FRIENDS               TCP_MAERTS
  17464.98          -   17134.63   98%    -%

      BASE    AF_UNIX    FRIENDS             TCP_SENDFILE
     25707          -      30828  119%    -%

TCP_SENDFILE can not work with -i 10,2 -I 95,20 (strange), so I use average.

        MS       BASE    AF_UNIX    FRIENDS            TCP_STREAM_MS
         1      15.64       5.90       5.12   32%   86%
         2      30.93       9.81      10.48   33%  106%
         4      58.22      19.70      21.29   36%  108%
         8     117.00      39.00      42.74   36%  109%
        16     231.08      84.59      83.90   36%   99%
        32     439.39     159.93     163.03   37%  101%
        64     879.13     323.31     322.78   36%   99%
       128    1617.55     632.50     646.34   39%  102%
       256    3091.72    1316.36    1206.93   39%   91%
       512    5077.18    2359.51    2342.00   46%   99%
      1024    7403.20    6302.20    3335.23   45%   52%
      2048   10194.40   13922.19    5751.23   56%   41%
      4096   13338.08   22566.45    9447.29   70%   41%
      8192   14467.93   28122.20   13758.43   95%   48%
     16384   22463.15   37522.42   26804.36  119%   71%
     32768   14743.58   30591.61   17040.15  115%   55%
     65536   24743.77   33855.93   40418.15  163%  119%
    131072   13925.14   31762.52   48292.60  346%  152%
    262144   16126.15   32912.89   25610.47  158%   77%
    524288   12080.51   35059.27   30608.31  253%   87%
   1048576   10539.06   28200.14   16953.69  160%   60%
MS means Message Size in bytes, that is -m -M for netperf

        RR       BASE    AF_UNIX    FRIENDS                TCP_RR_RR
         1   13064.17   95593.46   72982.11  558%   76%
         2   12000.95   95477.38   65203.37  543%   68%
         4   12560.45   90758.17   69983.71  557%   77%
         8   17991.62   96794.53   77293.14  429%   79%
        16   13015.98   89384.69   83125.91  638%   92%
        32   13863.00   89870.17   88986.21  641%   99%
        64   10632.42   88906.59   83055.69  781%   93%
       128   13673.29   85629.27   92984.32  680%  108%
       256   12965.59   88117.74   86155.43  664%   97%
       512   17158.55   90866.08   85498.26  498%   94%
      1024   16951.15   82982.26   82286.84  485%   99%
      2048   11814.75   76684.40   83154.99  703%  108%
      4096   10393.91   63204.65   68558.71  659%  108%
      8192    7757.81   50318.63   50270.39  647%   99%
     16384    8147.26   37392.42   38619.89  474%  103%
     32768    8846.85   24847.64   28412.23  321%  114%
     65536    4974.59   16717.47   17327.65  348%  103%
    131072    4148.19    9053.56    9402.89  226%  103%
    262144    3029.66    5575.51    6119.65  201%  109%
    524288     923.40    3271.52    3649.37  395%  111%
   1048576     385.47    1173.18    1017.43  263%   86%
RR means Request Response Message Size in bytes, that is -r req,resp for netperf

        RR       BASE    AF_UNIX    FRIENDS               TCP_CRR_RR
         1    3424.40          -    3608.92  105%    -%
         2    3355.94          -    3523.77  105%    -%
         4    3437.05          -    3538.48  102%    -%
         8    3465.41          -    3630.49  104%    -%
        16    3495.40          -    3516.93  100%    -%
        32    3425.78          -    3524.90  102%    -%
        64    3432.01          -    3628.25  105%    -%
       128    3434.69          -    3573.88  104%    -%
       256    3413.94          -    3616.94  105%    -%
       512    3457.32          -    3675.38  106%    -%
      1024    3476.01          -    3634.25  104%    -%
      2048    3484.38          -    3539.96  101%    -%
      4096    3304.86          -    3564.57  107%    -%
      8192    3420.40          -    3599.02  105%    -%
     16384    3358.47          -    3571.60  106%    -%
     32768    3299.75          -    3469.19  105%    -%
     65536    2635.22          -    3292.74  124%    -%
    131072     119.97          -    3008.15 2507%    -%
    262144     933.66          -    2189.83  234%    -%
    524288     175.82          -     607.32  345%    -%
   1048576      41.70          -     296.22  710%    -%
RR means Request Response Message Size in bytes, that is -r req,resp for netperf -H 127.0.0.1

TCP friends performance results end
--------------------------------------------------------------------------

In short, I think the performance of tcp friends is not overwhelming than
loopback.

Friends VS AF__UNIX
Their call path are almost the same, but AF_UNIX uses its own send/recv codes
with proper locks,
so AF_UNIX's performance is much better than Friends.

Friends VS normal tcp
Friends directly adds skb into peer's sk_receive_queue if it gets the lock.
So the sender and receiver have serious lock contention.

Normal tcp sends skb into sk_write_queue, then sends it in net_tx_action() and
receives it in net_rx_action(), then adds it into peer's sk_receive_queue.
So the sender just needs to lock the write queue while the receiver just needs
to lock the receive queue, so they have little lock contention.

3 TODO
1 try to confirm that the root cause of regression in some cases is the lock
contention.

2 find a better way to fix the regression.

Any hints ?

thanks

Weiping Pan (3):
  Bruce's orignal tcp friend V3
  fix panic in tcp_close()
  delete request_sock->friend

 Documentation/networking/ip-sysctl.txt |    8 +
 include/linux/skbuff.h                 |    2 +
 include/net/inet_connection_sock.h     |    4 +
 include/net/sock.h                     |   32 ++-
 include/net/tcp.h                      |   13 +-
 net/core/skbuff.c                      |    1 +
 net/core/sock.c                        |    1 +
 net/core/stream.c                      |   36 ++
 net/ipv4/inet_connection_sock.c        |   38 ++
 net/ipv4/sysctl_net_ipv4.c             |    7 +
 net/ipv4/tcp.c                         |  610 +++++++++++++++++++++++++++-----
 net/ipv4/tcp_input.c                   |   12 +-
 net/ipv4/tcp_ipv4.c                    |    5 +
 net/ipv4/tcp_minisocks.c               |   11 +-
 net/ipv4/tcp_output.c                  |   19 +-
 15 files changed, 707 insertions(+), 92 deletions(-)

-- 
1.7.4.4

^ permalink raw reply

* [PATCH 1/3] Bruce's orignal tcp friend V3
From: Weiping Pan @ 2012-12-05  2:54 UTC (permalink / raw)
  To: netdev; +Cc: brutus, Weiping Pan
In-Reply-To: <cover.1354674151.git.wpan@redhat.com>

http://patchwork.ozlabs.org/patch/184523/

Rebase on top of commit 03f52a0a5542(ip6mr: Add sizeof verification to
MRT6_ASSERT and MT6_PIM).

Signed-off-by: Weiping Pan <wpan@redhat.com>
---
 Documentation/networking/ip-sysctl.txt |    8 +
 include/linux/skbuff.h                 |    2 +
 include/net/request_sock.h             |    1 +
 include/net/sock.h                     |   32 ++-
 include/net/tcp.h                      |   13 +-
 net/core/skbuff.c                      |    1 +
 net/core/sock.c                        |    1 +
 net/core/stream.c                      |   36 ++
 net/ipv4/inet_connection_sock.c        |   20 +
 net/ipv4/sysctl_net_ipv4.c             |    7 +
 net/ipv4/tcp.c                         |  604 +++++++++++++++++++++++++++-----
 net/ipv4/tcp_input.c                   |   22 +-
 net/ipv4/tcp_ipv4.c                    |    2 +
 net/ipv4/tcp_minisocks.c               |    4 +
 net/ipv4/tcp_output.c                  |   16 +-
 net/ipv6/tcp_ipv6.c                    |    1 +
 16 files changed, 679 insertions(+), 91 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 98ac0d7..152f488 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -214,6 +214,14 @@ tcp_fack - BOOLEAN
 	Enable FACK congestion avoidance and fast retransmission.
 	The value is not used, if tcp_sack is not enabled.
 
+tcp_friends - BOOLEAN
+	If set, TCP loopback socket pair stack bypass is enabled such
+	that all data sent will be directly queued to the receiver's
+	socket for receive. Note, normal connection establishment and
+	finish is used to make friends so any loopback interpose, e.g.
+	tcpdump, will see these TCP segements but no data segments.
+	Default: 1
+
 tcp_fin_timeout - INTEGER
 	Time to hold socket in state FIN-WAIT-2, if it was closed
 	by our side. Peer can be broken and never close its side,
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f2af494..c890f65 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -334,6 +334,7 @@ typedef unsigned char *sk_buff_data_t;
  *	@cb: Control buffer. Free for use by every layer. Put private vars here
  *	@_skb_refdst: destination entry (with norefcount bit)
  *	@sp: the security path, used for xfrm
+ *	@friend: loopback friend socket
  *	@len: Length of actual data
  *	@data_len: Data length
  *	@mac_len: Length of link layer header
@@ -409,6 +410,7 @@ struct sk_buff {
 #ifdef CONFIG_XFRM
 	struct	sec_path	*sp;
 #endif
+	struct sock		*friend;
 	unsigned int		len,
 				data_len;
 	__u16			mac_len,
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index a51dbd1..c6dfa26 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -66,6 +66,7 @@ struct request_sock {
 	unsigned long			expires;
 	const struct request_sock_ops	*rsk_ops;
 	struct sock			*sk;
+	struct sock			*friend;
 	u32				secid;
 	u32				peer_secid;
 };
diff --git a/include/net/sock.h b/include/net/sock.h
index c945fba..778d8dd 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -197,6 +197,7 @@ struct cg_proto;
   *	@sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
   *	@sk_lock:	synchronizer
   *	@sk_rcvbuf: size of receive buffer in bytes
+  *	@sk_friend: loopback friend socket
   *	@sk_wq: sock wait queue and async head
   *	@sk_rx_dst: receive input route used by early tcp demux
   *	@sk_dst_cache: destination cache
@@ -286,6 +287,14 @@ struct sock {
 	socket_lock_t		sk_lock;
 	struct sk_buff_head	sk_receive_queue;
 	/*
+	 * If socket has a friend (sk_friend != NULL) then a send skb is
+	 * enqueued directly to the friend's sk_receive_queue such that:
+	 *
+	 *        sk_sndbuf -> sk_sndbuf + sk_friend->sk_rcvbuf
+	 *   sk_wmem_queued -> sk_friend->sk_rmem_alloc
+	 */
+	struct sock		*sk_friend;
+	/*
 	 * The backlog queue is special, it is always used with
 	 * the per-socket spinlock held and requires low latency
 	 * access. Therefore we special case it's implementation.
@@ -703,24 +712,40 @@ static inline bool sk_acceptq_is_full(const struct sock *sk)
 	return sk->sk_ack_backlog > sk->sk_max_ack_backlog;
 }
 
+static inline int sk_wmem_queued_get(const struct sock *sk)
+{
+	if (sk->sk_friend)
+		return atomic_read(&sk->sk_friend->sk_rmem_alloc);
+	else
+		return sk->sk_wmem_queued;
+}
+
+static inline int sk_sndbuf_get(const struct sock *sk)
+{
+	if (sk->sk_friend)
+		return sk->sk_sndbuf + sk->sk_friend->sk_rcvbuf;
+	else
+		return sk->sk_sndbuf;
+}
+
 /*
  * Compute minimal free write space needed to queue new packets.
  */
 static inline int sk_stream_min_wspace(const struct sock *sk)
 {
-	return sk->sk_wmem_queued >> 1;
+	return sk_wmem_queued_get(sk) >> 1;
 }
 
 static inline int sk_stream_wspace(const struct sock *sk)
 {
-	return sk->sk_sndbuf - sk->sk_wmem_queued;
+	return sk_sndbuf_get(sk) - sk_wmem_queued_get(sk);
 }
 
 extern void sk_stream_write_space(struct sock *sk);
 
 static inline bool sk_stream_memory_free(const struct sock *sk)
 {
-	return sk->sk_wmem_queued < sk->sk_sndbuf;
+	return sk_wmem_queued_get(sk) < sk_sndbuf_get(sk);
 }
 
 /* OOB backlog add */
@@ -829,6 +854,7 @@ static inline void sock_rps_reset_rxhash(struct sock *sk)
 	})
 
 extern int sk_stream_wait_connect(struct sock *sk, long *timeo_p);
+extern int sk_stream_wait_friend(struct sock *sk, long *timeo_p);
 extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p);
 extern void sk_stream_wait_close(struct sock *sk, long timeo_p);
 extern int sk_stream_error(struct sock *sk, int flags, int err);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3202bde..5f82770 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -292,6 +292,7 @@ extern int sysctl_tcp_thin_dupack;
 extern int sysctl_tcp_early_retrans;
 extern int sysctl_tcp_limit_output_bytes;
 extern int sysctl_tcp_challenge_ack_limit;
+extern int sysctl_tcp_friends;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
@@ -687,6 +688,15 @@ void tcp_send_window_probe(struct sock *sk);
 #define TCPHDR_ECE 0x40
 #define TCPHDR_CWR 0x80
 
+/* If skb_get_friend() != NULL, TCP friends per packet state.
+ */
+struct friend_skb_parm {
+	bool	tail_inuse;		/* In use by skb_get_friend() send while */
+					/* on sk_receive_queue for tail put */
+};
+
+#define TCP_FRIEND_CB(tcb) (&(tcb)->header.hf)
+
 /* This is what the send packet queuing engine uses to pass
  * TCP per-packet control information to the transmission code.
  * We also store the host-order sequence numbers in here too.
@@ -699,6 +709,7 @@ struct tcp_skb_cb {
 #if IS_ENABLED(CONFIG_IPV6)
 		struct inet6_skb_parm	h6;
 #endif
+		struct friend_skb_parm	hf;
 	} header;	/* For incoming frames		*/
 	__u32		seq;		/* Starting sequence number	*/
 	__u32		end_seq;	/* SEQ + FIN + SYN + datalen	*/
@@ -1041,7 +1052,7 @@ static inline bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (sysctl_tcp_low_latency || !tp->ucopy.task)
+	if (sysctl_tcp_low_latency || !tp->ucopy.task || sk->sk_friend)
 		return false;
 
 	__skb_queue_tail(&tp->ucopy.prequeue, skb);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 880722e2..665826a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -690,6 +690,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 #ifdef CONFIG_XFRM
 	new->sp			= secpath_get(old->sp);
 #endif
+	new->friend		= old->friend;
 	memcpy(new->cb, old->cb, sizeof(old->cb));
 	new->csum		= old->csum;
 	new->local_df		= old->local_df;
diff --git a/net/core/sock.c b/net/core/sock.c
index a692ef4..a8f59a9 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2225,6 +2225,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 #ifdef CONFIG_NET_DMA
 	skb_queue_head_init(&sk->sk_async_wait_queue);
 #endif
+	sk->sk_friend		=	NULL;
 
 	sk->sk_send_head	=	NULL;
 
diff --git a/net/core/stream.c b/net/core/stream.c
index f5df85d..85e5b03 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -83,6 +83,42 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
 EXPORT_SYMBOL(sk_stream_wait_connect);
 
 /**
+ * sk_stream_wait_friend - Wait for a socket to make friends
+ * @sk: sock to wait on
+ * @timeo_p: for how long to wait
+ *
+ * Must be called with the socket locked.
+ */
+int sk_stream_wait_friend(struct sock *sk, long *timeo_p)
+{
+	struct task_struct *tsk = current;
+	DEFINE_WAIT(wait);
+	int done;
+
+	do {
+		int err = sock_error(sk);
+		if (err)
+			return err;
+		if (!sk->sk_friend)
+			return -EBADFD;
+		if (!*timeo_p)
+			return -EAGAIN;
+		if (signal_pending(tsk))
+			return sock_intr_errno(*timeo_p);
+
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+		sk->sk_write_pending++;
+		done = sk_wait_event(sk, timeo_p,
+				     !sk->sk_err &&
+				     sk->sk_friend->sk_friend);
+		finish_wait(sk_sleep(sk), &wait);
+		sk->sk_write_pending--;
+	} while (!done);
+	return 0;
+}
+EXPORT_SYMBOL(sk_stream_wait_friend);
+
+/**
  * sk_stream_closing - Return 1 if we still have things to send in our buffers.
  * @sk: socket to verify
  */
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 2026542..ce4b79b 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -659,6 +659,26 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
 	if (newsk != NULL) {
 		struct inet_connection_sock *newicsk = inet_csk(newsk);
 
+		if (req->friend) {
+			/*
+			 * Make friends with the requestor but the ACK of
+			 * the request is already in-flight so the race is
+			 * on to make friends before the ACK is processed.
+			 * If the requestor's sk_friend value is != NULL
+			 * then the requestor has already processed the
+			 * ACK so indicate state change to wake'm up.
+			 */
+			struct sock *was;
+
+			sock_hold(req->friend);
+			newsk->sk_friend = req->friend;
+			sock_hold(newsk);
+			was = xchg(&req->friend->sk_friend, newsk);
+			/* If requester already connect()ed, maybe sleeping */
+			if (was && !sock_flag(req->friend, SOCK_DEAD))
+				sk->sk_state_change(req->friend);
+		}
+
 		newsk->sk_state = TCP_SYN_RECV;
 		newicsk->icsk_bind_hash = NULL;
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d84400b..4ca53db 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -796,6 +796,13 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero
 	},
+	{
+		.procname	= "tcp_friends",
+		.data		= &sysctl_tcp_friends,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e6eace1..4327deb 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -310,6 +310,56 @@ struct tcp_splice_state {
 };
 
 /*
+ * Validate friendp, if not a friend return 0, else if friend is also a
+ * friend return 1, else friendp points to a listen()er so wait for our
+ * friend to be ready then update friendp with pointer to the real friend
+ * and return 1, else an error has occurred so return a -errno.
+ */
+static inline int tcp_friend_validate(struct sock *sk, struct sock **friendp,
+			      long *timeo)
+{
+	struct sock *friend = *friendp;
+
+	if (!friend)
+		return 0;
+	if (unlikely(!friend->sk_friend)) {
+		/* Friendship not complete, wait? */
+		int err;
+
+		if (!timeo)
+			return -EAGAIN;
+		err = sk_stream_wait_friend(sk, timeo);
+		if (err < 0)
+			return err;
+		*friendp = sk->sk_friend;
+	}
+	return 1;
+}
+
+static inline int tcp_friend_send_lock(struct sock *friend)
+{
+	int err = 0;
+
+	spin_lock_bh(&friend->sk_lock.slock);
+	if (unlikely(friend->sk_shutdown & RCV_SHUTDOWN)) {
+		spin_unlock_bh(&friend->sk_lock.slock);
+		err = -ECONNRESET;
+	}
+
+	return err;
+}
+
+static inline void tcp_friend_recv_lock(struct sock *friend)
+{
+	spin_lock_bh(&friend->sk_lock.slock);
+}
+
+static void tcp_friend_unlock(struct sock *friend)
+{
+	spin_unlock_bh(&friend->sk_lock.slock);
+}
+
+/*
  * Pressure flag: try to collapse.
  * Technical note: it is used by multiple contexts non atomically.
  * All the __sk_mem_schedule() is of this nature: accounting
@@ -589,6 +639,76 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 }
 EXPORT_SYMBOL(tcp_ioctl);
 
+/*
+ * Friend receive_queue tail skb space? If true, set tail_inuse.
+ * Else if RCV_SHUTDOWN, return *copy = -ECONNRESET.
+ */
+static inline struct sk_buff *tcp_friend_tail(struct sock *friend, int *copy)
+{
+	struct sk_buff	*skb = NULL;
+	int		sz = 0;
+
+	if (skb_peek_tail(&friend->sk_receive_queue)) {
+		sz = tcp_friend_send_lock(friend);
+		if (!sz) {
+			skb = skb_peek_tail(&friend->sk_receive_queue);
+			if (skb && skb->friend) {
+				if (!*copy)
+					sz = skb_tailroom(skb);
+				else {
+					sz = *copy - skb->len;
+					if (sz < 0)
+						sz = 0;
+				}
+				if (sz > 0)
+					TCP_FRIEND_CB(TCP_SKB_CB(skb))->
+							tail_inuse = true;
+			}
+			tcp_friend_unlock(friend);
+		}
+	}
+
+	*copy = sz;
+	return skb;
+}
+
+static inline void tcp_friend_seq(struct sock *sk, int copy, int charge)
+{
+	struct sock	*friend = sk->sk_friend;
+	struct tcp_sock *tp = tcp_sk(friend);
+
+	if (charge) {
+		sk_mem_charge(friend, charge);
+		atomic_add(charge, &friend->sk_rmem_alloc);
+	}
+	tp->rcv_nxt += copy;
+	tp->rcv_wup += copy;
+	tcp_friend_unlock(friend);
+
+	tp = tcp_sk(sk);
+	tp->snd_nxt += copy;
+	tp->pushed_seq += copy;
+	tp->snd_una += copy;
+	tp->snd_up += copy;
+}
+
+static inline bool tcp_friend_push(struct sock *sk, struct sk_buff *skb)
+{
+	struct sock	*friend = sk->sk_friend;
+	int		wait = false;
+
+	skb_set_owner_r(skb, friend);
+	__skb_queue_tail(&friend->sk_receive_queue, skb);
+	if (!sk_rmem_schedule(friend, skb, skb->truesize))
+		wait = true;
+
+	tcp_friend_seq(sk, skb->len, 0);
+	if (skb == skb_peek(&friend->sk_receive_queue))
+		friend->sk_data_ready(friend, 0);
+
+	return wait;
+}
+
 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
 {
 	TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
@@ -605,8 +725,13 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 
-	skb->csum    = 0;
 	tcb->seq     = tcb->end_seq = tp->write_seq;
+	if (sk->sk_friend) {
+		skb->friend = sk;
+		TCP_FRIEND_CB(tcb)->tail_inuse = false;
+		return;
+	}
+	skb->csum    = 0;
 	tcb->tcp_flags = TCPHDR_ACK;
 	tcb->sacked  = 0;
 	skb_header_release(skb);
@@ -626,7 +751,10 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
 static inline void tcp_push(struct sock *sk, int flags, int mss_now,
 			    int nonagle)
 {
-	if (tcp_send_head(sk)) {
+	if (sk->sk_friend) {
+		if (skb_peek(&sk->sk_friend->sk_receive_queue))
+			sk->sk_friend->sk_data_ready(sk->sk_friend, 0);
+	} else if (tcp_send_head(sk)) {
 		struct tcp_sock *tp = tcp_sk(sk);
 
 		if (!(flags & MSG_MORE) || forced_push(tp))
@@ -758,6 +886,21 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
 }
 EXPORT_SYMBOL(tcp_splice_read);
 
+static inline struct sk_buff *tcp_friend_alloc_skb(struct sock *sk, int size)
+{
+	struct sk_buff *skb;
+
+	skb = alloc_skb(size, sk->sk_allocation);
+	if (skb)
+		skb->avail_size = skb_tailroom(skb);
+	else {
+		sk->sk_prot->enter_memory_pressure(sk);
+		sk_stream_moderate_sndbuf(sk);
+	}
+
+	return skb;
+}
+
 struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
 {
 	struct sk_buff *skb;
@@ -821,12 +964,53 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
 	return max(xmit_size_goal, mss_now);
 }
 
+static unsigned int tcp_friend_xmit_size_goal(struct sock *sk, int size_goal)
+{
+	u32 size = SKB_DATA_ALIGN(size_goal);
+	u32 overhead = sizeof(struct skb_shared_info) + sizeof(struct sk_buff);
+
+	/*
+	 * If alloc >= largest skb use largest order, else check
+	 * for optimal tail fill size, else use largest order.
+	 */
+	if (size >= SKB_MAX_ORDER(0, 4))
+		size = SKB_MAX_ORDER(0, 4);
+	else if (size <= (SKB_MAX_ORDER(0, 0) >> 3))
+		size = SKB_MAX_ORDER(0, 0);
+	else if (size <= (SKB_MAX_ORDER(0, 1) >> 3))
+		size = SKB_MAX_ORDER(0, 1);
+	else if (size <= (SKB_MAX_ORDER(0, 0) >> 1))
+		size = SKB_MAX_ORDER(0, 0);
+	else if (size <= (SKB_MAX_ORDER(0, 1) >> 1))
+		size = SKB_MAX_ORDER(0, 1);
+	else if (size <= (SKB_MAX_ORDER(0, 2) >> 1))
+		size = SKB_MAX_ORDER(0, 2);
+	else if (size <= (SKB_MAX_ORDER(0, 3) >> 1))
+		size = SKB_MAX_ORDER(0, 3);
+	else
+		size = SKB_MAX_ORDER(0, 4);
+
+	/* At least 2 true sized in sk_buf */
+	if (size + overhead > (sk_sndbuf_get(sk) >> 1))
+		size = (sk_sndbuf_get(sk) >> 1) - overhead;
+
+	return size;
+}
+
 static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
 {
 	int mss_now;
+	int tmp;
+
+	if (sk->sk_friend) {
+		mss_now = tcp_friend_xmit_size_goal(sk, *size_goal);
+		tmp = mss_now;
+	} else {
+		mss_now = tcp_current_mss(sk);
+		tmp = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
+	}
 
-	mss_now = tcp_current_mss(sk);
-	*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
+	*size_goal = tmp;
 
 	return mss_now;
 }
@@ -834,8 +1018,9 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
 			 size_t psize, int flags)
 {
+	struct sock *friend = sk->sk_friend;
 	struct tcp_sock *tp = tcp_sk(sk);
-	int mss_now, size_goal;
+	int mss_now, size_goal = psize;
 	int err;
 	ssize_t copied;
 	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
@@ -850,6 +1035,10 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
 			goto out_err;
 	}
 
+	err = tcp_friend_validate(sk, &friend, &timeo);
+	if (err < 0)
+		goto out_err;
+
 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 
 	mss_now = tcp_send_mss(sk, &size_goal, flags);
@@ -860,25 +1049,47 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
 		goto out_err;
 
 	while (psize > 0) {
-		struct sk_buff *skb = tcp_write_queue_tail(sk);
+		struct sk_buff *skb;
+		struct tcp_skb_cb *tcb;
 		struct page *page = pages[poffset / PAGE_SIZE];
 		int copy, i;
 		int offset = poffset % PAGE_SIZE;
 		int size = min_t(size_t, psize, PAGE_SIZE - offset);
 		bool can_coalesce;
 
-		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
+		if (friend) {
+			copy = size_goal;
+			skb = tcp_friend_tail(friend, &copy);
+			if (copy < 0) {
+				sk->sk_err = -copy;
+				err = -EPIPE;
+				goto out_err;
+			}
+		} else if (!tcp_send_head(sk)) {
+			skb = NULL;
+			copy = 0;
+		} else {
+			skb = tcp_write_queue_tail(sk);
+			copy = size_goal - skb->len;
+		}
+
+		if (copy <= 0) {
 new_segment:
 			if (!sk_stream_memory_free(sk))
 				goto wait_for_sndbuf;
 
-			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
+			if (friend)
+				skb = tcp_friend_alloc_skb(sk, 0);
+			else
+				skb = sk_stream_alloc_skb(sk, 0,
+							  sk->sk_allocation);
 			if (!skb)
 				goto wait_for_memory;
 
 			skb_entail(sk, skb);
 			copy = size_goal;
 		}
+		tcb = TCP_SKB_CB(skb);
 
 		if (copy > size)
 			copy = size;
@@ -886,10 +1097,14 @@ new_segment:
 		i = skb_shinfo(skb)->nr_frags;
 		can_coalesce = skb_can_coalesce(skb, i, page, offset);
 		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
-			tcp_mark_push(tp, skb);
+			if (friend) {
+				if (TCP_FRIEND_CB(tcb)->tail_inuse)
+					TCP_FRIEND_CB(tcb)->tail_inuse = false;
+			} else
+				tcp_mark_push(tp, skb);
 			goto new_segment;
 		}
-		if (!sk_wmem_schedule(sk, copy))
+		if (!friend && !sk_wmem_schedule(sk, copy))
 			goto wait_for_memory;
 
 		if (can_coalesce) {
@@ -902,19 +1117,41 @@ new_segment:
 		skb->len += copy;
 		skb->data_len += copy;
 		skb->truesize += copy;
-		sk->sk_wmem_queued += copy;
-		sk_mem_charge(sk, copy);
-		skb->ip_summed = CHECKSUM_PARTIAL;
 		tp->write_seq += copy;
-		TCP_SKB_CB(skb)->end_seq += copy;
-		skb_shinfo(skb)->gso_segs = 0;
-
-		if (!copied)
-			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
 
 		copied += copy;
 		poffset += copy;
-		if (!(psize -= copy))
+		psize -= copy;
+
+		if (friend) {
+			err = tcp_friend_send_lock(friend);
+			if (err) {
+				sk->sk_err = -err;
+				err = -EPIPE;
+				goto out_err;
+			}
+			tcb->end_seq += copy;
+			if (TCP_FRIEND_CB(tcb)->tail_inuse) {
+				TCP_FRIEND_CB(tcb)->tail_inuse = false;
+				tcp_friend_seq(sk, copy, copy);
+			} else {
+				if (tcp_friend_push(sk, skb))
+					goto wait_for_sndbuf;
+			}
+			if (!psize)
+				goto out;
+			continue;
+		}
+
+		tcb->end_seq += copy;
+		skb_shinfo(skb)->gso_segs = 0;
+		sk->sk_wmem_queued += copy;
+		sk_mem_charge(sk, copy);
+		skb->ip_summed = CHECKSUM_PARTIAL;
+		if (copied == copy)
+			tcb->tcp_flags &= ~TCPHDR_PSH;
+
+		if (!psize)
 			goto out;
 
 		if (skb->len < size_goal || (flags & MSG_OOB))
@@ -935,7 +1172,8 @@ wait_for_memory:
 		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
 			goto do_error;
 
-		mss_now = tcp_send_mss(sk, &size_goal, flags);
+		if (!friend)
+			mss_now = tcp_send_mss(sk, &size_goal, flags);
 	}
 
 out:
@@ -1026,10 +1264,12 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		size_t size)
 {
 	struct iovec *iov;
+	struct sock *friend = sk->sk_friend;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
+	struct tcp_skb_cb *tcb;
 	int iovlen, flags, err, copied = 0;
-	int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
+	int mss_now = 0, size_goal = size, copied_syn = 0, offset = 0;
 	bool sg;
 	long timeo;
 
@@ -1057,6 +1297,10 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			goto do_error;
 	}
 
+	err = tcp_friend_validate(sk, &friend, &timeo);
+	if (err < 0)
+		goto out;
+
 	if (unlikely(tp->repair)) {
 		if (tp->repair_queue == TCP_RECV_QUEUE) {
 			copied = tcp_send_rcvq(sk, msg, size);
@@ -1105,24 +1349,38 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			int copy = 0;
 			int max = size_goal;
 
-			skb = tcp_write_queue_tail(sk);
-			if (tcp_send_head(sk)) {
-				if (skb->ip_summed == CHECKSUM_NONE)
-					max = mss_now;
-				copy = max - skb->len;
+			if (friend) {
+				skb = tcp_friend_tail(friend, &copy);
+				if (copy < 0) {
+					sk->sk_err = -copy;
+					err = -EPIPE;
+					goto out_err;
+				}
+			} else {
+				skb = tcp_write_queue_tail(sk);
+				if (tcp_send_head(sk)) {
+					if (skb->ip_summed == CHECKSUM_NONE)
+						max = mss_now;
+					copy = max - skb->len;
+				}
 			}
 
 			if (copy <= 0) {
 new_segment:
-				/* Allocate new segment. If the interface is SG,
-				 * allocate skb fitting to single page.
-				 */
 				if (!sk_stream_memory_free(sk))
 					goto wait_for_sndbuf;
 
-				skb = sk_stream_alloc_skb(sk,
-							  select_size(sk, sg),
-							  sk->sk_allocation);
+				if (friend)
+					skb = tcp_friend_alloc_skb(sk, max);
+				else {
+					/* Allocate new segment. If the
+					 * interface is SG, allocate skb
+					 * fitting to single page.
+					 */
+					skb = sk_stream_alloc_skb(sk,
+							select_size(sk, sg),
+							sk->sk_allocation);
+				}
 				if (!skb)
 					goto wait_for_memory;
 
@@ -1136,6 +1394,7 @@ new_segment:
 				copy = size_goal;
 				max = size_goal;
 			}
+			tcb = TCP_SKB_CB(skb);
 
 			/* Try to append data to the end of skb. */
 			if (copy > seglen)
@@ -1153,6 +1412,8 @@ new_segment:
 				int i = skb_shinfo(skb)->nr_frags;
 				struct page_frag *pfrag = sk_page_frag(sk);
 
+				BUG_ON(friend);
+
 				if (!sk_page_frag_refill(sk, pfrag))
 					goto wait_for_memory;
 
@@ -1188,16 +1449,37 @@ new_segment:
 				pfrag->offset += copy;
 			}
 
-			if (!copied)
-				TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
-
 			tp->write_seq += copy;
-			TCP_SKB_CB(skb)->end_seq += copy;
-			skb_shinfo(skb)->gso_segs = 0;
 
 			from += copy;
 			copied += copy;
-			if ((seglen -= copy) == 0 && iovlen == 0)
+			seglen -= copy;
+
+			if (friend) {
+				err = tcp_friend_send_lock(friend);
+				if (err) {
+					sk->sk_err = -err;
+					err = -EPIPE;
+					goto out_err;
+				}
+				tcb->end_seq += copy;
+				if (TCP_FRIEND_CB(tcb)->tail_inuse) {
+					TCP_FRIEND_CB(tcb)->tail_inuse = false;
+					tcp_friend_seq(sk, copy, 0);
+				} else {
+					if (tcp_friend_push(sk, skb))
+						goto wait_for_sndbuf;
+				}
+				continue;
+			}
+
+			tcb->end_seq += copy;
+			skb_shinfo(skb)->gso_segs = 0;
+
+			if (copied == copy)
+				tcb->tcp_flags &= ~TCPHDR_PSH;
+
+			if (seglen == 0 && iovlen == 0)
 				goto out;
 
 			if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
@@ -1219,7 +1501,8 @@ wait_for_memory:
 			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
 				goto do_error;
 
-			mss_now = tcp_send_mss(sk, &size_goal, flags);
+			if (!friend)
+				mss_now = tcp_send_mss(sk, &size_goal, flags);
 		}
 	}
 
@@ -1230,7 +1513,12 @@ out:
 	return copied + copied_syn;
 
 do_fault:
-	if (!skb->len) {
+	if (skb->friend) {
+		if (TCP_FRIEND_CB(tcb)->tail_inuse)
+			TCP_FRIEND_CB(tcb)->tail_inuse = false;
+		else
+			__kfree_skb(skb);
+	} else if (!skb->len) {
 		tcp_unlink_write_queue(skb, sk);
 		/* It is the one place in all of TCP, except connection
 		 * reset, where we can be unlinking the send_head.
@@ -1249,6 +1537,13 @@ out_err:
 }
 EXPORT_SYMBOL(tcp_sendmsg);
 
+static inline void tcp_friend_write_space(struct sock *sk)
+{
+	/* Queued data below 1/4th of sndbuf? */
+	if ((sk_sndbuf_get(sk) >> 2) > sk_wmem_queued_get(sk))
+		sk->sk_friend->sk_write_space(sk->sk_friend);
+}
+
 /*
  *	Handle reading urgent data. BSD has very simple semantics for
  *	this, no blocking and very strange errors 8)
@@ -1327,7 +1622,12 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
 	struct tcp_sock *tp = tcp_sk(sk);
 	bool time_to_ack = false;
 
-	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+	struct sk_buff *skb;
+
+	if (sk->sk_friend)
+		return;
+
+	skb = skb_peek(&sk->sk_receive_queue);
 
 	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
 	     "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
@@ -1431,17 +1731,27 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)
 }
 #endif
 
-static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
+static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off,
+					   size_t *len)
 {
 	struct sk_buff *skb;
 	u32 offset;
+	size_t avail;
 
 	skb_queue_walk(&sk->sk_receive_queue, skb) {
-		offset = seq - TCP_SKB_CB(skb)->seq;
-		if (tcp_hdr(skb)->syn)
-			offset--;
-		if (offset < skb->len || tcp_hdr(skb)->fin) {
+		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+		offset = seq - tcb->seq;
+		if (skb->friend)
+			avail = (u32)(tcb->end_seq - seq);
+		else {
+			if (tcp_hdr(skb)->syn)
+				offset--;
+			avail = skb->len - offset;
+		}
+		if (avail > 0 || (!skb->friend && tcp_hdr(skb)->fin)) {
 			*off = offset;
+			*len = avail;
 			return skb;
 		}
 	}
@@ -1467,15 +1777,23 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 	u32 seq = tp->copied_seq;
 	u32 offset;
 	int copied = 0;
+	size_t len;
+	int err;
+	struct sock *friend = sk->sk_friend;
+	long timeo = sock_rcvtimeo(sk, false);
 
 	if (sk->sk_state == TCP_LISTEN)
 		return -ENOTCONN;
-	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
-		if (offset < skb->len) {
-			int used;
-			size_t len;
+	err = tcp_friend_validate(sk, &friend, &timeo);
+	if (err < 0)
+		return err;
+	if (friend)
+		tcp_friend_recv_lock(sk);
 
-			len = skb->len - offset;
+	while ((skb = tcp_recv_skb(sk, seq, &offset, &len)) != NULL) {
+		if (len > 0) {
+			int used;
+	again:
 			/* Stop reading if we hit a patch of urgent data */
 			if (tp->urg_data) {
 				u32 urg_offset = tp->urg_seq - seq;
@@ -1484,6 +1802,10 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 				if (!len)
 					break;
 			}
+
+			if (friend)
+				tcp_friend_unlock(sk);
+
 			used = recv_actor(desc, skb, offset, len);
 			if (used < 0) {
 				if (!copied)
@@ -1494,33 +1816,65 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 				copied += used;
 				offset += used;
 			}
-			/*
-			 * If recv_actor drops the lock (e.g. TCP splice
-			 * receive) the skb pointer might be invalid when
-			 * getting here: tcp_collapse might have deleted it
-			 * while aggregating skbs from the socket queue.
-			 */
-			skb = tcp_recv_skb(sk, seq-1, &offset);
-			if (!skb || (offset+1 != skb->len))
-				break;
+
+			if (friend)
+				tcp_friend_recv_lock(sk);
+			if (skb->friend) {
+				len = (u32)(TCP_SKB_CB(skb)->end_seq - seq);
+				if (len > 0) {
+					/*
+					 * Friend did an skb_put() while we
+					 * were away so process the same skb.
+					 */
+					if (!desc->count)
+						break;
+					tp->copied_seq = seq;
+					goto again;
+				}
+			} else {
+				/*
+				 * If recv_actor drops the lock (e.g. TCP
+				 * splice receive) the skb pointer might be
+				 * invalid when getting here: tcp_collapse
+				 * might have deleted it while aggregating
+				 * skbs from the socket queue.
+				 */
+				skb = tcp_recv_skb(sk, seq-1, &offset, &len);
+				if (!skb || (offset+1 != skb->len))
+					break;
+			}
 		}
-		if (tcp_hdr(skb)->fin) {
+		if (!skb->friend && tcp_hdr(skb)->fin) {
 			sk_eat_skb(sk, skb, false);
 			++seq;
 			break;
 		}
-		sk_eat_skb(sk, skb, false);
+		if (skb->friend) {
+			if (!TCP_FRIEND_CB(TCP_SKB_CB(skb))->tail_inuse) {
+				__skb_unlink(skb, &sk->sk_receive_queue);
+				__kfree_skb(skb);
+				tcp_friend_write_space(sk);
+			}
+			tcp_friend_unlock(sk);
+			tcp_friend_recv_lock(sk);
+		} else
+			sk_eat_skb(sk, skb, 0);
 		if (!desc->count)
 			break;
 		tp->copied_seq = seq;
 	}
 	tp->copied_seq = seq;
 
-	tcp_rcv_space_adjust(sk);
+	if (friend) {
+		tcp_friend_unlock(sk);
+		tcp_friend_write_space(sk);
+	} else {
+		tcp_rcv_space_adjust(sk);
 
-	/* Clean up data we have read: This will do ACK frames. */
-	if (copied > 0)
-		tcp_cleanup_rbuf(sk, copied);
+		/* Clean up data we have read: This will do ACK frames. */
+		if (copied > 0)
+			tcp_cleanup_rbuf(sk, copied);
+	}
 	return copied;
 }
 EXPORT_SYMBOL(tcp_read_sock);
@@ -1536,6 +1890,7 @@ EXPORT_SYMBOL(tcp_read_sock);
 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		size_t len, int nonblock, int flags, int *addr_len)
 {
+	struct sock *friend = sk->sk_friend;
 	struct tcp_sock *tp = tcp_sk(sk);
 	int copied = 0;
 	u32 peek_seq;
@@ -1548,6 +1903,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	bool copied_early = false;
 	struct sk_buff *skb;
 	u32 urg_hole = 0;
+	bool locked = false;
 
 	lock_sock(sk);
 
@@ -1557,6 +1913,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 	timeo = sock_rcvtimeo(sk, nonblock);
 
+	err = tcp_friend_validate(sk, &friend, &timeo);
+	if (err < 0)
+		goto out;
+
 	/* Urgent data needs to be handled specially. */
 	if (flags & MSG_OOB)
 		goto recv_urg;
@@ -1595,7 +1955,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			available = TCP_SKB_CB(skb)->seq + skb->len - (*seq);
 		if ((available < target) &&
 		    (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
-		    !sysctl_tcp_low_latency &&
+		    !sysctl_tcp_low_latency && !friend &&
 		    net_dma_find_channel()) {
 			preempt_enable_no_resched();
 			tp->ucopy.pinned_list =
@@ -1606,7 +1966,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	}
 #endif
 
+	err = 0;
+
 	do {
+		struct tcp_skb_cb *tcb;
 		u32 offset;
 
 		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
@@ -1614,37 +1977,77 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			if (copied)
 				break;
 			if (signal_pending(current)) {
-				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
+				err = timeo ? sock_intr_errno(timeo) : -EAGAIN;
 				break;
 			}
 		}
 
-		/* Next get a buffer. */
+		/*
+		 * Next get a buffer. Note, for friends sendmsg() queues
+		 * data directly to our sk_receive_queue by holding our
+		 * slock and either tail queuing a new skb or adding new
+		 * data to the tail skb. In the later case tail_inuse is
+		 * set, slock dropped, copyin, skb->len updated, re-hold
+		 * slock, end_seq updated, so we can only use the bytes
+		 * from *seq to end_seq!
+		 */
+		if (friend && !locked) {
+			tcp_friend_recv_lock(sk);
+			locked = true;
+		}
 
 		skb_queue_walk(&sk->sk_receive_queue, skb) {
+			tcb = TCP_SKB_CB(skb);
+			offset = *seq - tcb->seq;
+			if (friend) {
+				if (skb->friend) {
+					used = (u32)(tcb->end_seq - *seq);
+					if (used > 0) {
+						tcp_friend_unlock(sk);
+						locked = false;
+						/* Can use it all */
+						goto found_ok_skb;
+					}
+					/* No data to copyout */
+					if (flags & MSG_PEEK)
+						continue;
+					if (!TCP_FRIEND_CB(tcb)->tail_inuse)
+						goto unlink;
+					break;
+				}
+				tcp_friend_unlock(sk);
+				locked = false;
+			}
+
 			/* Now that we have two receive queues this
 			 * shouldn't happen.
 			 */
-			if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
+			if (WARN(before(*seq, tcb->seq),
 				 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
-				 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
-				 flags))
+				 *seq, tcb->seq, tp->rcv_nxt, flags))
 				break;
 
-			offset = *seq - TCP_SKB_CB(skb)->seq;
 			if (tcp_hdr(skb)->syn)
 				offset--;
-			if (offset < skb->len)
+			if (offset < skb->len) {
+				/* Ok so how much can we use? */
+				used = skb->len - offset;
 				goto found_ok_skb;
+			}
 			if (tcp_hdr(skb)->fin)
 				goto found_fin_ok;
 			WARN(!(flags & MSG_PEEK),
 			     "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
-			     *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
+			     *seq, tcb->seq, tp->rcv_nxt, flags);
 		}
 
 		/* Well, if we have backlog, try to process it now yet. */
 
+		if (friend && locked) {
+			tcp_friend_unlock(sk);
+			locked = false;
+		}
+
 		if (copied >= target && !sk->sk_backlog.tail)
 			break;
 
@@ -1691,7 +2094,8 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 		tcp_cleanup_rbuf(sk, copied);
 
-		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
+		if (!sysctl_tcp_low_latency && !friend &&
+		    tp->ucopy.task == user_recv) {
 			/* Install new reader */
 			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
 				user_recv = current;
@@ -1791,8 +2195,6 @@ do_prequeue:
 		continue;
 
 	found_ok_skb:
-		/* Ok so how much can we use? */
-		used = skb->len - offset;
 		if (len < used)
 			used = len;
 
@@ -1849,7 +2251,7 @@ do_prequeue:
 				if (err) {
 					/* Exception. Bailout! */
 					if (!copied)
-						copied = -EFAULT;
+						copied = err;
 					break;
 				}
 			}
@@ -1858,6 +2260,7 @@ do_prequeue:
 		*seq += used;
 		copied += used;
 		len -= used;
+		offset += used;
 
 		tcp_rcv_space_adjust(sk);
 
@@ -1866,10 +2269,43 @@ skip_copy:
 			tp->urg_data = 0;
 			tcp_fast_path_check(sk);
 		}
-		if (used + offset < skb->len)
+
+		if (skb->friend) {
+			tcp_friend_recv_lock(sk);
+			locked = true;
+			used = (u32)(tcb->end_seq - *seq);
+			if (used) {
+				/*
+				 * Friend did an skb_put() while we were away
+				 * so if more to do process the same skb.
+				 */
+				if (len > 0) {
+					tcp_friend_unlock(sk);
+					locked = false;
+					goto found_ok_skb;
+				}
+				continue;
+			}
+			if (TCP_FRIEND_CB(tcb)->tail_inuse) {
+				/* Give sendmsg a chance */
+				tcp_friend_unlock(sk);
+				locked = false;
+				continue;
+			}
+			if (!(flags & MSG_PEEK)) {
+		unlink:
+				__skb_unlink(skb, &sk->sk_receive_queue);
+				__kfree_skb(skb);
+				tcp_friend_unlock(sk);
+				locked = false;
+				tcp_friend_write_space(sk);
+			}
 			continue;
+		}
 
-		if (tcp_hdr(skb)->fin)
+		if (offset < skb->len)
+			continue;
+		else if (tcp_hdr(skb)->fin)
 			goto found_fin_ok;
 		if (!(flags & MSG_PEEK)) {
 			sk_eat_skb(sk, skb, copied_early);
@@ -1887,6 +2323,9 @@ skip_copy:
 		break;
 	} while (len > 0);
 
+	if (friend && locked)
+		tcp_friend_unlock(sk);
+
 	if (user_recv) {
 		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
 			int chunk;
@@ -2065,6 +2504,9 @@ void tcp_close(struct sock *sk, long timeout)
 		goto adjudge_to_death;
 	}
 
+	if (sk->sk_friend)
+		sock_put(sk->sk_friend);
+
 	/*  We need to flush the recv. buffs.  We do this only on the
 	 *  descriptor close, not protocol-sourced closes, because the
 	 *  reader process may not have drained the data yet!
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fc67831..9640a81 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -530,6 +530,9 @@ void tcp_rcv_space_adjust(struct sock *sk)
 	int time;
 	int space;
 
+	if (sk->sk_friend)
+		return;
+
 	if (tp->rcvq_space.time == 0)
 		goto new_measure;
 
@@ -4350,8 +4353,9 @@ static int tcp_prune_queue(struct sock *sk);
 static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
 				 unsigned int size)
 {
-	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
-	    !sk_rmem_schedule(sk, skb, size)) {
+	if (!sk->sk_friend &&
+	    (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+	    !sk_rmem_schedule(sk, skb, size))) {
 
 		if (tcp_prune_queue(sk) < 0)
 			return -1;
@@ -5722,6 +5726,16 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		 *    state to ESTABLISHED..."
 		 */
 
+		if (skb->friend) {
+			/*
+			 * If friends haven't been made yet, our sk_friend
+			 * still == NULL, then update with the ACK's friend
+			 * value (the listen()er's sock addr) which is used
+			 * as a place holder.
+			 */
+			cmpxchg(&sk->sk_friend, NULL, skb->friend);
+		}
+
 		TCP_ECN_rcv_synack(tp, th);
 
 		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
@@ -5797,9 +5811,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		    tcp_rcv_fastopen_synack(sk, skb, &foc))
 			return -1;
 
-		if (sk->sk_write_pending ||
+		if (!skb->friend && (sk->sk_write_pending ||
 		    icsk->icsk_accept_queue.rskq_defer_accept ||
-		    icsk->icsk_ack.pingpong) {
+		    icsk->icsk_ack.pingpong)) {
 			/* Save one ACK. Data will be ready after
 			 * several ticks, if write_pending is set.
 			 *
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1ed2307..f494914 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1512,6 +1512,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
 #endif
 
+	req->friend = skb->friend;
+
 	tcp_clear_options(&tmp_opt);
 	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
 	tmp_opt.user_mss  = tp->rx_opt.user_mss;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f35f2df..36d832a 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -270,6 +270,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 	const struct tcp_sock *tp = tcp_sk(sk);
 	bool recycle_ok = false;
 
+	if (sk->sk_friend)
+		goto out;
+
 	if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
 		recycle_ok = tcp_remember_stamp(sk);
 
@@ -349,6 +352,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 	}
 
 	tcp_update_metrics(sk);
+out:
 	tcp_done(sk);
 }
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8ac0855..509c5e3 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -65,6 +65,9 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
 /* By default, RFC2861 behavior.  */
 int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 
+/* By default, TCP loopback bypass */
+int sysctl_tcp_friends __read_mostly = 1;
+
 int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
 EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
 
@@ -1025,9 +1028,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	tcb = TCP_SKB_CB(skb);
 	memset(&opts, 0, sizeof(opts));
 
-	if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
+	if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
+		/* Only try to make friends if enabled */
+		if (sysctl_tcp_friends)
+			skb->friend = sk;
+
 		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
-	else
+	} else
 		tcp_options_size = tcp_established_options(sk, skb, &opts,
 							   &md5);
 	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
@@ -2725,6 +2732,11 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	}
 
 	memset(&opts, 0, sizeof(opts));
+
+	/* Only try to make friends if enabled */
+	if (sysctl_tcp_friends)
+		skb->friend = sk;
+
 #ifdef CONFIG_SYN_COOKIES
 	if (unlikely(req->cookie_ts))
 		TCP_SKB_CB(skb)->when = cookie_init_timestamp(req);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6565cf5..828d5f7 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -969,6 +969,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops;
 #endif
 
+	req->friend = skb->friend;
 	tcp_clear_options(&tmp_opt);
 	tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
 	tmp_opt.user_mss = tp->rx_opt.user_mss;
-- 
1.7.4.4

^ permalink raw reply related

* [PATCH 2/3] fix panic in tcp_close()
From: Weiping Pan @ 2012-12-05  2:54 UTC (permalink / raw)
  To: netdev; +Cc: brutus, Weiping Pan
In-Reply-To: <cover.1354674151.git.wpan@redhat.com>

For tcp friends data skb, it has no tcp header,
and its transport_header is NULL, so it will panic if we deference tcp_hdr(skb)
in tcp_close().

So I add a check before we use tcp_hdr().

Signed-off-by: Weiping Pan <wpan@redhat.com>
---
 net/ipv4/tcp.c |    6 +++++-
 1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4327deb..e9d82e0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2512,8 +2512,12 @@ void tcp_close(struct sock *sk, long timeout)
 	 *  reader process may not have drained the data yet!
 	 */
 	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
-		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
+		u32 len;
+		if (tcp_hdr(skb))
+			len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
 			  tcp_hdr(skb)->fin;
+		else
+			len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
 		data_was_unread += len;
 		__kfree_skb(skb);
 	}
-- 
1.7.4.4

^ permalink raw reply related

* [PATCH 3/3] delete request_sock->friend
From: Weiping Pan @ 2012-12-05  2:54 UTC (permalink / raw)
  To: netdev; +Cc: brutus, Weiping Pan
In-Reply-To: <cover.1354674151.git.wpan@redhat.com>

The sock pointed by request_sock->friend may be freed since it does not have a
lock to protect it.
I just delete request_sock->friend since I think it is useless.

For sk_buff->friend, it has the same problem, and we use
"atomic_add(skb->truesize, &sk->sk_wmem_alloc)" to guarantee that the sock can
not be freed before the skb is freed.

Then for 3-way handshake with tcp friends enabled,
SYN->friend is NULL, SYN/ACK->friend is set in tcp_make_synack(),
and ACK->friend is set in tcp_send_ack().

Signed-off-by: Weiping Pan <wpan@redhat.com>
---
 include/net/inet_connection_sock.h |    4 ++
 include/net/request_sock.h         |    1 -
 net/ipv4/inet_connection_sock.c    |   58 +++++++++++++++++++++++------------
 net/ipv4/tcp_input.c               |   10 ------
 net/ipv4/tcp_ipv4.c                |    7 +++-
 net/ipv4/tcp_minisocks.c           |    7 ++++-
 net/ipv4/tcp_output.c              |   21 ++++++++-----
 net/ipv6/tcp_ipv6.c                |    1 -
 8 files changed, 66 insertions(+), 43 deletions(-)

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index ba1d361..883e029 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -147,6 +147,10 @@ static inline void *inet_csk_ca(const struct sock *sk)
 extern struct sock *inet_csk_clone_lock(const struct sock *sk,
 					const struct request_sock *req,
 					const gfp_t priority);
+extern struct sock *inet_csk_friend_clone_lock(const struct sock *sk,
+					const struct request_sock *req,
+					const struct sk_buff *skb,
+					const gfp_t priority);
 
 enum inet_csk_ack_state_t {
 	ICSK_ACK_SCHED	= 1,
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index c6dfa26..a51dbd1 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -66,7 +66,6 @@ struct request_sock {
 	unsigned long			expires;
 	const struct request_sock_ops	*rsk_ops;
 	struct sock			*sk;
-	struct sock			*friend;
 	u32				secid;
 	u32				peer_secid;
 };
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index ce4b79b..7af92ed 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -659,26 +659,6 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
 	if (newsk != NULL) {
 		struct inet_connection_sock *newicsk = inet_csk(newsk);
 
-		if (req->friend) {
-			/*
-			 * Make friends with the requestor but the ACK of
-			 * the request is already in-flight so the race is
-			 * on to make friends before the ACK is processed.
-			 * If the requestor's sk_friend value is != NULL
-			 * then the requestor has already processed the
-			 * ACK so indicate state change to wake'm up.
-			 */
-			struct sock *was;
-
-			sock_hold(req->friend);
-			newsk->sk_friend = req->friend;
-			sock_hold(newsk);
-			was = xchg(&req->friend->sk_friend, newsk);
-			/* If requester already connect()ed, maybe sleeping */
-			if (was && !sock_flag(req->friend, SOCK_DEAD))
-				sk->sk_state_change(req->friend);
-		}
-
 		newsk->sk_state = TCP_SYN_RECV;
 		newicsk->icsk_bind_hash = NULL;
 
@@ -700,6 +680,44 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
 }
 EXPORT_SYMBOL_GPL(inet_csk_clone_lock);
 
+/**
+ *	inet_csk_friend_clone_lock - clone an inet socket, and lock its clone
+ *	@sk: the socket to clone
+ *	@req: request_sock
+ *	@skb: who sends the request
+ *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ *
+ *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
+ */
+struct sock *inet_csk_friend_clone_lock(const struct sock *sk,
+				 const struct request_sock *req,
+				 const struct sk_buff *skb,
+				 const gfp_t priority)
+{
+	struct sock *newsk = inet_csk_clone_lock(sk, req, priority);
+
+	if (newsk) {
+		struct sock *friend = skb->friend;
+		if (friend) {
+			/*
+			 * Make friends.
+			 */
+			struct sock *was;
+
+			sock_hold(friend);
+			newsk->sk_friend = friend;
+			sock_hold(newsk);
+			was = xchg(&friend->sk_friend, newsk);
+			/* If requester already connect()ed, maybe sleeping */
+			if (was && !sock_flag(friend, SOCK_DEAD))
+				sk->sk_state_change(friend);
+		}
+	}
+
+	return newsk;
+}
+EXPORT_SYMBOL_GPL(inet_csk_friend_clone_lock);
+
 /*
  * At this point, there should be no process reference to this
  * socket, and thus no user references at all.  Therefore we
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9640a81..39db09d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5726,16 +5726,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		 *    state to ESTABLISHED..."
 		 */
 
-		if (skb->friend) {
-			/*
-			 * If friends haven't been made yet, our sk_friend
-			 * still == NULL, then update with the ACK's friend
-			 * value (the listen()er's sock addr) which is used
-			 * as a place holder.
-			 */
-			cmpxchg(&sk->sk_friend, NULL, skb->friend);
-		}
-
 		TCP_ECN_rcv_synack(tp, th);
 
 		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f494914..8d61e4c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1512,8 +1512,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
 #endif
 
-	req->friend = skb->friend;
-
 	tcp_clear_options(&tmp_opt);
 	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
 	tmp_opt.user_mss  = tp->rx_opt.user_mss;
@@ -1873,6 +1871,11 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
 		goto csum_err;
 
+	if (sysctl_tcp_friends && skb->friend) {
+		skb->sk = skb->friend;
+		skb->destructor = sock_wfree;
+	}
+
 	if (sk->sk_state == TCP_LISTEN) {
 		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
 		if (!nsk)
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 36d832a..753126e 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -383,7 +383,12 @@ static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
  */
 struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
 {
-	struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
+	struct sock *newsk = NULL;
+
+	if (sysctl_tcp_friends && skb->friend)
+		newsk = inet_csk_friend_clone_lock(sk, req, skb, GFP_ATOMIC);
+	else
+		newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
 
 	if (newsk != NULL) {
 		const struct inet_request_sock *ireq = inet_rsk(req);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 509c5e3..4d71549 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1028,13 +1028,9 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	tcb = TCP_SKB_CB(skb);
 	memset(&opts, 0, sizeof(opts));
 
-	if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
-		/* Only try to make friends if enabled */
-		if (sysctl_tcp_friends)
-			skb->friend = sk;
-
+	if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
 		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
-	} else
+	else
 		tcp_options_size = tcp_established_options(sk, skb, &opts,
 							   &md5);
 	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
@@ -1050,7 +1046,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	skb_orphan(skb);
 	skb->sk = sk;
-	skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
+
+	if (skb->friend)
+		skb->destructor = NULL;
+	else
+		skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
 			  tcp_wfree : sock_wfree;
 	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
@@ -2734,8 +2734,10 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	memset(&opts, 0, sizeof(opts));
 
 	/* Only try to make friends if enabled */
-	if (sysctl_tcp_friends)
+	if (sysctl_tcp_friends) {
 		skb->friend = sk;
+		atomic_add(skb->truesize, &sk->sk_wmem_alloc);
+	}
 
 #ifdef CONFIG_SYN_COOKIES
 	if (unlikely(req->cookie_ts))
@@ -3120,6 +3122,9 @@ void tcp_send_ack(struct sock *sk)
 
 	/* Send it off, this clears delayed acks for us. */
 	TCP_SKB_CB(buff)->when = tcp_time_stamp;
+
+	if (sysctl_tcp_friends)
+		buff->friend = sk;
 	tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
 }
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 828d5f7..6565cf5 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -969,7 +969,6 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops;
 #endif
 
-	req->friend = skb->friend;
 	tcp_clear_options(&tmp_opt);
 	tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
 	tmp_opt.user_mss = tp->rx_opt.user_mss;
-- 
1.7.4.4

^ permalink raw reply related

* new portal
From: marknmel05 @ 2012-12-05  3:18 UTC (permalink / raw)
  To: ewokandiam

Dear Colleague,  http://elysianwhitetails.com/images/stories/nadomu.html

^ permalink raw reply

* Re: [Patch net-next] netlink: add missing netlink message types to selinux perm table
From: Cong Wang @ 2012-12-05  3:23 UTC (permalink / raw)
  To: netdev; +Cc: David S. Miller
In-Reply-To: <1354593208-21939-1-git-send-email-amwang@redhat.com>

On Tue, 2012-12-04 at 11:53 +0800, Cong Wang wrote:
> RTM_NEWNETCONF and RTM_GETNETCONF are missing in this table.

This patch has conflicts with "v3 bridge: export multicast database via
netlink", please drop this for now, I will resend it later.

Thanks.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox