Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 3/4 v3 net-next] tg3: Add APE scratchpad read function
From: Michael Chan @ 2012-07-17  2:24 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <1342491842-29818-3-git-send-email-mchan@broadcom.com>

From: Matt Carlson <mcarlson@broadcom.com>

for retreiving temperature sensor data.

Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
Signed-off-by: Nithin Nayak Sujir <nsujir@broadcom.com>
Signed-off-by: Michael Chan <mchan@broadcom.com>
---
 drivers/net/ethernet/broadcom/tg3.c |   79 +++++++++++++++++++++++++++++++++++
 drivers/net/ethernet/broadcom/tg3.h |    9 +++-
 2 files changed, 86 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 5dda3ce..26ca368 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -752,6 +752,85 @@ static int tg3_ape_event_lock(struct tg3 *tp, u32 timeout_us)
 	return timeout_us ? 0 : -EBUSY;
 }
 
+static int tg3_ape_wait_for_event(struct tg3 *tp, u32 timeout_us)
+{
+	u32 i, apedata;
+
+	for (i = 0; i < timeout_us / 10; i++) {
+		apedata = tg3_ape_read32(tp, TG3_APE_EVENT_STATUS);
+
+		if (!(apedata & APE_EVENT_STATUS_EVENT_PENDING))
+			break;
+
+		udelay(10);
+	}
+
+	return i == timeout_us / 10;
+}
+
+int tg3_ape_scratchpad_read(struct tg3 *tp, u32 *data, u32 base_off, u32 len)
+{
+	int err;
+	u32 i, bufoff, msgoff, maxlen, apedata;
+
+	if (!tg3_flag(tp, APE_HAS_NCSI))
+		return 0;
+
+	apedata = tg3_ape_read32(tp, TG3_APE_SEG_SIG);
+	if (apedata != APE_SEG_SIG_MAGIC)
+		return -ENODEV;
+
+	apedata = tg3_ape_read32(tp, TG3_APE_FW_STATUS);
+	if (!(apedata & APE_FW_STATUS_READY))
+		return -EAGAIN;
+
+	bufoff = tg3_ape_read32(tp, TG3_APE_SEG_MSG_BUF_OFF) +
+		 TG3_APE_SHMEM_BASE;
+	msgoff = bufoff + 2 * sizeof(u32);
+	maxlen = tg3_ape_read32(tp, TG3_APE_SEG_MSG_BUF_LEN);
+
+	while (len) {
+		u32 length;
+
+		/* Cap xfer sizes to scratchpad limits. */
+		length = (len > maxlen) ? maxlen : len;
+		len -= length;
+
+		apedata = tg3_ape_read32(tp, TG3_APE_FW_STATUS);
+		if (!(apedata & APE_FW_STATUS_READY))
+			return -EAGAIN;
+
+		/* Wait for up to 1 msec for APE to service previous event. */
+		err = tg3_ape_event_lock(tp, 1000);
+		if (err)
+			return err;
+
+		apedata = APE_EVENT_STATUS_DRIVER_EVNT |
+			  APE_EVENT_STATUS_SCRTCHPD_READ |
+			  APE_EVENT_STATUS_EVENT_PENDING;
+		tg3_ape_write32(tp, TG3_APE_EVENT_STATUS, apedata);
+
+		tg3_ape_write32(tp, bufoff, base_off);
+		tg3_ape_write32(tp, bufoff + sizeof(u32), length);
+
+		tg3_ape_unlock(tp, TG3_APE_LOCK_MEM);
+		tg3_ape_write32(tp, TG3_APE_EVENT, APE_EVENT_1);
+
+		base_off += length;
+
+		if (tg3_ape_wait_for_event(tp, 30000))
+			return -EAGAIN;
+
+		for (i = 0; length; i += 4, length -= 4) {
+			u32 val = tg3_ape_read32(tp, msgoff + i);
+			memcpy(data, &val, sizeof(u32));
+			data++;
+		}
+	}
+
+	return 0;
+}
+
 static int tg3_ape_send_event(struct tg3 *tp, u32 event)
 {
 	int err;
diff --git a/drivers/net/ethernet/broadcom/tg3.h b/drivers/net/ethernet/broadcom/tg3.h
index 93865f8..f8a0d9c 100644
--- a/drivers/net/ethernet/broadcom/tg3.h
+++ b/drivers/net/ethernet/broadcom/tg3.h
@@ -2311,10 +2311,11 @@
 #define  APE_LOCK_REQ_DRIVER		 0x00001000
 #define TG3_APE_LOCK_GRANT		0x004c
 #define  APE_LOCK_GRANT_DRIVER		 0x00001000
-#define TG3_APE_SEG_SIG			0x4000
-#define  APE_SEG_SIG_MAGIC		 0x41504521
 
 /* APE shared memory.  Accessible through BAR1 */
+#define TG3_APE_SHMEM_BASE		0x4000
+#define TG3_APE_SEG_SIG			0x4000
+#define  APE_SEG_SIG_MAGIC		 0x41504521
 #define TG3_APE_FW_STATUS		0x400c
 #define  APE_FW_STATUS_READY		 0x00000100
 #define TG3_APE_FW_FEATURES		0x4010
@@ -2327,6 +2328,8 @@
 #define  APE_FW_VERSION_REVMSK		 0x0000ff00
 #define  APE_FW_VERSION_REVSFT		 8
 #define  APE_FW_VERSION_BLDMSK		 0x000000ff
+#define TG3_APE_SEG_MSG_BUF_OFF		0x401c
+#define TG3_APE_SEG_MSG_BUF_LEN		0x4020
 #define TG3_APE_HOST_SEG_SIG		0x4200
 #define  APE_HOST_SEG_SIG_MAGIC		 0x484f5354
 #define TG3_APE_HOST_SEG_LEN		0x4204
@@ -2353,6 +2356,8 @@
 
 #define  APE_EVENT_STATUS_DRIVER_EVNT	 0x00000010
 #define  APE_EVENT_STATUS_STATE_CHNGE	 0x00000500
+#define  APE_EVENT_STATUS_SCRTCHPD_READ	 0x00001600
+#define  APE_EVENT_STATUS_SCRTCHPD_WRITE 0x00001700
 #define  APE_EVENT_STATUS_STATE_START	 0x00010000
 #define  APE_EVENT_STATUS_STATE_UNLOAD	 0x00020000
 #define  APE_EVENT_STATUS_STATE_WOL	 0x00030000
-- 
1.7.1

^ permalink raw reply related

* [PATCH 4/4 v3 net-next] tg3: Add hwmon support for temperature
From: Michael Chan @ 2012-07-17  2:24 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <1342491842-29818-4-git-send-email-mchan@broadcom.com>

Some tg3 devices have management firmware that can export sensor data.
Export temperature sensor reading via hwmon sysfs.

[hwmon interface suggested by Ben Hutchings <bhutchings@solarflare.com>]

Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
Signed-off-by: Nithin Nayak Sujir <nsujir@broadcom.com>
Signed-off-by: Michael Chan <mchan@broadcom.com>
---
 drivers/net/ethernet/broadcom/tg3.c |  112 +++++++++++++++++++++++++++++++++++
 drivers/net/ethernet/broadcom/tg3.h |   38 ++++++++++++
 2 files changed, 150 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 26ca368..fce4c1e 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -44,6 +44,10 @@
 #include <linux/prefetch.h>
 #include <linux/dma-mapping.h>
 #include <linux/firmware.h>
+#if IS_ENABLED(CONFIG_HWMON)
+#include <linux/hwmon.h>
+#include <linux/hwmon-sysfs.h>
+#endif
 
 #include <net/checksum.h>
 #include <net/ip.h>
@@ -9481,6 +9485,110 @@ static int tg3_init_hw(struct tg3 *tp, int reset_phy)
 	return tg3_reset_hw(tp, reset_phy);
 }
 
+#if IS_ENABLED(CONFIG_HWMON)
+static void tg3_sd_scan_scratchpad(struct tg3 *tp, struct tg3_ocir *ocir)
+{
+	int i;
+
+	for (i = 0; i < TG3_SD_NUM_RECS; i++, ocir++) {
+		u32 off = i * TG3_OCIR_LEN, len = TG3_OCIR_LEN;
+
+		tg3_ape_scratchpad_read(tp, (u32 *) ocir, off, len);
+		off += len;
+
+		if (ocir->signature != TG3_OCIR_SIG_MAGIC ||
+		    !(ocir->version_flags & TG3_OCIR_FLAG_ACTIVE))
+			memset(ocir, 0, TG3_OCIR_LEN);
+	}
+}
+
+/* sysfs attributes for hwmon */
+static ssize_t tg3_show_temp(struct device *dev,
+			     struct device_attribute *devattr, char *buf)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	struct tg3 *tp = netdev_priv(netdev);
+	struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr);
+	u32 temperature;
+
+	spin_lock_bh(&tp->lock);
+	tg3_ape_scratchpad_read(tp, &temperature, attr->index,
+				sizeof(temperature));
+	spin_unlock_bh(&tp->lock);
+	return sprintf(buf, "%u\n", temperature);
+}
+
+
+static SENSOR_DEVICE_ATTR(temp1_input, S_IRUGO, tg3_show_temp, NULL,
+			  TG3_TEMP_SENSOR_OFFSET);
+static SENSOR_DEVICE_ATTR(temp1_crit, S_IRUGO, tg3_show_temp, NULL,
+			  TG3_TEMP_CAUTION_OFFSET);
+static SENSOR_DEVICE_ATTR(temp1_max, S_IRUGO, tg3_show_temp, NULL,
+			  TG3_TEMP_MAX_OFFSET);
+
+static struct attribute *tg3_attributes[] = {
+	&sensor_dev_attr_temp1_input.dev_attr.attr,
+	&sensor_dev_attr_temp1_crit.dev_attr.attr,
+	&sensor_dev_attr_temp1_max.dev_attr.attr,
+	NULL
+};
+
+static const struct attribute_group tg3_group = {
+	.attrs = tg3_attributes,
+};
+
+#endif
+
+static void tg3_hwmon_close(struct tg3 *tp)
+{
+#if IS_ENABLED(CONFIG_HWMON)
+	if (tp->hwmon_dev) {
+		hwmon_device_unregister(tp->hwmon_dev);
+		tp->hwmon_dev = NULL;
+		sysfs_remove_group(&tp->pdev->dev.kobj, &tg3_group);
+	}
+#endif
+}
+
+static void tg3_hwmon_open(struct tg3 *tp)
+{
+#if IS_ENABLED(CONFIG_HWMON)
+	int i, err;
+	u32 size = 0;
+	struct pci_dev *pdev = tp->pdev;
+	struct tg3_ocir ocirs[TG3_SD_NUM_RECS];
+
+	tg3_sd_scan_scratchpad(tp, ocirs);
+
+	for (i = 0; i < TG3_SD_NUM_RECS; i++) {
+		if (!ocirs[i].src_data_length)
+			continue;
+
+		size += ocirs[i].src_hdr_length;
+		size += ocirs[i].src_data_length;
+	}
+
+	if (!size)
+		return;
+
+	/* Register hwmon sysfs hooks */
+	err = sysfs_create_group(&pdev->dev.kobj, &tg3_group);
+	if (err) {
+		dev_err(&pdev->dev, "Cannot create sysfs group, aborting\n");
+		return;
+	}
+
+	tp->hwmon_dev = hwmon_device_register(&pdev->dev);
+	if (IS_ERR(tp->hwmon_dev)) {
+		tp->hwmon_dev = NULL;
+		dev_err(&pdev->dev, "Cannot register hwmon device, aborting\n");
+		sysfs_remove_group(&pdev->dev.kobj, &tg3_group);
+	}
+#endif
+}
+
+
 #define TG3_STAT_ADD32(PSTAT, REG) \
 do {	u32 __val = tr32(REG); \
 	(PSTAT)->low += __val; \
@@ -10189,6 +10297,8 @@ static int tg3_open(struct net_device *dev)
 
 	tg3_phy_start(tp);
 
+	tg3_hwmon_open(tp);
+
 	tg3_full_lock(tp, 0);
 
 	tg3_timer_start(tp);
@@ -10238,6 +10348,8 @@ static int tg3_close(struct net_device *dev)
 
 	tg3_timer_stop(tp);
 
+	tg3_hwmon_close(tp);
+
 	tg3_phy_stop(tp);
 
 	tg3_full_lock(tp, 1);
diff --git a/drivers/net/ethernet/broadcom/tg3.h b/drivers/net/ethernet/broadcom/tg3.h
index f8a0d9c..a1b75cd 100644
--- a/drivers/net/ethernet/broadcom/tg3.h
+++ b/drivers/net/ethernet/broadcom/tg3.h
@@ -2676,6 +2676,40 @@ struct tg3_hw_stats {
 	u8				__reserved4[0xb00-0x9c8];
 };
 
+#define TG3_SD_NUM_RECS			3
+#define TG3_OCIR_LEN			(sizeof(struct tg3_ocir))
+#define TG3_OCIR_SIG_MAGIC		0x5253434f
+#define TG3_OCIR_FLAG_ACTIVE		0x00000001
+
+#define TG3_TEMP_CAUTION_OFFSET		0xc8
+#define TG3_TEMP_MAX_OFFSET		0xcc
+#define TG3_TEMP_SENSOR_OFFSET		0xd4
+
+
+struct tg3_ocir {
+	u32				signature;
+	u16				version_flags;
+	u16				refresh_int;
+	u32				refresh_tmr;
+	u32				update_tmr;
+	u32				dst_base_addr;
+	u16				src_hdr_offset;
+	u16				src_hdr_length;
+	u16				src_data_offset;
+	u16				src_data_length;
+	u16				dst_hdr_offset;
+	u16				dst_data_offset;
+	u16				dst_reg_upd_offset;
+	u16				dst_sem_offset;
+	u32				reserved1[2];
+	u32				port0_flags;
+	u32				port1_flags;
+	u32				port2_flags;
+	u32				port3_flags;
+	u32				reserved2[1];
+};
+
+
 /* 'mapping' is superfluous as the chip does not write into
  * the tx/rx post rings so we could just fetch it from there.
  * But the cache behavior is better how we are doing it now.
@@ -3211,6 +3245,10 @@ struct tg3 {
 	const char			*fw_needed;
 	const struct firmware		*fw;
 	u32				fw_len; /* includes BSS */
+
+#if IS_ENABLED(CONFIG_HWMON)
+	struct device			*hwmon_dev;
+#endif
 };
 
 #endif /* !(_T3_H) */
-- 
1.7.1

^ permalink raw reply related

* [PATCH 0/4 v3 net-next] tg3: Add hwmon support
From: Michael Chan @ 2012-07-17  2:23 UTC (permalink / raw)
  To: davem; +Cc: netdev


David, I've removed the binary sysfs attribute and now use
hwmon only.  Please consider this patchset for net-next.

^ permalink raw reply

* Re: [PATCH] ipv6: fix incorrect route 'expires' value passed to userspace.
From: Li Wei @ 2012-07-17  1:55 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, shemminger
In-Reply-To: <20120716.025649.1070277404591664104.davem@davemloft.net>

于 2012-7-16 17:56, David Miller 写道:
> 
> Without a proper signoff, I won't apply your patch.
> 
> 

Sorry for that, I will append a signoff, do some code refactor
and send a V2.

Thanks,
Wei

^ permalink raw reply

* Re: [PATCH] ipv6: fix incorrect route 'expires' value passed to userspace.
From: Li Wei @ 2012-07-17  1:53 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David S. Miller, netdev
In-Reply-To: <20120716094124.0040561f@s6510.linuxnetplumber.net>

于 2012-7-17 0:41, Stephen Hemminger 写道:
> On Mon, 16 Jul 2012 16:09:37 +0800
> Li Wei <lw@cn.fujitsu.com> wrote:
> 
>> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
>> index becb048..a7fec9d 100644
>> --- a/net/ipv6/route.c
>> +++ b/net/ipv6/route.c
>> @@ -2516,7 +2516,7 @@ static int rt6_fill_node(struct net *net,
>>  		goto nla_put_failure;
>>  	if (!(rt->rt6i_flags & RTF_EXPIRES))
>>  		expires = 0;
>> -	else if (rt->dst.expires - jiffies < INT_MAX)
>> +	else if ((int)(rt->dst.expires - jiffies) < INT_MAX)
>>  		expires = rt->dst.expires - jiffies;
>>  	else
>>  		expires = INT_MAX;
> 
> Why not use time_is_after_jiffies() macro?

time_is_after_jiffies() return a bool but we need "how much time
before/after jiffies" here.

> 
> 

However, I also think these code seems a little ugly, because we
need to store the result of two "unsigned long"'s subtraction into
an integer. Maybe we should distinguish expires before and after 
jiffies to proper process the overflows.

Thanks,
Wei

^ permalink raw reply

* [net] ixgbevf: Prevent RX/TX statistics getting reset to zero
From: Jeff Kirsher @ 2012-07-17  1:24 UTC (permalink / raw)
  To: davem; +Cc: Narendra K, netdev, gospo, sassmann, Jeff Kirsher

From: Narendra K <narendra_k@dell.com>

The commit 4197aa7bb81877ebb06e4f2cc1b5fea2da23a7bd implements 64 bit
per ring statistics. But the driver resets the 'total_bytes' and
'total_packets' from RX and TX rings in the RX and TX interrupt
handlers to zero. This results in statistics being lost and user space
reporting RX and TX statistics as zero. This patch addresses the
issue by preventing the resetting of RX and TX ring statistics to
zero.

Signed-off-by: Narendra K <narendra_k@dell.com>
Tested-by: Sibai Li <sibai.li@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |   12 ------------
 1 file changed, 12 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index f69ec42..8b304a4 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -969,8 +969,6 @@ static irqreturn_t ixgbevf_msix_clean_tx(int irq, void *data)
 	r_idx = find_first_bit(q_vector->txr_idx, adapter->num_tx_queues);
 	for (i = 0; i < q_vector->txr_count; i++) {
 		tx_ring = &(adapter->tx_ring[r_idx]);
-		tx_ring->total_bytes = 0;
-		tx_ring->total_packets = 0;
 		ixgbevf_clean_tx_irq(adapter, tx_ring);
 		r_idx = find_next_bit(q_vector->txr_idx, adapter->num_tx_queues,
 				      r_idx + 1);
@@ -994,16 +992,6 @@ static irqreturn_t ixgbevf_msix_clean_rx(int irq, void *data)
 	struct ixgbe_hw *hw = &adapter->hw;
 	struct ixgbevf_ring  *rx_ring;
 	int r_idx;
-	int i;
-
-	r_idx = find_first_bit(q_vector->rxr_idx, adapter->num_rx_queues);
-	for (i = 0; i < q_vector->rxr_count; i++) {
-		rx_ring = &(adapter->rx_ring[r_idx]);
-		rx_ring->total_bytes = 0;
-		rx_ring->total_packets = 0;
-		r_idx = find_next_bit(q_vector->rxr_idx, adapter->num_rx_queues,
-				      r_idx + 1);
-	}
 
 	if (!q_vector->rxr_count)
 		return IRQ_HANDLED;
-- 
1.7.10.4

^ permalink raw reply related

* Re: [PATCH 1/4] pch_gbe: Fix the checksum fill to the error location
From: Paul Gortmaker @ 2012-07-17  0:59 UTC (permalink / raw)
  To: Andy Cress; +Cc: netdev
In-Reply-To: <40680C535D6FE6498883F1640FACD44D011432D4@ka-exchange-1.kontronamerica.local>

Hi Andy,

On Mon, Jul 16, 2012 at 4:03 PM, Andy Cress <andy.cress@us.kontron.com> wrote:
>
> Author: Zhong Hongbo <hongbo.zhong@windriver.com>

I'm curious as to how you are generating these mails, since if it
were with the "normal" method, then the above would have a
"From: " prefix, and not "Author: " prefix.  (More on what is the
"normal" method below...)

>
> Due to some unknown hardware limitations the pch_gbe hardware cannot
> calculate checksums when the length of network package is less
> than 64 bytes, where we will surprisingly encounter a problem of
> the destination IP incorrectly changed.

This has the feel of a problem that isn't properly understood, and
a solution that is aimed at masking the symptom at the point
where it rears its ugly head...  Not a good sign, when it comes to
requesting upstream acceptance.

>
> When forwarding network packages at the network layer the IP packages

s/packages/packets/

> won't be relayed to the upper transport layer and analyzed there,
> consequently, skb->transport_header pointer will be mistakenly remained
> the same as that of skb->network_header, resulting in TCP checksum
> wrongly
> filled into the field of destination IP in IP header.

Similarly here, the one-word-per-line symptom seems to hint at it
being a case of data manually entered into a GUI MUA, which
generally translates into headaches for the maintainer who has
to integrate your work.  If you can adopt a "standard" workflow,
it will be easier for both you and the maintainer.  And that
"standard" workflow could be something as simple as this:

--------------------------
cd /path/to/my/git-repo/of/net-next
git pull
git checkout -b pch-fixes master
<create commits, by cherry-pick or manual creation>
<validate with compilation, boot testing>
git format-patch --subject-prefix="PATCH net-next" --cover-letter -o
sendme master..pch-fixes
vi sendme/0000-cover-letter.patch
<enter text describing your series of patches>
git send-email --to netdev@vger.kernel.org -cc maintainer@someaddress.com sendme
-------------------------

>
> We can fix this issue by manually calculate the offset of the TCP
> checksum
>  and update it accordingly.
>
> We would normally use the skb_checksum_start_offset(skb) here, but in
> this
> case it is sometimes -2 (csum_start=0 - skb_headroom=2 => -2), hence the
> manual calculation.
>
> Signed-off-by: Zhong Hongbo <hongbo.zhong@windriver.com>
> Merged-by: Andy Cress <andy.cress@us.kontron.com>

There really isn't a precedent for Merged-by: -- you may want to see:

     http://kerneltrap.org/taxonomy/term/245

A "real" merge actually creates a merge commit, which is unique
in its own right.  So here, if you've carried forward an older patch,
or similar, you'd be most likely to add your own SOB line underneath.

Also a Cc: addition might be in order, say based on the output of:

./scripts/get_maintainer.pl -f drivers/net/ethernet/oki-semi/pch_gbe

A Cc: line put here goes on record.  If you use the --cc option
mentioned above, the Cc: record is limited to the mail header in
the netdev mail archive.

>
> ---
>  drivers/net/pch_gbe/pch_gbe_main.c |   14 ++++++++------
>  1 files changed, 8 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
> b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
> index 3787c64..1642bff 100644
> --- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
> +++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
> @@ -1178,32 +1178,35 @@ static void pch_gbe_tx_queue(struct
> pch_gbe_adapter *adapter,
>         /*
>          * It is because the hardware accelerator does not support a
> checksum,
>          * when the received data size is less than 64 bytes.
> +        * Note: skb_checksum_start_offset(skb) is sometimes -2 here.

This comment is borderline useless, without some level of justification
of how/why that happens, and why it is OK/acceptable.

>          */
>         if (skb->len < PCH_GBE_SHORT_PKT && skb->ip_summed !=
> CHECKSUM_NONE) {
> +               struct iphdr *iph = ip_hdr(skb);
>                 frame_ctrl |= PCH_GBE_TXD_CTRL_APAD |
>                               PCH_GBE_TXD_CTRL_TCPIP_ACC_OFF;
>                 if (skb->protocol == htons(ETH_P_IP)) {
> -                       struct iphdr *iph = ip_hdr(skb);
>                         unsigned int offset;
> -                       offset = skb_transport_offset(skb);
> +                       offset = (unsigned char *)((u8 *)iph + iph->ihl
> * 4) - skb->data;
>                         if (iph->protocol == IPPROTO_TCP) {
> +                               struct tcphdr *tcphdr_point = (struct
> tcphdr *)((u8 *)iph + iph->ihl * 4);

There are several things going on here that are not ideal.  There
are casts of casts with unexplained math magic, there is the
attempt to encode variable types ("_point") in the variable name,
and there is apparently mailer munging which inserts rogue newlines.

At this point, I'd have to suggest that you go back to trying to
describe the use case, and the exact symptoms you are seeing
there; describe that and how you believe it comes about as
articulately as possible, and ask for input on how best to deal
with it; giving reference to this as a workaround that was deployed
some time in the past. In the end, this is a recipe for a good
commit log anyways, i.e. describing the end user symptom,
the use case that triggers it, the underlying reason as to why
it happens, and finally the technically correct fix to deal with
solving it (and indicate why it is the correct fix, if not obvious.)

Hope this helps with basic process and expectations.

Paul.
--

>                                 skb->csum = 0;
> -                               tcp_hdr(skb)->check = 0;
> +                               tcphdr_point->check = 0;
>                                 skb->csum = skb_checksum(skb, offset,
>                                                          skb->len -
> offset, 0);
> -                               tcp_hdr(skb)->check =
> +                               tcphdr_point->check =
>                                         csum_tcpudp_magic(iph->saddr,
>                                                           iph->daddr,
>                                                           skb->len -
> offset,
>                                                           IPPROTO_TCP,
>                                                           skb->csum);
>                         } else if (iph->protocol == IPPROTO_UDP) {
> +                               struct udphdr *udphdr_point = (struct
> udphdr *)((u8 *)iph + iph->ihl * 4);
>                                 skb->csum = 0;
> -                               udp_hdr(skb)->check = 0;
> +                               udphdr_point->check = 0;
>                                 skb->csum =
>                                         skb_checksum(skb, offset,
>                                                      skb->len - offset,
> 0);
> -                               udp_hdr(skb)->check =
> +                               udphdr_point->check =
>                                         csum_tcpudp_magic(iph->saddr,
>                                                           iph->daddr,
>                                                           skb->len -
> offset,
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [RFC 1/2] PCI-Express Non-Transparent Bridge Support
From: Jon Mason @ 2012-07-17  0:23 UTC (permalink / raw)
  To: chetan loke; +Cc: linux-kernel, netdev, linux-pci, Dave Jiang
In-Reply-To: <CAAsGZS43FkfTFkYnvFdaA+KSLD1ZVBfuFe7_2e7s29_X+rTmug@mail.gmail.com>

On Mon, Jul 16, 2012 at 03:27:48PM -0400, chetan loke wrote:
> On Mon, Jul 16, 2012 at 2:38 PM, Jon Mason <jon.mason@intel.com> wrote:
> > On Mon, Jul 16, 2012 at 12:49:39PM -0400, chetan loke wrote:
> 
> ....
> 
> >> Is it ok to rename the following vars for convenience sake?
> >>
> >> > +       struct list_head txq;
> >> tx_pend_q - (pending_queue) or tx_out_q - (outstanding_queue) - or
> >> pick any new string you like - other than a mono-syllable definition
> >>
> >> > +       struct list_head txc;
> >> tx_compl_q - completion queue
> >>
> >> > +       struct list_head txe;
> >> tx_avail_e - available entry queue
> >>
> >>
> >> > +       spinlock_t txq_lock;
> >> > +       spinlock_t txc_lock;
> >> > +       spinlock_t txe_lock;
> >>
> >> then match the variants accordingly.
> >>
> >> > +       struct list_head rxq;
> >> > +       struct list_head rxc;
> >> > +       struct list_head rxe;
> >> > +       spinlock_t rxq_lock;
> >> > +       spinlock_t rxc_lock;
> >> > +       spinlock_t rxe_lock;
> >>
> >> similarly the rx-counterpart
> >
> > Are they difficult to understand?  I can change them, but it seems rather moot since you seemed to immediately understand the logic behind the names.
> >
> 
> Immediately understand? Not at first glance. I had to re-read the
> functions. Its really is a minor change and variables will then become
> self-explanatory. I can almost feel that a developer who works on this
> code for the first time might end up choosing the wrong 'syllable' and
> locking an entirely different lock.
> 
> Infact add a prefix 'ntb' to all the rx/tx locks. That way grep'ing
> output of lockstat also becomes easier.
> 
> It now reads: ntb_tx_pend_lock

Makes sense

> 
> >>
> >>
> >> ..................
> >>
> >> > +static void ntb_tx_copy_task(struct ntb_transport_qp *qp,
> >> > +                            struct ntb_queue_entry *entry,
> >> > +                            void *offset)
> >> > +{
> >> > +       struct ntb_payload_header *hdr = offset;
> >> > +       int rc;
> >> > +
> >> > +       offset += sizeof(struct ntb_payload_header);
> >> > +       memcpy_toio(offset, entry->buf, entry->len);
> >> > +
> >> > +       hdr->len = entry->len;
> >> > +       hdr->ver = qp->tx_pkts;
> >> > +
> >> > +       /* Ensure that the data is fully copied out before setting the flag */
> >> > +       wmb();
> >> > +       hdr->flags = entry->flags | DESC_DONE_FLAG;
> >> > +
> >> > +       rc = ntb_ring_sdb(qp->ndev, qp->qp_num);
> >> > +       if (rc)
> >> > +               pr_err("%s: error ringing db %d\n", __func__, qp->qp_num);
> >> > +
> >> > +       if (entry->len > 0) {
> >>
> >> how do you interpret this len variable and decide if it's a good/bad completion?
> >
> > The length of 0 is for messages from the remote system, which currently only consists of a "link down" message notifying the local system to no longer send any messages to the remote side.
> >
> 
> May be I didn't read the code properly. Is there a length-comment that
> explains the above? If not then just by pure code inspection it would
> seem that a skb was leaked. So add the above comment that way we can
> save time for other netdev folks too.

I'll add a comment.

> 
> 
> >>
> >> > +               qp->tx_bytes += entry->len;
> >> > +
> >> > +               /* Add fully transmitted data to completion queue */
> >> > +               ntb_list_add_tail(&qp->txc_lock, &entry->entry, &qp->txc);
> >> > +
> >> > +               if (qp->tx_handler)
> >> > +                       qp->tx_handler(qp);
> >> > +       } else
> >> > +               ntb_list_add_tail(&qp->txe_lock, &entry->entry, &qp->txe);
> >>
> >> I could be wrong but how is the original skb handled if the code path
> >> goes in the else clause?
> >
> > There is no skb is the length is zero.  Since the only client is virtual ethernet, it will always be > 60.  However, I should add a sanity check for 0 length in tx_enqueue.
> >
> >> Also, in the else clause, how about a ntb_list_add_head rather than a _tail.
> >
> > Why add to the head, it was just used?
> 
> Yes, just re-use what's hot(best guess).
> 
> >
> >> > +
> >> > +static int ntb_process_tx(struct ntb_transport_qp *qp,
> >> > +                         struct ntb_queue_entry *entry)
> >> > +{
> >> > +       struct ntb_payload_header *hdr;
> >> > +       void *offset;
> >> > +
> >> > +       offset = qp->tx_offset;
> >> > +       hdr = offset;
> >> > +
> >> > +       pr_debug("%lld - offset %p, tx %p, entry len %d flags %x buff %p\n",
> >> > +                qp->tx_pkts, offset, qp->tx_offset, entry->len, entry->flags,
> >> > +                entry->buf);
> >> > +       if (hdr->flags) {
> >> > +               ntb_list_add_head(&qp->txq_lock, &entry->entry, &qp->txq);
> >> > +               qp->tx_ring_full++;
> >> > +               return -EAGAIN;
> >> > +       }
> >> > +
> >> > +       if (entry->len > transport_mtu) {
> >> > +               pr_err("Trying to send pkt size of %d\n", entry->len);
> >> > +               entry->flags = HW_ERROR_FLAG;
> >> > +
> >> > +               ntb_list_add_tail(&qp->txc_lock, &entry->entry, &qp->txc);
> >> > +
> >> > +               if (qp->tx_handler)
> >> > +                       qp->tx_handler(qp);
> >> > +
> >> > +               return 0;
> >> > +       }
> >> > +
> >> > +       ntb_tx_copy_task(qp, entry, offset);
> >>
> >> what happens when ntb_sdb_ring returns an error? would you still want
> >> to increment tx_pkts below?
> >
> > It's not fatal if the remote side isn't notified.  It will hurt latency, since the next packet would be the one that triggers the next interrupt.  Also, the only way it could ever fail would be if it was an invalid interrupt bit being set, which should never happen.
> >
> 
> What happens when the 'db >= ndev->max_cbs' check fails? Under what
> circumstances will that happen? When it does happen how does the
> remote side then gets notified or should it even get notified?

It should never happen, as there are checks when enqueuing that the transport queue is valid.  It is more of a sanity check than anything else.

> 'which should never happen'? FYI - I have seen and debugged(not this
> one but doorbells and what not) weirdness while working on CLARiiON +
> PCie-interconnects. Board bring-up is a PITA. So you get the idea ...

I understand, that is why I had the sanity check code.  Paranoia can be a good thing.
 
> >>
> >> > +void *ntb_transport_tx_dequeue(struct ntb_transport_qp *qp, unsigned int *len)
> >> > +{
> >> > +       struct ntb_queue_entry *entry;
> >> > +       void *buf;
> >> > +
> >> > +       if (!qp)
> >> > +               return NULL;
> >> > +
> >> > +       entry = ntb_list_rm_head(&qp->txc_lock, &qp->txc);
> >> > +       if (!entry)
> >> > +               return NULL;
> >> > +
> >> > +       buf = entry->callback_data;
> >> > +       if (entry->flags != HW_ERROR_FLAG)
> >> > +               *len = entry->len;
> >> > +       else
> >> > +               *len = -EIO;
> >> > +
> >> > +       ntb_list_add_tail(&qp->txe_lock, &entry->entry, &qp->txe);
> >>
> >> how about a ntb_list_add_head?
> >
> > Is there a benefit to adding to the head versus tail?  It makes more sense to me to pull from the head and add to the tail.
> >
> 
> Yes, explained above. Today there are 100(..DEF_NUM...) entries.
> Tomorrow there could be more. So why not re-use what's hot? You could
> also think of this as yet another way of forcing detection of
> double-use corruption/bug. I'm not saying that there's a bug here but
> you get the idea.

I'm fairly sure the list is suboptimal performance-wise, but it is much easier to use when developing.  Perhaps I should use a fixed size array instead.

Thanks for the reviews!

> 
> Chetan Loke

^ permalink raw reply

* [PATCH net-next] bnx2: Try to recover from PCI block reset
From: Michael Chan @ 2012-07-17  0:25 UTC (permalink / raw)
  To: davem; +Cc: netdev, nhorman

If the PCI block has reset, the memory enable bit will be reset and
the device will not respond to MMIO access.  bnx2_reset_task() currently
will not recover when this happens.  Add code to detect this condition
and restore the PCI state.  This scenario has been reported by some
users.

Signed-off-by: Michael Chan <mchan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnx2.c |    7 +++++++
 1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c
index 0ced154..79cebd8 100644
--- a/drivers/net/ethernet/broadcom/bnx2.c
+++ b/drivers/net/ethernet/broadcom/bnx2.c
@@ -6388,6 +6388,7 @@ bnx2_reset_task(struct work_struct *work)
 {
 	struct bnx2 *bp = container_of(work, struct bnx2, reset_task);
 	int rc;
+	u16 pcicmd;
 
 	rtnl_lock();
 	if (!netif_running(bp->dev)) {
@@ -6397,6 +6398,12 @@ bnx2_reset_task(struct work_struct *work)
 
 	bnx2_netif_stop(bp, true);
 
+	pci_read_config_word(bp->pdev, PCI_COMMAND, &pcicmd);
+	if (!(pcicmd & PCI_COMMAND_MEMORY)) {
+		/* in case PCI block has reset */
+		pci_restore_state(bp->pdev);
+		pci_save_state(bp->pdev);
+	}
 	rc = bnx2_init_nic(bp, 1);
 	if (rc) {
 		netdev_err(bp->dev, "failed to reset NIC, closing\n");
-- 
1.7.1

^ permalink raw reply related

* Re: [ethtool PATCH] ethtool: Resolve use of uninitialized memory in rxclass_get_dev_info
From: Alexander Duyck @ 2012-07-17  0:10 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: netdev, jeffrey.t.kirsher
In-Reply-To: <1342469017.2523.33.camel@bwh-desktop.uk.solarflarecom.com>

On 07/16/2012 01:03 PM, Ben Hutchings wrote:
> On Fri, 2012-07-13 at 09:55 -0700, Alexander Duyck wrote:
>> The ethtool function for getting the rule count was not zeroing out the
>> data field before passing it to the kernel.  As a result the value started
>> uninitialized and was incorrectly returning a result indicating that
>> devices supported setting new rule indexes.  In order to correct this I am
>> adding a one line fix that sets data to zero before we pass the command to
>> the kernel.
> Right.  For 'get' commands with no parameters (besides the device) the
> data copied back to userland is normally zero-initialised and then
> filled out by the driver, and I seem to have worked on that assumption.
> But because of the odd multiplexing of RX NFC commands
> ETHTOOL_GRXCLSRLCNT doesn't work like that.  And for 'my' driver that
> didn't matter.  Sorry about that.
>
> (We should really have some explicit documentation of responsibility for
> structure initialisation.)
>
>> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
>> ---
>>
>> I am resending this since I didn't see any notification that it had been seen.
>> I also realized that I had not clearly identified that this is an ethtool user
>> space patch and not an ethtool kernel space patch.
> It was perfectly clear and I had queued it up to review but hadn't yet
> done so.
>
> Ben.
>
Yeah, that was my mistake.  I thought I hadn't sent it out with the
ethtool prefix when I actually had.

Thanks,

Alex

^ permalink raw reply

* Re: [PATCH 4/7] net-tcp: Fast Open client - receiving SYN-ACK
From: Yuchung Cheng @ 2012-07-16 23:33 UTC (permalink / raw)
  To: Vijay Subramanian; +Cc: davem, hkchu, edumazet, ncardwell, sivasankar, netdev
In-Reply-To: <CAGK4HS9iEVy6NEx_dJgdEW8P2K9_G0vhziE+7O7iW+6WFc3VEw@mail.gmail.com>

On Mon, Jul 16, 2012 at 3:57 PM, Vijay Subramanian
<subramanian.vijay@gmail.com> wrote:
> On 16 July 2012 14:16, Yuchung Cheng <ycheng@google.com> wrote:
>> On receiving the SYN-ACK after SYN-data, the client needs to
>> a) update the cached MSS and cookie (if included in SYN-ACK)
>> b) retransmit the data yet acknowledged by the SYN-ACK in the final ACK of
>>    the handshake.
>
> I think you mean "data not yet acknowledged " in point (b).
yes "not yet". I will fix it in the next round after I collect more
reviews. Thanks!
>
> Vijay

^ permalink raw reply

* Re: [PATCH] netem: fix rate extension and drop accounting
From: Hagen Paul Pfeifer @ 2012-07-16 23:26 UTC (permalink / raw)
  To: Mark Gordon
  Cc: Eric Dumazet, David Miller, netdev, Yuchung Cheng, Andreas Terzis
In-Reply-To: <CAPVr9VP7DniPZj4vZi_myJWfL5JLYKYTXXtrXcKHo9LjEQzjYw@mail.gmail.com>

Sorry Eric for the delay! I am a little bit under project pressure. But I will
test patches where I can.

* Mark Gordon | 2012-07-16 15:15:07 [-0700]:

>Sorry for the late reply on this.  I'm still pretty unsure about the math
>we're doing on
>
>delay -= netem_skb_cb(skb_peek(list))->time_to_send - now;
>
>First, shouldn't it be skb_peek_tail(list)?  Even if you do that the math

The delta is between now and when the _next_ packet is to send. New packets
are enqueued on the tail.

>doesn't really seem to work out.  In my mind there ought to be at least one
>more non-linearity.  The code before/after this patch does not work as I
>would expect and seems to have fairly random effects on the bandwidth and
>that the below patch does work for me.  Here is what I'm suggesting
>(relative to the new patches)

As Eric said: there are problems in combination with a static delay. During
rate extension development we tested the raw/vanilla "rate" functionality.
The rate part works faultless[TM] - at least independet of any other
"delay-latency generator".

Hagen

^ permalink raw reply

* Re: [PATCH 4/7] net-tcp: Fast Open client - receiving SYN-ACK
From: Vijay Subramanian @ 2012-07-16 22:57 UTC (permalink / raw)
  To: Yuchung Cheng; +Cc: davem, hkchu, edumazet, ncardwell, sivasankar, netdev
In-Reply-To: <1342473410-6265-5-git-send-email-ycheng@google.com>

On 16 July 2012 14:16, Yuchung Cheng <ycheng@google.com> wrote:
> On receiving the SYN-ACK after SYN-data, the client needs to
> a) update the cached MSS and cookie (if included in SYN-ACK)
> b) retransmit the data yet acknowledged by the SYN-ACK in the final ACK of
>    the handshake.

I think you mean "data not yet acknowledged " in point (b).

Vijay

^ permalink raw reply

* [PATCH ethtool 2/2] Fix reporting of VLAN tag offload flags for Linux 2.6.24-2.6.36
From: Ben Hutchings @ 2012-07-16 22:20 UTC (permalink / raw)
  To: netdev; +Cc: linux-net-drivers
In-Reply-To: <1342477041.2523.40.camel@bwh-desktop.uk.solarflarecom.com>

These kernel versions implement ETHTOOL_GFLAGS but do not include the
flags for VLAN tag offload (and do not implement ETHTOOL_GFEATURES).
Since the VLAN tag offload features were already defined and
implemented by many drivers, we shouldn't assume they are off.
Instead, since these feature flag values were stable, read them from
sysfs.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 ethtool.c |   41 +++++++++++++++++++++++++++++++++++++++++
 1 files changed, 41 insertions(+), 0 deletions(-)

diff --git a/ethtool.c b/ethtool.c
index b424756..9991ce2 100644
--- a/ethtool.c
+++ b/ethtool.c
@@ -33,6 +33,8 @@
 #include <sys/utsname.h>
 #include <limits.h>
 #include <ctype.h>
+#include <assert.h>
+#include <sys/fcntl.h>
 
 #include <sys/socket.h>
 #include <netinet/in.h>
@@ -1417,6 +1419,31 @@ static struct feature_defs *get_feature_defs(struct cmd_context *ctx)
 	return defs;
 }
 
+static int get_netdev_attr(struct cmd_context *ctx, const char *name,
+		    char *buf, size_t buf_len)
+{
+#ifdef TEST_ETHTOOL
+	errno = ENOENT;
+	return -1;
+#else
+	char path[40 + IFNAMSIZ];
+	ssize_t len;
+	int fd;
+
+	len = snprintf(path, sizeof(path), "/sys/class/net/%s/%s",
+		       ctx->devname, name);
+	assert(len < sizeof(path));
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+		return fd;
+	len = read(fd, buf, buf_len - 1);
+	if (len >= 0)
+		buf[len] = 0;
+	close(fd);
+	return len;
+#endif
+}
+
 static int do_gdrv(struct cmd_context *ctx)
 {
 	int err;
@@ -1858,6 +1885,20 @@ get_features(struct cmd_context *ctx, const struct feature_defs *defs)
 			perror("Cannot get device generic features");
 		else
 			allfail = 0;
+	} else {
+		/* We should have got VLAN tag offload flags through
+		 * ETHTOOL_GFLAGS.  However, prior to Linux 2.6.37
+		 * they were not exposed in this way - and since VLAN
+		 * tag offload was defined and implemented by many
+		 * drivers, we shouldn't assume they are off.
+		 * Instead, since these feature flag values were
+		 * stable, read them from sysfs.
+		 */
+		char buf[20];
+		if (get_netdev_attr(ctx, "features", buf, sizeof(buf)) > 0)
+			state->off_flags |=
+				strtoul(buf, NULL, 0) &
+				(ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN);
 	}
 
 	if (allfail) {
-- 
1.7.7.6


-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* [PATCH ethtool 1/2] Remove bogus error message when changing offload settings on older kernels
From: Ben Hutchings @ 2012-07-16 22:18 UTC (permalink / raw)
  To: netdev; +Cc: linux-net-drivers
In-Reply-To: <1342477041.2523.40.camel@bwh-desktop.uk.solarflarecom.com>

We should not be checking for fixed features when we have no
information about which are fixed.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 ethtool.c |   60 +++++++++++++++++++++++++++++++++---------------------------
 1 files changed, 33 insertions(+), 27 deletions(-)

diff --git a/ethtool.c b/ethtool.c
index 3c34273..b424756 100644
--- a/ethtool.c
+++ b/ethtool.c
@@ -1962,39 +1962,45 @@ static int do_sfeatures(struct cmd_context *ctx)
 	if (!old_state)
 		return 1;
 
-	/* For each offload that the user specified, update any
-	 * related features that the user did not specify and that
-	 * are not fixed.  Warn if all related features are fixed.
-	 */
-	for (i = 0; i < ARRAY_SIZE(off_flag_def); i++) {
-		int fixed = 1;
-
-		if (!(off_flags_mask & off_flag_def[i].value))
-			continue;
+	if (efeatures) {
+		/* For each offload that the user specified, update any
+		 * related features that the user did not specify and that
+		 * are not fixed.  Warn if all related features are fixed.
+		 */
+		for (i = 0; i < ARRAY_SIZE(off_flag_def); i++) {
+			int fixed = 1;
 
-		for (j = 0; j < defs->n_features; j++) {
-			if (defs->def[j].off_flag_index != i ||
-			    !FEATURE_BIT_IS_SET(old_state->features.features,
-						j, available) ||
-			    FEATURE_BIT_IS_SET(old_state->features.features,
-					       j, never_changed))
+			if (!(off_flags_mask & off_flag_def[i].value))
 				continue;
 
-			fixed = 0;
-			if (!FEATURE_BIT_IS_SET(efeatures->features, j, valid)) {
-				FEATURE_BIT_SET(efeatures->features, j, valid);
-				if (off_flags_wanted & off_flag_def[i].value)
-					FEATURE_BIT_SET(efeatures->features, j,
-							requested);
+			for (j = 0; j < defs->n_features; j++) {
+				if (defs->def[j].off_flag_index != i ||
+				    !FEATURE_BIT_IS_SET(
+					    old_state->features.features,
+					    j, available) ||
+				    FEATURE_BIT_IS_SET(
+					    old_state->features.features,
+					    j, never_changed))
+					continue;
+
+				fixed = 0;
+				if (!FEATURE_BIT_IS_SET(efeatures->features,
+							j, valid)) {
+					FEATURE_BIT_SET(efeatures->features,
+							j, valid);
+					if (off_flags_wanted &
+					    off_flag_def[i].value)
+						FEATURE_BIT_SET(
+							efeatures->features,
+							j, requested);
+				}
 			}
-		}
 
-		if (fixed)
-			fprintf(stderr, "Cannot change %s\n",
-				off_flag_def[i].long_name);
-	}
+			if (fixed)
+				fprintf(stderr, "Cannot change %s\n",
+					off_flag_def[i].long_name);
+		}
 
-	if (efeatures) {
 		err = send_ioctl(ctx, efeatures);
 		if (err < 0) {
 			perror("Cannot set device feature settings");
-- 
1.7.7.6



-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* [PATCH ethtool 0/2] Backward-compatibility for feature get/set
From: Ben Hutchings @ 2012-07-16 22:17 UTC (permalink / raw)
  To: netdev; +Cc: linux-net-drivers

I've committed fixes for another 2 bugs in features (-k/-K options):

- A silly regression introduced in 3.4, not covered by unit tests which
don't yet check the command output
- An older bug (really a defect in the ETHTOOL_GFLAGS API) which we can
work around

Ben.

Ben Hutchings (2):
  Remove bogus error message when changing offload settings on older
    kernels
  Fix reporting of VLAN tag offload flags for Linux 2.6.24-2.6.36

 ethtool.c |  101 ++++++++++++++++++++++++++++++++++++++++++++----------------
 1 files changed, 74 insertions(+), 27 deletions(-)

-- 
1.7.7.6


-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* [PATCH] SUNRPC: Prevent kernel stack corruption on long values of flush
From: Sasha Levin @ 2012-07-16 22:01 UTC (permalink / raw)
  To: Trond.Myklebust, bfields, davem
  Cc: davej, linux-nfs, netdev, linux-kernel, Sasha Levin

The buffer size in read_flush() is too small for the longest possible values
for it. This can lead to a kernel stack corruption:

[   43.047329] Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: ffffffff833e64b4
[   43.047329]
[   43.049030] Pid: 6015, comm: trinity-child18 Tainted: G        W    3.5.0-rc7-next-20120716-sasha #221
[   43.050038] Call Trace:
[   43.050435]  [<ffffffff836c60c2>] panic+0xcd/0x1f4
[   43.050931]  [<ffffffff833e64b4>] ? read_flush.isra.7+0xe4/0x100
[   43.051602]  [<ffffffff810e94e6>] __stack_chk_fail+0x16/0x20
[   43.052206]  [<ffffffff833e64b4>] read_flush.isra.7+0xe4/0x100
[   43.052951]  [<ffffffff833e6500>] ? read_flush_pipefs+0x30/0x30
[   43.053594]  [<ffffffff833e652c>] read_flush_procfs+0x2c/0x30
[   43.053596]  [<ffffffff812b9a8c>] proc_reg_read+0x9c/0xd0
[   43.053596]  [<ffffffff812b99f0>] ? proc_reg_write+0xd0/0xd0
[   43.053596]  [<ffffffff81250d5b>] do_loop_readv_writev+0x4b/0x90
[   43.053596]  [<ffffffff81250fd6>] do_readv_writev+0xf6/0x1d0
[   43.053596]  [<ffffffff812510ee>] vfs_readv+0x3e/0x60
[   43.053596]  [<ffffffff812511b8>] sys_readv+0x48/0xb0
[   43.053596]  [<ffffffff8378167d>] system_call_fastpath+0x1a/0x1f

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
 net/sunrpc/cache.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 2afd2a8..f86d95e 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -1409,11 +1409,11 @@ static ssize_t read_flush(struct file *file, char __user *buf,
 			  size_t count, loff_t *ppos,
 			  struct cache_detail *cd)
 {
-	char tbuf[20];
+	char tbuf[22];
 	unsigned long p = *ppos;
 	size_t len;
 
-	sprintf(tbuf, "%lu\n", convert_to_wallclock(cd->flush_time));
+	snprintf(tbuf, sizeof(tbuf), "%lu\n", convert_to_wallclock(cd->flush_time));
 	len = strlen(tbuf);
 	if (p >= len)
 		return 0;
-- 
1.7.8.6

^ permalink raw reply related

* [PATCH 5/7] net-tcp: Fast Open client - sendmsg(MSG_FASTOPEN)
From: Yuchung Cheng @ 2012-07-16 21:16 UTC (permalink / raw)
  To: davem, hkchu, edumazet, ncardwell; +Cc: sivasankar, netdev, Yuchung Cheng
In-Reply-To: <1342473410-6265-1-git-send-email-ycheng@google.com>

sendmsg() (or sendto()) with MSG_FASTOPEN is a combo of connect(2)
and write(2). The application should replace connect() with it to
send data in the opening SYN packet.

For blocking socket, sendmsg() blocks until all the data are buffered
locally and the handshake is completed like connect() call. It
returns similar errno like connect() if the TCP handshake fails.

For non-blocking socket, it returns the number of bytes queued (and
transmitted in the SYN-data packet) if cookie is available. If cookie
is not available, it transmits a data-less SYN packet with Fast Open
cookie request option and returns -EINPROGRESS like connect().

Using MSG_FASTOPEN on connecting or connected socket will result in
simlar errno like repeating connect() calls. Therefore the application
should only use this flag on new sockets.

The buffer size of sendmsg() is independent of the MSS of the connection.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
---
 Documentation/networking/ip-sysctl.txt |   11 ++++++
 include/linux/socket.h                 |    1 +
 include/net/inet_common.h              |    6 ++-
 include/net/tcp.h                      |    3 ++
 net/ipv4/af_inet.c                     |   19 +++++++---
 net/ipv4/tcp.c                         |   61 +++++++++++++++++++++++++++++---
 net/ipv4/tcp_ipv4.c                    |    3 ++
 7 files changed, 92 insertions(+), 12 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index e20c17a..b3c5225 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -468,6 +468,17 @@ tcp_syncookies - BOOLEAN
 	SYN flood warnings in logs not being really flooded, your server
 	is seriously misconfigured.
 
+tcp_fastopen - INTEGER
+	Enable TCP Fast Open feature (draft-ietf-tcpm-fastopen) to send data
+	in the opening SYN packet. To use this feature, the client application
+	must not use connect(). Instead, it should use sendmsg() or sendto()
+	with MSG_FASTOPEN flag which performs a TCP handshake automatically.
+
+	The values (bitmap) are:
+	1: Enables sending data in the opening SYN on the client
+
+	Default: 0
+
 tcp_syn_retries - INTEGER
 	Number of times initial SYNs for an active TCP connection attempt
 	will be retransmitted. Should not be higher than 255. Default value
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 25d6322..90297db 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -276,6 +276,7 @@ struct ucred {
 #else
 #define MSG_CMSG_COMPAT	0		/* We never have 32 bit fixups */
 #endif
+#define MSG_FASTOPEN	0x20000000 /* Send data in TCP SYN */
 
 
 /* Setsockoptions(2) level. Thanks to BSD these must match IPPROTO_xxx */
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index 22fac98..2340087 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -14,9 +14,11 @@ struct sockaddr;
 struct socket;
 
 extern int inet_release(struct socket *sock);
-extern int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr,
+extern int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 			       int addr_len, int flags);
-extern int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
+extern int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+				 int addr_len, int flags);
+extern int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
 			      int addr_len, int flags);
 extern int inet_accept(struct socket *sock, struct socket *newsock, int flags);
 extern int inet_sendmsg(struct kiocb *iocb, struct socket *sock,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2d3b09d2..8c20edd 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -212,6 +212,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 /* TCP initial congestion window as per draft-hkchu-tcpm-initcwnd-01 */
 #define TCP_INIT_CWND		10
 
+/* Bit Flags for sysctl_tcp_fastopen */
+#define	TFO_CLIENT_ENABLE	1
+
 extern struct inet_timewait_death_row tcp_death_row;
 
 /* sysctl variables for tcp */
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6ef67b7..c05ec41 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -590,8 +590,8 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
  *	Connect to a remote host. There is regrettably still a little
  *	TCP 'magic' in here.
  */
-int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
-			int addr_len, int flags)
+int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+			  int addr_len, int flags)
 {
 	struct sock *sk = sock->sk;
 	int err;
@@ -600,8 +600,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	if (addr_len < sizeof(uaddr->sa_family))
 		return -EINVAL;
 
-	lock_sock(sk);
-
 	if (uaddr->sa_family == AF_UNSPEC) {
 		err = sk->sk_prot->disconnect(sk, flags);
 		sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
@@ -664,7 +662,6 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	sock->state = SS_CONNECTED;
 	err = 0;
 out:
-	release_sock(sk);
 	return err;
 
 sock_error:
@@ -674,6 +671,18 @@ sock_error:
 		sock->state = SS_DISCONNECTING;
 	goto out;
 }
+EXPORT_SYMBOL(__inet_stream_connect);
+
+int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+			int addr_len, int flags)
+{
+	int err;
+
+	lock_sock(sock->sk);
+	err = __inet_stream_connect(sock, uaddr, addr_len, flags);
+	release_sock(sock->sk);
+	return err;
+}
 EXPORT_SYMBOL(inet_stream_connect);
 
 /*
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4252cd8..581ecf0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -270,6 +270,7 @@
 #include <linux/slab.h>
 
 #include <net/icmp.h>
+#include <net/inet_common.h>
 #include <net/tcp.h>
 #include <net/xfrm.h>
 #include <net/ip.h>
@@ -982,26 +983,67 @@ static inline int select_size(const struct sock *sk, bool sg)
 	return tmp;
 }
 
+void tcp_free_fastopen_req(struct tcp_sock *tp)
+{
+	if (tp->fastopen_req != NULL) {
+		kfree(tp->fastopen_req);
+		tp->fastopen_req = NULL;
+	}
+}
+
+static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int err, flags;
+
+	if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
+		return -EOPNOTSUPP;
+	if (tp->fastopen_req != NULL)
+		return -EALREADY; /* Another Fast Open is in progress */
+
+	tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
+				   sk->sk_allocation);
+	if (unlikely(tp->fastopen_req == NULL))
+		return -ENOBUFS;
+	tp->fastopen_req->data = msg;
+
+	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
+	err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
+				    msg->msg_namelen, flags);
+	*size = tp->fastopen_req->copied;
+	tcp_free_fastopen_req(tp);
+	return err;
+}
+
 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		size_t size)
 {
 	struct iovec *iov;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
-	int iovlen, flags, err, copied;
-	int mss_now = 0, size_goal;
+	int iovlen, flags, err, copied = 0;
+	int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
 	bool sg;
 	long timeo;
 
 	lock_sock(sk);
 
 	flags = msg->msg_flags;
+	if (flags & MSG_FASTOPEN) {
+		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn);
+		if (err == -EINPROGRESS && copied_syn > 0)
+			goto out;
+		else if (err)
+			goto out_err;
+		offset = copied_syn;
+	}
+
 	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 
 	/* Wait for a connection to finish. */
 	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
-			goto out_err;
+			goto do_error;
 
 	if (unlikely(tp->repair)) {
 		if (tp->repair_queue == TCP_RECV_QUEUE) {
@@ -1037,6 +1079,15 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		unsigned char __user *from = iov->iov_base;
 
 		iov++;
+		if (unlikely(offset > 0)) {  /* Skip bytes copied in SYN */
+			if (offset >= seglen) {
+				offset -= seglen;
+				continue;
+			}
+			seglen -= offset;
+			from += offset;
+			offset = 0;
+		}
 
 		while (seglen > 0) {
 			int copy = 0;
@@ -1199,7 +1250,7 @@ out:
 	if (copied && likely(!tp->repair))
 		tcp_push(sk, flags, mss_now, tp->nonagle);
 	release_sock(sk);
-	return copied;
+	return copied + copied_syn;
 
 do_fault:
 	if (!skb->len) {
@@ -1212,7 +1263,7 @@ do_fault:
 	}
 
 do_error:
-	if (copied)
+	if (copied + copied_syn)
 		goto out;
 out_err:
 	err = sk_stream_error(sk, flags, err);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 588200e..b2f457f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1959,6 +1959,9 @@ void tcp_v4_destroy_sock(struct sock *sk)
 		tp->cookie_values = NULL;
 	}
 
+	/* If socket is aborted during connect operation */
+	tcp_free_fastopen_req(tp);
+
 	sk_sockets_allocated_dec(sk);
 	sock_release_memcg(sk);
 }
-- 
1.7.7.3

^ permalink raw reply related

* [PATCH 6/7] net-tcp: Fast Open client - detecting SYN-data drops
From: Yuchung Cheng @ 2012-07-16 21:16 UTC (permalink / raw)
  To: davem, hkchu, edumazet, ncardwell; +Cc: sivasankar, netdev, Yuchung Cheng
In-Reply-To: <1342473410-6265-1-git-send-email-ycheng@google.com>

On paths with firewalls dropping SYN with data or experimental TCP options,
Fast Open connections will have experience SYN timeout and bad performance.
The solution is to track such incidents in the cookie cache and disables
Fast Open temporarily.

Since only the original SYN includes data and/or Fast Open option, the
SYN-ACK has some tell-tale sign (tcp_rcv_fastopen_synack()) to detect
such drops. If a path has recurring Fast Open SYN drops, Fast Open is
disabled for 2^(recurring_losses) minutes starting from four minutes up to
roughly one and half day. sendmsg with MSG_FASTOPEN flag will succeed but
it behaves as connect() then write().

Signed-off-by: Yuchung Cheng <ycheng@google.com>
---
 include/net/tcp.h       |    6 ++++--
 net/ipv4/tcp_fastopen.c |   16 ++++++++++++++--
 net/ipv4/tcp_input.c    |   10 +++++++++-
 net/ipv4/tcp_output.c   |   13 +++++++++++--
 4 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 8c20edd..1b70444 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -391,9 +391,11 @@ enum tcp_tw_status {
 /* From tcp_fastopen.c */
 extern int tcp_fastopen_size_cache(int size);
 extern void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
-				   struct tcp_fastopen_cookie *cookie);
+				   struct tcp_fastopen_cookie *cookie,
+				   int *syn_loss, unsigned long *last_syn_loss);
 extern void tcp_fastopen_cache_set(struct sock *sk, u16 *mss,
-				   struct tcp_fastopen_cookie *cookie);
+				   struct tcp_fastopen_cookie *cookie,
+				   bool syn_lost);
 
 extern enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw,
 						     struct sk_buff *skb,
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 40fdf21..b10e8ca 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -22,6 +22,8 @@ int sysctl_tcp_fastopen;
  */
 struct fastopen_entry {
 	u16	mss;			/* TCP MSS value */
+	u16	syn_loss:10;		/* Recurring Fast Open SYN losses */
+	unsigned long	last_syn_loss;	/* Last Fast Open SYN loss */
 	struct	tcp_fastopen_cookie	cookie;	/* TCP Fast Open cookie */
 	struct	list_head	lru_list;	/* cookie cache lru_list node */
 	struct	inet_peer	*peer;	/* inetpeer entry (for fast lookup) */
@@ -82,7 +84,8 @@ static struct inet_peer *tcp_fastopen_inetpeer(struct sock *sk, int create)
 }
 
 void tcp_fastopen_cache_get(struct sock *sk, u32 *mss,
-			    struct tcp_fastopen_cookie *cookie)
+			    struct tcp_fastopen_cookie *cookie,
+			    int *syn_loss, unsigned long *last_syn_loss)
 {
 	struct inet_peer *peer = tcp_fastopen_inetpeer(sk, 0);
 	struct fastopen_entry *entry;
@@ -95,6 +98,8 @@ void tcp_fastopen_cache_get(struct sock *sk, u32 *mss,
 	if (entry != NULL) {
 		*mss = entry->mss;
 		*cookie = entry->cookie;
+		*syn_loss = entry->syn_loss;
+		*last_syn_loss = *syn_loss ? entry->last_syn_loss : 0;
 		list_move_tail(&entry->lru_list, &cookie_cache.lru_list);
 	}
 	spin_unlock_bh(&cookie_cache.lock);
@@ -103,7 +108,7 @@ void tcp_fastopen_cache_get(struct sock *sk, u32 *mss,
 }
 
 void tcp_fastopen_cache_set(struct sock *sk, u32 *mss,
-			    struct tcp_fastopen_cookie *cookie)
+			    struct tcp_fastopen_cookie *cookie, bool syn_lost)
 {
 	struct inet_peer *peer = tcp_fastopen_inetpeer(sk, 1);
 	struct fastopen_entry *entry = NULL, *new_entry = NULL;
@@ -119,6 +124,8 @@ void tcp_fastopen_cache_set(struct sock *sk, u32 *mss,
 			goto out;
 		}
 		new_entry->peer = peer;
+		new_entry->mss = new_entry->cookie.len = 0;
+		new_entry->last_syn_loss = new_entry->syn_loss = 0;
 		INIT_LIST_HEAD(&new_entry->lru_list);
 		peer->fastopen = new_entry;
 		++cookie_cache.cnt;
@@ -127,6 +134,11 @@ void tcp_fastopen_cache_set(struct sock *sk, u32 *mss,
 	entry->mss = *mss;
 	if (cookie->len > 0)
 		entry->cookie = *cookie;
+	if (syn_lost) {
+		++entry->syn_loss;
+		entry->last_syn_loss = jiffies;
+	} else
+		entry->syn_loss = 0;
 	list_move_tail(&entry->lru_list, &cookie_cache.lru_list);
 	entry = __tcp_fastopen_remove_lru();
 	spin_unlock_bh(&cookie_cache.lock);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5b09f71..aef8514 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5616,6 +5616,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *data = tcp_write_queue_head(sk);
 	u16 mss = tp->rx_opt.mss_clamp;
+	bool syn_drop;
 
 	if (mss == tp->rx_opt.user_mss) {
 		struct tcp_options_received opt;
@@ -5628,7 +5629,14 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 		mss = opt.mss_clamp;
 	}
 
-	tcp_fastopen_cache_set(sk, &mss, cookie);
+	/* The SYN-ACK neither has cookie nor acknowledges the data. Presumably
+	 * the remote receives only the retransmitted (regular) SYNs: either
+	 * the original SYN-data or the corresponding SYN-ACK is lost.
+	 */
+	syn_drop = (cookie->len <= 0 && data &&
+		    inet_csk(sk)->icsk_retransmits);
+
+	tcp_fastopen_cache_set(sk, &mss, cookie, syn_drop);
 
 	if (data) { /* Retransmit unacked data in SYN */
 		tcp_retransmit_skb(sk, data);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8869328..c5cfd5e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2860,10 +2860,19 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_fastopen_request *fo = tp->fastopen_req;
-	int space, i, err = 0, iovlen = fo->data->msg_iovlen;
+	int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen;
 	struct sk_buff *syn_data = NULL, *data;
+	unsigned long last_syn_loss = 0;
+
+	tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
+			       &syn_loss, &last_syn_loss);
+	/* Recurring FO SYN losses: revert to regular handshake temporarily */
+	if (syn_loss > 1 &&
+	    time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
+		fo->cookie.len = -1;
+		goto fallback;
+	}
 
-	tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie);
 	if (fo->cookie.len <= 0)
 		goto fallback;
 
-- 
1.7.7.3

^ permalink raw reply related

* [PATCH 7/7] net-tcp: Fast Open client - cookie-less mode
From: Yuchung Cheng @ 2012-07-16 21:16 UTC (permalink / raw)
  To: davem, hkchu, edumazet, ncardwell; +Cc: sivasankar, netdev, Yuchung Cheng
In-Reply-To: <1342473410-6265-1-git-send-email-ycheng@google.com>

In trusted networks, e.g., intranet, data-center, the client does not
need to use Fast Open cookie to mitigate DoS attacks. In cookie-less
mode, sendmsg() with MSG_FASTOPEN flag will send SYN-data regardless
of cookie availability.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
---
 Documentation/networking/ip-sysctl.txt |    2 ++
 include/linux/tcp.h                    |    1 +
 include/net/tcp.h                      |    1 +
 net/ipv4/tcp_input.c                   |    8 ++++++--
 net/ipv4/tcp_output.c                  |    6 +++++-
 5 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index b3c5225..d67d858 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -476,6 +476,8 @@ tcp_fastopen - INTEGER
 
 	The values (bitmap) are:
 	1: Enables sending data in the opening SYN on the client
+	5: Enables sending data in the opening SYN on the client regardless
+	   of cookie availability.
 
 	Default: 0
 
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 1edf96a..9febfb6 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -387,6 +387,7 @@ struct tcp_sock {
 	u8	repair_queue;
 	u8	do_early_retrans:1,/* Enable RFC5827 early-retransmit  */
 		early_retrans_delayed:1, /* Delayed ER timer installed */
+		syn_data:1,	/* SYN includes data */
 		syn_fastopen:1;	/* SYN includes Fast Open option */
 
 /* RTT measurement */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 1b70444..99ec440 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -214,6 +214,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 
 /* Bit Flags for sysctl_tcp_fastopen */
 #define	TFO_CLIENT_ENABLE	1
+#define	TFO_CLIENT_NO_COOKIE	4	/* Data in SYN w/o cookie option */
 
 extern struct inet_timewait_death_row tcp_death_row;
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index aef8514..5915ac2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5614,7 +5614,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 				    struct tcp_fastopen_cookie *cookie)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *data = tcp_write_queue_head(sk);
+	struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
 	u16 mss = tp->rx_opt.mss_clamp;
 	bool syn_drop;
 
@@ -5636,6 +5636,9 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 	syn_drop = (cookie->len <= 0 && data &&
 		    inet_csk(sk)->icsk_retransmits);
 
+	if (!tp->syn_fastopen)  /* Ignore an unsolicited cookie */
+		cookie->len = -1;
+
 	tcp_fastopen_cache_set(sk, &mss, cookie, syn_drop);
 
 	if (data) { /* Retransmit unacked data in SYN */
@@ -5780,7 +5783,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 
 		tcp_finish_connect(sk, skb);
 
-		if (tp->syn_fastopen && tcp_rcv_fastopen_synack(sk, skb, &foc))
+		if ((tp->syn_fastopen || tp->syn_data) &&
+		    tcp_rcv_fastopen_synack(sk, skb, &foc))
 			return -1;
 
 		if (sk->sk_write_pending ||
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c5cfd5e..27a32ac 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2864,6 +2864,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 	struct sk_buff *syn_data = NULL, *data;
 	unsigned long last_syn_loss = 0;
 
+	tp->rx_opt.mss_clamp = tp->advmss;  /* If MSS is not cached */
 	tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
 			       &syn_loss, &last_syn_loss);
 	/* Recurring FO SYN losses: revert to regular handshake temporarily */
@@ -2873,7 +2874,9 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 		goto fallback;
 	}
 
-	if (fo->cookie.len <= 0)
+	if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
+		fo->cookie.len = -1;
+	else if (fo->cookie.len <= 0)
 		goto fallback;
 
 	/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
@@ -2916,6 +2919,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 	fo->copied = data->len;
 
 	if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
+		tp->syn_data = (fo->copied > 0);
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
 		goto done;
 	}
-- 
1.7.7.3

^ permalink raw reply related

* [PATCH 1/7] net-tcp: Fast Open base
From: Yuchung Cheng @ 2012-07-16 21:16 UTC (permalink / raw)
  To: davem, hkchu, edumazet, ncardwell; +Cc: sivasankar, netdev, Yuchung Cheng
In-Reply-To: <1342473410-6265-1-git-send-email-ycheng@google.com>

This patch impelements the common code for both the client and server.

1. TCP Fast Open option processing. Since Fast Open does not have a
   option number assigned by IANA yet, it shares the experiment option
   code 254 by implementing draft-ietf-tcpm-experimental-options
   with a 16 bits magic number 0xF989

2. The new sysctl tcp_fastopen

3. A place holder init function

Signed-off-by: Yuchung Cheng <ycheng@google.com>
---
 include/linux/tcp.h        |   10 ++++++++++
 include/net/tcp.h          |    9 ++++++++-
 net/ipv4/Makefile          |    2 +-
 net/ipv4/syncookies.c      |    2 +-
 net/ipv4/sysctl_net_ipv4.c |    7 +++++++
 net/ipv4/tcp_fastopen.c    |   11 +++++++++++
 net/ipv4/tcp_input.c       |   26 ++++++++++++++++++++++----
 net/ipv4/tcp_ipv4.c        |    2 +-
 net/ipv4/tcp_minisocks.c   |    4 ++--
 net/ipv4/tcp_output.c      |   25 +++++++++++++++++++++----
 net/ipv6/syncookies.c      |    2 +-
 net/ipv6/tcp_ipv6.c        |    2 +-
 12 files changed, 86 insertions(+), 16 deletions(-)
 create mode 100644 net/ipv4/tcp_fastopen.c

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 1888169..12948f5 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -243,6 +243,16 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb)
 	return (tcp_hdr(skb)->doff - 5) * 4;
 }
 
+/* TCP Fast Open */
+#define TCP_FASTOPEN_COOKIE_MIN	4	/* Min Fast Open Cookie size in bytes */
+#define TCP_FASTOPEN_COOKIE_MAX	16	/* Max Fast Open Cookie size in bytes */
+
+/* TCP Fast Open Cookie as stored in memory */
+struct tcp_fastopen_cookie {
+	s8	len;
+	u8	val[TCP_FASTOPEN_COOKIE_MAX];
+};
+
 /* This defines a selective acknowledgement block. */
 struct tcp_sack_block_wire {
 	__be32	start_seq;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 439984b..87f486f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -170,6 +170,11 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCPOPT_TIMESTAMP	8	/* Better RTT estimations/PAWS */
 #define TCPOPT_MD5SIG		19	/* MD5 Signature (RFC2385) */
 #define TCPOPT_COOKIE		253	/* Cookie extension (experimental) */
+#define TCPOPT_EXP		254	/* Experimental */
+/* Magic number to be after the option value for sharing TCP
+ * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
+ */
+#define TCPOPT_FASTOPEN_MAGIC	0xF989
 
 /*
  *     TCP option lengths
@@ -180,6 +185,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCPOLEN_SACK_PERM      2
 #define TCPOLEN_TIMESTAMP      10
 #define TCPOLEN_MD5SIG         18
+#define TCPOLEN_EXP_FASTOPEN_BASE  4
 #define TCPOLEN_COOKIE_BASE    2	/* Cookie-less header extension */
 #define TCPOLEN_COOKIE_PAIR    3	/* Cookie pair header extension */
 #define TCPOLEN_COOKIE_MIN     (TCPOLEN_COOKIE_BASE+TCP_COOKIE_MIN)
@@ -222,6 +228,7 @@ extern int sysctl_tcp_retries1;
 extern int sysctl_tcp_retries2;
 extern int sysctl_tcp_orphan_retries;
 extern int sysctl_tcp_syncookies;
+extern int sysctl_tcp_fastopen;
 extern int sysctl_tcp_retrans_collapse;
 extern int sysctl_tcp_stdurg;
 extern int sysctl_tcp_rfc1337;
@@ -417,7 +424,7 @@ extern int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		       size_t len, int nonblock, int flags, int *addr_len);
 extern void tcp_parse_options(const struct sk_buff *skb,
 			      struct tcp_options_received *opt_rx, const u8 **hvpp,
-			      int estab);
+			      int estab, struct tcp_fastopen_cookie *foc);
 extern const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
 
 /*
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 5a23e8b..63e6995 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,7 +7,7 @@ obj-y     := route.o inetpeer.o protocol.o \
 	     ip_output.o ip_sockglue.o inet_hashtables.o \
 	     inet_timewait_sock.o inet_connection_sock.o \
 	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
-	     tcp_minisocks.o tcp_cong.o tcp_metrics.o \
+	     tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
 	     datagram.o raw.o udp.o udplite.o \
 	     arp.o icmp.o devinet.o af_inet.o  igmp.o \
 	     fib_frontend.o fib_semantics.o fib_trie.o \
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index eab2a7f..650e152 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -293,7 +293,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 
 	/* check for timestamp cookie support */
 	memset(&tcp_opt, 0, sizeof(tcp_opt));
-	tcp_parse_options(skb, &tcp_opt, &hash_location, 0);
+	tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL);
 
 	if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
 		goto out;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 70730f7..2a946a19 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -367,6 +367,13 @@ static struct ctl_table ipv4_table[] = {
 	},
 #endif
 	{
+		.procname	= "tcp_fastopen",
+		.data		= &sysctl_tcp_fastopen,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
 		.procname	= "tcp_tw_recycle",
 		.data		= &tcp_death_row.sysctl_tw_recycle,
 		.maxlen		= sizeof(int),
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
new file mode 100644
index 0000000..a7f729c
--- /dev/null
+++ b/net/ipv4/tcp_fastopen.c
@@ -0,0 +1,11 @@
+#include <linux/init.h>
+#include <linux/kernel.h>
+
+int sysctl_tcp_fastopen;
+
+static int __init tcp_fastopen_init(void)
+{
+	return 0;
+}
+
+late_initcall(tcp_fastopen_init);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 055ac49..d6e16e2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3729,7 +3729,8 @@ old_ack:
  * the fast version below fails.
  */
 void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx,
-		       const u8 **hvpp, int estab)
+		       const u8 **hvpp, int estab,
+		       struct tcp_fastopen_cookie *foc)
 {
 	const unsigned char *ptr;
 	const struct tcphdr *th = tcp_hdr(skb);
@@ -3836,8 +3837,25 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
 					break;
 				}
 				break;
-			}
 
+			case TCPOPT_EXP:
+				/* Fast Open option shares code 254 using a
+				 * 16 bits magic number. It's valid only in
+				 * SYN or SYN-ACK with an even size.
+				 */
+				if (opsize < TCPOLEN_EXP_FASTOPEN_BASE ||
+				    get_unaligned_be16(ptr) != TCPOPT_FASTOPEN_MAGIC ||
+				    foc == NULL || !th->syn || (opsize & 1))
+					break;
+				foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE;
+				if (foc->len >= TCP_FASTOPEN_COOKIE_MIN &&
+				    foc->len <= TCP_FASTOPEN_COOKIE_MAX)
+					memcpy(foc->val, ptr + 2, foc->len);
+				else if (foc->len != 0)
+					foc->len = -1;
+				break;
+
+			}
 			ptr += opsize-2;
 			length -= opsize;
 		}
@@ -3879,7 +3897,7 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb,
 		if (tcp_parse_aligned_timestamp(tp, th))
 			return true;
 	}
-	tcp_parse_options(skb, &tp->rx_opt, hvpp, 1);
+	tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL);
 	return true;
 }
 
@@ -5601,7 +5619,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 	struct tcp_cookie_values *cvp = tp->cookie_values;
 	int saved_clamp = tp->rx_opt.mss_clamp;
 
-	tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0);
+	tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, NULL);
 
 	if (th->ack) {
 		/* rfc793:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7a0062c..588200e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1314,7 +1314,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_clear_options(&tmp_opt);
 	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
 	tmp_opt.user_mss  = tp->rx_opt.user_mss;
-	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+	tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
 
 	if (tmp_opt.cookie_plus > 0 &&
 	    tmp_opt.saw_tstamp &&
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index c66f2ed..5912ac3 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -97,7 +97,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
 
 	tmp_opt.saw_tstamp = 0;
 	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
-		tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+		tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
 
 		if (tmp_opt.saw_tstamp) {
 			tmp_opt.ts_recent	= tcptw->tw_ts_recent;
@@ -534,7 +534,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 
 	tmp_opt.saw_tstamp = 0;
 	if (th->doff > (sizeof(struct tcphdr)>>2)) {
-		tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+		tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
 
 		if (tmp_opt.saw_tstamp) {
 			tmp_opt.ts_recent = req->ts_recent;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 15a7c7b..4849be7 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -385,15 +385,17 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
 #define OPTION_MD5		(1 << 2)
 #define OPTION_WSCALE		(1 << 3)
 #define OPTION_COOKIE_EXTENSION	(1 << 4)
+#define OPTION_FAST_OPEN_COOKIE	(1 << 8)
 
 struct tcp_out_options {
-	u8 options;		/* bit field of OPTION_* */
+	u16 options;		/* bit field of OPTION_* */
+	u16 mss;		/* 0 to disable */
 	u8 ws;			/* window scale, 0 to disable */
 	u8 num_sack_blocks;	/* number of SACK blocks to include */
 	u8 hash_size;		/* bytes in hash_location */
-	u16 mss;		/* 0 to disable */
-	__u32 tsval, tsecr;	/* need to include OPTION_TS */
 	__u8 *hash_location;	/* temporary pointer, overloaded */
+	__u32 tsval, tsecr;	/* need to include OPTION_TS */
+	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
 };
 
 /* The sysctl int routines are generic, so check consistency here.
@@ -442,7 +444,7 @@ static u8 tcp_cookie_size_check(u8 desired)
 static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
 			      struct tcp_out_options *opts)
 {
-	u8 options = opts->options;	/* mungable copy */
+	u16 options = opts->options;	/* mungable copy */
 
 	/* Having both authentication and cookies for security is redundant,
 	 * and there's certainly not enough room.  Instead, the cookie-less
@@ -564,6 +566,21 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
 
 		tp->rx_opt.dsack = 0;
 	}
+
+	if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
+		struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
+
+		*ptr++ = htonl((TCPOPT_EXP << 24) |
+			       ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) |
+			       TCPOPT_FASTOPEN_MAGIC);
+
+		memcpy(ptr, foc->val, foc->len);
+		if ((foc->len & 3) == 2) {
+			u8 *align = ((u8 *)ptr) + foc->len;
+			align[0] = align[1] = TCPOPT_NOP;
+		}
+		ptr += (foc->len + 3) >> 2;
+	}
 }
 
 /* Compute TCP options for SYN packets. This is not the final
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 7bf3cc4..bb46061 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -177,7 +177,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 
 	/* check for timestamp cookie support */
 	memset(&tcp_opt, 0, sizeof(tcp_opt));
-	tcp_parse_options(skb, &tcp_opt, &hash_location, 0);
+	tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL);
 
 	if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
 		goto out;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 3071f37..41eaeb5 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1062,7 +1062,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_clear_options(&tmp_opt);
 	tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
 	tmp_opt.user_mss = tp->rx_opt.user_mss;
-	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+	tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
 
 	if (tmp_opt.cookie_plus > 0 &&
 	    tmp_opt.saw_tstamp &&
-- 
1.7.7.3

^ permalink raw reply related

* [PATCH 3/7] net-tcp: Fast Open client - sending SYN-data
From: Yuchung Cheng @ 2012-07-16 21:16 UTC (permalink / raw)
  To: davem, hkchu, edumazet, ncardwell; +Cc: sivasankar, netdev, Yuchung Cheng
In-Reply-To: <1342473410-6265-1-git-send-email-ycheng@google.com>

This patch implements sending SYN-data in tcp_connect(). The data is
from tcp_sendmsg() with flag MSG_FASTOPEN (implemented in a later patch).

The length of the cookie in tcp_fastopen_req, init'd to 0, controls the
type of the SYN. If the cookie is not cached (len==0), the host sends
data-less SYN with Fast Open cookie request option to solicit a cookie
from the remote. If cookie is not available (len > 0), the host sends
a SYN-data with Fast Open cookie option. If cookie length is negative,
  the SYN will not include any Fast Open option (for fall back operations).

To deal with middleboxes that may drop SYN with data or experimental TCP
option, the SYN-data is only sent once. SYN retransmits do not include
data or Fast Open options. The connection will fall back to regular TCP
handshake.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
---
 include/linux/snmp.h  |    3 +-
 include/linux/tcp.h   |    6 ++-
 include/net/tcp.h     |    9 ++++
 net/ipv4/af_inet.c    |    7 +++
 net/ipv4/proc.c       |    1 +
 net/ipv4/tcp_output.c |  115 +++++++++++++++++++++++++++++++++++++++++++++----
 6 files changed, 130 insertions(+), 11 deletions(-)

diff --git a/include/linux/snmp.h b/include/linux/snmp.h
index 2e68f5b..c0a34c6 100644
--- a/include/linux/snmp.h
+++ b/include/linux/snmp.h
@@ -233,7 +233,8 @@ enum
 	LINUX_MIB_TCPREQQFULLDOCOOKIES,		/* TCPReqQFullDoCookies */
 	LINUX_MIB_TCPREQQFULLDROP,		/* TCPReqQFullDrop */
 	LINUX_MIB_TCPRETRANSFAIL,		/* TCPRetransFail */
-	LINUX_MIB_TCPRCVCOALESCE,			/* TCPRcvCoalesce */
+	LINUX_MIB_TCPRCVCOALESCE,		/* TCPRcvCoalesce */
+	LINUX_MIB_TCPFASTOPENACTIVE,		/* TCPFastOpenActive */
 	__LINUX_MIB_MAX
 };
 
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 12948f5..1edf96a 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -386,7 +386,8 @@ struct tcp_sock {
 		unused      : 1;
 	u8	repair_queue;
 	u8	do_early_retrans:1,/* Enable RFC5827 early-retransmit  */
-		early_retrans_delayed:1; /* Delayed ER timer installed */
+		early_retrans_delayed:1, /* Delayed ER timer installed */
+		syn_fastopen:1;	/* SYN includes Fast Open option */
 
 /* RTT measurement */
 	u32	srtt;		/* smoothed round trip time << 3	*/
@@ -500,6 +501,9 @@ struct tcp_sock {
 	struct tcp_md5sig_info	__rcu *md5sig_info;
 #endif
 
+/* TCP fastopen related information */
+	struct tcp_fastopen_request *fastopen_req;
+
 	/* When the cookie options are generated and exchanged, then this
 	 * object holds a reference to them (cookie_values->kref).  Also
 	 * contains related tcp_cookie_transactions fields.
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 4b29688..2d3b09d2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1290,6 +1290,15 @@ extern int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *, const struct sk_buff
 extern int tcp_md5_hash_key(struct tcp_md5sig_pool *hp,
 			    const struct tcp_md5sig_key *key);
 
+struct tcp_fastopen_request {
+	/* Fast Open cookie. Size 0 means a cookie request */
+	struct tcp_fastopen_cookie	cookie;
+	struct msghdr			*data;  /* data in MSG_FASTOPEN */
+	u16				copied;	/* queued in tcp_connect() */
+};
+
+void tcp_free_fastopen_req(struct tcp_sock *tp);
+
 /* write queue abstraction */
 static inline void tcp_write_queue_purge(struct sock *sk)
 {
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 07a02f6..6ef67b7 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -558,9 +558,14 @@ EXPORT_SYMBOL(inet_dgram_connect);
 
 static long inet_wait_for_connect(struct sock *sk, long timeo)
 {
+	const bool write = (sk->sk_protocol == IPPROTO_TCP) &&
+			   tcp_sk(sk)->fastopen_req &&
+			   tcp_sk(sk)->fastopen_req->data;
 	DEFINE_WAIT(wait);
 
 	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+	if (write)
+		sk->sk_write_pending++;
 
 	/* Basic assumption: if someone sets sk->sk_err, he _must_
 	 * change state of the socket from TCP_SYN_*.
@@ -576,6 +581,8 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 	}
 	finish_wait(sk_sleep(sk), &wait);
+	if (write)
+		sk->sk_write_pending--;
 	return timeo;
 }
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 8af0d44..8f8c842 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -258,6 +258,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP),
 	SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL),
 	SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE),
+	SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 4849be7..8869328 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -596,6 +596,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 	u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
 			 tcp_cookie_size_check(cvp->cookie_desired) :
 			 0;
+	struct tcp_fastopen_request *fastopen = tp->fastopen_req;
 
 #ifdef CONFIG_TCP_MD5SIG
 	*md5 = tp->af_specific->md5_lookup(sk, sk);
@@ -636,6 +637,16 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 			remaining -= TCPOLEN_SACKPERM_ALIGNED;
 	}
 
+	if (fastopen && fastopen->cookie.len >= 0) {
+		u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
+		need = (need + 3) & ~3U;  /* Align to 32 bits */
+		if (remaining >= need) {
+			opts->options |= OPTION_FAST_OPEN_COOKIE;
+			opts->fastopen_cookie = &fastopen->cookie;
+			remaining -= need;
+			tp->syn_fastopen = 1;
+		}
+	}
 	/* Note that timestamps are required by the specification.
 	 *
 	 * Odd numbers of bytes are prohibited by the specification, ensuring
@@ -2824,6 +2835,96 @@ void tcp_connect_init(struct sock *sk)
 	tcp_clear_retrans(tp);
 }
 
+static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+	tcb->end_seq += skb->len;
+	skb_header_release(skb);
+	__tcp_add_write_queue_tail(sk, skb);
+	sk->sk_wmem_queued += skb->truesize;
+	sk_mem_charge(sk, skb->truesize);
+	tp->write_seq = tcb->end_seq;
+	tp->packets_out += tcp_skb_pcount(skb);
+}
+
+/* Build and send a SYN with data and (cached) Fast Open cookie. However,
+ * queue a data-only packet after the regular SYN, such that regular SYNs
+ * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
+ * only the SYN sequence, the data are retransmitted in the first ACK.
+ * If cookie is not cached or other error occurs, falls back to send a
+ * regular SYN with Fast Open cookie request option.
+ */
+static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_fastopen_request *fo = tp->fastopen_req;
+	int space, i, err = 0, iovlen = fo->data->msg_iovlen;
+	struct sk_buff *syn_data = NULL, *data;
+
+	tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie);
+	if (fo->cookie.len <= 0)
+		goto fallback;
+
+	/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
+	 * user-MSS. Reserve maximum option space for middleboxes that add
+	 * private TCP options. The cost is reduced data space in SYN :(
+	 */
+	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp)
+		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+	space = tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
+		MAX_TCP_OPTION_SPACE;
+
+	syn_data = skb_copy_expand(syn, skb_headroom(syn), space,
+				   sk->sk_allocation);
+	if (syn_data == NULL)
+		goto fallback;
+
+	for (i = 0; i < iovlen && syn_data->len < space; ++i) {
+		struct iovec *iov = &fo->data->msg_iov[i];
+		unsigned char __user *from = iov->iov_base;
+		int len = iov->iov_len;
+
+		if (syn_data->len + len > space)
+			len = space - syn_data->len;
+		else if (i + 1 == iovlen)
+			/* No more data pending in inet_wait_for_connect() */
+			fo->data = NULL;
+
+		if (skb_add_data(syn_data, from, len))
+			goto fallback;
+	}
+
+	/* Queue a data-only packet after the regular SYN for retransmission */
+	data = pskb_copy(syn_data, sk->sk_allocation);
+	if (data == NULL)
+		goto fallback;
+	TCP_SKB_CB(data)->seq++;
+	TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN;
+	TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH);
+	tcp_connect_queue_skb(sk, data);
+	fo->copied = data->len;
+
+	if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
+		goto done;
+	}
+	syn_data = NULL;
+
+fallback:
+	/* Send a regular SYN with Fast Open cookie request option */
+	if (fo->cookie.len > 0)
+		fo->cookie.len = 0;
+	err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
+	if (err)
+		tp->syn_fastopen = 0;
+	kfree_skb(syn_data);
+done:
+	fo->cookie.len = -1;  /* Exclude Fast Open option for SYN retries */
+	return err;
+}
+
 /* Build a SYN and send it off. */
 int tcp_connect(struct sock *sk)
 {
@@ -2841,17 +2942,13 @@ int tcp_connect(struct sock *sk)
 	skb_reserve(buff, MAX_TCP_HEADER);
 
 	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
+	tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp;
+	tcp_connect_queue_skb(sk, buff);
 	TCP_ECN_send_syn(sk, buff);
 
-	/* Send it off. */
-	TCP_SKB_CB(buff)->when = tcp_time_stamp;
-	tp->retrans_stamp = TCP_SKB_CB(buff)->when;
-	skb_header_release(buff);
-	__tcp_add_write_queue_tail(sk, buff);
-	sk->sk_wmem_queued += buff->truesize;
-	sk_mem_charge(sk, buff->truesize);
-	tp->packets_out += tcp_skb_pcount(buff);
-	err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
+	/* Send off SYN; include data in Fast Open. */
+	err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
+	      tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
 	if (err == -ECONNREFUSED)
 		return err;
 
-- 
1.7.7.3

^ permalink raw reply related

* [PATCH 2/7] net-tcp: Fast Open client - cookie cache
From: Yuchung Cheng @ 2012-07-16 21:16 UTC (permalink / raw)
  To: davem, hkchu, edumazet, ncardwell; +Cc: sivasankar, netdev, Yuchung Cheng
In-Reply-To: <1342473410-6265-1-git-send-email-ycheng@google.com>

The Fast Open cookie cache is used by a TCP Fast Open client to store
remote servers' Fast Open cookies. It stores one Fast Open cookie
per IP (v4 or v6) and by default 1024 cookies total. The size is
tunable via /proc/sys/net/ipv4/tcp_fastopen_cookies. Setting it to 0
will flush the cache.

The inetpeer cache also caches remote peer's information but the
in-active cache entries are recycled on the scale of minutes. Therefore
a separate storage is required but the lookup is done via inetpeer.
Each inetpeer entry holds a cookie cache entry pointer (if TFO is used
on that IP). On cache write, the cookie cache entry is allocated and
stored in a list for LRU replacement. A spinlock protects any R/W
operation on the cookie cache entry and the list.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
---
 include/net/inetpeer.h     |    2 +
 include/net/tcp.h          |    6 ++
 net/ipv4/inetpeer.c        |    2 +
 net/ipv4/sysctl_net_ipv4.c |   26 ++++++++
 net/ipv4/tcp_fastopen.c    |  140 ++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 176 insertions(+), 0 deletions(-)

diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index 53f464d..0240709 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -11,6 +11,7 @@
 #include <linux/init.h>
 #include <linux/jiffies.h>
 #include <linux/spinlock.h>
+#include <linux/tcp.h>
 #include <linux/rtnetlink.h>
 #include <net/ipv6.h>
 #include <linux/atomic.h>
@@ -53,6 +54,7 @@ struct inet_peer {
 		struct rcu_head         rcu;
 		struct inet_peer	*gc_next;
 	};
+	struct fastopen_entry		*fastopen;
 
 	/* following fields might be frequently dirtied */
 	__u32			dtime;	/* the time of last use of not referenced entries */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 87f486f..4b29688 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -385,6 +385,12 @@ enum tcp_tw_status {
 	TCP_TW_SYN = 3
 };
 
+/* From tcp_fastopen.c */
+extern int tcp_fastopen_size_cache(int size);
+extern void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
+				   struct tcp_fastopen_cookie *cookie);
+extern void tcp_fastopen_cache_set(struct sock *sk, u16 *mss,
+				   struct tcp_fastopen_cookie *cookie);
 
 extern enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw,
 						     struct sk_buff *skb,
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index e1e0a4e..9151ddc 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -63,6 +63,7 @@
  *		   usually under some other lock to prevent node disappearing
  *		daddr: unchangeable
  *		ip_id_count: atomic value (no lock needed)
+ *		fastopen_entry: TCP Fast Open cookie cache. See tcp_fastopen.c
  */
 
 static struct kmem_cache *peer_cachep __read_mostly;
@@ -511,6 +512,7 @@ relookup:
 		p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
 		p->rate_tokens = 0;
 		p->rate_last = 0;
+		p->fastopen = NULL;
 		INIT_LIST_HEAD(&p->gc_list);
 
 		/* Link the node. */
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 2a946a19..8d4571a 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -29,6 +29,7 @@
 static int zero;
 static int two = 2;
 static int tcp_retr1_max = 255;
+static int tcp_fastopen_cookies_max = 65536;
 static int ip_local_port_range_min[] = { 1, 1 };
 static int ip_local_port_range_max[] = { 65535, 65535 };
 static int tcp_adv_win_scale_min = -31;
@@ -220,6 +221,25 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write,
 	return 0;
 }
 
+static int proc_tcp_fastopen_cache_size(ctl_table *ctl, int write,
+					void __user *buffer, size_t *lenp,
+					loff_t *ppos)
+{
+	int ret;
+	int max_cookies = tcp_fastopen_size_cache(-1);
+	ctl_table tbl = {
+		.data = &max_cookies,
+		.maxlen = sizeof(max_cookies),
+		.extra1 = &zero,
+		.extra2 = &tcp_fastopen_cookies_max,
+	};
+
+	ret = proc_dointvec_minmax(&tbl, write, buffer, lenp, ppos);
+	if (write && ret == 0)
+		tcp_fastopen_size_cache(max_cookies);
+	return ret;
+}
+
 static struct ctl_table ipv4_table[] = {
 	{
 		.procname	= "tcp_timestamps",
@@ -374,6 +394,12 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 	{
+		.procname	= "tcp_fastopen_cookies",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_tcp_fastopen_cache_size,
+	},
+	{
 		.procname	= "tcp_tw_recycle",
 		.data		= &tcp_death_row.sysctl_tw_recycle,
 		.maxlen		= sizeof(int),
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index a7f729c..40fdf21 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -1,10 +1,150 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/tcp.h>
+#include <net/inetpeer.h>
 
 int sysctl_tcp_fastopen;
 
+/* The Fast Open cookie cache is used by a TCP Fast Open client to store
+ * remote servers' Fast Open cookies. It stores one Fast Open cookie
+ * per IP and by default 1024 cookies total. The size is tunable via
+ * /proc/sys/net/ipv4/tcp_fastopen_cookies. Setting it to 0 will flush
+ * the cache.
+ *
+ * The inetpeer cache also caches remote peer's information but the
+ * in-active cache entries are recycled on the scale of minutes. Therefore
+ * a separate storage is required but the lookup is done via inetpeer.
+ * Each inetpeer entry holds a cookie cache entry pointer (if TFO is used
+ * on that IP). On cache write, the cookie cache entry is allocated and
+ * stored in a list for LRU replacement. A spinlock protects any R/W
+ * operation on the cookie cache entry and the list.
+ */
+struct fastopen_entry {
+	u16	mss;			/* TCP MSS value */
+	struct	tcp_fastopen_cookie	cookie;	/* TCP Fast Open cookie */
+	struct	list_head	lru_list;	/* cookie cache lru_list node */
+	struct	inet_peer	*peer;	/* inetpeer entry (for fast lookup) */
+};
+
+static struct tcp_fastopen_cookie_cache {
+	spinlock_t lock;		/* for lru_list, cnt, size, entries */
+	struct list_head lru_list;	/* head is the least recently used */
+	int cnt;			/* size of lru_list */
+	int size;			/* cache capacity */
+} cookie_cache;
+
+/* Evict the LRU entry if cache is full. Caller must hold cooke_cache.lock */
+static struct fastopen_entry *__tcp_fastopen_remove_lru(void)
+{
+	struct fastopen_entry *entry;
+
+	if (cookie_cache.cnt <= cookie_cache.size || cookie_cache.cnt <= 0)
+		return NULL;
+
+	entry = list_first_entry(&cookie_cache.lru_list,
+				 struct fastopen_entry, lru_list);
+	list_del_init(&entry->lru_list);
+	--cookie_cache.cnt;
+	entry->peer->fastopen = NULL;
+	return entry;
+}
+
+int tcp_fastopen_size_cache(int size)
+{
+	while (size >= 0) {
+		struct fastopen_entry *lru_entry;
+
+		spin_lock_bh(&cookie_cache.lock);
+		cookie_cache.size = size;
+		lru_entry = __tcp_fastopen_remove_lru();
+		spin_unlock_bh(&cookie_cache.lock);
+
+		if (lru_entry == NULL)
+			break;
+		inet_putpeer(lru_entry->peer);
+		kfree(lru_entry);
+	}
+	return cookie_cache.size;
+}
+
+static struct inet_peer *tcp_fastopen_inetpeer(struct sock *sk, int create)
+{
+	struct net *net = dev_net(__sk_dst_get(sk)->dev);
+
+	if (sk->sk_family == AF_INET)
+		return inet_getpeer_v4(net->ipv4.peers,
+				       inet_sk(sk)->inet_daddr, create);
+	else if (sk->sk_family == AF_INET6)
+		return inet_getpeer_v6(net->ipv6.peers,
+				       &inet6_sk(sk)->daddr, create);
+	return NULL;
+}
+
+void tcp_fastopen_cache_get(struct sock *sk, u32 *mss,
+			    struct tcp_fastopen_cookie *cookie)
+{
+	struct inet_peer *peer = tcp_fastopen_inetpeer(sk, 0);
+	struct fastopen_entry *entry;
+
+	if (peer == NULL)
+		return;
+
+	spin_lock_bh(&cookie_cache.lock);
+	entry = peer->fastopen;
+	if (entry != NULL) {
+		*mss = entry->mss;
+		*cookie = entry->cookie;
+		list_move_tail(&entry->lru_list, &cookie_cache.lru_list);
+	}
+	spin_unlock_bh(&cookie_cache.lock);
+
+	inet_putpeer(peer);
+}
+
+void tcp_fastopen_cache_set(struct sock *sk, u32 *mss,
+			    struct tcp_fastopen_cookie *cookie)
+{
+	struct inet_peer *peer = tcp_fastopen_inetpeer(sk, 1);
+	struct fastopen_entry *entry = NULL, *new_entry = NULL;
+
+	if (peer == NULL)
+		return;
+
+	spin_lock_bh(&cookie_cache.lock);
+	if (peer->fastopen == NULL) {
+		new_entry = kmalloc(sizeof(struct fastopen_entry), GFP_ATOMIC);
+		if (new_entry == NULL) {
+			spin_unlock_bh(&cookie_cache.lock);
+			goto out;
+		}
+		new_entry->peer = peer;
+		INIT_LIST_HEAD(&new_entry->lru_list);
+		peer->fastopen = new_entry;
+		++cookie_cache.cnt;
+	}
+	entry = peer->fastopen;
+	entry->mss = *mss;
+	if (cookie->len > 0)
+		entry->cookie = *cookie;
+	list_move_tail(&entry->lru_list, &cookie_cache.lru_list);
+	entry = __tcp_fastopen_remove_lru();
+	spin_unlock_bh(&cookie_cache.lock);
+
+	if (entry) {
+		inet_putpeer(entry->peer);
+		kfree(entry);
+	}
+out:
+	if (new_entry == NULL)
+		inet_putpeer(peer);
+}
+
 static int __init tcp_fastopen_init(void)
 {
+	INIT_LIST_HEAD(&cookie_cache.lru_list);
+	spin_lock_init(&cookie_cache.lock);
+	cookie_cache.size = 1024;
 	return 0;
 }
 
-- 
1.7.7.3

^ permalink raw reply related

* [PATCH 4/7] net-tcp: Fast Open client - receiving SYN-ACK
From: Yuchung Cheng @ 2012-07-16 21:16 UTC (permalink / raw)
  To: davem, hkchu, edumazet, ncardwell; +Cc: sivasankar, netdev, Yuchung Cheng
In-Reply-To: <1342473410-6265-1-git-send-email-ycheng@google.com>

On receiving the SYN-ACK after SYN-data, the client needs to
a) update the cached MSS and cookie (if included in SYN-ACK)
b) retransmit the data yet acknowledged by the SYN-ACK in the final ACK of
   the handshake.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
---
 net/ipv4/tcp_input.c |   40 +++++++++++++++++++++++++++++++++++-----
 1 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d6e16e2..5b09f71 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5610,6 +5610,34 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
 	}
 }
 
+static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
+				    struct tcp_fastopen_cookie *cookie)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *data = tcp_write_queue_head(sk);
+	u16 mss = tp->rx_opt.mss_clamp;
+
+	if (mss == tp->rx_opt.user_mss) {
+		struct tcp_options_received opt;
+		const u8 *hash_location;
+
+		/* Get original SYNACK MSS value if user MSS sets mss_clamp */
+		tcp_clear_options(&opt);
+		opt.user_mss = opt.mss_clamp = 0;
+		tcp_parse_options(synack, &opt, &hash_location, 0, NULL);
+		mss = opt.mss_clamp;
+	}
+
+	tcp_fastopen_cache_set(sk, &mss, cookie);
+
+	if (data) { /* Retransmit unacked data in SYN */
+		tcp_retransmit_skb(sk, data);
+		tcp_rearm_rto(sk);
+		return true;
+	}
+	return false;
+}
+
 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 					 const struct tcphdr *th, unsigned int len)
 {
@@ -5617,9 +5645,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_cookie_values *cvp = tp->cookie_values;
+	struct tcp_fastopen_cookie foc = { .len = -1 };
 	int saved_clamp = tp->rx_opt.mss_clamp;
 
-	tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, NULL);
+	tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, &foc);
 
 	if (th->ack) {
 		/* rfc793:
@@ -5629,11 +5658,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		 *	  If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
 		 *        a reset (unless the RST bit is set, if so drop
 		 *        the segment and return)"
-		 *
-		 *  We do not send data with SYN, so that RFC-correct
-		 *  test reduces to:
 		 */
-		if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
+		if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
+		    after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
 			goto reset_and_undo;
 
 		if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
@@ -5745,6 +5772,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 
 		tcp_finish_connect(sk, skb);
 
+		if (tp->syn_fastopen && tcp_rcv_fastopen_synack(sk, skb, &foc))
+			return -1;
+
 		if (sk->sk_write_pending ||
 		    icsk->icsk_accept_queue.rskq_defer_accept ||
 		    icsk->icsk_ack.pingpong) {
-- 
1.7.7.3

^ permalink raw reply related

* [PATCH 0/7] TCP Fast Open client
From: Yuchung Cheng @ 2012-07-16 21:16 UTC (permalink / raw)
  To: davem, hkchu, edumazet, ncardwell; +Cc: sivasankar, netdev, Yuchung Cheng

This patch series implement the client functionality of TCP Fast Open.
TCP Fast Open (TFO) allows data to be carried in the SYN and SYN-ACK
packets and consumed by the receiving end during the initial connection
handshake, thus providing a saving of up to one full round trip time (RTT)
compared to standard TCP requiring a three-way handshake (3WHS) to
complete before data can be exchanged.

The protocol change is detailed in the IETF internet draft at
http://www.ietf.org/id/draft-ietf-tcpm-fastopen-00.txt . The research
paper (http://conferences.sigcomm.org/co-next/2011/papers/1569470463.pdf)
studied the performance impact of HTTP using Fast Open, based on this
Linux implementation and the Chrome browser.

To use Fast Open, the client application (active SYN sender) must 
replace connect() socket call with sendmsg() or sendto() with the new
MSG_FASTOPEN flag. If the server supports Fast Open the data exchange
starts at TCP handshake. Otherwise the connection will automatically
fall back to conventional TCP.

Yuchung Cheng (7):
  net-tcp: Fast Open base
  net-tcp: Fast Open client - cookie cache
  net-tcp: Fast Open client - sending SYN-data
  net-tcp: Fast Open client - receiving SYN-ACK
  net-tcp: Fast Open client - sendmsg(MSG_FASTOPEN)
  net-tcp: Fast Open client - detecting SYN-data drops
  net-tcp: Fast Open client - cookie-less mode

 Documentation/networking/ip-sysctl.txt |   13 +++
 include/linux/snmp.h                   |    3 +-
 include/linux/socket.h                 |    1 +
 include/linux/tcp.h                    |   17 +++-
 include/net/inet_common.h              |    6 +-
 include/net/inetpeer.h                 |    2 +
 include/net/tcp.h                      |   30 ++++++-
 net/ipv4/Makefile                      |    2 +-
 net/ipv4/af_inet.c                     |   26 ++++-
 net/ipv4/inetpeer.c                    |    2 +
 net/ipv4/proc.c                        |    1 +
 net/ipv4/syncookies.c                  |    2 +-
 net/ipv4/sysctl_net_ipv4.c             |   33 +++++++
 net/ipv4/tcp.c                         |   61 +++++++++++-
 net/ipv4/tcp_fastopen.c                |  163 ++++++++++++++++++++++++++++++++
 net/ipv4/tcp_input.c                   |   76 +++++++++++++--
 net/ipv4/tcp_ipv4.c                    |    5 +-
 net/ipv4/tcp_minisocks.c               |    4 +-
 net/ipv4/tcp_output.c                  |  153 +++++++++++++++++++++++++++---
 net/ipv6/syncookies.c                  |    2 +-
 net/ipv6/tcp_ipv6.c                    |    2 +-
 21 files changed, 561 insertions(+), 43 deletions(-)
 create mode 100644 net/ipv4/tcp_fastopen.c

-- 
1.7.7.3

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox