Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 06/10] cxgb4vf: fix up "Section Mismatch" compiler warning.
From: Casey Leedom @ 2010-11-11  2:04 UTC (permalink / raw)
  To: netdev; +Cc: davem, Casey Leedom
In-Reply-To: <450C6E98-D0D9-42A6-9F15-4DB98647DB04@chelsio.com>

Fix up "Section Mismatch" compiler warning and mark another routine as
__devinit.

Signed-off-by: Casey Leedom <leedom@chelsio.com>
---
 drivers/net/cxgb4vf/cxgb4vf_main.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/cxgb4vf/cxgb4vf_main.c b/drivers/net/cxgb4vf/cxgb4vf_main.c
index 75b85ca..b5c0bff 100644
--- a/drivers/net/cxgb4vf/cxgb4vf_main.c
+++ b/drivers/net/cxgb4vf/cxgb4vf_main.c
@@ -2032,7 +2032,7 @@ static int __devinit setup_debugfs(struct adapter *adapter)
  * Tear down the /sys/kernel/debug/cxgb4vf sub-nodes created above.  We leave
  * it to our caller to tear down the directory (debugfs_root).
  */
-static void __devexit cleanup_debugfs(struct adapter *adapter)
+static void cleanup_debugfs(struct adapter *adapter)
 {
 	BUG_ON(adapter->debugfs_root == NULL);
 
@@ -2050,7 +2050,7 @@ static void __devexit cleanup_debugfs(struct adapter *adapter)
  * adapter parameters we're going to be using and initialize basic adapter
  * hardware support.
  */
-static int adap_init0(struct adapter *adapter)
+static int __devinit adap_init0(struct adapter *adapter)
 {
 	struct vf_resources *vfres = &adapter->params.vfres;
 	struct sge_params *sge_params = &adapter->params.sge;
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 05/10] cxgb4vf: flesh out PCI Device ID Table ...
From: Casey Leedom @ 2010-11-11  2:04 UTC (permalink / raw)
  To: netdev; +Cc: davem, Casey Leedom
In-Reply-To: <450C6E98-D0D9-42A6-9F15-4DB98647DB04@chelsio.com>

Add a bunch of T4 Device IDs for the VF Driver.

Signed-off-by: Casey Leedom <leedom@chelsio.com>
---
 drivers/net/cxgb4vf/cxgb4vf_main.c |    8 ++++++++
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/drivers/net/cxgb4vf/cxgb4vf_main.c b/drivers/net/cxgb4vf/cxgb4vf_main.c
index 9e9e4f6..75b85ca 100644
--- a/drivers/net/cxgb4vf/cxgb4vf_main.c
+++ b/drivers/net/cxgb4vf/cxgb4vf_main.c
@@ -2834,6 +2834,14 @@ static struct pci_device_id cxgb4vf_pci_tbl[] = {
 	CH_DEVICE(0x4800, 0),	/* T440-dbg */
 	CH_DEVICE(0x4801, 0),	/* T420-cr */
 	CH_DEVICE(0x4802, 0),	/* T422-cr */
+	CH_DEVICE(0x4803, 0),	/* T440-cr */
+	CH_DEVICE(0x4804, 0),	/* T420-bch */
+	CH_DEVICE(0x4805, 0),   /* T440-bch */
+	CH_DEVICE(0x4806, 0),	/* T460-ch */
+	CH_DEVICE(0x4807, 0),	/* T420-so */
+	CH_DEVICE(0x4808, 0),	/* T420-cx */
+	CH_DEVICE(0x4809, 0),	/* T420-bt */
+	CH_DEVICE(0x480a, 0),   /* T404-bt */
 	{ 0, }
 };
 
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 07/10] cxgb4vf: Fail open if link_start() fails.
From: Casey Leedom @ 2010-11-11  2:04 UTC (permalink / raw)
  To: netdev; +Cc: davem, Casey Leedom
In-Reply-To: <450C6E98-D0D9-42A6-9F15-4DB98647DB04@chelsio.com>

Fail open if link_start() fails.

Signed-off-by: Casey Leedom <leedom@chelsio.com>
---
 drivers/net/cxgb4vf/cxgb4vf_main.c |    4 +++-
 1 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/drivers/net/cxgb4vf/cxgb4vf_main.c b/drivers/net/cxgb4vf/cxgb4vf_main.c
index b5c0bff..2d09e83 100644
--- a/drivers/net/cxgb4vf/cxgb4vf_main.c
+++ b/drivers/net/cxgb4vf/cxgb4vf_main.c
@@ -753,7 +753,9 @@ static int cxgb4vf_open(struct net_device *dev)
 	if (err)
 		return err;
 	set_bit(pi->port_id, &adapter->open_device_map);
-	link_start(dev);
+	err = link_start(dev);
+	if (err)
+		return err;
 	netif_tx_start_all_queues(dev);
 	return 0;
 }
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 08/10] cxgb4vf: Advertise NETIF_F_TSO_ECN.
From: Casey Leedom @ 2010-11-11  2:04 UTC (permalink / raw)
  To: netdev; +Cc: davem, Casey Leedom
In-Reply-To: <450C6E98-D0D9-42A6-9F15-4DB98647DB04@chelsio.com>

Advertise NETIF_F_TSO_ECN.

Signed-off-by: Casey Leedom <leedom@chelsio.com>
---
 drivers/net/cxgb4vf/cxgb4vf_main.c |   11 ++++++++---
 1 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/net/cxgb4vf/cxgb4vf_main.c b/drivers/net/cxgb4vf/cxgb4vf_main.c
index 2d09e83..15b14ef 100644
--- a/drivers/net/cxgb4vf/cxgb4vf_main.c
+++ b/drivers/net/cxgb4vf/cxgb4vf_main.c
@@ -1536,14 +1536,19 @@ static void cxgb4vf_get_wol(struct net_device *dev,
 }
 
 /*
+ * TCP Segmentation Offload flags which we support.
+ */
+#define TSO_FLAGS (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_TSO_ECN)
+
+/*
  * Set TCP Segmentation Offloading feature capabilities.
  */
 static int cxgb4vf_set_tso(struct net_device *dev, u32 tso)
 {
 	if (tso)
-		dev->features |= NETIF_F_TSO | NETIF_F_TSO6;
+		dev->features |= TSO_FLAGS;
 	else
-		dev->features &= ~(NETIF_F_TSO | NETIF_F_TSO6);
+		dev->features &= ~TSO_FLAGS;
 	return 0;
 }
 
@@ -2596,7 +2601,7 @@ static int __devinit cxgb4vf_pci_probe(struct pci_dev *pdev,
 		netif_carrier_off(netdev);
 		netdev->irq = pdev->irq;
 
-		netdev->features = (NETIF_F_SG | NETIF_F_TSO | NETIF_F_TSO6 |
+		netdev->features = (NETIF_F_SG | TSO_FLAGS |
 				    NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
 				    NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX |
 				    NETIF_F_GRO);
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 09/10] cxgb4vf: Mark "UDP [RSS Hash] Enable" as a 1-bit field.
From: Casey Leedom @ 2010-11-11  2:04 UTC (permalink / raw)
  To: netdev; +Cc: davem, Casey Leedom
In-Reply-To: <450C6E98-D0D9-42A6-9F15-4DB98647DB04@chelsio.com>

Mark the UDP RSS Hash Enable field as 1-bit in length.  Also clean up
formatting from previous changeset which changed the RSS 1-bit fields from
"int" to "unsigned int".

Signed-off-by: Casey Leedom <leedom@chelsio.com>
---
 drivers/net/cxgb4vf/t4vf_common.h |   28 ++++++++++++++--------------
 1 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/drivers/net/cxgb4vf/t4vf_common.h b/drivers/net/cxgb4vf/t4vf_common.h
index 873cb7d..659616b 100644
--- a/drivers/net/cxgb4vf/t4vf_common.h
+++ b/drivers/net/cxgb4vf/t4vf_common.h
@@ -132,15 +132,15 @@ struct rss_params {
 	unsigned int mode;		/* RSS mode */
 	union {
 	    struct {
-		unsigned int synmapen:1;	/* SYN Map Enable */
-		unsigned int syn4tupenipv6:1;	/* enable hashing 4-tuple IPv6 SYNs */
-		unsigned int syn2tupenipv6:1;	/* enable hashing 2-tuple IPv6 SYNs */
-		unsigned int syn4tupenipv4:1;	/* enable hashing 4-tuple IPv4 SYNs */
-		unsigned int syn2tupenipv4:1;	/* enable hashing 2-tuple IPv4 SYNs */
-		unsigned int ofdmapen:1;	/* Offload Map Enable */
-		unsigned int tnlmapen:1;	/* Tunnel Map Enable */
-		unsigned int tnlalllookup:1;	/* Tunnel All Lookup */
-		unsigned int hashtoeplitz:1;	/* use Toeplitz hash */
+		uint synmapen:1;	/* SYN Map Enable */
+		uint syn4tupenipv6:1;	/* enable hashing 4-tuple IPv6 SYNs */
+		uint syn2tupenipv6:1;	/* enable hashing 2-tuple IPv6 SYNs */
+		uint syn4tupenipv4:1;	/* enable hashing 4-tuple IPv4 SYNs */
+		uint syn2tupenipv4:1;	/* enable hashing 2-tuple IPv4 SYNs */
+		uint ofdmapen:1;	/* Offload Map Enable */
+		uint tnlmapen:1;	/* Tunnel Map Enable */
+		uint tnlalllookup:1;	/* Tunnel All Lookup */
+		uint hashtoeplitz:1;	/* use Toeplitz hash */
 	    } basicvirtual;
 	} u;
 };
@@ -151,11 +151,11 @@ struct rss_params {
 union rss_vi_config {
     struct {
 	u16 defaultq;			/* Ingress Queue ID for !tnlalllookup */
-	unsigned int ip6fourtupen:1;	/* hash 4-tuple IPv6 ingress packets */
-	unsigned int ip6twotupen:1;	/* hash 2-tuple IPv6 ingress packets */
-	unsigned int ip4fourtupen:1;	/* hash 4-tuple IPv4 ingress packets */
-	unsigned int ip4twotupen:1;	/* hash 2-tuple IPv4 ingress packets */
-	int udpen;			/* hash 4-tuple UDP ingress packets */
+	uint ip6fourtupen:1;		/* hash 4-tuple IPv6 ingress packets */
+	uint ip6twotupen:1;		/* hash 2-tuple IPv6 ingress packets */
+	uint ip4fourtupen:1;		/* hash 4-tuple IPv4 ingress packets */
+	uint ip4twotupen:1;		/* hash 2-tuple IPv4 ingress packets */
+	uint udpen:1;			/* hash 4-tuple UDP ingress packets */
     } basicvirtual;
 };
 
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 10/10] cxgb4vf: add call to Firmware to reset VF State.
From: Casey Leedom @ 2010-11-11  2:04 UTC (permalink / raw)
  To: netdev; +Cc: davem, Casey Leedom
In-Reply-To: <450C6E98-D0D9-42A6-9F15-4DB98647DB04@chelsio.com>

Add call to Firmware to reset its VF State when we first attach to the VF.

Signed-off-by: Casey Leedom <leedom@chelsio.com>
---
 drivers/net/cxgb4vf/cxgb4vf_main.c |   16 ++++++++++++++++
 drivers/net/cxgb4vf/t4vf_common.h  |    1 +
 drivers/net/cxgb4vf/t4vf_hw.c      |   19 +++++++++++++++++++
 3 files changed, 36 insertions(+), 0 deletions(-)

diff --git a/drivers/net/cxgb4vf/cxgb4vf_main.c b/drivers/net/cxgb4vf/cxgb4vf_main.c
index 15b14ef..3456a9b 100644
--- a/drivers/net/cxgb4vf/cxgb4vf_main.c
+++ b/drivers/net/cxgb4vf/cxgb4vf_main.c
@@ -2076,6 +2076,22 @@ static int __devinit adap_init0(struct adapter *adapter)
 	}
 
 	/*
+	 * Some environments do not properly handle PCIE FLRs -- e.g. in Linux
+	 * 2.6.31 and later we can't call pci_reset_function() in order to
+	 * issue an FLR because of a self- deadlock on the device semaphore.
+	 * Meanwhile, the OS infrastructure doesn't issue FLRs in all the
+	 * cases where they're needed -- for instance, some versions of KVM
+	 * fail to reset "Assigned Devices" when the VM reboots.  Therefore we
+	 * use the firmware based reset in order to reset any per function
+	 * state.
+	 */
+ 	err = t4vf_fw_reset(adapter);
+	if (err < 0) {
+		dev_err(adapter->pdev_dev, "FW reset failed: err=%d\n", err);
+		return err;
+	}
+
+	/*
 	 * Grab basic operational parameters.  These will predominantly have
 	 * been set up by the Physical Function Driver or will be hard coded
 	 * into the adapter.  We just have to live with them ...  Note that
diff --git a/drivers/net/cxgb4vf/t4vf_common.h b/drivers/net/cxgb4vf/t4vf_common.h
index 659616b..7541a60 100644
--- a/drivers/net/cxgb4vf/t4vf_common.h
+++ b/drivers/net/cxgb4vf/t4vf_common.h
@@ -235,6 +235,7 @@ static inline int t4vf_wr_mbox_ns(struct adapter *adapter, const void *cmd,
 int __devinit t4vf_wait_dev_ready(struct adapter *);
 int __devinit t4vf_port_init(struct adapter *, int);
 
+int t4vf_fw_reset(struct adapter *);
 int t4vf_query_params(struct adapter *, unsigned int, const u32 *, u32 *);
 int t4vf_set_params(struct adapter *, unsigned int, const u32 *, const u32 *);
 
diff --git a/drivers/net/cxgb4vf/t4vf_hw.c b/drivers/net/cxgb4vf/t4vf_hw.c
index 2180181..591f161 100644
--- a/drivers/net/cxgb4vf/t4vf_hw.c
+++ b/drivers/net/cxgb4vf/t4vf_hw.c
@@ -326,6 +326,25 @@ int __devinit t4vf_port_init(struct adapter *adapter, int pidx)
 }
 
 /**
+ *      t4vf_fw_reset - issue a reset to FW
+ *      @adapter: the adapter
+ *
+ *	Issues a reset command to FW.  For a Physical Function this would
+ *	result in the Firmware reseting all of its state.  For a Virtual
+ *	Function this just resets the state associated with the VF.
+ */
+int t4vf_fw_reset(struct adapter *adapter)
+{
+	struct fw_reset_cmd cmd;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.op_to_write = cpu_to_be32(FW_CMD_OP(FW_RESET_CMD) |
+				      FW_CMD_WRITE);
+	cmd.retval_len16 = cpu_to_be32(FW_LEN16(cmd));
+	return t4vf_wr_mbox(adapter, &cmd, sizeof(cmd), NULL);
+}
+
+/**
  *	t4vf_query_params - query FW or device parameters
  *	@adapter: the adapter
  *	@nparams: the number of parameters
-- 
1.7.0.4


^ permalink raw reply related

* Re: [PATCH 01/10] cxgb4vf: minor comment/symbolic name cleanup.
From: Joe Perches @ 2010-11-11  3:19 UTC (permalink / raw)
  To: Casey Leedom; +Cc: netdev, davem
In-Reply-To: <1289441052-4925-1-git-send-email-leedom@chelsio.com>

On Wed, 2010-11-10 at 18:04 -0800, Casey Leedom wrote:
> Minor cleanup of comments and symbolic constant names for clarity.
> diff --git a/drivers/net/cxgb4vf/cxgb4vf_main.c b/drivers/net/cxgb4vf/cxgb4vf_main.c
> index 6de5e2e..f5259a1 100644
> --- a/drivers/net/cxgb4vf/cxgb4vf_main.c
> +++ b/drivers/net/cxgb4vf/cxgb4vf_main.c
> @@ -280,7 +280,7 @@ static void name_msix_vecs(struct adapter *adapter)
>  		const struct port_info *pi = netdev_priv(dev);
>  		int qs, msi;
>  
> -		for (qs = 0, msi = MSIX_NIQFLINT;
> +		for (qs = 0, msi = MSIX_IQFLINT;
>  		     qs < pi->nqsets;
>  		     qs++, msi++) {

This for now fits on a single line.

> diff --git a/drivers/net/cxgb4vf/t4vf_hw.c b/drivers/net/cxgb4vf/t4vf_hw.c
> index ea1c123..2180181 100644
> --- a/drivers/net/cxgb4vf/t4vf_hw.c
> +++ b/drivers/net/cxgb4vf/t4vf_hw.c
> @@ -1257,7 +1257,7 @@ int t4vf_eth_eq_free(struct adapter *adapter, unsigned int eqid)
>   */
>  int t4vf_handle_fw_rpl(struct adapter *adapter, const __be64 *rpl)
> -	struct fw_cmd_hdr *cmd_hdr = (struct fw_cmd_hdr *)rpl;
> +	const struct fw_cmd_hdr *cmd_hdr = (const struct fw_cmd_hdr *)rpl;
> @@ -1265,7 +1265,7 @@ int t4vf_handle_fw_rpl(struct adapter *adapter, const __be64 *rpl)
> -		const struct fw_port_cmd *port_cmd = (void *)rpl;
> +		const struct fw_port_cmd *port_cmd = (const void *)rpl;

might be better to have a consistent casting style.
1st uses a direct cast, 2nd an implicit one.


^ permalink raw reply

* [PATCH] drivers/isdn/hisax: Add printf format/argument verification and fix fallout
From: Joe Perches @ 2010-11-11  4:54 UTC (permalink / raw)
  To: linux-kernel; +Cc: Karsten Keil, netdev

Add __attribute__((format... to several functins
Make formats and arguments match.

Signed-off-by: Joe Perches <joe@perches.com>
---
 drivers/isdn/hisax/avm_pci.c   |    2 +-
 drivers/isdn/hisax/callc.c     |    4 ++--
 drivers/isdn/hisax/hfc_2bds0.c |    4 ++--
 drivers/isdn/hisax/hfc_2bs0.c  |    2 +-
 drivers/isdn/hisax/hfc_pci.c   |    4 ++--
 drivers/isdn/hisax/hfc_sx.c    |    6 +++---
 drivers/isdn/hisax/hisax.h     |    2 ++
 drivers/isdn/hisax/ipacx.c     |    2 +-
 drivers/isdn/hisax/isar.c      |    6 +++---
 drivers/isdn/hisax/isdnl1.h    |    1 +
 drivers/isdn/hisax/isdnl3.c    |    2 +-
 drivers/isdn/hisax/netjet.c    |   10 +++++-----
 drivers/isdn/hisax/st5481_d.c  |    6 ++++--
 13 files changed, 28 insertions(+), 23 deletions(-)

diff --git a/drivers/isdn/hisax/avm_pci.c b/drivers/isdn/hisax/avm_pci.c
index fcf4ed1..0e66af1 100644
--- a/drivers/isdn/hisax/avm_pci.c
+++ b/drivers/isdn/hisax/avm_pci.c
@@ -314,7 +314,7 @@ hdlc_fill_fifo(struct BCState *bcs)
 			bcs->hw.hdlc.ctrl.sr.cmd |= HDLC_CMD_XME;
 	}
 	if ((cs->debug & L1_DEB_HSCX) && !(cs->debug & L1_DEB_HSCX_FIFO))
-		debugl1(cs, "hdlc_fill_fifo %d/%ld", count, bcs->tx_skb->len);
+		debugl1(cs, "hdlc_fill_fifo %d/%u", count, bcs->tx_skb->len);
 	p = bcs->tx_skb->data;
 	ptr = (u_int *)p;
 	skb_pull(bcs->tx_skb, count);
diff --git a/drivers/isdn/hisax/callc.c b/drivers/isdn/hisax/callc.c
index f150330..37e685e 100644
--- a/drivers/isdn/hisax/callc.c
+++ b/drivers/isdn/hisax/callc.c
@@ -65,7 +65,7 @@ hisax_findcard(int driverid)
 	return (struct IsdnCardState *) 0;
 }
 
-static void
+static __attribute__((format(printf, 3, 4))) void
 link_debug(struct Channel *chanp, int direction, char *fmt, ...)
 {
 	va_list args;
@@ -1068,7 +1068,7 @@ init_d_st(struct Channel *chanp)
 	return 0;
 }
 
-static void
+static __attribute__((format(printf, 2, 3))) void
 callc_debug(struct FsmInst *fi, char *fmt, ...)
 {
 	va_list args;
diff --git a/drivers/isdn/hisax/hfc_2bds0.c b/drivers/isdn/hisax/hfc_2bds0.c
index 7250f56..a16459a 100644
--- a/drivers/isdn/hisax/hfc_2bds0.c
+++ b/drivers/isdn/hisax/hfc_2bds0.c
@@ -292,7 +292,7 @@ hfc_fill_fifo(struct BCState *bcs)
 	}
 	count = GetFreeFifoBytes_B(bcs);
 	if (cs->debug & L1_DEB_HSCX)
-		debugl1(cs, "hfc_fill_fifo %d count(%ld/%d),%lx",
+		debugl1(cs, "hfc_fill_fifo %d count(%u/%d),%lx",
 			bcs->channel, bcs->tx_skb->len,
 			count, current->state);
 	if (count < bcs->tx_skb->len) {
@@ -719,7 +719,7 @@ hfc_fill_dfifo(struct IsdnCardState *cs)
 	}
 	count = GetFreeFifoBytes_D(cs);
 	if (cs->debug & L1_DEB_ISAC)
-		debugl1(cs, "hfc_fill_Dfifo count(%ld/%d)",
+		debugl1(cs, "hfc_fill_Dfifo count(%u/%d)",
 			cs->tx_skb->len, count);
 	if (count < cs->tx_skb->len) {
 		if (cs->debug & L1_DEB_ISAC)
diff --git a/drivers/isdn/hisax/hfc_2bs0.c b/drivers/isdn/hisax/hfc_2bs0.c
index b1f6481..626f85d 100644
--- a/drivers/isdn/hisax/hfc_2bs0.c
+++ b/drivers/isdn/hisax/hfc_2bs0.c
@@ -282,7 +282,7 @@ hfc_fill_fifo(struct BCState *bcs)
 	    count += cs->hw.hfc.fifosize; 
 	} /* L1_MODE_TRANS */
 	if (cs->debug & L1_DEB_HSCX)
-		debugl1(cs, "hfc_fill_fifo %d count(%ld/%d)",
+		debugl1(cs, "hfc_fill_fifo %d count(%u/%d)",
 			bcs->channel, bcs->tx_skb->len,
 			count);
 	if (count < bcs->tx_skb->len) {
diff --git a/drivers/isdn/hisax/hfc_pci.c b/drivers/isdn/hisax/hfc_pci.c
index 917cc84..3147020 100644
--- a/drivers/isdn/hisax/hfc_pci.c
+++ b/drivers/isdn/hisax/hfc_pci.c
@@ -550,7 +550,7 @@ hfcpci_fill_dfifo(struct IsdnCardState *cs)
 		count += D_FIFO_SIZE;	/* count now contains available bytes */
 
 	if (cs->debug & L1_DEB_ISAC)
-		debugl1(cs, "hfcpci_fill_Dfifo count(%ld/%d)",
+		debugl1(cs, "hfcpci_fill_Dfifo count(%u/%d)",
 			cs->tx_skb->len, count);
 	if (count < cs->tx_skb->len) {
 		if (cs->debug & L1_DEB_ISAC)
@@ -681,7 +681,7 @@ hfcpci_fill_fifo(struct BCState *bcs)
 		count += B_FIFO_SIZE;	/* count now contains available bytes */
 
 	if (cs->debug & L1_DEB_HSCX)
-		debugl1(cs, "hfcpci_fill_fifo %d count(%ld/%d),%lx",
+		debugl1(cs, "hfcpci_fill_fifo %d count(%u/%d),%lx",
 			bcs->channel, bcs->tx_skb->len,
 			count, current->state);
 
diff --git a/drivers/isdn/hisax/hfc_sx.c b/drivers/isdn/hisax/hfc_sx.c
index 5aa138e..1235b71 100644
--- a/drivers/isdn/hisax/hfc_sx.c
+++ b/drivers/isdn/hisax/hfc_sx.c
@@ -179,7 +179,7 @@ write_fifo(struct IsdnCardState *cs, struct sk_buff *skb, u_char fifo, int trans
 	  count += fifo_size;	/* count now contains available bytes */
 
 	if (cs->debug & L1_DEB_ISAC_FIFO)
-	  debugl1(cs, "hfcsx_write_fifo %d count(%ld/%d)",
+	  debugl1(cs, "hfcsx_write_fifo %d count(%u/%d)",
 		  fifo, skb->len, count);
 	if (count < skb->len) {
 	  if (cs->debug & L1_DEB_ISAC_FIFO)
@@ -265,7 +265,7 @@ read_fifo(struct IsdnCardState *cs, u_char fifo, int trans_max)
 	  count++;
 
 	  if (cs->debug & L1_DEB_ISAC_FIFO)
-	    debugl1(cs, "hfcsx_read_fifo %d count %ld)",
+	    debugl1(cs, "hfcsx_read_fifo %d count %u)",
 		    fifo, count);
 
 	  if ((count > fifo_size) || (count < 4)) {
@@ -986,7 +986,7 @@ HFCSX_l1hw(struct PStack *st, int pr, void *arg)
 				default:
 					spin_unlock_irqrestore(&cs->lock, flags);
 					if (cs->debug & L1_DEB_WARN)
-						debugl1(cs, "hfcsx_l1hw loop invalid %4lx", arg);
+						debugl1(cs, "hfcsx_l1hw loop invalid %4lx", (unsigned long)arg);
 					return;
 			}
 			cs->hw.hfcsx.trm |= 0x80;	/* enable IOM-loop */
diff --git a/drivers/isdn/hisax/hisax.h b/drivers/isdn/hisax/hisax.h
index 32ab392..de1c669 100644
--- a/drivers/isdn/hisax/hisax.h
+++ b/drivers/isdn/hisax/hisax.h
@@ -1286,7 +1286,9 @@ int jiftime(char *s, long mark);
 
 int HiSax_command(isdn_ctrl * ic);
 int HiSax_writebuf_skb(int id, int chan, int ack, struct sk_buff *skb);
+__attribute__((format(printf, 3, 4)))
 void HiSax_putstatus(struct IsdnCardState *cs, char *head, char *fmt, ...);
+__attribute__((format(printf, 3, 0)))
 void VHiSax_putstatus(struct IsdnCardState *cs, char *head, char *fmt, va_list args);
 void HiSax_reportcard(int cardnr, int sel);
 int QuickHex(char *txt, u_char * p, int cnt);
diff --git a/drivers/isdn/hisax/ipacx.c b/drivers/isdn/hisax/ipacx.c
index 751b25f..3321041 100644
--- a/drivers/isdn/hisax/ipacx.c
+++ b/drivers/isdn/hisax/ipacx.c
@@ -717,7 +717,7 @@ bch_mode(struct BCState *bcs, int mode, int bc)
 
         bc = bc ? 1 : 0;  // in case bc is greater than 1
 	if (cs->debug & L1_DEB_HSCX)
-		debugl1(cs, "mode_bch() switch B-% mode %d chan %d", hscx, mode, bc);
+		debugl1(cs, "mode_bch() switch B-%d mode %d chan %d", hscx, mode, bc);
 	bcs->mode = mode;
 	bcs->channel = bc;
   
diff --git a/drivers/isdn/hisax/isar.c b/drivers/isdn/hisax/isar.c
index 2e72227..1be4552 100644
--- a/drivers/isdn/hisax/isar.c
+++ b/drivers/isdn/hisax/isar.c
@@ -953,7 +953,7 @@ isar_pump_statev_modem(struct BCState *bcs, u_char devt) {
 			break;
 		case PSEV_GSTN_CLR:
 			if (cs->debug & L1_DEB_HSCX)
-				debugl1(cs, "pump stev GSTN CLEAR", devt);
+				debugl1(cs, "pump stev GSTN CLEAR");
 			break;
 		default:
 			if (cs->debug & L1_DEB_HSCX)
@@ -1268,7 +1268,7 @@ isar_int_main(struct IsdnCardState *cs)
 static void
 ftimer_handler(struct BCState *bcs) {
 	if (bcs->cs->debug)
-		debugl1(bcs->cs, "ftimer flags %04x",
+		debugl1(bcs->cs, "ftimer flags %04lx",
 			bcs->Flag);
 	test_and_clear_bit(BC_FLG_FTI_RUN, &bcs->Flag);
 	if (test_and_clear_bit(BC_FLG_LL_CONN, &bcs->Flag)) {
@@ -1748,7 +1748,7 @@ isar_auxcmd(struct IsdnCardState *cs, isdn_ctrl *ic) {
 	struct BCState *bcs;
 
 	if (cs->debug & L1_DEB_HSCX)
-		debugl1(cs, "isar_auxcmd cmd/ch %x/%d", ic->command, ic->arg);
+		debugl1(cs, "isar_auxcmd cmd/ch %x/%ld", ic->command, ic->arg);
 	switch (ic->command) {
 		case (ISDN_CMD_FAXCMD):
 			bcs = cs->channel[ic->arg].bcs;
diff --git a/drivers/isdn/hisax/isdnl1.h b/drivers/isdn/hisax/isdnl1.h
index 172ad4c..425d861 100644
--- a/drivers/isdn/hisax/isdnl1.h
+++ b/drivers/isdn/hisax/isdnl1.h
@@ -21,6 +21,7 @@
 #define B_XMTBUFREADY	1
 #define B_ACKPENDING	2
 
+__attribute__((format(printf, 2, 3)))
 void debugl1(struct IsdnCardState *cs, char *fmt, ...);
 void DChannel_proc_xmt(struct IsdnCardState *cs);
 void DChannel_proc_rcv(struct IsdnCardState *cs);
diff --git a/drivers/isdn/hisax/isdnl3.c b/drivers/isdn/hisax/isdnl3.c
index fd0b643..ad291f2 100644
--- a/drivers/isdn/hisax/isdnl3.c
+++ b/drivers/isdn/hisax/isdnl3.c
@@ -66,7 +66,7 @@ static char *strL3Event[] =
 	"EV_TIMEOUT",
 };
 
-static void
+static __attribute__((format(printf, 2, 3))) void
 l3m_debug(struct FsmInst *fi, char *fmt, ...)
 {
 	va_list args;
diff --git a/drivers/isdn/hisax/netjet.c b/drivers/isdn/hisax/netjet.c
index 5d7f0f2..644891e 100644
--- a/drivers/isdn/hisax/netjet.c
+++ b/drivers/isdn/hisax/netjet.c
@@ -254,7 +254,7 @@ static int make_raw_data(struct BCState *bcs) {
 		val >>= 1;
 	}
 	if (bcs->cs->debug & L1_DEB_HSCX)
-		debugl1(bcs->cs,"tiger make_raw: in %ld out %d.%d",
+		debugl1(bcs->cs,"tiger make_raw: in %u out %d.%d",
 			bcs->tx_skb->len, s_cnt, bitcnt);
 	if (bitcnt) {
 		while (8>bitcnt++) {
@@ -361,7 +361,7 @@ static int make_raw_data_56k(struct BCState *bcs) {
 		val >>= 1;
 	}
 	if (bcs->cs->debug & L1_DEB_HSCX)
-		debugl1(bcs->cs,"tiger make_raw_56k: in %ld out %d.%d",
+		debugl1(bcs->cs,"tiger make_raw_56k: in %u out %d.%d",
 			bcs->tx_skb->len, s_cnt, bitcnt);
 	if (bitcnt) {
 		while (8>bitcnt++) {
@@ -612,7 +612,7 @@ void netjet_fill_dma(struct BCState *bcs)
 	if (!bcs->tx_skb)
 		return;
 	if (bcs->cs->debug & L1_DEB_HSCX)
-		debugl1(bcs->cs,"tiger fill_dma1: c%d %4x", bcs->channel,
+		debugl1(bcs->cs,"tiger fill_dma1: c%d %4lx", bcs->channel,
 			bcs->Flag);
 	if (test_and_set_bit(BC_FLG_BUSY, &bcs->Flag))
 		return;
@@ -625,7 +625,7 @@ void netjet_fill_dma(struct BCState *bcs)
 			return;		
 	};
 	if (bcs->cs->debug & L1_DEB_HSCX)
-		debugl1(bcs->cs,"tiger fill_dma2: c%d %4x", bcs->channel,
+		debugl1(bcs->cs,"tiger fill_dma2: c%d %4lx", bcs->channel,
 			bcs->Flag);
 	if (test_and_clear_bit(BC_FLG_NOFRAME, &bcs->Flag)) {
 		write_raw(bcs, bcs->hw.tiger.sendp, bcs->hw.tiger.free);
@@ -667,7 +667,7 @@ void netjet_fill_dma(struct BCState *bcs)
 		write_raw(bcs, p, cnt);
 	}
 	if (bcs->cs->debug & L1_DEB_HSCX)
-		debugl1(bcs->cs,"tiger fill_dma3: c%d %4x", bcs->channel,
+		debugl1(bcs->cs,"tiger fill_dma3: c%d %4lx", bcs->channel,
 			bcs->Flag);
 }
 
diff --git a/drivers/isdn/hisax/st5481_d.c b/drivers/isdn/hisax/st5481_d.c
index b7876b1..4408263 100644
--- a/drivers/isdn/hisax/st5481_d.c
+++ b/drivers/isdn/hisax/st5481_d.c
@@ -167,7 +167,8 @@ static struct FsmNode L1FnList[] __initdata =
 	{ST_L1_F8, EV_IND_RSY,           l1_ignore},
 };
 
-static void l1m_debug(struct FsmInst *fi, char *fmt, ...)
+static __attribute__((format(printf, 2, 3)))
+void l1m_debug(struct FsmInst *fi, char *fmt, ...)
 {
 	va_list args;
 	char buf[256];
@@ -269,7 +270,8 @@ static char *strDoutEvent[] =
 	"EV_DOUT_UNDERRUN",
 };
 
-static void dout_debug(struct FsmInst *fi, char *fmt, ...)
+static __attribute__((format(printf, 2, 3)))
+void dout_debug(struct FsmInst *fi, char *fmt, ...)
 {
 	va_list args;
 	char buf[256];
-- 
1.7.3.1.g432b3.dirty


^ permalink raw reply related

* Re: possible kernel oops from user MSS
From: Shan Wei @ 2010-11-11  5:15 UTC (permalink / raw)
  To: David Miller; +Cc: schen, netdev
In-Reply-To: <20101110.124119.102563803.davem@davemloft.net>

David Miller wrote, at 11/11/2010 04:41 AM:
> From: Steve Chen <schen@mvista.com>
> Date: Wed, 10 Nov 2010 07:24:51 -0600
> 
>> With commit f5fff5dc8a7a3f395b0525c02ba92c95d42b7390, a user program
>> can pass in TCP_MAXSEG of 12 (or TCPOLEN_TSTAMP_ALIGNED), and cause
>> kernel oops with division by 0
>>  in tcp_select_initial_window.  One way to prevent it is to change the
>> minimum value for TCP_MAXSEG in do_tcp_setsockopt from 8 to some value
>> over 12.  Two questions.
>>
>> 1.  Is this the right solution?
>> 2.  If it is, what is a good minimum value?
> 
> Thanks Steve, I'll fix this like so:
> 
> --------------------
> tcp: Increase TCP_MAXSEG socket option minimum.
> 
> As noted by Steve Chen, since commit
> f5fff5dc8a7a3f395b0525c02ba92c95d42b7390 ("tcp: advertise MSS
> requested by user") we can end up with a situation where
> tcp_select_initial_window() does a divide by a zero (or
> even negative) mss value.
> 
> The problem is that sometimes we subtract TCPOLEN_TSTAMP_ALIGNED
> from the mss.
> 
> Fix this by increasing the minimum from 8 to 8 plus the value
> of TCPOLEN_TSTATMP_ALIGNED.

In tcp_connect_init(), if tcp_header_len includes TCPOLEN_TSTAMP_ALIGNED(12 bytes)
and TCPOLEN_MD5SIG_ALIGNED(20 bytes).

This fix is still not perfect.

The minimum value of TCP_MAXSEG is 20 tytes, tcp_select_initial_window() still be 
called with negative mss value.

-- 
Best Regards
-----
Shan Wei


> Reported-by: Steve Chen <schen@mvista.com>
> Signed-off-by: David S. Miller <davem@davemloft.net>
> ---
>  net/ipv4/tcp.c |    2 +-
>  1 files changed, 1 insertions(+), 1 deletions(-)
> 
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 245603c..6b0eb4d 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -2246,7 +2246,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
>  		/* Values greater than interface MTU won't take effect. However
>  		 * at the point when this call is done we typically don't yet
>  		 * know which interface is going to be used */
> -		if (val < 8 || val > MAX_TCP_WINDOW) {
> +		if (val < TCPOLEN_TSTAMP_ALIGNED + 8 || val > MAX_TCP_WINDOW) {
>  			err = -EINVAL;
>  			break;
>  		}



^ permalink raw reply

* Re: possible kernel oops from user MSS
From: David Miller @ 2010-11-11  5:33 UTC (permalink / raw)
  To: shanwei; +Cc: schen, netdev
In-Reply-To: <4CDB7BD5.6030204@cn.fujitsu.com>

From: Shan Wei <shanwei@cn.fujitsu.com>
Date: Thu, 11 Nov 2010 13:15:01 +0800

> In tcp_connect_init(), if tcp_header_len includes TCPOLEN_TSTAMP_ALIGNED(12 bytes)
> and TCPOLEN_MD5SIG_ALIGNED(20 bytes).

If you knew this, why didn't you mention it in your initial report? :-/

I'll make the minimum 64 or something like that.

^ permalink raw reply

* Re: possible kernel oops from user MSS
From: David Miller @ 2010-11-11  5:36 UTC (permalink / raw)
  To: shanwei; +Cc: schen, netdev
In-Reply-To: <20101110.213313.71108940.davem@davemloft.net>

From: David Miller <davem@davemloft.net>
Date: Wed, 10 Nov 2010 21:33:13 -0800 (PST)

> I'll make the minimum 64 or something like that.

Here is the patch I will use:

--------------------
tcp: Increase TCP_MAXSEG socket option minimum.

As noted by Steve Chen, since commit
f5fff5dc8a7a3f395b0525c02ba92c95d42b7390 ("tcp: advertise MSS
requested by user") we can end up with a situation where
tcp_select_initial_window() does a divide by a zero (or
even negative) mss value.

The problem is that sometimes we effectively subtract
TCPOLEN_TSTAMP_ALIGNED and/or TCPOLEN_MD5SIG_ALIGNED from the mss.

Fix this by increasing the minimum from 8 to 64.

Reported-by: Steve Chen <schen@mvista.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 245603c..0814199 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2246,7 +2246,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		/* Values greater than interface MTU won't take effect. However
 		 * at the point when this call is done we typically don't yet
 		 * know which interface is going to be used */
-		if (val < 8 || val > MAX_TCP_WINDOW) {
+		if (val < 64 || val > MAX_TCP_WINDOW) {
 			err = -EINVAL;
 			break;
 		}
-- 
1.7.3.2


^ permalink raw reply related

* Re: possible kernel oops from user MSS
From: Shan Wei @ 2010-11-11  5:37 UTC (permalink / raw)
  To: David Miller; +Cc: schen, netdev
In-Reply-To: <20101110.213313.71108940.davem@davemloft.net>

David Miller wrote, at 11/11/2010 01:33 PM:
> From: Shan Wei <shanwei@cn.fujitsu.com>
> Date: Thu, 11 Nov 2010 13:15:01 +0800
> 
>> In tcp_connect_init(), if tcp_header_len includes TCPOLEN_TSTAMP_ALIGNED(12 bytes)
>> and TCPOLEN_MD5SIG_ALIGNED(20 bytes).
> 
> If you knew this, why didn't you mention it in your initial report? :-/

Firstly reported by Steve Chen, not me. :-)
I just review your patch.
 
> I'll make the minimum 64 or something like that.

Welcome. 


-- 
Best Regards
-----
Shan Wei

^ permalink raw reply

* [PATCH] dlm: Handle application limited situations properly.
From: David Miller @ 2010-11-11  5:56 UTC (permalink / raw)
  To: ccaulfie; +Cc: teigland, cluster-devel, netdev, linux-kernel

In the normal regime where an application uses non-blocking I/O
writes on a socket, they will handle -EAGAIN and use poll() to
wait for send space.

They don't actually sleep on the socket I/O write.

But kernel level RPC layers that do socket I/O operations directly
and key off of -EAGAIN on the write() to "try again later" don't
use poll(), they instead have their own sleeping mechanism and
rely upon ->sk_write_space() to trigger the wakeup.

So they do effectively sleep on the write(), but this mechanism
alone does not let the socket layers know what's going on.

Therefore they must emulate what would have happened, otherwise
TCP cannot possibly see that the connection is application window
size limited.

Handle this, therefore, like SUNRPC by setting SOCK_NOSPACE and
bumping the ->sk_write_count as needed when we hit the send buffer
limits.

This should make TCP send buffer size auto-tuning and the
->sk_write_space() callback invocations actually happen.

Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 37a34c2..77720f8 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -108,6 +108,7 @@ struct connection {
 #define CF_INIT_PENDING 4
 #define CF_IS_OTHERCON 5
 #define CF_CLOSE 6
+#define CF_APP_LIMITED 7
 	struct list_head writequeue;  /* List of outgoing writequeue_entries */
 	spinlock_t writequeue_lock;
 	int (*rx_action) (struct connection *);	/* What to do when active */
@@ -295,7 +296,17 @@ static void lowcomms_write_space(struct sock *sk)
 {
 	struct connection *con = sock2con(sk);

-	if (con && !test_and_set_bit(CF_WRITE_PENDING, &con->flags))
+	if (!con)
+		return;
+
+	clear_bit(SOCK_NOSPACE, &con->sock->flags);
+
+	if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) {
+		con->sock->sk->sk_write_pending--;
+		clear_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags);
+	}
+
+	if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
 		queue_work(send_workqueue, &con->swork);
 }

@@ -1319,6 +1330,15 @@ static void send_to_sock(struct connection *con)
 			ret = kernel_sendpage(con->sock, e->page, offset, len,
 					      msg_flags);
 			if (ret == -EAGAIN || ret == 0) {
+				if (ret == -EAGAIN &&
+				    test_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags) &&
+				    !test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
+					/* Notify TCP that we're limited by the
+					 * application window size.
+					 */
+					set_bit(SOCK_NOSPACE, &con->sock->flags);
+					con->sock->sk->sk_write_pending++;
+				}
 				cond_resched();
 				goto out;
 			}

^ permalink raw reply related

* [PATCH 2/3] dccp ccid-2: Separate internals of Ack Vectors from option-parsing code
From: Gerrit Renker @ 2010-11-11  6:07 UTC (permalink / raw)
  To: davem; +Cc: dccp, netdev, Gerrit Renker
In-Reply-To: <1289455653-5463-2-git-send-email-gerrit@erg.abdn.ac.uk>

This patch
 * separates Ack Vector housekeeping code from option-insertion code;
 * shifts option-specific code from ackvec.c into options.c;
 * introduces a dedicated routine to take care of the Ack Vector records;
 * simplifies the dccp_ackvec_insert_avr() routine: the BUG_ON was redundant,
   since the list is automatically arranged in descending order of ack_seqno.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ackvec.h  |    2 +-
 net/dccp/ackvec.c  |  100 +++++++++------------------------------------------
 net/dccp/options.c |   60 +++++++++++++++++++++++++++++++
 3 files changed, 79 insertions(+), 83 deletions(-)

--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -111,7 +111,7 @@ extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
 			     u64 *ackno, const u8 opt,
 			     const u8 *value, const u8 len);
 
-extern int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb);
+extern int  dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum);
 
 static inline int dccp_ackvec_pending(const struct dccp_ackvec *av)
 {
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -52,99 +52,35 @@ void dccp_ackvec_free(struct dccp_ackvec *av)
 	}
 }
 
-static void dccp_ackvec_insert_avr(struct dccp_ackvec *av,
-				   struct dccp_ackvec_record *avr)
-{
-	/*
-	 * AVRs are sorted by seqno. Since we are sending them in order, we
-	 * just add the AVR at the head of the list.
-	 * -sorbo.
-	 */
-	if (!list_empty(&av->av_records)) {
-		const struct dccp_ackvec_record *head =
-					list_entry(av->av_records.next,
-						   struct dccp_ackvec_record,
-						   avr_node);
-		BUG_ON(before48(avr->avr_ack_seqno, head->avr_ack_seqno));
-	}
-
-	list_add(&avr->avr_node, &av->av_records);
-}
-
-int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
+/**
+ * dccp_ackvec_update_records  -  Record information about sent Ack Vectors
+ * @av:		Ack Vector records to update
+ * @seqno:	Sequence number of the packet carrying the Ack Vector just sent
+ * @nonce_sum:	The sum of all buffer nonces contained in the Ack Vector
+ */
+int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
-	struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
-	/* Figure out how many options do we need to represent the ackvec */
-	const u8 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_SINGLE_OPT_MAXLEN);
-	u16 len = av->av_vec_len + 2 * nr_opts;
-	u8 i, nonce = 0;
-	const unsigned char *tail, *from;
-	unsigned char *to;
 	struct dccp_ackvec_record *avr;
 
-	if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
-		return -1;
-
 	avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC);
 	if (avr == NULL)
-		return -1;
-
-	DCCP_SKB_CB(skb)->dccpd_opt_len += len;
-
-	to   = skb_push(skb, len);
-	len  = av->av_vec_len;
-	from = av->av_buf + av->av_buf_head;
-	tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN;
-
-	for (i = 0; i < nr_opts; ++i) {
-		int copylen = len;
-
-		if (len > DCCP_SINGLE_OPT_MAXLEN)
-			copylen = DCCP_SINGLE_OPT_MAXLEN;
-
-		/*
-		 * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via
-		 * its type; ack_nonce is the sum of all individual buf_nonce's.
-		 */
-		nonce ^= av->av_buf_nonce[i];
-
-		*to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i];
-		*to++ = copylen + 2;
-
-		/* Check if buf_head wraps */
-		if (from + copylen > tail) {
-			const u16 tailsize = tail - from;
-
-			memcpy(to, from, tailsize);
-			to	+= tailsize;
-			len	-= tailsize;
-			copylen	-= tailsize;
-			from	= av->av_buf;
-		}
-
-		memcpy(to, from, copylen);
-		from += copylen;
-		to   += copylen;
-		len  -= copylen;
-	}
+		return -ENOBUFS;
 
-	/*
-	 * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340.
-	 */
-	avr->avr_ack_seqno  = DCCP_SKB_CB(skb)->dccpd_seq;
+	avr->avr_ack_seqno  = seqno;
 	avr->avr_ack_ptr    = av->av_buf_head;
 	avr->avr_ack_ackno  = av->av_buf_ackno;
-	avr->avr_ack_nonce  = nonce;
+	avr->avr_ack_nonce  = nonce_sum;
 	avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head);
+	/*
+	 * Since GSS is incremented for each packet, the list is automatically
+	 * arranged in descending order of @ack_seqno.
+	 */
+	list_add(&avr->avr_node, &av->av_records);
 
-	dccp_ackvec_insert_avr(av, avr);
-
-	dccp_pr_debug("%s ACK Vector 0, len=%d, ack_seqno=%llu, "
-		      "ack_ackno=%llu\n",
-		      dccp_role(sk), avr->avr_ack_runlen,
+	dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n",
 		      (unsigned long long)avr->avr_ack_seqno,
-		      (unsigned long long)avr->avr_ack_ackno);
+		      (unsigned long long)avr->avr_ack_ackno,
+		      avr->avr_ack_runlen);
 	return 0;
 }
 
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -425,6 +425,66 @@ static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp,
 	return 0;
 }
 
+static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
+	/* Figure out how many options do we need to represent the ackvec */
+	const u8 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_SINGLE_OPT_MAXLEN);
+	u16 len = av->av_vec_len + 2 * nr_opts;
+	u8 i, nonce = 0;
+	const unsigned char *tail, *from;
+	unsigned char *to;
+
+	if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
+		return -1;
+
+	DCCP_SKB_CB(skb)->dccpd_opt_len += len;
+
+	to   = skb_push(skb, len);
+	len  = av->av_vec_len;
+	from = av->av_buf + av->av_buf_head;
+	tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN;
+
+	for (i = 0; i < nr_opts; ++i) {
+		int copylen = len;
+
+		if (len > DCCP_SINGLE_OPT_MAXLEN)
+			copylen = DCCP_SINGLE_OPT_MAXLEN;
+
+		/*
+		 * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via
+		 * its type; ack_nonce is the sum of all individual buf_nonce's.
+		 */
+		nonce ^= av->av_buf_nonce[i];
+
+		*to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i];
+		*to++ = copylen + 2;
+
+		/* Check if buf_head wraps */
+		if (from + copylen > tail) {
+			const u16 tailsize = tail - from;
+
+			memcpy(to, from, tailsize);
+			to	+= tailsize;
+			len	-= tailsize;
+			copylen	-= tailsize;
+			from	= av->av_buf;
+		}
+
+		memcpy(to, from, copylen);
+		from += copylen;
+		to   += copylen;
+		len  -= copylen;
+	}
+	/*
+	 * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340.
+	 */
+	if (dccp_ackvec_update_records(av, DCCP_SKB_CB(skb)->dccpd_seq, nonce))
+		return -ENOBUFS;
+	return 0;
+}
+
 /**
  * dccp_insert_option_mandatory  -  Mandatory option (5.8.2)
  * Note that since we are using skb_push, this function needs to be called

^ permalink raw reply

* net-next-2.6 [PATCH 0/3] dccp: Ack Vectors in circular buffer instead of array
From: Gerrit Renker @ 2010-11-11  6:07 UTC (permalink / raw)
  To: davem; +Cc: dccp, netdev
In-Reply-To: <ack_vector_circular_buffer_dccp>

Dave,

please can you consider this set, which is part I of a two-part patch set
to fix issues in the DCCP Ack Vector implementation.

It is a self-contained set to move away from a linear array to a circular
buffer with overflow handling. Part II would then follow next week.


 Patch #1: cleans up the old interface to prepare for the improved one.
 Patch #2: also tidies up the old interface, by separating the internals
           of Ack Vectors from the option-parsing code.
 Patch #3: Completes the implementation of a circular Ack Vector buffer.


I have also placed this in into a fresh (today's) copy of net-next-2.6, on

    git://eden-feed.erg.abdn.ac.uk/net-next-2.6        [subtree 'dccp']

The set has been tested for 3 years, and is fully bisectable.
---
 net/dccp/ackvec.c      |  251 ++++++++++++++++--------------------------------
 net/dccp/ackvec.h      |  115 +++++++++++++----------
 net/dccp/ccids/ccid2.c |   13 +--
 net/dccp/dccp.h        |   11 ++-
 net/dccp/input.c       |    6 +-
 net/dccp/options.c     |   65 ++++++++++++-
 6 files changed, 225 insertions(+), 236 deletions(-)

^ permalink raw reply

* [PATCH 3/3] dccp ccid-2: Implementation of circular Ack Vector buffer with overflow handling
From: Gerrit Renker @ 2010-11-11  6:07 UTC (permalink / raw)
  To: davem; +Cc: dccp, netdev, Gerrit Renker
In-Reply-To: <1289455653-5463-3-git-send-email-gerrit@erg.abdn.ac.uk>

This completes the implementation of a circular buffer for Ack Vectors, by
extending the current (linear array-based) implementation.  The changes are:

 (a) An `overflow' flag to deal with the case of overflow. As before, dynamic
     growth of the buffer will not be supported; but code will be added to deal
     robustly with overflowing Ack Vector buffers.

 (b) A `tail_seqno' field. When naively implementing the algorithm of Appendix A
     in RFC 4340, problems arise whenever subsequent Ack Vector records overlap,
     which can bring the entire run length calculation completely out of synch.
     (This is documented on http://www.erg.abdn.ac.uk/users/gerrit/dccp/notes/\
                                             ack_vectors/tracking_tail_ackno/ .)
 (c) The buffer length is now computed dynamically (i.e. current fill level),
     as the span between head to tail.

As a result, dccp_ackvec_pending() is now simpler - the #ifdef is no longer
necessary since buf_empty is always true when IP_DCCP_ACKVEC is not configured.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ackvec.h  |   10 ++++++++--
 net/dccp/ackvec.c  |   31 ++++++++++++++++++++++++++++++-
 net/dccp/dccp.h    |   11 +++++++----
 net/dccp/options.c |   10 +++++-----
 4 files changed, 50 insertions(+), 12 deletions(-)

--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -21,6 +21,7 @@
  * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1
  * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives
  * more headroom if Ack Ratio is higher or when the sender acknowledges slowly.
+ * The maximum value is bounded by the u16 types for indices and functions.
  */
 #define DCCPAV_NUM_ACKVECS	2
 #define DCCPAV_MAX_ACKVEC_LEN	(DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS)
@@ -55,8 +56,10 @@ static inline u8 dccp_ackvec_state(const u8 *cell)
  * @av_buf_head:   head index; begin of live portion in @av_buf
  * @av_buf_tail:   tail index; first index _after_ the live portion in @av_buf
  * @av_buf_ackno:  highest seqno of acknowledgeable packet recorded in @av_buf
+ * @av_tail_ackno: lowest  seqno of acknowledgeable packet recorded in @av_buf
  * @av_buf_nonce:  ECN nonce sums, each covering subsequent segments of up to
  *		   %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf
+ * @av_overflow:   if 1 then buf_head == buf_tail indicates buffer wraparound
  * @av_records:	   list of %dccp_ackvec_record (Ack Vectors sent previously)
  * @av_veclen:	   length of the live portion of @av_buf
  */
@@ -65,7 +68,9 @@ struct dccp_ackvec {
 	u16			av_buf_head;
 	u16			av_buf_tail;
 	u64			av_buf_ackno:48;
+	u64			av_tail_ackno:48;
 	bool			av_buf_nonce[DCCPAV_NUM_ACKVECS];
+	u8			av_overflow:1;
 	struct list_head	av_records;
 	u16			av_vec_len;
 };
@@ -112,9 +117,10 @@ extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
 			     const u8 *value, const u8 len);
 
 extern int  dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum);
+extern u16  dccp_ackvec_buflen(const struct dccp_ackvec *av);
 
-static inline int dccp_ackvec_pending(const struct dccp_ackvec *av)
+static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av)
 {
-	return av->av_vec_len;
+	return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail;
 }
 #endif /* _ACKVEC_H */
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -29,7 +29,7 @@ struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
 	struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority);
 
 	if (av != NULL) {
-		av->av_buf_head	= DCCPAV_MAX_ACKVEC_LEN - 1;
+		av->av_buf_head	= av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1;
 		INIT_LIST_HEAD(&av->av_records);
 	}
 	return av;
@@ -72,6 +72,14 @@ int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum)
 	avr->avr_ack_nonce  = nonce_sum;
 	avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head);
 	/*
+	 * When the buffer overflows, we keep no more than one record. This is
+	 * the simplest way of disambiguating sender-Acks dating from before the
+	 * overflow from sender-Acks which refer to after the overflow; a simple
+	 * solution is preferable here since we are handling an exception.
+	 */
+	if (av->av_overflow)
+		dccp_ackvec_purge_records(av);
+	/*
 	 * Since GSS is incremented for each packet, the list is automatically
 	 * arranged in descending order of @ack_seqno.
 	 */
@@ -85,6 +93,27 @@ int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum)
 }
 
 /*
+ * Buffer index and length computation using modulo-buffersize arithmetic.
+ * Note that, as pointers move from right to left, head is `before' tail.
+ */
+static inline u16 __ackvec_idx_add(const u16 a, const u16 b)
+{
+	return (a + b) % DCCPAV_MAX_ACKVEC_LEN;
+}
+
+static inline u16 __ackvec_idx_sub(const u16 a, const u16 b)
+{
+	return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b);
+}
+
+u16 dccp_ackvec_buflen(const struct dccp_ackvec *av)
+{
+	if (unlikely(av->av_overflow))
+		return DCCPAV_MAX_ACKVEC_LEN;
+	return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head);
+}
+
+/*
  * If several packets are missing, the HC-Receiver may prefer to enter multiple
  * bytes with run length 0, rather than a single byte with a larger run length;
  * this simplifies table updates if one of the missing packets arrives.
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -457,12 +457,15 @@ static inline void dccp_update_gss(struct sock *sk, u64 seq)
 	dp->dccps_awh = dp->dccps_gss;
 }
 
+static inline int dccp_ackvec_pending(const struct sock *sk)
+{
+	return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL &&
+	       !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec);
+}
+
 static inline int dccp_ack_pending(const struct sock *sk)
 {
-	const struct dccp_sock *dp = dccp_sk(sk);
-	return (dp->dccps_hc_rx_ackvec != NULL &&
-		dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) ||
-	       inet_csk_ack_scheduled(sk);
+	return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk);
 }
 
 extern int  dccp_feat_finalise_settings(struct dccp_sock *dp);
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -429,9 +429,10 @@ static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
+	const u16 buflen = dccp_ackvec_buflen(av);
 	/* Figure out how many options do we need to represent the ackvec */
-	const u8 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_SINGLE_OPT_MAXLEN);
-	u16 len = av->av_vec_len + 2 * nr_opts;
+	const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN);
+	u16 len = buflen + 2 * nr_opts;
 	u8 i, nonce = 0;
 	const unsigned char *tail, *from;
 	unsigned char *to;
@@ -442,7 +443,7 @@ static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 	DCCP_SKB_CB(skb)->dccpd_opt_len += len;
 
 	to   = skb_push(skb, len);
-	len  = av->av_vec_len;
+	len  = buflen;
 	from = av->av_buf + av->av_buf_head;
 	tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN;
 
@@ -580,8 +581,7 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
 			if (dccp_insert_option_timestamp(skb))
 				return -1;
 
-		} else if (dp->dccps_hc_rx_ackvec != NULL &&
-			   dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) &&
+		} else if (dccp_ackvec_pending(sk) &&
 			   dccp_insert_option_ackvec(sk, skb)) {
 				return -1;
 		}

^ permalink raw reply

* [PATCH 1/3] dccp ccid-2: Ack Vector interface clean-up
From: Gerrit Renker @ 2010-11-11  6:07 UTC (permalink / raw)
  To: davem; +Cc: dccp, netdev, Gerrit Renker
In-Reply-To: <1289455653-5463-1-git-send-email-gerrit@erg.abdn.ac.uk>

This patch brings the Ack Vector interface up to date. Its main purpose is
to lay the basis for the subsequent patches of this set, which will use the
new data structure fields and routines.

There are no real algorithmic changes, rather an adaptation:

 (1) Replaced the static Ack Vector size (2) with a #define so that it can
     be adapted (with low loss / Ack Ratio, a value of 1 works, so 2 seems
     to be sufficient for the moment) and added a solution so that computing
     the ECN nonce will continue to work - even with larger Ack Vectors.

 (2) Replaced the #defines for Ack Vector states with a complete enum.

 (3) Replaced #defines to compute Ack Vector length and state with general
     purpose routines (inlines), and updated code to use these.

 (4) Added a `tail' field (conversion to circular buffer in subsequent patch).

 (5) Updated the (outdated) documentation for Ack Vector struct.

 (6) All sequence number containers now trimmed to 48 bits.

 (7) Removal of unused bits:
     * removed dccpav_ack_nonce from struct dccp_ackvec, since this is already
       redundantly stored in the `dccpavr_ack_nonce' (of Ack Vector record);
     * removed Elapsed Time for Ack Vectors (it was nowhere used);
     * replaced semantics of dccpavr_sent_len with dccpavr_ack_runlen, since
       the code needs to be able to remember the old run length;
     * reduced the de-/allocation routines (redundant / duplicate tests).

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/ackvec.h      |  103 +++++++++++++++-------------
 net/dccp/ackvec.c      |  178 +++++++++++++++++-------------------------------
 net/dccp/ccids/ccid2.c |   13 ++--
 net/dccp/input.c       |    6 +-
 net/dccp/options.c     |    1 +
 5 files changed, 128 insertions(+), 173 deletions(-)

--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -3,9 +3,9 @@
 /*
  *  net/dccp/ackvec.h
  *
- *  An implementation of the DCCP protocol
+ *  An implementation of Ack Vectors for the DCCP protocol
+ *  Copyright (c) 2007 University of Aberdeen, Scotland, UK
  *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com>
- *
  *	This program is free software; you can redistribute it and/or modify it
  *	under the terms of the GNU General Public License version 2 as
  *	published by the Free Software Foundation.
@@ -13,75 +13,84 @@
 
 #include <linux/dccp.h>
 #include <linux/compiler.h>
-#include <linux/ktime.h>
 #include <linux/list.h>
 #include <linux/types.h>
 
-/* We can spread an ack vector across multiple options */
-#define DCCP_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * 2)
+/*
+ * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN,
+ * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1
+ * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives
+ * more headroom if Ack Ratio is higher or when the sender acknowledges slowly.
+ */
+#define DCCPAV_NUM_ACKVECS	2
+#define DCCPAV_MAX_ACKVEC_LEN	(DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS)
 
 /* Estimated minimum average Ack Vector length - used for updating MPS */
 #define DCCPAV_MIN_OPTLEN	16
 
-#define DCCP_ACKVEC_STATE_RECEIVED	0
-#define DCCP_ACKVEC_STATE_ECN_MARKED	(1 << 6)
-#define DCCP_ACKVEC_STATE_NOT_RECEIVED	(3 << 6)
+enum dccp_ackvec_states {
+	DCCPAV_RECEIVED =	0x00,
+	DCCPAV_ECN_MARKED =	0x40,
+	DCCPAV_RESERVED =	0x80,
+	DCCPAV_NOT_RECEIVED =	0xC0
+};
+#define DCCPAV_MAX_RUNLEN	0x3F
 
-#define DCCP_ACKVEC_STATE_MASK		0xC0 /* 11000000 */
-#define DCCP_ACKVEC_LEN_MASK		0x3F /* 00111111 */
+static inline u8 dccp_ackvec_runlen(const u8 *cell)
+{
+	return *cell & DCCPAV_MAX_RUNLEN;
+}
 
-/** struct dccp_ackvec - ack vector
- *
- * This data structure is the one defined in RFC 4340, Appendix A.
- *
- * @av_buf_head - circular buffer head
- * @av_buf_tail - circular buffer tail
- * @av_buf_ackno - ack # of the most recent packet acknowledgeable in the
- *		       buffer (i.e. %av_buf_head)
- * @av_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked
- * 		       by the buffer with State 0
- *
- * Additionally, the HC-Receiver must keep some information about the
- * Ack Vectors it has recently sent. For each packet sent carrying an
- * Ack Vector, it remembers four variables:
+static inline u8 dccp_ackvec_state(const u8 *cell)
+{
+	return *cell & ~DCCPAV_MAX_RUNLEN;
+}
+
+/** struct dccp_ackvec - Ack Vector main data structure
  *
- * @av_records - list of dccp_ackvec_record
- * @av_ack_nonce - the one-bit sum of the ECN Nonces for all State 0.
+ * This implements a fixed-size circular buffer within an array and is largely
+ * based on Appendix A of RFC 4340.
  *
- * @av_time - the time in usecs
- * @av_buf - circular buffer of acknowledgeable packets
+ * @av_buf:	   circular buffer storage area
+ * @av_buf_head:   head index; begin of live portion in @av_buf
+ * @av_buf_tail:   tail index; first index _after_ the live portion in @av_buf
+ * @av_buf_ackno:  highest seqno of acknowledgeable packet recorded in @av_buf
+ * @av_buf_nonce:  ECN nonce sums, each covering subsequent segments of up to
+ *		   %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf
+ * @av_records:	   list of %dccp_ackvec_record (Ack Vectors sent previously)
+ * @av_veclen:	   length of the live portion of @av_buf
  */
 struct dccp_ackvec {
-	u64			av_buf_ackno;
-	struct list_head	av_records;
-	ktime_t			av_time;
+	u8			av_buf[DCCPAV_MAX_ACKVEC_LEN];
 	u16			av_buf_head;
+	u16			av_buf_tail;
+	u64			av_buf_ackno:48;
+	bool			av_buf_nonce[DCCPAV_NUM_ACKVECS];
+	struct list_head	av_records;
 	u16			av_vec_len;
-	u8			av_buf_nonce;
-	u8			av_ack_nonce;
-	u8			av_buf[DCCP_MAX_ACKVEC_LEN];
 };
 
-/** struct dccp_ackvec_record - ack vector record
+/** struct dccp_ackvec_record - Records information about sent Ack Vectors
  *
- * ACK vector record as defined in Appendix A of spec.
+ * These list entries define the additional information which the HC-Receiver
+ * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A.
  *
- * The list is sorted by avr_ack_seqno
+ * @avr_node:	    the list node in @av_records
+ * @avr_ack_seqno:  sequence number of the packet the Ack Vector was sent on
+ * @avr_ack_ackno:  the Ack number that this record/Ack Vector refers to
+ * @avr_ack_ptr:    pointer into @av_buf where this record starts
+ * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending
+ * @avr_ack_nonce:  the sum of @av_buf_nonce's at the time this record was sent
  *
- * @avr_node - node in av_records
- * @avr_ack_seqno - sequence number of the packet this record was sent on
- * @avr_ack_ackno - sequence number being acknowledged
- * @avr_ack_ptr - pointer into av_buf where this record starts
- * @avr_ack_nonce - av_ack_nonce at the time this record was sent
- * @avr_sent_len - lenght of the record in av_buf
+ * The list as a whole is sorted in descending order by @avr_ack_seqno.
  */
 struct dccp_ackvec_record {
 	struct list_head avr_node;
-	u64		 avr_ack_seqno;
-	u64		 avr_ack_ackno;
+	u64		 avr_ack_seqno:48;
+	u64		 avr_ack_ackno:48;
 	u16		 avr_ack_ptr;
-	u16		 avr_sent_len;
-	u8		 avr_ack_nonce;
+	u8		 avr_ack_runlen;
+	u8		 avr_ack_nonce:1;
 };
 
 struct sock;
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -1,7 +1,8 @@
 /*
  *  net/dccp/ackvec.c
  *
- *  An implementation of the DCCP protocol
+ *  An implementation of Ack Vectors for the DCCP protocol
+ *  Copyright (c) 2007 University of Aberdeen, Scotland, UK
  *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
  *
  *      This program is free software; you can redistribute it and/or modify it
@@ -23,24 +24,32 @@
 static struct kmem_cache *dccp_ackvec_slab;
 static struct kmem_cache *dccp_ackvec_record_slab;
 
-static struct dccp_ackvec_record *dccp_ackvec_record_new(void)
+struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
 {
-	struct dccp_ackvec_record *avr =
-			kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC);
+	struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority);
+
+	if (av != NULL) {
+		av->av_buf_head	= DCCPAV_MAX_ACKVEC_LEN - 1;
+		INIT_LIST_HEAD(&av->av_records);
+	}
+	return av;
+}
 
-	if (avr != NULL)
-		INIT_LIST_HEAD(&avr->avr_node);
+static void dccp_ackvec_purge_records(struct dccp_ackvec *av)
+{
+	struct dccp_ackvec_record *cur, *next;
 
-	return avr;
+	list_for_each_entry_safe(cur, next, &av->av_records, avr_node)
+		kmem_cache_free(dccp_ackvec_record_slab, cur);
+	INIT_LIST_HEAD(&av->av_records);
 }
 
-static void dccp_ackvec_record_delete(struct dccp_ackvec_record *avr)
+void dccp_ackvec_free(struct dccp_ackvec *av)
 {
-	if (unlikely(avr == NULL))
-		return;
-	/* Check if deleting a linked record */
-	WARN_ON(!list_empty(&avr->avr_node));
-	kmem_cache_free(dccp_ackvec_record_slab, avr);
+	if (likely(av != NULL)) {
+		dccp_ackvec_purge_records(av);
+		kmem_cache_free(dccp_ackvec_slab, av);
+	}
 }
 
 static void dccp_ackvec_insert_avr(struct dccp_ackvec *av,
@@ -68,24 +77,16 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 	struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
 	/* Figure out how many options do we need to represent the ackvec */
 	const u8 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_SINGLE_OPT_MAXLEN);
-	u16 len = av->av_vec_len + 2 * nr_opts, i;
-	u32 elapsed_time;
+	u16 len = av->av_vec_len + 2 * nr_opts;
+	u8 i, nonce = 0;
 	const unsigned char *tail, *from;
 	unsigned char *to;
 	struct dccp_ackvec_record *avr;
-	suseconds_t delta;
 
 	if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
 		return -1;
 
-	delta = ktime_us_delta(ktime_get_real(), av->av_time);
-	elapsed_time = delta / 10;
-
-	if (elapsed_time != 0 &&
-	    dccp_insert_option_elapsed_time(skb, elapsed_time))
-		return -1;
-
-	avr = dccp_ackvec_record_new();
+	avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC);
 	if (avr == NULL)
 		return -1;
 
@@ -94,7 +95,7 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 	to   = skb_push(skb, len);
 	len  = av->av_vec_len;
 	from = av->av_buf + av->av_buf_head;
-	tail = av->av_buf + DCCP_MAX_ACKVEC_LEN;
+	tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN;
 
 	for (i = 0; i < nr_opts; ++i) {
 		int copylen = len;
@@ -102,7 +103,13 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 		if (len > DCCP_SINGLE_OPT_MAXLEN)
 			copylen = DCCP_SINGLE_OPT_MAXLEN;
 
-		*to++ = DCCPO_ACK_VECTOR_0;
+		/*
+		 * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via
+		 * its type; ack_nonce is the sum of all individual buf_nonce's.
+		 */
+		nonce ^= av->av_buf_nonce[i];
+
+		*to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i];
 		*to++ = copylen + 2;
 
 		/* Check if buf_head wraps */
@@ -123,75 +130,24 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 	}
 
 	/*
-	 *	From RFC 4340, A.2:
-	 *
-	 *	For each acknowledgement it sends, the HC-Receiver will add an
-	 *	acknowledgement record.  ack_seqno will equal the HC-Receiver
-	 *	sequence number it used for the ack packet; ack_ptr will equal
-	 *	buf_head; ack_ackno will equal buf_ackno; and ack_nonce will
-	 *	equal buf_nonce.
+	 * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340.
 	 */
-	avr->avr_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq;
-	avr->avr_ack_ptr   = av->av_buf_head;
-	avr->avr_ack_ackno = av->av_buf_ackno;
-	avr->avr_ack_nonce = av->av_buf_nonce;
-	avr->avr_sent_len  = av->av_vec_len;
+	avr->avr_ack_seqno  = DCCP_SKB_CB(skb)->dccpd_seq;
+	avr->avr_ack_ptr    = av->av_buf_head;
+	avr->avr_ack_ackno  = av->av_buf_ackno;
+	avr->avr_ack_nonce  = nonce;
+	avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head);
 
 	dccp_ackvec_insert_avr(av, avr);
 
 	dccp_pr_debug("%s ACK Vector 0, len=%d, ack_seqno=%llu, "
 		      "ack_ackno=%llu\n",
-		      dccp_role(sk), avr->avr_sent_len,
+		      dccp_role(sk), avr->avr_ack_runlen,
 		      (unsigned long long)avr->avr_ack_seqno,
 		      (unsigned long long)avr->avr_ack_ackno);
 	return 0;
 }
 
-struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
-{
-	struct dccp_ackvec *av = kmem_cache_alloc(dccp_ackvec_slab, priority);
-
-	if (av != NULL) {
-		av->av_buf_head	 = DCCP_MAX_ACKVEC_LEN - 1;
-		av->av_buf_ackno = UINT48_MAX + 1;
-		av->av_buf_nonce = 0;
-		av->av_time	 = ktime_set(0, 0);
-		av->av_vec_len	 = 0;
-		INIT_LIST_HEAD(&av->av_records);
-	}
-
-	return av;
-}
-
-void dccp_ackvec_free(struct dccp_ackvec *av)
-{
-	if (unlikely(av == NULL))
-		return;
-
-	if (!list_empty(&av->av_records)) {
-		struct dccp_ackvec_record *avr, *next;
-
-		list_for_each_entry_safe(avr, next, &av->av_records, avr_node) {
-			list_del_init(&avr->avr_node);
-			dccp_ackvec_record_delete(avr);
-		}
-	}
-
-	kmem_cache_free(dccp_ackvec_slab, av);
-}
-
-static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av,
-				   const u32 index)
-{
-	return av->av_buf[index] & DCCP_ACKVEC_STATE_MASK;
-}
-
-static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av,
-				 const u32 index)
-{
-	return av->av_buf[index] & DCCP_ACKVEC_LEN_MASK;
-}
-
 /*
  * If several packets are missing, the HC-Receiver may prefer to enter multiple
  * bytes with run length 0, rather than a single byte with a larger run length;
@@ -204,7 +160,7 @@ static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av,
 	long gap;
 	long new_head;
 
-	if (av->av_vec_len + packets > DCCP_MAX_ACKVEC_LEN)
+	if (av->av_vec_len + packets > DCCPAV_MAX_ACKVEC_LEN)
 		return -ENOBUFS;
 
 	gap	 = packets - 1;
@@ -212,18 +168,18 @@ static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av,
 
 	if (new_head < 0) {
 		if (gap > 0) {
-			memset(av->av_buf, DCCP_ACKVEC_STATE_NOT_RECEIVED,
+			memset(av->av_buf, DCCPAV_NOT_RECEIVED,
 			       gap + new_head + 1);
 			gap = -new_head;
 		}
-		new_head += DCCP_MAX_ACKVEC_LEN;
+		new_head += DCCPAV_MAX_ACKVEC_LEN;
 	}
 
 	av->av_buf_head = new_head;
 
 	if (gap > 0)
 		memset(av->av_buf + av->av_buf_head + 1,
-		       DCCP_ACKVEC_STATE_NOT_RECEIVED, gap);
+		       DCCPAV_NOT_RECEIVED, gap);
 
 	av->av_buf[av->av_buf_head] = state;
 	av->av_vec_len += packets;
@@ -236,6 +192,8 @@ static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av,
 int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
 		    const u64 ackno, const u8 state)
 {
+	u8 *cur_head = av->av_buf + av->av_buf_head,
+	   *buf_end  = av->av_buf + DCCPAV_MAX_ACKVEC_LEN;
 	/*
 	 * Check at the right places if the buffer is full, if it is, tell the
 	 * caller to start dropping packets till the HC-Sender acks our ACK
@@ -260,7 +218,7 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
 
 	/* See if this is the first ackno being inserted */
 	if (av->av_vec_len == 0) {
-		av->av_buf[av->av_buf_head] = state;
+		*cur_head = state;
 		av->av_vec_len = 1;
 	} else if (after48(ackno, av->av_buf_ackno)) {
 		const u64 delta = dccp_delta_seqno(av->av_buf_ackno, ackno);
@@ -269,10 +227,9 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
 		 * Look if the state of this packet is the same as the
 		 * previous ackno and if so if we can bump the head len.
 		 */
-		if (delta == 1 &&
-		    dccp_ackvec_state(av, av->av_buf_head) == state &&
-		    dccp_ackvec_len(av, av->av_buf_head) < DCCP_ACKVEC_LEN_MASK)
-			av->av_buf[av->av_buf_head]++;
+		if (delta == 1 && dccp_ackvec_state(cur_head) == state &&
+		    dccp_ackvec_runlen(cur_head) < DCCPAV_MAX_RUNLEN)
+			*cur_head += 1;
 		else if (dccp_ackvec_set_buf_head_state(av, delta, state))
 			return -ENOBUFS;
 	} else {
@@ -285,21 +242,17 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
 		 *	could reduce the complexity of this scan.)
 		 */
 		u64 delta = dccp_delta_seqno(ackno, av->av_buf_ackno);
-		u32 index = av->av_buf_head;
 
 		while (1) {
-			const u8 len = dccp_ackvec_len(av, index);
-			const u8 av_state = dccp_ackvec_state(av, index);
+			const u8 len = dccp_ackvec_runlen(cur_head);
 			/*
 			 * valid packets not yet in av_buf have a reserved
 			 * entry, with a len equal to 0.
 			 */
-			if (av_state == DCCP_ACKVEC_STATE_NOT_RECEIVED &&
-			    len == 0 && delta == 0) { /* Found our
-							 reserved seat! */
+			if (*cur_head == DCCPAV_NOT_RECEIVED && delta == 0) {
 				dccp_pr_debug("Found %llu reserved seat!\n",
 					      (unsigned long long)ackno);
-				av->av_buf[index] = state;
+				*cur_head = state;
 				goto out;
 			}
 			/* len == 0 means one packet */
@@ -307,13 +260,12 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
 				goto out_duplicate;
 
 			delta -= len + 1;
-			if (++index == DCCP_MAX_ACKVEC_LEN)
-				index = 0;
+			if (++cur_head == buf_end)
+				cur_head = av->av_buf;
 		}
 	}
 
 	av->av_buf_ackno = ackno;
-	av->av_time = ktime_get_real();
 out:
 	return 0;
 
@@ -333,13 +285,13 @@ static void dccp_ackvec_throw_record(struct dccp_ackvec *av,
 	if (av->av_buf_head <= avr->avr_ack_ptr)
 		av->av_vec_len = avr->avr_ack_ptr - av->av_buf_head;
 	else
-		av->av_vec_len = DCCP_MAX_ACKVEC_LEN - 1 -
+		av->av_vec_len = DCCPAV_MAX_ACKVEC_LEN - 1 -
 				 av->av_buf_head + avr->avr_ack_ptr;
 
 	/* free records */
 	list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) {
-		list_del_init(&avr->avr_node);
-		dccp_ackvec_record_delete(avr);
+		list_del(&avr->avr_node);
+		kmem_cache_free(dccp_ackvec_record_slab, avr);
 	}
 }
 
@@ -357,7 +309,7 @@ void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk,
 		if (ackno == avr->avr_ack_seqno) {
 			dccp_pr_debug("%s ACK packet 0, len=%d, ack_seqno=%llu, "
 				      "ack_ackno=%llu, ACKED!\n",
-				      dccp_role(sk), 1,
+				      dccp_role(sk), avr->avr_ack_runlen,
 				      (unsigned long long)avr->avr_ack_seqno,
 				      (unsigned long long)avr->avr_ack_ackno);
 			dccp_ackvec_throw_record(av, avr);
@@ -387,7 +339,7 @@ static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av,
 	 */
 	avr = list_entry(av->av_records.next, struct dccp_ackvec_record, avr_node);
 	while (i--) {
-		const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
+		const u8 rl = dccp_ackvec_runlen(vector);
 		u64 ackno_end_rl;
 
 		dccp_set_seqno(&ackno_end_rl, *ackno - rl);
@@ -404,8 +356,7 @@ static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av,
 		break;
 found:
 		if (between48(avr->avr_ack_seqno, ackno_end_rl, *ackno)) {
-			const u8 state = *vector & DCCP_ACKVEC_STATE_MASK;
-			if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED) {
+			if (dccp_ackvec_state(vector) != DCCPAV_NOT_RECEIVED) {
 				dccp_pr_debug("%s ACK vector 0, len=%d, "
 					      "ack_seqno=%llu, ack_ackno=%llu, "
 					      "ACKED!\n",
@@ -448,10 +399,9 @@ int __init dccp_ackvec_init(void)
 	if (dccp_ackvec_slab == NULL)
 		goto out_err;
 
-	dccp_ackvec_record_slab =
-			kmem_cache_create("dccp_ackvec_record",
-					  sizeof(struct dccp_ackvec_record),
-					  0, SLAB_HWCACHE_ALIGN, NULL);
+	dccp_ackvec_record_slab = kmem_cache_create("dccp_ackvec_record",
+					     sizeof(struct dccp_ackvec_record),
+					     0, SLAB_HWCACHE_ALIGN, NULL);
 	if (dccp_ackvec_record_slab == NULL)
 		goto out_destroy_slab;
 
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -513,8 +513,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 					 &vector, &veclen)) != -1) {
 		/* go through this ack vector */
 		while (veclen--) {
-			const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
-			u64 ackno_end_rl = SUB48(ackno, rl);
+			u64 ackno_end_rl = SUB48(ackno, dccp_ackvec_runlen(vector));
 
 			ccid2_pr_debug("ackvec start:%llu end:%llu\n",
 				       (unsigned long long)ackno,
@@ -537,17 +536,15 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 			 * run length
 			 */
 			while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) {
-				const u8 state = *vector &
-						 DCCP_ACKVEC_STATE_MASK;
+				const u8 state = dccp_ackvec_state(vector);
 
 				/* new packet received or marked */
-				if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED &&
+				if (state != DCCPAV_NOT_RECEIVED &&
 				    !seqp->ccid2s_acked) {
-					if (state ==
-					    DCCP_ACKVEC_STATE_ECN_MARKED) {
+					if (state == DCCPAV_ECN_MARKED)
 						ccid2_congestion_event(sk,
 								       seqp);
-					} else
+					else
 						ccid2_new_ack(sk, seqp,
 							      &maxincr);
 
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -378,8 +378,7 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
 
 	if (dp->dccps_hc_rx_ackvec != NULL &&
 	    dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
-			    DCCP_SKB_CB(skb)->dccpd_seq,
-			    DCCP_ACKVEC_STATE_RECEIVED))
+			    DCCP_SKB_CB(skb)->dccpd_seq, DCCPAV_RECEIVED))
 		goto discard;
 	dccp_deliver_input_to_ccids(sk, skb);
 
@@ -637,8 +636,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 
 		if (dp->dccps_hc_rx_ackvec != NULL &&
 		    dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
-				    DCCP_SKB_CB(skb)->dccpd_seq,
-				    DCCP_ACKVEC_STATE_RECEIVED))
+				    DCCP_SKB_CB(skb)->dccpd_seq, DCCPAV_RECEIVED))
 			goto discard;
 
 		dccp_deliver_input_to_ccids(sk, skb);
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -340,6 +340,7 @@ static inline int dccp_elapsed_time_len(const u32 elapsed_time)
 	return elapsed_time == 0 ? 0 : elapsed_time <= 0xFFFF ? 2 : 4;
 }
 
+/* FIXME: This function is currently not used anywhere */
 int dccp_insert_option_elapsed_time(struct sk_buff *skb, u32 elapsed_time)
 {
 	const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time);

^ permalink raw reply

* Re: pull request: wireless-2.6 2010-11-10
From: David Miller @ 2010-11-11  6:16 UTC (permalink / raw)
  To: linville-2XuSBdqkA4R54TAoqtyWWQ
  Cc: linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20101110185843.GC2714-2XuSBdqkA4R54TAoqtyWWQ@public.gmane.org>

From: "John W. Linville" <linville-2XuSBdqkA4R54TAoqtyWWQ@public.gmane.org>
Date: Wed, 10 Nov 2010 13:58:43 -0500

> Here is a batch of fixes intended for 2.6.37.  Also mixed-in are a few
> device ID updates.  I think the changelog entries are reasonably
> documentary of the issues being fixed, so I won't belabor them. ;-)
> 
> This also includes a round of Bluetooth fixes from Gustavo:
> 
> "The following batch contains some bugfixes for 2.6.37. A fix for unaligned
> access in L2CAP, a Kconfig error, two fixes related to security of
> the Bluetooth links, and support for the a MacBook Air Bluetooth device.
> There is also a one line patch from Matthew Garret, that enables USB
> autosuspend for btusb module, which shall be completely safe. Please
> pull, thanks."

Pulled, thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v4)
From: Eric Dumazet @ 2010-11-11  6:29 UTC (permalink / raw)
  To: nhorman; +Cc: netdev, davem, zenczykowski
In-Reply-To: <1289416194-1844-1-git-send-email-nhorman@tuxdriver.com>

Le mercredi 10 novembre 2010 à 14:09 -0500, nhorman@tuxdriver.com a
écrit :
> From: Neil Horman <nhorman@tuxdriver.com>
> 
> Version 4 of this patch.
> 
> Change notes:
> 1) Removed extra memset.  Didn't think kcalloc added a GFP_ZERO the way kzalloc did :)
> 
> Summary:
> It was shown to me recently that systems under high load were driven very deep
> into swap when tcpdump was run.  The reason this happened was because the
> AF_PACKET protocol has a SET_RINGBUFFER socket option that allows the user space
> application to specify how many entries an AF_PACKET socket will have and how
> large each entry will be.  It seems the default setting for tcpdump is to set
> the ring buffer to 32 entries of 64 Kb each, which implies 32 order 5
> allocation.  Thats difficult under good circumstances, and horrid under memory
> pressure.
> 
> I thought it would be good to make that a bit more usable.  I was going to do a
> simple conversion of the ring buffer from contigous pages to iovecs, but
> unfortunately, the metadata which AF_PACKET places in these buffers can easily
> span a page boundary, and given that these buffers get mapped into user space,
> and the data layout doesn't easily allow for a change to padding between frames
> to avoid that, a simple iovec change is just going to break user space ABI
> consistency.
> 
> So I've done this, I've added a three tiered mechanism to the af_packet set_ring
> socket option.  It attempts to allocate memory in the following order:
> 
> 1) Using __get_free_pages with GFP_NORETRY set, so as to fail quickly without
> digging into swap
> 
> 2) Using vmalloc
> 
> 3) Using __get_free_pages with GFP_NORETRY clear, causing us to try as hard as
> needed to get the memory
> 
> The effect is that we don't disturb the system as much when we're under load,
> while still being able to conduct tcpdumps effectively.
> 
> Tested successfully by me.
> 
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>

Acked-by: Eric Dumazet <eric.dumazet@gmail.com>

Thanks Neil !



^ permalink raw reply

* Re: [PATCH] macvlan: lockless tx path
From: Eric Dumazet @ 2010-11-11  7:03 UTC (permalink / raw)
  To: Ben Greear; +Cc: netdev
In-Reply-To: <4CDB2EBC.2090905@candelatech.com>

Le mercredi 10 novembre 2010 à 15:46 -0800, Ben Greear a écrit :
> On 11/10/2010 03:36 PM, Eric Dumazet wrote:
> > Le mercredi 10 novembre 2010 à 14:53 -0800, Ben Greear a écrit :
> >
> >> I did similar, and then wrote extra code to detect a 64-bit kernel and if
> >> so assume that the counters wrap at 64 bits so I didn't have to poll so
> >> often to make sure I didn't miss a wrap for a 10G NIC.  If instead one wraps at 33
> >> bits and the other at 36, there is no way for me to deal with the wrap
> >> properly w/out explicitly knowing about that 33 and 36.
> >>
> >
> > How do you define 'wrap around' ? Maybe your definition is wrong.
> 
> Maybe so.  My algorithm looks like:
> 
>   // uint64 accum;
>   // uint32 old;
>   // uint32 new;
>   if (old > new) {
>       // This assumes counters wrap at 32 bits (ie, 0xFFFFFFFF).
>       accum += ((uint32)(0xFFFFFFFF) - old) + new;
>   }
>   else if (old < new) {
>       accum += new - old;
>   }
>   old = new;
> 
> ...
> 
> Is there some way I can do this w/out the (0xFFFFFFFF - old),
> and thus the assumption of 32-bit counters?
> 

Yes, please take a look at RRD for an example, then you can adapt to
your needs.

<quote>

http://www.mrtg.org/rrdtool/tut/rrdtutorial.en.html

At the time of writing this document, RRDtool knows of counters that are
either 32 bits or 64 bits of size. These counters can handle the
following different values:

 - 32 bits: 0 ..           4294967295
 - 64 bits: 0 .. 18446744073709551615

If these numbers look strange to you, you can view them in their
hexadecimal form:

 - 32 bits: 0 ..         FFFFFFFF
 - 64 bits: 0 .. FFFFFFFFFFFFFFFF

RRDtool handles both counters the same. If an overflow occurs and the
delta would be negative, RRDtool first adds the maximum of a small
counter + 1 to the delta. If the delta is still negative, it had to be
the large counter that wrapped. Add the maximum possible value of the
large counter + 1 and subtract the erroneously added small value.

There is a risk in this: suppose the large counter wrapped while adding
a huge delta, it could happen, theoretically, that adding the smaller
value would make the delta positive. In this unlikely case the results
would not be correct. The increase should be nearly as high as the
maximum counter value for that to happen, so chances are you would have
several other problems as well and this particular problem would not
even be worth thinking about. Even though, I did include an example, so
you can judge for yourself.

The next section gives you some numerical examples for counter-wraps.
Try to do the calculations yourself or just believe me if your
calculator can't handle the numbers :)

Correction numbers:

 - 32 bits: (4294967295 + 1) =                                4294967296
 - 64 bits: (18446744073709551615 + 1)
                                    - correction1 = 18446744069414584320

 Before:        4294967200
 Increase:                100
 Should become: 4294967300
 But really is:             4
 Delta:        -4294967196
 Correction1:  -4294967196 + 4294967296 = 100

 Before:        18446744073709551000
 Increase:                             800
 Should become: 18446744073709551800
 But really is:                        184
 Delta:        -18446744073709550816
 Correction1:  -18446744073709550816
                                + 4294967296 = -18446744069414583520
 Correction2:  -18446744069414583520
                   + 18446744069414584320 = 800

 Before:        18446744073709551615 ( maximum value )
 Increase:      18446744069414584320 ( absurd increase, minimum for
 Should become: 36893488143124135935             this example to work )
 But really is: 18446744069414584319
 Delta:                     -4294967296
 Correction1:  -4294967296 + 4294967296 = 0
 (not negative -> no correction2)

 Before:        18446744073709551615 ( maximum value )
 Increase:      18446744069414584319 ( one less increase )
 Should become: 36893488143124135934
 But really is: 18446744069414584318
 Delta:                     -4294967297
 Correction1:  -4294967297 + 4294967296 = -1
 Correction2:  -1 + 18446744069414584320 = 18446744069414584319

As you can see from the last two examples, you need strange numbers for
RRDtool to fail (provided it's bug free of course), so this should not
happen. However, SNMP or whatever method you choose to collect the data,
might also report wrong numbers occasionally. We can't prevent all
errors, but there are some things we can do. The RRDtool "create"
command takes two special parameters for this. They define the minimum
and maximum allowed values. Until now, we used "U", meaning "unknown".
If you provide values for one or both of them and if RRDtool receives
data points that are outside these limits, it will ignore those values.
For a thermometer in degrees Celsius, the absolute minimum is just under
-273. For my router, I can assume this minimum is much higher so I would
set it to 10, where as the maximum temperature I would set to 80. Any
higher and the device would be out of order.

For the speed of my car, I would never expect negative numbers and also
I would not expect a speed higher than 230. Anything else, and there
must have been an error. Remember: the opposite is not true, if the
numbers pass this check, it doesn't mean that they are correct. Always
judge the graph with a healthy dose of suspicion if it seems weird to
you.

http://www.mrtg.org/rrdtool/doc/rrdcreate.en.html

COUNTER

    is for continuous incrementing counters like the ifInOctets counter
in a router. The COUNTER data source assumes that the counter never
decreases, except when a counter overflows. The update function takes
the overflow into account. The counter is stored as a per-second rate.
When the counter overflows, RRDtool checks if the overflow happened at
the 32bit or 64bit border and acts accordingly by adding an appropriate
value to the result.

</quote>



^ permalink raw reply

* [PATCH net-next-2.6 v2] macvlan: lockless tx path
From: Eric Dumazet @ 2010-11-11  7:14 UTC (permalink / raw)
  To: David Miller; +Cc: Patrick McHardy, netdev, Ben Greear, Ben Hutchings
In-Reply-To: <1289411027.2860.248.camel@edumazet-laptop>

macvlan is a stacked device, like tunnels. We should use the lockless
mechanism we are using in tunnels and loopback.

This patch completely removes locking in TX path.

tx stat counters are added into existing percpu stat structure, renamed
from rx_stats to pcpu_stats.

Note : this reverts commit 2c11455321f37 (macvlan: add multiqueue
capability)

Note : rx_errors converted to a 32bit counter, like tx_dropped, since
they dont need 64bit range.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Patrick McHardy <kaber@trash.net>
Cc: Ben Greear <greearb@candelatech.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
---
V2: correct kerneldoc
    u32 for tx_dropped and rx_errors

 drivers/net/macvlan.c      |   80 +++++++++++++++--------------------
 include/linux/if_macvlan.h |   34 ++++++++------
 2 files changed, 55 insertions(+), 59 deletions(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 0fc9dc7..93f0ba2 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -243,18 +243,22 @@ xmit_world:
 netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
 			       struct net_device *dev)
 {
-	int i = skb_get_queue_mapping(skb);
-	struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
 	unsigned int len = skb->len;
 	int ret;
+	const struct macvlan_dev *vlan = netdev_priv(dev);
 
 	ret = macvlan_queue_xmit(skb, dev);
 	if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
-		txq->tx_packets++;
-		txq->tx_bytes += len;
-	} else
-		txq->tx_dropped++;
+		struct macvlan_pcpu_stats *pcpu_stats;
 
+		pcpu_stats = this_cpu_ptr(vlan->pcpu_stats);
+		u64_stats_update_begin(&pcpu_stats->syncp);
+		pcpu_stats->tx_packets++;
+		pcpu_stats->tx_bytes += len;
+		u64_stats_update_end(&pcpu_stats->syncp);
+	} else {
+		this_cpu_inc(vlan->pcpu_stats->tx_dropped);
+	}
 	return ret;
 }
 EXPORT_SYMBOL_GPL(macvlan_start_xmit);
@@ -414,14 +418,15 @@ static int macvlan_init(struct net_device *dev)
 	dev->state		= (dev->state & ~MACVLAN_STATE_MASK) |
 				  (lowerdev->state & MACVLAN_STATE_MASK);
 	dev->features 		= lowerdev->features & MACVLAN_FEATURES;
+	dev->features		|= NETIF_F_LLTX;
 	dev->gso_max_size	= lowerdev->gso_max_size;
 	dev->iflink		= lowerdev->ifindex;
 	dev->hard_header_len	= lowerdev->hard_header_len;
 
 	macvlan_set_lockdep_class(dev);
 
-	vlan->rx_stats = alloc_percpu(struct macvlan_rx_stats);
-	if (!vlan->rx_stats)
+	vlan->pcpu_stats = alloc_percpu(struct macvlan_pcpu_stats);
+	if (!vlan->pcpu_stats)
 		return -ENOMEM;
 
 	return 0;
@@ -431,7 +436,7 @@ static void macvlan_uninit(struct net_device *dev)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
 
-	free_percpu(vlan->rx_stats);
+	free_percpu(vlan->pcpu_stats);
 }
 
 static struct rtnl_link_stats64 *macvlan_dev_get_stats64(struct net_device *dev,
@@ -439,33 +444,38 @@ static struct rtnl_link_stats64 *macvlan_dev_get_stats64(struct net_device *dev,
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
 
-	dev_txq_stats_fold(dev, stats);
-
-	if (vlan->rx_stats) {
-		struct macvlan_rx_stats *p, accum = {0};
-		u64 rx_packets, rx_bytes, rx_multicast;
+	if (vlan->pcpu_stats) {
+		struct macvlan_pcpu_stats *p;
+		u64 rx_packets, rx_bytes, rx_multicast, tx_packets, tx_bytes;
+		u32 rx_errors = 0, tx_dropped = 0;
 		unsigned int start;
 		int i;
 
 		for_each_possible_cpu(i) {
-			p = per_cpu_ptr(vlan->rx_stats, i);
+			p = per_cpu_ptr(vlan->pcpu_stats, i);
 			do {
 				start = u64_stats_fetch_begin_bh(&p->syncp);
 				rx_packets	= p->rx_packets;
 				rx_bytes	= p->rx_bytes;
 				rx_multicast	= p->rx_multicast;
+				tx_packets	= p->tx_packets;
+				tx_bytes	= p->tx_bytes;
 			} while (u64_stats_fetch_retry_bh(&p->syncp, start));
-			accum.rx_packets	+= rx_packets;
-			accum.rx_bytes		+= rx_bytes;
-			accum.rx_multicast	+= rx_multicast;
-			/* rx_errors is an ulong, updated without syncp protection */
-			accum.rx_errors		+= p->rx_errors;
+
+			stats->rx_packets	+= rx_packets;
+			stats->rx_bytes		+= rx_bytes;
+			stats->multicast	+= rx_multicast;
+			stats->tx_packets	+= tx_packets;
+			stats->tx_bytes		+= tx_bytes;
+			/* rx_errors & tx_dropped are u32, updated
+			 * without syncp protection.
+			 */
+			rx_errors	+= p->rx_errors;
+			tx_dropped	+= p->tx_dropped;
 		}
-		stats->rx_packets = accum.rx_packets;
-		stats->rx_bytes   = accum.rx_bytes;
-		stats->rx_errors  = accum.rx_errors;
-		stats->rx_dropped = accum.rx_errors;
-		stats->multicast  = accum.rx_multicast;
+		stats->rx_errors	= rx_errors;
+		stats->rx_dropped	= rx_errors;
+		stats->tx_dropped	= tx_dropped;
 	}
 	return stats;
 }
@@ -601,25 +611,6 @@ static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[])
 	return 0;
 }
 
-static int macvlan_get_tx_queues(struct net *net,
-				 struct nlattr *tb[],
-				 unsigned int *num_tx_queues,
-				 unsigned int *real_num_tx_queues)
-{
-	struct net_device *real_dev;
-
-	if (!tb[IFLA_LINK])
-		return -EINVAL;
-
-	real_dev = __dev_get_by_index(net, nla_get_u32(tb[IFLA_LINK]));
-	if (!real_dev)
-		return -ENODEV;
-
-	*num_tx_queues      = real_dev->num_tx_queues;
-	*real_num_tx_queues = real_dev->real_num_tx_queues;
-	return 0;
-}
-
 int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
 			   struct nlattr *tb[], struct nlattr *data[],
 			   int (*receive)(struct sk_buff *skb),
@@ -743,7 +734,6 @@ int macvlan_link_register(struct rtnl_link_ops *ops)
 {
 	/* common fields */
 	ops->priv_size		= sizeof(struct macvlan_dev);
-	ops->get_tx_queues	= macvlan_get_tx_queues;
 	ops->validate		= macvlan_validate;
 	ops->maxtype		= IFLA_MACVLAN_MAX;
 	ops->policy		= macvlan_policy;
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index 8a2fd66..e28b2e4 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -25,19 +25,25 @@ struct macvlan_port;
 struct macvtap_queue;
 
 /**
- *	struct macvlan_rx_stats - MACVLAN percpu rx stats
+ *	struct macvlan_pcpu_stats - MACVLAN percpu stats
  *	@rx_packets: number of received packets
  *	@rx_bytes: number of received bytes
  *	@rx_multicast: number of received multicast packets
+ *	@tx_packets: number of transmitted packets
+ *	@tx_bytes: number of transmitted bytes
  *	@syncp: synchronization point for 64bit counters
- *	@rx_errors: number of errors
+ *	@rx_errors: number of rx errors
+ *	@tx_dropped: number of tx dropped packets
  */
-struct macvlan_rx_stats {
+struct macvlan_pcpu_stats {
 	u64			rx_packets;
 	u64			rx_bytes;
 	u64			rx_multicast;
+	u64			tx_packets;
+	u64			tx_bytes;
 	struct u64_stats_sync	syncp;
-	unsigned long		rx_errors;
+	u32			rx_errors;
+	u32			tx_dropped;
 };
 
 /*
@@ -52,7 +58,7 @@ struct macvlan_dev {
 	struct hlist_node	hlist;
 	struct macvlan_port	*port;
 	struct net_device	*lowerdev;
-	struct macvlan_rx_stats __percpu *rx_stats;
+	struct macvlan_pcpu_stats __percpu *pcpu_stats;
 	enum macvlan_mode	mode;
 	int (*receive)(struct sk_buff *skb);
 	int (*forward)(struct net_device *dev, struct sk_buff *skb);
@@ -64,18 +70,18 @@ static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
 				    unsigned int len, bool success,
 				    bool multicast)
 {
-	struct macvlan_rx_stats *rx_stats;
-
-	rx_stats = this_cpu_ptr(vlan->rx_stats);
 	if (likely(success)) {
-		u64_stats_update_begin(&rx_stats->syncp);
-		rx_stats->rx_packets++;;
-		rx_stats->rx_bytes += len;
+		struct macvlan_pcpu_stats *pcpu_stats;
+
+		pcpu_stats = this_cpu_ptr(vlan->pcpu_stats);
+		u64_stats_update_begin(&pcpu_stats->syncp);
+		pcpu_stats->rx_packets++;
+		pcpu_stats->rx_bytes += len;
 		if (multicast)
-			rx_stats->rx_multicast++;
-		u64_stats_update_end(&rx_stats->syncp);
+			pcpu_stats->rx_multicast++;
+		u64_stats_update_end(&pcpu_stats->syncp);
 	} else {
-		rx_stats->rx_errors++;
+		this_cpu_inc(vlan->pcpu_stats->rx_errors);
 	}
 }
 



^ permalink raw reply related

* Re: [PATCH] Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v4)
From: Maciej Żenczykowski @ 2010-11-11  8:03 UTC (permalink / raw)
  To: nhorman; +Cc: netdev, davem, eric.dumazet
In-Reply-To: <1289416194-1844-1-git-send-email-nhorman@tuxdriver.com>

On Wed, Nov 10, 2010 at 11:09,  <nhorman@tuxdriver.com> wrote:
> From: Neil Horman <nhorman@tuxdriver.com>
>
> Version 4 of this patch.
>
> Change notes:
> 1) Removed extra memset.  Didn't think kcalloc added a GFP_ZERO the way kzalloc did :)
>
> Summary:
> It was shown to me recently that systems under high load were driven very deep
> into swap when tcpdump was run.  The reason this happened was because the
> AF_PACKET protocol has a SET_RINGBUFFER socket option that allows the user space
> application to specify how many entries an AF_PACKET socket will have and how
> large each entry will be.  It seems the default setting for tcpdump is to set
> the ring buffer to 32 entries of 64 Kb each, which implies 32 order 5
> allocation.  Thats difficult under good circumstances, and horrid under memory
> pressure.
>
> I thought it would be good to make that a bit more usable.  I was going to do a
> simple conversion of the ring buffer from contigous pages to iovecs, but
> unfortunately, the metadata which AF_PACKET places in these buffers can easily
> span a page boundary, and given that these buffers get mapped into user space,
> and the data layout doesn't easily allow for a change to padding between frames
> to avoid that, a simple iovec change is just going to break user space ABI
> consistency.
>
> So I've done this, I've added a three tiered mechanism to the af_packet set_ring
> socket option.  It attempts to allocate memory in the following order:
>
> 1) Using __get_free_pages with GFP_NORETRY set, so as to fail quickly without
> digging into swap
>
> 2) Using vmalloc
>
> 3) Using __get_free_pages with GFP_NORETRY clear, causing us to try as hard as
> needed to get the memory
>
> The effect is that we don't disturb the system as much when we're under load,
> while still being able to conduct tcpdumps effectively.
>
> Tested successfully by me.
>
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> ---
>  net/packet/af_packet.c |   84 ++++++++++++++++++++++++++++++++++++++---------
>  1 files changed, 68 insertions(+), 16 deletions(-)
>
> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index 3616f27..a372390 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -163,8 +163,14 @@ struct packet_mreq_max {
>  static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
>                int closing, int tx_ring);
>
> +#define PGV_FROM_VMALLOC 1
> +struct pgv {
> +       char *buffer;
> +       unsigned char flags;
> +};
> +
>  struct packet_ring_buffer {
> -       char                    **pg_vec;
> +       struct pgv              *pg_vec;
>        unsigned int            head;
>        unsigned int            frames_per_block;
>        unsigned int            frame_size;
> @@ -283,7 +289,8 @@ static void *packet_lookup_frame(struct packet_sock *po,
>        pg_vec_pos = position / rb->frames_per_block;
>        frame_offset = position % rb->frames_per_block;
>
> -       h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
> +       h.raw = rb->pg_vec[pg_vec_pos].buffer +
> +               (frame_offset * rb->frame_size);
>
>        if (status != __packet_get_status(po, h.raw))
>                return NULL;
> @@ -2322,37 +2329,74 @@ static const struct vm_operations_struct packet_mmap_ops = {
>        .close  =       packet_mm_close,
>  };
>
> -static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
> +static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
> +                       unsigned int len)
>  {
>        int i;
>
>        for (i = 0; i < len; i++) {
> -               if (likely(pg_vec[i]))
> -                       free_pages((unsigned long) pg_vec[i], order);
> +               if (likely(pg_vec[i].buffer)) {
> +                       if (pg_vec[i].flags & PGV_FROM_VMALLOC)
> +                               vfree(pg_vec[i].buffer);
> +                       else
> +                               free_pages((unsigned long)pg_vec[i].buffer,
> +                                          order);
> +                       pg_vec[i].buffer = NULL;
> +               }
>        }
>        kfree(pg_vec);
>  }
>
> -static inline char *alloc_one_pg_vec_page(unsigned long order)
> +static inline char *alloc_one_pg_vec_page(unsigned long order,
> +                                         unsigned char *flags)
>  {
> -       gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
> +       char *buffer = NULL;
> +       gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
> +                         __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
> +
> +       buffer = (char *) __get_free_pages(gfp_flags, order);
> +
> +       if (buffer)
> +               return buffer;
> +
> +       /*
> +        * __get_free_pages failed, fall back to vmalloc
> +        */
> +       *flags |= PGV_FROM_VMALLOC;
> +       buffer = vmalloc((1 << order) * PAGE_SIZE);
>
> -       return (char *) __get_free_pages(gfp_flags, order);
> +       if (buffer)
> +               return buffer;
> +
> +       /*
> +        * vmalloc failed, lets dig into swap here
> +        */
> +       *flags = 0;
> +       gfp_flags &= ~__GFP_NORETRY;
> +       buffer = (char *)__get_free_pages(gfp_flags, order);
> +       if (buffer)
> +               return buffer;
> +
> +       /*
> +        * complete and utter failure
> +        */
> +       return NULL;
>  }
>
> -static char **alloc_pg_vec(struct tpacket_req *req, int order)
> +static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
>  {
>        unsigned int block_nr = req->tp_block_nr;
> -       char **pg_vec;
> +       struct pgv *pg_vec;
>        int i;
>
> -       pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
> +       pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
>        if (unlikely(!pg_vec))
>                goto out;
>
>        for (i = 0; i < block_nr; i++) {
> -               pg_vec[i] = alloc_one_pg_vec_page(order);
> -               if (unlikely(!pg_vec[i]))
> +               pg_vec[i].buffer = alloc_one_pg_vec_page(order,
> +                                                        &pg_vec[i].flags);
> +               if (unlikely(!pg_vec[i].buffer))
>                        goto out_free_pgvec;
>        }
>
> @@ -2361,6 +2405,7 @@ out:
>
>  out_free_pgvec:
>        free_pg_vec(pg_vec, order, block_nr);
> +       kfree(pg_vec);
>        pg_vec = NULL;
>        goto out;
>  }
> @@ -2368,7 +2413,7 @@ out_free_pgvec:
>  static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
>                int closing, int tx_ring)
>  {
> -       char **pg_vec = NULL;
> +       struct pgv *pg_vec = NULL;
>        struct packet_sock *po = pkt_sk(sk);
>        int was_running, order = 0;
>        struct packet_ring_buffer *rb;
> @@ -2530,15 +2575,22 @@ static int packet_mmap(struct file *file, struct socket *sock,
>                        continue;
>
>                for (i = 0; i < rb->pg_vec_len; i++) {
> -                       struct page *page = virt_to_page(rb->pg_vec[i]);
> +                       struct page *page;
> +                       void *kaddr = rb->pg_vec[i].buffer;
>                        int pg_num;
>
>                        for (pg_num = 0; pg_num < rb->pg_vec_pages;
> -                                       pg_num++, page++) {
> +                                       pg_num++) {
> +                               if (rb->pg_vec[i].flags & PGV_FROM_VMALLOC)
> +                                       page = vmalloc_to_page(kaddr);
> +                               else
> +                                       page = virt_to_page(kaddr);
> +
>                                err = vm_insert_page(vma, start, page);
>                                if (unlikely(err))
>                                        goto out;
>                                start += PAGE_SIZE;
> +                               kaddr += PAGE_SIZE;
>                        }
>                }
>        }
> --
> 1.7.2.3
>
>
Acked-by: Maciej Żenczykowski <zenczykowski@gmail.com>
Reported-by: Maciej Żenczykowski <zenczykowski@gmail.com>

Nice to see this.

-- Maciej

^ permalink raw reply

* RE: [PATCH v15 00/17] Provide a zero-copy method on KVM virtio-net.
From: Xin, Xiaohui @ 2010-11-11  8:28 UTC (permalink / raw)
  To: David Miller
  Cc: netdev@vger.kernel.org, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, mst@redhat.com, mingo@elte.hu,
	herbert@gondor.apana.org.au, jdike@linux.intel.com
In-Reply-To: <20101110.094641.193701683.davem@davemloft.net>

>-----Original Message-----
>From: David Miller [mailto:davem@davemloft.net]
>Sent: Thursday, November 11, 2010 1:47 AM
>To: Xin, Xiaohui
>Cc: netdev@vger.kernel.org; kvm@vger.kernel.org; linux-kernel@vger.kernel.org;
>mst@redhat.com; mingo@elte.hu; herbert@gondor.apana.org.au; jdike@linux.intel.com
>Subject: Re: [PATCH v15 00/17] Provide a zero-copy method on KVM virtio-net.
>
>From: xiaohui.xin@intel.com
>Date: Wed, 10 Nov 2010 17:23:28 +0800
>
>> From: Xin Xiaohui <xiaohui.xin@intel.com>
>>
>>>2) The idea to key off of skb->dev in skb_release_data() is
>>>   fundamentally flawed since many actions can change skb->dev on you,
>>>   which will end up causing a leak of your external data areas.
>>
>> How about this one? If the destructor_arg is not a good candidate,
>> then I have to add an apparent field in shinfo.
>
>If destructor_arg is actually a net_device pointer or similar,
>you will need to take a reference count on it or similar.
>
Do you mean destructor_arg will be consumed by other user?
If that case, may I add a new structure member in shinfo?
Thus only zero-copy will use it, and no need for the reference count.

>Which means --> good bye performance especially on SMP.
>
>You're going to be adding new serialization points and at
>least two new atomics per packet.

^ permalink raw reply

* Re: [PATCH 1/2] r6040: fix multicast operations
From: Shawn Lin @ 2010-11-11  8:39 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: netdev, Marc Leclerc, Albert Chen, David Miller, Ben Hutchings
In-Reply-To: <201011041104.24681.florian@openwrt.org>

The original code does not work well when the number of mulitcast
address to handle is greater than MCAST_MAX. It only enable promiscous
mode instead of multicast hash table mode, so the hash table function
will not be activated and all multicast frames will be recieved in this
condition.

This patch fixes the following issues with the r6040 NIC operating in
multicast:

1) When the IFF_ALLMULTI flag is set, we should write 0xffff to the NIC
hash table registers to make it process multicast traffic.

2) When the number of multicast address to handle is smaller than
MCAST_MAX, we should use the NIC multicast registers MID1_{L,M,H}.

3) The hashing of the address was not correct, due to an invalid
substraction (15 - (crc & 0x0f)) instead of (crc & 0x0f) and an
incorrect crc algorithm (ether_crc_le) instead of (ether_crc).

4) If necessary, we should set HASH_EN flag in MCR0 to enable multicast
hash table function.


The version is for net-next-2.6:

Reported-by: Marc Leclerc <marc-leclerc@signaturealpha.com>
Tested-by: Marc Leclerc <marc-leclerc@signaturealpha.com>
Signed-off-by: Shawn Lin <shawn@dmp.com.tw>
Signed-off-by: Albert Chen <albert.chen@rdc.com.tw>
---
diff --git a/drivers/net/r6040.c b/drivers/net/r6040.c
index 0b014c8..e88e171 100644
--- a/drivers/net/r6040.c
+++ b/drivers/net/r6040.c
@@ -69,6 +69,8 @@
 
 /* MAC registers */
 #define MCR0           0x00    /* Control register 0 */
+#define  PROMISC       0x0020  /* Promiscuous mode */
+#define  HASH_EN       0x0100  /* Enable multicast hash table function
*/
 #define MCR1           0x04    /* Control register 1 */
 #define  MAC_RST       0x0001  /* Reset the MAC */
 #define MBCR           0x08    /* Bus control */
@@ -851,77 +853,84 @@ static void r6040_multicast_list(struct net_device
*dev)
 {
        struct r6040_private *lp = netdev_priv(dev);
        void __iomem *ioaddr = lp->base;
-       u16 *adrp;
-       u16 reg;
        unsigned long flags;
        struct netdev_hw_addr *ha;
        int i;
+       u16 hash_table[4] = { 0, };
 
-       /* MAC Address */
-       adrp = (u16 *)dev->dev_addr;
-       iowrite16(adrp[0], ioaddr + MID_0L);
-       iowrite16(adrp[1], ioaddr + MID_0M);
-       iowrite16(adrp[2], ioaddr + MID_0H);
-
-       /* Promiscous Mode */
        spin_lock_irqsave(&lp->lock, flags);
 
        /* Clear AMCP & PROM bits */
-       reg = ioread16(ioaddr) & ~0x0120;
+       lp->mcr0 = ioread16(ioaddr + MCR0) & ~(PROMISC | HASH_EN);
+
+       /* Promiscuous Mode */
        if (dev->flags & IFF_PROMISC) {
-               reg |= 0x0020;
-               lp->mcr0 |= 0x0020;
+               lp->mcr0 |= PROMISC;
        }
-       /* Too many multicast addresses
-        * accept all traffic */
-       else if ((netdev_mc_count(dev) > MCAST_MAX) ||
-                (dev->flags & IFF_ALLMULTI))
-               reg |= 0x0020;
-
-       iowrite16(reg, ioaddr);
-       spin_unlock_irqrestore(&lp->lock, flags);
+       /* Enable multicast hash table function to
+        * receive all multicast packets. */
+       else if (dev->flags & IFF_ALLMULTI) {
+               lp->mcr0 |= HASH_EN;
+
+               for (i = 0; i < MCAST_MAX ; i++) {
+                       iowrite16(0, ioaddr + MID_1L + 8 * i);
+                       iowrite16(0, ioaddr + MID_1M + 8 * i);
+                       iowrite16(0, ioaddr + MID_1H + 8 * i);
+               }
 
-       /* Build the hash table */
-       if (netdev_mc_count(dev) > MCAST_MAX) {
-               u16 hash_table[4];
+               for (i = 0; i < 4; i++)
+                       hash_table[i] = 0xffff;
+       }
+       /* Use internal multicast address registers if the number of
+        * multicast addresses is not greater than MCAST_MAX. */
+       else if (netdev_mc_count(dev) <= MCAST_MAX) {
+               i = 0;
+               netdev_for_each_mc_addr(ha, dev) {
+                       u16 *adrp = (u16 *) ha->addr;
+                       iowrite16(adrp[0], ioaddr + MID_1L + 8 * i);
+                       iowrite16(adrp[1], ioaddr + MID_1M + 8 * i);
+                       iowrite16(adrp[2], ioaddr + MID_1H + 8 * i);
+                       i++;
+               }
+               while (i < MCAST_MAX) {
+                       iowrite16(0, ioaddr + MID_1L + 8 * i);
+                       iowrite16(0, ioaddr + MID_1M + 8 * i);
+                       iowrite16(0, ioaddr + MID_1H + 8 * i);
+                       i++;
+               }
+       }
+       /* Otherwise, Enable multicast hash table function. */
+       else {
                u32 crc;
 
-               for (i = 0; i < 4; i++)
-                       hash_table[i] = 0;
+               lp->mcr0 |= HASH_EN;
 
-               netdev_for_each_mc_addr(ha, dev) {
-                       char *addrs = ha->addr;
+               for (i = 0; i < MCAST_MAX ; i++) {
+                       iowrite16(0, ioaddr + MID_1L + 8 * i);
+                       iowrite16(0, ioaddr + MID_1M + 8 * i);
+                       iowrite16(0, ioaddr + MID_1H + 8 * i);
+               }
 
-                       if (!(*addrs & 1))
-                               continue;
+               /* Build multicast hash table */
+               netdev_for_each_mc_addr(ha, dev) {
+                       u8 *addrs = ha->addr;
 
-                       crc = ether_crc_le(6, addrs);
+                       crc = ether_crc(ETH_ALEN, addrs);
                        crc >>= 26;
-                       hash_table[crc >> 4] |= 1 << (15 - (crc & 0xf));
+                       hash_table[crc >> 4] |= 1 << (crc & 0xf);
                }
-               /* Fill the MAC hash tables with their values */
+       }
+       iowrite16(lp->mcr0, ioaddr + MCR0);
+
+       /* Fill the MAC hash tables with their values */
+       if (lp->mcr0 && HASH_EN) {
                iowrite16(hash_table[0], ioaddr + MAR0);
                iowrite16(hash_table[1], ioaddr + MAR1);
                iowrite16(hash_table[2], ioaddr + MAR2);
                iowrite16(hash_table[3], ioaddr + MAR3);
        }
-       /* Multicast Address 1~4 case */
-       i = 0;
-       netdev_for_each_mc_addr(ha, dev) {
-               if (i >= MCAST_MAX)
-                       break;
-               adrp = (u16 *) ha->addr;
-               iowrite16(adrp[0], ioaddr + MID_1L + 8 * i);
-               iowrite16(adrp[1], ioaddr + MID_1M + 8 * i);
-               iowrite16(adrp[2], ioaddr + MID_1H + 8 * i);
-               i++;
-       }
-       while (i < MCAST_MAX) {
-               iowrite16(0xffff, ioaddr + MID_1L + 8 * i);
-               iowrite16(0xffff, ioaddr + MID_1M + 8 * i);
-               iowrite16(0xffff, ioaddr + MID_1H + 8 * i);
-               i++;
-       }
+
+       spin_unlock_irqrestore(&lp->lock, flags);
 }
 
 static void netdev_get_drvinfo(struct net_device *dev,
---


The version is for 2.6.32.y and 2.6.27.y:

Reported-by: Marc Leclerc <marc-leclerc@signaturealpha.com>
Tested-by: Marc Leclerc <marc-leclerc@signaturealpha.com>
Signed-off-by: Shawn Lin <shawn@dmp.com.tw>
Signed-off-by: Albert Chen <albert.chen@rdc.com.tw>
---
diff --git a/drivers/net/r6040.c b/drivers/net/r6040.c
index 9ee9f01..f9af419 100644
--- a/drivers/net/r6040.c
+++ b/drivers/net/r6040.c
@@ -69,6 +69,8 @@
 
 /* MAC registers */
 #define MCR0           0x00    /* Control register 0 */
+#define  PROMISC       0x0020  /* Promiscuous mode */
+#define  HASH_EN       0x0100  /* Enable multicast hash table function
*/
 #define MCR1           0x04    /* Control register 1 */
 #define  MAC_RST       0x0001  /* Reset the MAC */
 #define MBCR           0x08    /* Bus control */
@@ -935,76 +937,88 @@ static void r6040_multicast_list(struct net_device
*dev)
 {
        struct r6040_private *lp = netdev_priv(dev);
        void __iomem *ioaddr = lp->base;
-       u16 *adrp;
-       u16 reg;
        unsigned long flags;
        struct dev_mc_list *dmi = dev->mc_list;
        int i;
+       u16 hash_table[4] = { 0, };
 
-       /* MAC Address */
-       adrp = (u16 *)dev->dev_addr;
-       iowrite16(adrp[0], ioaddr + MID_0L);
-       iowrite16(adrp[1], ioaddr + MID_0M);
-       iowrite16(adrp[2], ioaddr + MID_0H);
-
-       /* Promiscous Mode */
        spin_lock_irqsave(&lp->lock, flags);
 
        /* Clear AMCP & PROM bits */
-       reg = ioread16(ioaddr) & ~0x0120;
+       lp->mcr0 = ioread16(ioaddr + MCR0) & ~(PROMISC | HASH_EN);
+
+       /* Promiscuous Mode */
        if (dev->flags & IFF_PROMISC) {
-               reg |= 0x0020;
-               lp->mcr0 |= 0x0020;
+               lp->mcr0 |= PROMISC;
        }
-       /* Too many multicast addresses
-        * accept all traffic */
-       else if ((dev->mc_count > MCAST_MAX)
-               || (dev->flags & IFF_ALLMULTI))
-               reg |= 0x0020;
+       /* Enable multicast hash table function to
+        * receive all multicast packets. */
+       else if (dev->flags & IFF_ALLMULTI) {
+               lp->mcr0 |= HASH_EN;
+
+               for (i = 0; i < MCAST_MAX ; i++) {
+                       iowrite16(0, ioaddr + MID_1L + 8 * i);
+                       iowrite16(0, ioaddr + MID_1M + 8 * i);
+                       iowrite16(0, ioaddr + MID_1H + 8 * i);
+               }
 
-       iowrite16(reg, ioaddr);
-       spin_unlock_irqrestore(&lp->lock, flags);
+               for (i = 0; i < 4; i++)
+                       hash_table[i] = 0xffff;
+       }
+       /* Use internal multicast address registers if the number of
+        * multicast addresses is not greater than MCAST_MAX. */
+       else if (dev->mc_count <= MCAST_MAX) {
+               i = 0;
+               while (i < dev->mc_count) {
+                       u16 *adrp = (u16 *) dmi->dmi_addr;
+                       dmi = dmi->next;
 
-       /* Build the hash table */
-       if (dev->mc_count > MCAST_MAX) {
-               u16 hash_table[4];
+                       iowrite16(adrp[0], ioaddr + MID_1L + 8 * i);
+                       iowrite16(adrp[1], ioaddr + MID_1M + 8 * i);
+                       iowrite16(adrp[2], ioaddr + MID_1H + 8 * i);
+                       i++;
+               }
+               while (i < MCAST_MAX) {
+                       iowrite16(0, ioaddr + MID_1L + 8 * i);
+                       iowrite16(0, ioaddr + MID_1M + 8 * i);
+                       iowrite16(0, ioaddr + MID_1H + 8 * i);
+                       i++;
+               }
+       }
+       /* Otherwise, Enable multicast hash table function. */
+       else {
                u32 crc;
 
-               for (i = 0; i < 4; i++)
-                       hash_table[i] = 0;
+               lp->mcr0 |= HASH_EN;
 
-               for (i = 0; i < dev->mc_count; i++) {
-                       char *addrs = dmi->dmi_addr;
+               for (i = 0; i < MCAST_MAX ; i++) {
+                       iowrite16(0, ioaddr + MID_1L + 8 * i);
+                       iowrite16(0, ioaddr + MID_1M + 8 * i);
+                       iowrite16(0, ioaddr + MID_1H + 8 * i);
+               }
 
+               /* Build multicast hash table */
+               for (i = 0; i < dev->mc_count; i++) {
+                       u8 *addrs = dmi->dmi_addr;
                        dmi = dmi->next;
 
-                       if (!(*addrs & 1))
-                               continue;
-
-                       crc = ether_crc_le(6, addrs);
+                       crc = ether_crc(ETH_ALEN, addrs);
                        crc >>= 26;
-                       hash_table[crc >> 4] |= 1 << (15 - (crc & 0xf));
+                       hash_table[crc >> 4] |= 1 << (crc & 0xf);
                }
-               /* Fill the MAC hash tables with their values */
+
+       }
+       iowrite16(lp->mcr0, ioaddr + MCR0);
+
+       /* Fill the MAC hash tables with their values */
+       if (lp->mcr0 && HASH_EN) {
                iowrite16(hash_table[0], ioaddr + MAR0);
                iowrite16(hash_table[1], ioaddr + MAR1);
                iowrite16(hash_table[2], ioaddr + MAR2);
                iowrite16(hash_table[3], ioaddr + MAR3);
        }
-       /* Multicast Address 1~4 case */
-       dmi = dev->mc_list;
-       for (i = 0, dmi; (i < dev->mc_count) && (i < MCAST_MAX); i++) {
-               adrp = (u16 *)dmi->dmi_addr;
-               iowrite16(adrp[0], ioaddr + MID_1L + 8*i);
-               iowrite16(adrp[1], ioaddr + MID_1M + 8*i);
-               iowrite16(adrp[2], ioaddr + MID_1H + 8*i);
-               dmi = dmi->next;
-       }
-       for (i = dev->mc_count; i < MCAST_MAX; i++) {
-               iowrite16(0xffff, ioaddr + MID_1L + 8*i);
-               iowrite16(0xffff, ioaddr + MID_1M + 8*i);
-               iowrite16(0xffff, ioaddr + MID_1H + 8*i);
-       }
+
+       spin_unlock_irqrestore(&lp->lock, flags);
 }
 
 static void netdev_get_drvinfo(struct net_device *dev,
---



===========================================================================================
The privileged confidential information contained in this email is intended for use only by the addressees as indicated by the original sender of this email. 
If you are not the addressee indicated in this email or are not responsible for delivery of the email to such a person, please kindly reply to the sender indicating this fact and delete all copies of it from your computer and network server immediately. 
Your cooperation is highly appreciated. It is advised that any unauthorized use of confidential information of DM&P Group is strictly prohibited; and any information in this email irrelevant to the official business of DM&P Group shall be deemed as neither given nor endorsed by DM&P Group.

===========================================================================================

^ permalink raw reply related

* [PATCH net-next-2.6] vlan: lockless transmit path
From: Eric Dumazet @ 2010-11-11  9:42 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Patrick McHardy

vlan is a stacked device, like tunnels. We should use the lockless
mechanism we are using in tunnels and loopback.

This patch completely removes locking in TX path.

tx stat counters are added into existing percpu stat structure, renamed
from vlan_rx_stats to vlan_pcpu_stats.

Note : this partially reverts commit 2e59af3dcbdf (vlan: multiqueue vlan
device)

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Patrick McHardy <kaber@trash.net>
---
 net/8021q/vlan.c         |    4 -
 net/8021q/vlan.h         |   16 +++++--
 net/8021q/vlan_core.c    |    4 -
 net/8021q/vlan_dev.c     |   78 +++++++++++++++++++++----------------
 net/8021q/vlan_netlink.c |   20 ---------
 5 files changed, 59 insertions(+), 63 deletions(-)

diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 52077ca..2f54ce8 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -272,13 +272,11 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
 		snprintf(name, IFNAMSIZ, "vlan%.4i", vlan_id);
 	}
 
-	new_dev = alloc_netdev_mq(sizeof(struct vlan_dev_info), name,
-				  vlan_setup, real_dev->num_tx_queues);
+	new_dev = alloc_netdev(sizeof(struct vlan_dev_info), name, vlan_setup);
 
 	if (new_dev == NULL)
 		return -ENOBUFS;
 
-	netif_copy_real_num_queues(new_dev, real_dev);
 	dev_net_set(new_dev, net);
 	/* need 4 bytes for extra VLAN header info,
 	 * hope the underlying device can handle it.
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index db01b31..a2dafb1 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -19,19 +19,25 @@ struct vlan_priority_tci_mapping {
 
 
 /**
- *	struct vlan_rx_stats - VLAN percpu rx stats
+ *	struct vlan_pcpu_stats - VLAN percpu rx/tx stats
  *	@rx_packets: number of received packets
  *	@rx_bytes: number of received bytes
  *	@rx_multicast: number of received multicast packets
+ *	@tx_packets: number of transmitted packets
+ *	@tx_bytes: number of transmitted bytes
  *	@syncp: synchronization point for 64bit counters
- *	@rx_errors: number of errors
+ *	@rx_errors: number of rx errors
+ *	@tx_dropped: number of tx drops
  */
-struct vlan_rx_stats {
+struct vlan_pcpu_stats {
 	u64			rx_packets;
 	u64			rx_bytes;
 	u64			rx_multicast;
+	u64			tx_packets;
+	u64			tx_bytes;
 	struct u64_stats_sync	syncp;
-	unsigned long		rx_errors;
+	u32			rx_errors;
+	u32			tx_dropped;
 };
 
 /**
@@ -64,7 +70,7 @@ struct vlan_dev_info {
 	struct proc_dir_entry			*dent;
 	unsigned long				cnt_inc_headroom_on_tx;
 	unsigned long				cnt_encap_on_xmit;
-	struct vlan_rx_stats __percpu		*vlan_rx_stats;
+	struct vlan_pcpu_stats __percpu		*vlan_pcpu_stats;
 };
 
 static inline struct vlan_dev_info *vlan_dev_info(const struct net_device *dev)
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 69b2f79..ce8e3ab 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -9,7 +9,7 @@ bool vlan_hwaccel_do_receive(struct sk_buff **skbp)
 	struct sk_buff *skb = *skbp;
 	u16 vlan_id = skb->vlan_tci & VLAN_VID_MASK;
 	struct net_device *vlan_dev;
-	struct vlan_rx_stats *rx_stats;
+	struct vlan_pcpu_stats *rx_stats;
 
 	vlan_dev = vlan_find_dev(skb->dev, vlan_id);
 	if (!vlan_dev) {
@@ -26,7 +26,7 @@ bool vlan_hwaccel_do_receive(struct sk_buff **skbp)
 	skb->priority = vlan_get_ingress_priority(vlan_dev, skb->vlan_tci);
 	skb->vlan_tci = 0;
 
-	rx_stats = this_cpu_ptr(vlan_dev_info(vlan_dev)->vlan_rx_stats);
+	rx_stats = this_cpu_ptr(vlan_dev_info(vlan_dev)->vlan_pcpu_stats);
 
 	u64_stats_update_begin(&rx_stats->syncp);
 	rx_stats->rx_packets++;
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 14e3d1f..3a67483 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -141,7 +141,7 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
 		  struct packet_type *ptype, struct net_device *orig_dev)
 {
 	struct vlan_hdr *vhdr;
-	struct vlan_rx_stats *rx_stats;
+	struct vlan_pcpu_stats *rx_stats;
 	struct net_device *vlan_dev;
 	u16 vlan_id;
 	u16 vlan_tci;
@@ -177,7 +177,7 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
 	} else {
 		skb->dev = vlan_dev;
 
-		rx_stats = this_cpu_ptr(vlan_dev_info(skb->dev)->vlan_rx_stats);
+		rx_stats = this_cpu_ptr(vlan_dev_info(skb->dev)->vlan_pcpu_stats);
 
 		u64_stats_update_begin(&rx_stats->syncp);
 		rx_stats->rx_packets++;
@@ -313,8 +313,6 @@ static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev,
 static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
 					    struct net_device *dev)
 {
-	int i = skb_get_queue_mapping(skb);
-	struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
 	struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data);
 	unsigned int len;
 	int ret;
@@ -335,7 +333,7 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
 		vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb);
 		skb = __vlan_put_tag(skb, vlan_tci);
 		if (!skb) {
-			txq->tx_dropped++;
+			this_cpu_inc(vlan_dev_info(dev)->vlan_pcpu_stats->tx_dropped);
 			return NETDEV_TX_OK;
 		}
 
@@ -349,19 +347,22 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
 	ret = dev_queue_xmit(skb);
 
 	if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
-		txq->tx_packets++;
-		txq->tx_bytes += len;
-	} else
-		txq->tx_dropped++;
+		struct vlan_pcpu_stats *stats;
 
+		stats = this_cpu_ptr(vlan_dev_info(dev)->vlan_pcpu_stats);
+		u64_stats_update_begin(&stats->syncp);
+		stats->tx_packets++;
+		stats->tx_bytes += len;
+		u64_stats_update_begin(&stats->syncp);
+	} else {
+		this_cpu_inc(vlan_dev_info(dev)->vlan_pcpu_stats->tx_dropped);
+	}
 	return ret;
 }
 
 static netdev_tx_t vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb,
 						    struct net_device *dev)
 {
-	int i = skb_get_queue_mapping(skb);
-	struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
 	u16 vlan_tci;
 	unsigned int len;
 	int ret;
@@ -375,10 +376,16 @@ static netdev_tx_t vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb,
 	ret = dev_queue_xmit(skb);
 
 	if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
-		txq->tx_packets++;
-		txq->tx_bytes += len;
-	} else
-		txq->tx_dropped++;
+		struct vlan_pcpu_stats *stats;
+
+		stats = this_cpu_ptr(vlan_dev_info(dev)->vlan_pcpu_stats);
+		u64_stats_update_begin(&stats->syncp);
+		stats->tx_packets++;
+		stats->tx_bytes += len;
+		u64_stats_update_begin(&stats->syncp);
+	} else {
+		this_cpu_inc(vlan_dev_info(dev)->vlan_pcpu_stats->tx_dropped);
+	}
 
 	return ret;
 }
@@ -738,6 +745,7 @@ static int vlan_dev_init(struct net_device *dev)
 		      (1<<__LINK_STATE_PRESENT);
 
 	dev->features |= real_dev->features & real_dev->vlan_features;
+	dev->features |= NETIF_F_LLTX;
 	dev->gso_max_size = real_dev->gso_max_size;
 
 	/* ipv6 shared card related stuff */
@@ -773,8 +781,8 @@ static int vlan_dev_init(struct net_device *dev)
 
 	vlan_dev_set_lockdep_class(dev, subclass);
 
-	vlan_dev_info(dev)->vlan_rx_stats = alloc_percpu(struct vlan_rx_stats);
-	if (!vlan_dev_info(dev)->vlan_rx_stats)
+	vlan_dev_info(dev)->vlan_pcpu_stats = alloc_percpu(struct vlan_pcpu_stats);
+	if (!vlan_dev_info(dev)->vlan_pcpu_stats)
 		return -ENOMEM;
 
 	return 0;
@@ -786,8 +794,8 @@ static void vlan_dev_uninit(struct net_device *dev)
 	struct vlan_dev_info *vlan = vlan_dev_info(dev);
 	int i;
 
-	free_percpu(vlan->vlan_rx_stats);
-	vlan->vlan_rx_stats = NULL;
+	free_percpu(vlan->vlan_pcpu_stats);
+	vlan->vlan_pcpu_stats = NULL;
 	for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) {
 		while ((pm = vlan->egress_priority_map[i]) != NULL) {
 			vlan->egress_priority_map[i] = pm->next;
@@ -825,33 +833,37 @@ static u32 vlan_ethtool_get_flags(struct net_device *dev)
 
 static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
-	dev_txq_stats_fold(dev, stats);
 
-	if (vlan_dev_info(dev)->vlan_rx_stats) {
-		struct vlan_rx_stats *p, accum = {0};
+	if (vlan_dev_info(dev)->vlan_pcpu_stats) {
+		struct vlan_pcpu_stats *p;
+		u32 rx_errors = 0, tx_dropped = 0;
 		int i;
 
 		for_each_possible_cpu(i) {
-			u64 rxpackets, rxbytes, rxmulticast;
+			u64 rxpackets, rxbytes, rxmulticast, txpackets, txbytes;
 			unsigned int start;
 
-			p = per_cpu_ptr(vlan_dev_info(dev)->vlan_rx_stats, i);
+			p = per_cpu_ptr(vlan_dev_info(dev)->vlan_pcpu_stats, i);
 			do {
 				start = u64_stats_fetch_begin_bh(&p->syncp);
 				rxpackets	= p->rx_packets;
 				rxbytes		= p->rx_bytes;
 				rxmulticast	= p->rx_multicast;
+				txpackets	= p->tx_packets;
+				txbytes		= p->tx_bytes;
 			} while (u64_stats_fetch_retry_bh(&p->syncp, start));
-			accum.rx_packets += rxpackets;
-			accum.rx_bytes   += rxbytes;
-			accum.rx_multicast += rxmulticast;
-			/* rx_errors is ulong, not protected by syncp */
-			accum.rx_errors  += p->rx_errors;
+
+			stats->rx_packets	+= rxpackets;
+			stats->rx_bytes		+= rxbytes;
+			stats->multicast	+= rxmulticast;
+			stats->tx_packets	+= txpackets;
+			stats->tx_bytes		+= txbytes;
+			/* rx_errors & tx_dropped are u32 */
+			rx_errors	+= p->rx_errors;
+			tx_dropped	+= p->tx_dropped;
 		}
-		stats->rx_packets = accum.rx_packets;
-		stats->rx_bytes   = accum.rx_bytes;
-		stats->rx_errors  = accum.rx_errors;
-		stats->multicast  = accum.rx_multicast;
+		stats->rx_errors  = rx_errors;
+		stats->tx_dropped = tx_dropped;
 	}
 	return stats;
 }
diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
index ddc1057..be9a5c1 100644
--- a/net/8021q/vlan_netlink.c
+++ b/net/8021q/vlan_netlink.c
@@ -101,25 +101,6 @@ static int vlan_changelink(struct net_device *dev,
 	return 0;
 }
 
-static int vlan_get_tx_queues(struct net *net,
-			      struct nlattr *tb[],
-			      unsigned int *num_tx_queues,
-			      unsigned int *real_num_tx_queues)
-{
-	struct net_device *real_dev;
-
-	if (!tb[IFLA_LINK])
-		return -EINVAL;
-
-	real_dev = __dev_get_by_index(net, nla_get_u32(tb[IFLA_LINK]));
-	if (!real_dev)
-		return -ENODEV;
-
-	*num_tx_queues      = real_dev->num_tx_queues;
-	*real_num_tx_queues = real_dev->real_num_tx_queues;
-	return 0;
-}
-
 static int vlan_newlink(struct net *src_net, struct net_device *dev,
 			struct nlattr *tb[], struct nlattr *data[])
 {
@@ -237,7 +218,6 @@ struct rtnl_link_ops vlan_link_ops __read_mostly = {
 	.maxtype	= IFLA_VLAN_MAX,
 	.policy		= vlan_policy,
 	.priv_size	= sizeof(struct vlan_dev_info),
-	.get_tx_queues  = vlan_get_tx_queues,
 	.setup		= vlan_setup,
 	.validate	= vlan_validate,
 	.newlink	= vlan_newlink,



^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox