Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 7/8] r8169: (re)init phy on resume
From: Stanislaw Gruszka @ 2010-10-21  8:25 UTC (permalink / raw)
  To: Francois Romieu, netdev; +Cc: Denis Kirjanov, Stanislaw Gruszka
In-Reply-To: <1287649543-6569-1-git-send-email-sgruszka@redhat.com>

Fix the device switching to low-speed mode after resume, as reported in:
https://bugzilla.redhat.com/show_bug.cgi?id=502974

Reported-and-tested-by: Laurentiu Badea <bugzilla-redhat@wotevah.com>
Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
---
 drivers/net/r8169.c |    5 +++++
 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index ad3f37a..1165e7d 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -4824,6 +4824,9 @@ static int rtl8169_resume(struct device *device)
 {
 	struct pci_dev *pdev = to_pci_dev(device);
 	struct net_device *dev = pci_get_drvdata(pdev);
+	struct rtl8169_private *tp = netdev_priv(dev);
+
+	rtl8169_init_phy(dev, tp);
 
 	if (netif_running(dev))
 		__rtl8169_resume(dev);
@@ -4864,6 +4867,8 @@ static int rtl8169_runtime_resume(struct device *device)
 	tp->saved_wolopts = 0;
 	spin_unlock_irq(&tp->lock);
 
+	rtl8169_init_phy(dev, tp);
+
 	__rtl8169_resume(dev);
 
 	return 0;
-- 
1.6.5.2


^ permalink raw reply related

* [PATCH 6/8] r8169: changing mtu clean up
From: Stanislaw Gruszka @ 2010-10-21  8:25 UTC (permalink / raw)
  To: Francois Romieu, netdev; +Cc: Denis Kirjanov, Stanislaw Gruszka
In-Reply-To: <1287649543-6569-1-git-send-email-sgruszka@redhat.com>

Since we do not change rx buffer size any longer, we can
clean up rtl8169_change_mtu and in consequence rtl8169_down.

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
---
 drivers/net/r8169.c |   47 ++++++-----------------------------------------
 1 files changed, 6 insertions(+), 41 deletions(-)

diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index ef304c5..ad3f37a 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -3908,31 +3908,11 @@ static void rtl_hw_start_8101(struct net_device *dev)
 
 static int rtl8169_change_mtu(struct net_device *dev, int new_mtu)
 {
-	struct rtl8169_private *tp = netdev_priv(dev);
-	int ret = 0;
-
 	if (new_mtu < ETH_ZLEN || new_mtu > SafeMtu)
 		return -EINVAL;
 
 	dev->mtu = new_mtu;
-
-	if (!netif_running(dev))
-		goto out;
-
-	rtl8169_down(dev);
-
-	ret = rtl8169_init_ring(dev);
-	if (ret < 0)
-		goto out;
-
-	napi_enable(&tp->napi);
-
-	rtl_hw_start(dev);
-
-	rtl8169_request_timer(dev);
-
-out:
-	return ret;
+	return 0;
 }
 
 static inline void rtl8169_make_unusable_by_asic(struct RxDesc *desc)
@@ -4684,7 +4664,6 @@ static void rtl8169_down(struct net_device *dev)
 {
 	struct rtl8169_private *tp = netdev_priv(dev);
 	void __iomem *ioaddr = tp->mmio_addr;
-	unsigned int intrmask;
 
 	rtl8169_delete_timer(dev);
 
@@ -4692,11 +4671,14 @@ static void rtl8169_down(struct net_device *dev)
 
 	napi_disable(&tp->napi);
 
-core_down:
 	spin_lock_irq(&tp->lock);
 
 	rtl8169_asic_down(ioaddr);
-
+	/*
+	 * At this point device interrupts can not be enabled in any function,
+	 * as netif_running is not true (rtl8169_interrupt, rtl8169_reset_task,
+	 * rtl8169_reinit_task) and napi is disabled (rtl8169_poll).
+	 */
 	rtl8169_rx_missed(dev, ioaddr);
 
 	spin_unlock_irq(&tp->lock);
@@ -4706,23 +4688,6 @@ core_down:
 	/* Give a racing hard_start_xmit a few cycles to complete. */
 	synchronize_sched();  /* FIXME: should this be synchronize_irq()? */
 
-	/*
-	 * And now for the 50k$ question: are IRQ disabled or not ?
-	 *
-	 * Two paths lead here:
-	 * 1) dev->close
-	 *    -> netif_running() is available to sync the current code and the
-	 *       IRQ handler. See rtl8169_interrupt for details.
-	 * 2) dev->change_mtu
-	 *    -> rtl8169_poll can not be issued again and re-enable the
-	 *       interruptions. Let's simply issue the IRQ down sequence again.
-	 *
-	 * No loop if hotpluged or major error (0xffff).
-	 */
-	intrmask = RTL_R16(IntrMask);
-	if (intrmask && (intrmask != 0xffff))
-		goto core_down;
-
 	rtl8169_tx_clear(tp);
 
 	rtl8169_rx_clear(tp);
-- 
1.6.5.2


^ permalink raw reply related

* [PATCH 5/8] r8169: do not account fragments as packets
From: Stanislaw Gruszka @ 2010-10-21  8:25 UTC (permalink / raw)
  To: Francois Romieu, netdev; +Cc: Denis Kirjanov, Stanislaw Gruszka
In-Reply-To: <1287649543-6569-1-git-send-email-sgruszka@redhat.com>

Only increase the tx_{packets,dropped} statistics when a full skb is
transmitted or dropped, not for each fragment.

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
---
 drivers/net/r8169.c |    8 +++-----
 1 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 0a797d0..ef304c5 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -4094,10 +4094,10 @@ static void rtl8169_tx_clear_range(struct rtl8169_private *tp, u32 start,
 			rtl8169_unmap_tx_skb(&tp->pci_dev->dev, tx_skb,
 					     tp->TxDescArray + entry);
 			if (skb) {
+				tp->dev->stats.tx_dropped++;
 				dev_kfree_skb(skb);
 				tx_skb->skb = NULL;
 			}
-			tp->dev->stats.tx_dropped++;
 		}
 	}
 }
@@ -4402,7 +4402,6 @@ static void rtl8169_tx_interrupt(struct net_device *dev,
 	while (tx_left > 0) {
 		unsigned int entry = dirty_tx % NUM_TX_DESC;
 		struct ring_info *tx_skb = tp->tx_skb + entry;
-		u32 len = tx_skb->len;
 		u32 status;
 
 		rmb();
@@ -4410,12 +4409,11 @@ static void rtl8169_tx_interrupt(struct net_device *dev,
 		if (status & DescOwn)
 			break;
 
-		dev->stats.tx_bytes += len;
-		dev->stats.tx_packets++;
-
 		rtl8169_unmap_tx_skb(&tp->pci_dev->dev, tx_skb,
 				     tp->TxDescArray + entry);
 		if (status & LastFrag) {
+			dev->stats.tx_packets++;
+			dev->stats.tx_bytes += tx_skb->skb->len;
 			dev_kfree_skb(tx_skb->skb);
 			tx_skb->skb = NULL;
 		}
-- 
1.6.5.2


^ permalink raw reply related

* [PATCH 4/8] r8169: use pointer to struct device as local variable
From: Stanislaw Gruszka @ 2010-10-21  8:25 UTC (permalink / raw)
  To: Francois Romieu, netdev; +Cc: Denis Kirjanov, Stanislaw Gruszka
In-Reply-To: <1287649543-6569-1-git-send-email-sgruszka@redhat.com>

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
---
 drivers/net/r8169.c |   51 +++++++++++++++++++++++++--------------------------
 1 files changed, 25 insertions(+), 26 deletions(-)

diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 7ad119f..0a797d0 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -1200,6 +1200,7 @@ static void rtl8169_update_counters(struct net_device *dev)
 	dma_addr_t paddr;
 	u32 cmd;
 	int wait = 1000;
+	struct device *d = &tp->pci_dev->dev;
 
 	/*
 	 * Some chips are unable to dump tally counters when the receiver
@@ -1208,8 +1209,7 @@ static void rtl8169_update_counters(struct net_device *dev)
 	if ((RTL_R8(ChipCmd) & CmdRxEnb) == 0)
 		return;
 
-	counters = dma_alloc_coherent(&tp->pci_dev->dev, sizeof(*counters),
-				      &paddr, GFP_KERNEL);
+	counters = dma_alloc_coherent(d, sizeof(*counters), &paddr, GFP_KERNEL);
 	if (!counters)
 		return;
 
@@ -1230,8 +1230,7 @@ static void rtl8169_update_counters(struct net_device *dev)
 	RTL_W32(CounterAddrLow, 0);
 	RTL_W32(CounterAddrHigh, 0);
 
-	dma_free_coherent(&tp->pci_dev->dev, sizeof(*counters), counters,
-			  paddr);
+	dma_free_coherent(d, sizeof(*counters), counters, paddr);
 }
 
 static void rtl8169_get_ethtool_stats(struct net_device *dev,
@@ -3945,10 +3944,9 @@ static inline void rtl8169_make_unusable_by_asic(struct RxDesc *desc)
 static void rtl8169_free_rx_databuff(struct rtl8169_private *tp,
 				     void **data_buff, struct RxDesc *desc)
 {
-	struct pci_dev *pdev = tp->pci_dev;
-
-	dma_unmap_single(&pdev->dev, le64_to_cpu(desc->addr), rx_buf_sz,
+	dma_unmap_single(&tp->pci_dev->dev, le64_to_cpu(desc->addr), rx_buf_sz,
 			 DMA_FROM_DEVICE);
+
 	kfree(*data_buff);
 	*data_buff = NULL;
 	rtl8169_make_unusable_by_asic(desc);
@@ -3979,6 +3977,7 @@ static struct sk_buff *rtl8169_alloc_rx_data(struct rtl8169_private *tp,
 {
 	void *data;
 	dma_addr_t mapping;
+	struct device *d = &tp->pci_dev->dev;
 	struct net_device *dev = tp->dev;
 	int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
 
@@ -3993,9 +3992,9 @@ static struct sk_buff *rtl8169_alloc_rx_data(struct rtl8169_private *tp,
 			return NULL;
 	}
 
-	mapping = dma_map_single(&tp->pci_dev->dev, rtl8169_align(data), rx_buf_sz,
+	mapping = dma_map_single(d, rtl8169_align(data), rx_buf_sz,
 				 DMA_FROM_DEVICE);
-	if (unlikely(dma_mapping_error(&tp->pci_dev->dev, mapping)))
+	if (unlikely(dma_mapping_error(d, mapping)))
 		goto err_out;
 
 	rtl8169_map_to_asic(desc, mapping, rx_buf_sz);
@@ -4066,13 +4065,13 @@ static int rtl8169_init_ring(struct net_device *dev)
 	return rtl8169_rx_fill(tp);
 }
 
-static void rtl8169_unmap_tx_skb(struct pci_dev *pdev, struct ring_info *tx_skb,
+static void rtl8169_unmap_tx_skb(struct device *d, struct ring_info *tx_skb,
 				 struct TxDesc *desc)
 {
 	unsigned int len = tx_skb->len;
 
-	dma_unmap_single(&pdev->dev, le64_to_cpu(desc->addr), len,
-			 DMA_TO_DEVICE);
+	dma_unmap_single(d, le64_to_cpu(desc->addr), len, DMA_TO_DEVICE);
+
 	desc->opts1 = 0x00;
 	desc->opts2 = 0x00;
 	desc->addr = 0x00;
@@ -4092,7 +4091,7 @@ static void rtl8169_tx_clear_range(struct rtl8169_private *tp, u32 start,
 		if (len) {
 			struct sk_buff *skb = tx_skb->skb;
 
-			rtl8169_unmap_tx_skb(tp->pci_dev, tx_skb,
+			rtl8169_unmap_tx_skb(&tp->pci_dev->dev, tx_skb,
 					     tp->TxDescArray + entry);
 			if (skb) {
 				dev_kfree_skb(skb);
@@ -4209,6 +4208,7 @@ static int rtl8169_xmit_frags(struct rtl8169_private *tp, struct sk_buff *skb,
 	struct skb_shared_info *info = skb_shinfo(skb);
 	unsigned int cur_frag, entry;
 	struct TxDesc * uninitialized_var(txd);
+	struct device *d = &tp->pci_dev->dev;
 
 	entry = tp->cur_tx;
 	for (cur_frag = 0; cur_frag < info->nr_frags; cur_frag++) {
@@ -4222,9 +4222,8 @@ static int rtl8169_xmit_frags(struct rtl8169_private *tp, struct sk_buff *skb,
 		txd = tp->TxDescArray + entry;
 		len = frag->size;
 		addr = ((void *) page_address(frag->page)) + frag->page_offset;
-		mapping = dma_map_single(&tp->pci_dev->dev, addr, len,
-					 DMA_TO_DEVICE);
-		if (unlikely(dma_mapping_error(&tp->pci_dev->dev, mapping)))
+		mapping = dma_map_single(d, addr, len, DMA_TO_DEVICE);
+		if (unlikely(dma_mapping_error(d, mapping)))
 			goto err_out;
 
 		/* anti gcc 2.95.3 bugware (sic) */
@@ -4275,6 +4274,7 @@ static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb,
 	unsigned int entry = tp->cur_tx % NUM_TX_DESC;
 	struct TxDesc *txd = tp->TxDescArray + entry;
 	void __iomem *ioaddr = tp->mmio_addr;
+	struct device *d = &tp->pci_dev->dev;
 	dma_addr_t mapping;
 	u32 status, len;
 	u32 opts1;
@@ -4289,9 +4289,8 @@ static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb,
 		goto err_stop_0;
 
 	len = skb_headlen(skb);
-	mapping = dma_map_single(&tp->pci_dev->dev, skb->data, len,
-				 DMA_TO_DEVICE);
-	if (unlikely(dma_mapping_error(&tp->pci_dev->dev, mapping)))
+	mapping = dma_map_single(d, skb->data, len, DMA_TO_DEVICE);
+	if (unlikely(dma_mapping_error(d, mapping)))
 		goto err_dma_0;
 
 	tp->tx_skb[entry].len = len;
@@ -4332,7 +4331,7 @@ static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb,
 	return NETDEV_TX_OK;
 
 err_dma_1:
-	rtl8169_unmap_tx_skb(tp->pci_dev, tp->tx_skb + entry, txd);
+	rtl8169_unmap_tx_skb(d, tp->tx_skb + entry, txd);
 err_dma_0:
 	dev_kfree_skb(skb);
 	dev->stats.tx_dropped++;
@@ -4414,8 +4413,8 @@ static void rtl8169_tx_interrupt(struct net_device *dev,
 		dev->stats.tx_bytes += len;
 		dev->stats.tx_packets++;
 
-		rtl8169_unmap_tx_skb(tp->pci_dev, tx_skb, tp->TxDescArray + entry);
-
+		rtl8169_unmap_tx_skb(&tp->pci_dev->dev, tx_skb,
+				     tp->TxDescArray + entry);
 		if (status & LastFrag) {
 			dev_kfree_skb(tx_skb->skb);
 			tx_skb->skb = NULL;
@@ -4466,16 +4465,16 @@ static struct sk_buff *rtl8169_try_rx_copy(void *data,
 					   dma_addr_t addr)
 {
 	struct sk_buff *skb;
+	struct device *d = &tp->pci_dev->dev;
 
 	data = rtl8169_align(data);
-	dma_sync_single_for_cpu(&tp->pci_dev->dev, addr, pkt_size,
-				DMA_FROM_DEVICE);
+	dma_sync_single_for_cpu(d, addr, pkt_size, DMA_FROM_DEVICE);
 	prefetch(data);
 	skb = netdev_alloc_skb_ip_align(tp->dev, pkt_size);
 	if (skb)
 		memcpy(skb->data, data, pkt_size);
-	dma_sync_single_for_device(&tp->pci_dev->dev, addr, pkt_size,
-				   DMA_FROM_DEVICE);
+	dma_sync_single_for_device(d, addr, pkt_size, DMA_FROM_DEVICE);
+
 	return skb;
 }
 
-- 
1.6.5.2


^ permalink raw reply related

* [PATCH 3/8] r8169: replace PCI_DMA_{TO,FROM}DEVICE to DMA_{TO,FROM}_DEVICE
From: Stanislaw Gruszka @ 2010-10-21  8:25 UTC (permalink / raw)
  To: Francois Romieu, netdev; +Cc: Denis Kirjanov, Stanislaw Gruszka
In-Reply-To: <1287649543-6569-1-git-send-email-sgruszka@redhat.com>

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
---
 drivers/net/r8169.c |   14 +++++++-------
 1 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 5a87036..7ad119f 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -3948,7 +3948,7 @@ static void rtl8169_free_rx_databuff(struct rtl8169_private *tp,
 	struct pci_dev *pdev = tp->pci_dev;
 
 	dma_unmap_single(&pdev->dev, le64_to_cpu(desc->addr), rx_buf_sz,
-			 PCI_DMA_FROMDEVICE);
+			 DMA_FROM_DEVICE);
 	kfree(*data_buff);
 	*data_buff = NULL;
 	rtl8169_make_unusable_by_asic(desc);
@@ -3994,7 +3994,7 @@ static struct sk_buff *rtl8169_alloc_rx_data(struct rtl8169_private *tp,
 	}
 
 	mapping = dma_map_single(&tp->pci_dev->dev, rtl8169_align(data), rx_buf_sz,
-				 PCI_DMA_FROMDEVICE);
+				 DMA_FROM_DEVICE);
 	if (unlikely(dma_mapping_error(&tp->pci_dev->dev, mapping)))
 		goto err_out;
 
@@ -4072,7 +4072,7 @@ static void rtl8169_unmap_tx_skb(struct pci_dev *pdev, struct ring_info *tx_skb,
 	unsigned int len = tx_skb->len;
 
 	dma_unmap_single(&pdev->dev, le64_to_cpu(desc->addr), len,
-			 PCI_DMA_TODEVICE);
+			 DMA_TO_DEVICE);
 	desc->opts1 = 0x00;
 	desc->opts2 = 0x00;
 	desc->addr = 0x00;
@@ -4223,7 +4223,7 @@ static int rtl8169_xmit_frags(struct rtl8169_private *tp, struct sk_buff *skb,
 		len = frag->size;
 		addr = ((void *) page_address(frag->page)) + frag->page_offset;
 		mapping = dma_map_single(&tp->pci_dev->dev, addr, len,
-					 PCI_DMA_TODEVICE);
+					 DMA_TO_DEVICE);
 		if (unlikely(dma_mapping_error(&tp->pci_dev->dev, mapping)))
 			goto err_out;
 
@@ -4290,7 +4290,7 @@ static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb,
 
 	len = skb_headlen(skb);
 	mapping = dma_map_single(&tp->pci_dev->dev, skb->data, len,
-				 PCI_DMA_TODEVICE);
+				 DMA_TO_DEVICE);
 	if (unlikely(dma_mapping_error(&tp->pci_dev->dev, mapping)))
 		goto err_dma_0;
 
@@ -4469,13 +4469,13 @@ static struct sk_buff *rtl8169_try_rx_copy(void *data,
 
 	data = rtl8169_align(data);
 	dma_sync_single_for_cpu(&tp->pci_dev->dev, addr, pkt_size,
-				PCI_DMA_FROMDEVICE);
+				DMA_FROM_DEVICE);
 	prefetch(data);
 	skb = netdev_alloc_skb_ip_align(tp->dev, pkt_size);
 	if (skb)
 		memcpy(skb->data, data, pkt_size);
 	dma_sync_single_for_device(&tp->pci_dev->dev, addr, pkt_size,
-				   PCI_DMA_FROMDEVICE);
+				   DMA_FROM_DEVICE);
 	return skb;
 }
 
-- 
1.6.5.2


^ permalink raw reply related

* [PATCH 2/8] r8169: init rx ring cleanup
From: Stanislaw Gruszka @ 2010-10-21  8:25 UTC (permalink / raw)
  To: Francois Romieu, netdev; +Cc: Denis Kirjanov, Stanislaw Gruszka
In-Reply-To: <1287649543-6569-1-git-send-email-sgruszka@redhat.com>

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
---
 drivers/net/r8169.c |   52 +++++++++++++++++++++-----------------------------
 1 files changed, 22 insertions(+), 30 deletions(-)

diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 01d96a7..5a87036 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -3974,12 +3974,12 @@ static inline void *rtl8169_align(void *data)
 	return (void *)ALIGN((long)data, 16);
 }
 
-static struct sk_buff *rtl8169_alloc_rx_data(struct pci_dev *pdev,
-					    struct net_device *dev,
-					    struct RxDesc *desc)
+static struct sk_buff *rtl8169_alloc_rx_data(struct rtl8169_private *tp,
+					     struct RxDesc *desc)
 {
 	void *data;
 	dma_addr_t mapping;
+	struct net_device *dev = tp->dev;
 	int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
 
 	data = kmalloc_node(rx_buf_sz, GFP_KERNEL, node);
@@ -3993,9 +3993,9 @@ static struct sk_buff *rtl8169_alloc_rx_data(struct pci_dev *pdev,
 			return NULL;
 	}
 
-	mapping = dma_map_single(&pdev->dev, rtl8169_align(data), rx_buf_sz,
+	mapping = dma_map_single(&tp->pci_dev->dev, rtl8169_align(data), rx_buf_sz,
 				 PCI_DMA_FROMDEVICE);
-	if (unlikely(dma_mapping_error(&pdev->dev, mapping)))
+	if (unlikely(dma_mapping_error(&tp->pci_dev->dev, mapping)))
 		goto err_out;
 
 	rtl8169_map_to_asic(desc, mapping, rx_buf_sz);
@@ -4018,34 +4018,35 @@ static void rtl8169_rx_clear(struct rtl8169_private *tp)
 	}
 }
 
-static u32 rtl8169_rx_fill(struct rtl8169_private *tp, struct net_device *dev,
-			   u32 start, u32 end, gfp_t gfp)
+static inline void rtl8169_mark_as_last_descriptor(struct RxDesc *desc)
 {
-	u32 cur;
+	desc->opts1 |= cpu_to_le32(RingEnd);
+}
 
-	for (cur = start; end - cur != 0; cur++) {
-		void *data;
-		unsigned int i = cur % NUM_RX_DESC;
+static int rtl8169_rx_fill(struct rtl8169_private *tp)
+{
+	unsigned int i;
 
-		WARN_ON((s32)(end - cur) < 0);
+	for (i = 0; i < NUM_RX_DESC; i++) {
+		void *data;
 
 		if (tp->Rx_databuff[i])
 			continue;
 
-		data = rtl8169_alloc_rx_data(tp->pci_dev, dev,
-					     tp->RxDescArray + i);
+		data = rtl8169_alloc_rx_data(tp, tp->RxDescArray + i);
 		if (!data) {
 			rtl8169_make_unusable_by_asic(tp->RxDescArray + i);
-			break;
+			goto err_out;
 		}
 		tp->Rx_databuff[i] = data;
 	}
-	return cur - start;
-}
 
-static inline void rtl8169_mark_as_last_descriptor(struct RxDesc *desc)
-{
-	desc->opts1 |= cpu_to_le32(RingEnd);
+	rtl8169_mark_as_last_descriptor(tp->RxDescArray + NUM_RX_DESC - 1);
+	return 0;
+
+err_out:
+	rtl8169_rx_clear(tp);
+	return -ENOMEM;
 }
 
 static void rtl8169_init_ring_indexes(struct rtl8169_private *tp)
@@ -4062,16 +4063,7 @@ static int rtl8169_init_ring(struct net_device *dev)
 	memset(tp->tx_skb, 0x0, NUM_TX_DESC * sizeof(struct ring_info));
 	memset(tp->Rx_databuff, 0x0, NUM_RX_DESC * sizeof(void *));
 
-	if (rtl8169_rx_fill(tp, dev, 0, NUM_RX_DESC, GFP_KERNEL) != NUM_RX_DESC)
-		goto err_out;
-
-	rtl8169_mark_as_last_descriptor(tp->RxDescArray + NUM_RX_DESC - 1);
-
-	return 0;
-
-err_out:
-	rtl8169_rx_clear(tp);
-	return -ENOMEM;
+	return rtl8169_rx_fill(tp);
 }
 
 static void rtl8169_unmap_tx_skb(struct pci_dev *pdev, struct ring_info *tx_skb,
-- 
1.6.5.2


^ permalink raw reply related

* [PATCH 1/8] r8169: check dma mapping failures
From: Stanislaw Gruszka @ 2010-10-21  8:25 UTC (permalink / raw)
  To: Francois Romieu, netdev; +Cc: Denis Kirjanov, Stanislaw Gruszka
In-Reply-To: <1287649543-6569-1-git-send-email-sgruszka@redhat.com>

Check possible dma mapping errors and do clean up if it happens.

Fix an index wrap-around bug in rtl8169_tx_clear along the way.

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
---
 drivers/net/r8169.c |   66 +++++++++++++++++++++++++++++++++++++--------------
 1 files changed, 48 insertions(+), 18 deletions(-)

diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 1760533..01d96a7 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -3992,11 +3992,18 @@ static struct sk_buff *rtl8169_alloc_rx_data(struct pci_dev *pdev,
 		if (!data)
 			return NULL;
 	}
+
 	mapping = dma_map_single(&pdev->dev, rtl8169_align(data), rx_buf_sz,
 				 PCI_DMA_FROMDEVICE);
+	if (unlikely(dma_mapping_error(&pdev->dev, mapping)))
+		goto err_out;
 
 	rtl8169_map_to_asic(desc, mapping, rx_buf_sz);
 	return data;
+
+err_out:
+	kfree(data);
+	return NULL;
 }
 
 static void rtl8169_rx_clear(struct rtl8169_private *tp)
@@ -4080,12 +4087,13 @@ static void rtl8169_unmap_tx_skb(struct pci_dev *pdev, struct ring_info *tx_skb,
 	tx_skb->len = 0;
 }
 
-static void rtl8169_tx_clear(struct rtl8169_private *tp)
+static void rtl8169_tx_clear_range(struct rtl8169_private *tp, u32 start,
+				   unsigned int n)
 {
 	unsigned int i;
 
-	for (i = tp->dirty_tx; i < tp->dirty_tx + NUM_TX_DESC; i++) {
-		unsigned int entry = i % NUM_TX_DESC;
+	for (i = 0; i < n; i++) {
+		unsigned int entry = (start + i) % NUM_TX_DESC;
 		struct ring_info *tx_skb = tp->tx_skb + entry;
 		unsigned int len = tx_skb->len;
 
@@ -4101,6 +4109,11 @@ static void rtl8169_tx_clear(struct rtl8169_private *tp)
 			tp->dev->stats.tx_dropped++;
 		}
 	}
+}
+
+static void rtl8169_tx_clear(struct rtl8169_private *tp)
+{
+	rtl8169_tx_clear_range(tp, tp->dirty_tx, NUM_TX_DESC);
 	tp->cur_tx = tp->dirty_tx = 0;
 }
 
@@ -4219,6 +4232,8 @@ static int rtl8169_xmit_frags(struct rtl8169_private *tp, struct sk_buff *skb,
 		addr = ((void *) page_address(frag->page)) + frag->page_offset;
 		mapping = dma_map_single(&tp->pci_dev->dev, addr, len,
 					 PCI_DMA_TODEVICE);
+		if (unlikely(dma_mapping_error(&tp->pci_dev->dev, mapping)))
+			goto err_out;
 
 		/* anti gcc 2.95.3 bugware (sic) */
 		status = opts1 | len | (RingEnd * !((entry + 1) % NUM_TX_DESC));
@@ -4235,6 +4250,10 @@ static int rtl8169_xmit_frags(struct rtl8169_private *tp, struct sk_buff *skb,
 	}
 
 	return cur_frag;
+
+err_out:
+	rtl8169_tx_clear_range(tp, tp->cur_tx + 1, cur_frag);
+	return -EIO;
 }
 
 static inline u32 rtl8169_tso_csum(struct sk_buff *skb, struct net_device *dev)
@@ -4261,40 +4280,44 @@ static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb,
 				      struct net_device *dev)
 {
 	struct rtl8169_private *tp = netdev_priv(dev);
-	unsigned int frags, entry = tp->cur_tx % NUM_TX_DESC;
+	unsigned int entry = tp->cur_tx % NUM_TX_DESC;
 	struct TxDesc *txd = tp->TxDescArray + entry;
 	void __iomem *ioaddr = tp->mmio_addr;
 	dma_addr_t mapping;
 	u32 status, len;
 	u32 opts1;
+	int frags;
 
 	if (unlikely(TX_BUFFS_AVAIL(tp) < skb_shinfo(skb)->nr_frags)) {
 		netif_err(tp, drv, dev, "BUG! Tx Ring full when queue awake!\n");
-		goto err_stop;
+		goto err_stop_0;
 	}
 
 	if (unlikely(le32_to_cpu(txd->opts1) & DescOwn))
-		goto err_stop;
+		goto err_stop_0;
+
+	len = skb_headlen(skb);
+	mapping = dma_map_single(&tp->pci_dev->dev, skb->data, len,
+				 PCI_DMA_TODEVICE);
+	if (unlikely(dma_mapping_error(&tp->pci_dev->dev, mapping)))
+		goto err_dma_0;
+
+	tp->tx_skb[entry].len = len;
+	txd->addr = cpu_to_le64(mapping);
+	txd->opts2 = cpu_to_le32(rtl8169_tx_vlan_tag(tp, skb));
 
 	opts1 = DescOwn | rtl8169_tso_csum(skb, dev);
 
 	frags = rtl8169_xmit_frags(tp, skb, opts1);
-	if (frags) {
-		len = skb_headlen(skb);
+	if (frags < 0)
+		goto err_dma_1;
+	else if (frags)
 		opts1 |= FirstFrag;
-	} else {
-		len = skb->len;
+	else {
 		opts1 |= FirstFrag | LastFrag;
 		tp->tx_skb[entry].skb = skb;
 	}
 
-	mapping = dma_map_single(&tp->pci_dev->dev, skb->data, len,
-				 PCI_DMA_TODEVICE);
-
-	tp->tx_skb[entry].len = len;
-	txd->addr = cpu_to_le64(mapping);
-	txd->opts2 = cpu_to_le32(rtl8169_tx_vlan_tag(tp, skb));
-
 	wmb();
 
 	/* anti gcc 2.95.3 bugware (sic) */
@@ -4316,7 +4339,14 @@ static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb,
 
 	return NETDEV_TX_OK;
 
-err_stop:
+err_dma_1:
+	rtl8169_unmap_tx_skb(tp->pci_dev, tp->tx_skb + entry, txd);
+err_dma_0:
+	dev_kfree_skb(skb);
+	dev->stats.tx_dropped++;
+	return NETDEV_TX_OK;
+
+err_stop_0:
 	netif_stop_queue(dev);
 	dev->stats.tx_dropped++;
 	return NETDEV_TX_BUSY;
-- 
1.6.5.2


^ permalink raw reply related

* [PATCH 0/8] r8169 patches for net-next v2
From: Stanislaw Gruszka @ 2010-10-21  8:25 UTC (permalink / raw)
  To: Francois Romieu, netdev; +Cc: Denis Kirjanov, Stanislaw Gruszka

v1 -> v2
- update to current code and some minor changes/cleanups
- do not return NETDEV_TX_BUSY from start_xmit when mapping fails (patch 1)
- add missing unlikely (patch 1)
- account tx bytes based on skb->len (patch 5)
- new patches: changing mtu clean up, (re)init phy on resume
- add net_ratelimit (patch 8)

All patches in the series were tested on RTL8111/8168B.


^ permalink raw reply

* Re: [RFC PATCH 3/9] ipvs network name space aware
From: Simon Horman @ 2010-10-21  8:22 UTC (permalink / raw)
  To: Hans Schillstrom
  Cc: lvs-devel@vger.kernel.org, netdev@vger.kernel.org,
	netfilter-devel@vger.kernel.org, ja@ssi.bg, wensong@linux-vs.org,
	daniel.lezcano@free.fr
In-Reply-To: <201010210951.40914.hans.schillstrom@ericsson.com>

On Thu, Oct 21, 2010 at 09:51:40AM +0200, Hans Schillstrom wrote:
> On Wednesday 20 October 2010 16:03:24 Simon Horman wrote:
> > On Fri, Oct 08, 2010 at 01:16:57PM +0200, Hans Schillstrom wrote:

[ snip ]

> > > @@ -278,35 +271,41 @@ ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
> > >                       unsigned int proto_off, int inverse)
> > >  {
> > >       __be16 _ports[2], *pptr;
> > > +     struct net *net = dev_net(skb->dev);
> > >
> > >       pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
> > >       if (pptr == NULL)
> > >               return NULL;
> > >
> > > +     BUG_ON(!net);
> >
> > Can you explain why BUG_ON is here?
> 
> Yes, I forgot to remove it.
> I had them every where to make sure that net ptr was set,
> - don't call me paranoid ;-)

Thanks, I thought it was something like that.
I'll remove them as part of my rebase.

[ strip ]

^ permalink raw reply

* Re: [GIT PULL net-2.6] vhost-net: access_ok fix
From: David Miller @ 2010-10-21  8:08 UTC (permalink / raw)
  To: mst; +Cc: kvm, virtualization, netdev, linux-kernel
In-Reply-To: <20101019145901.GA16025@redhat.com>

From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Tue, 19 Oct 2010 16:59:01 +0200

> David,
> Not sure if it's too late for 2.6.36 - in case it's not, the following tree
> includes a last minute bugfix for vhost-net, found by code inspection.
> It is on top of net-2.6.
> Thanks!
> 
> The following changes since commit b0057c51db66c5f0f38059f242c57d61c4741d89:
> 
>   tg3: restore rx_dropped accounting (2010-10-11 16:06:24 -0700)
> 
> are available in the git repository at:
>   git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git vhost-net

Even though it's too late, I've pulled this.

^ permalink raw reply

* [PATCH net-next] fib: introduce fib_alias_accessed() helper
From: Eric Dumazet @ 2010-10-21  8:03 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

A perf tools session at NFWS 2010 pointed out false sharing on struct
fib_alias that can be avoided pretty easily if we set the FA_S_ACCESSED bit
only when needed (i.e. when it is not already set).

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
net/ipv4/fib_hash.c      |    3 ++-
net/ipv4/fib_lookup.h    |    7 +++++++
net/ipv4/fib_semantics.c |    2 +-
net/ipv4/fib_trie.c      |    3 ++-
4 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 4f1aafd..43e1c59 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -335,7 +335,8 @@ void fib_table_select_default(struct fib_table *tb,
 			if (!next_fi->fib_nh[0].nh_gw ||
 			    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
 				continue;
-			fa->fa_state |= FA_S_ACCESSED;
+
+			fib_alias_accessed(fa);
 
 			if (fi == NULL) {
 				if (next_fi != res->fi)
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 5072d8e..a29edf2 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -17,6 +17,13 @@ struct fib_alias {
 
 #define FA_S_ACCESSED	0x01
 
+/* Dont write on fa_state unless needed, to keep it shared on all cpus */
+static inline void fib_alias_accessed(struct fib_alias *fa)
+{
+	if (!(fa->fa_state & FA_S_ACCESSED))
+		fa->fa_state |= FA_S_ACCESSED;
+}
+
 /* Exported by fib_semantics.c */
 extern int fib_semantic_match(struct list_head *head,
 			      const struct flowi *flp,
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 6734c9c..3e0da3e 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -901,7 +901,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
 		if (fa->fa_scope < flp->fl4_scope)
 			continue;
 
-		fa->fa_state |= FA_S_ACCESSED;
+		fib_alias_accessed(fa);
 
 		err = fib_props[fa->fa_type].error;
 		if (err == 0) {
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 31494f3..cd5e13a 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1838,7 +1838,8 @@ void fib_table_select_default(struct fib_table *tb,
 		if (!next_fi->fib_nh[0].nh_gw ||
 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
 			continue;
-		fa->fa_state |= FA_S_ACCESSED;
+
+		fib_alias_accessed(fa);
 
 		if (fi == NULL) {
 			if (next_fi != res->fi)







^ permalink raw reply related

* Re: [RFC PATCH 1/9] ipvs network name space aware
From: Eric Dumazet @ 2010-10-21  8:01 UTC (permalink / raw)
  To: Hans Schillstrom
  Cc: paulmck@linux.vnet.ibm.com, Daniel Lezcano,
	lvs-devel@vger.kernel.org, netdev@vger.kernel.org,
	netfilter-devel@vger.kernel.org, horms@verge.net.au, ja@ssi.bg,
	wensong@linux-vs.org
In-Reply-To: <201010210945.55252.hans.schillstrom@ericsson.com>

Le jeudi 21 octobre 2010 à 09:45 +0200, Hans Schillstrom a écrit :
> I do have this (and some debuging)
> __rcu_read_lock()
> => 0xffffffff8108bcf3 <+0>:	push   %rbp
>    0xffffffff8108bcf4 <+1>:	mov    %rsp,%rbp
>    0xffffffff8108bcf7 <+4>:	nopl   0x0(%rax,%rax,1)
>    0xffffffff8108bcfc <+9>:	mov    %gs:0xb540,%rax
>    0xffffffff8108bd05 <+18>:	mov    0x108(%rax),%edx
>    0xffffffff8108bd0b <+24>:	inc    %edx
>    0xffffffff8108bd0d <+26>:	mov    %edx,0x108(%rax)
>    0xffffffff8108bd13 <+32>:	leaveq
>    0xffffffff8108bd14 <+33>:	retq
> 
> which is not that many, actually impressively few instructions :-)

nopl   0x0(%rax,%rax,1) is a filler because of extra instrumentation in
your kernel.

Maybe you could find out why your compiler doesn't use

	incl 0x108(%rax)

instead of

	mov    0x108(%rax),%edx
	inc    %edx
	mov    %edx,0x108(%rax)


So rcu_read_lock() is really _two_ instructions.

I agree with Paul with the "few" qualification... :-)



--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* (unknown)
From: David Miller @ 2010-10-21  7:56 UTC (permalink / raw)
  To: ddutt; +Cc: netdev, rmody, huangj, amathur
In-Reply-To: <F363E7AC84E1B646A0358B281A46F4AEABA0FFCC62@HQ1-EXCH03.corp.brocade.com>


People are very unlikely to read your posting because you
did not provide a subject line.

^ permalink raw reply

* Re: Linux 2.6.36
From: Mihai Donțu @ 2010-10-21  7:52 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, Gary Zambrano, netdev
In-Reply-To: <AANLkTimdii2F3PsG4SxO5Zym7TB=MSGhtN+TpG=HmbcT@mail.gmail.com>

[-- Attachment #1: Type: Text/Plain, Size: 2050 bytes --]

On Thursday 21 October 2010 00:01:27 Linus Torvalds wrote:
> So it's a week later than I wanted (plus all the days that added up
> from me having a few 8-day weeks during this release window), but it's
> out there now.
> 
> The delay means that the merge window that opens now would cover the
> upcoming kernel summit. However, I really hope that everybody sends me
> their patches and pull requests _before_ KS even starts. And if you're
> affected by the kernel summit you probably won't have time during it
> to finalize anything that week anyway, especially for those staying
> for plumbers afterwards, and...
> 
> So I'm going to hope that we could perhaps even do the 2.6.37 -rc1
> release and close the merge window the Sunday before KS opens. Since
> 2.6.36 was longer than usual (at least it felt that way), I wouldn't
> mind having a 2.6.37 that is shorter than usual.
> 
> But holler if this really screws up any plans. Ten days instead of two
> weeks? Let's see if it's even reasonably realistic.
> 
> Anyway, I'm appending the shortlog since -rc8. At least it's
> noticeably shorter than the -rc7 and -rc8 logs were, and most of it
> really is pretty small.
> 
> For the bigger picture of changes since 2.6.35, see for example
> 
>    http://kernelnewbies.org/Linux_2_6_36
> 
> but it may be worth pointing out that we ended up disabling the new
> fanotify system calls because people were still unsure about the
> interfaces. Better let the interface discussion cook a bit longer than
> release with a bad interface that we need to redo.

I get a rather large number of 'b44 ssb1:0: eth0: powering down PHY' messages in 
dmesg shortly after booting:

# grep -c 'b44 ssb1:0: eth0: powering down PHY' /var/log/messages
124566
# grep -c 'b44 ssb1:0: eth0: late interrupt' /var/log/messages
1141

The same thing happens when resuming from suspend to RAM. This is accompanied 
by kworker/0:3 (?) taking 100% CPU time for 1 min or so. I'm running 2.6.35 
now, so I might be wrong about the name of the kernel thread.

Thanks,

-- 
Mihai Donțu

[-- Attachment #2: syslog-messages.gz --]
[-- Type: application/x-gzip, Size: 299996 bytes --]

[-- Attachment #3: lspci.txt --]
[-- Type: text/plain, Size: 1758 bytes --]

00:00.0 Host bridge: Intel Corporation Mobile 945GM/PM/GMS, 943/940GML and 945GT Express Memory Controller Hub (rev 03)
00:02.0 VGA compatible controller: Intel Corporation Mobile 945GM/GMS, 943/940GML Express Integrated Graphics Controller (rev 03)
00:02.1 Display controller: Intel Corporation Mobile 945GM/GMS/GME, 943/940GML Express Integrated Graphics Controller (rev 03)
00:1b.0 Audio device: Intel Corporation 82801G (ICH7 Family) High Definition Audio Controller (rev 01)
00:1c.0 PCI bridge: Intel Corporation 82801G (ICH7 Family) PCI Express Port 1 (rev 01)
00:1c.1 PCI bridge: Intel Corporation 82801G (ICH7 Family) PCI Express Port 2 (rev 01)
00:1d.0 USB Controller: Intel Corporation 82801G (ICH7 Family) USB UHCI Controller #1 (rev 01)
00:1d.1 USB Controller: Intel Corporation 82801G (ICH7 Family) USB UHCI Controller #2 (rev 01)
00:1d.2 USB Controller: Intel Corporation 82801G (ICH7 Family) USB UHCI Controller #3 (rev 01)
00:1d.3 USB Controller: Intel Corporation 82801G (ICH7 Family) USB UHCI Controller #4 (rev 01)
00:1d.7 USB Controller: Intel Corporation 82801G (ICH7 Family) USB2 EHCI Controller (rev 01)
00:1e.0 PCI bridge: Intel Corporation 82801 Mobile PCI Bridge (rev e1)
00:1f.0 ISA bridge: Intel Corporation 82801GBM (ICH7-M) LPC Interface Bridge (rev 01)
00:1f.2 IDE interface: Intel Corporation 82801GBM/GHM (ICH7 Family) SATA IDE Controller (rev 01)
00:1f.3 SMBus: Intel Corporation 82801G (ICH7 Family) SMBus Controller (rev 01)
02:00.0 Ethernet controller: Broadcom Corporation BCM4401-B0 100Base-TX (rev 02)
02:01.0 CardBus bridge: O2 Micro, Inc. Cardbus bridge (rev 21)
02:01.4 FireWire (IEEE 1394): O2 Micro, Inc. Firewire (IEEE 1394) (rev 02)
0c:00.0 Network controller: Broadcom Corporation BCM4312 802.11a/b/g (rev 01)

^ permalink raw reply

* Re: [RFC PATCH 3/9] ipvs network name space aware
From: Hans Schillstrom @ 2010-10-21  7:51 UTC (permalink / raw)
  To: Simon Horman
  Cc: lvs-devel@vger.kernel.org, netdev@vger.kernel.org,
	netfilter-devel@vger.kernel.org, ja@ssi.bg, wensong@linux-vs.org,
	daniel.lezcano@free.fr
In-Reply-To: <20101020140318.GA17760@verge.net.au>

On Wednesday 20 October 2010 16:03:24 Simon Horman wrote:
> On Fri, Oct 08, 2010 at 01:16:57PM +0200, Hans Schillstrom wrote:
> >
> > This patch just contains ip_vs_conn.c
> > and does the normal
> >  - moving vars to struct ipvs
> >  - adding per netns init and exit
> >
> > proc_fs required some extra work with adding/changing private data to get the net ptr.
>
> I am currently working on rebasing this patch against the
> current nf-next-2.6 tree which includes persistence engines
> and I noticed a few things.
>
> > Signed-off-by:Hans Schillstrom <hans.schillstrom@ericsson.com>
> >
> > diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
> > index b71c69a..c47828f 100644
> > --- a/net/netfilter/ipvs/ip_vs_conn.c
> > +++ b/net/netfilter/ipvs/ip_vs_conn.c
> > @@ -47,7 +47,7 @@
> >
> >  /*
> >   * Connection hash size. Default is what was selected at compile time.
> > -*/
> > + */
> >  int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
> >  module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
> >  MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");
>
> This fragment is not needed.

OK

>
> > @@ -56,23 +56,12 @@ MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");
> >  int ip_vs_conn_tab_size;
> >  int ip_vs_conn_tab_mask;
> >
> > -/*
> > - *  Connection hash table: for input and output packets lookups of IPVS
> > - */
> > -static struct list_head *ip_vs_conn_tab;
> > -
> > -/*  SLAB cache for IPVS connections */
> > -static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
> > -
> > -/*  counter for current IPVS connections */
> > -static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
> > -
> > -/*  counter for no client port connections */
> > -static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
> > -
> >  /* random value for IPVS connection hash */
> >  static unsigned int ip_vs_conn_rnd;
> >
> > +/* cache name cnt */
> > +static atomic_t conn_cache_nr = ATOMIC_INIT(0);
> > +
> >  /*
> >   *  Fine locking granularity for big connection hash table
> >   */
> > @@ -153,7 +142,7 @@ static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
> >   *   Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
> >   *   returns bool success.
> >   */
> > -static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
> > +static inline int ip_vs_conn_hash(struct net *net, struct ip_vs_conn *cp)
> >  {
> >       unsigned hash;
> >       int ret;
> > @@ -168,7 +157,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
> >       spin_lock(&cp->lock);
> >
> >       if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
> > -             list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
> > +             list_add(&cp->c_list, &net->ipvs->conn_tab[hash]);
> >               cp->flags |= IP_VS_CONN_F_HASHED;
> >               atomic_inc(&cp->refcnt);
> >               ret = 1;
> > @@ -221,18 +210,20 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
> >   *   s_addr, s_port: pkt source address (foreign host)
> >   *   d_addr, d_port: pkt dest address (load balancer)
> >   */
> > -static inline struct ip_vs_conn *__ip_vs_conn_in_get
> > -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
> > - const union nf_inet_addr *d_addr, __be16 d_port)
> > +static inline struct ip_vs_conn *
> > +__ip_vs_conn_in_get(struct net *net, int af, int protocol,
> > +                 const union nf_inet_addr *s_addr, __be16 s_port,
> > +                 const union nf_inet_addr *d_addr, __be16 d_port)
> >  {
> >       unsigned hash;
> >       struct ip_vs_conn *cp;
> > +     struct netns_ipvs *ipvs = net->ipvs;
> >
> >       hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
> >
> >       ct_read_lock(hash);
> >
> > -     list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
> > +     list_for_each_entry(cp, &ipvs->conn_tab[hash], c_list) {
> >               if (cp->af == af &&
> >                   ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
> >                   ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
> > @@ -251,16 +242,18 @@ static inline struct ip_vs_conn *__ip_vs_conn_in_get
> >       return NULL;
> >  }
> >
> > -struct ip_vs_conn *ip_vs_conn_in_get
> > -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
> > - const union nf_inet_addr *d_addr, __be16 d_port)
> > +struct ip_vs_conn *
> > +ip_vs_conn_in_get(struct net *net, int af, int protocol,
> > +               const union nf_inet_addr *s_addr, __be16 s_port,
> > +               const union nf_inet_addr *d_addr, __be16 d_port)
> >  {
> >       struct ip_vs_conn *cp;
> >
> > -     cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port);
> > -     if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
> > -             cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr,
> > -                                      d_port);
> > +     cp = __ip_vs_conn_in_get(net, af, protocol,
> > +                              s_addr, s_port, d_addr, d_port);
> > +     if (!cp && atomic_read(&net->ipvs->conn_no_cport_cnt))
> > +             cp = __ip_vs_conn_in_get(net, af, protocol,
> > +                                      s_addr, 0, d_addr, d_port);
> >
> >       IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
> >                     ip_vs_proto_name(protocol),
> > @@ -278,35 +271,41 @@ ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
> >                       unsigned int proto_off, int inverse)
> >  {
> >       __be16 _ports[2], *pptr;
> > +     struct net *net = dev_net(skb->dev);
> >
> >       pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
> >       if (pptr == NULL)
> >               return NULL;
> >
> > +     BUG_ON(!net);
>
> Can you explain why BUG_ON is here?

Yes, I forgot to remove it.
I had them everywhere to make sure that the net ptr was set,
- don't call me paranoid ;-)
>
> >       if (likely(!inverse))
> > -             return ip_vs_conn_in_get(af, iph->protocol,
> > +             return ip_vs_conn_in_get(net, af, iph->protocol,
> >                                        &iph->saddr, pptr[0],
> >                                        &iph->daddr, pptr[1]);
> >       else
> > -             return ip_vs_conn_in_get(af, iph->protocol,
> > +             return ip_vs_conn_in_get(net, af, iph->protocol,
> >                                        &iph->daddr, pptr[1],
> >                                        &iph->saddr, pptr[0]);
> >  }
> >  EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto);
> >
> > -/* Get reference to connection template */
> > -struct ip_vs_conn *ip_vs_ct_in_get
> > -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
> > - const union nf_inet_addr *d_addr, __be16 d_port)
> > +/*
> > + *  Get reference to connection template
> > + */
> > +struct ip_vs_conn *
> > +ip_vs_ct_in_get(struct net *net, int af, int protocol,
> > +             const union nf_inet_addr *s_addr, __be16 s_port,
> > +             const union nf_inet_addr *d_addr, __be16 d_port)
> >  {
> >       unsigned hash;
> >       struct ip_vs_conn *cp;
> > +     struct netns_ipvs *ipvs = net->ipvs;
> >
> >       hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
> >
> >       ct_read_lock(hash);
> >
> > -     list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
> > +     list_for_each_entry(cp, &ipvs->conn_tab[hash], c_list) {
> >               if (cp->af == af &&
> >                   ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
> >                   /* protocol should only be IPPROTO_IP if
> > @@ -341,12 +340,14 @@ struct ip_vs_conn *ip_vs_ct_in_get
> >   *   s_addr, s_port: pkt source address (inside host)
> >   *   d_addr, d_port: pkt dest address (foreign host)
> >   */
> > -struct ip_vs_conn *ip_vs_conn_out_get
> > -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
> > - const union nf_inet_addr *d_addr, __be16 d_port)
> > +struct ip_vs_conn *
> > +ip_vs_conn_out_get(struct net *net, int af, int protocol,
> > +                const union nf_inet_addr *s_addr, __be16 s_port,
> > +                const union nf_inet_addr *d_addr, __be16 d_port)
> >  {
> >       unsigned hash;
> >       struct ip_vs_conn *cp, *ret=NULL;
> > +     struct netns_ipvs *ipvs = net->ipvs;
> >
> >       /*
> >        *      Check for "full" addressed entries
> > @@ -355,7 +356,7 @@ struct ip_vs_conn *ip_vs_conn_out_get
> >
> >       ct_read_lock(hash);
> >
> > -     list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
> > +     list_for_each_entry(cp, &ipvs->conn_tab[hash], c_list) {
> >               if (cp->af == af &&
> >                   ip_vs_addr_equal(af, d_addr, &cp->caddr) &&
> >                   ip_vs_addr_equal(af, s_addr, &cp->daddr) &&
> > @@ -386,17 +387,19 @@ ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
> >                        unsigned int proto_off, int inverse)
> >  {
> >       __be16 _ports[2], *pptr;
> > +     struct net *net = dev_net(skb->dev);
> >
> >       pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
> >       if (pptr == NULL)
> >               return NULL;
> >
> > +     BUG_ON(!net);
> >       if (likely(!inverse))
> > -             return ip_vs_conn_out_get(af, iph->protocol,
> > +             return ip_vs_conn_out_get(net, af, iph->protocol,
> >                                         &iph->saddr, pptr[0],
> >                                         &iph->daddr, pptr[1]);
> >       else
> > -             return ip_vs_conn_out_get(af, iph->protocol,
> > +             return ip_vs_conn_out_get(net, af, iph->protocol,
> >                                         &iph->daddr, pptr[1],
> >                                         &iph->saddr, pptr[0]);
> >  }
> > @@ -408,7 +411,7 @@ EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);
> >  void ip_vs_conn_put(struct ip_vs_conn *cp)
> >  {
> >       unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ?
> > -             0 : cp->timeout;
> > +                        0 : cp->timeout;
> >       mod_timer(&cp->timer, jiffies+t);
> >
> >       __ip_vs_conn_put(cp);
> > @@ -418,19 +421,19 @@ void ip_vs_conn_put(struct ip_vs_conn *cp)
> >  /*
> >   *   Fill a no_client_port connection with a client port number
> >   */
> > -void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
> > +void ip_vs_conn_fill_cport(struct net *net, struct ip_vs_conn *cp, __be16 cport)
> >  {
> >       if (ip_vs_conn_unhash(cp)) {
> >               spin_lock(&cp->lock);
> >               if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
> > -                     atomic_dec(&ip_vs_conn_no_cport_cnt);
> > +                     atomic_dec(&net->ipvs->conn_no_cport_cnt);
> >                       cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
> >                       cp->cport = cport;
> >               }
> >               spin_unlock(&cp->lock);
> >
> >               /* hash on new dport */
> > -             ip_vs_conn_hash(cp);
> > +             ip_vs_conn_hash(net, cp);
> >       }
> >  }
> >
> > @@ -561,12 +564,12 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
> >   * Check if there is a destination for the connection, if so
> >   * bind the connection to the destination.
> >   */
> > -struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
> > +struct ip_vs_dest *ip_vs_try_bind_dest(struct net *net, struct ip_vs_conn *cp)
> >  {
> >       struct ip_vs_dest *dest;
> >
> >       if ((cp) && (!cp->dest)) {
> > -             dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport,
> > +             dest = ip_vs_find_dest(net, cp->af, &cp->daddr, cp->dport,
> >                                      &cp->vaddr, cp->vport,
> >                                      cp->protocol);
> >               ip_vs_bind_dest(cp, dest);
> > @@ -638,7 +641,7 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
> >   *   If available, return 1, otherwise invalidate this connection
> >   *   template and return 0.
> >   */
> > -int ip_vs_check_template(struct ip_vs_conn *ct)
> > +int ip_vs_check_template(struct net *net, struct ip_vs_conn *ct)
> >  {
> >       struct ip_vs_dest *dest = ct->dest;
> >
> > @@ -647,7 +650,7 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
> >        */
> >       if ((dest == NULL) ||
> >           !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
> > -         (sysctl_ip_vs_expire_quiescent_template &&
> > +         (net->ipvs->sysctl_expire_quiescent_template &&
> >            (atomic_read(&dest->weight) == 0))) {
> >               IP_VS_DBG_BUF(9, "check_template: dest not available for "
> >                             "protocol %s s:%s:%d v:%s:%d "
> > @@ -668,7 +671,7 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
> >                               ct->dport = htons(0xffff);
> >                               ct->vport = htons(0xffff);
> >                               ct->cport = 0;
> > -                             ip_vs_conn_hash(ct);
> > +                             ip_vs_conn_hash(net, ct);
> >                       }
> >               }
> >
> > @@ -720,16 +723,17 @@ static void ip_vs_conn_expire(unsigned long data)
> >               if (unlikely(cp->app != NULL))
> >                       ip_vs_unbind_app(cp);
> >               ip_vs_unbind_dest(cp);
> > +             BUG_ON(!cp->net);
> >               if (cp->flags & IP_VS_CONN_F_NO_CPORT)
> > -                     atomic_dec(&ip_vs_conn_no_cport_cnt);
> > -             atomic_dec(&ip_vs_conn_count);
> > +                     atomic_dec(&cp->net->ipvs->conn_no_cport_cnt);
> > +             atomic_dec(&cp->net->ipvs->conn_count);
> >
> > -             kmem_cache_free(ip_vs_conn_cachep, cp);
> > +             kmem_cache_free(cp->net->ipvs->conn_cachep, cp);
> >               return;
> >       }
> >
> >       /* hash it back to the table */
> > -     ip_vs_conn_hash(cp);
> > +     ip_vs_conn_hash(cp->net, cp);
> >
> >    expire_later:
> >       IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
> > @@ -748,18 +752,22 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
> >
> >
> >  /*
> > - *   Create a new connection entry and hash it into the ip_vs_conn_tab
> > + *   Create a new connection entry and hash it into the ip_vs_conn_tab,
> > + *   netns ptr will be stored in ip_vs_con here.
> >   */
> >  struct ip_vs_conn *
> > -ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
> > +ip_vs_conn_new(struct net *net, int af, int proto,
> > +            const union nf_inet_addr *caddr, __be16 cport,
> >              const union nf_inet_addr *vaddr, __be16 vport,
> > -            const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
> > -            struct ip_vs_dest *dest)
> > +            const union nf_inet_addr *daddr, __be16 dport,
> > +            unsigned flags, struct ip_vs_dest *dest)
> >  {
> >       struct ip_vs_conn *cp;
> > -     struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
> > +     struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, proto);
> > +     struct ip_vs_protocol *pp;
> > +     struct netns_ipvs *ipvs = net->ipvs;
> >
> > -     cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
> > +     cp = kmem_cache_zalloc(ipvs->conn_cachep, GFP_ATOMIC);
> >       if (cp == NULL) {
> >               IP_VS_ERR_RL("%s(): no memory\n", __func__);
> >               return NULL;
> > @@ -790,9 +798,9 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
> >       atomic_set(&cp->n_control, 0);
> >       atomic_set(&cp->in_pkts, 0);
> >
> > -     atomic_inc(&ip_vs_conn_count);
> > +     atomic_inc(&ipvs->conn_count);
> >       if (flags & IP_VS_CONN_F_NO_CPORT)
> > -             atomic_inc(&ip_vs_conn_no_cport_cnt);
> > +             atomic_inc(&ipvs->conn_no_cport_cnt);
> >
> >       /* Bind the connection with a destination server */
> >       ip_vs_bind_dest(cp, dest);
> > @@ -808,12 +816,14 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
> >       else
> >  #endif
> >               ip_vs_bind_xmit(cp);
> > -
> > -     if (unlikely(pp && atomic_read(&pp->appcnt)))
> > -             ip_vs_bind_app(cp, pp);
> > -
> > +     cp->net = net;  /* netns ptr  needed in timer */
> > +     if( pd ) {
> > +             pp = pd->pp;
> > +             if (unlikely(pp && atomic_read(&pd->appcnt)))
> > +                     ip_vs_bind_app(net, cp, pp);
> > +     }
> >       /* Hash it in the ip_vs_conn_tab finally */
> > -     ip_vs_conn_hash(cp);
> > +     ip_vs_conn_hash(net, cp);
> >
> >       return cp;
> >  }
> > @@ -824,16 +834,33 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
> >   */
> >  #ifdef CONFIG_PROC_FS
> >
> > +struct ipvs_private {
> > +     struct seq_net_private p;
> > +     void *private;
> > +};
> > +
> > +static inline void ipvs_seq_priv_set(struct seq_file *seq, void *data)
> > +{
> > +     struct ipvs_private *ipriv=(struct ipvs_private *)seq->private;
> > +     ipriv->private = data;
> > +}
> > +static inline void *ipvs_seq_priv_get(struct seq_file *seq)
> > +{
> > +     return ((struct ipvs_private *)seq->private)->private;
> > +}
> > +
> >  static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
> >  {
> >       int idx;
> >       struct ip_vs_conn *cp;
> > +     struct net *net = seq_file_net(seq);
> > +     struct netns_ipvs *ipvs = net->ipvs;
> >
> >       for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
> >               ct_read_lock_bh(idx);
> > -             list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
> > +             list_for_each_entry(cp, &ipvs->conn_tab[idx], c_list) {
> >                       if (pos-- == 0) {
> > -                             seq->private = &ip_vs_conn_tab[idx];
> > +                             ipvs_seq_priv_set(seq, &ipvs->conn_tab[idx]);
> >                               return cp;
> >                       }
> >               }
> > @@ -845,15 +872,17 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
> >
> >  static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
> >  {
> > -     seq->private = NULL;
> > +     ipvs_seq_priv_set(seq, NULL);
> >       return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
> >  }
> > -
> > + /* netns: conn_tab OK */
> >  static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> >  {
> >       struct ip_vs_conn *cp = v;
> > -     struct list_head *e, *l = seq->private;
> > +     struct list_head *e, *l = ipvs_seq_priv_get(seq);
> >       int idx;
> > +     struct net *net = seq_file_net(seq);
> > +     struct netns_ipvs *ipvs = net->ipvs;
> >
> >       ++*pos;
> >       if (v == SEQ_START_TOKEN)
> > @@ -863,27 +892,28 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> >       if ((e = cp->c_list.next) != l)
> >               return list_entry(e, struct ip_vs_conn, c_list);
> >
> > -     idx = l - ip_vs_conn_tab;
> > +     idx = l - ipvs->conn_tab;
> >       ct_read_unlock_bh(idx);
> >
> >       while (++idx < ip_vs_conn_tab_size) {
> >               ct_read_lock_bh(idx);
> > -             list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
> > -                     seq->private = &ip_vs_conn_tab[idx];
> > +             list_for_each_entry(cp, &ipvs->conn_tab[idx], c_list) {
> > +                     ipvs_seq_priv_set(seq, &ipvs->conn_tab[idx]);
> >                       return cp;
> >               }
> >               ct_read_unlock_bh(idx);
> >       }
> > -     seq->private = NULL;
> > +     ipvs_seq_priv_set(seq, NULL);
> >       return NULL;
> >  }
> > -
> > +/* netns: conn_tab OK */
> >  static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
> >  {
> > -     struct list_head *l = seq->private;
> > +     struct list_head *l = ipvs_seq_priv_get(seq);
> > +     struct net *net = seq_file_net(seq);
> >
> >       if (l)
> > -             ct_read_unlock_bh(l - ip_vs_conn_tab);
> > +             ct_read_unlock_bh(l - net->ipvs->conn_tab);
> >  }
> >
> >  static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
> > @@ -928,7 +958,16 @@ static const struct seq_operations ip_vs_conn_seq_ops = {
> >
> >  static int ip_vs_conn_open(struct inode *inode, struct file *file)
> >  {
> > -     return seq_open(file, &ip_vs_conn_seq_ops);
> > +     int ret;
> > +     struct ipvs_private *priv;
> > +
> > +     ret = seq_open_net(inode, file, &ip_vs_conn_seq_ops,
> > +                        sizeof(struct ipvs_private));
> > +     if (!ret) {
> > +             priv = ((struct seq_file *)file->private_data)->private;
> > +             priv->private = NULL;
> > +     }
> > +     return ret;
> >  }
> >
> >  static const struct file_operations ip_vs_conn_fops = {
> > @@ -936,7 +975,8 @@ static const struct file_operations ip_vs_conn_fops = {
> >       .open    = ip_vs_conn_open,
> >       .read    = seq_read,
> >       .llseek  = seq_lseek,
> > -     .release = seq_release,
> > +     .release = seq_release_private,
> > +
> >  };
> >
> >  static const char *ip_vs_origin_name(unsigned flags)
> > @@ -991,7 +1031,17 @@ static const struct seq_operations ip_vs_conn_sync_seq_ops = {
> >
> >  static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
> >  {
> > -     return seq_open(file, &ip_vs_conn_sync_seq_ops);
> > +     int ret;
> > +     struct ipvs_private *ipriv;
> > +
> > +     ret = seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops,
> > +                        sizeof(struct ipvs_private));
> > +     if (!ret) {
> > +             ipriv = ((struct seq_file *)file->private_data)->private;
> > +             ipriv->private = NULL;
> > +     }
> > +     return ret;
> > +//   return seq_open(file, &ip_vs_conn_sync_seq_ops);
> >  }
> >
> >  static const struct file_operations ip_vs_conn_sync_fops = {
> > @@ -999,7 +1049,7 @@ static const struct file_operations ip_vs_conn_sync_fops = {
> >       .open    = ip_vs_conn_sync_open,
> >       .read    = seq_read,
> >       .llseek  = seq_lseek,
> > -     .release = seq_release,
> > +     .release = seq_release_private,
> >  };
> >
> >  #endif
> > @@ -1036,11 +1086,14 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
> >       return 1;
> >  }
> >
> > -/* Called from keventd and must protect itself from softirqs */
> > -void ip_vs_random_dropentry(void)
> > +/* Called from keventd and must protect itself from softirqs
> > + * netns: conn_tab OK
> > + */
> > +void ip_vs_random_dropentry(struct net *net)
> >  {
> >       int idx;
> >       struct ip_vs_conn *cp;
> > +     struct netns_ipvs *ipvs = net->ipvs;
> >
> >       /*
> >        * Randomly scan 1/32 of the whole table every second
> > @@ -1053,7 +1106,7 @@ void ip_vs_random_dropentry(void)
> >                */
> >               ct_write_lock_bh(hash);
> >
> > -             list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
> > +             list_for_each_entry(cp, &ipvs->conn_tab[hash], c_list) {
> >                       if (cp->flags & IP_VS_CONN_F_TEMPLATE)
> >                               /* connection template */
> >                               continue;
> > @@ -1091,11 +1144,13 @@ void ip_vs_random_dropentry(void)
> >
> >  /*
> >   *      Flush all the connection entries in the ip_vs_conn_tab
> > + * netns: conn_tab OK
> >   */
> > -static void ip_vs_conn_flush(void)
> > +static void ip_vs_conn_flush(struct net *net)
> >  {
> >       int idx;
> >       struct ip_vs_conn *cp;
> > +     struct netns_ipvs *ipvs = net->ipvs;
> >
> >    flush_again:
> >       for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
> > @@ -1104,7 +1159,7 @@ static void ip_vs_conn_flush(void)
> >                */
> >               ct_write_lock_bh(idx);
> >
> > -             list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
> > +             list_for_each_entry(cp, &ipvs->conn_tab[idx], c_list) {
> >
> >                       IP_VS_DBG(4, "del connection\n");
> >                       ip_vs_conn_expire_now(cp);
> > @@ -1118,16 +1173,17 @@ static void ip_vs_conn_flush(void)
> >
> >       /* the counter may be not NULL, because maybe some conn entries
> >          are run by slow timer handler or unhashed but still referred */
> > -     if (atomic_read(&ip_vs_conn_count) != 0) {
> > +     if (atomic_read(&ipvs->conn_count) != 0) {
> >               schedule();
> >               goto flush_again;
> >       }
> >  }
> >
> >
> > -int __init ip_vs_conn_init(void)
> > +int __net_init __ip_vs_conn_init(struct net *net)
> >  {
> >       int idx;
> > +     struct netns_ipvs *ipvs = net->ipvs;
> >
> >       /* Compute size and mask */
> >       ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
> > @@ -1136,19 +1192,26 @@ int __init ip_vs_conn_init(void)
> >       /*
> >        * Allocate the connection hash table and initialize its list heads
> >        */
> > -     ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size *
> > +     ipvs->conn_tab = vmalloc(ip_vs_conn_tab_size *
> >                                sizeof(struct list_head));
> > -     if (!ip_vs_conn_tab)
> > +     if (!ipvs->conn_tab)
> >               return -ENOMEM;
> >
> >       /* Allocate ip_vs_conn slab cache */
> > -     ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
> > +     /* Todo: find a better way to name the cache */
> > +     snprintf(ipvs->conn_cname, sizeof(ipvs->conn_cname)-1,
> > +                     "ipvs_conn_%d", atomic_read(&conn_cache_nr) );
> > +     atomic_inc(&conn_cache_nr);
> > +
> > +     ipvs->conn_cachep = kmem_cache_create(ipvs->conn_cname,
> >                                             sizeof(struct ip_vs_conn), 0,
> >                                             SLAB_HWCACHE_ALIGN, NULL);
> > -     if (!ip_vs_conn_cachep) {
> > -             vfree(ip_vs_conn_tab);
> > +     if (!ipvs->conn_cachep) {
> > +             vfree(ipvs->conn_tab);
> >               return -ENOMEM;
> >       }
> > +     atomic_set(&ipvs->conn_count, 0);
> > +     atomic_set(&ipvs->conn_no_cport_cnt, 0);
> >
> >       pr_info("Connection hash table configured "
> >               "(size=%d, memory=%ldKbytes)\n",
> > @@ -1158,31 +1221,46 @@ int __init ip_vs_conn_init(void)
> >                 sizeof(struct ip_vs_conn));
> >
> >       for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
> > -             INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
> > +             INIT_LIST_HEAD(&ipvs->conn_tab[idx]);
> >       }
> >
> >       for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++)  {
> >               rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
> >       }
> >
> > -     proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops);
> > -     proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
> > -
> > -     /* calculate the random value for connection hash */
> > -     get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
> > +     proc_net_fops_create(net, "ip_vs_conn", 0, &ip_vs_conn_fops);
> > +     proc_net_fops_create(net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
> >
> >       return 0;
> >  }
> > +/* Cleanup and release all netns related ... */
> > +static void __net_exit __ip_vs_conn_cleanup(struct net *net) {
> >
> > +     /* flush all the connection entries first */
> > +     ip_vs_conn_flush(net);
> > +     /* Release the empty cache */
> > +     kmem_cache_destroy(net->ipvs->conn_cachep);
> > +     proc_net_remove(net, "ip_vs_conn");
> > +     proc_net_remove(net, "ip_vs_conn_sync");
> > +     vfree(net->ipvs->conn_tab);
> > +}
> > +static struct pernet_operations ipvs_conn_ops = {
> > +     .init = __ip_vs_conn_init,
> > +     .exit = __ip_vs_conn_cleanup,
> > +};
> >
> > -void ip_vs_conn_cleanup(void)
> > +int __init ip_vs_conn_init(void)
> >  {
> > -     /* flush all the connection entries first */
> > -     ip_vs_conn_flush();
> > +     int rv;
> >
> > -     /* Release the empty cache */
> > -     kmem_cache_destroy(ip_vs_conn_cachep);
> > -     proc_net_remove(&init_net, "ip_vs_conn");
> > -     proc_net_remove(&init_net, "ip_vs_conn_sync");
> > -     vfree(ip_vs_conn_tab);
> > +     rv = register_pernet_subsys(&ipvs_conn_ops);
> > +
> > +     /* calculate the random value for connection hash */
> > +     get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
> > +     return rv;
> > +}
> > +
> > +void ip_vs_conn_cleanup(void)
> > +{
> > +     unregister_pernet_subsys(&ipvs_conn_ops);
> >  }
> >
> > --
> > Regards
> > Hans Schillstrom <hans.schillstrom@ericsson.com>
> > --
> > To unsubscribe from this list: send the line "unsubscribe lvs-devel" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >
>

--
Regards
Hans Schillstrom <hans.schillstrom@ericsson.com>

^ permalink raw reply

* Re: [BUG] problems with "ip xfrm" on 32-bit userspace with 64-bit kernel
From: Florian Westphal @ 2010-10-21  7:50 UTC (permalink / raw)
  To: Chris Friesen; +Cc: netdev, Linux Kernel Mailing List
In-Reply-To: <4CBF78B6.90002@genband.com>

Chris Friesen <chris.friesen@genband.com> wrote:
> We've run into a 32/64 compatibility problem with iproute2.  The "ip
> xfrm monitor acquire" command doesn't work properly due to struct size
> mismatches between kernel and userspace.

Yes.  See archives for 'xfrm: add x86 CONFIG_COMPAT support'
(http://marc.info/?t=127050655600003&r=1&w=2)

for a discussion on why the patch set to fix this was rejected.

^ permalink raw reply

* Re: [RFC PATCH 1/9] ipvs network name space aware
From: Hans Schillstrom @ 2010-10-21  7:45 UTC (permalink / raw)
  To: paulmck@linux.vnet.ibm.com
  Cc: Daniel Lezcano, lvs-devel@vger.kernel.org, netdev@vger.kernel.org,
	netfilter-devel@vger.kernel.org, horms@verge.net.au, ja@ssi.bg,
	wensong@linux-vs.org
In-Reply-To: <20101020160205.GB2386@linux.vnet.ibm.com>

On Wednesday 20 October 2010 18:02:06 Paul E. McKenney wrote:
> On Wed, Oct 20, 2010 at 10:25:19AM +0200, Hans Schillstrom wrote:
> > On Tuesday 19 October 2010 20:44:36 Paul E. McKenney wrote:
> > > On Mon, Oct 18, 2010 at 03:23:48PM +0200, Hans Schillstrom wrote:
> > > > On Monday 18 October 2010 13:37:38 Daniel Lezcano wrote:
> > > > > On 10/18/2010 11:54 AM, Hans Schillstrom wrote:
> > > > > > On Monday 18 October 2010 10:59:25 Daniel Lezcano wrote:
> > > > > >
> > > > > >> On 10/08/2010 01:16 PM, Hans Schillstrom wrote:
> > > > > >>
> > > > > >>> This part contains the include files
> > > > > >>> where include/net/netns/ip_vs.h is new and contains all moved vars.
> > > > > >>>
> > > > > >>> SUMMARY
> > > > > >>>
> > > > > >>>    include/net/ip_vs.h                     |  136 ++++---
> > > > > >>>    include/net/net_namespace.h             |    2 +
> > > > > >>>    include/net/netns/ip_vs.h               |  112 +++++
> > > > > >>>
> > > > > >>> Signed-off-by:Hans Schillstrom<hans.schillstrom@ericsson.com>
> > > > > >>> ---
> > > > > >>>
> > > > > >>>
> > > > > >>>
> > > > > >> [ ... ]
> > > > > >>
> > > > > >>
> > > > > >>>    #ifdef CONFIG_IP_VS_IPV6
> > > > > >>> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
> > > > > >>> index bd10a79..b59cdc5 100644
> > > > > >>> --- a/include/net/net_namespace.h
> > > > > >>> +++ b/include/net/net_namespace.h
> > > > > >>> @@ -15,6 +15,7 @@
> > > > > >>>    #include<net/netns/ipv4.h>
> > > > > >>>    #include<net/netns/ipv6.h>
> > > > > >>>    #include<net/netns/dccp.h>
> > > > > >>> +#include<net/netns/ip_vs.h>
> > > > > >>>    #include<net/netns/x_tables.h>
> > > > > >>>    #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
> > > > > >>>    #include<net/netns/conntrack.h>
> > > > > >>> @@ -91,6 +92,7 @@ struct net {
> > > > > >>>    	struct sk_buff_head	wext_nlevents;
> > > > > >>>    #endif
> > > > > >>>    	struct net_generic	*gen;
> > > > > >>> +	struct netns_ipvs       *ipvs;
> > > > > >>>    };
> > > > > >>>
> > > > > >>>
> > > > > >> IMHO, it would be better to use the net_generic infra-structure instead
> > > > > >> of adding a new field in the netns structure.
> > > > > >>
> > > > > >>
> > > > > >>
> > > > > > I realized that to, but the performance penalty is quite high with net_generic :-(
> > > > > > But on the other hand if you are going to backport it, (without recompiling the kernel)
> > > > > > you gonna need it!
> > > > > >
> > > > >
> > > > > Hmm, yes. We don't want to have the init_net_ns performances to be impacted.
> > > > >
> > > > > You use here a pointer which will be dereferenced like the net_generic,
> > > > > I don't think there will be
> > > > > a big difference between using net_generic and using a pointer in the
> > > > > net namespace structure.
> > > > >
> > > > > The difference is the id usage, but this one is based on the idr which
> > > > > is quite fast.
> > > > >
> > > >
> > > > I'm not so sure about that, have a look at net_generic and rcu_read_lock
> > > > and compare
> > > >  ipvs = net->ipvs;
> > > > vs.
> > > >  ipvs = net_generic(net, id)
> > > >
> > > > static inline void *net_generic(struct net *net, int id)
> > > > {
> > > > 	struct net_generic *ng;
> > > > 	void *ptr;
> > > >
> > > > 	rcu_read_lock();
> > > > 	ng = rcu_dereference(net->gen);
> > > > 	BUG_ON(id == 0 || id > ng->len);
> > > > 	ptr = ng->ptr[id - 1];
> > > > 	rcu_read_unlock();
> > > >
> > > > 	return ptr;
> > > > }
> > > > ...
> > > > static inline void rcu_read_lock(void)
> > > > {
> > > >         __rcu_read_lock();
> > > >         __acquire(RCU);
> > > >         rcu_read_acquire();
> > > > }
> > > >
> > > > Another way of doing it is to pass the ipvs ptr instead of the net ptr,
> > > > and add *net to the ipvs struct.
> > > >
> > > > > We should experiment a bit here to compare both solutions.
> > > > Agre
> > > > >
> > > > I single stepped through the rcu_read_lock() on a x86_64
> > > > and it's quite many "stepi" that you need to enter :-(
> > >
> > > Was this by chance with lockdep enabled?  If not, could you please send
> > > your .config?
> > >
> > > 							Thanx, Paul
> >
> > No lockdep, but what I ment is that net_generic is not as fast as a plain ptr->xxx.
> > IPVS has hooks in the netfilter chain, and gets a huge amount of packets .
> >
> > I don't think IPVS is a candidate for net_generic, it should have its own part in "struct net"
> > That was my point.
> > ( No critic to locking or net_generic)
>
> You said that there were a lot of "stepi" commands to get through
> rcu_read_lock() on x86_64.  This is quite surprising, especially if you
> built with CONFIG_RCU_TREE.  Even if you built with CONFIG_PREEMPT_RCU_TREE,
> you should only see something like the following from rcu_read_lock():
>
> 000000b7 <__rcu_read_lock>:
>       b7:	55                   	push   %ebp
>       b8:	64 a1 00 00 00 00    	mov    %fs:0x0,%eax
>       be:	ff 80 80 01 00 00    	incl   0x180(%eax)
>       c4:	89 e5                	mov    %esp,%ebp
>       c6:	5d                   	pop    %ebp
>       c7:	c3                   	ret
>
> Unless you have some sort of debugging options turned on.  Or unless
> six instructions counts for "quite many" stepi commands.  ;-)
>
I do have this (and some debugging)
__rcu_read_lock()
=> 0xffffffff8108bcf3 <+0>:	push   %rbp
   0xffffffff8108bcf4 <+1>:	mov    %rsp,%rbp
   0xffffffff8108bcf7 <+4>:	nopl   0x0(%rax,%rax,1)
   0xffffffff8108bcfc <+9>:	mov    %gs:0xb540,%rax
   0xffffffff8108bd05 <+18>:	mov    0x108(%rax),%edx
   0xffffffff8108bd0b <+24>:	inc    %edx
   0xffffffff8108bd0d <+26>:	mov    %edx,0x108(%rax)
   0xffffffff8108bd13 <+32>:	leaveq
   0xffffffff8108bd14 <+33>:	retq

which is not that many, actually impressively few instructions :-)

Thanks
	Hans

> So I am quite curious, independent of whether or not IPVS is a candidate
> for net_generic.  That choice for IPVS is not mine to make, and I will
> trust the relevant developers and maintainers to make the right choice,
> whether that be RCU or something else.  Even I do not claim that RCU
> is the right tool for all jobs!  ;-)
>
> 							Thanx, Paul
> --
> To unsubscribe from this list: send the line "unsubscribe lvs-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

--
Regards
Hans Schillstrom <hans.schillstrom@ericsson.com>

^ permalink raw reply

* Re: [RFC PATCH 5/9] ipvs network name space aware
From: Hans Schillstrom @ 2010-10-21  7:35 UTC (permalink / raw)
  To: Simon Horman
  Cc: lvs-devel@vger.kernel.org, netdev@vger.kernel.org,
	netfilter-devel@vger.kernel.org, ja@ssi.bg, wensong@linux-vs.org,
	daniel.lezcano@free.fr
In-Reply-To: <20101020152112.GA8502@verge.net.au>

On Wednesday 20 October 2010 17:21:45 Simon Horman wrote:
> On Fri, Oct 08, 2010 at 01:17:02PM +0200, Hans Schillstrom wrote:
> > This patch just contains ip_vs_ctl
> >
> > Signed-off-by:Hans Schillstrom <hans.schillstrom@ericsson.com>
> >
> > diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
> > index ca8ec8c..7e99cbc 100644
> > --- a/net/netfilter/ipvs/ip_vs_ctl.c
> > +++ b/net/netfilter/ipvs/ip_vs_ctl.c
>
> [ snip ]
>
> > @@ -3377,62 +3383,131 @@ static void ip_vs_genl_unregister(void)
> >  }
> >
> >  /* End of Generic Netlink interface definitions */
> > +/*
> > + * per netns intit/exit func.
> > + */
> > +int /*__net_init*/ __ip_vs_control_init(struct net *net)
>
> Can you describe why __net_init is commented out?

The coloring in my editor got fucked up  :-)
I just forgot to remove the comment

>
> [ snip ]
>
--
Regards
Hans Schillstrom <hans.schillstrom@ericsson.com>

^ permalink raw reply

* TCP always advertises zero window.
From: Li Yu @ 2010-10-21  7:17 UTC (permalink / raw)
  To: netdev@vger.kernel.org

Hi,

	We found this on RHEL 5.4 (kernel 2.6.18-164.11.1.el5), and we suspect that recent kernels have a similar problem.

	First, we turned off both the TCP window scaling option and the MTU probing feature. We then found that some servers always advertise a zero receive window to the other end; below is some captured traffic (by tcpdump -S -nn -vv):

16:24:59.990545 IP (tos 0x0, ttl  64, id 37079, offset 0, flags [DF], proto: TCP (6), length: 52) 10.1.157.3.2904 > 10.1.157.4.2903: ., cksum 0x96df (correct), 3830348746:3830348746(0) ack 1951026211 win 65160 <nop,nop,timestamp 1040455485 1040632013>
16:25:00.054563 IP (tos 0x0, ttl  64, id 47424, offset 0, flags [DF], proto: TCP (6), length: 460) 10.1.157.4.2903 > 10.1.157.3.2904: P 1951026211:1951026619(408) ack 3830348746 win 0 <nop,nop,timestamp 1040632077 1040455485>
16:25:00.054579 IP (tos 0x0, ttl  64, id 37080, offset 0, flags [DF], proto: TCP (6), length: 52) 10.1.157.3.2904 > 10.1.157.4.2903: ., cksum 0x94c7 (correct), 3830348746:3830348746(0) ack 1951026619 win 65160 <nop,nop,timestamp 1040455549 1040632077>
16:25:01.451253 IP (tos 0x0, ttl  64, id 47425, offset 0, flags [DF], proto: TCP (6), length: 4148) 10.1.157.4.2903 > 10.1.157.3.2904: P 1951026619:1951030715(4096) ack 3830348746 win 0 <nop,nop,timestamp 1040633474 1040455549>

	As the above example shows, 10.1.157.4 advertises a zero window forever. I wrote a small toy kernel module to show the internal TCP socket status, as below:

tcp-snapshot:sock:
  sk->sk_family=2
  sk->sk_state=1
  sk->sk_reuse=1
  sk->sk_bound_dev_if=0
  atomic_read(&sk->sk_refcnt)=3
  sk->sk_hash=117920776
  sk->sk_shutdown=0
  sk->sk_no_check=0
  sk->sk_userlocks=7
  sk->sk_protocol=6
  sk->sk_type=1
  sk->sk_rcvbuf=131072
  list_empty(&sk->sk_sleep->task_list)=0
  atomic_read(&sk->sk_rmem_alloc)=0
  atomic_read(&sk->sk_wmem_alloc)=0
  atomic_read(&sk->sk_omem_alloc)=0
  sk->sk_receive_queue.qlen=0
  sk->sk_write_queue.qlen=0
  sk->sk_async_wait_queue.qlen=0
  sk->sk_error_queue.qlen=0
  sk->sk_wmem_queued=0
  sk->sk_forward_alloc=8192
  sk->sk_allocation=d0
  sk->sk_sndbuf=131072
  sk->sk_route_caps=1143a9
  sk->sk_gso_type=1
  sk->sk_rcvlowat=1
  sk->sk_flags=300
  sk->sk_lingertime=0
  sk->sk_err=0
  sk->sk_err_soft=0
  sk->sk_ack_backlog=0
  sk->sk_max_ack_backlog=128
  sk->sk_priority=0
  sk->sk_rcvtimeo=9223372036854775807
  sk->sk_sndtimeo=9223372036854775807
  sk->sk_protinfo=0000000000000000
  sk->sk_stamp.tv_sec=18446744073709551615
  sk->sk_stamp.tv_usec=18446744073709551615
  sk->sk_socket=ffff81053ee71080
  sk->sk_user_data=0000000000000000
  sk->sk_sndmsg_page=ffff8103761ab220
  sk->sk_sndmsg_off=475
  sk->sk_send_head=0000000000000000
  sk->sk_write_pending=0
tcp-snapshot:inet_sock:
  inetsk->daddr=39d010a
  inetsk->rcv_saddr=49d010a
  inetsk->dport=580b
  inetsk->num=b57
  inetsk->saddr=49d010a
  inetsk->uc_ttl=4294967295
  inetsk->cmsg_flags=0
  inetsk->opt=0000000000000000
  inetsk->sport=570b
  inetsk->id=5843
  inetsk->tos=0
  inetsk->mc_ttl=64
  inetsk->pmtudisc=1
  inetsk->recverr=0
  inetsk->is_icsk=1
  inetsk->freebind=0
  inetsk->hdrincl=0
  inetsk->mc_loop=1
  inetsk->mc_index=2
  inetsk->mc_addr=0
  inetsk->mc_list=0000000000000000
tcp-snapshot:inet_connection_sk
  icsk->icsk_accept_queue.rskq_defer_accept=0
  icsk->icsk_accept_queue.listen_opt=0000000000000000
  icsk->icsk_timeout=5336784156
  icsk->icsk_rto=218
  icsk->icsk_pmtu_cookie=1500
  icsk->icsk_ca_state=0
  icsk->icsk_retransmits=0
  icsk->icsk_pending=0
  icsk->icsk_backoff=0
  icsk->icsk_syn_retries=0
  icsk->icsk_probes_out=0
  icsk->icsk_ext_hdr_len=0
  icsk->icsk_ack.pending=0
  icsk->icsk_ack.quick=0
  icsk->icsk_ack.pingpong=1
  icsk->icsk_ack.blocked=0
  icsk->icsk_ack.ato=40
  icsk->icsk_ack.timeout=5303454287
  icsk->icsk_ack.lrcvtime=1008486952
  icsk->icsk_ack.last_seg_size=6814
  icsk->icsk_ack.rcv_mss=8688
  icsk->icsk_mtup.enabled=0
  icsk->icsk_mtup.search_high=1500
  icsk->icsk_mtup.search_low=564
  icsk->icsk_mtup.probe_size=0
tcp-snapshot:tcp_sock
  tcpsk->tcp_header_len=32
  tcpsk->pred_flags=0
  tcpsk->rcv_nxt=3830348746
  tcpsk->snd_nxt=1984376345
  tcpsk->snd_una=1984376345
  tcpsk->snd_sml=1984376345
  tcpsk->rcv_tstamp=1041816640
  tcpsk->lsndtime=1041816640
  tcpsk->ucopy.prequeue.qlen=0
  tcpsk->ucopy.task=0000000000000000
  tcpsk->ucopy.iov=0000000000000000
  tcpsk->ucopy.memory=0
  tcpsk->ucopy.len=0
  tcpsk->snd_wl1=3830348746
  tcpsk->snd_wnd=65160
  tcpsk->max_window=65524
  tcpsk->mss_cache=1448
  tcpsk->xmit_size_goal=31856
  tcpsk->window_clamp=65535
  tcpsk->rcv_ssthresh=5792
  tcpsk->frto_highmark=0
  tcpsk->reordering=3
  tcpsk->frto_counter=0
  tcpsk->nonagle=1
  tcpsk->keepalive_probes=0
  tcpsk->srtt=121
  tcpsk->mdev=76
  tcpsk->mdev_max=200
  tcpsk->rttvar=203
  tcpsk->rtt_seq=1984376345
  tcpsk->packets_out=0
  tcpsk->left_out=0
  tcpsk->retrans_out=0
  tcpsk->rx_opt.ts_recent_stamp=1287564284
  tcpsk->rx_opt.ts_recent=1041640111
  tcpsk->rx_opt.rcv_tsval=1041640111
  tcpsk->rx_opt.rcv_tsecr=1041816640
  tcpsk->rx_opt.saw_tstamp=1
  tcpsk->rx_opt.tstamp_ok=1
  tcpsk->rx_opt.dsack=0
  tcpsk->rx_opt.wscale_ok=0
  tcpsk->rx_opt.sack_ok=5
  tcpsk->rx_opt.snd_wscale=0
  tcpsk->rx_opt.rcv_wscale=0
  tcpsk->rx_opt.eff_sacks=0
  tcpsk->rx_opt.num_sacks=0
  tcpsk->rx_opt.user_mss=0
  tcpsk->rx_opt.mss_clamp=1460
  tcpsk->snd_ssthresh=4
  tcpsk->snd_cwnd=4
  tcpsk->snd_cwnd_cnt=4
  tcpsk->snd_cwnd_clamp=65535
  tcpsk->snd_cwnd_used=2
  tcpsk->snd_cwnd_stamp=1041816640
  tcpsk->out_of_order_queue.qlen=0
  tcpsk->rcv_wnd=0
  tcpsk->rcv_wup=3830348746
  tcpsk->write_seq=1984376345
  tcpsk->pushed_seq=1984376345
  tcpsk->copied_seq=3830348746
  tcpsk->duplicate_sack[0].start_seq=3613713418
  tcpsk->duplicate_sack[0].end_seq=3613714866
  tcpsk->selective_acks[i].start_seq=3648234364
  tcpsk->selective_acks[i].end_seq=3648247396
  tcpsk->selective_acks[i].start_seq=3647855528
  tcpsk->selective_acks[i].end_seq=3647856976
  tcpsk->selective_acks[i].start_seq=3640487648
  tcpsk->selective_acks[i].end_seq=3640496336
  tcpsk->selective_acks[i].start_seq=3498843984
  tcpsk->selective_acks[i].end_seq=3498845432
  tcpsk->recv_sack_cache[i].start_seq=1226527628
  tcpsk->recv_sack_cache[i].end_seq=1226549030
  tcpsk->recv_sack_cache[i].start_seq=179088461
  tcpsk->recv_sack_cache[i].end_seq=179091357
  tcpsk->recv_sack_cache[i].start_seq=4042009662
  tcpsk->recv_sack_cache[i].end_seq=4042011110
  tcpsk->recv_sack_cache[i].start_seq=0
  tcpsk->recv_sack_cache[i].end_seq=0
  tcpsk->lost_skb_hint=0000000000000000
  tcpsk->scoreboard_skb_hint=0000000000000000
  tcpsk->retransmit_skb_hint=0000000000000000
  tcpsk->forward_skb_hint=0000000000000000
  tcpsk->fastpath_skb_hint=0000000000000000
  tcpsk->fastpath_cnt_hint=15
  tcpsk->lost_cnt_hint=6
  tcpsk->retransmit_cnt_hint=0
  tcpsk->forward_cnt_hint=9
  tcpsk->advmss=1448
  tcpsk->prior_ssthresh=5
  tcpsk->lost_out=0
  tcpsk->sacked_out=0
  tcpsk->fackets_out=0
  tcpsk->high_seq=1226549030
  tcpsk->retrans_stamp=0
  tcpsk->undo_marker=0
  tcpsk->undo_retrans=1
  tcpsk->urg_seq=0
  tcpsk->urg_data=0
  tcpsk->urg_mode=0
  tcpsk->ecn_flags=0
  tcpsk->snd_up=0
  tcpsk->total_retrans=2110
  tcpsk->bytes_acked=0
  tcpsk->keepalive_time=0
  tcpsk->keepalive_intvl=0
  tcpsk->linger2=0
  tcpsk->last_synq_overflow=0
  tcpsk->rcv_rtt_est.rtt=15
  tcpsk->rcv_rtt_est.seq=3830352454
  tcpsk->rcv_rtt_est.time=1008486951
  tcpsk->rcvq_space.space=468244
  tcpsk->rcvq_space.seq=3830258350
  tcpsk->rcvq_space.time=1008486952
  tcpsk->mtu_probe.probe_seq_start=0
  tcpsk->mtu_probe.probe_seq_end=0

	We noticed that tcpsk->rcv_wnd is indeed 0, but tcpsk->copied_seq equals tcpsk->rcv_nxt and sk->sk_rmem_alloc is 0; both of the latter mean that there is no pending data in the receive queue.

	After some digging through the source code, I found that __tcp_select_window() actually returns zero in this case. In my view, the function should restore the window to a non-zero value at this point (the receive queue is completely free); is that right?

	In this case, I think that tcpsk->rcv_ssthresh has an abnormal value: it is too small, which causes the rest of the processing in __tcp_select_window() to be skipped, leaving a zero window forever.

	According to the source code, only a successful MTU probe or the receipt of a non-zero-length L7 payload can grow tcp_sock->rcv_ssthresh. Because we turned off MTU probing and TCP only receives zero-window probes from the other end, it seems that we never get a chance to update tcp_sock->rcv_ssthresh at all, so we end up in a dead loop here.

	It seems that we are missing some processing in skb_data_queue() to keep tcpsk->rcv_ssthresh consistent with the free space of the receive queue; is that right? Or have I missed something, or is there something wrong in my understanding?


	Thank you~

Yu





^ permalink raw reply

* [PATCHv2] vmxnet3: remove set_flag_le{16,64} helpers
From: Harvey Harrison @ 2010-10-21  6:32 UTC (permalink / raw)
  To: sbhatewara; +Cc: netdev, shemminger

It's easier to just annotate the constants as little endian types and set/clear
the flags directly.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
---
Sorry, missed a git add and left a line out of the previous patch.

 drivers/net/vmxnet3/upt1_defs.h       |    8 +++---
 drivers/net/vmxnet3/vmxnet3_defs.h    |    6 ++--
 drivers/net/vmxnet3/vmxnet3_drv.c     |   37 ++++++++-------------------------
 drivers/net/vmxnet3/vmxnet3_ethtool.c |   14 +++++-------
 drivers/net/vmxnet3/vmxnet3_int.h     |    4 ---
 5 files changed, 22 insertions(+), 47 deletions(-)

diff --git a/drivers/net/vmxnet3/upt1_defs.h b/drivers/net/vmxnet3/upt1_defs.h
index 37108fb..969c751 100644
--- a/drivers/net/vmxnet3/upt1_defs.h
+++ b/drivers/net/vmxnet3/upt1_defs.h
@@ -88,9 +88,9 @@ struct UPT1_RSSConf {
 
 /* features */
 enum {
-	UPT1_F_RXCSUM		= 0x0001,   /* rx csum verification */
-	UPT1_F_RSS		= 0x0002,
-	UPT1_F_RXVLAN		= 0x0004,   /* VLAN tag stripping */
-	UPT1_F_LRO		= 0x0008,
+	UPT1_F_RXCSUM		= cpu_to_le64(0x0001),   /* rx csum verification */
+	UPT1_F_RSS		= cpu_to_le64(0x0002),
+	UPT1_F_RXVLAN		= cpu_to_le64(0x0004),   /* VLAN tag stripping */
+	UPT1_F_LRO		= cpu_to_le64(0x0008),
 };
 #endif
diff --git a/drivers/net/vmxnet3/vmxnet3_defs.h b/drivers/net/vmxnet3/vmxnet3_defs.h
index ca7727b..4d84912 100644
--- a/drivers/net/vmxnet3/vmxnet3_defs.h
+++ b/drivers/net/vmxnet3/vmxnet3_defs.h
@@ -523,9 +523,9 @@ struct Vmxnet3_RxFilterConf {
 #define VMXNET3_PM_MAX_PATTERN_SIZE   128
 #define VMXNET3_PM_MAX_MASK_SIZE      (VMXNET3_PM_MAX_PATTERN_SIZE / 8)
 
-#define VMXNET3_PM_WAKEUP_MAGIC       0x01  /* wake up on magic pkts */
-#define VMXNET3_PM_WAKEUP_FILTER      0x02  /* wake up on pkts matching
-					     * filters */
+#define VMXNET3_PM_WAKEUP_MAGIC       cpu_to_le16(0x01)  /* wake up on magic pkts */
+#define VMXNET3_PM_WAKEUP_FILTER      cpu_to_le16(0x02)  /* wake up on pkts matching
+							  * filters */
 
 
 struct Vmxnet3_PM_PktFilter {
diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index 198ce92..c8d1a14 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -1548,23 +1548,6 @@ vmxnet3_free_irqs(struct vmxnet3_adapter *adapter)
 	}
 }
 
-
-inline void set_flag_le16(__le16 *data, u16 flag)
-{
-	*data = cpu_to_le16(le16_to_cpu(*data) | flag);
-}
-
-inline void set_flag_le64(__le64 *data, u64 flag)
-{
-	*data = cpu_to_le64(le64_to_cpu(*data) | flag);
-}
-
-inline void reset_flag_le64(__le64 *data, u64 flag)
-{
-	*data = cpu_to_le64(le64_to_cpu(*data) & ~flag);
-}
-
-
 static void
 vmxnet3_vlan_rx_register(struct net_device *netdev, struct vlan_group *grp)
 {
@@ -1580,8 +1563,7 @@ vmxnet3_vlan_rx_register(struct net_device *netdev, struct vlan_group *grp)
 			adapter->vlan_grp = grp;
 
 			/* update FEATURES to device */
-			set_flag_le64(&devRead->misc.uptFeatures,
-				      UPT1_F_RXVLAN);
+			devRead->misc.uptFeatures |= UPT1_F_RXVLAN;
 			VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
 					       VMXNET3_CMD_UPDATE_FEATURE);
 			/*
@@ -1604,7 +1586,7 @@ vmxnet3_vlan_rx_register(struct net_device *netdev, struct vlan_group *grp)
 		struct Vmxnet3_DSDevRead *devRead = &shared->devRead;
 		adapter->vlan_grp = NULL;
 
-		if (le64_to_cpu(devRead->misc.uptFeatures) & UPT1_F_RXVLAN) {
+		if (devRead->misc.uptFeatures & UPT1_F_RXVLAN) {
 			int i;
 
 			for (i = 0; i < VMXNET3_VFT_SIZE; i++) {
@@ -1617,8 +1599,7 @@ vmxnet3_vlan_rx_register(struct net_device *netdev, struct vlan_group *grp)
 					       VMXNET3_CMD_UPDATE_VLAN_FILTERS);
 
 			/* update FEATURES to device */
-			reset_flag_le64(&devRead->misc.uptFeatures,
-					UPT1_F_RXVLAN);
+			devRead->misc.uptFeatures &= ~UPT1_F_RXVLAN;
 			VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
 					       VMXNET3_CMD_UPDATE_FEATURE);
 		}
@@ -1779,15 +1760,15 @@ vmxnet3_setup_driver_shared(struct vmxnet3_adapter *adapter)
 
 	/* set up feature flags */
 	if (adapter->rxcsum)
-		set_flag_le64(&devRead->misc.uptFeatures, UPT1_F_RXCSUM);
+		devRead->misc.uptFeatures |= UPT1_F_RXCSUM;
 
 	if (adapter->lro) {
-		set_flag_le64(&devRead->misc.uptFeatures, UPT1_F_LRO);
+		devRead->misc.uptFeatures |= UPT1_F_LRO;
 		devRead->misc.maxNumRxSG = cpu_to_le16(1 + MAX_SKB_FRAGS);
 	}
 	if ((adapter->netdev->features & NETIF_F_HW_VLAN_RX) &&
 	    adapter->vlan_grp) {
-		set_flag_le64(&devRead->misc.uptFeatures, UPT1_F_RXVLAN);
+		devRead->misc.uptFeatures |= UPT1_F_RXVLAN;
 	}
 
 	devRead->misc.mtu = cpu_to_le32(adapter->netdev->mtu);
@@ -2594,7 +2575,7 @@ vmxnet3_suspend(struct device *device)
 		memcpy(pmConf->filters[i].pattern, netdev->dev_addr, ETH_ALEN);
 		pmConf->filters[i].mask[0] = 0x3F; /* LSB ETH_ALEN bits */
 
-		set_flag_le16(&pmConf->wakeUpEvents, VMXNET3_PM_WAKEUP_FILTER);
+		pmConf->wakeUpEvents |= VMXNET3_PM_WAKEUP_FILTER;
 		i++;
 	}
 
@@ -2636,13 +2617,13 @@ vmxnet3_suspend(struct device *device)
 		pmConf->filters[i].mask[5] = 0x03; /* IPv4 TIP */
 		in_dev_put(in_dev);
 
-		set_flag_le16(&pmConf->wakeUpEvents, VMXNET3_PM_WAKEUP_FILTER);
+		pmConf->wakeUpEvents |= VMXNET3_PM_WAKEUP_FILTER;
 		i++;
 	}
 
 skip_arp:
 	if (adapter->wol & WAKE_MAGIC)
-		set_flag_le16(&pmConf->wakeUpEvents, VMXNET3_PM_WAKEUP_MAGIC);
+		pmConf->wakeUpEvents |= VMXNET3_PM_WAKEUP_MAGIC;
 
 	pmConf->numFilters = i;
 
diff --git a/drivers/net/vmxnet3/vmxnet3_ethtool.c b/drivers/net/vmxnet3/vmxnet3_ethtool.c
index 7e4b5a8..b79070b 100644
--- a/drivers/net/vmxnet3/vmxnet3_ethtool.c
+++ b/drivers/net/vmxnet3/vmxnet3_ethtool.c
@@ -50,13 +50,11 @@ vmxnet3_set_rx_csum(struct net_device *netdev, u32 val)
 		adapter->rxcsum = val;
 		if (netif_running(netdev)) {
 			if (val)
-				set_flag_le64(
-				&adapter->shared->devRead.misc.uptFeatures,
-				UPT1_F_RXCSUM);
+				adapter->shared->devRead.misc.uptFeatures |=
+				UPT1_F_RXCSUM;
 			else
-				reset_flag_le64(
-				&adapter->shared->devRead.misc.uptFeatures,
-				UPT1_F_RXCSUM);
+				adapter->shared->devRead.misc.uptFeatures &=
+				~UPT1_F_RXCSUM;
 
 			VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
 					       VMXNET3_CMD_UPDATE_FEATURE);
@@ -292,10 +290,10 @@ vmxnet3_set_flags(struct net_device *netdev, u32 data)
 		/* update harware LRO capability accordingly */
 		if (lro_requested)
 			adapter->shared->devRead.misc.uptFeatures |=
-						cpu_to_le64(UPT1_F_LRO);
+							UPT1_F_LRO;
 		else
 			adapter->shared->devRead.misc.uptFeatures &=
-						cpu_to_le64(~UPT1_F_LRO);
+							~UPT1_F_LRO;
 		VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
 				       VMXNET3_CMD_UPDATE_FEATURE);
 	}
diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h
index 2121c73..46aee6d 100644
--- a/drivers/net/vmxnet3/vmxnet3_int.h
+++ b/drivers/net/vmxnet3/vmxnet3_int.h
@@ -353,10 +353,6 @@ struct vmxnet3_adapter {
 #define VMXNET3_MAX_ETH_HDR_SIZE    22
 #define VMXNET3_MAX_SKB_BUF_SIZE    (3*1024)
 
-void set_flag_le16(__le16 *data, u16 flag);
-void set_flag_le64(__le64 *data, u64 flag);
-void reset_flag_le64(__le64 *data, u64 flag);
-
 int
 vmxnet3_quiesce_dev(struct vmxnet3_adapter *adapter);
 
-- 
1.7.1


^ permalink raw reply related

* [PATCH] vmxnet3: remove set_flag_le{16,64} helpers
From: Harvey Harrison @ 2010-10-21  6:28 UTC (permalink / raw)
  To: sbhatewara; +Cc: netdev, shemminger

It's easier to just annotate the constants as little endian types and set/clear
the flags directly.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
---
 drivers/net/vmxnet3/upt1_defs.h       |    8 +++---
 drivers/net/vmxnet3/vmxnet3_defs.h    |    6 ++--
 drivers/net/vmxnet3/vmxnet3_drv.c     |   35 +++++++-------------------------
 drivers/net/vmxnet3/vmxnet3_ethtool.c |   14 +++++-------
 drivers/net/vmxnet3/vmxnet3_int.h     |    4 ---
 5 files changed, 21 insertions(+), 46 deletions(-)

diff --git a/drivers/net/vmxnet3/upt1_defs.h b/drivers/net/vmxnet3/upt1_defs.h
index 37108fb..969c751 100644
--- a/drivers/net/vmxnet3/upt1_defs.h
+++ b/drivers/net/vmxnet3/upt1_defs.h
@@ -88,9 +88,9 @@ struct UPT1_RSSConf {
 
 /* features */
 enum {
-	UPT1_F_RXCSUM		= 0x0001,   /* rx csum verification */
-	UPT1_F_RSS		= 0x0002,
-	UPT1_F_RXVLAN		= 0x0004,   /* VLAN tag stripping */
-	UPT1_F_LRO		= 0x0008,
+	UPT1_F_RXCSUM		= cpu_to_le64(0x0001),   /* rx csum verification */
+	UPT1_F_RSS		= cpu_to_le64(0x0002),
+	UPT1_F_RXVLAN		= cpu_to_le64(0x0004),   /* VLAN tag stripping */
+	UPT1_F_LRO		= cpu_to_le64(0x0008),
 };
 #endif
diff --git a/drivers/net/vmxnet3/vmxnet3_defs.h b/drivers/net/vmxnet3/vmxnet3_defs.h
index ca7727b..4d84912 100644
--- a/drivers/net/vmxnet3/vmxnet3_defs.h
+++ b/drivers/net/vmxnet3/vmxnet3_defs.h
@@ -523,9 +523,9 @@ struct Vmxnet3_RxFilterConf {
 #define VMXNET3_PM_MAX_PATTERN_SIZE   128
 #define VMXNET3_PM_MAX_MASK_SIZE      (VMXNET3_PM_MAX_PATTERN_SIZE / 8)
 
-#define VMXNET3_PM_WAKEUP_MAGIC       0x01  /* wake up on magic pkts */
-#define VMXNET3_PM_WAKEUP_FILTER      0x02  /* wake up on pkts matching
-					     * filters */
+#define VMXNET3_PM_WAKEUP_MAGIC       cpu_to_le16(0x01)  /* wake up on magic pkts */
+#define VMXNET3_PM_WAKEUP_FILTER      cpu_to_le16(0x02)  /* wake up on pkts matching
+							  * filters */
 
 
 struct Vmxnet3_PM_PktFilter {
diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index 198ce92..ce292d4 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -1548,23 +1548,6 @@ vmxnet3_free_irqs(struct vmxnet3_adapter *adapter)
 	}
 }
 
-
-inline void set_flag_le16(__le16 *data, u16 flag)
-{
-	*data = cpu_to_le16(le16_to_cpu(*data) | flag);
-}
-
-inline void set_flag_le64(__le64 *data, u64 flag)
-{
-	*data = cpu_to_le64(le64_to_cpu(*data) | flag);
-}
-
-inline void reset_flag_le64(__le64 *data, u64 flag)
-{
-	*data = cpu_to_le64(le64_to_cpu(*data) & ~flag);
-}
-
-
 static void
 vmxnet3_vlan_rx_register(struct net_device *netdev, struct vlan_group *grp)
 {
@@ -1580,8 +1563,7 @@ vmxnet3_vlan_rx_register(struct net_device *netdev, struct vlan_group *grp)
 			adapter->vlan_grp = grp;
 
 			/* update FEATURES to device */
-			set_flag_le64(&devRead->misc.uptFeatures,
-				      UPT1_F_RXVLAN);
+			devRead->misc.uptFeatures |= UPT1_F_RXVLAN;
 			VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
 					       VMXNET3_CMD_UPDATE_FEATURE);
 			/*
@@ -1604,7 +1586,7 @@ vmxnet3_vlan_rx_register(struct net_device *netdev, struct vlan_group *grp)
 		struct Vmxnet3_DSDevRead *devRead = &shared->devRead;
 		adapter->vlan_grp = NULL;
 
-		if (le64_to_cpu(devRead->misc.uptFeatures) & UPT1_F_RXVLAN) {
+		if (devRead->misc.uptFeatures & UPT1_F_RXVLAN) {
 			int i;
 
 			for (i = 0; i < VMXNET3_VFT_SIZE; i++) {
@@ -1617,8 +1599,7 @@ vmxnet3_vlan_rx_register(struct net_device *netdev, struct vlan_group *grp)
 					       VMXNET3_CMD_UPDATE_VLAN_FILTERS);
 
 			/* update FEATURES to device */
-			reset_flag_le64(&devRead->misc.uptFeatures,
-					UPT1_F_RXVLAN);
+			devRead->misc.uptFeatures &= ~UPT1_F_RXVLAN;
 			VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
 					       VMXNET3_CMD_UPDATE_FEATURE);
 		}
@@ -1779,15 +1760,15 @@ vmxnet3_setup_driver_shared(struct vmxnet3_adapter *adapter)
 
 	/* set up feature flags */
 	if (adapter->rxcsum)
-		set_flag_le64(&devRead->misc.uptFeatures, UPT1_F_RXCSUM);
+		devRead->misc.uptFeatures |= UPT1_F_RXCSUM;
 
 	if (adapter->lro) {
-		set_flag_le64(&devRead->misc.uptFeatures, UPT1_F_LRO);
+		devRead->misc.uptFeatures |= UPT1_F_LRO;
 		devRead->misc.maxNumRxSG = cpu_to_le16(1 + MAX_SKB_FRAGS);
 	}
 	if ((adapter->netdev->features & NETIF_F_HW_VLAN_RX) &&
 	    adapter->vlan_grp) {
-		set_flag_le64(&devRead->misc.uptFeatures, UPT1_F_RXVLAN);
+		devRead->misc.uptFeatures |= UPT1_F_RXVLAN;
 	}
 
 	devRead->misc.mtu = cpu_to_le32(adapter->netdev->mtu);
@@ -2594,7 +2575,7 @@ vmxnet3_suspend(struct device *device)
 		memcpy(pmConf->filters[i].pattern, netdev->dev_addr, ETH_ALEN);
 		pmConf->filters[i].mask[0] = 0x3F; /* LSB ETH_ALEN bits */
 
-		set_flag_le16(&pmConf->wakeUpEvents, VMXNET3_PM_WAKEUP_FILTER);
+		pmConf->wakeUpEvents |= VMXNET3_PM_WAKEUP_FILTER;
 		i++;
 	}
 
@@ -2642,7 +2623,7 @@ vmxnet3_suspend(struct device *device)
 
 skip_arp:
 	if (adapter->wol & WAKE_MAGIC)
-		set_flag_le16(&pmConf->wakeUpEvents, VMXNET3_PM_WAKEUP_MAGIC);
+		pmConf->wakeUpEvents |= VMXNET3_PM_WAKEUP_MAGIC;
 
 	pmConf->numFilters = i;
 
diff --git a/drivers/net/vmxnet3/vmxnet3_ethtool.c b/drivers/net/vmxnet3/vmxnet3_ethtool.c
index 7e4b5a8..b79070b 100644
--- a/drivers/net/vmxnet3/vmxnet3_ethtool.c
+++ b/drivers/net/vmxnet3/vmxnet3_ethtool.c
@@ -50,13 +50,11 @@ vmxnet3_set_rx_csum(struct net_device *netdev, u32 val)
 		adapter->rxcsum = val;
 		if (netif_running(netdev)) {
 			if (val)
-				set_flag_le64(
-				&adapter->shared->devRead.misc.uptFeatures,
-				UPT1_F_RXCSUM);
+				adapter->shared->devRead.misc.uptFeatures |=
+				UPT1_F_RXCSUM;
 			else
-				reset_flag_le64(
-				&adapter->shared->devRead.misc.uptFeatures,
-				UPT1_F_RXCSUM);
+				adapter->shared->devRead.misc.uptFeatures &=
+				~UPT1_F_RXCSUM;
 
 			VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
 					       VMXNET3_CMD_UPDATE_FEATURE);
@@ -292,10 +290,10 @@ vmxnet3_set_flags(struct net_device *netdev, u32 data)
 		/* update harware LRO capability accordingly */
 		if (lro_requested)
 			adapter->shared->devRead.misc.uptFeatures |=
-						cpu_to_le64(UPT1_F_LRO);
+							UPT1_F_LRO;
 		else
 			adapter->shared->devRead.misc.uptFeatures &=
-						cpu_to_le64(~UPT1_F_LRO);
+							~UPT1_F_LRO;
 		VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
 				       VMXNET3_CMD_UPDATE_FEATURE);
 	}
diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h
index 2121c73..46aee6d 100644
--- a/drivers/net/vmxnet3/vmxnet3_int.h
+++ b/drivers/net/vmxnet3/vmxnet3_int.h
@@ -353,10 +353,6 @@ struct vmxnet3_adapter {
 #define VMXNET3_MAX_ETH_HDR_SIZE    22
 #define VMXNET3_MAX_SKB_BUF_SIZE    (3*1024)
 
-void set_flag_le16(__le16 *data, u16 flag);
-void set_flag_le64(__le64 *data, u64 flag);
-void reset_flag_le64(__le64 *data, u64 flag);
-
 int
 vmxnet3_quiesce_dev(struct vmxnet3_adapter *adapter);
 
-- 
1.7.1


^ permalink raw reply related

* Re: Question w.r.t debugfs / netdevice pass-through IOCTL
From: Stephen Hemminger @ 2010-10-21  4:19 UTC (permalink / raw)
  To: Debashis Dutt; +Cc: netdev@vger.kernel.org
In-Reply-To: <F363E7AC84E1B646A0358B281A46F4AEABA0FFCC68@HQ1-EXCH03.corp.brocade.com>

On Wed, 20 Oct 2010 20:26:50 -0700
Debashis Dutt <ddutt@Brocade.COM> wrote:

> Hi, 
> 
> For the Brocade 10G Ethernet driver (bna) we want to implement a set of operations which is not supported by current tools like ethtool. 
> 
> Examples of such operations would be 
>        a) Queries related to CEE, if the link is CEE.
>        b) Get traces from firmware.

> 
> I was wondering what would be right approach to take here:
>                 a) use debugfs (like the Chelsio cxgb4 driver)
Works as long as they are really debug operations. The debugfs isn't always
available, and support should be a config option for your driver.

>                 b) use SIOCDEVPRIVATE for the pass through IOCTL defined in
>                     struct net_device_ops{}

The problem with ioctl is that it doesn't work for 32-bit user space
compatibility. The ioctl compat layer does not have enough context
to translate SIOCDEVPRIVATE.

>                     As per comments in the header file, b) should not be used
>                     since this IOCTL is supposed to be deprecated.
>                 c) use procfs / sysfs (these may not scale, in our opinion)

Although less common, there were drivers putting things in /proc/net/xxx/ethX



-- 

^ permalink raw reply

* [RFC PATCH] net: consolidate 8021q tagging
From: John Fastabend @ 2010-10-21  3:40 UTC (permalink / raw)
  To: netdev; +Cc: john.r.fastabend, jesse, davem

This is an example to illustrate my comment on Jesse Gross's
patch, where he adds vlan tagging for the non-offload case
to dev_hard_start_xmit. It compiles, but otherwise I have not
tested it.

If we tag vlan packets in dev_hard_start_xmit, we no longer
need to actually tag them here; we can just set the vlan_tci field
in the skb and let the stack get them at the bottom.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---

 net/8021q/vlan_dev.c |  105 +++-----------------------------------------------
 1 files changed, 7 insertions(+), 98 deletions(-)

diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 14e3d1f..78b1618 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -326,24 +326,12 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
 	 */
 	if (veth->h_vlan_proto != htons(ETH_P_8021Q) ||
 	    vlan_dev_info(dev)->flags & VLAN_FLAG_REORDER_HDR) {
-		unsigned int orig_headroom = skb_headroom(skb);
 		u16 vlan_tci;
-
-		vlan_dev_info(dev)->cnt_encap_on_xmit++;
-
 		vlan_tci = vlan_dev_info(dev)->vlan_id;
 		vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb);
-		skb = __vlan_put_tag(skb, vlan_tci);
-		if (!skb) {
-			txq->tx_dropped++;
-			return NETDEV_TX_OK;
-		}
-
-		if (orig_headroom < VLAN_HLEN)
-			vlan_dev_info(dev)->cnt_inc_headroom_on_tx++;
+		skb = __vlan_hwaccel_put_tag(skb, vlan_tci);
 	}
 
-
 	skb_set_dev(skb, vlan_dev_info(dev)->real_dev);
 	len = skb->len;
 	ret = dev_queue_xmit(skb);
@@ -357,32 +345,6 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
 	return ret;
 }
 
-static netdev_tx_t vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb,
-						    struct net_device *dev)
-{
-	int i = skb_get_queue_mapping(skb);
-	struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
-	u16 vlan_tci;
-	unsigned int len;
-	int ret;
-
-	vlan_tci = vlan_dev_info(dev)->vlan_id;
-	vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb);
-	skb = __vlan_hwaccel_put_tag(skb, vlan_tci);
-
-	skb->dev = vlan_dev_info(dev)->real_dev;
-	len = skb->len;
-	ret = dev_queue_xmit(skb);
-
-	if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
-		txq->tx_packets++;
-		txq->tx_bytes += len;
-	} else
-		txq->tx_dropped++;
-
-	return ret;
-}
-
 static u16 vlan_dev_select_queue(struct net_device *dev, struct sk_buff *skb)
 {
 	struct net_device *rdev = vlan_dev_info(dev)->real_dev;
@@ -719,8 +681,7 @@ static const struct header_ops vlan_header_ops = {
 	.parse	 = eth_header_parse,
 };
 
-static const struct net_device_ops vlan_netdev_ops, vlan_netdev_accel_ops,
-		    vlan_netdev_ops_sq, vlan_netdev_accel_ops_sq;
+static const struct net_device_ops vlan_netdev_ops, vlan_netdev_ops_sq;
 
 static int vlan_dev_init(struct net_device *dev)
 {
@@ -755,19 +716,16 @@ static int vlan_dev_init(struct net_device *dev)
 	if (real_dev->features & NETIF_F_HW_VLAN_TX) {
 		dev->header_ops      = real_dev->header_ops;
 		dev->hard_header_len = real_dev->hard_header_len;
-		if (real_dev->netdev_ops->ndo_select_queue)
-			dev->netdev_ops = &vlan_netdev_accel_ops_sq;
-		else
-			dev->netdev_ops = &vlan_netdev_accel_ops;
 	} else {
 		dev->header_ops      = &vlan_header_ops;
 		dev->hard_header_len = real_dev->hard_header_len + VLAN_HLEN;
-		if (real_dev->netdev_ops->ndo_select_queue)
-			dev->netdev_ops = &vlan_netdev_ops_sq;
-		else
-			dev->netdev_ops = &vlan_netdev_ops;
 	}
 
+	if (real_dev->netdev_ops->ndo_select_queue)
+		dev->netdev_ops = &vlan_netdev_ops_sq;
+	else
+		dev->netdev_ops = &vlan_netdev_ops;
+
 	if (is_vlan_dev(real_dev))
 		subclass = 1;
 
@@ -908,30 +866,6 @@ static const struct net_device_ops vlan_netdev_ops = {
 #endif
 };
 
-static const struct net_device_ops vlan_netdev_accel_ops = {
-	.ndo_change_mtu		= vlan_dev_change_mtu,
-	.ndo_init		= vlan_dev_init,
-	.ndo_uninit		= vlan_dev_uninit,
-	.ndo_open		= vlan_dev_open,
-	.ndo_stop		= vlan_dev_stop,
-	.ndo_start_xmit =  vlan_dev_hwaccel_hard_start_xmit,
-	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_set_mac_address	= vlan_dev_set_mac_address,
-	.ndo_set_rx_mode	= vlan_dev_set_rx_mode,
-	.ndo_set_multicast_list	= vlan_dev_set_rx_mode,
-	.ndo_change_rx_flags	= vlan_dev_change_rx_flags,
-	.ndo_do_ioctl		= vlan_dev_ioctl,
-	.ndo_neigh_setup	= vlan_dev_neigh_setup,
-	.ndo_get_stats64	= vlan_dev_get_stats64,
-#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
-	.ndo_fcoe_ddp_setup	= vlan_dev_fcoe_ddp_setup,
-	.ndo_fcoe_ddp_done	= vlan_dev_fcoe_ddp_done,
-	.ndo_fcoe_enable	= vlan_dev_fcoe_enable,
-	.ndo_fcoe_disable	= vlan_dev_fcoe_disable,
-	.ndo_fcoe_get_wwn	= vlan_dev_fcoe_get_wwn,
-#endif
-};
-
 static const struct net_device_ops vlan_netdev_ops_sq = {
 	.ndo_select_queue	= vlan_dev_select_queue,
 	.ndo_change_mtu		= vlan_dev_change_mtu,
@@ -957,31 +891,6 @@ static const struct net_device_ops vlan_netdev_ops_sq = {
 #endif
 };
 
-static const struct net_device_ops vlan_netdev_accel_ops_sq = {
-	.ndo_select_queue	= vlan_dev_select_queue,
-	.ndo_change_mtu		= vlan_dev_change_mtu,
-	.ndo_init		= vlan_dev_init,
-	.ndo_uninit		= vlan_dev_uninit,
-	.ndo_open		= vlan_dev_open,
-	.ndo_stop		= vlan_dev_stop,
-	.ndo_start_xmit =  vlan_dev_hwaccel_hard_start_xmit,
-	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_set_mac_address	= vlan_dev_set_mac_address,
-	.ndo_set_rx_mode	= vlan_dev_set_rx_mode,
-	.ndo_set_multicast_list	= vlan_dev_set_rx_mode,
-	.ndo_change_rx_flags	= vlan_dev_change_rx_flags,
-	.ndo_do_ioctl		= vlan_dev_ioctl,
-	.ndo_neigh_setup	= vlan_dev_neigh_setup,
-	.ndo_get_stats64	= vlan_dev_get_stats64,
-#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
-	.ndo_fcoe_ddp_setup	= vlan_dev_fcoe_ddp_setup,
-	.ndo_fcoe_ddp_done	= vlan_dev_fcoe_ddp_done,
-	.ndo_fcoe_enable	= vlan_dev_fcoe_enable,
-	.ndo_fcoe_disable	= vlan_dev_fcoe_disable,
-	.ndo_fcoe_get_wwn	= vlan_dev_fcoe_get_wwn,
-#endif
-};
-
 void vlan_setup(struct net_device *dev)
 {
 	ether_setup(dev);


^ permalink raw reply related

* Re: [PATCH v2 04/14] vlan: Enable software emulation for vlan accleration.
From: John Fastabend @ 2010-10-21  3:32 UTC (permalink / raw)
  To: Jesse Gross; +Cc: David Miller, netdev@vger.kernel.org
In-Reply-To: <1287618974-4714-5-git-send-email-jesse@nicira.com>

On 10/20/2010 4:56 PM, Jesse Gross wrote:
> Currently users of hardware vlan accleration need to know whether
> the device supports it before generating packets.  However, vlan
> acceleration will soon be available in a more flexible manner so
> knowing ahead of time becomes much more difficult.  This adds
> a software fallback path for vlan packets on devices without the
> necessary offloading support, similar to other types of hardware
> accleration.
> 
> Signed-off-by: Jesse Gross <jesse@nicira.com>
> ---
>  include/linux/netdevice.h |   14 +++++++++++---
>  net/core/dev.c            |   36 +++++++++++++++++++++++++++++++++---
>  2 files changed, 44 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 880d565..2861565 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -2248,9 +2248,17 @@ static inline int skb_gso_ok(struct sk_buff *skb, int features)
>  
>  static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb)
>  {
> -	return skb_is_gso(skb) &&
> -	       (!skb_gso_ok(skb, dev->features) ||
> -		unlikely(skb->ip_summed != CHECKSUM_PARTIAL));
> +	if (skb_is_gso(skb)) {
> +		int features = dev->features;
> +
> +		if (skb->protocol == htons(ETH_P_8021Q) || skb->vlan_tci)
> +			features &= dev->vlan_features;
> +
> +		return (!skb_gso_ok(skb, features) ||
> +			unlikely(skb->ip_summed != CHECKSUM_PARTIAL));
> +	}
> +
> +	return 0;
>  }
>  
>  static inline void netif_set_gso_max_size(struct net_device *dev,
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 4c3ac53..1bfd96b 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -1694,7 +1694,12 @@ static bool can_checksum_protocol(unsigned long features, __be16 protocol)
>  
>  static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
>  {
> -	if (can_checksum_protocol(dev->features, skb->protocol))
> +	int features = dev->features;
> +
> +	if (vlan_tx_tag_present(skb))
> +		features &= dev->vlan_features;
> +
> +	if (can_checksum_protocol(features, skb->protocol))
>  		return true;
>  
>  	if (skb->protocol == htons(ETH_P_8021Q)) {
> @@ -1793,6 +1798,16 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
>  	__be16 type = skb->protocol;
>  	int err;
>  
> +	if (type == htons(ETH_P_8021Q)) {
> +		struct vlan_ethhdr *veh;
> +
> +		if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
> +			return ERR_PTR(-EINVAL);
> +
> +		veh = (struct vlan_ethhdr *)skb->data;
> +		type = veh->h_vlan_encapsulated_proto;
> +	}
> +
>  	skb_reset_mac_header(skb);
>  	skb->mac_len = skb->network_header - skb->mac_header;
>  	__skb_pull(skb, skb->mac_len);
> @@ -1964,9 +1979,14 @@ static inline void skb_orphan_try(struct sk_buff *skb)
>  static inline int skb_needs_linearize(struct sk_buff *skb,
>  				      struct net_device *dev)
>  {
> +	int features = dev->features;
> +
> +	if (skb->protocol == htons(ETH_P_8021Q) || vlan_tx_tag_present(skb))
> +		features &= dev->vlan_features;
> +
>  	return skb_is_nonlinear(skb) &&
> -	       ((skb_has_frag_list(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
> -	        (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
> +	       ((skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST)) ||
> +		(skb_shinfo(skb)->nr_frags && (!(features & NETIF_F_SG) ||
>  					      illegal_highdma(dev, skb))));
>  }
>  
> @@ -1989,6 +2009,15 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
>  
>  		skb_orphan_try(skb);
>  
> +		if (vlan_tx_tag_present(skb) &&
> +		    !(dev->features & NETIF_F_HW_VLAN_TX)) {
> +			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
> +			if (unlikely(!skb))
> +				goto out;
> +
> +			skb->vlan_tci = 0;
> +		}
> +

Nice set of patches! If we tag frames in dev_hard_start_xmit(), can we consolidate
the offload-enabled and non-offloaded net_device_ops in 8021q, and then not tag in
vlan_dev_hard_start_xmit()? I'll post an example; I'm just thinking out loud here.

Thanks,
John.


>  		if (netif_needs_gso(dev, skb)) {
>  			if (unlikely(dev_gso_segment(skb)))
>  				goto out_kfree_skb;
> @@ -2050,6 +2079,7 @@ out_kfree_gso_skb:
>  		skb->destructor = DEV_GSO_CB(skb)->destructor;
>  out_kfree_skb:
>  	kfree_skb(skb);
> +out:
>  	return rc;
>  }
>  


^ permalink raw reply

* Re: [PATCH v2 07/14] ethtool: Add support for vlan accleration.
From: John Fastabend @ 2010-10-21  3:27 UTC (permalink / raw)
  To: Jesse Gross; +Cc: David Miller, netdev@vger.kernel.org
In-Reply-To: <1287618974-4714-8-git-send-email-jesse@nicira.com>

On 10/20/2010 4:56 PM, Jesse Gross wrote:
> Now that vlan acceleration is handled consistently regardless of usage,
> it is possible to enable and disable it at will.  This adds support for
> Ethtool operations that change the offloading status for debugging
> purposes, similar to other forms of hardware acceleration.
> 

Jesse,

I'm not sure this is enough to get dynamic toggling like this:
dev->hard_header_len is set depending on offloads at init time in
vlan_dev_init(). If the offload setting is changed later, LL_RESERVED_SPACE
won't work correctly and we end up having to call pskb_expand_head().
I think this might end up hurting performance.

That said, I think I can probably get this working by fixing up the
header_ops in vlan_dev.c, and while I'm at it, adding vlan_header_cache
and vlan_header_cache_update routines. I'll try to get something out
tomorrow; in the meantime, nothing too bad is happening.

Thanks,
John.

> Signed-off-by: Jesse Gross <jesse@nicira.com>
> ---
>  include/linux/ethtool.h |    2 ++
>  net/core/ethtool.c      |    3 ++-
>  2 files changed, 4 insertions(+), 1 deletions(-)
> 
> diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
> index 8a3338c..6628a50 100644
> --- a/include/linux/ethtool.h
> +++ b/include/linux/ethtool.h
> @@ -309,6 +309,8 @@ struct ethtool_perm_addr {
>   * flag differs from the read-only value.
>   */
>  enum ethtool_flags {
> +	ETH_FLAG_TXVLAN		= (1 << 7),	/* TX VLAN offload enabled */
> +	ETH_FLAG_RXVLAN		= (1 << 8),	/* RX VLAN offload enabled */
>  	ETH_FLAG_LRO		= (1 << 15),	/* LRO is enabled */
>  	ETH_FLAG_NTUPLE		= (1 << 27),	/* N-tuple filters enabled */
>  	ETH_FLAG_RXHASH		= (1 << 28),
> diff --git a/net/core/ethtool.c b/net/core/ethtool.c
> index 685c700..956a9f4 100644
> --- a/net/core/ethtool.c
> +++ b/net/core/ethtool.c
> @@ -132,7 +132,8 @@ EXPORT_SYMBOL(ethtool_op_set_ufo);
>   * NETIF_F_xxx values in include/linux/netdevice.h
>   */
>  static const u32 flags_dup_features =
> -	(ETH_FLAG_LRO | ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH);
> +	(ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | ETH_FLAG_NTUPLE |
> +	 ETH_FLAG_RXHASH);
>  
>  u32 ethtool_op_get_flags(struct net_device *dev)
>  {

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox