Re: 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* Re: 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)
@ 2004-12-05 18:25 Manfred Spraul
  0 siblings, 0 replies; 24+ messages in thread
From: Manfred Spraul @ 2004-12-05 18:25 UTC (permalink / raw)
  To: Lennert Buytenhek, Netdev, Martin Josefsson

[-- Attachment #1: Type: text/plain, Size: 1604 bytes --]

Lennert wrote:

> A dirty way, yes ;-)  Open up e1000_osdep.h and do:
>
> -#define E1000_READ_REG(a, reg) ( \
> -    readl((a)->hw_addr + \
> -        (((a)->mac_type >= e1000_82543) ? E1000_##reg : E1000_82542_##reg)))
> +#define E1000_READ_REG(a, reg) ({ \
> +    unsigned long s, e, d, v; \
> +\
> +    (a)->mmio_reads++; \
> +    rdtsc(s, d); \
> +    v = readl((a)->hw_addr + \
> +        (((a)->mac_type >= e1000_82543) ? E1000_##reg : E1000_82542_##reg)); \
> +    rdtsc(e, d); \
> +    e -= s; \
> +    printk(KERN_INFO "e1000: MMIO read took %ld clocks\n", e); \
> +    printk(KERN_INFO "e1000: in process %d(%s)\n", current->pid, current->comm); \
> +    dump_stack(); \
> +    v; \
> +})

Too dirty: rdtsc is not serializing, thus my Opteron happily reorders 
the read and the rdtsc and reports 9 cycles.
Attached is a longer patch that I usually use for microbenchmarks. I get 
around 506 cycles with it for an Opteron 2 GHz to the nForce 250 Gb nic 
(i.e. integrated nic in the chipset, just one HT hop):

Results - zero - shift 0
 40: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 :0 0 63 0 0 0 0 0 0 0 0 0 0 0 0 0
1e0: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 :0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
Overflows: 0.
Sum: 100
 >>>>>>>>>>> benchmark overhead: 82 cycles
** reading register e08920b4
Results - readl - shift 0
240: 0 0 b 0 0 0 0 0 0 0 0 0 32 0 1 1 :0 0 0 0 0 0 a 0 0 0 0 0 0 0 0 0
260: 1a 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 :0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
300: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 :0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
Overflows: 0.
Sum: 100
 >>>>>>>>>> total: 0x248, i.e. net 506 cycles.

--
    Manfred

[-- Attachment #2: patch-perftest-forcedeth --]
[-- Type: text/plain, Size: 2910 bytes --]

--- 2.6/drivers/net/forcedeth.c	2004-12-05 16:21:28.000000000 +0100
+++ build-2.6/drivers/net/forcedeth.c	2004-12-05 19:18:24.000000000 +0100
@@ -1500,6 +1500,131 @@
 	enable_irq(dev->irq);
 }
 
+int p_shift = 0;
+
+#define STAT_TABLELEN		16384
+static unsigned long totals[STAT_TABLELEN];
+static unsigned int overflows;
+
+static unsigned long long stime;
+static void start_measure(void)
+{
+	 __asm__ __volatile__ (
+		".align 64\n\t"
+	 	"pushal\n\t"
+		"cpuid\n\t"
+		"popal\n\t"
+		"rdtsc\n\t"
+		"movl %%eax,(%0)\n\t"
+		"movl %%edx,4(%0)\n\t"
+		: /* no output */
+		: "c"(&stime)
+		: "eax", "edx", "memory" );
+}
+
+static void end_measure(void)
+{
+static unsigned long long etime;
+	__asm__ __volatile__ (
+		"pushal\n\t"
+		"cpuid\n\t"
+		"popal\n\t"
+		"rdtsc\n\t"
+		"movl %%eax,(%0)\n\t"
+		"movl %%edx,4(%0)\n\t"
+		: /* no output */
+		: "c"(&etime)
+		: "eax", "edx", "memory" );
+	{
+		unsigned long time = (unsigned long)(etime-stime);
+		time >>= p_shift;
+		if(time < STAT_TABLELEN) {
+			totals[time]++;
+		} else {
+			overflows++;
+		}
+	}
+}
+
+static void clean_buf(void)
+{
+	memset(totals,0,sizeof(totals));
+	overflows = 0;
+}
+
+static void print_line(unsigned long* array)
+{
+	int i;
+	for(i=0;i<32;i++) {
+		if((i%32)==16)
+			printk(":");
+		printk("%lx ",array[i]); 
+	}
+}
+
+static void print_buf(char* caption)
+{
+	int i, other = 0;
+	printk("Results - %s - shift %d",
+		caption, p_shift);
+
+	for(i=0;i<STAT_TABLELEN;i+=32) {
+		int j;
+		int local = 0;
+		for(j=0;j<32;j++)
+			local += totals[i+j];
+
+		if(local) {
+			printk("\n%3x: ",i);
+			print_line(&totals[i]);
+			other += local;
+		}
+	}
+	printk("\nOverflows: %d.\n",
+		overflows);
+	printk("Sum: %d\n",other+overflows);
+}
+
+static void return_immediately(void *dummy)
+{
+}
+
+static void bench_readl(u8 __iomem *base)
+{ 
+	int i;
+
+	/* empty test measurement: */
+	printk("******** kernel cpu benchmark started **********\n");
+	clean_buf();
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule_timeout(200);
+	for(i=0;i<100;i++) {
+		start_measure();
+		return_immediately(NULL);
+		return_immediately(NULL);
+		return_immediately(NULL);
+		return_immediately(NULL);
+		end_measure();
+	}
+	print_buf("zero");
+	clean_buf();
+
+	printk("** reading register %p\n", base);
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule_timeout(200);
+	for(i=0;i<100;i++) {
+		start_measure();
+		return_immediately(NULL);
+		return_immediately(NULL);
+		readl(base);
+		return_immediately(NULL);
+		return_immediately(NULL);
+		end_measure();
+	}
+	print_buf("readl");
+	clean_buf();
+}
+
 static int nv_open(struct net_device *dev)
 {
 	struct fe_priv *np = get_nvpriv(dev);
@@ -1635,6 +1760,8 @@
 		mod_timer(&np->oom_kick, jiffies + OOM_REFILL);
 	spin_unlock_irq(&np->lock);
 
+	bench_readl(base + NvRegMulticastAddrB);
+	bench_readl(base + NvRegIrqStatus);
 	return 0;
 out_drain:
 	drain_ring(dev);

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)
@ 2004-12-06 19:10 Robert Olsson
  2004-12-06 22:29 ` Martin Josefsson
  0 siblings, 1 reply; 24+ messages in thread
From: Robert Olsson @ 2004-12-06 19:10 UTC (permalink / raw)
  To: Lennert Buytenhek
  Cc: jamal, Martin Josefsson, Scott Feldman, Robert Olsson, P, mellia,
	Jorge Manuel Finochietto, Giulio Galante, netdev



Lennert Buytenhek writes:
 > On Mon, Dec 06, 2004 at 08:11:02AM -0500, jamal wrote:
 > 
 > > Hopefully someone will beat me to testing to see if our forwarding
 > > capacity now goes up with this new recipe.


Yes, a breakthrough: we can now send small packets at GigE wire speed, which
will make development and testing much easier... A first router test with
our setup is below: Opteron 1.6 GHz, SMP kernel, using 1 CPU, 82546 EB +
82546 GB, and PCI-X at 100 MHz & 133 MHz.

pktgen performance is measured on router box. Remember Scotts patch uses 
4096 TX buffers and w. pktgen we use clone_skb. So with real skb's we probably 
see lower performance due to this. This may explain results below so routing
performance doesn't follow pktgen performance as seen.

T-PUT is routing performance. The pure pktgen TX performance is also given;
it too was measured on the router.


Input rate for the routing test is 2*765 kpps for all three runs. Packets
input to eth0 are routed to eth1, and packets input to eth2 to eth3.


Vanilla. T-PUT 657 kpps. pktgen TX perf 818 kpps
-------------------------------------------------
Iface   MTU Met  RX-OK RX-ERR RX-DRP RX-OVR  TX-OK TX-ERR TX-DRP TX-OVR Flags
eth0   1500   0 4312682 8253078 8253078 5687318      5      0      0      0 BRU
eth1   1500   0      1      0      0      0 4312199      0      0      0 BRU
eth2   1500   0 4311018 8386504 8386504 5688982      5      0      0      0 BRU
eth3   1500   0      1      0      0      0 4310791      0      0      0 BRU

           CPU0       
  0:     116665    IO-APIC-edge  timer
  1:        208    IO-APIC-edge  i8042
  8:          0    IO-APIC-edge  rtc
  9:          0   IO-APIC-level  acpi
 14:      21943    IO-APIC-edge  ide0
 26:         66   IO-APIC-level  eth0
 27:      58638   IO-APIC-level  eth1
 28:         68   IO-APIC-level  eth2
 29:      58497   IO-APIC-level  eth3
NMI:          0 
LOC:     116605 
ERR:          0
MIS:          0

e1000-TX-prefetch+scott tx patch. T-PUT 540 kpps. pktgen TX perf 1.48 Mpps
--------------------------------------------------------------------------

Iface   MTU Met  RX-OK RX-ERR RX-DRP RX-OVR  TX-OK TX-ERR TX-DRP TX-OVR Flags
eth0   1500   0 3533795 8618637 8618637 6466205      5      0      0      0 BRU
eth1   1500   0      3      0      0      0 3533803      0      0      0 BRU
eth2   1500   0 3535804 8697149 8697149 6464196      5      0      0      0 BRU
eth3   1500   0      1      0      0      0 3535321      0      0      0 BRU

           CPU0       
  0:    1372774    IO-APIC-edge  timer
  1:        663    IO-APIC-edge  i8042
  8:          0    IO-APIC-edge  rtc
  9:          0   IO-APIC-level  acpi
 14:      22631    IO-APIC-edge  ide0
 26:        686   IO-APIC-level  eth0
 27:        693   IO-APIC-level  eth1
 28:        687   IO-APIC-level  eth2
 29:        682   IO-APIC-level  eth3
NMI:          0 
LOC:    1372804 
ERR:          0
MIS:          0


e1000-TX-prefetch. T-PUT 657 kpps. pktgen TX perf 1.15 Mpps
-----------------------------------------------------------
Kernel Interface table
Iface   MTU Met  RX-OK RX-ERR RX-DRP RX-OVR  TX-OK TX-ERR TX-DRP TX-OVR Flags
eth0   1500   0 4311848 8288270 8288270 5688152      5      0      0      0 BRU
eth1   1500   0      4      0      0      0 4311388      0      0      0 BRU
eth2   1500   0 4309082 8400892 8400892 5690918      5      0      0      0 BRU
eth3   1500   0      1      0      0      0 4308271      0      0      0 BRU
lo    16436   0      0      0      0      0      0      0      0      0 LRU
           CPU0       
  0:     224310    IO-APIC-edge  timer
  1:        250    IO-APIC-edge  i8042
  8:          0    IO-APIC-edge  rtc
  9:          0   IO-APIC-level  acpi
 14:      22055    IO-APIC-edge  ide0
 26:        122   IO-APIC-level  eth0
 27:      58001   IO-APIC-level  eth1
 28:        123   IO-APIC-level  eth2
 29:      57681   IO-APIC-level  eth3
NMI:          0 
LOC:     224251 
ERR:          0
MIS:          0


						--ro

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)
  2004-12-06 19:10 Robert Olsson
@ 2004-12-06 22:29 ` Martin Josefsson
  2004-12-07  3:20   ` jamal
  0 siblings, 1 reply; 24+ messages in thread
From: Martin Josefsson @ 2004-12-06 22:29 UTC (permalink / raw)
  To: Robert Olsson
  Cc: Lennert Buytenhek, jamal, Scott Feldman, P, mellia,
	Jorge Manuel Finochietto, Giulio Galante, netdev

On Mon, 6 Dec 2004, Robert Olsson wrote:

> pktgen performance is measured on router box. Remember Scotts patch uses
> 4096 TX buffers and w. pktgen we use clone_skb. So with real skb's we probably
> see lower performance due to this. This may explain results below so routing
> performance doesn't follow pktgen performance as seen.

I've performed some tests with and without clone_skb with various versions
of the driver.

> Vanilla. T-PUT 657 kpps. pktgen TX perf 818 kpps

> e1000-TX-prefetch+scott tx patch. T-PUT 540 kpps. pktgen TX perf 1.48 Mpps

> e1000-TX-prefetch. T-PUT 657 kpps. pktgen TX perf 1.15 Mpps

This matches the data I see in my tests here with and without clone_skb.

I've included a lot of pps numbers below, they might need some
description.

I tested generating packets with four different drivers with and without
clone_skb.

vanilla is the vanilla driver in 2.6.10-rc3

copy is using the patch found at the bottom of this mail, just a small
test to see if there's any gain or loss using "static" buffers to dma
from. Prefetch doesn't help at all here, just makes things worse, even for
clone_skb. Tried with delayed TDT updating as well, didn't help.

vanilla + prefetch is just the vanilla driver + prefetching.

feldman tx is using scotts tx-path rewrite patch.
I didn't bother listing feldman tx + prefetch as the results were even
lower for the non clone_skb case.
The only thing I can think of that can cause this is cache thrashing, or
overhead in slab when we have a lot of skb's in the wild.

I don't have oprofile on my testmachine at the moment and it's time to go
to bed now, maybe tomorrow...

Does anyone have any suggestions of what to test next?


vanilla and clone
60      854886
64      772341
68      759531
72      758872
76      758926
80      761136
84      742109
88      742070
92      741616
96      744083
100     727430
104     725242
108     724153
112     725841
116     707331
120     706000
124     704923
128     662547

vanilla and noclone
60      748552
64      702464
68      649066
72      671992
76      680251
80      627711
84      625468
88      640115
92      679365
96      650544
100     666423
104     652057
108     665821
112     679443
116     652507
120     661279
124     648627
128     635780

copy and clone
60      897165
64      872767
68      750694
72      750427
76      749583
80      748242
84      732760
88      731129
92      732603
96      732631
100     717123
104     717678
108     716839
112     719258
116     703824
120     706047
124     701885
128     695575

copy and noclone
60      882227
64      649614
68      691327
72      700706
76      700795
80      696594
84      686016
88      691689
92      696136
96      691348
100     684596
104     687800
108     689218
112     671483
116     675867
120     679089
124     672385
128     650148

vanilla + prefetch and clone
60      1300075
64      1079069
68      1082091
72      1068791
76      1067630
80      1026222
84      1053055
88      1024442
92      1032112
96      1014844
100     991346
104     976483
108     947019
112     919193
116     892863
120     868054
124     844679
128     822347

vanilla + prefetch and noclone
60      738538
64      800927
68      719832
72      725353
76      822738
80      743134
84      813520
88      721522
92      797838
96      724031
100     812198
104     717811
108     713072
112     789771
116     696027
120     682168
124     749020
128     703233

feldman tx and clone
60      1029997
64      916706
68      898601
72      895378
76      896171
80      898594
84      861434
88      861446
92      861444
96      863669
100     837624
104     836225
108     835528
112     835527
116     817102
120     817101
124     817100
128     757683

feldman tx and noclone
60      626646
64      628148
68      628935
72      625084
76      623527
80      623510
84      624286
88      625086
92      623907
96      630199
100     613933
104     618025
108     620326
112     607884
116     606124
120     538434
124     531699
128     532719



diff -X /home/gandalf/dontdiff.ny -urNp drivers/net/e1000-vanilla/e1000_main.c drivers/net/e1000/e1000_main.c
--- drivers/net/e1000-vanilla/e1000_main.c	2004-12-05 18:27:50.000000000 +0100
+++ drivers/net/e1000/e1000_main.c	2004-12-06 22:21:10.000000000 +0100
@@ -132,6 +132,7 @@ static void e1000_irq_disable(struct e10
 static void e1000_irq_enable(struct e1000_adapter *adapter);
 static irqreturn_t e1000_intr(int irq, void *data, struct pt_regs *regs);
 static boolean_t e1000_clean_tx_irq(struct e1000_adapter *adapter);
+static boolean_t e1000_alloc_tx_buffers(struct e1000_adapter *adapter);
 #ifdef CONFIG_E1000_NAPI
 static int e1000_clean(struct net_device *netdev, int *budget);
 static boolean_t e1000_clean_rx_irq(struct e1000_adapter *adapter,
@@ -264,6 +265,7 @@ e1000_up(struct e1000_adapter *adapter)
 	e1000_restore_vlan(adapter);

 	e1000_configure_tx(adapter);
+	e1000_alloc_tx_buffers(adapter);
 	e1000_setup_rctl(adapter);
 	e1000_configure_rx(adapter);
 	e1000_alloc_rx_buffers(adapter);
@@ -1048,10 +1052,21 @@ e1000_configure_rx(struct e1000_adapter
 void
 e1000_free_tx_resources(struct e1000_adapter *adapter)
 {
+	struct e1000_desc_ring *tx_ring = &adapter->tx_ring;
+	struct e1000_buffer *buffer_info;
 	struct pci_dev *pdev = adapter->pdev;
+	unsigned int i;

 	e1000_clean_tx_ring(adapter);

+	for(i = 0; i < tx_ring->count; i++) {
+		buffer_info = &tx_ring->buffer_info[i];
+		if(buffer_info->skb) {
+			kfree(buffer_info->skb);
+			buffer_info->skb = NULL;
+		}
+	}
+
 	vfree(adapter->tx_ring.buffer_info);
 	adapter->tx_ring.buffer_info = NULL;

@@ -1079,16 +1094,12 @@ e1000_clean_tx_ring(struct e1000_adapter

 	for(i = 0; i < tx_ring->count; i++) {
 		buffer_info = &tx_ring->buffer_info[i];
-		if(buffer_info->skb) {
-
+		if(buffer_info->dma) {
 			pci_unmap_page(pdev,
 				       buffer_info->dma,
 				       buffer_info->length,
 				       PCI_DMA_TODEVICE);
-
-			dev_kfree_skb(buffer_info->skb);
-
-			buffer_info->skb = NULL;
+			buffer_info->dma = 0;
 		}
 	}

@@ -1579,8 +1590,6 @@ e1000_tx_map(struct e1000_adapter *adapt
 	struct e1000_buffer *buffer_info;
 	unsigned int len = skb->len;
 	unsigned int offset = 0, size, count = 0, i;
-	unsigned int f;
-	len -= skb->data_len;

 	i = tx_ring->next_to_use;

@@ -1600,10 +1609,12 @@ e1000_tx_map(struct e1000_adapter *adapt
 		   size > 4))
 			size -= 4;

+		skb_copy_bits(skb, offset, buffer_info->skb, size);
+
 		buffer_info->length = size;
 		buffer_info->dma =
 			pci_map_single(adapter->pdev,
-				skb->data + offset,
+				buffer_info->skb,
 				size,
 				PCI_DMA_TODEVICE);
 		buffer_info->time_stamp = jiffies;
@@ -1614,50 +1625,11 @@ e1000_tx_map(struct e1000_adapter *adapt
 		if(unlikely(++i == tx_ring->count)) i = 0;
 	}

-	for(f = 0; f < nr_frags; f++) {
-		struct skb_frag_struct *frag;
-
-		frag = &skb_shinfo(skb)->frags[f];
-		len = frag->size;
-		offset = frag->page_offset;
-
-		while(len) {
-			buffer_info = &tx_ring->buffer_info[i];
-			size = min(len, max_per_txd);
-#ifdef NETIF_F_TSO
-			/* Workaround for premature desc write-backs
-			 * in TSO mode.  Append 4-byte sentinel desc */
-			if(unlikely(mss && f == (nr_frags-1) && size == len && size > 8))
-				size -= 4;
-#endif
-			/* Workaround for potential 82544 hang in PCI-X.
-			 * Avoid terminating buffers within evenly-aligned
-			 * dwords. */
-			if(unlikely(adapter->pcix_82544 &&
-			   !((unsigned long)(frag->page+offset+size-1) & 4) &&
-			   size > 4))
-				size -= 4;
-
-			buffer_info->length = size;
-			buffer_info->dma =
-				pci_map_page(adapter->pdev,
-					frag->page,
-					offset,
-					size,
-					PCI_DMA_TODEVICE);
-			buffer_info->time_stamp = jiffies;
-
-			len -= size;
-			offset += size;
-			count++;
-			if(unlikely(++i == tx_ring->count)) i = 0;
-		}
-	}
-
 	i = (i == 0) ? tx_ring->count - 1 : i - 1;
-	tx_ring->buffer_info[i].skb = skb;
 	tx_ring->buffer_info[first].next_to_watch = i;

+	dev_kfree_skb_any(skb);
+
 	return count;
 }

@@ -2213,11 +2185,6 @@ e1000_clean_tx_irq(struct e1000_adapter
 				buffer_info->dma = 0;
 			}

-			if(buffer_info->skb) {
-				dev_kfree_skb_any(buffer_info->skb);
-				buffer_info->skb = NULL;
-			}
-
 			tx_desc->buffer_addr = 0;
 			tx_desc->lower.data = 0;
 			tx_desc->upper.data = 0;
@@ -2243,6 +2210,28 @@ e1000_clean_tx_irq(struct e1000_adapter
 	return cleaned;
 }

+
+static boolean_t
+e1000_alloc_tx_buffers(struct e1000_adapter *adapter)
+{
+        struct e1000_desc_ring *tx_ring = &adapter->tx_ring;
+        struct e1000_buffer *buffer_info;
+        unsigned int i;
+
+	for (i = 0; i < tx_ring->count; i++) {
+		buffer_info = &tx_ring->buffer_info[i];
+		if (!buffer_info->skb) {
+			buffer_info->skb = kmalloc(2048, GFP_ATOMIC);
+			if (unlikely(!buffer_info->skb)) {
+				printk("eek!\n");
+				return FALSE;
+			}
+		}
+	}
+
+	return TRUE;
+}
+
 /**
  * e1000_clean_rx_irq - Send received data up the network stack
  * @adapter: board private structure

/Martin

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)
  2004-12-06 22:29 ` Martin Josefsson
@ 2004-12-07  3:20   ` jamal
  0 siblings, 0 replies; 24+ messages in thread
From: jamal @ 2004-12-07  3:20 UTC (permalink / raw)
  To: Martin Josefsson
  Cc: Robert Olsson, Lennert Buytenhek, Scott Feldman, P, mellia,
	Jorge Manuel Finochietto, Giulio Galante, netdev


Can someone post the patches and a small README?
As luck would have it my ext3 just decided to fail me on my first
-rc3 boot. Dammit. 

cheers,
jamal

On Mon, 2004-12-06 at 17:29, Martin Josefsson wrote:
> On Mon, 6 Dec 2004, Robert Olsson wrote:
> 
> > pktgen performance is measured on router box. Remember Scotts patch uses
> > 4096 TX buffers and w. pktgen we use clone_skb. So with real skb's we probably
> > see lower performance due to this. This may explain results below so routing
> > performance doesn't follow pktgen performance as seen.
> 
> I've performed some tests with and without clone_skb with various versions
> of the driver.
> 
> > Vanilla. T-PUT 657 kpps. pktgen TX perf 818 kpps
> 
> > e1000-TX-prefetch+scott tx patch. T-PUT 540 kpps. pktgen TX perf 1.48 Mpps
> 
> > e1000-TX-prefetch. T-PUT 657 kpps. pktgen TX perf 1.15 Mpps
> 
> This matches the data I see in my tests here with and without clone_skb.
> 
> I've included a lot of pps numbers below, they might need some
> description.
> 
> I tested generating packets with four different drivers with and without
> clone_skb.
> 
> vanilla is the vanilla driver in 2.6.10-rc3
> 
> copy is using the patch found at the bottom of this mail, just a small
> test to see if there's any gain or loss using "static" buffers to dma
> from. Prefetch doesn't help at all here, just makes things worse, even for
> clone_skb. Tried with delayed TDT updating as well, didn't help.
> 
> vanilla + prefetch is just the vanilla driver + prefetching.
> 
> feldman tx is using scotts tx-path rewrite patch.
> I didn't bother listing feldman tx + prefetch as the results were even
> lower for the non clone_skb case.
> The only thing I can think of that can cause this is cache thrashing, or
> overhead in slab when we have a lot of skb's in the wild.
> 
> I don't have oprofile on my testmachine at the moment and it's time to go
> to bed now, maybe tomorrow...
> 
> Does anyone have any suggestions of what to test next?
> 
> 
> vanilla and clone
> 60      854886
> 64      772341
> 68      759531
> 72      758872
> 76      758926
> 80      761136
> 84      742109
> 88      742070
> 92      741616
> 96      744083
> 100     727430
> 104     725242
> 108     724153
> 112     725841
> 116     707331
> 120     706000
> 124     704923
> 128     662547
> 
> vanilla and noclone
> 60      748552
> 64      702464
> 68      649066
> 72      671992
> 76      680251
> 80      627711
> 84      625468
> 88      640115
> 92      679365
> 96      650544
> 100     666423
> 104     652057
> 108     665821
> 112     679443
> 116     652507
> 120     661279
> 124     648627
> 128     635780
> 
> copy and clone
> 60      897165
> 64      872767
> 68      750694
> 72      750427
> 76      749583
> 80      748242
> 84      732760
> 88      731129
> 92      732603
> 96      732631
> 100     717123
> 104     717678
> 108     716839
> 112     719258
> 116     703824
> 120     706047
> 124     701885
> 128     695575
> 
> copy and noclone
> 60      882227
> 64      649614
> 68      691327
> 72      700706
> 76      700795
> 80      696594
> 84      686016
> 88      691689
> 92      696136
> 96      691348
> 100     684596
> 104     687800
> 108     689218
> 112     671483
> 116     675867
> 120     679089
> 124     672385
> 128     650148
> 
> vanilla + prefetch and clone
> 60      1300075
> 64      1079069
> 68      1082091
> 72      1068791
> 76      1067630
> 80      1026222
> 84      1053055
> 88      1024442
> 92      1032112
> 96      1014844
> 100     991346
> 104     976483
> 108     947019
> 112     919193
> 116     892863
> 120     868054
> 124     844679
> 128     822347
> 
> vanilla + prefetch and noclone
> 60      738538
> 64      800927
> 68      719832
> 72      725353
> 76      822738
> 80      743134
> 84      813520
> 88      721522
> 92      797838
> 96      724031
> 100     812198
> 104     717811
> 108     713072
> 112     789771
> 116     696027
> 120     682168
> 124     749020
> 128     703233
> 
> feldman tx and clone
> 60      1029997
> 64      916706
> 68      898601
> 72      895378
> 76      896171
> 80      898594
> 84      861434
> 88      861446
> 92      861444
> 96      863669
> 100     837624
> 104     836225
> 108     835528
> 112     835527
> 116     817102
> 120     817101
> 124     817100
> 128     757683
> 
> feldman tx and noclone
> 60      626646
> 64      628148
> 68      628935
> 72      625084
> 76      623527
> 80      623510
> 84      624286
> 88      625086
> 92      623907
> 96      630199
> 100     613933
> 104     618025
> 108     620326
> 112     607884
> 116     606124
> 120     538434
> 124     531699
> 128     532719
> 
> 
> 
> diff -X /home/gandalf/dontdiff.ny -urNp drivers/net/e1000-vanilla/e1000_main.c drivers/net/e1000/e1000_main.c
> --- drivers/net/e1000-vanilla/e1000_main.c	2004-12-05 18:27:50.000000000 +0100
> +++ drivers/net/e1000/e1000_main.c	2004-12-06 22:21:10.000000000 +0100
> @@ -132,6 +132,7 @@ static void e1000_irq_disable(struct e10
>  static void e1000_irq_enable(struct e1000_adapter *adapter);
>  static irqreturn_t e1000_intr(int irq, void *data, struct pt_regs *regs);
>  static boolean_t e1000_clean_tx_irq(struct e1000_adapter *adapter);
> +static boolean_t e1000_alloc_tx_buffers(struct e1000_adapter *adapter);
>  #ifdef CONFIG_E1000_NAPI
>  static int e1000_clean(struct net_device *netdev, int *budget);
>  static boolean_t e1000_clean_rx_irq(struct e1000_adapter *adapter,
> @@ -264,6 +265,7 @@ e1000_up(struct e1000_adapter *adapter)
>  	e1000_restore_vlan(adapter);
> 
>  	e1000_configure_tx(adapter);
> +	e1000_alloc_tx_buffers(adapter);
>  	e1000_setup_rctl(adapter);
>  	e1000_configure_rx(adapter);
>  	e1000_alloc_rx_buffers(adapter);
> @@ -1048,10 +1052,21 @@ e1000_configure_rx(struct e1000_adapter
>  void
>  e1000_free_tx_resources(struct e1000_adapter *adapter)
>  {
> +	struct e1000_desc_ring *tx_ring = &adapter->tx_ring;
> +	struct e1000_buffer *buffer_info;
>  	struct pci_dev *pdev = adapter->pdev;
> +	unsigned int i;
> 
>  	e1000_clean_tx_ring(adapter);
> 
> +	for(i = 0; i < tx_ring->count; i++) {
> +		buffer_info = &tx_ring->buffer_info[i];
> +		if(buffer_info->skb) {
> +			kfree(buffer_info->skb);
> +			buffer_info->skb = NULL;
> +		}
> +	}
> +
>  	vfree(adapter->tx_ring.buffer_info);
>  	adapter->tx_ring.buffer_info = NULL;
> 
> @@ -1079,16 +1094,12 @@ e1000_clean_tx_ring(struct e1000_adapter
> 
>  	for(i = 0; i < tx_ring->count; i++) {
>  		buffer_info = &tx_ring->buffer_info[i];
> -		if(buffer_info->skb) {
> -
> +		if(buffer_info->dma) {
>  			pci_unmap_page(pdev,
>  				       buffer_info->dma,
>  				       buffer_info->length,
>  				       PCI_DMA_TODEVICE);
> -
> -			dev_kfree_skb(buffer_info->skb);
> -
> -			buffer_info->skb = NULL;
> +			buffer_info->dma = 0;
>  		}
>  	}
> 
> @@ -1579,8 +1590,6 @@ e1000_tx_map(struct e1000_adapter *adapt
>  	struct e1000_buffer *buffer_info;
>  	unsigned int len = skb->len;
>  	unsigned int offset = 0, size, count = 0, i;
> -	unsigned int f;
> -	len -= skb->data_len;
> 
>  	i = tx_ring->next_to_use;
> 
> @@ -1600,10 +1609,12 @@ e1000_tx_map(struct e1000_adapter *adapt
>  		   size > 4))
>  			size -= 4;
> 
> +		skb_copy_bits(skb, offset, buffer_info->skb, size);
> +
>  		buffer_info->length = size;
>  		buffer_info->dma =
>  			pci_map_single(adapter->pdev,
> -				skb->data + offset,
> +				buffer_info->skb,
>  				size,
>  				PCI_DMA_TODEVICE);
>  		buffer_info->time_stamp = jiffies;
> @@ -1614,50 +1625,11 @@ e1000_tx_map(struct e1000_adapter *adapt
>  		if(unlikely(++i == tx_ring->count)) i = 0;
>  	}
> 
> -	for(f = 0; f < nr_frags; f++) {
> -		struct skb_frag_struct *frag;
> -
> -		frag = &skb_shinfo(skb)->frags[f];
> -		len = frag->size;
> -		offset = frag->page_offset;
> -
> -		while(len) {
> -			buffer_info = &tx_ring->buffer_info[i];
> -			size = min(len, max_per_txd);
> -#ifdef NETIF_F_TSO
> -			/* Workaround for premature desc write-backs
> -			 * in TSO mode.  Append 4-byte sentinel desc */
> -			if(unlikely(mss && f == (nr_frags-1) && size == len && size > 8))
> -				size -= 4;
> -#endif
> -			/* Workaround for potential 82544 hang in PCI-X.
> -			 * Avoid terminating buffers within evenly-aligned
> -			 * dwords. */
> -			if(unlikely(adapter->pcix_82544 &&
> -			   !((unsigned long)(frag->page+offset+size-1) & 4) &&
> -			   size > 4))
> -				size -= 4;
> -
> -			buffer_info->length = size;
> -			buffer_info->dma =
> -				pci_map_page(adapter->pdev,
> -					frag->page,
> -					offset,
> -					size,
> -					PCI_DMA_TODEVICE);
> -			buffer_info->time_stamp = jiffies;
> -
> -			len -= size;
> -			offset += size;
> -			count++;
> -			if(unlikely(++i == tx_ring->count)) i = 0;
> -		}
> -	}
> -
>  	i = (i == 0) ? tx_ring->count - 1 : i - 1;
> -	tx_ring->buffer_info[i].skb = skb;
>  	tx_ring->buffer_info[first].next_to_watch = i;
> 
> +	dev_kfree_skb_any(skb);
> +
>  	return count;
>  }
> 
> @@ -2213,11 +2185,6 @@ e1000_clean_tx_irq(struct e1000_adapter
>  				buffer_info->dma = 0;
>  			}
> 
> -			if(buffer_info->skb) {
> -				dev_kfree_skb_any(buffer_info->skb);
> -				buffer_info->skb = NULL;
> -			}
> -
>  			tx_desc->buffer_addr = 0;
>  			tx_desc->lower.data = 0;
>  			tx_desc->upper.data = 0;
> @@ -2243,6 +2210,28 @@ e1000_clean_tx_irq(struct e1000_adapter
>  	return cleaned;
>  }
> 
> +
> +static boolean_t
> +e1000_alloc_tx_buffers(struct e1000_adapter *adapter)
> +{
> +        struct e1000_desc_ring *tx_ring = &adapter->tx_ring;
> +        struct e1000_buffer *buffer_info;
> +        unsigned int i;
> +
> +	for (i = 0; i < tx_ring->count; i++) {
> +		buffer_info = &tx_ring->buffer_info[i];
> +		if (!buffer_info->skb) {
> +			buffer_info->skb = kmalloc(2048, GFP_ATOMIC);
> +			if (unlikely(!buffer_info->skb)) {
> +				printk("eek!\n");
> +				return FALSE;
> +			}
> +		}
> +	}
> +
> +	return TRUE;
> +}
> +
>  /**
>   * e1000_clean_rx_irq - Send received data up the network stack
>   * @adapter: board private structure
> 
> /Martin
> 
> 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [E1000-devel] Transmission limit
@ 2004-11-26 20:01 jamal
  2004-11-29 13:09 ` Robert Olsson
  0 siblings, 1 reply; 24+ messages in thread
From: jamal @ 2004-11-26 20:01 UTC (permalink / raw)
  To: P
  Cc: mellia, Robert Olsson, e1000-devel, Jorge Manuel Finochietto,
	Giulio Galante, netdev

On Fri, 2004-11-26 at 11:57, P@draigBrady.com wrote:

> > skb are de/allocated using standard kernel memory management. Still,
> > without touching the packet, we can receive 100% of them.
> 
> I was doing some playing in this area this week.
> I changed the alloc per packet to a "realloc" per packet.
> I.E. the e1000 driver owns the packets. I noticed a
> very nice speedup from this. In summary a userspace
> app was able to receive 2x250Kpps without this patch,
> and 2x490Kpps with it. The patch is here:
> http://www.pixelbeat.org/tmp/linux-2.4.20-pb.diff

A very angry gorilla on that url ;->

> Note 99% of that patch is just upgrading from
> e1000 V4.4.12-k1 to V5.2.52 (which doesn't affect
> the performance).
> 
> Wow I just read your excellent paper, and noticed
> you used this approach also :-)
> 

Have to read the paper - When Robert was last visiting here; we did some
tests and packet recycling is not very valuable as far as SMP is
concerned (given that packets can be alloced on one CPU and freed on
another). There is a clear win on single CPU machines.

> >> Small packet performance is dependent on low latency. Higher bus speed
> >> gives shorter latency but also on higher speed buses there use to be  
> >> bridges that adds latency.
> > 
> > That's true. We suspect that the limit is due to bus latency. But still,
> > we are surprised, since the bus allows us to receive 100%, but to transmit
> > only up to ~50%. Moreover the raw aggregate bandwidth of the buffer is _far_
> > larger (133MHz*64bit ~ 8gbit/s)
> 
> Well there definitely could be an asymmetry wrt bus latency.
> Saying that though, in my tests with much the same hardware
> as you, I could only get 800Kpps into the driver.

Yep, thats about the number i was seeing as well in both pieces of
hardware i used in the tests in my SUCON presentation.

>  I'll
> check this again when I have time. Note also that as I understand
> it the PCI control bus is running at a much lower rate,
> and that is used to arbitrate the bus for each packet.
> I.E. the 8Gb/s number above is not the bottleneck.
> 
> An lspci -vvv for your ethernet devices would be useful
> Also to view the burst size: setpci -d 8086:1010 e6.b
> (where 8086:1010 is the ethernet device PCI id).
> 

Can you talk a little about this PCI control bus? I have heard you
mention it before ... I am trying to visualize where it fits in PCI
system.

cheers,
jamal

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [E1000-devel] Transmission limit
  2004-11-26 20:01 [E1000-devel] Transmission limit jamal
@ 2004-11-29 13:09 ` Robert Olsson
  2004-11-30 13:31   ` jamal
  0 siblings, 1 reply; 24+ messages in thread
From: Robert Olsson @ 2004-11-29 13:09 UTC (permalink / raw)
  To: hadi
  Cc: P, mellia, Robert Olsson, e1000-devel, Jorge Manuel Finochietto,
	Giulio Galante, netdev

jamal writes:

 > Have to read the paper - When Robert was last visiting here; we did some
 > tests and packet recycling is not very valuable as far as SMP is
 > concerned (given that packets can be alloced on one CPU and freed on
 > another). There is a clear win on single CPU machines.

 Correct, yes, at your lab about 2 1/2 years ago. I see those experiments in a 
 different light today as we never got any packet budget contribution
 from SMP with shared mem arch whatsoever. Spent a week w. Alexey in the lab 
 to understand whats going on. Two flows with total affinity (for each CPU)
 even removed all locks and part of the IP stack. We were still confused...

 When Opteron/NUMA gave good contribution in those setups. We start thinking
 it must be latency and memory controllers that makes the difference. As w. 
 each CPU has it's own memory and memory controller in Opteron case.

 So from that aspect we expecting the impossible from recycling patch
 maybe it will do better on boxes w. local memory.

 But I think we should give it up in current form skb recycling. If extend 
 it to deal cache bouncing etc. We end up having something like slab in 
 every driver. slab has improved is not so dominant in profiles now.

 Also from what I understand new HW and MSI can help in the case where
 pass objects between CPU. Did I dream or did someone tell me that S2IO 
 could have several TX ring that could via MSI be routed to proper cpu?

 slab packet-objects have been discussed. It would do some contribution
 but is the complexity worth it?

 Also I think it could possible to do more lightweight variant of skb
 recycling in case we need to recycle PCI-mapping etc.

					 --ro

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [E1000-devel] Transmission limit
  2004-11-29 13:09 ` Robert Olsson
@ 2004-11-30 13:31   ` jamal
  2004-11-30 13:46     ` Lennert Buytenhek
  0 siblings, 1 reply; 24+ messages in thread
From: jamal @ 2004-11-30 13:31 UTC (permalink / raw)
  To: Robert Olsson
  Cc: P, mellia, e1000-devel, Jorge Manuel Finochietto, Giulio Galante,
	netdev

On Mon, 2004-11-29 at 08:09, Robert Olsson wrote:
> jamal writes:
> 
>  > Have to read the paper - When Robert was last visiting here; we did some
>  > tests and packet recycling is not very valuable as far as SMP is
>  > concerned (given that packets can be alloced on one CPU and freed on
>  > another). There is a clear win on single CPU machines.
> 
> 
>  Correct, yes, at your lab about 2 1/2 years ago. 

How time flies when you are having fun ;->

> I see those experiments in a 
>  different light today as we never got any packet budget contribution
>  from SMP with shared mem arch whatsoever. Spent a week w. Alexey in the lab 
>  to understand whats going on. Two flows with total affinity (for each CPU)
>  even removed all locks and part of the IP stack. We were still confused...
> 
>  When Opteron/NUMA gave good contribution in those setups. We start thinking
>  it must be latency and memory controllers that makes the difference. As w. 
>  each CPU has it's own memory and memory controller in Opteron case.
> 
>  So from that aspect we expecting the impossible from recycling patch
>  maybe it will do better on boxes w. local memory.
> 

Interesting thought. Not using a lot of my brain cells to compute i
would say that it would get worse. But i suppose the real reason this 
gets nasty on x86 style SMP is because cache misses are more expensive
there, maybe?

>  But I think we should give it up in current form skb recycling. If extend 
>  it to deal cache bouncing etc. We end up having something like slab in 
>  every driver. slab has improved is not so dominant in profiles now.
> 

nod.

>  Also from what I understand new HW and MSI can help in the case where
>  pass objects between CPU. Did I dream or did someone tell me that S2IO 
>  could have several TX ring that could via MSI be routed to proper cpu?

I am wondering if the per CPU tx/rx irqs are valuable at all. They sound
like more hell to maintain.
 
>  slab packet-objects have been discussed. It would do some contribution
>  but is the complexity worth it?

May not be worth it.

>  
>  Also I think it could possible to do more lightweight variant of skb
>  recycling in case we need to recycle PCI-mapping etc.
>

I think its valuable to have it for people with UP; its not worth the
complexity for SMP IMO.

cheers,
jamal

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [E1000-devel] Transmission limit
  2004-11-30 13:31   ` jamal
@ 2004-11-30 13:46     ` Lennert Buytenhek
  2004-11-30 14:25       ` jamal
  0 siblings, 1 reply; 24+ messages in thread
From: Lennert Buytenhek @ 2004-11-30 13:46 UTC (permalink / raw)
  To: jamal
  Cc: Robert Olsson, P, mellia, e1000-devel, Jorge Manuel Finochietto,
	Giulio Galante, netdev

On Tue, Nov 30, 2004 at 08:31:41AM -0500, jamal wrote:

> >  Also from what I understand new HW and MSI can help in the case where
> >  pass objects between CPU. Did I dream or did someone tell me that S2IO 
> >  could have several TX ring that could via MSI be routed to proper cpu?
> 
> I am wondering if the per CPU tx/rx irqs are valuable at all. They sound
> like more hell to maintain.

On the TX path you'd have qdiscs to deal with as well, no?


--L

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [E1000-devel] Transmission limit
  2004-11-30 13:46     ` Lennert Buytenhek
@ 2004-11-30 14:25       ` jamal
  2004-12-01  0:11         ` Lennert Buytenhek
  0 siblings, 1 reply; 24+ messages in thread
From: jamal @ 2004-11-30 14:25 UTC (permalink / raw)
  To: Lennert Buytenhek
  Cc: Robert Olsson, P, mellia, e1000-devel, Jorge Manuel Finochietto,
	Giulio Galante, netdev

On Tue, 2004-11-30 at 08:46, Lennert Buytenhek wrote:
> On Tue, Nov 30, 2004 at 08:31:41AM -0500, jamal wrote:
> 
> > >  Also from what I understand new HW and MSI can help in the case where
> > >  pass objects between CPU. Did I dream or did someone tell me that S2IO 
> > >  could have several TX ring that could via MSI be routed to proper cpu?
> > 
> > I am wondering if the per CPU tx/rx irqs are valuable at all. They sound
> > like more hell to maintain.
> 
> On the TX path you'd have qdiscs to deal with as well, no?

I think management of it would be non-trivial in SMP. Youd have to start
playing stupid loadbalancing tricks which would reduce the value of
existence of tx irqs to begin with. 

cheers,
jamal

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [E1000-devel] Transmission limit
  2004-11-30 14:25       ` jamal
@ 2004-12-01  0:11         ` Lennert Buytenhek
  2004-12-01  1:09           ` Scott Feldman
  0 siblings, 1 reply; 24+ messages in thread
From: Lennert Buytenhek @ 2004-12-01  0:11 UTC (permalink / raw)
  To: jamal
  Cc: Robert Olsson, P, mellia, e1000-devel, Jorge Manuel Finochietto,
	Giulio Galante, netdev

On Tue, Nov 30, 2004 at 09:25:54AM -0500, jamal wrote:

> > > >  Also from what I understand new HW and MSI can help in the case where
> > > >  pass objects between CPU. Did I dream or did someone tell me that S2IO 
> > > >  could have several TX ring that could via MSI be routed to proper cpu?
> > > 
> > > I am wondering if the per CPU tx/rx irqs are valuable at all. They sound
> > > like more hell to maintain.
> > 
> > On the TX path you'd have qdiscs to deal with as well, no?
> 
> I think management of it would be non-trivial in SMP. Youd have to start
> playing stupid loadbalancing tricks which would reduce the value of
> existence of tx irqs to begin with. 

You mean the management of qdiscs would be non-trivial?

Probably the idea of these kinds of tricks is to skip the qdisc step
altogether.


--L

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [E1000-devel] Transmission limit
  2004-12-01  0:11         ` Lennert Buytenhek
@ 2004-12-01  1:09           ` Scott Feldman
  2004-12-01 18:29             ` Lennert Buytenhek
  0 siblings, 1 reply; 24+ messages in thread
From: Scott Feldman @ 2004-12-01  1:09 UTC (permalink / raw)
  To: Lennert Buytenhek
  Cc: jamal, Robert Olsson, P, mellia, e1000-devel,
	Jorge Manuel Finochietto, Giulio Galante, netdev

Hey, turns out, I know some e1000 tricks that might help get the kpps
numbers up.  

My problem is I only have a P4 desktop system with a 82544 nic running
at PCI 32/33Mhz, so I can't play with the big boys.  But, attached is a
rework of the Tx path to eliminate 1) Tx interrupts, and 2) Tx
descriptor write-backs.  For me, I see a nice jump in kpps, but I'd like
others to try with their setups.  We should be able to get to wire speed
with 60-byte packets.

I'm using pktgen in linux-2.6.9, count = 1000000.

System: Intel 865 (HT 2.6Ghz)
Nic: 82544 PCI 32-bit/33Mhz
Driver: linux-2.6.9 e1000 (5.3.19-k2-NAPI), no Interrupt Delays
                                                                                BEFORE

256 descs
  pkt_size = 60:   253432pps 129Mb/sec errors: 0
  pkt_size = 1500: 56356pps  678Mb/sec errors: 499791
4096 descs
  pkt_size = 60:   254222pps 130Mb/sec errors: 0
  pkt_size = 1500: 52693pps  634Mb/sec errors: 497556
                                                                                
AFTER

Modified driver to turn off Tx interrupts and descriptor write-backs.
Uses a timer to schedule Tx cleanup.  The timer runs at 1ms.  This would
work poorly where HZ=100.  Needed to bump Tx descriptors up to 4096
because 1ms is a lot of time with 60-byte packets at 1GbE.  Every time
the timer expires, there is only one PIO read to get HW head pointer. 
This wouldn't work at lower media speeds like 10Mbps or 100Mbps because
the ring isn't large enough (or we would need a higher resolution
timer).  This also gets Tx cleanup out of the NAPI path.

4096 descs
  pkt_size = 60:   541618pps 277Mb/sec errors: 914
  pkt_size = 1500: 76198pps  916Mb/sec errors: 12419
                                                                               
This doubles the kpps numbers for 60-byte packets.  I'd like to see what
happens on higher bus bandwidth systems.  Anyone?

-scott

diff -Naurp linux-2.6.9/drivers/net/e1000/e1000.h linux-2.6.9/drivers/net/e1000.mod/e1000.h
--- linux-2.6.9/drivers/net/e1000/e1000.h	2004-10-18 14:53:06.000000000 -0700
+++ linux-2.6.9/drivers/net/e1000.mod/e1000.h	2004-11-30 14:41:07.045391488 -0800
@@ -103,7 +103,7 @@ struct e1000_adapter;
 #define E1000_MAX_INTR 10
 
 /* TX/RX descriptor defines */
-#define E1000_DEFAULT_TXD                  256
+#define E1000_DEFAULT_TXD                 4096
 #define E1000_MAX_TXD                      256
 #define E1000_MIN_TXD                       80
 #define E1000_MAX_82544_TXD               4096
@@ -189,6 +189,7 @@ struct e1000_desc_ring {
 /* board specific private data structure */
 
 struct e1000_adapter {
+	struct timer_list tx_cleanup_timer;
 	struct timer_list tx_fifo_stall_timer;
 	struct timer_list watchdog_timer;
 	struct timer_list phy_info_timer;
@@ -224,6 +225,7 @@ struct e1000_adapter {
 	uint32_t tx_fifo_size;
 	atomic_t tx_fifo_stall;
 	boolean_t pcix_82544;
+	boolean_t tx_cleanup_scheduled;
 
 	/* RX */
 	struct e1000_desc_ring rx_ring;
diff -Naurp linux-2.6.9/drivers/net/e1000/e1000_hw.h linux-2.6.9/drivers/net/e1000.mod/e1000_hw.h
--- linux-2.6.9/drivers/net/e1000/e1000_hw.h	2004-10-18 14:55:06.000000000 -0700
+++ linux-2.6.9/drivers/net/e1000.mod/e1000_hw.h	2004-11-30 13:48:07.983682328 -0800
@@ -417,14 +417,12 @@ int32_t e1000_set_d3_lplu_state(struct e
 /* This defines the bits that are set in the Interrupt Mask
  * Set/Read Register.  Each bit is documented below:
  *   o RXT0   = Receiver Timer Interrupt (ring 0)
- *   o TXDW   = Transmit Descriptor Written Back
  *   o RXDMT0 = Receive Descriptor Minimum Threshold hit (ring 0)
  *   o RXSEQ  = Receive Sequence Error
  *   o LSC    = Link Status Change
  */
 #define IMS_ENABLE_MASK ( \
     E1000_IMS_RXT0   |    \
-    E1000_IMS_TXDW   |    \
     E1000_IMS_RXDMT0 |    \
     E1000_IMS_RXSEQ  |    \
     E1000_IMS_LSC)
diff -Naurp linux-2.6.9/drivers/net/e1000/e1000_main.c linux-2.6.9/drivers/net/e1000.mod/e1000_main.c
--- linux-2.6.9/drivers/net/e1000/e1000_main.c	2004-10-18 14:53:50.000000000 -0700
+++ linux-2.6.9/drivers/net/e1000.mod/e1000_main.c	2004-11-30 16:15:13.777957656 -0800
@@ -131,7 +131,7 @@ static int e1000_set_mac(struct net_devi
 static void e1000_irq_disable(struct e1000_adapter *adapter);
 static void e1000_irq_enable(struct e1000_adapter *adapter);
 static irqreturn_t e1000_intr(int irq, void *data, struct pt_regs *regs);
-static boolean_t e1000_clean_tx_irq(struct e1000_adapter *adapter);
+static void e1000_clean_tx(unsigned long data);
 #ifdef CONFIG_E1000_NAPI
 static int e1000_clean(struct net_device *netdev, int *budget);
 static boolean_t e1000_clean_rx_irq(struct e1000_adapter *adapter,
@@ -286,6 +286,7 @@ e1000_down(struct e1000_adapter *adapter
 
 	e1000_irq_disable(adapter);
 	free_irq(adapter->pdev->irq, netdev);
+	del_timer_sync(&adapter->tx_cleanup_timer);
 	del_timer_sync(&adapter->tx_fifo_stall_timer);
 	del_timer_sync(&adapter->watchdog_timer);
 	del_timer_sync(&adapter->phy_info_timer);
@@ -533,6 +534,10 @@ e1000_probe(struct pci_dev *pdev,
 
 	e1000_get_bus_info(&adapter->hw);
 
+	init_timer(&adapter->tx_cleanup_timer);
+	adapter->tx_cleanup_timer.function = &e1000_clean_tx;
+	adapter->tx_cleanup_timer.data = (unsigned long) adapter;
+
 	init_timer(&adapter->tx_fifo_stall_timer);
 	adapter->tx_fifo_stall_timer.function = &e1000_82547_tx_fifo_stall;
 	adapter->tx_fifo_stall_timer.data = (unsigned long) adapter;
@@ -893,14 +898,9 @@ e1000_configure_tx(struct e1000_adapter 
 	e1000_config_collision_dist(&adapter->hw);
 
 	/* Setup Transmit Descriptor Settings for eop descriptor */
-	adapter->txd_cmd = E1000_TXD_CMD_IDE | E1000_TXD_CMD_EOP |
+	adapter->txd_cmd = E1000_TXD_CMD_EOP |
 		E1000_TXD_CMD_IFCS;
 
-	if(adapter->hw.mac_type < e1000_82543)
-		adapter->txd_cmd |= E1000_TXD_CMD_RPS;
-	else
-		adapter->txd_cmd |= E1000_TXD_CMD_RS;
-
 	/* Cache if we're 82544 running in PCI-X because we'll
 	 * need this to apply a workaround later in the send path. */
 	if(adapter->hw.mac_type == e1000_82544 &&
@@ -1820,6 +1820,11 @@ e1000_xmit_frame(struct sk_buff *skb, st
  		return NETDEV_TX_LOCKED; 
  	} 
 
+	if(!adapter->tx_cleanup_scheduled) {
+		adapter->tx_cleanup_scheduled = TRUE;
+		mod_timer(&adapter->tx_cleanup_timer, jiffies + 1);
+	}
+
 	/* need: count + 2 desc gap to keep tail from touching
 	 * head, otherwise try next time */
 	if(E1000_DESC_UNUSED(&adapter->tx_ring) < count + 2) {
@@ -1856,6 +1861,7 @@ e1000_xmit_frame(struct sk_buff *skb, st
 	netdev->trans_start = jiffies;
 
 	spin_unlock_irqrestore(&adapter->tx_lock, flags);
+
 	return NETDEV_TX_OK;
 }
 
@@ -2151,8 +2157,7 @@ e1000_intr(int irq, void *data, struct p
 	}
 #else
 	for(i = 0; i < E1000_MAX_INTR; i++)
-		if(unlikely(!e1000_clean_rx_irq(adapter) &
-		   !e1000_clean_tx_irq(adapter)))
+		if(unlikely(!e1000_clean_rx_irq(adapter)))
 			break;
 #endif
 
@@ -2170,18 +2175,15 @@ e1000_clean(struct net_device *netdev, i
 {
 	struct e1000_adapter *adapter = netdev->priv;
 	int work_to_do = min(*budget, netdev->quota);
-	int tx_cleaned;
 	int work_done = 0;
 	
-	tx_cleaned = e1000_clean_tx_irq(adapter);
 	e1000_clean_rx_irq(adapter, &work_done, work_to_do);
 
 	*budget -= work_done;
 	netdev->quota -= work_done;
 	
-	/* if no Rx and Tx cleanup work was done, exit the polling mode */
-	if(!tx_cleaned || (work_done < work_to_do) || 
-				!netif_running(netdev)) {
+	/* if no Rx cleanup work was done, exit the polling mode */
+	if((work_done < work_to_do) || !netif_running(netdev)) {
 		netif_rx_complete(netdev);
 		e1000_irq_enable(adapter);
 		return 0;
@@ -2192,66 +2194,74 @@ e1000_clean(struct net_device *netdev, i
 
 #endif
 /**
- * e1000_clean_tx_irq - Reclaim resources after transmit completes
- * @adapter: board private structure
+ * e1000_clean_tx - Reclaim resources after transmit completes
+ * @data: timer callback data (board private structure)
  **/
 
-static boolean_t
-e1000_clean_tx_irq(struct e1000_adapter *adapter)
+static void
+e1000_clean_tx(unsigned long data)
 {
+	struct e1000_adapter *adapter = (struct e1000_adapter *)data;
 	struct e1000_desc_ring *tx_ring = &adapter->tx_ring;
 	struct net_device *netdev = adapter->netdev;
 	struct pci_dev *pdev = adapter->pdev;
-	struct e1000_tx_desc *tx_desc, *eop_desc;
 	struct e1000_buffer *buffer_info;
-	unsigned int i, eop;
-	boolean_t cleaned = FALSE;
+	unsigned int i, next;
+	int size = 0, count = 0;
+	uint32_t tx_head;
 
-	i = tx_ring->next_to_clean;
-	eop = tx_ring->buffer_info[i].next_to_watch;
-	eop_desc = E1000_TX_DESC(*tx_ring, eop);
+	spin_lock(&adapter->tx_lock);
 
-	while(eop_desc->upper.data & cpu_to_le32(E1000_TXD_STAT_DD)) {
-		for(cleaned = FALSE; !cleaned; ) {
-			tx_desc = E1000_TX_DESC(*tx_ring, i);
-			buffer_info = &tx_ring->buffer_info[i];
+	tx_head = E1000_READ_REG(&adapter->hw, TDH);
 
-			if(likely(buffer_info->dma)) {
-				pci_unmap_page(pdev,
-					       buffer_info->dma,
-					       buffer_info->length,
-					       PCI_DMA_TODEVICE);
-				buffer_info->dma = 0;
-			}
+	i = next = tx_ring->next_to_clean;
 
-			if(buffer_info->skb) {
-				dev_kfree_skb_any(buffer_info->skb);
-				buffer_info->skb = NULL;
-			}
+	while(i != tx_head) {
+		size++;
+		if(i == tx_ring->buffer_info[next].next_to_watch) {
+			count += size;
+			size = 0;
+			if(unlikely(++i == tx_ring->count))
+				i = 0;
+			next = i;
+		} else {
+			if(unlikely(++i == tx_ring->count))
+				i = 0;
+		}
+	}
 
-			tx_desc->buffer_addr = 0;
-			tx_desc->lower.data = 0;
-			tx_desc->upper.data = 0;
+	i = tx_ring->next_to_clean;
+	while(count--) {
+		buffer_info = &tx_ring->buffer_info[i];
 
-			cleaned = (i == eop);
-			if(unlikely(++i == tx_ring->count)) i = 0;
+		if(likely(buffer_info->dma)) {
+			pci_unmap_page(pdev,
+				       buffer_info->dma,
+				       buffer_info->length,
+				       PCI_DMA_TODEVICE);
+			buffer_info->dma = 0;
 		}
-		
-		eop = tx_ring->buffer_info[i].next_to_watch;
-		eop_desc = E1000_TX_DESC(*tx_ring, eop);
+
+		if(buffer_info->skb) {
+			dev_kfree_skb_any(buffer_info->skb);
+			buffer_info->skb = NULL;
+		}
+
+		if(unlikely(++i == tx_ring->count))
+			i = 0;
 	}
 
 	tx_ring->next_to_clean = i;
 
-	spin_lock(&adapter->tx_lock);
+	if(E1000_DESC_UNUSED(tx_ring) != tx_ring->count)
+		mod_timer(&adapter->tx_cleanup_timer, jiffies + 1);
+	else
+		adapter->tx_cleanup_scheduled = FALSE;
 
-	if(unlikely(cleaned && netif_queue_stopped(netdev) &&
-		    netif_carrier_ok(netdev)))
+	if(unlikely(netif_queue_stopped(netdev) && netif_carrier_ok(netdev)))
 		netif_wake_queue(netdev);
 
 	spin_unlock(&adapter->tx_lock);
-
-	return cleaned;
 }
 
 /**

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [E1000-devel] Transmission limit
  2004-12-01  1:09           ` Scott Feldman
@ 2004-12-01 18:29             ` Lennert Buytenhek
  2004-12-01 21:35               ` Lennert Buytenhek
  0 siblings, 1 reply; 24+ messages in thread
From: Lennert Buytenhek @ 2004-12-01 18:29 UTC (permalink / raw)
  To: Scott Feldman
  Cc: jamal, Robert Olsson, P, mellia, e1000-devel,
	Jorge Manuel Finochietto, Giulio Galante, netdev

On Tue, Nov 30, 2004 at 05:09:59PM -0800, Scott Feldman wrote:

> This doubles the kpps numbers for 60-byte packets.  I'd like to see what
> happens on higher bus bandwidth systems.  Anyone?

Dual Xeon 2.4GHz, a 82540EM and a 82541GI both on 32/66 on separate
PCI buses.

BEFORE performance is approx the same for both, ~620kpps.
AFTER performance is ~730kpps, also approx the same for both.

(Note: only sending with one NIC at a time.)

Once or twice it went into a state where it started spitting out these
kinds of messages and never recovered:

	Dec  1 19:13:18 phi kernel: NETDEV WATCHDOG: eth1: transmit timed out
	[...]
	Dec  1 19:13:31 phi kernel: NETDEV WATCHDOG: eth1: transmit timed out
	[...]
	Dec  1 19:13:43 phi kernel: NETDEV WATCHDOG: eth1: transmit timed out

But overall, looks good.  Strange thing that Robert's numbers didn't
improve.  Doing some more measurements right now.

--L

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [E1000-devel] Transmission limit
  2004-12-01 18:29             ` Lennert Buytenhek
@ 2004-12-01 21:35               ` Lennert Buytenhek
  2004-12-02  6:13                 ` Scott Feldman
  0 siblings, 1 reply; 24+ messages in thread
From: Lennert Buytenhek @ 2004-12-01 21:35 UTC (permalink / raw)
  To: Scott Feldman
  Cc: jamal, Robert Olsson, P, mellia, e1000-devel,
	Jorge Manuel Finochietto, Giulio Galante, netdev

[-- Attachment #1: Type: text/plain, Size: 1209 bytes --]

On Wed, Dec 01, 2004 at 07:29:43PM +0100, Lennert Buytenhek wrote:

> > This doubles the kpps numbers for 60-byte packets.  I'd like to see what
> > happens on higher bus bandwidth systems.  Anyone?
> 
> Dual Xeon 2.4GHz, a 82540EM and a 82541GI both on 32/66 on separate
> PCI buses.
> 
> BEFORE performance is approx the same for both, ~620kpps.
> AFTER performance is ~730kpps, also approx the same for both.

Pretty graph attached.  From ~220B packets or so it does wire speed, but
there's still an odd drop in performance around 256B packets (which is
also there without your patch.)  From 350B packets or so, performance is
identical with or without your patch (wire speed.)

So.  Do you have any other good plans perhaps? :)


> Once or twice it went into a state where it started spitting out these
> kinds of messages and never recovered:
> 
> 	Dec  1 19:13:18 phi kernel: NETDEV WATCHDOG: eth1: transmit timed out
> 	[...]
> 	Dec  1 19:13:31 phi kernel: NETDEV WATCHDOG: eth1: transmit timed out
> 	[...]
> 	Dec  1 19:13:43 phi kernel: NETDEV WATCHDOG: eth1: transmit timed out

Didn't see this happen anymore.  (ifconfig down and then up recovered it
both times I saw it happen.)


thanks,
Lennert

[-- Attachment #2: feldman.png --]
[-- Type: image/png, Size: 7959 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [E1000-devel] Transmission limit
  2004-12-01 21:35               ` Lennert Buytenhek
@ 2004-12-02  6:13                 ` Scott Feldman
  2004-12-05 14:50                   ` 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit) Lennert Buytenhek
  0 siblings, 1 reply; 24+ messages in thread
From: Scott Feldman @ 2004-12-02  6:13 UTC (permalink / raw)
  To: Lennert Buytenhek
  Cc: jamal, Robert Olsson, P, mellia, e1000-devel,
	Jorge Manuel Finochietto, Giulio Galante, netdev

On Wed, 2004-12-01 at 13:35, Lennert Buytenhek wrote: 
> Pretty graph attached.  From ~220B packets or so it does wire speed, but
> there's still an odd drop in performance around 256B packets (which is
> also there without your patch.)  From 350B packets or so, performance is
> identical with or without your patch (wire speed.)
Seems this is helping PCI nics but not PCI-X.  I was using PCI 32/33. 
Can't explain the dip around 256B.

> So.  Do you have any other good plans perhaps? :)

Idea#1

Is the write of TDT causing interference with DMA transactions?

In addition to my patch, what happens if you bump the Tx tail every n
packets, where n is like 16 or 32 or 64?  

if((i % 16) == 0)
	E1000_REG_WRITE(&adapter->hw, TDT, i);

This might piss the NETDEV timer off if the send count isn't a multiple
of n, so you might want to disable netdev->tx_timeout.

Idea#2

The Ultimate: queue up 4096 packets and then write TDT once to send all
4096 in one shot.  Well, maybe a few less than 4096 so we don't wrap the
ring.  How about pkt_size = 4000?

Take my patch and change the timer call in e1000_xmit_frame from 

	jiffies + 1

to

	jiffies + HZ

This will schedule the cleanup of the skbs 1 second after the first
queue, so we shouldn't be doing any cleanup while the 4000 packets are
DMA'ed.

Oh, and change the tail write to

if((i % 4000) == 0)
	E1000_REG_WRITE(&adapter->hw, TDT, i);

Of course you'll need to close/open the driver after each run.

Idea#3

http://www.mail-archive.com/freebsd-net@freebsd.org/msg10826.html

Set TXDMAC to 0 in e1000_configure_tx.

> > Once or twice it went into a state where it started spitting out these
> > kinds of messages and never recovered:
> > 
> > 	Dec  1 19:13:18 phi kernel: NETDEV WATCHDOG: eth1: transmit timed out
> > 	[...]
> > 	Dec  1 19:13:31 phi kernel: NETDEV WATCHDOG: eth1: transmit timed out
> > 	[...]
> > 	Dec  1 19:13:43 phi kernel: NETDEV WATCHDOG: eth1: transmit timed out
> 
> Didn't see this happen anymore.  (ifconfig down and then up recovered it
> both times I saw it happen.)

Well, it's probably not a HW bug that's causing the reset; it's probably
some bug with my patch.

-scott

^ permalink raw reply	[flat|nested] 24+ messages in thread

* 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)
  2004-12-02  6:13                 ` Scott Feldman
@ 2004-12-05 14:50                   ` Lennert Buytenhek
  2004-12-05 15:03                     ` Martin Josefsson
  0 siblings, 1 reply; 24+ messages in thread
From: Lennert Buytenhek @ 2004-12-05 14:50 UTC (permalink / raw)
  To: Scott Feldman
  Cc: jamal, Robert Olsson, P, mellia, e1000-devel,
	Jorge Manuel Finochietto, Giulio Galante, netdev

On Wed, Dec 01, 2004 at 10:13:33PM -0800, Scott Feldman wrote:

> Idea#3
> 
> http://www.mail-archive.com/freebsd-net@freebsd.org/msg10826.html
> 
> Set TXDMAC to 0 in e1000_configure_tx.

Enabling 'DMA packet prefetching' gives me an impressive boost in performance.
Combined with your TX clean rework, I now get 1.03Mpps TX performance at 60B
packets.  Transmitting from both of the 82546 ports at the same time gives me
close to 2 Mpps.

The freebsd post hints that (some) e1000 hardware might be buggy w.r.t. this
prefetching though.

I'll play some more with the other ideas you suggested as well.

60      1036488
61      1037413
62      1036429
63      990239
64      993218
65      993233
66      993201
67      993234
68      993219
69      993208
70      992225
71      980560


--L


diff -ur e1000.orig/e1000_main.c e1000/e1000_main.c
--- e1000.orig/e1000_main.c	2004-12-04 11:43:12.000000000 +0100
+++ e1000/e1000_main.c	2004-12-05 15:40:49.284946897 +0100
@@ -879,6 +894,8 @@
 
 	E1000_WRITE_REG(&adapter->hw, TCTL, tctl);
 
+	E1000_WRITE_REG(&adapter->hw, TXDMAC, 0);
+
 	e1000_config_collision_dist(&adapter->hw);
 
 	/* Setup Transmit Descriptor Settings for eop descriptor */

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)
  2004-12-05 14:50                   ` 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit) Lennert Buytenhek
@ 2004-12-05 15:03                     ` Martin Josefsson
  2004-12-05 15:15                       ` Lennert Buytenhek
                                         ` (2 more replies)
  0 siblings, 3 replies; 24+ messages in thread
From: Martin Josefsson @ 2004-12-05 15:03 UTC (permalink / raw)
  To: Lennert Buytenhek
  Cc: Scott Feldman, jamal, Robert Olsson, P, mellia, e1000-devel,
	Jorge Manuel Finochietto, Giulio Galante, netdev

On Sun, 5 Dec 2004, Lennert Buytenhek wrote:

> Enabling 'DMA packet prefetching' gives me an impressive boost in performance.
> Combined with your TX clean rework, I now get 1.03Mpps TX performance at 60B
> packets.  Transmitting from both of the 82546 ports at the same time gives me
> close to 2 Mpps.
>
> The freebsd post hints that (some) e1000 hardware might be buggy w.r.t. this
> prefetching though.
>
> I'll play some more with the other ideas you suggested as well.
>
> 60      1036488

I was just playing with prefetching when you sent your mail :)

I get that number with Scotts patch but without prefetching.
If I move the TDT update to the tx cleaning I get a few extra kpps but not
much.

BUT if I use the above + prefetching I get this:

60      1483890
64      1418568
68      1356992
72      1300523
76      1248568
80      1142989
84      1140909
88      1114951
92      1076546
96      960732
100     949801
104     972876
108     945314
112     918380
116     891393
120     865923
124     843288
128     696465

Which is pretty nice :)

This is on one port of a 82546GB

The hardware is a dual Athlon MP 2000+ in an Asus A7M266-D motherboard and
the nic is located in a 64/66 slot.

I won't post any patch until I've tested some more and cleaned up a few
things.

BTW, I also get some transmit timeouts with Scott's patch sometimes, not
often but it does happen.

/Martin

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)
  2004-12-05 15:03                     ` Martin Josefsson
@ 2004-12-05 15:15                       ` Lennert Buytenhek
  2004-12-05 15:19                         ` Martin Josefsson
  2004-12-05 15:42                       ` Martin Josefsson
  2004-12-05 21:12                       ` Scott Feldman
  2 siblings, 1 reply; 24+ messages in thread
From: Lennert Buytenhek @ 2004-12-05 15:15 UTC (permalink / raw)
  To: Martin Josefsson
  Cc: Scott Feldman, jamal, Robert Olsson, P, mellia, e1000-devel,
	Jorge Manuel Finochietto, Giulio Galante, netdev

On Sun, Dec 05, 2004 at 04:03:36PM +0100, Martin Josefsson wrote:

> BUT if I use the above + prefetching I get this:
> 
> 60      1483890
> [snip]
> 
> Which is pretty nice :)

Not just that, it's also wire speed GigE.  Damn.  Now we all have to go
and upgrade to 10GbE cards, and I don't think my girlfriend would give me
one of those for christmas.


> This is on one port of a 82546GB
> 
> The hardware is a dual Athlon MP 2000+ in an Asus A7M266-D motherboard and
> the nic is located in a 64/66 slot.

Hmmm.  Funny you get this number even on 64/66.  How many PCI bridges
between the CPUs and the NIC?  Any idea how many cycles an MMIO read on
your hardware is?


cheers,
Lennert

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)
  2004-12-05 15:15                       ` Lennert Buytenhek
@ 2004-12-05 15:19                         ` Martin Josefsson
  2004-12-05 15:30                           ` Martin Josefsson
  0 siblings, 1 reply; 24+ messages in thread
From: Martin Josefsson @ 2004-12-05 15:19 UTC (permalink / raw)
  To: Lennert Buytenhek
  Cc: Scott Feldman, jamal, Robert Olsson, P, mellia, e1000-devel,
	Jorge Manuel Finochietto, Giulio Galante, netdev

On Sun, 5 Dec 2004, Lennert Buytenhek wrote:

> > 60      1483890
> > [snip]
> >
> > Which is pretty nice :)
>
> Not just that, it's also wire speed GigE.  Damn.  Now we all have to go
> and upgrade to 10GbE cards, and I don't think my girlfriend would give me
> one of those for christmas.

Yes it is, and it's lovely to see.
You have to nerdify her so she sees the need for geeky hardware enough to
give you what you need :)

> > This is on one port of a 82546GB
> >
> > The hardware is a dual Athlon MP 2000+ in an Asus A7M266-D motherboard and
> > the nic is located in a 64/66 slot.
>
> Hmmm.  Funny you get this number even on 64/66.  How many PCI bridges
> between the CPUs and the NIC?  Any idea how many cycles an MMIO read on
> your hardware is?

I verified that I get the same results on a small wimpy 82540EM that runs
at 32/66 as well. Just about to see what I get at 32/33 with that card.

/Martin

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)
  2004-12-05 15:19                         ` Martin Josefsson
@ 2004-12-05 15:30                           ` Martin Josefsson
  2004-12-05 17:00                             ` Lennert Buytenhek
  0 siblings, 1 reply; 24+ messages in thread
From: Martin Josefsson @ 2004-12-05 15:30 UTC (permalink / raw)
  To: Lennert Buytenhek
  Cc: Scott Feldman, jamal, Robert Olsson, P, mellia, e1000-devel,
	Jorge Manuel Finochietto, Giulio Galante, netdev

On Sun, 5 Dec 2004, Martin Josefsson wrote:

> > > The hardware is a dual Athlon MP 2000+ in an Asus A7M266-D motherboard and
> > > the nic is located in a 64/66 slot.
> >
> > Hmmm.  Funny you get this number even on 64/66.  How many PCI bridges
> > between the CPUs and the NIC?  Any idea how many cycles an MMIO read on
> > your hardware is?
>
> I verified that I get the same results on a small whimpy 82540EM that runs
> at 32/66 as well. Just about to see what I get at 32/33 with that card.

Just tested the 82540EM at 32/33 and it's a big difference.

60      350229
64      247037
68      219643
72      218205
76      216786
80      215386
84      214003
88      212638
92      211291
96      210004
100     208647
104     182461
108     181468
112     180453
116     179482
120     185472
124     188336
128     153743

Sorry, forgot to answer your other questions, I'm a bit excited at the
moment :)

The 64/66 bus on this motherboard is directly connected to the
northbridge. Here's the lspci output with the 82546GB nic attached
to the 64/66 bus and 82540EM nic connected to the 32/33 bus that hangs
off the southbridge:

00:00.0 Host bridge: Advanced Micro Devices [AMD] AMD-760 MP [IGD4-2P] System Controller (rev 11)
00:01.0 PCI bridge: Advanced Micro Devices [AMD] AMD-760 MP [IGD4-2P] AGP Bridge
00:07.0 ISA bridge: Advanced Micro Devices [AMD] AMD-768 [Opus] ISA (rev 05)
00:07.1 IDE interface: Advanced Micro Devices [AMD] AMD-768 [Opus] IDE (rev 04)
00:07.3 Bridge: Advanced Micro Devices [AMD] AMD-768 [Opus] ACPI (rev 03)
00:08.0 Ethernet controller: Intel Corp. 82546GB Gigabit Ethernet Controller (rev 03)
00:08.1 Ethernet controller: Intel Corp. 82546GB Gigabit Ethernet Controller (rev 03)
00:10.0 PCI bridge: Advanced Micro Devices [AMD] AMD-768 [Opus] PCI (rev 05)
01:05.0 VGA compatible controller: Silicon Integrated Systems [SiS] 86C326 5598/6326 (rev 0b)
02:05.0 Ethernet controller: Intel Corp. 82557/8/9 [Ethernet Pro 100] (rev 0c)
02:06.0 SCSI storage controller: Adaptec AIC-7892A U160/m (rev 02)
02:08.0 Ethernet controller: Intel Corp. 82540EM Gigabit Ethernet Controller (rev 02)

And lspci -t

-[00]-+-00.0
      +-01.0-[01]----05.0
      +-07.0
      +-07.1
      +-07.3
      +-08.0
      +-08.1
      \-10.0-[02]--+-05.0
                   +-06.0
                   \-08.0

I have no idea how expensive an MMIO read is on this machine, do you have
a relatively easy way to find out?

/Martin

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)
  2004-12-05 15:30                           ` Martin Josefsson
@ 2004-12-05 17:00                             ` Lennert Buytenhek
  2004-12-05 17:11                               ` Martin Josefsson
  0 siblings, 1 reply; 24+ messages in thread
From: Lennert Buytenhek @ 2004-12-05 17:00 UTC (permalink / raw)
  To: Martin Josefsson
  Cc: Scott Feldman, jamal, Robert Olsson, P, mellia, e1000-devel,
	Jorge Manuel Finochietto, Giulio Galante, netdev

On Sun, Dec 05, 2004 at 04:30:47PM +0100, Martin Josefsson wrote:

> > I verified that I get the same results on a small whimpy 82540EM
> > that runs at 32/66 as well. Just about to see what I get at 32/33
> > with that card.
> 
> Just tested the 82540EM at 32/33 and it's a big diffrence.
> 
> 60      350229
> 64      247037
> 68      219643
> 72      218205
> 76      216786
> 80      215386
> 84      214003
> 88      212638
> 92      211291
> 96      210004
> 100     208647
> 104     182461
> 108     181468
> 112     180453
> 116     179482
> 120     185472
> 124     188336
> 128     153743

With or without prefetching? My 82540 in 32/33 mode gets on baseline
2.6.9:

60      431967
61      431311
62      431927
63      427827
64      427482

And with Scott's notxints patch:

60      514496
61      514493
62      514754
63      504629
64      504123


> Sorry, forgot to answer your other questions, I'm a bit excited at the
> moment :)

Makes sense :)


> The 64/66 bus on this motherboard is directly connected to the
> northbridge.

Your lspci output seems to suggest there is another PCI bridge in
between (00:10.0)

Basically on my box, it's CPU - MCH - P64H2 - e1000, where MCH is the
'Memory Controller Hub' and P64H2 the PCI-X bridge chip.


> I have no idea how expensive an MMIO read is on this machine, do you have
> an relatively easy way to find out?

A dirty way, yes ;-)  Open up e1000_osdep.h and do:

-#define E1000_READ_REG(a, reg) ( \
-    readl((a)->hw_addr + \
-        (((a)->mac_type >= e1000_82543) ? E1000_##reg : E1000_82542_##reg)))
+#define E1000_READ_REG(a, reg) ({ \
+    unsigned long s, e, d, v; \
+\
+    (a)->mmio_reads++; \
+    rdtsc(s, d); \
+    v = readl((a)->hw_addr + \
+        (((a)->mac_type >= e1000_82543) ? E1000_##reg : E1000_82542_##reg)); \
+    rdtsc(e, d); \
+    e -= s; \
+    printk(KERN_INFO "e1000: MMIO read took %ld clocks\n", e); \
+    printk(KERN_INFO "e1000: in process %d(%s)\n", current->pid, current->comm); \
+    dump_stack(); \
+    v; \
+})

You might want to disable the stack dump of course.


--L

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)
  2004-12-05 17:00                             ` Lennert Buytenhek
@ 2004-12-05 17:11                               ` Martin Josefsson
  2004-12-05 17:38                                 ` Martin Josefsson
  0 siblings, 1 reply; 24+ messages in thread
From: Martin Josefsson @ 2004-12-05 17:11 UTC (permalink / raw)
  To: Lennert Buytenhek
  Cc: Scott Feldman, jamal, Robert Olsson, P, mellia, e1000-devel,
	Jorge Manuel Finochietto, Giulio Galante, netdev

On Sun, 5 Dec 2004, Lennert Buytenhek wrote:

> > Just tested the 82540EM at 32/33 and it's a big diffrence.
> >
> > 60      350229
> > 64      247037
> > 68      219643

[snip]

> With or without prefetching? My 82540 in 32/33 mode gets on baseline
> 2.6.9:

With, will test without. I've always suspected that the 32bit bus on this
motherboard is a bit slow.

> Your lspci output seems to suggest there is another PCI bridge in
> between (00:10.0)

Yes it sits between the 32bit and the 64bit bus.

> Basically on my box, it's CPU - MCH - P64H2 - e1000, where MCH is the
> 'Memory Controller Hub' and P64H2 the PCI-X bridge chip.

I don't have PCI-X (unless 64/66 counts as PCI-X, which I highly doubt)

> > I have no idea how expensive an MMIO read is on this machine, do you have
> > an relatively easy way to find out?
>
> A dirty way, yes ;-)  Open up e1000_osdep.h and do:
>
> -#define E1000_READ_REG(a, reg) ( \
> -    readl((a)->hw_addr + \
> -        (((a)->mac_type >= e1000_82543) ? E1000_##reg : E1000_82542_##reg)))
> +#define E1000_READ_REG(a, reg) ({ \
> +    unsigned long s, e, d, v; \
> +\
> +    (a)->mmio_reads++; \
> +    rdtsc(s, d); \
> +    v = readl((a)->hw_addr + \
> +        (((a)->mac_type >= e1000_82543) ? E1000_##reg : E1000_82542_##reg)); \
> +    rdtsc(e, d); \
> +    e -= s; \
> +    printk(KERN_INFO "e1000: MMIO read took %ld clocks\n", e); \
> +    printk(KERN_INFO "e1000: in process %d(%s)\n", current->pid, current->comm); \
> +    dump_stack(); \
> +    v; \
> +})
>
> You might want to disable the stack dump of course.

Will test this in a while.

/Martin

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)
  2004-12-05 17:11                               ` Martin Josefsson
@ 2004-12-05 17:38                                 ` Martin Josefsson
  2004-12-05 18:14                                   ` Lennert Buytenhek
  0 siblings, 1 reply; 24+ messages in thread
From: Martin Josefsson @ 2004-12-05 17:38 UTC (permalink / raw)
  To: Lennert Buytenhek
  Cc: Scott Feldman, jamal, Robert Olsson, P, mellia, e1000-devel,
	Jorge Manuel Finochietto, Giulio Galante, netdev

On Sun, 5 Dec 2004, Martin Josefsson wrote:

> > -#define E1000_READ_REG(a, reg) ( \
> > -    readl((a)->hw_addr + \
> > -        (((a)->mac_type >= e1000_82543) ? E1000_##reg : E1000_82542_##reg)))
> > +#define E1000_READ_REG(a, reg) ({ \
> > +    unsigned long s, e, d, v; \
> > +\
> > +    (a)->mmio_reads++; \
> > +    rdtsc(s, d); \
> > +    v = readl((a)->hw_addr + \
> > +        (((a)->mac_type >= e1000_82543) ? E1000_##reg : E1000_82542_##reg)); \
> > +    rdtsc(e, d); \
> > +    e -= s; \
> > +    printk(KERN_INFO "e1000: MMIO read took %ld clocks\n", e); \
> > +    printk(KERN_INFO "e1000: in process %d(%s)\n", current->pid, current->comm); \
> > +    dump_stack(); \
> > +    v; \
> > +})
> >
> > You might want to disable the stack dump of course.
>
> Will test this in a while.

It gives pretty varied results.
This is during a pktgen run.

The machine is an Athlon MP 2000+ which operates at 1667 MHz

e1000: MMIO read took 481 clocks
e1000: MMIO read took 369 clocks
e1000: MMIO read took 481 clocks
e1000: MMIO read took 11 clocks
e1000: MMIO read took 477 clocks
e1000: MMIO read took 316 clocks
e1000: MMIO read took 481 clocks
e1000: MMIO read took 316 clocks
e1000: MMIO read took 480 clocks
e1000: MMIO read took 332 clocks
e1000: MMIO read took 480 clocks
e1000: MMIO read took 372 clocks
e1000: MMIO read took 480 clocks
e1000: MMIO read took 11 clocks
e1000: MMIO read took 481 clocks
e1000: MMIO read took 388 clocks
e1000: MMIO read took 480 clocks
e1000: MMIO read took 11 clocks
e1000: MMIO read took 485 clocks
e1000: MMIO read took 317 clocks
e1000: MMIO read took 481 clocks
e1000: MMIO read took 337 clocks
e1000: MMIO read took 480 clocks
e1000: MMIO read took 316 clocks
e1000: MMIO read took 480 clocks
e1000: MMIO read took 409 clocks
e1000: MMIO read took 480 clocks
e1000: MMIO read took 334 clocks
e1000: MMIO read took 481 clocks
e1000: MMIO read took 316 clocks
e1000: MMIO read took 480 clocks
e1000: MMIO read took 11 clocks
e1000: MMIO read took 505 clocks
e1000: MMIO read took 359 clocks
e1000: MMIO read took 484 clocks
e1000: MMIO read took 337 clocks
e1000: MMIO read took 464 clocks
e1000: MMIO read took 504 clocks

/Martin

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)
  2004-12-05 17:38                                 ` Martin Josefsson
@ 2004-12-05 18:14                                   ` Lennert Buytenhek
  0 siblings, 0 replies; 24+ messages in thread
From: Lennert Buytenhek @ 2004-12-05 18:14 UTC (permalink / raw)
  To: Martin Josefsson
  Cc: Scott Feldman, jamal, Robert Olsson, P, mellia, e1000-devel,
	Jorge Manuel Finochietto, Giulio Galante, netdev

On Sun, Dec 05, 2004 at 06:38:05PM +0100, Martin Josefsson wrote:

> e1000: MMIO read took 481 clocks
> e1000: MMIO read took 369 clocks
> e1000: MMIO read took 481 clocks
> e1000: MMIO read took 11 clocks
> e1000: MMIO read took 477 clocks
> e1000: MMIO read took 316 clocks

Interesting.  On a 1667MHz CPU, this is around ~0.28us per MMIO read
in the worst case.  On my hardware (dual Xeon 2.4GHz), the best case
I've ever seen was ~0.83us.

This alone can make a hell of a difference, esp. for 60B packets.


--L

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)
  2004-12-05 15:03                     ` Martin Josefsson
  2004-12-05 15:15                       ` Lennert Buytenhek
@ 2004-12-05 15:42                       ` Martin Josefsson
  2004-12-05 16:48                         ` Martin Josefsson
                                           ` (2 more replies)
  2004-12-05 21:12                       ` Scott Feldman
  2 siblings, 3 replies; 24+ messages in thread
From: Martin Josefsson @ 2004-12-05 15:42 UTC (permalink / raw)
  To: Lennert Buytenhek
  Cc: Scott Feldman, jamal, Robert Olsson, P, mellia, e1000-devel,
	Jorge Manuel Finochietto, Giulio Galante, netdev

On Sun, 5 Dec 2004, Martin Josefsson wrote:

[snip]
> BUT if I use the above + prefetching I get this:
>
> 60      1483890
[snip]
> This is on one port of a 82546GB
>
> The hardware is a dual Athlon MP 2000+ in an Asus A7M266-D motherboard and
> the nic is located in a 64/66 slot.
>
> I won't post any patch until I've tested some more and cleaned up a few
> things.
>
> BTW, I also get some transmit timouts with Scotts patch sometimes, not
> often but it does happen.

Here's the patch, not much more tested (it still gives some transmit
timeouts since it's Scott's patch + prefetching and delayed TDT updating).
And it's not cleaned up, but hey, that's development :)

The delayed TDT updating was a test, and currently it delays the first tx'd
packet after a timer run by 1ms.

Would be interesting to see what other people get with this thing.
Lennert?

diff -X /home/gandalf/dontdiff.ny -urNp linux-2.6.10-rc3.orig/drivers/net/e1000/e1000.h linux-2.6.10-rc3.labbrouter/drivers/net/e1000/e1000.h
--- linux-2.6.10-rc3.orig/drivers/net/e1000/e1000.h	2004-12-04 18:16:53.000000000 +0100
+++ linux-2.6.10-rc3.labbrouter/drivers/net/e1000/e1000.h	2004-12-05 15:12:25.000000000 +0100
@@ -101,7 +101,7 @@ struct e1000_adapter;
 #define E1000_MAX_INTR 10

 /* TX/RX descriptor defines */
-#define E1000_DEFAULT_TXD                  256
+#define E1000_DEFAULT_TXD                 4096
 #define E1000_MAX_TXD                      256
 #define E1000_MIN_TXD                       80
 #define E1000_MAX_82544_TXD               4096
@@ -187,6 +187,7 @@ struct e1000_desc_ring {
 /* board specific private data structure */

 struct e1000_adapter {
+	struct timer_list tx_cleanup_timer;
 	struct timer_list tx_fifo_stall_timer;
 	struct timer_list watchdog_timer;
 	struct timer_list phy_info_timer;
@@ -222,6 +223,7 @@ struct e1000_adapter {
 	uint32_t tx_fifo_size;
 	atomic_t tx_fifo_stall;
 	boolean_t pcix_82544;
+	boolean_t tx_cleanup_scheduled;

 	/* RX */
 	struct e1000_desc_ring rx_ring;
diff -X /home/gandalf/dontdiff.ny -urNp linux-2.6.10-rc3.orig/drivers/net/e1000/e1000_hw.h linux-2.6.10-rc3.labbrouter/drivers/net/e1000/e1000_hw.h
--- linux-2.6.10-rc3.orig/drivers/net/e1000/e1000_hw.h	2004-12-04 18:16:53.000000000 +0100
+++ linux-2.6.10-rc3.labbrouter/drivers/net/e1000/e1000_hw.h	2004-12-05 15:37:50.000000000 +0100
@@ -417,14 +417,12 @@ int32_t e1000_set_d3_lplu_state(struct e
 /* This defines the bits that are set in the Interrupt Mask
  * Set/Read Register.  Each bit is documented below:
  *   o RXT0   = Receiver Timer Interrupt (ring 0)
- *   o TXDW   = Transmit Descriptor Written Back
  *   o RXDMT0 = Receive Descriptor Minimum Threshold hit (ring 0)
  *   o RXSEQ  = Receive Sequence Error
  *   o LSC    = Link Status Change
  */
 #define IMS_ENABLE_MASK ( \
     E1000_IMS_RXT0   |    \
-    E1000_IMS_TXDW   |    \
     E1000_IMS_RXDMT0 |    \
     E1000_IMS_RXSEQ  |    \
     E1000_IMS_LSC)
diff -X /home/gandalf/dontdiff.ny -urNp linux-2.6.10-rc3.orig/drivers/net/e1000/e1000_main.c linux-2.6.10-rc3.labbrouter/drivers/net/e1000/e1000_main.c
--- linux-2.6.10-rc3.orig/drivers/net/e1000/e1000_main.c	2004-12-05 14:59:19.000000000 +0100
+++ linux-2.6.10-rc3.labbrouter/drivers/net/e1000/e1000_main.c	2004-12-05 15:40:11.000000000 +0100
@@ -131,7 +131,7 @@ static int e1000_set_mac(struct net_devi
 static void e1000_irq_disable(struct e1000_adapter *adapter);
 static void e1000_irq_enable(struct e1000_adapter *adapter);
 static irqreturn_t e1000_intr(int irq, void *data, struct pt_regs *regs);
-static boolean_t e1000_clean_tx_irq(struct e1000_adapter *adapter);
+static void e1000_clean_tx(unsigned long data);
 #ifdef CONFIG_E1000_NAPI
 static int e1000_clean(struct net_device *netdev, int *budget);
 static boolean_t e1000_clean_rx_irq(struct e1000_adapter *adapter,
@@ -286,6 +286,7 @@ e1000_down(struct e1000_adapter *adapter

 	e1000_irq_disable(adapter);
 	free_irq(adapter->pdev->irq, netdev);
+	del_timer_sync(&adapter->tx_cleanup_timer);
 	del_timer_sync(&adapter->tx_fifo_stall_timer);
 	del_timer_sync(&adapter->watchdog_timer);
 	del_timer_sync(&adapter->phy_info_timer);
@@ -522,6 +523,10 @@ e1000_probe(struct pci_dev *pdev,

 	e1000_get_bus_info(&adapter->hw);

+	init_timer(&adapter->tx_cleanup_timer);
+	adapter->tx_cleanup_timer.function = &e1000_clean_tx;
+	adapter->tx_cleanup_timer.data = (unsigned long) adapter;
+
 	init_timer(&adapter->tx_fifo_stall_timer);
 	adapter->tx_fifo_stall_timer.function = &e1000_82547_tx_fifo_stall;
 	adapter->tx_fifo_stall_timer.data = (unsigned long) adapter;
@@ -882,19 +887,16 @@ e1000_configure_tx(struct e1000_adapter
 	e1000_config_collision_dist(&adapter->hw);

 	/* Setup Transmit Descriptor Settings for eop descriptor */
-	adapter->txd_cmd = E1000_TXD_CMD_IDE | E1000_TXD_CMD_EOP |
+	adapter->txd_cmd = E1000_TXD_CMD_EOP |
 		E1000_TXD_CMD_IFCS;

-	if(adapter->hw.mac_type < e1000_82543)
-		adapter->txd_cmd |= E1000_TXD_CMD_RPS;
-	else
-		adapter->txd_cmd |= E1000_TXD_CMD_RS;
-
 	/* Cache if we're 82544 running in PCI-X because we'll
 	 * need this to apply a workaround later in the send path. */
 	if(adapter->hw.mac_type == e1000_82544 &&
 	   adapter->hw.bus_type == e1000_bus_type_pcix)
 		adapter->pcix_82544 = 1;
+
+	E1000_WRITE_REG(&adapter->hw, TXDMAC, 0);
 }

 /**
@@ -1707,7 +1709,7 @@ e1000_tx_queue(struct e1000_adapter *ada
 	wmb();

 	tx_ring->next_to_use = i;
-	E1000_WRITE_REG(&adapter->hw, TDT, i);
+	/* E1000_WRITE_REG(&adapter->hw, TDT, i); */
 }

 /**
@@ -1809,6 +1811,11 @@ e1000_xmit_frame(struct sk_buff *skb, st
  		return NETDEV_TX_LOCKED;
  	}

+	if(!adapter->tx_cleanup_scheduled) {
+		adapter->tx_cleanup_scheduled = TRUE;
+		mod_timer(&adapter->tx_cleanup_timer, jiffies + 1);
+	}
+
 	/* need: count + 2 desc gap to keep tail from touching
 	 * head, otherwise try next time */
 	if(E1000_DESC_UNUSED(&adapter->tx_ring) < count + 2) {
@@ -1845,6 +1852,7 @@ e1000_xmit_frame(struct sk_buff *skb, st
 	netdev->trans_start = jiffies;

 	spin_unlock_irqrestore(&adapter->tx_lock, flags);
+
 	return NETDEV_TX_OK;
 }

@@ -2140,8 +2148,7 @@ e1000_intr(int irq, void *data, struct p
 	}
 #else
 	for(i = 0; i < E1000_MAX_INTR; i++)
-		if(unlikely(!e1000_clean_rx_irq(adapter) &
-		   !e1000_clean_tx_irq(adapter)))
+		if(unlikely(!e1000_clean_rx_irq(adapter)))
 			break;
 #endif

@@ -2159,18 +2166,15 @@ e1000_clean(struct net_device *netdev, i
 {
 	struct e1000_adapter *adapter = netdev->priv;
 	int work_to_do = min(*budget, netdev->quota);
-	int tx_cleaned;
 	int work_done = 0;

-	tx_cleaned = e1000_clean_tx_irq(adapter);
 	e1000_clean_rx_irq(adapter, &work_done, work_to_do);

 	*budget -= work_done;
 	netdev->quota -= work_done;

-	/* if no Rx and Tx cleanup work was done, exit the polling mode */
-	if(!tx_cleaned || (work_done < work_to_do) ||
-				!netif_running(netdev)) {
+	/* if no Rx cleanup work was done, exit the polling mode */
+	if((work_done < work_to_do) || !netif_running(netdev)) {
 		netif_rx_complete(netdev);
 		e1000_irq_enable(adapter);
 		return 0;
@@ -2181,66 +2185,76 @@ e1000_clean(struct net_device *netdev, i

 #endif
 /**
- * e1000_clean_tx_irq - Reclaim resources after transmit completes
- * @adapter: board private structure
+ * e1000_clean_tx - Reclaim resources after transmit completes
+ * @data: timer callback data (board private structure)
  **/

-static boolean_t
-e1000_clean_tx_irq(struct e1000_adapter *adapter)
+static void
+e1000_clean_tx(unsigned long data)
 {
+	struct e1000_adapter *adapter = (struct e1000_adapter *)data;
 	struct e1000_desc_ring *tx_ring = &adapter->tx_ring;
 	struct net_device *netdev = adapter->netdev;
 	struct pci_dev *pdev = adapter->pdev;
-	struct e1000_tx_desc *tx_desc, *eop_desc;
 	struct e1000_buffer *buffer_info;
-	unsigned int i, eop;
-	boolean_t cleaned = FALSE;
+	unsigned int i, next;
+	int size = 0, count = 0;
+	uint32_t tx_head;

-	i = tx_ring->next_to_clean;
-	eop = tx_ring->buffer_info[i].next_to_watch;
-	eop_desc = E1000_TX_DESC(*tx_ring, eop);
+	spin_lock(&adapter->tx_lock);

-	while(eop_desc->upper.data & cpu_to_le32(E1000_TXD_STAT_DD)) {
-		for(cleaned = FALSE; !cleaned; ) {
-			tx_desc = E1000_TX_DESC(*tx_ring, i);
-			buffer_info = &tx_ring->buffer_info[i];
+	E1000_WRITE_REG(&adapter->hw, TDT, tx_ring->next_to_use);

-			if(likely(buffer_info->dma)) {
-				pci_unmap_page(pdev,
-					       buffer_info->dma,
-					       buffer_info->length,
-					       PCI_DMA_TODEVICE);
-				buffer_info->dma = 0;
-			}
+	tx_head = E1000_READ_REG(&adapter->hw, TDH);

-			if(buffer_info->skb) {
-				dev_kfree_skb_any(buffer_info->skb);
-				buffer_info->skb = NULL;
-			}
+	i = next = tx_ring->next_to_clean;

-			tx_desc->buffer_addr = 0;
-			tx_desc->lower.data = 0;
-			tx_desc->upper.data = 0;
+	while(i != tx_head) {
+		size++;
+		if(i == tx_ring->buffer_info[next].next_to_watch) {
+			count += size;
+			size = 0;
+			if(unlikely(++i == tx_ring->count))
+				i = 0;
+			next = i;
+		} else {
+			if(unlikely(++i == tx_ring->count))
+				i = 0;
+		}
+	}

-			cleaned = (i == eop);
-			if(unlikely(++i == tx_ring->count)) i = 0;
+	i = tx_ring->next_to_clean;
+	while(count--) {
+		buffer_info = &tx_ring->buffer_info[i];
+
+		if(likely(buffer_info->dma)) {
+			pci_unmap_page(pdev,
+				       buffer_info->dma,
+				       buffer_info->length,
+				       PCI_DMA_TODEVICE);
+			buffer_info->dma = 0;
 		}
-
-		eop = tx_ring->buffer_info[i].next_to_watch;
-		eop_desc = E1000_TX_DESC(*tx_ring, eop);
+
+		if(buffer_info->skb) {
+			dev_kfree_skb_any(buffer_info->skb);
+			buffer_info->skb = NULL;
+		}
+
+		if(unlikely(++i == tx_ring->count))
+			i = 0;
 	}

 	tx_ring->next_to_clean = i;

-	spin_lock(&adapter->tx_lock);
+	if(E1000_DESC_UNUSED(tx_ring) != tx_ring->count)
+		mod_timer(&adapter->tx_cleanup_timer, jiffies + 1);
+	else
+		adapter->tx_cleanup_scheduled = FALSE;

-	if(unlikely(cleaned && netif_queue_stopped(netdev) &&
-		    netif_carrier_ok(netdev)))
+	if(unlikely(netif_queue_stopped(netdev) && netif_carrier_ok(netdev)))
 		netif_wake_queue(netdev);

 	spin_unlock(&adapter->tx_lock);
-
-	return cleaned;
 }

 /**

/Martin

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)
  2004-12-05 15:42                       ` Martin Josefsson
@ 2004-12-05 16:48                         ` Martin Josefsson
  2004-12-05 17:01                           ` Martin Josefsson
  2004-12-05 17:58                           ` Lennert Buytenhek
  2004-12-05 17:44                         ` Lennert Buytenhek
  2004-12-08 23:36                         ` Ray Lehtiniemi
  2 siblings, 2 replies; 24+ messages in thread
From: Martin Josefsson @ 2004-12-05 16:48 UTC (permalink / raw)
  To: Lennert Buytenhek
  Cc: Scott Feldman, jamal, Robert Olsson, P, mellia, e1000-devel,
	Jorge Manuel Finochietto, Giulio Galante, netdev

On Sun, 5 Dec 2004, Martin Josefsson wrote:

> The delayed TDT updating was a test and currently it delays the first tx'd
> packet after a timerrun 1ms.

I removed the delayed TDT updating and gave it a go again (this is scott +
prefetching):

60      1486193
64      1267639
68      1259682
72      1243997
76      1243989
80      1153608
84      1123813
88      1115047
92      1076636
96      1040792
100     1007252
104     975806
108     946263
112     918456
116     892227
120     867477
124     844052
128     821858

It gives slightly different results: 60byte is ok, but then it drops a lot
at 64byte and the curve seems a bit flatter.

This should be the same driver that Lennert got 1.03Mpps with.
I get 1.03Mpps without prefetching.

I tried using both ports on the 82546GB nic.

        delay        nodelay
1CPU    1.95 Mpps    1.76 Mpps
2CPU    1.60 Mpps    1.44 Mpps

All tests were performed on an SMP kernel; the above mention of 1CPU vs 2CPU
just means how the two nics were bound to the cpus. And there are no
tx-interrupts at all due to Scott's patch.

/Martin

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)
  2004-12-05 16:48                         ` Martin Josefsson
@ 2004-12-05 17:01                           ` Martin Josefsson
  2004-12-05 17:58                           ` Lennert Buytenhek
  1 sibling, 0 replies; 24+ messages in thread
From: Martin Josefsson @ 2004-12-05 17:01 UTC (permalink / raw)
  To: Lennert Buytenhek
  Cc: Scott Feldman, jamal, Robert Olsson, P, mellia, e1000-devel,
	Jorge Manuel Finochietto, Giulio Galante, netdev

On Sun, 5 Dec 2004, Martin Josefsson wrote:

> I removed the delayed TDT updating and gave it a go again (this is scott +
> prefetching):
>
> 60      1486193
> 64      1267639
> 68      1259682

Yet another mail, I hope you are using a NAPI-enabled MUA :)

This time I tried vanilla + prefetch and it gave pretty nice performance
as well:

60      1308047
64      1076044
68      1079377
72      1058993
76      1055708
80      1025659
84      1024692
88      1024236
92      1024510
96      1012853
100     1007925
104     976500
108     947061
112     919169
116     892804
120     868084
124     844609
128     822381

Large gap between 60 and 64byte, maybe the prefetching only prefetches
32bytes at a time?

As a reference: here's a completely vanilla e1000 driver:

60      860931
64      772949
68      754738
72      754200
76      756093
80      756398
84      742111
88      738120
92      740426
96      739720
100     722322
104     729287
108     719312
112     723171
116     705551
120     704843
124     704622
128     665863

/Martin

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)
  2004-12-05 16:48                         ` Martin Josefsson
  2004-12-05 17:01                           ` Martin Josefsson
@ 2004-12-05 17:58                           ` Lennert Buytenhek
  1 sibling, 0 replies; 24+ messages in thread
From: Lennert Buytenhek @ 2004-12-05 17:58 UTC (permalink / raw)
  To: Martin Josefsson
  Cc: Scott Feldman, jamal, Robert Olsson, P, mellia, e1000-devel,
	Jorge Manuel Finochietto, Giulio Galante, netdev

On Sun, Dec 05, 2004 at 05:48:34PM +0100, Martin Josefsson wrote:

> I tried using both ports on the 82546GB nic.
> 
>         delay        nodelay
> 1CPU    1.95 Mpps    1.76 Mpps
> 2CPU    1.60 Mpps    1.44 Mpps

I get:

	delay		nodelay
1CPU	1837356		1837330
2CPU	2035060		1947424

So in your case using 2 CPUs degrades performance, in my case it
increases it.  And TDT delaying/coalescing only improves performance
when using 2 CPUs, and even then only slightly (and only for <= 62B
packets.)


--L

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)
  2004-12-05 15:42                       ` Martin Josefsson
  2004-12-05 16:48                         ` Martin Josefsson
@ 2004-12-05 17:44                         ` Lennert Buytenhek
  2004-12-05 17:51                           ` Lennert Buytenhek
  2004-12-08 23:36                         ` Ray Lehtiniemi
  2 siblings, 1 reply; 24+ messages in thread
From: Lennert Buytenhek @ 2004-12-05 17:44 UTC (permalink / raw)
  To: Martin Josefsson
  Cc: Scott Feldman, jamal, Robert Olsson, P, mellia, e1000-devel,
	Jorge Manuel Finochietto, Giulio Galante, netdev

On Sun, Dec 05, 2004 at 04:42:34PM +0100, Martin Josefsson wrote:

> The delayed TDT updating was a test and currently it delays the first tx'd
> packet after a timerrun 1ms.
> 
> Would be interesting to see what other people get with this thing.
> Lennert?

I took Scott's notxints patch, added the prefetch bits and moved the
TDT updating to e1000_clean_tx as you did.

Slightly better than before, but not much:

60      1070157
61      1066610
62      1062088
63      991447
64      991546
65      991537
66      991449
67      990857
68      989882
69      991347

Regular TDT updating:

60      1037469
61      1038425
62      1037393
63      993143
64      992156
65      993137
66      992203
67      992165
68      992185
69      988249


--L

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)
  2004-12-05 17:44                         ` Lennert Buytenhek
@ 2004-12-05 17:51                           ` Lennert Buytenhek
  2004-12-05 17:54                             ` Martin Josefsson
  0 siblings, 1 reply; 24+ messages in thread
From: Lennert Buytenhek @ 2004-12-05 17:51 UTC (permalink / raw)
  To: Martin Josefsson
  Cc: Scott Feldman, jamal, Robert Olsson, P, mellia, e1000-devel,
	Jorge Manuel Finochietto, Giulio Galante, netdev

On Sun, Dec 05, 2004 at 06:44:01PM +0100, Lennert Buytenhek wrote:
> On Sun, Dec 05, 2004 at 04:42:34PM +0100, Martin Josefsson wrote:
> 
> > The delayed TDT updating was a test and currently it delays the first tx'd
> > packet after a timerrun 1ms.
> > 
> > Would be interesting to see what other people get with this thing.
> > Lennert?
> 
> I took Scott's notxints patch, added the prefetch bits and moved the
> TDT updating to e1000_clean_tx as you did.
> 
> Slightly better than before, but not much:

I've tested all packet sizes now, and delayed TDT updating once per jiffy
(instead of once per packet) indeed gives about 25kpps more on 60,61,62
byte packets, and is hardly worth it for bigger packets.


--L

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)
  2004-12-05 17:51                           ` Lennert Buytenhek
@ 2004-12-05 17:54                             ` Martin Josefsson
  2004-12-06 11:32                               ` 1.03Mpps on e1000 (was: " jamal
  0 siblings, 1 reply; 24+ messages in thread
From: Martin Josefsson @ 2004-12-05 17:54 UTC (permalink / raw)
  To: Lennert Buytenhek
  Cc: Scott Feldman, jamal, Robert Olsson, P, mellia, e1000-devel,
	Jorge Manuel Finochietto, Giulio Galante, netdev

On Sun, 5 Dec 2004, Lennert Buytenhek wrote:

> I've tested all packet sizes now, and delayed TDT updating once per jiffy
> (instead of once per packet) indeed gives about 25kpps more on 60,61,62
> byte packets, and is hardly worth it for bigger packets.

Maybe we can't see any real gains here now, I wonder if it has any effect
if you have lots of nics on the same bus. I mean, in theory it saves a
whole lot of traffic on the bus.

/Martin

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: Transmission limit)
  2004-12-05 17:54                             ` Martin Josefsson
@ 2004-12-06 11:32                               ` jamal
  2004-12-06 12:11                                 ` Lennert Buytenhek
  0 siblings, 1 reply; 24+ messages in thread
From: jamal @ 2004-12-06 11:32 UTC (permalink / raw)
  To: Martin Josefsson
  Cc: Lennert Buytenhek, Scott Feldman, Robert Olsson, P, mellia,
	e1000-devel, Jorge Manuel Finochietto, Giulio Galante, netdev

On Sun, 2004-12-05 at 12:54, Martin Josefsson wrote:
> On Sun, 5 Dec 2004, Lennert Buytenhek wrote:
> 
> > I've tested all packet sizes now, and delayed TDT updating once per jiffy
> > (instead of once per packet) indeed gives about 25kpps more on 60,61,62
> > byte packets, and is hardly worth it for bigger packets.
> 
> Maybe we can't see any real gains here now, I wonder if it has any effect
> if you have lots of nics on the same bus. I mean, in theory it saves a
> whole lot of traffic on the bus.
> 

This sounds like really exciting stuff happening here over the weekend.
Scott, you had to leave Intel before giving us this tip? ;-> 

Someone correct me if i am wrong - but does it appear as if all these
changes are only useful on PCI but not PCI-X?

cheers,
jamal



-------------------------------------------------------
SF email is sponsored by - The IT Product Guide
Read honest & candid reviews on hundreds of IT Products from real users.
Discover which products truly live up to the hype. Start reading now. 
http://productguide.itmanagersjournal.com/

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: Transmission limit)
  2004-12-06 11:32                               ` 1.03Mpps on e1000 (was: " jamal
@ 2004-12-06 12:11                                 ` Lennert Buytenhek
  2004-12-06 12:20                                   ` jamal
  0 siblings, 1 reply; 24+ messages in thread
From: Lennert Buytenhek @ 2004-12-06 12:11 UTC (permalink / raw)
  To: jamal
  Cc: Martin Josefsson, Scott Feldman, Robert Olsson, P, mellia,
	e1000-devel, Jorge Manuel Finochietto, Giulio Galante, netdev

On Mon, Dec 06, 2004 at 06:32:37AM -0500, jamal wrote:

> Someone correct me if i am wrong - but does it appear as if all these
> changes are only useful on PCI but not PCI-X?

They are useful on PCI-X as well as regular PCI.  On my 64/100 NIC I
get ~620kpps on 2.6.9, ~1Mpps with 2.6.9 plus tx rework plus TXDMAC=0.

Martin gets the ~1Mpps number with just the tx rework, and even more
with TXDMAC=0 added in as well.


--L


-------------------------------------------------------
SF email is sponsored by - The IT Product Guide
Read honest & candid reviews on hundreds of IT Products from real users.
Discover which products truly live up to the hype. Start reading now. 
http://productguide.itmanagersjournal.com/

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: Transmission limit)
  2004-12-06 12:11                                 ` Lennert Buytenhek
@ 2004-12-06 12:20                                   ` jamal
  2004-12-06 12:23                                     ` Lennert Buytenhek
  0 siblings, 1 reply; 24+ messages in thread
From: jamal @ 2004-12-06 12:20 UTC (permalink / raw)
  To: Lennert Buytenhek
  Cc: Martin Josefsson, Scott Feldman, Robert Olsson, P, mellia,
	e1000-devel, Jorge Manuel Finochietto, Giulio Galante, netdev

On Mon, 2004-12-06 at 07:11, Lennert Buytenhek wrote:
> On Mon, Dec 06, 2004 at 06:32:37AM -0500, jamal wrote:
> 
> > Someone correct me if i am wrong - but does it appear as if all these
> > changes are only useful on PCI but not PCI-X?
> 
> They are useful on PCI-X as well as regular PCI.  On my 64/100 NIC I
> get ~620kpps on 2.6.9, ~1Mpps with 2.6.9 plus tx rework plus TXDMAC=0.
> 
> Martin gets the ~1Mpps number with just the tx rework, and even more
> with TXDMAC=0 added in as well.

Right, but so far when I scan the results all I see is PCI, not PCI-X.
Which of your (or Martin's) boards has PCI-X?

cheers,
jamal



-------------------------------------------------------
SF email is sponsored by - The IT Product Guide
Read honest & candid reviews on hundreds of IT Products from real users.
Discover which products truly live up to the hype. Start reading now. 
http://productguide.itmanagersjournal.com/

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: Transmission limit)
  2004-12-06 12:20                                   ` jamal
@ 2004-12-06 12:23                                     ` Lennert Buytenhek
  2004-12-06 12:30                                       ` Martin Josefsson
  0 siblings, 1 reply; 24+ messages in thread
From: Lennert Buytenhek @ 2004-12-06 12:23 UTC (permalink / raw)
  To: jamal
  Cc: Martin Josefsson, Scott Feldman, Robert Olsson, P, mellia,
	e1000-devel, Jorge Manuel Finochietto, Giulio Galante, netdev

On Mon, Dec 06, 2004 at 07:20:43AM -0500, jamal wrote:

> > > Someone correct me if i am wrong - but does it appear as if all these
> > > changes are only useful on PCI but not PCI-X?
> > 
> > They are useful on PCI-X as well as regular PCI.  On my 64/100 NIC I
> > get ~620kpps on 2.6.9, ~1Mpps with 2.6.9 plus tx rework plus TXDMAC=0.
> > 
> > Martin gets the ~1Mpps number with just the tx rework, and even more
> > with TXDMAC=0 added in as well.
> 
> Right, but so far when i scan the results all i see is PCI not PCI-X.
> Which of your (or Martins) boards has PCI-X?

I've tested 32/33 PCI, 32/66 PCI, and 64/100 PCI-X.  I _think_ Martin
was running at 64/133 PCI-X.


--L


-------------------------------------------------------
SF email is sponsored by - The IT Product Guide
Read honest & candid reviews on hundreds of IT Products from real users.
Discover which products truly live up to the hype. Start reading now. 
http://productguide.itmanagersjournal.com/

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: Transmission limit)
  2004-12-06 12:23                                     ` Lennert Buytenhek
@ 2004-12-06 12:30                                       ` Martin Josefsson
  2004-12-06 13:11                                         ` jamal
  0 siblings, 1 reply; 24+ messages in thread
From: Martin Josefsson @ 2004-12-06 12:30 UTC (permalink / raw)
  To: Lennert Buytenhek
  Cc: jamal, Scott Feldman, Robert Olsson, P, mellia, e1000-devel,
	Jorge Manuel Finochietto, Giulio Galante, netdev

On Mon, 6 Dec 2004, Lennert Buytenhek wrote:

> > Right, but so far when i scan the results all i see is PCI not PCI-X.
> > Which of your (or Martins) boards has PCI-X?
>
> I've tested 32/33 PCI, 32/66 PCI, and 64/100 PCI-X.  I _think_ Martin
> was running at 64/133 PCI-X.

I don't have any motherboards with PCI-X so no :)
I'm running the 82546GB (dualport) at 64/66 and the 82540EM (desktop
adapter) at 32/66, both are able to send at wirespeed.

/Martin


-------------------------------------------------------
SF email is sponsored by - The IT Product Guide
Read honest & candid reviews on hundreds of IT Products from real users.
Discover which products truly live up to the hype. Start reading now. 
http://productguide.itmanagersjournal.com/

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: Transmission limit)
  2004-12-06 12:30                                       ` Martin Josefsson
@ 2004-12-06 13:11                                         ` jamal
       [not found]                                           ` <20041206132907.GA13411@xi.wantstofly.org>
  0 siblings, 1 reply; 24+ messages in thread
From: jamal @ 2004-12-06 13:11 UTC (permalink / raw)
  To: Martin Josefsson
  Cc: Lennert Buytenhek, Scott Feldman, Robert Olsson, P, mellia,
	e1000-devel, Jorge Manuel Finochietto, Giulio Galante, netdev

Hopefully someone will beat me to testing to see if our forwarding
capacity now goes up with this new recipe.

cheers,
jamal

On Mon, 2004-12-06 at 07:30, Martin Josefsson wrote:
> On Mon, 6 Dec 2004, Lennert Buytenhek wrote:
> 
> > > Right, but so far when i scan the results all i see is PCI not PCI-X.
> > > Which of your (or Martins) boards has PCI-X?
> >
> > I've tested 32/33 PCI, 32/66 PCI, and 64/100 PCI-X.  I _think_ Martin
> > was running at 64/133 PCI-X.
> 
> I don't have any motherboards with PCI-X so no :)
> I'm running the 82546GB (dualport) at 64/66 and the 82540EM (desktop
> adapter) at 32/66, both are able to send at wirespeed.
> 
> /Martin
> 
> 



-------------------------------------------------------
SF email is sponsored by - The IT Product Guide
Read honest & candid reviews on hundreds of IT Products from real users.
Discover which products truly live up to the hype. Start reading now. 
http://productguide.itmanagersjournal.com/

^ permalink raw reply	[flat|nested] 24+ messages in thread

[parent not found: <20041206132907.GA13411@xi.wantstofly.org>]

[parent not found: <16820.37049.396306.295878@robur.slu.se>]

* Re: 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)
       [not found]                                             ` <16820.37049.396306.295878@robur.slu.se>
@ 2004-12-06 17:32                                               ` P
  0 siblings, 0 replies; 24+ messages in thread
From: P @ 2004-12-06 17:32 UTC (permalink / raw)
  To: Robert Olsson
  Cc: Lennert Buytenhek, jamal, Martin Josefsson, Scott Feldman, mellia,
	Jorge Manuel Finochietto, Giulio Galante, netdev

Robert Olsson wrote:
> Lennert Buytenhek writes:
>  > On Mon, Dec 06, 2004 at 08:11:02AM -0500, jamal wrote:
>  > 
>  > > Hopefully someone will beat me to testing to see if our forwarding
>  > > capacity now goes up with this new recipe.
> 
> 
> A breakthrough: we can now send small packets at wire speed, which will make
> development and testing much easier...

It surely will!!

Just to recap, 2 people have been able to tx @ wire speed.
The original poster was able to receive at wire speed,
but could only TX at about 50% wire speed.

It would be really cool if we could combine this
to bridge @ wire speed.

-- 
Pádraig Brady - http://www.pixelbeat.org
--

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)
  2004-12-05 15:42                       ` Martin Josefsson
  2004-12-05 16:48                         ` Martin Josefsson
  2004-12-05 17:44                         ` Lennert Buytenhek
@ 2004-12-08 23:36                         ` Ray Lehtiniemi
  2 siblings, 0 replies; 24+ messages in thread
From: Ray Lehtiniemi @ 2004-12-08 23:36 UTC (permalink / raw)
  To: Martin Josefsson
  Cc: Lennert Buytenhek, Scott Feldman, jamal, Robert Olsson, P, mellia,
	e1000-devel, Jorge Manuel Finochietto, Giulio Galante, netdev


hello martin


On Sun, Dec 05, 2004 at 04:42:34PM +0100, Martin Josefsson wrote:
> 
> Here's the patch, not much more tested (it still gives some transmit
> timeouts since it's Scott's patch + prefetching and delayed TDT updating).
> And it's not cleaned up, but hey, that's development :)
> 
> The delayed TDT updating was a test, and currently it delays the first tx'd
> packet after a timer run by 1ms.
> 
> Would be interesting to see what other people get with this thing.
> Lennert?

well, i'm brand new to gig ethernet, but i have access to some nice
hardware right now, so i decided to give your patch a try.

this is the average tx pps of 10 pktgen runs for each packet size:
	
60	1187589.1
64	 601805.4
68	1115029.3
72	 593096.4
76	1097761.1
80	 587125.4
84	1098045.2
88	 588159.1
92	1072124.8
96	 582510.3
100	1008056.8
104	 577898.0
108	 946974.0
112	 573719.2
116	 892871.0
120	 573072.5
124	 844608.3
128	 563685.7


any idea why the packet rates are cut in half for every other line?

pktgen is running with eth0 bound to CPU0 on this box:

  NexGate NSA 2040G
  Dual Xeon 3.06 GHz, HT enabled
  1 GB PC3200 DDR SDRAM
  Dual 82544EI
  - on PCI-X 64 bit 133 MHz bus
  - behind P64H2 bridge
  - on hub channel D of E7501 chipset



thanks

-- 
----------------------------------------------------------------------
     Ray L   <rayl@mail.com>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)
  2004-12-05 15:03                     ` Martin Josefsson
  2004-12-05 15:15                       ` Lennert Buytenhek
  2004-12-05 15:42                       ` Martin Josefsson
@ 2004-12-05 21:12                       ` Scott Feldman
  2004-12-05 21:25                         ` Lennert Buytenhek
  2 siblings, 1 reply; 24+ messages in thread
From: Scott Feldman @ 2004-12-05 21:12 UTC (permalink / raw)
  To: Martin Josefsson
  Cc: Lennert Buytenhek, jamal, Robert Olsson, P, mellia, e1000-devel,
	Jorge Manuel Finochietto, Giulio Galante, netdev

On Sun, 2004-12-05 at 07:03, Martin Josefsson wrote:
> BUT if I use the above + prefetching I get this:
> 
> 60      1483890

Ok, proof that we can get to 1.4Mpps!  

That's the good news.

The bad news is prefetching is potentially buggy as pointed out in the
freebsd note.  Buggy as in the controller may hang.  Sorry, I don't have
details on what conditions are necessary to cause a hang.

Would Martin or Lennert run these tests for a longer duration so we can
get some data, maybe adding in Rx?  It could be that, with the Tx
interrupts and descriptor write-backs removed, prefetching is ok.  I don't
know.  Intel?

Also, wouldn't it be great if someone wrote a document capturing all of
the accumulated knowledge for future generations?

-scott

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)
  2004-12-05 21:12                       ` Scott Feldman
@ 2004-12-05 21:25                         ` Lennert Buytenhek
  0 siblings, 0 replies; 24+ messages in thread
From: Lennert Buytenhek @ 2004-12-05 21:25 UTC (permalink / raw)
  To: Scott Feldman
  Cc: Martin Josefsson, jamal, Robert Olsson, P, mellia, e1000-devel,
	Jorge Manuel Finochietto, Giulio Galante, netdev

On Sun, Dec 05, 2004 at 01:12:22PM -0800, Scott Feldman wrote:

> Would Martin or Lennert run these tests for a longer duration so we can
> get some data, maybe adding in Rx?  It could be that, with the Tx
> interrupts and descriptor write-backs removed, prefetching is ok.  I don't
> know.  Intel?

What your patch does is (correct me if I'm wrong):
- Masking TXDW, effectively preventing it from delivering TXdone ints.
- Not setting E1000_TXD_CMD_IDE in the TXD command field, which causes
  the chip to 'ignore the TIDV' register, which is the 'TX Interrupt
  Delay Value'.  What exactly does this do?
- Not setting the "Report Packet Sent"/"Report Status" bits in the TXD
  command field.  Is this the equivalent of the TXdone interrupt?

Just exactly which bit avoids the descriptor writeback?

I'm also a bit worried that only freeing packets 1ms later will mess up
socket accounting and such.  Any ideas on that?


> Also, wouldn't it be great if someone wrote a document capturing all of
> the accumulated knowledge for future generations?

I'll volunteer for that.


--L

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2004-12-08 23:36 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-12-05 18:25 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit) Manfred Spraul
  -- strict thread matches above, loose matches on Subject: below --
2004-12-06 19:10 Robert Olsson
2004-12-06 22:29 ` Martin Josefsson
2004-12-07  3:20   ` jamal
2004-11-26 20:01 [E1000-devel] Transmission limit jamal
2004-11-29 13:09 ` Robert Olsson
2004-11-30 13:31   ` jamal
2004-11-30 13:46     ` Lennert Buytenhek
2004-11-30 14:25       ` jamal
2004-12-01  0:11         ` Lennert Buytenhek
2004-12-01  1:09           ` Scott Feldman
2004-12-01 18:29             ` Lennert Buytenhek
2004-12-01 21:35               ` Lennert Buytenhek
2004-12-02  6:13                 ` Scott Feldman
2004-12-05 14:50                   ` 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit) Lennert Buytenhek
2004-12-05 15:03                     ` Martin Josefsson
2004-12-05 15:15                       ` Lennert Buytenhek
2004-12-05 15:19                         ` Martin Josefsson
2004-12-05 15:30                           ` Martin Josefsson
2004-12-05 17:00                             ` Lennert Buytenhek
2004-12-05 17:11                               ` Martin Josefsson
2004-12-05 17:38                                 ` Martin Josefsson
2004-12-05 18:14                                   ` Lennert Buytenhek
2004-12-05 15:42                       ` Martin Josefsson
2004-12-05 16:48                         ` Martin Josefsson
2004-12-05 17:01                           ` Martin Josefsson
2004-12-05 17:58                           ` Lennert Buytenhek
2004-12-05 17:44                         ` Lennert Buytenhek
2004-12-05 17:51                           ` Lennert Buytenhek
2004-12-05 17:54                             ` Martin Josefsson
2004-12-06 11:32                               ` 1.03Mpps on e1000 (was: " jamal
2004-12-06 12:11                                 ` Lennert Buytenhek
2004-12-06 12:20                                   ` jamal
2004-12-06 12:23                                     ` Lennert Buytenhek
2004-12-06 12:30                                       ` Martin Josefsson
2004-12-06 13:11                                         ` jamal
     [not found]                                           ` <20041206132907.GA13411@xi.wantstofly.org>
     [not found]                                             ` <16820.37049.396306.295878@robur.slu.se>
2004-12-06 17:32                                               ` 1.03Mpps on e1000 (was: Re: [E1000-devel] " P
2004-12-08 23:36                         ` Ray Lehtiniemi
2004-12-05 21:12                       ` Scott Feldman
2004-12-05 21:25                         ` Lennert Buytenhek

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).