netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* Re: 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)
@ 2004-12-05 18:25 Manfred Spraul
  0 siblings, 0 replies; 24+ messages in thread
From: Manfred Spraul @ 2004-12-05 18:25 UTC (permalink / raw)
  To: Lennert Buytenhek, Netdev, Martin Josefsson

[-- Attachment #1: Type: text/plain, Size: 1604 bytes --]

Lennert wrote:

> A dirty way, yes ;-)  Open up e1000_osdep.h and do:
>
> -#define E1000_READ_REG(a, reg) ( \
> -    readl((a)->hw_addr + \
> -        (((a)->mac_type >= e1000_82543) ? E1000_##reg : E1000_82542_##reg)))
> +#define E1000_READ_REG(a, reg) ({ \
> +    unsigned long s, e, d, v; \
> +\
> +    (a)->mmio_reads++; \
> +    rdtsc(s, d); \
> +    v = readl((a)->hw_addr + \
> +        (((a)->mac_type >= e1000_82543) ? E1000_##reg : E1000_82542_##reg)); \
> +    rdtsc(e, d); \
> +    e -= s; \
> +    printk(KERN_INFO "e1000: MMIO read took %ld clocks\n", e); \
> +    printk(KERN_INFO "e1000: in process %d(%s)\n", current->pid, current->comm); \
> +    dump_stack(); \
> +    v; \
> +})

Too dirty: rdtsc is not serializing, thus my Opteron happily reorders 
the read and the rdtsc and reports 9 cycles.
Attached is a longer patch that I usually use for microbenchmarks. I get 
around 506 cycles with it for an Opteron 2 GHz to the nForce 250 Gb nic 
(i.e. integrated nic in the chipset, just one HT hop):

Results - zero - shift 0
 40: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 :0 0 63 0 0 0 0 0 0 0 0 0 0 0 0 0
1e0: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 :0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
Overflows: 0.
Sum: 100
 >>>>>>>>>>> benchmark overhead: 82 cycles
** reading register e08920b4
Results - readl - shift 0
240: 0 0 b 0 0 0 0 0 0 0 0 0 32 0 1 1 :0 0 0 0 0 0 a 0 0 0 0 0 0 0 0 0
260: 1a 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 :0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
300: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 :0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
Overflows: 0.
Sum: 100
 >>>>>>>>>> total: 0x248, i.e. net 506 cycles.

--
    Manfred

[-- Attachment #2: patch-perftest-forcedeth --]
[-- Type: text/plain, Size: 2910 bytes --]

--- 2.6/drivers/net/forcedeth.c	2004-12-05 16:21:28.000000000 +0100
+++ build-2.6/drivers/net/forcedeth.c	2004-12-05 19:18:24.000000000 +0100
@@ -1500,6 +1500,131 @@
 	enable_irq(dev->irq);
 }
 
+int p_shift = 0;
+
+#define STAT_TABLELEN		16384
+static unsigned long totals[STAT_TABLELEN];
+static unsigned int overflows;
+
+static unsigned long long stime;
+static void start_measure(void)
+{
+	 __asm__ __volatile__ (
+		".align 64\n\t"
+	 	"pushal\n\t"
+		"cpuid\n\t"
+		"popal\n\t"
+		"rdtsc\n\t"
+		"movl %%eax,(%0)\n\t"
+		"movl %%edx,4(%0)\n\t"
+		: /* no output */
+		: "c"(&stime)
+		: "eax", "edx", "memory" );
+}
+
+static void end_measure(void)
+{
+static unsigned long long etime;
+	__asm__ __volatile__ (
+		"pushal\n\t"
+		"cpuid\n\t"
+		"popal\n\t"
+		"rdtsc\n\t"
+		"movl %%eax,(%0)\n\t"
+		"movl %%edx,4(%0)\n\t"
+		: /* no output */
+		: "c"(&etime)
+		: "eax", "edx", "memory" );
+	{
+		unsigned long time = (unsigned long)(etime-stime);
+		time >>= p_shift;
+		if(time < STAT_TABLELEN) {
+			totals[time]++;
+		} else {
+			overflows++;
+		}
+	}
+}
+
+static void clean_buf(void)
+{
+	memset(totals,0,sizeof(totals));
+	overflows = 0;
+}
+
+static void print_line(unsigned long* array)
+{
+	int i;
+	for(i=0;i<32;i++) {
+		if((i%32)==16)
+			printk(":");
+		printk("%lx ",array[i]); 
+	}
+}
+
+static void print_buf(char* caption)
+{
+	int i, other = 0;
+	printk("Results - %s - shift %d",
+		caption, p_shift);
+
+	for(i=0;i<STAT_TABLELEN;i+=32) {
+		int j;
+		int local = 0;
+		for(j=0;j<32;j++)
+			local += totals[i+j];
+
+		if(local) {
+			printk("\n%3x: ",i);
+			print_line(&totals[i]);
+			other += local;
+		}
+	}
+	printk("\nOverflows: %d.\n",
+		overflows);
+	printk("Sum: %d\n",other+overflows);
+}
+
+static void return_immediately(void *dummy)
+{
+}
+
+static void bench_readl(u8 __iomem *base)
+{ 
+	int i;
+
+	/* empty test measurement: */
+	printk("******** kernel cpu benchmark started **********\n");
+	clean_buf();
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule_timeout(200);
+	for(i=0;i<100;i++) {
+		start_measure();
+		return_immediately(NULL);
+		return_immediately(NULL);
+		return_immediately(NULL);
+		return_immediately(NULL);
+		end_measure();
+	}
+	print_buf("zero");
+	clean_buf();
+
+	printk("** reading register %p\n", base);
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule_timeout(200);
+	for(i=0;i<100;i++) {
+		start_measure();
+		return_immediately(NULL);
+		return_immediately(NULL);
+		readl(base);
+		return_immediately(NULL);
+		return_immediately(NULL);
+		end_measure();
+	}
+	print_buf("readl");
+	clean_buf();
+}
+
 static int nv_open(struct net_device *dev)
 {
 	struct fe_priv *np = get_nvpriv(dev);
@@ -1635,6 +1760,8 @@
 		mod_timer(&np->oom_kick, jiffies + OOM_REFILL);
 	spin_unlock_irq(&np->lock);
 
+	bench_readl(base + NvRegMulticastAddrB);
+	bench_readl(base + NvRegIrqStatus);
 	return 0;
 out_drain:
 	drain_ring(dev);

^ permalink raw reply	[flat|nested] 24+ messages in thread
* Re: 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit)
@ 2004-12-06 19:10 Robert Olsson
  2004-12-06 22:29 ` Martin Josefsson
  0 siblings, 1 reply; 24+ messages in thread
From: Robert Olsson @ 2004-12-06 19:10 UTC (permalink / raw)
  To: Lennert Buytenhek
  Cc: jamal, Martin Josefsson, Scott Feldman, Robert Olsson, P, mellia,
	Jorge Manuel Finochietto, Giulio Galante, netdev



Lennert Buytenhek writes:
 > On Mon, Dec 06, 2004 at 08:11:02AM -0500, jamal wrote:
 > 
 > > Hopefully someone will beat me to testing to see if our forwarding
 > > capacity now goes up with this new recipe.


Yes, a breakthrough: we can now send small packets at GigE wire speed, which
will make development and testing much easier... A first router test with
our setup is below. Opteron 1.6 GHz, SMP kernel, using 1 CPU. 82546 EB +
82546 GB and PCI-X 100 MHz & 133 MHz.

pktgen performance is measured on the router box. Remember that Scott's patch
uses 4096 TX buffers, and with pktgen we use clone_skb. So with real skb's we
will probably see lower performance because of this. This may explain why, in
the results below, routing performance doesn't follow pktgen performance.

T-PUT is routing performance. Also pktgen pure TX performance is given
this on the router.


Input rate for the routing test is 2*765 kpps for all three runs.
Packets input to eth0 are routed to eth1, and eth2 to eth3.


Vanilla. T-PUT 657 kpps. pktgen TX perf 818 kpps
-------------------------------------------------
Iface   MTU Met  RX-OK RX-ERR RX-DRP RX-OVR  TX-OK TX-ERR TX-DRP TX-OVR Flags
eth0   1500   0 4312682 8253078 8253078 5687318      5      0      0      0 BRU
eth1   1500   0      1      0      0      0 4312199      0      0      0 BRU
eth2   1500   0 4311018 8386504 8386504 5688982      5      0      0      0 BRU
eth3   1500   0      1      0      0      0 4310791      0      0      0 BRU

           CPU0       
  0:     116665    IO-APIC-edge  timer
  1:        208    IO-APIC-edge  i8042
  8:          0    IO-APIC-edge  rtc
  9:          0   IO-APIC-level  acpi
 14:      21943    IO-APIC-edge  ide0
 26:         66   IO-APIC-level  eth0
 27:      58638   IO-APIC-level  eth1
 28:         68   IO-APIC-level  eth2
 29:      58497   IO-APIC-level  eth3
NMI:          0 
LOC:     116605 
ERR:          0
MIS:          0

e1000-TX-prefetch+scott tx patch. T-PUT 540 kpps. pktgen TX perf 1.48 Mpps
--------------------------------------------------------------------------

Iface   MTU Met  RX-OK RX-ERR RX-DRP RX-OVR  TX-OK TX-ERR TX-DRP TX-OVR Flags
eth0   1500   0 3533795 8618637 8618637 6466205      5      0      0      0 BRU
eth1   1500   0      3      0      0      0 3533803      0      0      0 BRU
eth2   1500   0 3535804 8697149 8697149 6464196      5      0      0      0 BRU
eth3   1500   0      1      0      0      0 3535321      0      0      0 BRU

           CPU0       
  0:    1372774    IO-APIC-edge  timer
  1:        663    IO-APIC-edge  i8042
  8:          0    IO-APIC-edge  rtc
  9:          0   IO-APIC-level  acpi
 14:      22631    IO-APIC-edge  ide0
 26:        686   IO-APIC-level  eth0
 27:        693   IO-APIC-level  eth1
 28:        687   IO-APIC-level  eth2
 29:        682   IO-APIC-level  eth3
NMI:          0 
LOC:    1372804 
ERR:          0
MIS:          0


e1000-TX-prefetch. T-PUT 657 kpps. pktgen TX perf 1.15 Mpps
-----------------------------------------------------------
Kernel Interface table
Iface   MTU Met  RX-OK RX-ERR RX-DRP RX-OVR  TX-OK TX-ERR TX-DRP TX-OVR Flags
eth0   1500   0 4311848 8288270 8288270 5688152      5      0      0      0 BRU
eth1   1500   0      4      0      0      0 4311388      0      0      0 BRU
eth2   1500   0 4309082 8400892 8400892 5690918      5      0      0      0 BRU
eth3   1500   0      1      0      0      0 4308271      0      0      0 BRU
lo    16436   0      0      0      0      0      0      0      0      0 LRU
           CPU0       
  0:     224310    IO-APIC-edge  timer
  1:        250    IO-APIC-edge  i8042
  8:          0    IO-APIC-edge  rtc
  9:          0   IO-APIC-level  acpi
 14:      22055    IO-APIC-edge  ide0
 26:        122   IO-APIC-level  eth0
 27:      58001   IO-APIC-level  eth1
 28:        123   IO-APIC-level  eth2
 29:      57681   IO-APIC-level  eth3
NMI:          0 
LOC:     224251 
ERR:          0
MIS:          0


						--ro

^ permalink raw reply	[flat|nested] 24+ messages in thread
* Re: [E1000-devel] Transmission limit
@ 2004-11-26 20:01 jamal
  2004-11-29 13:09 ` Robert Olsson
  0 siblings, 1 reply; 24+ messages in thread
From: jamal @ 2004-11-26 20:01 UTC (permalink / raw)
  To: P
  Cc: mellia, Robert Olsson, e1000-devel, Jorge Manuel Finochietto,
	Giulio Galante, netdev

On Fri, 2004-11-26 at 11:57, P@draigBrady.com wrote:

> > skb are de/allocated using standard kernel memory management. Still,
> > without touching the packet, we can receive 100% of them.
> 
> I was doing some playing in this area this week.
> I changed the alloc per packet to a "realloc" per packet.
> I.E. the e1000 driver owns the packets. I noticed a
> very nice speedup from this. In summary a userspace
> app was able to receive 2x250Kpps without this patch,
> and 2x490Kpps with it. The patch is here:
> http://www.pixelbeat.org/tmp/linux-2.4.20-pb.diff

A very angry gorilla on that url ;->

> Note 99% of that patch is just upgrading from
> e1000 V4.4.12-k1 to V5.2.52 (which doesn't affect
> the performance).
> 
> Wow, I just read your excellent paper, and noticed
> you used this approach also :-)
> 

Have to read the paper - when Robert was last visiting here, we did some
tests, and packet recycling is not very valuable as far as SMP is
concerned (given that packets can be alloced on one CPU and freed on
another). There is a clear win on single CPU machines.

> >> Small packet performance is dependent on low latency. Higher bus speed
> >> gives shorter latency, but on higher speed buses there also tend to be
> >> bridges that add latency.
> > 
> > That's true. We suspect that the limit is due to bus latency. But still,
> > we are surprised, since the bus allows us to receive 100%, but to transmit
> > up to ~50%. Moreover, the raw aggregate bandwidth of the buffer is _far_
> > larger (133MHz*64bit ~ 8gbit/s).
> 
> Well there definitely could be an asymmetry wrt bus latency.
> Saying that though, in my tests with much the same hardware
> as you, I could only get 800Kpps into the driver.

Yep, that's about the number I was seeing as well in both pieces of
hardware I used in the tests in my SUCON presentation.

>  I'll
> check this again when I have time. Note also that as I understand
> it the PCI control bus is running at a much lower rate,
> and that is used to arbitrate the bus for each packet.
> I.E. the 8Gb/s number above is not the bottleneck.
> 
> An lspci -vvv for your ethernet devices would be useful
> Also to view the burst size: setpci -d 8086:1010 e6.b
> (where 8086:1010 is the ethernet device PCI id).
> 

Can you talk a little about this PCI control bus? I have heard you
mention it before ... I am trying to visualize where it fits in the PCI
system.

cheers,
jamal

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2004-12-08 23:36 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-12-05 18:25 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit) Manfred Spraul
  -- strict thread matches above, loose matches on Subject: below --
2004-12-06 19:10 Robert Olsson
2004-12-06 22:29 ` Martin Josefsson
2004-12-07  3:20   ` jamal
2004-11-26 20:01 [E1000-devel] Transmission limit jamal
2004-11-29 13:09 ` Robert Olsson
2004-11-30 13:31   ` jamal
2004-11-30 13:46     ` Lennert Buytenhek
2004-11-30 14:25       ` jamal
2004-12-01  0:11         ` Lennert Buytenhek
2004-12-01  1:09           ` Scott Feldman
2004-12-01 18:29             ` Lennert Buytenhek
2004-12-01 21:35               ` Lennert Buytenhek
2004-12-02  6:13                 ` Scott Feldman
2004-12-05 14:50                   ` 1.03Mpps on e1000 (was: Re: [E1000-devel] Transmission limit) Lennert Buytenhek
2004-12-05 15:03                     ` Martin Josefsson
2004-12-05 15:15                       ` Lennert Buytenhek
2004-12-05 15:19                         ` Martin Josefsson
2004-12-05 15:30                           ` Martin Josefsson
2004-12-05 17:00                             ` Lennert Buytenhek
2004-12-05 17:11                               ` Martin Josefsson
2004-12-05 17:38                                 ` Martin Josefsson
2004-12-05 18:14                                   ` Lennert Buytenhek
2004-12-05 15:42                       ` Martin Josefsson
2004-12-05 16:48                         ` Martin Josefsson
2004-12-05 17:01                           ` Martin Josefsson
2004-12-05 17:58                           ` Lennert Buytenhek
2004-12-05 17:44                         ` Lennert Buytenhek
2004-12-05 17:51                           ` Lennert Buytenhek
2004-12-05 17:54                             ` Martin Josefsson
2004-12-06 11:32                               ` 1.03Mpps on e1000 (was: " jamal
2004-12-06 12:11                                 ` Lennert Buytenhek
2004-12-06 12:20                                   ` jamal
2004-12-06 12:23                                     ` Lennert Buytenhek
2004-12-06 12:30                                       ` Martin Josefsson
2004-12-06 13:11                                         ` jamal
     [not found]                                           ` <20041206132907.GA13411@xi.wantstofly.org>
     [not found]                                             ` <16820.37049.396306.295878@robur.slu.se>
2004-12-06 17:32                                               ` 1.03Mpps on e1000 (was: Re: [E1000-devel] " P
2004-12-08 23:36                         ` Ray Lehtiniemi
2004-12-05 21:12                       ` Scott Feldman
2004-12-05 21:25                         ` Lennert Buytenhek

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).