Embedded Linux development

Embedded Linux development
 help / color / mirror / Atom feed

* [PATCH 3/7] printk: introduce printk_unfiltered as an alias to printk
From: Marc Andre Tanner @ 2009-09-01 22:31 UTC (permalink / raw)
  To: linux-embedded; +Cc: mat
In-Reply-To: <1251844269-12394-1-git-send-email-mat@brain-dump.org>

The standard printk function will be wrapped by a macro.
However this doesn't work in all situations (for example
when the return value of printk is of interest). We
therefore provide a new function which is just an alias
to printk and therefore bypasses the macro.

Signed-off-by: Marc Andre Tanner <mat@brain-dump.org>
---
 include/linux/kernel.h |    5 +++++
 kernel/printk.c        |   24 ++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 0 deletions(-)

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index d6320a3..c2b3047 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -239,6 +239,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 	__attribute__ ((format (printf, 1, 0)));
 asmlinkage int printk(const char * fmt, ...)
 	__attribute__ ((format (printf, 1, 2))) __cold;
+asmlinkage int printk_unfiltered(const char *fmt, ...)
+	__attribute__ ((format (printf, 1, 2))) __cold;
 
 extern struct ratelimit_state printk_ratelimit_state;
 extern int printk_ratelimit(void);
@@ -265,6 +267,9 @@ static inline int vprintk(const char *s, va_list args) { return 0; }
 static inline int printk(const char *s, ...)
 	__attribute__ ((format (printf, 1, 2)));
 static inline int __cold printk(const char *s, ...) { return 0; }
+static inline int printk_unfiltered(const char *s, ...)
+	__attribute__ ((format (printf, 1, 2)));
+static inline int __cold printk_unfiltered(const char *s, ...) { return 0; }
 static inline int printk_ratelimit(void) { return 0; }
 static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \
 					  unsigned int interval_msec)	\
diff --git a/kernel/printk.c b/kernel/printk.c
index 5455d41..20379b5 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1310,6 +1310,11 @@ EXPORT_SYMBOL(printk_timed_ratelimit);
  *
  * This is printk().  It can be called from any context.  We want it to work.
  *
+ * Note that depending on the kernel configuration printk might be wrapped by
+ * a macro. In cases where it's important that the implementation is a function
+ * (for example when the return value of printk is of interest) printk_unfiltered
+ * which bypasses the macro should be used instead.
+ *
  * We try to grab the console_sem.  If we succeed, it's easy - we log the output and
  * call the console drivers.  If we fail to get the semaphore we place the output
  * into the log buffer and return.  The current holder of the console_sem will
@@ -1326,6 +1331,14 @@ EXPORT_SYMBOL(printk_timed_ratelimit);
  * See the vsnprintf() documentation for format string extensions over C99.
  */
 
+/*
+ * We need to #undef the printk macro from <linux/kernel.h> because
+ * it would otherwise conflict with the function implementation.
+ */
+#ifdef printk
+# undef printk
+#endif
+
 asmlinkage int printk(const char *fmt, ...)
 {
 	va_list args;
@@ -1338,4 +1351,15 @@ asmlinkage int printk(const char *fmt, ...)
 	return r;
 }
 EXPORT_SYMBOL(printk);
+ 
+/*
+ * Because printk might be wrapped by a macro which doesn't work in all
+ * circumstances (for example when the return value of printk is of
+ * interest) we make the functionality also available as a normal
+ * function.
+ */
+
+asmlinkage int printk_unfiltered(const char *fmt, ...)
+	__attribute__((alias("printk")));
+EXPORT_SYMBOL(printk_unfiltered);
 #endif
-- 
1.6.3.3

^ permalink raw reply related

* [PATCH 2/7] printk: move printk to the end of the file
From: Marc Andre Tanner @ 2009-09-01 22:31 UTC (permalink / raw)
  To: linux-embedded; +Cc: mat
In-Reply-To: <1251844269-12394-1-git-send-email-mat@brain-dump.org>

A later patch will #undef printk because the macro would otherwise
conflict with the function definition. Moving the printk function
to the end of the file makes sure that the macro is expanded within
the rest of the file.

Signed-off-by: Marc Andre Tanner <mat@brain-dump.org>
---
 kernel/printk.c |   72 ++++++++++++++++++++++++++++--------------------------
 1 files changed, 37 insertions(+), 35 deletions(-)

diff --git a/kernel/printk.c b/kernel/printk.c
index b4d97b5..5455d41 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -551,40 +551,6 @@ static int have_callable_console(void)
 	return 0;
 }
 
-/**
- * printk - print a kernel message
- * @fmt: format string
- *
- * This is printk().  It can be called from any context.  We want it to work.
- *
- * We try to grab the console_sem.  If we succeed, it's easy - we log the output and
- * call the console drivers.  If we fail to get the semaphore we place the output
- * into the log buffer and return.  The current holder of the console_sem will
- * notice the new output in release_console_sem() and will send it to the
- * consoles before releasing the semaphore.
- *
- * One effect of this deferred printing is that code which calls printk() and
- * then changes console_loglevel may break. This is because console_loglevel
- * is inspected when the actual printing occurs.
- *
- * See also:
- * printf(3)
- *
- * See the vsnprintf() documentation for format string extensions over C99.
- */
-
-asmlinkage int printk(const char *fmt, ...)
-{
-	va_list args;
-	int r;
-
-	va_start(args, fmt);
-	r = vprintk(fmt, args);
-	va_end(args);
-
-	return r;
-}
-
 /* cpu currently holding logbuf_lock */
 static volatile unsigned int printk_cpu = UINT_MAX;
 
@@ -770,7 +736,6 @@ out_restore_irqs:
 	preempt_enable();
 	return printed_len;
 }
-EXPORT_SYMBOL(printk);
 EXPORT_SYMBOL(vprintk);
 
 #else
@@ -1337,3 +1302,40 @@ bool printk_timed_ratelimit(unsigned long *caller_jiffies,
 }
 EXPORT_SYMBOL(printk_timed_ratelimit);
 #endif
+
+#ifdef CONFIG_PRINTK
+/**
+ * printk - print a kernel message
+ * @fmt: format string
+ *
+ * This is printk().  It can be called from any context.  We want it to work.
+ *
+ * We try to grab the console_sem.  If we succeed, it's easy - we log the output and
+ * call the console drivers.  If we fail to get the semaphore we place the output
+ * into the log buffer and return.  The current holder of the console_sem will
+ * notice the new output in release_console_sem() and will send it to the
+ * consoles before releasing the semaphore.
+ *
+ * One effect of this deferred printing is that code which calls printk() and
+ * then changes console_loglevel may break. This is because console_loglevel
+ * is inspected when the actual printing occurs.
+ *
+ * See also:
+ * printf(3)
+ *
+ * See the vsnprintf() documentation for format string extensions over C99.
+ */
+
+asmlinkage int printk(const char *fmt, ...)
+{
+	va_list args;
+	int r;
+
+	va_start(args, fmt);
+	r = vprintk(fmt, args);
+	va_end(args);
+
+	return r;
+}
+EXPORT_SYMBOL(printk);
+#endif
-- 
1.6.3.3

^ permalink raw reply related

* [PATCH 1/7] printk: introduce CONFIG_PRINTK_VERBOSITY
From: Marc Andre Tanner @ 2009-09-01 22:31 UTC (permalink / raw)
  To: linux-embedded; +Cc: mat
In-Reply-To: <1251844269-12394-1-git-send-email-mat@brain-dump.org>

Introduce a config option which allows to selectively compile out
printk messages based on a specified verbosity level.

Signed-off-by: Marc Andre Tanner <mat@brain-dump.org>
---
 init/Kconfig |   28 ++++++++++++++++++++++++++++
 1 files changed, 28 insertions(+), 0 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index 3f7e609..3618168 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -833,6 +833,34 @@ config PRINTK
 	  very difficult to diagnose system problems, saying N here is
 	  strongly discouraged.
 
+config PRINTK_VERBOSITY
+	int "Printk compile time verbosity"
+	depends on EMBEDDED && PRINTK
+	range 0 7
+	default 0
+	help
+
+	  Select the maximum printk verbosity level to be compiled into
+	  the kernel.
+
+ 	  Messages above the specified verbosity level are removed from
+ 	  the kernel at compile time. This reduces the kernel image size
+ 	  at the cost of a calmer kernel.
+
+ 	  Possible verbosity levels are:
+
+	   0  Disable this feature and compile all messages in.
+
+ 	   1  KERN_ALERT        /* action must be taken immediately  */
+ 	   2  KERN_CRIT         /* critical conditions               */
+ 	   3  KERN_ERR          /* error conditions                  */
+ 	   4  KERN_WARNING      /* warning conditions                */
+ 	   5  KERN_NOTICE       /* normal but significant condition  */
+ 	   6  KERN_INFO         /* informational                     */
+ 	   7  KERN_DEBUG        /* debug-level messages              */
+
+	  If unsure, just move on and leave this option alone.
+
 config BUG
 	bool "BUG() support" if EMBEDDED
 	default y
-- 
1.6.3.3

^ permalink raw reply related

* [RFC|PATCH] Compile time printk verbosity
From: Marc Andre Tanner @ 2009-09-01 22:31 UTC (permalink / raw)
  To: linux-embedded; +Cc: mat

This series adds a configuration option to selectively compile out
printk message strings based on a verbosity level.

This works by wrapping printk with a macro which evaluates to a 
constant if condition which the compiler will be able to optimize 
out.

However because printk might be wrapped by a macro it no longer has
a return value. This means that constructs like the following ones
don't work: 

   ((void)(SOME_RANDOM_DEBUG_FLAG && printk(...));

   some_random_variable = printk(...);

Therefore printk_unfiltered is introduced which is just an alias
to the standard printk function but not wrapped by a macro.

Patches 4-6 make existing kernel code aware of this fact.

The series was compile tested with make allyesconfig for x86 and 
arm (with a cross compiler) but I might have missed something.

All kinds of comments are welcome.

Marc Andre Tanner (7):
      printk: introduce CONFIG_PRINTK_VERBOSITY
      printk: move printk to the end of the file
      printk: introduce printk_unfiltered as an alias to printk
      drivers: replace printk with printk_unfiltered
      drivers: make macro independent of printk's return value
      video/stk-webcam: change use of STK_ERROR
      printk: provide a filtering macro for printk

 drivers/char/mem.c                 |    2 +-
 drivers/md/md.c                    |    2 +-
 drivers/md/raid5.c                 |    2 +-
 drivers/media/video/stk-webcam.c   |   16 +++---
 drivers/net/e100.c                 |    2 +-
 drivers/net/ixgb/ixgb.h            |    2 +-
 drivers/net/ixgbe/ixgbe.h          |    2 +-
 drivers/scsi/aic7xxx/aic79xx_osm.h |    2 +-
 drivers/scsi/aic7xxx/aic7xxx_osm.h |    2 +-
 include/linux/kernel.h             |   29 +++++++++++
 include/net/sctp/sctp.h            |    2 +-
 init/Kconfig                       |   28 ++++++++++
 kernel/lockdep.c                   |    4 +-
 kernel/printk.c                    |   96 +++++++++++++++++++++++-------------
 14 files changed, 137 insertions(+), 54 deletions(-)

^ permalink raw reply

* Re: 100Mbit ethernet performance on embedded devices
From: Simon Holm Thøgersen @ 2009-08-29  7:05 UTC (permalink / raw)
  To: Johannes Stezenbach; +Cc: Jamie Lokier, linux-embedded, netdev
In-Reply-To: <20090828144138.GB7375@sig21.net>

fre, 28 08 2009 kl. 16:41 +0200, skrev Johannes Stezenbach:
> On Thu, Aug 20, 2009 at 02:56:49PM +0200, Johannes Stezenbach wrote:
> > On Wed, Aug 19, 2009 at 04:35:34PM +0100, Jamie Lokier wrote:
> > > Johannes Stezenbach wrote:
> > > > 
> > > >   TCP RX ~70Mbit/sec  (iperf -s on SoC, iperf -c on destop PC)
> > > >   TCP TX ~56Mbit/sec  (iperf -s on destop PC, iperf -c o SoC)
> > > > 
> > > > The CPU load during the iperf test is around
> > > > 1% user, 44% system, 4% irq, 48% softirq, with 7500 irqs/sec.
> > > > 
> > > > The kernel used in these measurements does not have iptables
> > > > support, I think packet filtering will slow it down noticably,
> > > > but I didn't actually try.  The ethernet driver uses NAPI,
> > > > but it doesn't seem to be a win judging from the irq/sec number.
> > > 
> > > You should see far fewer interrupts if NAPI was working properly.
> > > Rather than NAPI not being a win, it looks like it's not active at
> > > all.
> > > 
> > > 7500/sec is close to the packet rate, for sending TCP with
> > > full-size ethernet packages over a 100Mbit ethernet link.
> > 
> > From debug output I can see that NAPI works in principle, however
> > the timing seems to be such that ->poll() almost always completes
> > before the next packet is received.  I followed the NAPI_HOWTO.txt
> > which came with the 2.6.20 kernel.  The delay between irq ->
> > netif_rx_schedule() -> NET_RX_SOFTIRQ ->  ->poll()  doesn't seem
> > to be long enough.  But of course my understanding of NAPI is
> > very limited, probably I missed something...
> 
> It would've been nice to get a comment on this.  Yeah I know,
> old kernel, non-mainline driver...

Tried porting the driver to mainline? That way you will get more than
two years of improvements to the networking stack including NAPI.

There was a rework of NAPI [1] around 2.6.24, you'd probably like to see
commit bea3348eef27e6044b6161fd04c3152215f96411. You could also ask the
linux driver project to help you make the driver suitable for mainline
inclusion.

[1] http://lwn.net/Articles/244640/


Simon Holm Thøgersen

^ permalink raw reply

* Re: 100Mbit ethernet performance on embedded devices
From: Mark Brown @ 2009-08-28 17:35 UTC (permalink / raw)
  To: Johannes Stezenbach; +Cc: Jamie Lokier, linux-embedded, netdev
In-Reply-To: <20090828144138.GB7375@sig21.net>

On Fri, Aug 28, 2009 at 04:41:38PM +0200, Johannes Stezenbach wrote:
> On Thu, Aug 20, 2009 at 02:56:49PM +0200, Johannes Stezenbach wrote:

> > which came with the 2.6.20 kernel.  The delay between irq ->
> > netif_rx_schedule() -> NET_RX_SOFTIRQ ->  ->poll()  doesn't seem
> > to be long enough.  But of course my understanding of NAPI is
> > very limited, probably I missed something...

> It would've been nice to get a comment on this.  Yeah I know,
> old kernel, non-mainline driver...

> On this platform NAPI seems to be a win when receiving small packets,
> but not for a single max-bandwidth TCP stream.  The folks at
> stlinux.com seem to be using a dedicated hw timer to delay
> the NAPI poll() calls:
> http://www.stlinux.com/drupal/kernel/network/stmmac-optimizations

> This of course adds some latency to the packet processing,
> however in the single TCP stream case this wouldn't matter.

Does your actual system have any appreciable CPU loading?  If so that
will normally have the same effect as inserting a delay in the RX path.
Some of the numbers will often look worse with NAPI when the system is
lightly loaded (though not normally throughput).

^ permalink raw reply

* Re: 100Mbit ethernet performance on embedded devices
From: Johannes Stezenbach @ 2009-08-28 14:41 UTC (permalink / raw)
  To: Jamie Lokier; +Cc: linux-embedded, netdev
In-Reply-To: <20090820125649.GA29029@sig21.net>

On Thu, Aug 20, 2009 at 02:56:49PM +0200, Johannes Stezenbach wrote:
> On Wed, Aug 19, 2009 at 04:35:34PM +0100, Jamie Lokier wrote:
> > Johannes Stezenbach wrote:
> > > 
> > >   TCP RX ~70Mbit/sec  (iperf -s on SoC, iperf -c on destop PC)
> > >   TCP TX ~56Mbit/sec  (iperf -s on destop PC, iperf -c o SoC)
> > > 
> > > The CPU load during the iperf test is around
> > > 1% user, 44% system, 4% irq, 48% softirq, with 7500 irqs/sec.
> > > 
> > > The kernel used in these measurements does not have iptables
> > > support, I think packet filtering will slow it down noticably,
> > > but I didn't actually try.  The ethernet driver uses NAPI,
> > > but it doesn't seem to be a win judging from the irq/sec number.
> > 
> > You should see far fewer interrupts if NAPI was working properly.
> > Rather than NAPI not being a win, it looks like it's not active at
> > all.
> > 
> > 7500/sec is close to the packet rate, for sending TCP with
> > full-size ethernet packages over a 100Mbit ethernet link.
> 
> From debug output I can see that NAPI works in principle, however
> the timing seems to be such that ->poll() almost always completes
> before the next packet is received.  I followed the NAPI_HOWTO.txt
> which came with the 2.6.20 kernel.  The delay between irq ->
> netif_rx_schedule() -> NET_RX_SOFTIRQ ->  ->poll()  doesn't seem
> to be long enough.  But of course my understanding of NAPI is
> very limited, probably I missed something...

It would've been nice to get a comment on this.  Yeah I know,
old kernel, non-mainline driver...

On this platform NAPI seems to be a win when receiving small packets,
but not for a single max-bandwidth TCP stream.  The folks at
stlinux.com seem to be using a dedicated hw timer to delay
the NAPI poll() calls:
http://www.stlinux.com/drupal/kernel/network/stmmac-optimizations

This of course adds some latency to the packet processing,
however in the single TCP stream case this wouldn't matter.


Thanks,
Johannes

^ permalink raw reply

* Re: 100Mbit ethernet performance on embedded devices
From: Johannes Stezenbach @ 2009-08-28 14:26 UTC (permalink / raw)
  To: H M Thalib; +Cc: linux-embedded, netdev
In-Reply-To: <4A96A871.3090002@gmail.com>

On Thu, Aug 27, 2009 at 09:08:25PM +0530, H M Thalib wrote:
> Johannes Stezenbach wrote:
> >
> >a while ago I was working on a SoC with 200MHz ARM926EJ-S CPU
> >and integrated 100Mbit ethernet core, connected on internal
> >(fast) memory bus, with DMA.  With iperf I measured:
> 
> Did you used Iperf it is not the correct tool to find the
> performance of ethernet. use tools like Smartbits or IXIA they are
> special hardware to measure the performance . They will give you
> better results

iperf is close to what the targeted application of this system
does -- receive a stream via TCP and process it.

Busybox wget e.g. is not good for benchmarking, it has a too small
receive buffer and adds a lot of syscall overhead.

> >  TCP RX ~70Mbit/sec  (iperf -s on SoC, iperf -c on destop PC)
> >  TCP TX ~56Mbit/sec  (iperf -s on destop PC, iperf -c o SoC)
> 
> Did you stopped unwanted process in both PC as well as processor,
> make sure PC has a bottle neck. Does it gives a through put of at
> least 95MBps. Is you system connected directly with crossover
> cables.

They are usually connected via a 100Mbit switch, direct connection
yields no measurable improvement, and the PC can RX/TX ~95Mbit/sec
at close to 0% CPU load.

> >The CPU load during the iperf test is around
> >1% user, 44% system, 4% irq, 48% softirq, with 7500 irqs/sec.
> 
> Did you used vmast -- it is not the correct way to measure the cpu load
> or do you use top -- it takes lots of you system resource .. this
> can affect ehternet performance

I used a small tool similar to busybox nmeter (except that
it prints numbers instead of a bar). When this tool alone
runs the system is 100% idle.

> >I tried hard, but I couldn't find any performance figures for
> >comparison.  (All performance figures I found refer to 1Gbit
> >or 10Gbit server type systems.)
> 
> surely you will not find the perf data for small low end processor
> because they are not made fro that. and also this data is not some
> thing sharable .they are the benchmark about their product.

I wouldn't trust manufacturer benchmarks anyway.  But I was
hoping to get some numbers from people working on similar
networked embedded systems.  E.g. it is hard to believe
that wireless routers running OpenWRT have trouble handling
54Mbit on the *wired* interface with a few iptables rules
enabled.

Thanks,
Johannes

^ permalink raw reply

* Re: [PATCH] OMAP3:PM: fix lockdep warning caused by omap3_pm_init
From: Kevin Hilman @ 2009-08-28 12:03 UTC (permalink / raw)
  To: tom.leiming; +Cc: peterz, linux-omap, arm-kernel, linux-embedded, linux
In-Reply-To: <1250947226-2371-1-git-send-email-tom.leiming@gmail.com>

tom.leiming@gmail.com writes:

> From: Ming Lei <tom.leiming@gmail.com>
>
> This patch uses kmalloc(size,GFP_ATOMIC) instead of kmalloc(size,GFP_KERNEL)
> to allocate memory for instance of struct power_state in pwrdms_setup(),
> since it may be called by pwrdm_for_each() with irq disabled.
>
> It is a easy fix for the following lockdep warning caused by
> kmalloc(size,GFP_KERNEL) in pwrdms_setup():
>
> Power Management for TI OMAP3.
> ------------[ cut here ]------------
> WARNING: at kernel/lockdep.c:2282 lockdep_trace_alloc+0xe8/0xfc()
> Modules linked in:
> [<c0032ccc>] (unwind_backtrace+0x0/0xec) from [<c0056934>] (warn_slowpath_common+0x48/0x60)
> [<c0056934>] (warn_slowpath_common+0x48/0x60) from [<c007da10>] (lockdep_trace_alloc+0xe8/0xfc)
> [<c007da10>] (lockdep_trace_alloc+0xe8/0xfc) from [<c00cd9bc>] (kmem_cache_alloc+0x28/0x178)
> [<c00cd9bc>] (kmem_cache_alloc+0x28/0x178) from [<c000f184>] (pwrdms_setup+0x30/0xf8)
> [<c000f184>] (pwrdms_setup+0x30/0xf8) from [<c00381c4>] (pwrdm_for_each+0x64/0x84)
> [<c00381c4>] (pwrdm_for_each+0x64/0x84) from [<c000ef60>] (omap3_pm_init+0x3f4/0x5ac)
> [<c000ef60>] (omap3_pm_init+0x3f4/0x5ac) from [<c002c2c0>] (do_one_initcall+0x30/0x1d4)
> [<c002c2c0>] (do_one_initcall+0x30/0x1d4) from [<c00088d8>] (kernel_init+0xa4/0x118)
> [<c00088d8>] (kernel_init+0xa4/0x118) from [<c002ddf8>] (kernel_thread_exit+0x0/0x8)
> ---[ end trace 1e06f8d97dc5a19b ]---
>
> ---
> This patch is against linux-2.6.31-rc7.
>
> Signed-off-by: Ming Lei <tom.leiming@gmail.com>

Looks good, I'll queue in my fixes for 2.6.32 queue.

Kevin

> ---
>  arch/arm/mach-omap2/pm34xx.c |    2 +-
>  1 files changed, 1 insertions(+), 1 deletions(-)
>
> diff --git a/arch/arm/mach-omap2/pm34xx.c b/arch/arm/mach-omap2/pm34xx.c
> index 488d595..d67b781 100644
> --- a/arch/arm/mach-omap2/pm34xx.c
> +++ b/arch/arm/mach-omap2/pm34xx.c
> @@ -665,7 +665,7 @@ static int __init pwrdms_setup(struct powerdomain *pwrdm)
>  	if (!pwrdm->pwrsts)
>  		return 0;
>  
> -	pwrst = kmalloc(sizeof(struct power_state), GFP_KERNEL);
> +	pwrst = kmalloc(sizeof(struct power_state), GFP_ATOMIC);
>  	if (!pwrst)
>  		return -ENOMEM;
>  	pwrst->pwrdm = pwrdm;
> -- 
> 1.6.0.GIT

^ permalink raw reply

* Re: 100Mbit ethernet performance on embedded devices
From: H M Thalib @ 2009-08-27 15:38 UTC (permalink / raw)
  To: Johannes Stezenbach; +Cc: linux-embedded, netdev
In-Reply-To: <20090819145057.GA25400@sig21.net>

Hi,

Johannes Stezenbach wrote:
> Hi,
> 
> a while ago I was working on a SoC with 200MHz ARM926EJ-S CPU
> and integrated 100Mbit ethernet core, connected on internal
> (fast) memory bus, with DMA.  With iperf I measured:
> 

Did you used Iperf it is not the correct tool to find the performance of 
ethernet. use tools like Smartbits or IXIA they are special hardware to 
measure the performance . They will give you better results

>   TCP RX ~70Mbit/sec  (iperf -s on SoC, iperf -c on destop PC)
>   TCP TX ~56Mbit/sec  (iperf -s on destop PC, iperf -c o SoC)

Did you stopped unwanted process in both PC as well as processor, make 
sure PC has a bottle neck. Does it gives a through put of at least 
95MBps. Is you system connected directly with crossover cables.

> The CPU load during the iperf test is around
> 1% user, 44% system, 4% irq, 48% softirq, with 7500 irqs/sec.

Did you used vmast -- it is not the correct way to measure the cpu load
or do you use top -- it takes lots of you system resource .. this can 
affect ehternet performance

> The kernel used in these measurements does not have iptables
> support, I think packet filtering will slow it down noticably,
> but I didn't actually try.

Thats good. iptable will dramatically affect the performance. remove all 
the iptables related modules if it is loaded before performing test

   The ethernet driver uses NAPI,
> but it doesn't seem to be a win judging from the irq/sec number.
> The kernel was an ancient 2.6.20.
> 
not bad. worth upgrading.

> I tried hard, but I couldn't find any performance figures for
> comparison.  (All performance figures I found refer to 1Gbit
> or 10Gbit server type systems.)

surely you will not find the perf data for small low end processor 
because they are not made fro that. and also this data is not some thing 
sharable .they are the benchmark about their product.

Industry is interested in high performance processor for network 
product. beside ethernet they do have lot offloading engines.

> What I'm interested in are some numbers for similar hardware,
> to find out if my hardware and/or ethernet driver can be improved,
> or if the CPU will always be the limiting factor.

probably should be possible optimizing hardware+software but you have to 
   pay for that.

> I'd also be interested to know if hardware checksumming
> support would improve throughput noticably in such a system,
> or if it is only useful for 1Gbit and above.

In my experience for your cpu 80% max of ehternet speed should be ok .. 
don't expect more.

> 
> Did anyone actually manage to get close to 100Mbit/sec
> with similar CPU resources?
> 
> 
> TIA,
> Johannes
> --
> To unsubscribe from this list: send the line "unsubscribe linux-embedded" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

-- 
Thanks & Regards,
H M Thalib.

^ permalink raw reply

* [PATCH] OMAP3:PM: fix lockdep warning caused by omap3_pm_init
From: tom.leiming @ 2009-08-22 13:20 UTC (permalink / raw)
  To: khilman, peterz; +Cc: linux-omap, arm-kernel, linux-embedded, linux, Ming Lei

From: Ming Lei <tom.leiming@gmail.com>

This patch uses kmalloc(size,GFP_ATOMIC) instead of kmalloc(size,GFP_KERNEL)
to allocate memory for instance of struct power_state in pwrdms_setup(),
since it may be called by pwrdm_for_each() with irq disabled.

It is a easy fix for the following lockdep warning caused by
kmalloc(size,GFP_KERNEL) in pwrdms_setup():

Power Management for TI OMAP3.
------------[ cut here ]------------
WARNING: at kernel/lockdep.c:2282 lockdep_trace_alloc+0xe8/0xfc()
Modules linked in:
[<c0032ccc>] (unwind_backtrace+0x0/0xec) from [<c0056934>] (warn_slowpath_common+0x48/0x60)
[<c0056934>] (warn_slowpath_common+0x48/0x60) from [<c007da10>] (lockdep_trace_alloc+0xe8/0xfc)
[<c007da10>] (lockdep_trace_alloc+0xe8/0xfc) from [<c00cd9bc>] (kmem_cache_alloc+0x28/0x178)
[<c00cd9bc>] (kmem_cache_alloc+0x28/0x178) from [<c000f184>] (pwrdms_setup+0x30/0xf8)
[<c000f184>] (pwrdms_setup+0x30/0xf8) from [<c00381c4>] (pwrdm_for_each+0x64/0x84)
[<c00381c4>] (pwrdm_for_each+0x64/0x84) from [<c000ef60>] (omap3_pm_init+0x3f4/0x5ac)
[<c000ef60>] (omap3_pm_init+0x3f4/0x5ac) from [<c002c2c0>] (do_one_initcall+0x30/0x1d4)
[<c002c2c0>] (do_one_initcall+0x30/0x1d4) from [<c00088d8>] (kernel_init+0xa4/0x118)
[<c00088d8>] (kernel_init+0xa4/0x118) from [<c002ddf8>] (kernel_thread_exit+0x0/0x8)
---[ end trace 1e06f8d97dc5a19b ]---

---
This patch is against linux-2.6.31-rc7.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 arch/arm/mach-omap2/pm34xx.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/arm/mach-omap2/pm34xx.c b/arch/arm/mach-omap2/pm34xx.c
index 488d595..d67b781 100644
--- a/arch/arm/mach-omap2/pm34xx.c
+++ b/arch/arm/mach-omap2/pm34xx.c
@@ -665,7 +665,7 @@ static int __init pwrdms_setup(struct powerdomain *pwrdm)
 	if (!pwrdm->pwrsts)
 		return 0;
 
-	pwrst = kmalloc(sizeof(struct power_state), GFP_KERNEL);
+	pwrst = kmalloc(sizeof(struct power_state), GFP_ATOMIC);
 	if (!pwrst)
 		return -ENOMEM;
 	pwrst->pwrdm = pwrdm;
-- 
1.6.0.GIT


^ permalink raw reply related

* [PATCH] mlock_everything
From: Samo Pogacnik @ 2009-08-20 23:50 UTC (permalink / raw)
  To: linux-embedded

Hi,

This patch tries to provide the functionality for locking of all used
memory of all living processes/threads into RAM. It creates an
additional proc file named "/proc/sys/vm/mlock_everything". Every write
to this file calls the do_mlock_everything() function, which scans all
processes and threads and their VMAs. For each task (process or thread),
locking is performed almost like mlockall(MCL_CURRENT | MCL_FUTURE)
would have been called by each process but without checking the limits.
Reads from this proc file return two numbers. The first number tells how
many writes (do_mlock_everything() requests) have been already performed
since the system startup. The second number shows how many VM_LOCKED
flags had been set to different VMAs during the last locking request.

So, why all this? Maybe this is not the best idea but enabling the
do_mlock_everything() after the initialization phase of the system
startup prevents system thrashing in the OOM situation before the
oom_killer does its job on an embedded system (typically without swap,
running a fixed set of processes with maybe some RT requirements, ...).

Is there another way to prevent unloading of any existing process'es
memory segment backed up by a file on a filesystem (i.e. from a dynamic
loadable shared object), when the system approaches the OOM situation?

I am not sure about the correctness of the do_mlock_everything()
implementation in the patch. Are there any strange implications of such
functionality? Would it be sensible to have the possibility for a global
lock flag, which would automatically set lock flags to all future
processes as they would have been created?

Any comments and suggestions would be very much appreciated, thank you. 

The patch is below.

regards, Samo

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index e76d3b2..91b21be 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -205,6 +205,7 @@ enum
 	VM_PANIC_ON_OOM=33,	/* panic at out-of-memory */
 	VM_VDSO_ENABLED=34,	/* map VDSO into new processes? */
 	VM_MIN_SLAB=35,		 /* Percent pages ignored by zone reclaim */
+	VM_MLOCK_EVERYTHING=36,	/* Lock all memory currently mapped by tasks
*/
 };

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e5bfcc7..cfe8de2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -77,6 +77,8 @@ extern int suid_dumpable;
 extern char core_pattern[];
 extern int pid_max;
 extern int min_free_kbytes;
+extern int sysctl_mlock_everything[2];
+extern int mlock_everything_sysctl_handler(ctl_table *, int, struct
file *, void __user *, size_t *, loff_t *);
 extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
@@ -1109,6 +1111,15 @@ static struct ctl_table vm_table[] = {
 		.extra1		= &zero,
 	},
 	{
+		.ctl_name	= VM_MLOCK_EVERYTHING,
+		.procname	= "mlock_everything",
+		.data		= &sysctl_mlock_everything,
+		.maxlen		= sizeof(sysctl_mlock_everything),
+		.mode		= 0644,
+		.proc_handler	= mlock_everything_sysctl_handler,
+		.strategy	= &sysctl_intvec,
+	},
+	{
 		.ctl_name	= VM_PERCPU_PAGELIST_FRACTION,
 		.procname	= "percpu_pagelist_fraction",
 		.data		= &percpu_pagelist_fraction,
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index b38423c..019d788 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -135,6 +135,7 @@ static const struct trans_ctl_table trans_vm_table[]
= {
 	{ VM_PANIC_ON_OOM,		"panic_on_oom" },
 	{ VM_VDSO_ENABLED,		"vdso_enabled" },
 	{ VM_MIN_SLAB,			"min_slab_ratio" },
+	{ VM_MLOCK_EVERYTHING,		"mlock_everything" },

 	{}
 };
diff --git a/mm/mlock.c b/mm/mlock.c
index cbe9e05..2baace7 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -678,3 +678,56 @@ void free_locked_buffer(void *buffer, size_t size)

 	kfree(buffer);
 }
+
+/*
+ * This function scans all mapped vma-s of all existing tasks and sets their
+ * default (future mapped areas) and already mapped areas flags to VM_LOCKED.
+ */ 
+int do_mlock_everything(void)
+{
+	struct task_struct *g, *p;
+	struct vm_area_struct * vma;
+	int count = 0;
+
+	do_each_thread(g, p)
+		if (p->pid && p->mm) {
+			down_write(&p->mm->mmap_sem);
+			read_lock(&tasklist_lock);
+			task_lock(p);
+			p->mm->def_flags = VM_LOCKED;
+
+			for (vma = p->mm->mmap; vma ; vma = vma->vm_next) {
+				if (vma->vm_flags & VM_LOCKED)
+					continue;
+				vma->vm_flags |= VM_LOCKED;
+				count++;
+			}
+
+			task_unlock(p);
+			read_unlock(&tasklist_lock);
+			up_write(&p->mm->mmap_sem);
+		}
+	while_each_thread(g, p);
+	return count;
+}
+
+/*
+ * Sysctl handler for do_mlock_everything().
+ */
+int sysctl_mlock_everything[2] = {
+	0, /* subsequent request for mlock everything */
+	0  /* number of VMAs locked in the last mlock everything request */
+};
+static int mlock_everything_count = 0;
+
+int mlock_everything_sysctl_handler(ctl_table *table, int write, 
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	proc_dointvec(table, write, file, buffer, length, ppos);
+	if (write) {
+		sysctl_mlock_everything[0] = ++mlock_everything_count;
+		sysctl_mlock_everything[1] = do_mlock_everything();
+	}
+	return 0;
+}
+

^ permalink raw reply related

* Re: 100Mbit ethernet performance on embedded devices
From: Johannes Stezenbach @ 2009-08-20 12:56 UTC (permalink / raw)
  To: Jamie Lokier; +Cc: linux-embedded, netdev
In-Reply-To: <20090819153534.GC30013@shareable.org>

On Wed, Aug 19, 2009 at 04:35:34PM +0100, Jamie Lokier wrote:
> Johannes Stezenbach wrote:
> > 
> >   TCP RX ~70Mbit/sec  (iperf -s on SoC, iperf -c on destop PC)
> >   TCP TX ~56Mbit/sec  (iperf -s on destop PC, iperf -c o SoC)
> > 
> > The CPU load during the iperf test is around
> > 1% user, 44% system, 4% irq, 48% softirq, with 7500 irqs/sec.
> > 
> > The kernel used in these measurements does not have iptables
> > support, I think packet filtering will slow it down noticably,
> > but I didn't actually try.  The ethernet driver uses NAPI,
> > but it doesn't seem to be a win judging from the irq/sec number.
> 
> You should see far fewer interrupts if NAPI was working properly.
> Rather than NAPI not being a win, it looks like it's not active at
> all.
> 
> 7500/sec is close to the packet rate, for sending TCP with
> full-size ethernet packages over a 100Mbit ethernet link.

From debug output I can see that NAPI works in principle, however
the timing seems to be such that ->poll() almost always completes
before the next packet is received.  I followed the NAPI_HOWTO.txt
which came with the 2.6.20 kernel.  The delay between irq ->
netif_rx_schedule() -> NET_RX_SOFTIRQ ->  ->poll()  doesn't seem
to be long enough.  But of course my understanding of NAPI is
very limited, probably I missed something...

> > What I'm interested in are some numbers for similar hardware,
> > to find out if my hardware and/or ethernet driver can be improved,
> > or if the CPU will always be the limiting factor.
> 
> I have a SoC with a 166MHz ARMv4 (ARM7TDMI I think, but I'm not sure),
> and an external RTL8139 100Mbit ethernet chip over the SoC's PCI bus.
> 
> It gets a little over 80Mbit/s actual data throughput in both
> directions, running a simple FTP client.

I found one interesting page which defines network driver performance
in terms of "CPU MHz per Mbit".
http://www.stlinux.com/drupal/node/439

I can't really tell from their table how big a win HW csum is, but
what they call "interrupt mitigation optimisations" (IOW: working NAPI)
seems important.  (compare the values for STx7105)

If someone has an embedded platform with 100Mbit ethernet where they can switch
HW checksum via ethtool and benchmark both under equal conditions, that
would be very interesting.

Thanks
Johannes

^ permalink raw reply

* Re: New fast(?)-boot results on ARM
From: Sascha Hauer @ 2009-08-20  8:57 UTC (permalink / raw)
  To: Dirk Behme
  Cc: Robert Schwebel, linux-kernel, linux-embedded, Arjan van de Ven,
	Tim Bird, kernel
In-Reply-To: <4A8C263D.9000009@googlemail.com>

On Wed, Aug 19, 2009 at 06:20:13PM +0200, Dirk Behme wrote:
>>
>> Yes, correct. The copying itself is between 'copy' and 'done' so it
>> takes about 0.4s.
>>
>>> What's the size of the uncompressed kernel copied here?
>>
>> The image is about 2.8MB, but I copied the whole partition of 3MB
>> because with raw images you can't detect the image size.
>
> With 3MB copied in ~0.4s you get ~8MB/s. This really depends on your HW, 
> but I would think with standard NOR flashes you should be able to do at 
> least two (three?) times better. Have you already checked the memory (NOR 
> flash) timings configured in your SoC?

It's NAND flash, so there's not much timing to optimize. What's
interesting about this is that the kernel NAND driver is much slower
than the one in U-Boot. Looking at it it turned out that the kernel
driver uses interrupts to wait for the controller to get ready.
Switching this to polling nearly doubles the NAND performance. UBI
mounts much faster now and this cuts off another few seconds from the
boot process :)

Sascha


-- 
Pengutronix e.K.                           |                             |
Industrial Linux Solutions                 | http://www.pengutronix.de/  |
Peiner Str. 6-8, 31137 Hildesheim, Germany | Phone: +49-5121-206917-0    |
Amtsgericht Hildesheim, HRA 2686           | Fax:   +49-5121-206917-5555 |

^ permalink raw reply

* Re: New fast(?)-boot results on ARM
From: Dirk Behme @ 2009-08-19 16:20 UTC (permalink / raw)
  To: Sascha Hauer
  Cc: Robert Schwebel, linux-kernel, linux-embedded, Arjan van de Ven,
	Tim Bird, kernel
In-Reply-To: <20090819072100.GA23444@pengutronix.de>

Sascha Hauer wrote:
> On Tue, Aug 18, 2009 at 05:31:42PM +0200, Dirk Behme wrote:
>> Sascha Hauer wrote:
>>> On Fri, Aug 14, 2009 at 07:02:28PM +0200, Robert Schwebel wrote:
>>>> Hi,
>>>>
>>>> On Thu, Aug 13, 2009 at 05:33:26PM +0200, Robert Schwebel wrote:
>>>>> On Thu, Aug 13, 2009 at 08:28:26AM -0700, Arjan van de Ven wrote:
>>>>>>> That's bad :-) So there is no room for improvement any more in our
>>>>>>> ARM boot sequences ...
>>>>>> on x86 we're doing pretty well ;-)
>>>>> On i.MX27 (400 MHz ARM926EJ-S) we currently need 7 s, measured from
>>>>> power-on through the kernel up to "starting init". This is with
>>>>>
>>>>> - no delay in u-boot-v2
>>>>> - rootfs on NAND (UBIFS)
>>>>> - quiet
>>>>> - precalculated loops-per-jiffy
>>>>> - zImage kernel instead of uImage
>>>> Here's a little video of our demo system booting:
>>>> http://www.youtube.com/watch?v=xDbUnNsj0cI
>>>>
>>>> As you can see there, it needs about 15 s from the release of the reset button
>>>> up to the moment where the application shows it's Qt 4.5.2 based GUI (which is
>>>> when we fade over from the initial framebuffer to the final one, in order to
>>>> hide the qt application startup noise).
>>>>
>>>> And below is the boot log (after turning "quiet" off again). The numbers are
>>>> the timestamp and the delta to the last timestamp, measured on the controlling
>>>> PC by looking at the serial console output. The ptx_ts script starts when the
>>>> regexp was found, so the numbers start basically in the moment when u-boot-v2
>>>> has initialized the system up to the point where we can see something.
>>>>
>>>> Result:
>>>>
>>>> - 2.4 s up from u-boot to the end of "Uncompressing Linux"
>>>> - 300 ms until ubifs initialization starts
>>>> - 3.7 s for ubifs, until "mounted root"
>>>>
>>>> So we basically have 7 s for the kernel. The rest is userspace, which hasn't
>>>> seen much optimization yet, other than trying to start the GUI application as
>>>> early as possible, while doing all other init stuff in parallel. Adding "quiet"
>>>> brings us another 300 ms.
>>>>
>>>> That's factor 70 away from the 110 ms boot time Tim has talked about some days
>>>> ago (and he measured on an ARM cpu which had almost half the speed of this
>>>> one), and I'm wondering what we can do to improve the boot time.
>>>>
>>>> Robert
>>>>
>>>> rsc@thebe:~$ microcom | ptx_ts "U-Boot 2.0.0-rc9"
>>>> [ 13.522625] <  0.043189>
>>>> [ 13.546627] <  0.024002> OSELAS(R)-phyCORE-trunk (PTXdist-1.99.svn/2009-08-06T08:37:25+0200)
>>>> [ 13.558613] <  0.011986>
>>>> [ 13.690643] <  0.132030>        _            ____ ___  ____  _____
>>>> [ 13.690731] <  0.000088>  _ __ | |__  _   _ / ___/ _ \|  _ \| ____|
>>>> [ 13.698595] <  0.007864> | '_ \| '_ \| | | | |  | | | | |_) |  _|
>>>> [ 13.698654] <  0.000059> | |_) | | | | |_| | |__| |_| |  _ <| |___
>>>> [ 13.702581] <  0.003927> | .__/|_| |_|\__, |\____\___/|_| \_\_____|
>>>> [ 13.706573] <  0.003992> |_|          |___/
>>>> [ 13.706622] <  0.000049>
>>>> [ 13.725043] <  0.018421>
>>>> [ 14.742608] <  1.017565>
>>> I made some changes suggested in this thread:
>>>
>>> - enable MMU in the bootloader
>>> - use assembler optimized memcpy/memset in the bootloader
>>> - start an uncompressed image
>>> - disable IP autoconfiguration in the Kernel
>>> - use lpj= command line parameter
>>> - use static device nodes instead of udev
>>> - skip some init scripts
>>> - made the kernel smaller (I do not have both configs handy, so I do not
>>>   know what exactly I changed)
>>>
>>> Already looks much better:
>>>
>>> [  0.000005] <  0.000005> U-Boot 2.0.0-rc10-00241-g3f10fe9-dirty (Aug 18 2009 - 13:29:25)
>>> [  0.000026] <  0.000021>
>>> [  0.000041] <  0.000015> Board: Phytec phyCORE-i.MX27
>>> [  0.000054] <  0.000013> cfi_probe: cfi_flash base: 0xc0000000 size: 0x02000000
>>> [  0.000067] <  0.000013> NAND device: Manufacturer ID: 0x20, Chip ID: 0x36 (ST Micro NAND 64MiB 1,8V 8-bit)
>>> [  0.000080] <  0.000013> imxfb@imxfb0: i.MX Framebuffer driver
>>> [  0.000092] <  0.000012> dma_alloc: 0xa6f56e40 0x10000000
>>> [  0.000105] <  0.000013> dma_alloc: 0xa6f57088 0x10000000
>>> [  0.000118] <  0.000013> dev_protect: currently broken
>>> [  0.000129] <  0.000011> Using environment in NOR Flash
>>> [  0.000141] <  0.000012> initialising PLLs
>>> [  0.128972] <  0.128831> Malloc space: 0xa6f00000 -> 0xa7f00000 (size 16 MB)
>>> [  0.128995] <  0.000023> Stack space : 0xa6ef8000 -> 0xa6f00000 (size 32 kB)
>>> [  0.129008] <  0.000013> running /env/bin/init...
>>> [  0.224963] <  0.095955>
>>> [  0.224984] <  0.000021> Hit any key to stop autoboot:  0
>>> [  0.224999] <  0.000015> copy
>>> [  0.592964] <  0.367965> done
>>> [  0.652010] <  0.059046> Linux version 2.6.31-rc4-00004-g05786f8-dirty (sha@octopus) (gcc version 4.3.2 (OSELAS.Toolchain-1.99.3) ) #206 PREEMPT Tue Aug 18 14:08:51 CEST 2009
>> So, this are ~0.6 s in boot loader and kernel copy until kernel starts, 
>> correct?
> 
> Yes, correct. The copying itself is between 'copy' and 'done' so it
> takes about 0.4s.
> 
>> What's the size of the uncompressed kernel copied here?
> 
> The image is about 2.8MB, but I copied the whole partition of 3MB
> because with raw images you can't detect the image size.

With 3MB copied in ~0.4s you get ~8MB/s. This really depends on your 
HW, but I would think with standard NOR flashes you should be able to 
do at least two (three?) times better. Have you already checked the 
memory (NOR flash) timings configured in your SoC?

See the second topic of

http://elinux.org/Boot_Time#Boot_time_check_list

too ;)

Best regards

Dirk

^ permalink raw reply

* Re: 100Mbit ethernet performance on embedded devices
From: Jamie Lokier @ 2009-08-19 15:35 UTC (permalink / raw)
  To: Johannes Stezenbach; +Cc: linux-embedded, netdev
In-Reply-To: <20090819145057.GA25400@sig21.net>

Johannes Stezenbach wrote:
> a while ago I was working on a SoC with 200MHz ARM926EJ-S CPU
> and integrated 100Mbit ethernet core, connected on internal
> (fast) memory bus, with DMA.  With iperf I measured:
> 
>   TCP RX ~70Mbit/sec  (iperf -s on SoC, iperf -c on destop PC)
>   TCP TX ~56Mbit/sec  (iperf -s on destop PC, iperf -c o SoC)
> 
> The CPU load during the iperf test is around
> 1% user, 44% system, 4% irq, 48% softirq, with 7500 irqs/sec.
> 
> The kernel used in these measurements does not have iptables
> support, I think packet filtering will slow it down noticably,
> but I didn't actually try.  The ethernet driver uses NAPI,
> but it doesn't seem to be a win judging from the irq/sec number.

You should see far fewer interrupts if NAPI was working properly.
Rather than NAPI not being a win, it looks like it's not active at
all.

7500/sec is close to the packet rate, for sending TCP with
full-size ethernet packages over a 100Mbit ethernet link.

> What I'm interested in are some numbers for similar hardware,
> to find out if my hardware and/or ethernet driver can be improved,
> or if the CPU will always be the limiting factor.

I have a SoC with a 166MHz ARMv4 (ARM7TDMI I think, but I'm not sure),
and an external RTL8139 100Mbit ethernet chip over the SoC's PCI bus.

It gets a little over 80Mbit/s actual data throughput in both
directions, running a simple FTP client.

> I'd also be interested to know if hardware checksumming
> support would improve throughput noticably in such a system,
> or if it is only useful for 1Gbit and above.
> 
> Did anyone actually manage to get close to 100Mbit/sec
> with similar CPU resources?

Remember, the TCP throughput cannot reach 100Mbit/sec due to the
overhead of packet framing.  But it should be much closer to 100 than 70.

-- Jamie

^ permalink raw reply

* Re: 100Mbit ethernet performance on embedded devices
From: Ben Hutchings @ 2009-08-19 15:05 UTC (permalink / raw)
  To: Johannes Stezenbach; +Cc: linux-embedded, netdev
In-Reply-To: <20090819145057.GA25400@sig21.net>

On Wed, 2009-08-19 at 16:50 +0200, Johannes Stezenbach wrote:
> Hi,
> 
> a while ago I was working on a SoC with 200MHz ARM926EJ-S CPU
> and integrated 100Mbit ethernet core, connected on internal
> (fast) memory bus, with DMA.  With iperf I measured:
> 
>   TCP RX ~70Mbit/sec  (iperf -s on SoC, iperf -c on destop PC)
>   TCP TX ~56Mbit/sec  (iperf -s on destop PC, iperf -c o SoC)
> 
> The CPU load during the iperf test is around
> 1% user, 44% system, 4% irq, 48% softirq, with 7500 irqs/sec.
> 
> The kernel used in these measurements does not have iptables
> support, I think packet filtering will slow it down noticably,
> but I didn't actually try.  The ethernet driver uses NAPI,
> but it doesn't seem to be a win judging from the irq/sec number.
> The kernel was an ancient 2.6.20.

Which driver is this?  Is it possible that it does not use NAPI
correctly?

> I tried hard, but I couldn't find any performance figures for
> comparison.  (All performance figures I found refer to 1Gbit
> or 10Gbit server type systems.)
> 
> What I'm interested in are some numbers for similar hardware,
> to find out if my hardware and/or ethernet driver can be improved,
> or if the CPU will always be the limiting factor.
> I'd also be interested to know if hardware checksumming
> support would improve throughput noticably in such a system,
> or if it is only useful for 1Gbit and above.

I have no recent experience with this sort of system, but checksum
offload and scatter/gather DMA support should significantly reduce both
CPU and memory bus load.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* 100Mbit ethernet performance on embedded devices
From: Johannes Stezenbach @ 2009-08-19 14:50 UTC (permalink / raw)
  To: linux-embedded; +Cc: netdev

Hi,

a while ago I was working on a SoC with 200MHz ARM926EJ-S CPU
and integrated 100Mbit ethernet core, connected on internal
(fast) memory bus, with DMA.  With iperf I measured:

  TCP RX ~70Mbit/sec  (iperf -s on SoC, iperf -c on desktop PC)
  TCP TX ~56Mbit/sec  (iperf -s on desktop PC, iperf -c on SoC)

The CPU load during the iperf test is around
1% user, 44% system, 4% irq, 48% softirq, with 7500 irqs/sec.

The kernel used in these measurements does not have iptables
support, I think packet filtering will slow it down noticeably,
but I didn't actually try.  The ethernet driver uses NAPI,
but it doesn't seem to be a win judging from the irq/sec number.
The kernel was an ancient 2.6.20.

I tried hard, but I couldn't find any performance figures for
comparison.  (All performance figures I found refer to 1Gbit
or 10Gbit server type systems.)

What I'm interested in are some numbers for similar hardware,
to find out if my hardware and/or ethernet driver can be improved,
or if the CPU will always be the limiting factor.
I'd also be interested to know if hardware checksumming
support would improve throughput noticeably in such a system,
or if it is only useful for 1Gbit and above.

Did anyone actually manage to get close to 100Mbit/sec
with similar CPU resources?

TIA,
Johannes

^ permalink raw reply

* Re: new ipdelay= option for faster netboot
From: Jamie Lokier @ 2009-08-19 11:57 UTC (permalink / raw)
  To: Tim Bird
  Cc: David Miller, r.schwebel, vda.linux, linux-kernel, linux-embedded,
	arjan, kernel, netdev
In-Reply-To: <4A8A06A0.2070402@am.sony.com>

Tim Bird wrote:
> David Miller wrote:
> > From: Tim Bird <tim.bird@am.sony.com>
> > Date: Mon, 17 Aug 2009 18:24:26 -0700
> >
> >> David Miller wrote:
> >>> I have card/switch combinations that take up to 10 seconds to
> >>> negotiate a proper link.
> >> What types of delays are these timeouts supposed to
> >> cover?
> >
> > The problem is that if you don't first give at least some time for the
> > link to come up, the remaining time it takes the link to come up will
> > end up chewing into the actual bootp/dhcp protocol timeouts.  And
> > that's what we're trying to avoid.
> 
> What link?  I'm not that familiar with networking.
> 
> Assuming I'm using ethernet, what link needs to come up?

When you plug an ethernet cable in, you may have noticed it takes a
short time before the signal light comes on.  That's negotiation time.
Some are slower than others, but none of them do it instantly.

> Is this something to do with power propagation to the
> physical wire?

Not really.

> Is there some MAC layer negotiation between the card and the switch?
> Is it the time for the switch to do speed detection?

Yes and yes.

> And, can any of this be more accurately determined
> or guessed-at with knowledge of the onboard hardware?
> Or is it dependent on external conditions?

It can be accurately determined with most cards (all modern ones)
because you get a notification when it's done, or you can poll the card.

That's why on the desktop it's able to detect when you plug in an
ethernet cable and start DHCP as soon as link negotiation is complete.

So the right thing to do, as David Miller suggested too, isn't a fixed
timeout.  It should wait for link state UP and then start DHCP
immediately.

-- Jamie

^ permalink raw reply

* Re: New fast(?)-boot results on ARM
From: Sascha Hauer @ 2009-08-19  7:21 UTC (permalink / raw)
  To: Dirk Behme
  Cc: Robert Schwebel, linux-kernel, linux-embedded, Arjan van de Ven,
	Tim Bird, kernel
In-Reply-To: <4A8AC95E.2040907@googlemail.com>

On Tue, Aug 18, 2009 at 05:31:42PM +0200, Dirk Behme wrote:
> Sascha Hauer wrote:
>> On Fri, Aug 14, 2009 at 07:02:28PM +0200, Robert Schwebel wrote:
>>> Hi,
>>>
>>> On Thu, Aug 13, 2009 at 05:33:26PM +0200, Robert Schwebel wrote:
>>>> On Thu, Aug 13, 2009 at 08:28:26AM -0700, Arjan van de Ven wrote:
>>>>>> That's bad :-) So there is no room for improvement any more in our
>>>>>> ARM boot sequences ...
>>>>> on x86 we're doing pretty well ;-)
>>>> On i.MX27 (400 MHz ARM926EJ-S) we currently need 7 s, measured from
>>>> power-on through the kernel up to "starting init". This is with
>>>>
>>>> - no delay in u-boot-v2
>>>> - rootfs on NAND (UBIFS)
>>>> - quiet
>>>> - precalculated loops-per-jiffy
>>>> - zImage kernel instead of uImage
>>> Here's a little video of our demo system booting:
>>> http://www.youtube.com/watch?v=xDbUnNsj0cI
>>>
>>> As you can see there, it needs about 15 s from the release of the reset button
>>> up to the moment where the application shows it's Qt 4.5.2 based GUI (which is
>>> when we fade over from the initial framebuffer to the final one, in order to
>>> hide the qt application startup noise).
>>>
>>> And below is the boot log (after turning "quiet" off again). The numbers are
>>> the timestamp and the delta to the last timestamp, measured on the controlling
>>> PC by looking at the serial console output. The ptx_ts script starts when the
>>> regexp was found, so the numbers start basically in the moment when u-boot-v2
>>> has initialized the system up to the point where we can see something.
>>>
>>> Result:
>>>
>>> - 2.4 s up from u-boot to the end of "Uncompressing Linux"
>>> - 300 ms until ubifs initialization starts
>>> - 3.7 s for ubifs, until "mounted root"
>>>
>>> So we basically have 7 s for the kernel. The rest is userspace, which hasn't
>>> seen much optimization yet, other than trying to start the GUI application as
>>> early as possible, while doing all other init stuff in parallel. Adding "quiet"
>>> brings us another 300 ms.
>>>
>>> That's factor 70 away from the 110 ms boot time Tim has talked about some days
>>> ago (and he measured on an ARM cpu which had almost half the speed of this
>>> one), and I'm wondering what we can do to improve the boot time.
>>>
>>> Robert
>>>
>>> rsc@thebe:~$ microcom | ptx_ts "U-Boot 2.0.0-rc9"
>>> [ 13.522625] <  0.043189>
>>> [ 13.546627] <  0.024002> OSELAS(R)-phyCORE-trunk (PTXdist-1.99.svn/2009-08-06T08:37:25+0200)
>>> [ 13.558613] <  0.011986>
>>> [ 13.690643] <  0.132030>        _            ____ ___  ____  _____
>>> [ 13.690731] <  0.000088>  _ __ | |__  _   _ / ___/ _ \|  _ \| ____|
>>> [ 13.698595] <  0.007864> | '_ \| '_ \| | | | |  | | | | |_) |  _|
>>> [ 13.698654] <  0.000059> | |_) | | | | |_| | |__| |_| |  _ <| |___
>>> [ 13.702581] <  0.003927> | .__/|_| |_|\__, |\____\___/|_| \_\_____|
>>> [ 13.706573] <  0.003992> |_|          |___/
>>> [ 13.706622] <  0.000049>
>>> [ 13.725043] <  0.018421>
>>> [ 14.742608] <  1.017565>
>>
>> I made some changes suggested in this thread:
>>
>> - enable MMU in the bootloader
>> - use assembler optimized memcpy/memset in the bootloader
>> - start an uncompressed image
>> - disable IP autoconfiguration in the Kernel
>> - use lpj= command line parameter
>> - use static device nodes instead of udev
>> - skip some init scripts
>> - made the kernel smaller (I do not have both configs handy, so I do not
>>   know what exactly I changed)
>>
>> Already looks much better:
>>
>> [  0.000005] <  0.000005> U-Boot 2.0.0-rc10-00241-g3f10fe9-dirty (Aug 18 2009 - 13:29:25)
>> [  0.000026] <  0.000021>
>> [  0.000041] <  0.000015> Board: Phytec phyCORE-i.MX27
>> [  0.000054] <  0.000013> cfi_probe: cfi_flash base: 0xc0000000 size: 0x02000000
>> [  0.000067] <  0.000013> NAND device: Manufacturer ID: 0x20, Chip ID: 0x36 (ST Micro NAND 64MiB 1,8V 8-bit)
>> [  0.000080] <  0.000013> imxfb@imxfb0: i.MX Framebuffer driver
>> [  0.000092] <  0.000012> dma_alloc: 0xa6f56e40 0x10000000
>> [  0.000105] <  0.000013> dma_alloc: 0xa6f57088 0x10000000
>> [  0.000118] <  0.000013> dev_protect: currently broken
>> [  0.000129] <  0.000011> Using environment in NOR Flash
>> [  0.000141] <  0.000012> initialising PLLs
>> [  0.128972] <  0.128831> Malloc space: 0xa6f00000 -> 0xa7f00000 (size 16 MB)
>> [  0.128995] <  0.000023> Stack space : 0xa6ef8000 -> 0xa6f00000 (size 32 kB)
>> [  0.129008] <  0.000013> running /env/bin/init...
>> [  0.224963] <  0.095955>
>> [  0.224984] <  0.000021> Hit any key to stop autoboot:  0
>> [  0.224999] <  0.000015> copy
>> [  0.592964] <  0.367965> done
>> [  0.652010] <  0.059046> Linux version 2.6.31-rc4-00004-g05786f8-dirty (sha@octopus) (gcc version 4.3.2 (OSELAS.Toolchain-1.99.3) ) #206 PREEMPT Tue Aug 18 14:08:51 CEST 2009
>
> So, this are ~0.6 s in boot loader and kernel copy until kernel starts, 
> correct?

Yes, correct. The copying itself is between 'copy' and 'done' so it
takes about 0.4s.

>
> What's the size of the uncompressed kernel copied here?

The image is about 2.8MB, but I copied the whole partition of 3MB
because with raw images you can't detect the image size.

>
> Btw.: I tried to summarize some hints given in this thread in
>
> http://elinux.org/Boot_Time#Boot_time_check_list

Nice work!

Regards
  Sascha

-- 
Pengutronix e.K.                           |                             |
Industrial Linux Solutions                 | http://www.pengutronix.de/  |
Peiner Str. 6-8, 31137 Hildesheim, Germany | Phone: +49-5121-206917-0    |
Amtsgericht Hildesheim, HRA 2686           | Fax:   +49-5121-206917-5555 |

^ permalink raw reply

* Re: architecture-independent I/o accessors
From: Arnd Bergmann @ 2009-08-18 21:37 UTC (permalink / raw)
  To: Wolfgang Denk; +Cc: linux-embedded
In-Reply-To: <20090818210701.84125833DBD2@gemini.denx.de>

On Tuesday 18 August 2009 21:07:01 Wolfgang Denk wrote:
> Dear Arnd,
> 
> Josh Boyer suggested you might provide some insight...
> 
> I'm currently looking for a solution how to provide architecture
> independent I/O accessor functions to U-Boot. In the past, lots of
> code used direct pointer accesses, relying on the idea that "volatile"
> would be sufficient to convince the compiler and the hardware to do
> what was expected; some architectures (like ARM and others) used
> readl() / writel(), while others (like PPC) used in_8, in_le16,
> in_be16, in_le32, in_be32, in_le64, in_be64 etc.
> 
> As we like to borrow code from Linux, I'm trying to find out what the
> big plan for Linux is.
> 
> My understanding is that in Linux the ioreadX() / iowriteX() /
> ioreadXbe() / iowriteXbe() functions are supposed to provide
> architecture independent I/O accessors, and that the plain ioreadX()
> / iowriteX() functions (without the "be") are always guaranteed to be
> little-endian on all architectures, while the "be" functions are,
> well, big-endian.  Is this understanding correct?

yes. Also, these functions are defined so that you can use them
both for memory mapped I/O *and* for programmed I/O (aka inl/outl).

> If yes, does that mean that in the future we will see more Linux code
> using ioreadX[be]() / iowriteX[be]()? So far I did not find much
> hints that support this aproach - only memory-barriers.txt has only a
> short sentence about these functions, with basicly no explanation.

The most common ones are readl/writel, simply because they are better
known. For devices that only have memory mapped I/O, they are
by definition equivalent to ioread32/iowrite32.

The SATA drivers and others use ioread32/iowrite32 because that
lets the driver ignore the difference between PIO and MMIO.

> What I liked from the in_[le]X() / out_[le]X() accessors on PPC was
> that they allowed for type checking - the compiler would raise a
> warning when you used in_[le]16() to read from a 32 bit wide register.
> However, ioreadX[be]() / iowriteX[be]() use a void * "iomem cookie",
> so no type checking can be done.

Hmm, interesting. I was never aware of that difference. We should
probably change that in the kernel, to add type checking to all
of them.

Another difference on powerpc is that in_le32/out_le32 cannot
be used on PCI devices but only on SoC devices, because legacy iSeries
and pSeries need some additional magic for PCI accesses.

> Basicly I have two questions:
> 
> 1) Can you make a statement which direction Linux is heading to?
>    Will more (new) code use ioreadX() / iowriteX()?

New subsystems will often use ioreadX/iowriteX by default, but
I expect existing code to keep using readl/writel and new drivers
will also keep using it.

> 2) What would be your recommendation what we should do in U-Boot?
>    Provide for all architectures in_8, in_le16, in_be16, in_le32,
>    in_be32, in_le64, in_be64 etc. similar to what we have for the
>    Power architecture, well knowing that Linux will not follow that
>    route, or use ioreadX[be]() / iowriteX[be]() which does not provide
>    type checking, and which eventually does not find wider use in
>    Linux either? Or even something else - like ioreadX[be]() /
>    iowriteX[be]() with type checking added?

I think ioread32/iowrite32 and friends with type checking would
be the easiest. It would be nice to try adding type checking to
the kernel, just to see what breaks ;-)

	Arnd <><

^ permalink raw reply

* architecture-independent I/o accessors
From: Wolfgang Denk @ 2009-08-18 21:07 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: linux-embedded

Dear Arnd,

Josh Boyer suggested you might provide some insight...

I'm currently looking for a solution how to provide architecture
independent I/O accessor functions to U-Boot. In the past, lots of
code used direct pointer accesses, relying on the idea that "volatile"
would be sufficient to convince the compiler and the hardware to do
what was expected; some architectures (like ARM and others) used
readl() / writel(), while others (like PPC) used in_8, in_le16,
in_be16, in_le32, in_be32, in_le64, in_be64 etc.

As we like to borrow code from Linux, I'm trying to find out what the
big plan for Linux is.

My understanding is that in Linux the ioreadX() / iowriteX() /
ioreadXbe() / iowriteXbe() functions are supposed to provide
architecture independent I/O accessors, and that the plain ioreadX()
/ iowriteX() functions (without the "be") are always guaranteed to be
little-endian on all architectures, while the "be" functions are,
well, big-endian.  Is this understanding correct?

If yes, does that mean that in the future we will see more Linux code
using ioreadX[be]() / iowriteX[be]()? So far I did not find many
hints that support this approach - memory-barriers.txt has only a
short sentence about these functions, with basically no explanation.

What I liked from the in_[le]X() / out_[le]X() accessors on PPC was
that they allowed for type checking - the compiler would raise a
warning when you used in_[le]16() to read from a 32 bit wide register.
However, ioreadX[be]() / iowriteX[be]() use a void * "iomem cookie",
so no type checking can be done.

Basically I have two questions:

1) Can you make a statement which direction Linux is heading to?
   Will more (new) code use ioreadX() / iowriteX()?

2) What would be your recommendation what we should do in U-Boot?
   Provide for all architectures in_8, in_le16, in_be16, in_le32,
   in_be32, in_le64, in_be64 etc. similar to what we have for the
   Power architecture, well knowing that Linux will not follow that
   route, or use ioreadX[be]() / iowriteX[be]() which does not provide
   type checking, and which eventually does not find wider use in
   Linux either? Or even something else - like ioreadX[be]() /
   iowriteX[be]() with type checking added?

Thanks in advance.

Best regards,

Wolfgang Denk

-- 
DENX Software Engineering GmbH,     MD: Wolfgang Denk & Detlev Zundel
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany
Phone: (+49)-8142-66989-10 Fax: (+49)-8142-66989-80 Email: wd@denx.de
The ideal situation is to have massive computing power right at home.
Something that dims the streetlights and shrinks the picture  on  the
neighbours' TVs when you boot it up.

^ permalink raw reply

* Re: New fast(?)-boot results on ARM
From: Tim Bird @ 2009-08-18 18:23 UTC (permalink / raw)
  To: Dirk Behme
  Cc: Sascha Hauer, Robert Schwebel, linux-kernel, linux-embedded,
	Arjan van de Ven, kernel
In-Reply-To: <4A8AC95E.2040907@googlemail.com>

Dirk Behme wrote
> Btw.: I tried to summarize some hints given in this thread in
> 
> http://elinux.org/Boot_Time#Boot_time_check_list
> 
> Please feel free to add and correct stuff!

That's a great summary of the points raised in the discussion.
It's good to organize the information and save it in an
easy-to-read format.

Thanks very much for doing that!
 -- Tim

=============================
Tim Bird
Architecture Group Chair, CE Linux Forum
Senior Staff Engineer, Sony Corporation of America
=============================

^ permalink raw reply

* Re: New fast(?)-boot results on ARM
From: Marco Stornelli @ 2009-08-18 16:34 UTC (permalink / raw)
  To: Dirk Behme
  Cc: Sascha Hauer, Robert Schwebel, linux-kernel, linux-embedded,
	Arjan van de Ven, Tim Bird, kernel
In-Reply-To: <4A8AC95E.2040907@googlemail.com>

Dirk Behme wrote:
> Sascha Hauer wrote:
>> On Fri, Aug 14, 2009 at 07:02:28PM +0200, Robert Schwebel wrote:
>>> Hi,
>>>
>>> On Thu, Aug 13, 2009 at 05:33:26PM +0200, Robert Schwebel wrote:
>>>> On Thu, Aug 13, 2009 at 08:28:26AM -0700, Arjan van de Ven wrote:
>>>>>> That's bad :-) So there is no room for improvement any more in our
>>>>>> ARM boot sequences ...
>>>>> on x86 we're doing pretty well ;-)
>>>> On i.MX27 (400 MHz ARM926EJ-S) we currently need 7 s, measured from
>>>> power-on through the kernel up to "starting init". This is with
>>>>
>>>> - no delay in u-boot-v2
>>>> - rootfs on NAND (UBIFS)
>>>> - quiet
>>>> - precalculated loops-per-jiffy
>>>> - zImage kernel instead of uImage
>>> Here's a little video of our demo system booting:
>>> http://www.youtube.com/watch?v=xDbUnNsj0cI
>>>
>>> As you can see there, it needs about 15 s from the release of the
>>> reset button
>>> up to the moment where the application shows it's Qt 4.5.2 based GUI
>>> (which is
>>> when we fade over from the initial framebuffer to the final one, in
>>> order to
>>> hide the qt application startup noise).
>>>
>>> And below is the boot log (after turning "quiet" off again). The
>>> numbers are
>>> the timestamp and the delta to the last timestamp, measured on the
>>> controlling
>>> PC by looking at the serial console output. The ptx_ts script starts
>>> when the
>>> regexp was found, so the numbers start basically in the moment when
>>> u-boot-v2
>>> has initialized the system up to the point where we can see something.
>>>
>>> Result:
>>>
>>> - 2.4 s up from u-boot to the end of "Uncompressing Linux"
>>> - 300 ms until ubifs initialization starts
>>> - 3.7 s for ubifs, until "mounted root"
>>>
>>> So we basically have 7 s for the kernel. The rest is userspace, which
>>> hasn't
>>> seen much optimization yet, other than trying to start the GUI
>>> application as
>>> early as possible, while doing all other init stuff in parallel.
>>> Adding "quiet"
>>> brings us another 300 ms.
>>>
>>> That's factor 70 away from the 110 ms boot time Tim has talked about
>>> some days
>>> ago (and he measured on an ARM cpu which had almost half the speed of
>>> this
>>> one), and I'm wondering what we can do to improve the boot time.
>>>
>>> Robert
>>>
>>> rsc@thebe:~$ microcom | ptx_ts "U-Boot 2.0.0-rc9"
>>> [ 13.522625] <  0.043189>
>>> [ 13.546627] <  0.024002> OSELAS(R)-phyCORE-trunk
>>> (PTXdist-1.99.svn/2009-08-06T08:37:25+0200)
>>> [ 13.558613] <  0.011986>
>>> [ 13.690643] <  0.132030>        _            ____ ___  ____  _____
>>> [ 13.690731] <  0.000088>  _ __ | |__  _   _ / ___/ _ \|  _ \| ____|
>>> [ 13.698595] <  0.007864> | '_ \| '_ \| | | | |  | | | | |_) |  _|
>>> [ 13.698654] <  0.000059> | |_) | | | | |_| | |__| |_| |  _ <| |___
>>> [ 13.702581] <  0.003927> | .__/|_| |_|\__, |\____\___/|_| \_\_____|
>>> [ 13.706573] <  0.003992> |_|          |___/
>>> [ 13.706622] <  0.000049>
>>> [ 13.725043] <  0.018421>
>>> [ 14.742608] <  1.017565>
>>
>> I made some changes suggested in this thread:
>>
>> - enable MMU in the bootloader
>> - use assembler optimized memcpy/memset in the bootloader
>> - start an uncompressed image
>> - disable IP autoconfiguration in the Kernel
>> - use lpj= command line parameter
>> - use static device nodes instead of udev
>> - skip some init scripts
>> - made the kernel smaller (I do not have both configs handy, so I do not
>>   know what exactly I changed)
>>
>> Already looks much better:
>>
>> [  0.000005] <  0.000005> U-Boot 2.0.0-rc10-00241-g3f10fe9-dirty (Aug
>> 18 2009 - 13:29:25)
>> [  0.000026] <  0.000021>
>> [  0.000041] <  0.000015> Board: Phytec phyCORE-i.MX27
>> [  0.000054] <  0.000013> cfi_probe: cfi_flash base: 0xc0000000 size:
>> 0x02000000
>> [  0.000067] <  0.000013> NAND device: Manufacturer ID: 0x20, Chip ID:
>> 0x36 (ST Micro NAND 64MiB 1,8V 8-bit)
>> [  0.000080] <  0.000013> imxfb@imxfb0: i.MX Framebuffer driver
>> [  0.000092] <  0.000012> dma_alloc: 0xa6f56e40 0x10000000
>> [  0.000105] <  0.000013> dma_alloc: 0xa6f57088 0x10000000
>> [  0.000118] <  0.000013> dev_protect: currently broken
>> [  0.000129] <  0.000011> Using environment in NOR Flash
>> [  0.000141] <  0.000012> initialising PLLs
>> [  0.128972] <  0.128831> Malloc space: 0xa6f00000 -> 0xa7f00000 (size
>> 16 MB)
>> [  0.128995] <  0.000023> Stack space : 0xa6ef8000 -> 0xa6f00000 (size
>> 32 kB)
>> [  0.129008] <  0.000013> running /env/bin/init...
>> [  0.224963] <  0.095955>
>> [  0.224984] <  0.000021> Hit any key to stop autoboot:  0
>> [  0.224999] <  0.000015> copy
>> [  0.592964] <  0.367965> done
>> [  0.652010] <  0.059046> Linux version
>> 2.6.31-rc4-00004-g05786f8-dirty (sha@octopus) (gcc version 4.3.2
>> (OSELAS.Toolchain-1.99.3) ) #206 PREEMPT Tue Aug 18 14:08:51 CEST 2009
> 
> So, this are ~0.6 s in boot loader and kernel copy until kernel starts,
> correct?
> 
> What's the size of the uncompressed kernel copied here?
> 
> Best regards
> 
> Dirk
> 
> Btw.: I tried to summarize some hints given in this thread in
> 
> http://elinux.org/Boot_Time#Boot_time_check_list
> 
> Please feel free to add and correct stuff!
> 

It's good documentation, good work. Going from 14s to 5s is, I think, a
very good result. In reference to Robert's previous response, I think
that it's a good thing to use a vanilla kernel and avoid strange,
specific or immature solutions, but one needs to use the "right" tool
for the "right" platform. SquashFS is in mainline, mdev is part of
busybox and it's used in several projects. You cannot expect to have a
normal desktop; imho some tools and some solutions must be very
specific, it's the embedded world. However, your problems are very common
in the production environment.

Marco

^ permalink raw reply

* Re: New fast(?)-boot results on ARM
From: Dirk Behme @ 2009-08-18 15:31 UTC (permalink / raw)
  To: Sascha Hauer
  Cc: Robert Schwebel, linux-kernel, linux-embedded, Arjan van de Ven,
	Tim Bird, kernel
In-Reply-To: <20090818140605.GB9943@pengutronix.de>

Sascha Hauer wrote:
> On Fri, Aug 14, 2009 at 07:02:28PM +0200, Robert Schwebel wrote:
>> Hi,
>>
>> On Thu, Aug 13, 2009 at 05:33:26PM +0200, Robert Schwebel wrote:
>>> On Thu, Aug 13, 2009 at 08:28:26AM -0700, Arjan van de Ven wrote:
>>>>> That's bad :-) So there is no room for improvement any more in our
>>>>> ARM boot sequences ...
>>>> on x86 we're doing pretty well ;-)
>>> On i.MX27 (400 MHz ARM926EJ-S) we currently need 7 s, measured from
>>> power-on through the kernel up to "starting init". This is with
>>>
>>> - no delay in u-boot-v2
>>> - rootfs on NAND (UBIFS)
>>> - quiet
>>> - precalculated loops-per-jiffy
>>> - zImage kernel instead of uImage
>> Here's a little video of our demo system booting:
>> http://www.youtube.com/watch?v=xDbUnNsj0cI
>>
>> As you can see there, it needs about 15 s from the release of the reset button
>> up to the moment where the application shows it's Qt 4.5.2 based GUI (which is
>> when we fade over from the initial framebuffer to the final one, in order to
>> hide the qt application startup noise).
>>
>> And below is the boot log (after turning "quiet" off again). The numbers are
>> the timestamp and the delta to the last timestamp, measured on the controlling
>> PC by looking at the serial console output. The ptx_ts script starts when the
>> regexp was found, so the numbers start basically in the moment when u-boot-v2
>> has initialized the system up to the point where we can see something.
>>
>> Result:
>>
>> - 2.4 s up from u-boot to the end of "Uncompressing Linux"
>> - 300 ms until ubifs initialization starts
>> - 3.7 s for ubifs, until "mounted root"
>>
>> So we basically have 7 s for the kernel. The rest is userspace, which hasn't
>> seen much optimization yet, other than trying to start the GUI application as
>> early as possible, while doing all other init stuff in parallel. Adding "quiet"
>> brings us another 300 ms.
>>
>> That's factor 70 away from the 110 ms boot time Tim has talked about some days
>> ago (and he measured on an ARM cpu which had almost half the speed of this
>> one), and I'm wondering what we can do to improve the boot time.
>>
>> Robert
>>
>> rsc@thebe:~$ microcom | ptx_ts "U-Boot 2.0.0-rc9"
>> [ 13.522625] <  0.043189>
>> [ 13.546627] <  0.024002> OSELAS(R)-phyCORE-trunk (PTXdist-1.99.svn/2009-08-06T08:37:25+0200)
>> [ 13.558613] <  0.011986>
>> [ 13.690643] <  0.132030>        _            ____ ___  ____  _____
>> [ 13.690731] <  0.000088>  _ __ | |__  _   _ / ___/ _ \|  _ \| ____|
>> [ 13.698595] <  0.007864> | '_ \| '_ \| | | | |  | | | | |_) |  _|
>> [ 13.698654] <  0.000059> | |_) | | | | |_| | |__| |_| |  _ <| |___
>> [ 13.702581] <  0.003927> | .__/|_| |_|\__, |\____\___/|_| \_\_____|
>> [ 13.706573] <  0.003992> |_|          |___/
>> [ 13.706622] <  0.000049>
>> [ 13.725043] <  0.018421>
>> [ 14.742608] <  1.017565>
> 
> I made some changes suggested in this thread:
> 
> - enable MMU in the bootloader
> - use assembler optimized memcpy/memset in the bootloader
> - start an uncompressed image
> - disable IP autoconfiguration in the Kernel
> - use lpj= command line parameter
> - use static device nodes instead of udev
> - skip some init scripts
> - made the kernel smaller (I do not have both configs handy, so I do not
>   know what exactly I changed)
> 
> Already looks much better:
> 
> [  0.000005] <  0.000005> U-Boot 2.0.0-rc10-00241-g3f10fe9-dirty (Aug 18 2009 - 13:29:25)
> [  0.000026] <  0.000021>
> [  0.000041] <  0.000015> Board: Phytec phyCORE-i.MX27
> [  0.000054] <  0.000013> cfi_probe: cfi_flash base: 0xc0000000 size: 0x02000000
> [  0.000067] <  0.000013> NAND device: Manufacturer ID: 0x20, Chip ID: 0x36 (ST Micro NAND 64MiB 1,8V 8-bit)
> [  0.000080] <  0.000013> imxfb@imxfb0: i.MX Framebuffer driver
> [  0.000092] <  0.000012> dma_alloc: 0xa6f56e40 0x10000000
> [  0.000105] <  0.000013> dma_alloc: 0xa6f57088 0x10000000
> [  0.000118] <  0.000013> dev_protect: currently broken
> [  0.000129] <  0.000011> Using environment in NOR Flash
> [  0.000141] <  0.000012> initialising PLLs
> [  0.128972] <  0.128831> Malloc space: 0xa6f00000 -> 0xa7f00000 (size 16 MB)
> [  0.128995] <  0.000023> Stack space : 0xa6ef8000 -> 0xa6f00000 (size 32 kB)
> [  0.129008] <  0.000013> running /env/bin/init...
> [  0.224963] <  0.095955>
> [  0.224984] <  0.000021> Hit any key to stop autoboot:  0
> [  0.224999] <  0.000015> copy
> [  0.592964] <  0.367965> done
> [  0.652010] <  0.059046> Linux version 2.6.31-rc4-00004-g05786f8-dirty (sha@octopus) (gcc version 4.3.2 (OSELAS.Toolchain-1.99.3) ) #206 PREEMPT Tue Aug 18 14:08:51 CEST 2009

So, this is ~0.6 s in the boot loader and kernel copy until the kernel
starts, correct?

What's the size of the uncompressed kernel copied here?

Best regards

Dirk

Btw.: I tried to summarize some hints given in this thread in

http://elinux.org/Boot_Time#Boot_time_check_list

Please feel free to add and correct stuff!

> [  0.652030] <  0.000020> CPU: ARM926EJ-S [41069264] revision 4 (ARMv5TEJ), cr=00053177
> [  0.652044] <  0.000014> CPU: VIVT data cache, VIVT instruction cache
> [  0.652057] <  0.000013> Machine: phyCORE-i.MX27
> [  0.652069] <  0.000012> Memory policy: ECC disabled, Data cache writeback
> [  0.652082] <  0.000013> Built 1 zonelists in Zone order, mobility grouping on.  Total pages: 32512
> [  0.706012] <  0.053930> Kernel command line: console=ttymxc0,115200 earlyprintk lpj=995328 mt9v022.sensor_type=color ip=192.168.23.197:192.168.23.2:192.168.23.2:255.255.0.0::: ubi.mtd=7 root=ubi0:root rootfstype=ubifs mtdparts="physmap-flash.0:256k(uboot)ro,128k(ubootenv),3M(kernel),-(root);mxc_nand:256k(uboot)ro,128k(ubootenv),3M(kernel),-(root)"
> [  0.706034] <  0.000022> console [earlyser0] enabled
> [  0.706049] <  0.000015> Unknown boot option `mt9v022.sensor_type=color': ignoring
> [  0.706062] <  0.000013> PID hash table entries: 512 (order: 9, 2048 bytes)
> [  0.706075] <  0.000013> Dentry cache hash table entries: 16384 (order: 4, 65536 bytes)
> [  0.706087] <  0.000012> Inode-cache hash table entries: 8192 (order: 3, 32768 bytes)
> [  0.755997] <  0.049910> Memory: 128MB = 128MB total
> [  0.756016] <  0.000019> Memory: 127004KB available (2404K code, 205K data, 80K init, 0K highmem)
> [  0.756030] <  0.000014> NR_IRQS:272
> [  0.756042] <  0.000012> MXC GPIO hardware
> [  0.756055] <  0.000013> MXC IRQ initialized
> [  0.756067] <  0.000012> Console: colour dummy device 80x30
> [  0.756079] <  0.000012> Calibrating delay loop (skipped) preset value.. 199.06 BogoMIPS (lpj=995328)
> [  0.756092] <  0.000013> Mount-cache hash table entries: 512
> [  0.756104] <  0.000012> CPU: Testing write buffer coherency: ok
> [  0.771968] <  0.015864> NET: Registered protocol family 16
> [  0.803967] <  0.031999> bio: create slab <bio-0> at 0
> [  0.869007] <  0.065040> NET: Registered protocol family 2
> [  0.869025] <  0.000018> IP route cache hash table entries: 1024 (order: 0, 4096 bytes)
> [  0.869040] <  0.000015> TCP established hash table entries: 4096 (order: 3, 32768 bytes)
> [  0.869053] <  0.000013> TCP bind hash table entries: 4096 (order: 2, 16384 bytes)
> [  0.869066] <  0.000013> TCP: Hash tables configured (established 4096 bind 4096)
> [  0.869078] <  0.000012> TCP reno registered
> [  0.869090] <  0.000012> NET: Registered protocol family 1
> [  0.869103] <  0.000013> msgmni has been set to 248
> [  0.869115] <  0.000012> io scheduler noop registered (default)
> [  0.869127] <  0.000012> i.MX Framebuffer driver
> [  0.884970] <  0.015843> Console: switching to colour frame buffer device 30x40
> [  0.974022] <  0.089052> Serial: IMX driver
> [  0.974127] <  0.000105> Platform driver 'imx-uart' needs updating - please use dev_pm_ops
> [  0.974217] <  0.000090> imx-uart.0: ttymxc0 at MMIO 0x1000a000 (irq = 20) is a IMX
> [  0.974306] <  0.000089> console handover: boot [earlyser0] -> real [ttymxc0]
> [  0.974392] <  0.000086> imx-uart.1: ttymxc1 at MMIO 0x1000b000 (irq = 19) is a IMX
> [  0.974481] <  0.000089> imx-uart.2: ttymxc2 at MMIO 0x1000c000 (irq = 18) is a IMX
> [  0.974569] <  0.000088> FEC Ethernet Driver
> [  0.974651] <  0.000082> Platform driver 'fec' needs updating - please use dev_pm_ops
> [  0.974737] <  0.000086> fec: PHY @ 0x0, ID 0x00221613 -- KS8721BL
> [  1.019018] <  0.044281> physmap platform flash device: 02000000 at c0000000
> [  1.019118] <  0.000100> physmap-flash.0: Found 1 x16 devices at 0x0 in 16-bit bank
> [  1.019207] <  0.000089>  Intel/Sharp Extended Query Table at 0x010A
> [  1.019293] <  0.000086>  Intel/Sharp Extended Query Table at 0x010A
> [  1.019377] <  0.000084>  Intel/Sharp Extended Query Table at 0x010A
> [  1.019460] <  0.000083>  Intel/Sharp Extended Query Table at 0x010A
> [  1.019544] <  0.000084>  Intel/Sharp Extended Query Table at 0x010A
> [  1.019627] <  0.000083> Using buffer write method
> [  1.019714] <  0.000087> Using auto-unlock on power-up/resume
> [  1.019797] <  0.000083> cfi_cmdset_0001: Erase suspend on write enabled
> [  1.019881] <  0.000084> 4 cmdlinepart partitions found on MTD device physmap-flash.0
> [  1.082018] <  0.062137> Creating 4 MTD partitions on "physmap-flash.0":
> [  1.082112] <  0.000094> 0x000000000000-0x000000040000 : "uboot"
> [  1.082199] <  0.000087> 0x000000040000-0x000000060000 : "ubootenv"
> [  1.082287] <  0.000088> 0x000000060000-0x000000360000 : "kernel"
> [  1.082371] <  0.000084> 0x000000360000-0x000002000000 : "root"
> [  1.082453] <  0.000082> NAND device: Manufacturer ID: 0x20, Chip ID: 0x36 (ST Micro NAND 64MiB 1,8V 8-bit)
> [  1.082543] <  0.000090> RedBoot partition parsing not available
> [  1.082627] <  0.000084> 4 cmdlinepart partitions found on MTD device mxc_nand
> [  1.082715] <  0.000088> Creating 4 MTD partitions on "mxc_nand":
> [  1.082798] <  0.000083> 0x000000000000-0x000000040000 : "uboot"
> [  1.082882] <  0.000084> 0x000000040000-0x000000060000 : "ubootenv"
> [  1.097976] <  0.015094> 0x000000060000-0x000000360000 : "kernel"
> [  1.113978] <  0.016002> 0x000000360000-0x000004000000 : "root"
> [  1.425012] <  0.311034> UBI: attaching mtd7 to ubi0
> [  1.425043] <  0.000031> UBI: physical eraseblock size:   16384 bytes (16 KiB)
> [  1.425057] <  0.000014> UBI: logical eraseblock size:    15360 bytes
> [  1.425071] <  0.000014> UBI: smallest flash I/O unit:    512
> [  1.425083] <  0.000012> UBI: VID header offset:          512 (aligned 512)
> [  1.425096] <  0.000013> UBI: data offset:                1024
> [  3.008058] <  1.582962> UBI: attached mtd7 to ubi0
> [  3.008090] <  0.000032> UBI: MTD device name:            "root"
> [  3.008105] <  0.000015> UBI: MTD device size:            60 MiB
> [  3.008119] <  0.000014> UBI: number of good PEBs:        3880
> [  3.008132] <  0.000013> UBI: number of bad PEBs:         0
> [  3.008145] <  0.000013> UBI: max. allowed volumes:       89
> [  3.008159] <  0.000014> UBI: wear-leveling threshold:    4096
> [  3.008172] <  0.000013> UBI: number of internal volumes: 1
> [  3.008185] <  0.000013> UBI: number of user volumes:     1
> [  3.008199] <  0.000014> UBI: available PEBs:             0
> [  3.008212] <  0.000013> UBI: total number of reserved PEBs: 3880
> [  3.008226] <  0.000014> UBI: number of PEBs reserved for bad PEB handling: 38
> [  3.051029] <  0.042803> UBI: max/mean erase counter: 2/0
> [  3.051052] <  0.000023> UBI: image sequence number: 0
> [  3.051066] <  0.000014> UBI: background thread "ubi_bgt0d" started, PID 218
> [  3.051081] <  0.000015> i2c /dev entries driver
> [  3.051094] <  0.000013> rtc-pcf8563 1-0051: chip found, driver version 0.4.3
> [  3.051108] <  0.000014> rtc-pcf8563 1-0051: rtc core: registered rtc-pcf8563 as rtc0
> [  3.051122] <  0.000014> Driver for 1-wire Dallas network protocol.
> [  3.148042] <  0.096920> i.MX SDHC driver
> [  3.148067] <  0.000025> mxc-mmc: probe of mxc-mmc.1 failed with error -16
> [  3.148082] <  0.000015> TCP cubic registered
> [  3.148095] <  0.000013> NET: Registered protocol family 17
> [  3.148107] <  0.000012> RPC: Registered udp transport module.
> [  3.148119] <  0.000012> RPC: Registered tcp transport module.
> [  3.148132] <  0.000013> rtc-pcf8563 1-0051: low voltage detected, date/time is not reliable.
> [  3.148145] <  0.000013> rtc-pcf8563 1-0051: retrieved date/time is not valid.
> [  3.148157] <  0.000012> rtc-pcf8563 1-0051: hctosys: invalid date/time
> [  3.148170] <  0.000013> UBIFS: recovery needed
> [  3.211043] <  0.062873> UBIFS: recovery completed
> [  3.211064] <  0.000021> UBIFS: mounted UBI device 0, volume 1, name "root"
> [  3.211080] <  0.000016> UBIFS: file system size:   58490880 bytes (57120 KiB, 55 MiB, 3808 LEBs)
> [  3.211093] <  0.000013> UBIFS: journal size:       7741440 bytes (7560 KiB, 7 MiB, 504 LEBs)
> [  3.211105] <  0.000012> UBIFS: media format:       w4/r0 (latest is w4/r0)
> [  3.211118] <  0.000013> UBIFS: default compressor: lzo
> [  3.211130] <  0.000012> UBIFS: reserved for root:  0 bytes (0 KiB)
> [  3.211143] <  0.000013> VFS: Mounted root (ubifs filesystem) on device 0:12.
> [  3.211155] <  0.000012> Freeing init memory: 80K
> init started: BusyBox v1.13.4 (2009-08-06 08:30:14 CEST)
> [  3.514007] <  0.159993> mounting filesystems...done.
> [  3.546005] <  0.031998> running rc.d services...
> [  3.626007] <  0.080002> syslogd starting
> [  3.786013] <  0.160006> Starting telnetd...
> [  3.962014] <  0.176001> starting network interfaces...
> [  4.818032] <  0.856018> eth0: config: auto-negotiation on, 100FDX, 100HDX, 10FDX, 10HDX.
> [  5.058038] <  0.240006> ip: cannot find device "can0"
> [  5.250040] <  0.192002> ip: SIOCGIFFLAGS: No such device
> [  5.298033] <  0.047993>
> [  5.336039] <  0.038006> OSELAS(R)-phyCORE-trunk (PTXdist-1.99.svn/2009-08-06T08:37:25+0200)
> [  5.368028] <  0.031989>
> [  5.840066] <  0.472038>        _            ____ ___  ____  _____
> [  5.840090] <  0.000024>  _ __ | |__  _   _ / ___/ _ \|  _ \| ____|
> [  5.840104] <  0.000014> | '_ \| '_ \| | | | |  | | | | |_) |  _|
> [  5.840116] <  0.000012> | |_) | | | | |_| | |__| |_| |  _ <| |___
> [  5.840129] <  0.000013> | .__/|_| |_|\__, |\____\___/|_| \_\_____|
> [  5.840141] <  0.000012> |_|          |___/
> [  5.840154] <  0.000013>
> 
> Sascha
> 

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox