* Re: [PATCH 2/2] ARM: QQ2440 networking support
From: Domenico Andreoli @ 2011-02-03 10:34 UTC (permalink / raw)
To: Jamie Iles; +Cc: netdev, Kukjin Kim, Russell King, Ben Dooks, linux-arm-kernel
In-Reply-To: <20110203094123.GB3141@pulham.picochip.com>
On Thu, Feb 03, 2011 at 09:41:23AM +0000, Jamie Iles wrote:
> Hi Domenico,
Hi Jamie,
> This should probably also go to the netdev mailing list:
> netdev@vger.kernel.org. A couple of other minor comments inline.
CCed them
> On Wed, Feb 02, 2011 at 10:06:37PM +0000, Domenico Andreoli wrote:
> > From: Domenico Andreoli <cavokz@gmail.com>
> >
> > Add networking support for QQ2440.
> >
> > Signed-off-by: Domenico Andreoli <cavokz@gmail.com>
> >
> > ---
> > arch/arm/mach-s3c2440/include/mach/qq2440.h | 22 ++++++++++++
> > arch/arm/mach-s3c2440/mach-qq2440.c | 16 ++++++++
> > drivers/net/Kconfig | 4 +-
> > drivers/net/cs89x0.c | 33 ++++++++++++------
> > 4 files changed, 61 insertions(+), 14 deletions(-)
> >
> > Index: arm-2.6.git/arch/arm/mach-s3c2440/include/mach/qq2440.h
> > ===================================================================
> > --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> > +++ arm-2.6.git/arch/arm/mach-s3c2440/include/mach/qq2440.h 2011-02-02 18:32:38.000000000 +0000
> > @@ -0,0 +1,22 @@
> > +/*
> > + * arch/arm/mach-s3c2440/include/mach/qq2440.h
> > + *
> > + * Copyright (c) 2011 Domenico Andreoli <cavokz@gmail.com>
> > + *
> > + * QQ2440 - platform definitions
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License version 2 as
> > + * published by the Free Software Foundation.
> > +*/
> > +
> > +#ifndef __ASM_MACH_QQ2440_H
> > +#define __ASM_MACH_QQ2440_H
> > +
> > +#define QQ2440_CS8900_IRQ IRQ_EINT9
> > +
> > +#define QQ2440_CS8900_VIRT_BASE S3C_ADDR(0x00500000)
> > +#define QQ2440_CS8900_PA (S3C2410_CS3 + 0x1000000)
> > +#define QQ2440_CS8900_SZ SZ_1M
> > +
> > +#endif /* __ASM_MACH_QQ2440_H */
> > Index: arm-2.6.git/drivers/net/cs89x0.c
> > ===================================================================
> > --- arm-2.6.git.orig/drivers/net/cs89x0.c 2011-02-02 18:28:01.000000000 +0000
> > +++ arm-2.6.git/drivers/net/cs89x0.c 2011-02-02 18:32:38.000000000 +0000
> > @@ -95,6 +95,9 @@
> > Dmitry Pervushin : dpervushin@ru.mvista.com
> > : PNX010X platform support
> >
> > + Domenico Andreoli : cavokz@gmail.com
> > + : QQ2440 platform support
> > +
> > */
> >
> > /* Always include 'config.h' first in case the user wants to turn on
> > @@ -117,7 +120,7 @@
> > * Set this to zero to remove all the debug statements via
> > * dead code elimination
> > */
> > -#define DEBUGGING 1
> > +#define DEBUGGING 0
>
> This is probably best split out as a separate patch.
I will
> > /*
> > Sources:
> > @@ -173,6 +176,10 @@
> > #if defined(CONFIG_MACH_IXDP2351)
> > static unsigned int netcard_portlist[] __used __initdata = {IXDP2351_VIRT_CS8900_BASE, 0};
> > static unsigned int cs8900_irq_map[] = {IRQ_IXDP2351_CS8900, 0, 0, 0};
> > +#elif defined(CONFIG_MACH_QQ2440)
> > +#include <mach/qq2440.h>
> > +static unsigned int netcard_portlist[] __used __initdata = {QQ2440_CS8900_VIRT_BASE + 0x300, 0};
> > +static unsigned int cs8900_irq_map[] = {QQ2440_CS8900_IRQ, 0, 0, 0};
> > #elif defined(CONFIG_ARCH_IXDP2X01)
> > static unsigned int netcard_portlist[] __used __initdata = {IXDP2X01_CS8900_VIRT_BASE, 0};
> > static unsigned int cs8900_irq_map[] = {IRQ_IXDP2X01_CS8900, 0, 0, 0};
> > @@ -521,6 +528,10 @@
> > #endif
> > lp->force = g_cs89x0_media__force;
> > #endif
> > +
> > +#if defined(CONFIG_MACH_QQ2440)
> > + lp->force |= FORCE_RJ45 | FORCE_FULL;
> > +#endif
> > }
> >
> > /* Grab the region so we can find another board if autoIRQ fails. */
> > @@ -608,7 +619,7 @@
> > dev->dev_addr[i*2+1] = Addr >> 8;
> > }
> >
> > - /* Load the Adapter Configuration.
> > + /* Load the Adapter Configuration.
> > Note: Barring any more specific information from some
> > other source (ie EEPROM+Schematics), we would not know
> > how to operate a 10Base2 interface on the AUI port.
> > @@ -655,7 +666,7 @@
> > if ((readreg(dev, PP_SelfST) & EEPROM_PRESENT) == 0)
> > printk(KERN_WARNING "cs89x0: No EEPROM, relying on command line....\n");
> > else if (get_eeprom_data(dev, START_EEPROM_DATA,CHKSUM_LEN,eeprom_buff) < 0) {
> > - printk(KERN_WARNING "\ncs89x0: EEPROM read failed, relying on command line.\n");
> > + printk(KERN_WARNING "cs89x0: EEPROM read failed, relying on command line.\n");
> > } else if (get_eeprom_cksum(START_EEPROM_DATA,CHKSUM_LEN,eeprom_buff) < 0) {
> > /* Check if the chip was able to read its own configuration starting
> > at 0 in the EEPROM*/
> > @@ -709,7 +720,7 @@
> > /* FIXME: we don't set the Ethernet address on the command line. Use
> > ifconfig IFACE hw ether AABBCCDDEEFF */
> >
> > - printk(KERN_INFO "cs89x0 media %s%s%s",
> > + printk(KERN_INFO "cs89x0: media %s%s%s",
> > (lp->adapter_cnf & A_CNF_10B_T)?"RJ-45,":"",
> > (lp->adapter_cnf & A_CNF_AUI)?"AUI,":"",
> > (lp->adapter_cnf & A_CNF_10B_2)?"BNC,":"");
>
> Splitting these cleanups into a cleanup patch would be handly so you can
> see the real changes easier.
generally speaking, this driver requires some care. it's not only
about formatting.
you can see the ifdefs I had to add for QQ2440. you can note also that
it's not possible to compile it for multiple platforms because there
would be multiple definitions of some symbols.
for instance, there is a design bug on the QQ2440. EEPROM is not present
but the CS8900 is not correctly wired and EEPROM is incorrectly supposed
to be present. one way to solve this is with some more ifdefs...
in practice I'm volunteering to switch it to some more modern model like
platform driver but i still need to understand if it has any sense. it
does not seem a popular driver and I doubt there will be so many new
boards requiring it that the work could be justified (but I would do
it anyway).
what do netdev people think?
> > @@ -943,7 +954,7 @@
> > static void __init reset_chip(struct net_device *dev)
> > {
> > #if !defined(CONFIG_MACH_MX31ADS)
> > -#if !defined(CONFIG_MACH_IXDP2351) && !defined(CONFIG_ARCH_IXDP2X01)
> > +#if !defined(CS89x0_NONISA_IRQ)
> > struct net_local *lp = netdev_priv(dev);
> > int ioaddr = dev->base_addr;
> > #endif
> > @@ -954,18 +965,18 @@
> > /* wait 30 ms */
> > msleep(30);
> >
> > -#if !defined(CONFIG_MACH_IXDP2351) && !defined(CONFIG_ARCH_IXDP2X01)
> > +#if !defined(CS89x0_NONISA_IRQ)
> > if (lp->chip_type != CS8900) {
> > /* Hardware problem requires PNP registers to be reconfigured after a reset */
> > writeword(ioaddr, ADD_PORT, PP_CS8920_ISAINT);
> > - outb(dev->irq, ioaddr + DATA_PORT);
> > - outb(0, ioaddr + DATA_PORT + 1);
> > + writeword(ioaddr, DATA_PORT, dev->irq);
> > + writeword(ioaddr, DATA_PORT + 1, 0);
> >
> > writeword(ioaddr, ADD_PORT, PP_CS8920_ISAMemB);
> > - outb((dev->mem_start >> 16) & 0xff, ioaddr + DATA_PORT);
> > - outb((dev->mem_start >> 8) & 0xff, ioaddr + DATA_PORT + 1);
> > + writeword(ioaddr, DATA_PORT, (dev->mem_start >> 16) & 0xff);
> > + writeword(ioaddr, DATA_PORT + 1, (dev->mem_start >> 8) & 0xff);
> > }
> > -#endif /* IXDP2x01 */
> > +#endif
> >
> > /* Wait until the chip is reset */
> > reset_start_time = jiffies;
> > Index: arm-2.6.git/drivers/net/Kconfig
> > ===================================================================
> > --- arm-2.6.git.orig/drivers/net/Kconfig 2011-02-02 18:28:01.000000000 +0000
> > +++ arm-2.6.git/drivers/net/Kconfig 2011-02-02 18:32:38.000000000 +0000
> > @@ -1498,7 +1498,7 @@
> > config CS89x0
> > tristate "CS89x0 support"
> > depends on NET_ETHERNET && (ISA || EISA || MACH_IXDP2351 \
> > - || ARCH_IXDP2X01 || MACH_MX31ADS)
> > + || ARCH_IXDP2X01 || MACH_MX31ADS || MACH_QQ2440)
> > ---help---
> > Support for CS89x0 chipset based Ethernet cards. If you have a
> > network (Ethernet) card of this type, say Y and read the
> > @@ -1512,7 +1512,7 @@
> > config CS89x0_NONISA_IRQ
> > def_bool y
> > depends on CS89x0 != n
> > - depends on MACH_IXDP2351 || ARCH_IXDP2X01 || MACH_MX31ADS
> > + depends on MACH_IXDP2351 || ARCH_IXDP2X01 || MACH_MX31ADS || MACH_QQ2440
> >
> > config TC35815
> > tristate "TOSHIBA TC35815 Ethernet support"
> > Index: arm-2.6.git/arch/arm/mach-s3c2440/mach-qq2440.c
> > ===================================================================
> > --- arm-2.6.git.orig/arch/arm/mach-s3c2440/mach-qq2440.c 2011-02-02 18:29:48.000000000 +0000
> > +++ arm-2.6.git/arch/arm/mach-s3c2440/mach-qq2440.c 2011-02-02 18:32:38.000000000 +0000
> > @@ -36,6 +36,7 @@
> > #include <mach/leds-gpio.h>
> > #include <mach/regs-mem.h>
> > #include <mach/irqs.h>
> > +#include <mach/qq2440.h>
> > #include <plat/nand.h>
> > #include <plat/iic.h>
> > #include <plat/mci.h>
> > @@ -54,7 +55,12 @@
> > #include <sound/s3c24xx_uda134x.h>
> >
> > static struct map_desc qq2440_iodesc[] __initdata = {
> > - /* nothing to declare, move along */
> > + {
> > + .virtual = QQ2440_CS8900_VIRT_BASE,
> > + .pfn = __phys_to_pfn(QQ2440_CS8900_PA),
> > + .length = QQ2440_CS8900_SZ,
> > + .type = MT_DEVICE
> > + }
> > };
> >
> > #define UCON S3C2410_UCON_DEFAULT
> > @@ -325,10 +331,18 @@
> > s3c24xx_init_uarts(qq2440_uartcfgs, ARRAY_SIZE(qq2440_uartcfgs));
> > }
> >
> > +#define QQ2440_CS8900_BANKCON (S3C2410_BANKCON_Tacp6 | S3C2410_BANKCON_Tcah4 | S3C2410_BANKCON_Tcoh1 | \
> > + S3C2410_BANKCON_Tacc14 | S3C2410_BANKCON_Tcos4)
> > +
> > static void __init qq2440_init(void)
> > {
> > int i;
> >
> > + /* Ethernet */
> > + __raw_writel(__raw_readl(S3C2410_BWSCON) | S3C2410_BWSCON_WS3 | S3C2410_BWSCON_ST3, S3C2410_BWSCON);
> > + __raw_writel(QQ2440_CS8900_BANKCON, S3C2410_BANKCON3);
> > + set_irq_type(QQ2440_CS8900_IRQ, IRQ_TYPE_EDGE_RISING);
> > +
> > /* Make sure the D+ pullup pin is output */
> > WARN_ON(gpio_request(S3C2410_GPG(12), "udc pup"));
> > gpio_direction_output(S3C2410_GPG(12), 0);
>
> This arch stuff should really be a separate patch too.
thank you for the review
cheers,
Domenico
^ permalink raw reply
* Re: epoll broken [was: mmotm 2011-01-25-15-47 uploaded]
From: Jiri Slaby @ 2011-02-03 9:03 UTC (permalink / raw)
To: Eric Dumazet; +Cc: linux-kernel, akpm, mm-commits, ML netdev, davidel
In-Reply-To: <1296719601.3438.0.camel@edumazet-laptop>
On 02/03/2011 08:53 AM, Eric Dumazet wrote:
>> {0, {u32=0, u64=0}} .............. {0, {u32=0, u64=0}}, ?}
>> 0x7fb816996660, 8192, 0) = 379151968
>> 17836 --- SIGSEGV (Segmentation fault) @ 0 (0) ---
>> 17836 +++ killed by SIGSEGV +++
>>
>> The parameter, the same as the retval, seems to be bogus.
>>
>> Is it known (fixed in newer kernels)?
>>
>> thanks,
>
> Yes, its known, and a fix is there : https://lkml.org/lkml/2011/1/26/121
Thanks, it works indeed.
--
js
^ permalink raw reply
* Re: non-symmetric Unix dgram sockets and poll
From: Rémi Denis-Courmont @ 2011-02-03 8:45 UTC (permalink / raw)
To: netdev
In-Reply-To: <20110202173640.2af412f0@chocolatine.cbg.collabora.co.uk>
Le mercredi 2 février 2011 19:36:40 Alban Crequy, vous avez écrit :
> Hi,
>
> I have 3 Unix dgram sockets (sockA, sockB, sockC):
> - sockA is connected to sockB.
> - sockB is connected to sockC.
> - sockC is not connected.
>
> SockA cannot send any message to sockB because
> net/unix/af_unix.c::unix_may_send() prevents it. Is there any reason for
> that restriction?
Yes, absolutely. When you connect() a socket, you expect to only *receive*
packets from the specified peer.
--
Rémi Denis-Courmont
http://www.remlab.net/
http://fi.linkedin.com/in/remidenis
^ permalink raw reply
* Re: epoll broken [was: mmotm 2011-01-25-15-47 uploaded]
From: Eric Dumazet @ 2011-02-03 7:53 UTC (permalink / raw)
To: Jiri Slaby; +Cc: linux-kernel, akpm, mm-commits, ML netdev, davidel
In-Reply-To: <4D4A5BBF.20806@gmail.com>
Le jeudi 03 février 2011 à 08:39 +0100, Jiri Slaby a écrit :
> On 01/26/2011 12:48 AM, akpm@linux-foundation.org wrote:
> > The mm-of-the-moment snapshot 2011-01-25-15-47 has been uploaded to
>
> Hi, the network daemons are broken here. cupsd and httpd children
> segfault too often without servicing requests. It's a regression against
> mmotm 2011-01-06-15-41.
>
> It's epoll after it dies:
> 17836 epoll_create(8192) = 3
> ...
> 17836 accept(7, {sa_family=AF_FILE, NULL}, [2]) = 11
> 17836 getsockname(11, {sa_family=AF_FILE,
> path="/var/run/cups/cups.sock"}, [26]) = 0
> 17836 setsockopt(11, SOL_TCP, TCP_NODELAY, [1], 4) = -1 EOPNOTSUPP
> (Operation not supported)
> 17836 fcntl(11, F_GETFD) = 0
> 17836 fcntl(11, F_SETFD, FD_CLOEXEC) = 0
> 17836 epoll_ctl(3, EPOLL_CTL_ADD, 11, {EPOLLIN, {u32=379708832,
> u64=140428630418848}}) = 0
> 17836 epoll_wait(3, {{EPOLLIN, {u32=379708832, u64=140428630418848}}},
> 8192, 1000) = 1
> 17836 recvfrom(11, "P", 1, MSG_PEEK, NULL, NULL) = 1
> 17836 poll([{fd=11, events=POLLIN}], 1, 10000) = 1 ([{fd=11,
> revents=POLLIN}])
> 17836 recvfrom(11, "POST / HTTP/1.1\r\nContent-Length:"..., 2048, 0,
> NULL, NULL) = 771
> 17836 sendto(11, "HTTP/1.1 100 Continue\r\n\r\n", 25, 0, NULL, 0) = 25
> 17836 epoll_wait(3, {{EPOLLIN, {u32=379708832, u64=140428630418848}},
> {0, {u32=0, u64=0}} .............. {0, {u32=0, u64=0}}, ?}
> 0x7fb816996660, 8192, 0) = 379151968
> 17836 --- SIGSEGV (Segmentation fault) @ 0 (0) ---
> 17836 +++ killed by SIGSEGV +++
>
> The parameter, the same as the retval, seems to be bogus.
>
> Is it known (fixed in newer kernels)?
>
> thanks,
Yes, its known, and a fix is there : https://lkml.org/lkml/2011/1/26/121
^ permalink raw reply
* epoll broken [was: mmotm 2011-01-25-15-47 uploaded]
From: Jiri Slaby @ 2011-02-03 7:39 UTC (permalink / raw)
To: linux-kernel; +Cc: akpm, mm-commits, ML netdev, davidel
In-Reply-To: <201101260021.p0Q0LxsS016458@imap1.linux-foundation.org>
On 01/26/2011 12:48 AM, akpm@linux-foundation.org wrote:
> The mm-of-the-moment snapshot 2011-01-25-15-47 has been uploaded to
Hi, the network daemons are broken here. cupsd and httpd children
segfault too often without servicing requests. It's a regression against
mmotm 2011-01-06-15-41.
It's epoll after it dies:
17836 epoll_create(8192) = 3
...
17836 accept(7, {sa_family=AF_FILE, NULL}, [2]) = 11
17836 getsockname(11, {sa_family=AF_FILE,
path="/var/run/cups/cups.sock"}, [26]) = 0
17836 setsockopt(11, SOL_TCP, TCP_NODELAY, [1], 4) = -1 EOPNOTSUPP
(Operation not supported)
17836 fcntl(11, F_GETFD) = 0
17836 fcntl(11, F_SETFD, FD_CLOEXEC) = 0
17836 epoll_ctl(3, EPOLL_CTL_ADD, 11, {EPOLLIN, {u32=379708832,
u64=140428630418848}}) = 0
17836 epoll_wait(3, {{EPOLLIN, {u32=379708832, u64=140428630418848}}},
8192, 1000) = 1
17836 recvfrom(11, "P", 1, MSG_PEEK, NULL, NULL) = 1
17836 poll([{fd=11, events=POLLIN}], 1, 10000) = 1 ([{fd=11,
revents=POLLIN}])
17836 recvfrom(11, "POST / HTTP/1.1\r\nContent-Length:"..., 2048, 0,
NULL, NULL) = 771
17836 sendto(11, "HTTP/1.1 100 Continue\r\n\r\n", 25, 0, NULL, 0) = 25
17836 epoll_wait(3, {{EPOLLIN, {u32=379708832, u64=140428630418848}},
{0, {u32=0, u64=0}} .............. {0, {u32=0, u64=0}}, ?}
0x7fb816996660, 8192, 0) = 379151968
17836 --- SIGSEGV (Segmentation fault) @ 0 (0) ---
17836 +++ killed by SIGSEGV +++
The parameter, the same as the retval, seems to be bogus.
Is it known (fixed in newer kernels)?
thanks,
--
js
^ permalink raw reply
* [PATCH] sch_choke: Need linux/vmalloc.h
From: David Miller @ 2011-02-03 7:08 UTC (permalink / raw)
To: netdev; +Cc: shemminger, eric.dumazet
Signed-off-by: David S. Miller <davem@davemloft.net>
---
This fixes the build on sparc64.
net/sched/sch_choke.c | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index a1cec18..ee1e209 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -15,6 +15,7 @@
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/reciprocal_div.h>
+#include <linux/vmalloc.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/red.h>
--
1.7.4
^ permalink raw reply related
* Re: linux-next: Tree for February 1 (ip_vs)
From: Hans Schillstrom @ 2011-02-03 7:04 UTC (permalink / raw)
To: Simon Horman
Cc: Randy Dunlap, Stephen Rothwell, netdev,
linux-next@vger.kernel.org, LKML
In-Reply-To: <20110202223159.GB2248@verge.net.au>
On Wed, 2011-02-02 at 23:31 +0100, Simon Horman wrote:
> On Tue, Feb 01, 2011 at 09:16:20AM -0800, Randy Dunlap wrote:
> > On Tue, 1 Feb 2011 15:34:03 +1100 Stephen Rothwell wrote:
> >
> > > Hi all,
> > >
> > > Changes since 20110131:
> >
> >
> > When CONFIG_IP_VS_PROTO_TCP is not set:
> >
> > net/netfilter/ipvs/ip_vs_proto_sctp.c:1104: error: 'struct netns_ipvs' has no member named 'tcp_app_lock'
> > net/netfilter/ipvs/ip_vs_proto_sctp.c:1104: error: 'struct netns_ipvs' has no member named 'tcp_app_lock'
>
> Thanks, the following patch should resolve this problem.
> Hans, could you verify this change?
>
Ooops,
> The change is available at
> git://git.kernel.org/pub/scm/linux/kernel/git/horms/lvs-test-2.6.git master
>
>
> From: Simon Horman <horms@verge.net.au>
>
> IPVS: Use correct lock in SCTP module
>
> Use sctp_app_lock instead of tcp_app_lock in the SCTP protocol module.
>
> This appears to be a typo introduced by the netns changes.
>
> Cc: Hans Schillstrom <hans@schillstrom.com>
> Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
> ---
> net/netfilter/ipvs/ip_vs_proto_sctp.c | 2 +-
> 1 files changed, 1 insertions(+), 1 deletions(-)
>
> diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
> index fb2d04a..b027ccc 100644
> --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
> +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
> @@ -1101,7 +1101,7 @@ static void __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
> struct netns_ipvs *ipvs = net_ipvs(net);
>
> ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE);
> - spin_lock_init(&ipvs->tcp_app_lock);
> + spin_lock_init(&ipvs->sctp_app_lock);
> pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts,
> sizeof(sctp_timeouts));
> }
Thanks
Hans
^ permalink raw reply
* Re: Network performance with small packets
From: Michael S. Tsirkin @ 2011-02-03 6:16 UTC (permalink / raw)
To: Shirley Ma
Cc: Krishna Kumar2, David Miller, kvm, mashirle, netdev, netdev-owner,
Sridhar Samudrala, Steve Dobbelstein
In-Reply-To: <1296713354.25430.143.camel@localhost.localdomain>
On Wed, Feb 02, 2011 at 10:09:14PM -0800, Shirley Ma wrote:
> On Thu, 2011-02-03 at 07:59 +0200, Michael S. Tsirkin wrote:
> > > Let's look at the sequence here:
> > >
> > > guest start_xmit()
> > > xmit_skb()
> > > if ring is full,
> > > enable_cb()
> > >
> > > guest skb_xmit_done()
> > > disable_cb,
> > > printk free_old_xmit_skbs <-- it was between more than 1/2
> > to
> > > full ring size
> > > printk vq->num_free
> > >
> > > vhost handle_tx()
> > > if (guest interrupt is enabled)
> > > signal guest to free xmit buffers
> > >
> > > So between guest queue full/stopped queue/enable call back to guest
> > > receives the callback from host to free_old_xmit_skbs, there were
> > about
> > > 1/2 to full ring size descriptors available. I thought there were
> > only a
> > > few. (I disabled your vhost patch for this test.)
> >
> >
> > The expected number is vq->num - max skb frags - 2.
>
> It was various (up to the ring size 256). This is using indirection
> buffers, it returned how many freed descriptors, not number of buffers.
>
> Why do you think it is vq->num - max skb frags - 2 here?
>
> Shirley
well queue is stopped which happens when
if (capacity < 2+MAX_SKB_FRAGS) {
netif_stop_queue(dev);
if (unlikely(!virtqueue_enable_cb(vi->svq))) {
/* More just got used, free them then recheck.
* */
capacity += free_old_xmit_skbs(vi);
if (capacity >= 2+MAX_SKB_FRAGS) {
netif_start_queue(dev);
virtqueue_disable_cb(vi->svq);
}
}
}
This should be the most common case.
I guess the case with += free_old_xmit_skbs is what can get us more.
But it should be rare. Can you count how common it is?
--
MST
^ permalink raw reply
* Re: Network performance with small packets
From: Michael S. Tsirkin @ 2011-02-03 6:13 UTC (permalink / raw)
To: Shirley Ma
Cc: Krishna Kumar2, David Miller, kvm, mashirle, netdev, netdev-owner,
Sridhar Samudrala, Steve Dobbelstein
In-Reply-To: <1296709556.25430.140.camel@localhost.localdomain>
On Wed, Feb 02, 2011 at 09:05:56PM -0800, Shirley Ma wrote:
> On Wed, 2011-02-02 at 23:20 +0200, Michael S. Tsirkin wrote:
> > > I think I need to define the test matrix to collect data for TX xmit
> > > from guest to host here for different tests.
> > >
> > > Data to be collected:
> > > ---------------------
> > > 1. kvm_stat for VM, I/O exits
> > > 2. cpu utilization for both guest and host
> > > 3. cat /proc/interrupts on guest
> > > 4. packets rate from vhost handle_tx per loop
> > > 5. guest netif queue stop rate
> > > 6. how many packets are waiting for free between vhost signaling and
> > > guest callback
> > > 7. performance results
> > >
> > > Test
> > > ----
> > > 1. TCP_STREAM single stream test for 1K to 4K message size
> > > 2. TCP_RR (64 instance test): 128 - 1K request/response size
> > >
> > > Different hacks
> > > ---------------
> > > 1. Base line data ( with the patch to fix capacity check first,
> > > free_old_xmit_skbs returns number of skbs)
> > >
> > > 2. Drop packet data (will put some debugging in generic networking
> > code)
>
> Since I found that the netif queue stop/wake up is so expensive, I
> created a dropping packets patch on guest side so I don't need to debug
> generic networking code.
>
> guest start_xmit()
> capacity = free_old_xmit_skb() + virtqueue_get_num_freed()
> if (capacity == 0)
> drop this packet;
> return;
>
> In the patch, both guest TX interrupts and callback have been omitted.
> Host vhost_signal in handle_tx can totally be removed as well. (A new
> virtio_ring API is needed for exporting total of num_free descriptors
> here -- virtioqueue_get_num_freed)
>
> Initial TCP_STREAM performance results I got for guest to local host
> 4.2Gb/s for 1K message size, (vs. 2.5Gb/s)
> 6.2Gb/s for 2K message size, and (vs. 3.8Gb/s)
> 9.8Gb/s for 4K message size. (vs.5.xGb/s)
What is the average packet size, # bytes per ack, and the # of interrupts
per packet? It could be that just slowing down trahsmission
makes GSO work better.
> Since large message size (64K) doesn't hit (capacity == 0) case, so the
> performance only has a little better. (from 13.xGb/s to 14.x Gb/s)
>
> kvm_stat output shows significant exits reduction for both VM and I/O,
> no guest TX interrupts.
>
> With dropping packets, TCP retrans has been increased here, so I can see
> performance numbers are various.
>
> This might be not a good solution, but it gave us some ideas on
> expensive netif queue stop/wake up between guest and host notification.
>
> I couldn't find a better solution on how to reduce netif queue stop/wake
> up rate for small message size. But I think once we can address this,
> the guest TX performance will burst for small message size.
>
> I also compared this with return TX_BUSY approach when (capacity == 0),
> it is not as good as dropping packets.
>
> > > 3. Delay guest netif queue wake up until certain descriptors (1/2
> > ring
> > > size, 1/4 ring size...) are available once the queue has stopped.
> > >
> > > 4. Accumulate more packets per vhost signal in handle_tx?
> > >
> > > 5. 3 & 4 combinations
> > >
> > > 6. Accumulate more packets per guest kick() (TCP_RR) by adding a
> > timer?
> > >
> > > 7. Accumulate more packets per vhost handle_tx() by adding some
> > delay?
> > >
> > > > Haven't noticed that part, how does your patch make it
> > > handle more packets?
> > >
> > > Added a delay in handle_tx().
> > >
> > > What else?
> > >
> > > It would take sometimes to do this.
> > >
> > > Shirley
> >
> >
> > Need to think about this.
> >
> >
^ permalink raw reply
* Re: Network performance with small packets
From: Shirley Ma @ 2011-02-03 6:09 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Krishna Kumar2, David Miller, kvm, mashirle, netdev, netdev-owner,
Sridhar Samudrala, Steve Dobbelstein
In-Reply-To: <20110203055926.GB22230@redhat.com>
On Thu, 2011-02-03 at 07:59 +0200, Michael S. Tsirkin wrote:
> > Let's look at the sequence here:
> >
> > guest start_xmit()
> > xmit_skb()
> > if ring is full,
> > enable_cb()
> >
> > guest skb_xmit_done()
> > disable_cb,
> > printk free_old_xmit_skbs <-- it was between more than 1/2
> to
> > full ring size
> > printk vq->num_free
> >
> > vhost handle_tx()
> > if (guest interrupt is enabled)
> > signal guest to free xmit buffers
> >
> > So between guest queue full/stopped queue/enable call back to guest
> > receives the callback from host to free_old_xmit_skbs, there were
> about
> > 1/2 to full ring size descriptors available. I thought there were
> only a
> > few. (I disabled your vhost patch for this test.)
>
>
> The expected number is vq->num - max skb frags - 2.
It was various (up to the ring size 256). This is using indirection
buffers, it returned how many freed descriptors, not number of buffers.
Why do you think it is vq->num - max skb frags - 2 here?
Shirley
^ permalink raw reply
* Re: Network performance with small packets
From: Michael S. Tsirkin @ 2011-02-03 5:59 UTC (permalink / raw)
To: Shirley Ma
Cc: Krishna Kumar2, David Miller, kvm, mashirle, netdev, netdev-owner,
Sridhar Samudrala, Steve Dobbelstein
In-Reply-To: <1296682893.25430.112.camel@localhost.localdomain>
On Wed, Feb 02, 2011 at 01:41:33PM -0800, Shirley Ma wrote:
> On Wed, 2011-02-02 at 23:20 +0200, Michael S. Tsirkin wrote:
> > > On Wed, 2011-02-02 at 22:17 +0200, Michael S. Tsirkin wrote:
> > > > Well, this is also the only case where the queue is stopped, no?
> > > Yes. I got some debugging data, I saw that sometimes there were so
> > many
> > > packets were waiting for free in guest between vhost_signal & guest
> > xmit
> > > callback.
> >
> > What does this mean?
>
> Let's look at the sequence here:
>
> guest start_xmit()
> xmit_skb()
> if ring is full,
> enable_cb()
>
> guest skb_xmit_done()
> disable_cb,
> printk free_old_xmit_skbs <-- it was between more than 1/2 to
> full ring size
> printk vq->num_free
>
> vhost handle_tx()
> if (guest interrupt is enabled)
> signal guest to free xmit buffers
>
> So between guest queue full/stopped queue/enable call back to guest
> receives the callback from host to free_old_xmit_skbs, there were about
> 1/2 to full ring size descriptors available. I thought there were only a
> few. (I disabled your vhost patch for this test.)
The expected number is vq->num - max skb frags - 2.
>
> > > Looks like the time spent too long from vhost_signal to guest
> > > xmit callback?
> >
> >
> >
> > > > > I tried to accumulate multiple guest to host notifications for
> > TX
> > > > xmits,
> > > > > it did help multiple streams TCP_RR results;
> > > > I don't see a point to delay used idx update, do you?
> > >
> > > It might cause per vhost handle_tx processed more packets.
> >
> > I don't understand. It's a couple of writes - what is the issue?
>
> Oh, handle_tx could process more packets per loop for multiple streams
> TCP_RR case. I need to print out the data rate per loop to confirm this.
>
> Shirley
^ permalink raw reply
* Re: linux-next: build failure after merge of the final tree (net tree related)
From: Stephen Rothwell @ 2011-02-03 5:14 UTC (permalink / raw)
To: David Miller; +Cc: netdev, linux-next, linux-kernel
In-Reply-To: <20110202.204606.226770967.davem@davemloft.net>
[-- Attachment #1: Type: text/plain, Size: 456 bytes --]
Hi Dave,
On Wed, 02 Feb 2011 20:46:06 -0800 (PST) David Miller <davem@davemloft.net> wrote:
>
> This problem has existed forever, you just have only been testing
> the build with fib_hash enabled instead of fib_trie.
All I have been doing (in this case) is a sparc32 defconfig ...
> I'll fix it up, thanks for the report.
no worries
--
Cheers,
Stephen Rothwell sfr@canb.auug.org.au
http://www.canb.auug.org.au/~sfr/
[-- Attachment #2: Type: application/pgp-signature, Size: 490 bytes --]
^ permalink raw reply
* Re: Network performance with small packets
From: Shirley Ma @ 2011-02-03 5:05 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Krishna Kumar2, David Miller, kvm, mashirle, netdev, netdev-owner,
Sridhar Samudrala, Steve Dobbelstein
In-Reply-To: <20110202212047.GD15150@redhat.com>
On Wed, 2011-02-02 at 23:20 +0200, Michael S. Tsirkin wrote:
> > I think I need to define the test matrix to collect data for TX xmit
> > from guest to host here for different tests.
> >
> > Data to be collected:
> > ---------------------
> > 1. kvm_stat for VM, I/O exits
> > 2. cpu utilization for both guest and host
> > 3. cat /proc/interrupts on guest
> > 4. packets rate from vhost handle_tx per loop
> > 5. guest netif queue stop rate
> > 6. how many packets are waiting for free between vhost signaling and
> > guest callback
> > 7. performance results
> >
> > Test
> > ----
> > 1. TCP_STREAM single stream test for 1K to 4K message size
> > 2. TCP_RR (64 instance test): 128 - 1K request/response size
> >
> > Different hacks
> > ---------------
> > 1. Base line data ( with the patch to fix capacity check first,
> > free_old_xmit_skbs returns number of skbs)
> >
> > 2. Drop packet data (will put some debugging in generic networking
> code)
Since I found that the netif queue stop/wake up is so expensive, I
created a dropping packets patch on guest side so I don't need to debug
generic networking code.
guest start_xmit()
capacity = free_old_xmit_skb() + virtqueue_get_num_freed()
if (capacity == 0)
drop this packet;
return;
In the patch, both guest TX interrupts and callback have been omitted.
Host vhost_signal in handle_tx can totally be removed as well. (A new
virtio_ring API is needed for exporting total of num_free descriptors
here -- virtioqueue_get_num_freed)
Initial TCP_STREAM performance results I got for guest to local host
4.2Gb/s for 1K message size, (vs. 2.5Gb/s)
6.2Gb/s for 2K message size, and (vs. 3.8Gb/s)
9.8Gb/s for 4K message size. (vs.5.xGb/s)
Since large message size (64K) doesn't hit (capacity == 0) case, so the
performance only has a little better. (from 13.xGb/s to 14.x Gb/s)
kvm_stat output shows significant exits reduction for both VM and I/O,
no guest TX interrupts.
With dropping packets, TCP retrans has been increased here, so I can see
performance numbers are various.
This might be not a good solution, but it gave us some ideas on
expensive netif queue stop/wake up between guest and host notification.
I couldn't find a better solution on how to reduce netif queue stop/wake
up rate for small message size. But I think once we can address this,
the guest TX performance will burst for small message size.
I also compared this with return TX_BUSY approach when (capacity == 0),
it is not as good as dropping packets.
> > 3. Delay guest netif queue wake up until certain descriptors (1/2
> ring
> > size, 1/4 ring size...) are available once the queue has stopped.
> >
> > 4. Accumulate more packets per vhost signal in handle_tx?
> >
> > 5. 3 & 4 combinations
> >
> > 6. Accumulate more packets per guest kick() (TCP_RR) by adding a
> timer?
> >
> > 7. Accumulate more packets per vhost handle_tx() by adding some
> delay?
> >
> > > Haven't noticed that part, how does your patch make it
> > handle more packets?
> >
> > Added a delay in handle_tx().
> >
> > What else?
> >
> > It would take sometimes to do this.
> >
> > Shirley
>
>
> Need to think about this.
>
>
^ permalink raw reply
* Re: [PATCH net-next] CHOKe flow scheduler (0.11)
From: David Miller @ 2011-02-03 4:53 UTC (permalink / raw)
To: eric.dumazet; +Cc: shemminger, kaber, netdev
In-Reply-To: <1296698352.4434.4.camel@edumazet-laptop>
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 03 Feb 2011 02:59:12 +0100
> Le mercredi 02 février 2011 à 17:21 -0800, Stephen Hemminger a écrit :
>> Subject: sched: CHOKe flow scheduler
>>
>> CHOKe ("CHOose and Kill" or "CHOose and Keep") is an alternative
>> packet scheduler based on the Random Exponential Drop (RED) algorithm.
>>
>> The core idea is:
>> For every packet arrival:
>> Calculate Qave
>> if (Qave < minth)
>> Queue the new packet
>> else
>> Select randomly a packet from the queue
>> if (both packets from same flow)
>> then Drop both the packets
>> else if (Qave > maxth)
>> Drop packet
>> else
>> Admit packet with proability p (same as RED)
>>
>> See also:
>> Rong Pan, Balaji Prabhakar, Konstantinos Psounis, "CHOKe: a stateless active
>> queue management scheme for approximating fair bandwidth allocation",
>> Proceeding of INFOCOM'2000, March 2000.
>>
>> Help from:
>> Eric Dumazet <eric.dumazet@gmail.com>
>> Patrick McHardy <kaber@trash.net>
>>
>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>>
>> ---
>> 0.11 - incorporates Eric's change to use rxhash
>>
>>
>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Also applied, thanks guys!
^ permalink raw reply
* Re: [PATCH] sfq: deadlock in error path
From: David Miller @ 2011-02-03 4:51 UTC (permalink / raw)
To: eric.dumazet; +Cc: shemminger, netdev
In-Reply-To: <1296697862.4434.1.camel@edumazet-laptop>
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 03 Feb 2011 02:51:02 +0100
> Le mercredi 02 février 2011 à 17:19 -0800, Stephen Hemminger a écrit :
>> The change to allow divisor to be a parameter (in 2.6.38-rc1)
>> commit 817fb15dfd988d8dda916ee04fa506f0c466b9d6
>> introduced a possible deadlock caught by sparse.
>>
>> The scheduler tree lock was left locked in the case of an incorrect
>> divisor value. Simplest fix is to move test outside of lock
>> which also solves problem of partial update.
>>
>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>
> Oops, thanks Stephen !
>
> Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Applied, thanks.
^ permalink raw reply
* Re: linux-next: build failure after merge of the final tree (net tree related)
From: David Miller @ 2011-02-03 4:46 UTC (permalink / raw)
To: sfr; +Cc: netdev, linux-next, linux-kernel
In-Reply-To: <20110203151309.d4a6c344.sfr@canb.auug.org.au>
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Thu, 3 Feb 2011 15:13:09 +1100
> Hi all,
>
> After merging the final tree, today's linux-next build (sparc32 defconfig)
> failed like this:
>
> net/ipv4/fib_trie.c:98: error: redefinition of 'struct node'
>
> Probably exposed by commit 3630b7c050d9c3564f143d595339fc06b888d6f3
> ("ipv4: Remove fib_hash").
>
> Naming a struct "node" (in the face of include/linux/node.h) is a bit
> hopeful. :-)
>
> I have left this broken for now ...
This problem has existed forever, you just have only been testing
the build with fib_hash enabled instead of fib_trie.
I'm really surprised this has never been hit before. :-)
I'll fix it up, thanks for the report.
^ permalink raw reply
* [PATCH 14/19] janz: mfd_cell is now implicitly available to drivers
From: Andres Salomon @ 2011-02-03 4:17 UTC (permalink / raw)
To: Samuel Ortiz
Cc: linux-kernel, Mark Brown, Wolfgang Grandegger, socketcan-core,
netdev
In-Reply-To: <20110202195417.228e2656@queued.net>
No need to explicitly set the cell's platform_data/data_size.
In this case, move the various platform_data pointers
to driver_data. All of the clients which make use of it
are also changed.
Signed-off-by: Andres Salomon <dilinger@queued.net>
---
drivers/gpio/janz-ttl.c | 2 +-
drivers/mfd/janz-cmodio.c | 3 +--
drivers/net/can/janz-ican3.c | 2 +-
3 files changed, 3 insertions(+), 4 deletions(-)
diff --git a/drivers/gpio/janz-ttl.c b/drivers/gpio/janz-ttl.c
index 813ac07..98a3f9d 100644
--- a/drivers/gpio/janz-ttl.c
+++ b/drivers/gpio/janz-ttl.c
@@ -149,7 +149,7 @@ static int __devinit ttl_probe(struct platform_device *pdev)
struct resource *res;
int ret;
- pdata = pdev->dev.platform_data;
+ pdata = platform_get_drvdata(pdev);
if (!pdata) {
dev_err(dev, "no platform data\n");
ret = -ENXIO;
diff --git a/drivers/mfd/janz-cmodio.c b/drivers/mfd/janz-cmodio.c
index 36a166b..77e3a1f 100644
--- a/drivers/mfd/janz-cmodio.c
+++ b/drivers/mfd/janz-cmodio.c
@@ -86,8 +86,7 @@ static int __devinit cmodio_setup_subdevice(struct cmodio_device *priv,
/* Add platform data */
pdata->modno = modno;
- cell->platform_data = pdata;
- cell->data_size = sizeof(*pdata);
+ cell->driver_data = pdata;
/* MODULbus registers -- PCI BAR3 is big-endian MODULbus access */
res->flags = IORESOURCE_MEM;
diff --git a/drivers/net/can/janz-ican3.c b/drivers/net/can/janz-ican3.c
index b9a6d7a..7d282c3 100644
--- a/drivers/net/can/janz-ican3.c
+++ b/drivers/net/can/janz-ican3.c
@@ -1643,7 +1643,7 @@ static int __devinit ican3_probe(struct platform_device *pdev)
struct device *dev;
int ret;
- pdata = pdev->dev.platform_data;
+ pdata = platform_get_drvdata(pdev);
if (!pdata)
return -ENXIO;
--
1.7.2.3
^ permalink raw reply related
* linux-next: build failure after merge of the final tree (net tree related)
From: Stephen Rothwell @ 2011-02-03 4:13 UTC (permalink / raw)
To: David S. Miller, netdev; +Cc: linux-next, linux-kernel
[-- Attachment #1: Type: text/plain, Size: 503 bytes --]
Hi all,
After merging the final tree, today's linux-next build (sparc32 defconfig)
failed like this:
net/ipv4/fib_trie.c:98: error: redefinition of 'struct node'
Probably exposed by commit 3630b7c050d9c3564f143d595339fc06b888d6f3
("ipv4: Remove fib_hash").
Naming a struct "node" (in the face of include/linux/node.h) is a bit
hopeful. :-)
I have left this broken for now ...
--
Cheers,
Stephen Rothwell sfr@canb.auug.org.au
http://www.canb.auug.org.au/~sfr/
[-- Attachment #2: Type: application/pgp-signature, Size: 490 bytes --]
^ permalink raw reply
* [PATCH 07/19] timberdale: mfd_cell is now implicitly available to drivers
From: Andres Salomon @ 2011-02-03 4:08 UTC (permalink / raw)
To: Samuel Ortiz
Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, Mark Brown,
khali-PUYAD+kWke1g9hUCZPvPmw, ben-linux-elnMNo+KYs3YtjvyW6yDsg,
Peter Korsgaard, Mauro Carvalho Chehab, David Brownell,
Grant Likely, linux-i2c-u79uwXL29TY76Z2rM5mHXA,
linux-media-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
spi-devel-general-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
Mocean Laboratories
In-Reply-To: <20110202195417.228e2656-pFFUokh25LWsTnJN9+BGXg@public.gmane.org>
No need to explicitly set the cell's platform_data/data_size.
In this case, move the various platform_data pointers
to driver_data. All of the clients which make use of it
are also changed.
Signed-off-by: Andres Salomon <dilinger-pFFUokh25LWsTnJN9+BGXg@public.gmane.org>
---
drivers/dma/timb_dma.c | 2 +-
drivers/gpio/timbgpio.c | 5 +-
drivers/i2c/busses/i2c-ocores.c | 2 +-
drivers/i2c/busses/i2c-xiic.c | 2 +-
drivers/media/radio/radio-timb.c | 2 +-
drivers/media/video/timblogiw.c | 2 +-
drivers/mfd/timberdale.c | 81 +++++++++++++-------------------------
drivers/net/ks8842.c | 2 +-
drivers/spi/xilinx_spi.c | 2 +-
9 files changed, 36 insertions(+), 64 deletions(-)
diff --git a/drivers/dma/timb_dma.c b/drivers/dma/timb_dma.c
index 3b88a4e..aa06ca4 100644
--- a/drivers/dma/timb_dma.c
+++ b/drivers/dma/timb_dma.c
@@ -684,7 +684,7 @@ static irqreturn_t td_irq(int irq, void *devid)
static int __devinit td_probe(struct platform_device *pdev)
{
- struct timb_dma_platform_data *pdata = pdev->dev.platform_data;
+ struct timb_dma_platform_data *pdata = platform_get_drvdata(pdev);
struct timb_dma *td;
struct resource *iomem;
int irq;
diff --git a/drivers/gpio/timbgpio.c b/drivers/gpio/timbgpio.c
index 58c8f30..e404487 100644
--- a/drivers/gpio/timbgpio.c
+++ b/drivers/gpio/timbgpio.c
@@ -228,7 +228,7 @@ static int __devinit timbgpio_probe(struct platform_device *pdev)
struct gpio_chip *gc;
struct timbgpio *tgpio;
struct resource *iomem;
- struct timbgpio_platform_data *pdata = pdev->dev.platform_data;
+ struct timbgpio_platform_data *pdata = platform_get_drvdata(pdev);
int irq = platform_get_irq(pdev, 0);
if (!pdata || pdata->nr_pins > 32) {
@@ -319,14 +319,13 @@ err_mem:
static int __devexit timbgpio_remove(struct platform_device *pdev)
{
int err;
- struct timbgpio_platform_data *pdata = pdev->dev.platform_data;
struct timbgpio *tgpio = platform_get_drvdata(pdev);
struct resource *iomem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
int irq = platform_get_irq(pdev, 0);
if (irq >= 0 && tgpio->irq_base > 0) {
int i;
- for (i = 0; i < pdata->nr_pins; i++) {
+ for (i = 0; i < tgpio->gpio.ngpio; i++) {
set_irq_chip(tgpio->irq_base + i, NULL);
set_irq_chip_data(tgpio->irq_base + i, NULL);
}
diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c
index ef3bcb1..dc203ec 100644
--- a/drivers/i2c/busses/i2c-ocores.c
+++ b/drivers/i2c/busses/i2c-ocores.c
@@ -305,7 +305,7 @@ static int __devinit ocores_i2c_probe(struct platform_device *pdev)
return -EIO;
}
- pdata = pdev->dev.platform_data;
+ pdata = platform_get_drvdata(pdev);
if (pdata) {
i2c->regstep = pdata->regstep;
i2c->clock_khz = pdata->clock_khz;
diff --git a/drivers/i2c/busses/i2c-xiic.c b/drivers/i2c/busses/i2c-xiic.c
index a9c419e..830b8c1 100644
--- a/drivers/i2c/busses/i2c-xiic.c
+++ b/drivers/i2c/busses/i2c-xiic.c
@@ -704,7 +704,7 @@ static int __devinit xiic_i2c_probe(struct platform_device *pdev)
if (irq < 0)
goto resource_missing;
- pdata = (struct xiic_i2c_platform_data *) pdev->dev.platform_data;
+ pdata = platform_get_drvdata(pdev);
if (!pdata)
return -EINVAL;
diff --git a/drivers/media/radio/radio-timb.c b/drivers/media/radio/radio-timb.c
index a185610..e7baf26 100644
--- a/drivers/media/radio/radio-timb.c
+++ b/drivers/media/radio/radio-timb.c
@@ -148,7 +148,7 @@ static const struct v4l2_file_operations timbradio_fops = {
static int __devinit timbradio_probe(struct platform_device *pdev)
{
- struct timb_radio_platform_data *pdata = pdev->dev.platform_data;
+ struct timb_radio_platform_data *pdata = platform_get_drvdata(pdev);
struct timbradio *tr;
int err;
diff --git a/drivers/media/video/timblogiw.c b/drivers/media/video/timblogiw.c
index fc611eb..61aa67a 100644
--- a/drivers/media/video/timblogiw.c
+++ b/drivers/media/video/timblogiw.c
@@ -790,7 +790,7 @@ static int __devinit timblogiw_probe(struct platform_device *pdev)
{
int err;
struct timblogiw *lw = NULL;
- struct timb_video_platform_data *pdata = pdev->dev.platform_data;
+ struct timb_video_platform_data *pdata = platform_get_drvdata(pdev);
if (!pdata) {
dev_err(&pdev->dev, "No platform data\n");
diff --git a/drivers/mfd/timberdale.c b/drivers/mfd/timberdale.c
index 6ad8a7f..e9ae162 100644
--- a/drivers/mfd/timberdale.c
+++ b/drivers/mfd/timberdale.c
@@ -384,8 +384,7 @@ static __devinitdata struct mfd_cell timberdale_cells_bar0_cfg0[] = {
.name = "timb-dma",
.num_resources = ARRAY_SIZE(timberdale_dma_resources),
.resources = timberdale_dma_resources,
- .platform_data = &timb_dma_platform_data,
- .data_size = sizeof(timb_dma_platform_data),
+ .driver_data = &timb_dma_platform_data,
},
{
.name = "timb-uart",
@@ -396,43 +395,37 @@ static __devinitdata struct mfd_cell timberdale_cells_bar0_cfg0[] = {
.name = "xiic-i2c",
.num_resources = ARRAY_SIZE(timberdale_xiic_resources),
.resources = timberdale_xiic_resources,
- .platform_data = &timberdale_xiic_platform_data,
- .data_size = sizeof(timberdale_xiic_platform_data),
+ .driver_data = &timberdale_xiic_platform_data,
},
{
.name = "timb-gpio",
.num_resources = ARRAY_SIZE(timberdale_gpio_resources),
.resources = timberdale_gpio_resources,
- .platform_data = &timberdale_gpio_platform_data,
- .data_size = sizeof(timberdale_gpio_platform_data),
+ .driver_data = &timberdale_gpio_platform_data,
},
{
.name = "timb-video",
.num_resources = ARRAY_SIZE(timberdale_video_resources),
.resources = timberdale_video_resources,
- .platform_data = &timberdale_video_platform_data,
- .data_size = sizeof(timberdale_video_platform_data),
+ .driver_data = &timberdale_video_platform_data,
},
{
.name = "timb-radio",
.num_resources = ARRAY_SIZE(timberdale_radio_resources),
.resources = timberdale_radio_resources,
- .platform_data = &timberdale_radio_platform_data,
- .data_size = sizeof(timberdale_radio_platform_data),
+ .driver_data = &timberdale_radio_platform_data,
},
{
.name = "xilinx_spi",
.num_resources = ARRAY_SIZE(timberdale_spi_resources),
.resources = timberdale_spi_resources,
- .platform_data = &timberdale_xspi_platform_data,
- .data_size = sizeof(timberdale_xspi_platform_data),
+ .driver_data = &timberdale_xspi_platform_data,
},
{
.name = "ks8842",
.num_resources = ARRAY_SIZE(timberdale_eth_resources),
.resources = timberdale_eth_resources,
- .platform_data = &timberdale_ks8842_platform_data,
- .data_size = sizeof(timberdale_ks8842_platform_data)
+ .driver_data = &timberdale_ks8842_platform_data,
},
};
@@ -441,8 +434,7 @@ static __devinitdata struct mfd_cell timberdale_cells_bar0_cfg1[] = {
.name = "timb-dma",
.num_resources = ARRAY_SIZE(timberdale_dma_resources),
.resources = timberdale_dma_resources,
- .platform_data = &timb_dma_platform_data,
- .data_size = sizeof(timb_dma_platform_data),
+ .driver_data = &timb_dma_platform_data,
},
{
.name = "timb-uart",
@@ -458,15 +450,13 @@ static __devinitdata struct mfd_cell timberdale_cells_bar0_cfg1[] = {
.name = "xiic-i2c",
.num_resources = ARRAY_SIZE(timberdale_xiic_resources),
.resources = timberdale_xiic_resources,
- .platform_data = &timberdale_xiic_platform_data,
- .data_size = sizeof(timberdale_xiic_platform_data),
+ .driver_data = &timberdale_xiic_platform_data,
},
{
.name = "timb-gpio",
.num_resources = ARRAY_SIZE(timberdale_gpio_resources),
.resources = timberdale_gpio_resources,
- .platform_data = &timberdale_gpio_platform_data,
- .data_size = sizeof(timberdale_gpio_platform_data),
+ .driver_data = &timberdale_gpio_platform_data,
},
{
.name = "timb-mlogicore",
@@ -477,29 +467,25 @@ static __devinitdata struct mfd_cell timberdale_cells_bar0_cfg1[] = {
.name = "timb-video",
.num_resources = ARRAY_SIZE(timberdale_video_resources),
.resources = timberdale_video_resources,
- .platform_data = &timberdale_video_platform_data,
- .data_size = sizeof(timberdale_video_platform_data),
+ .driver_data = &timberdale_video_platform_data,
},
{
.name = "timb-radio",
.num_resources = ARRAY_SIZE(timberdale_radio_resources),
.resources = timberdale_radio_resources,
- .platform_data = &timberdale_radio_platform_data,
- .data_size = sizeof(timberdale_radio_platform_data),
+ .driver_data = &timberdale_radio_platform_data,
},
{
.name = "xilinx_spi",
.num_resources = ARRAY_SIZE(timberdale_spi_resources),
.resources = timberdale_spi_resources,
- .platform_data = &timberdale_xspi_platform_data,
- .data_size = sizeof(timberdale_xspi_platform_data),
+ .driver_data = &timberdale_xspi_platform_data,
},
{
.name = "ks8842",
.num_resources = ARRAY_SIZE(timberdale_eth_resources),
.resources = timberdale_eth_resources,
- .platform_data = &timberdale_ks8842_platform_data,
- .data_size = sizeof(timberdale_ks8842_platform_data)
+ .driver_data = &timberdale_ks8842_platform_data,
},
};
@@ -508,8 +494,7 @@ static __devinitdata struct mfd_cell timberdale_cells_bar0_cfg2[] = {
.name = "timb-dma",
.num_resources = ARRAY_SIZE(timberdale_dma_resources),
.resources = timberdale_dma_resources,
- .platform_data = &timb_dma_platform_data,
- .data_size = sizeof(timb_dma_platform_data),
+ .driver_data = &timb_dma_platform_data,
},
{
.name = "timb-uart",
@@ -520,36 +505,31 @@ static __devinitdata struct mfd_cell timberdale_cells_bar0_cfg2[] = {
.name = "xiic-i2c",
.num_resources = ARRAY_SIZE(timberdale_xiic_resources),
.resources = timberdale_xiic_resources,
- .platform_data = &timberdale_xiic_platform_data,
- .data_size = sizeof(timberdale_xiic_platform_data),
+ .driver_data = &timberdale_xiic_platform_data,
},
{
.name = "timb-gpio",
.num_resources = ARRAY_SIZE(timberdale_gpio_resources),
.resources = timberdale_gpio_resources,
- .platform_data = &timberdale_gpio_platform_data,
- .data_size = sizeof(timberdale_gpio_platform_data),
+ .driver_data = &timberdale_gpio_platform_data,
},
{
.name = "timb-video",
.num_resources = ARRAY_SIZE(timberdale_video_resources),
.resources = timberdale_video_resources,
- .platform_data = &timberdale_video_platform_data,
- .data_size = sizeof(timberdale_video_platform_data),
+ .driver_data = &timberdale_video_platform_data,
},
{
.name = "timb-radio",
.num_resources = ARRAY_SIZE(timberdale_radio_resources),
.resources = timberdale_radio_resources,
- .platform_data = &timberdale_radio_platform_data,
- .data_size = sizeof(timberdale_radio_platform_data),
+ .driver_data = &timberdale_radio_platform_data,
},
{
.name = "xilinx_spi",
.num_resources = ARRAY_SIZE(timberdale_spi_resources),
.resources = timberdale_spi_resources,
- .platform_data = &timberdale_xspi_platform_data,
- .data_size = sizeof(timberdale_xspi_platform_data),
+ .driver_data = &timberdale_xspi_platform_data,
},
};
@@ -558,8 +538,7 @@ static __devinitdata struct mfd_cell timberdale_cells_bar0_cfg3[] = {
.name = "timb-dma",
.num_resources = ARRAY_SIZE(timberdale_dma_resources),
.resources = timberdale_dma_resources,
- .platform_data = &timb_dma_platform_data,
- .data_size = sizeof(timb_dma_platform_data),
+ .driver_data = &timb_dma_platform_data,
},
{
.name = "timb-uart",
@@ -570,43 +549,37 @@ static __devinitdata struct mfd_cell timberdale_cells_bar0_cfg3[] = {
.name = "ocores-i2c",
.num_resources = ARRAY_SIZE(timberdale_ocores_resources),
.resources = timberdale_ocores_resources,
- .platform_data = &timberdale_ocores_platform_data,
- .data_size = sizeof(timberdale_ocores_platform_data),
+ .driver_data = &timberdale_ocores_platform_data,
},
{
.name = "timb-gpio",
.num_resources = ARRAY_SIZE(timberdale_gpio_resources),
.resources = timberdale_gpio_resources,
- .platform_data = &timberdale_gpio_platform_data,
- .data_size = sizeof(timberdale_gpio_platform_data),
+ .driver_data = &timberdale_gpio_platform_data,
},
{
.name = "timb-video",
.num_resources = ARRAY_SIZE(timberdale_video_resources),
.resources = timberdale_video_resources,
- .platform_data = &timberdale_video_platform_data,
- .data_size = sizeof(timberdale_video_platform_data),
+ .driver_data = &timberdale_video_platform_data,
},
{
.name = "timb-radio",
.num_resources = ARRAY_SIZE(timberdale_radio_resources),
.resources = timberdale_radio_resources,
- .platform_data = &timberdale_radio_platform_data,
- .data_size = sizeof(timberdale_radio_platform_data),
+ .driver_data = &timberdale_radio_platform_data,
},
{
.name = "xilinx_spi",
.num_resources = ARRAY_SIZE(timberdale_spi_resources),
.resources = timberdale_spi_resources,
- .platform_data = &timberdale_xspi_platform_data,
- .data_size = sizeof(timberdale_xspi_platform_data),
+ .driver_data = &timberdale_xspi_platform_data,
},
{
.name = "ks8842",
.num_resources = ARRAY_SIZE(timberdale_eth_resources),
.resources = timberdale_eth_resources,
- .platform_data = &timberdale_ks8842_platform_data,
- .data_size = sizeof(timberdale_ks8842_platform_data)
+ .driver_data = &timberdale_ks8842_platform_data,
},
};
diff --git a/drivers/net/ks8842.c b/drivers/net/ks8842.c
index 928b2b8..7f0f51f 100644
--- a/drivers/net/ks8842.c
+++ b/drivers/net/ks8842.c
@@ -1145,7 +1145,7 @@ static int __devinit ks8842_probe(struct platform_device *pdev)
struct resource *iomem;
struct net_device *netdev;
struct ks8842_adapter *adapter;
- struct ks8842_platform_data *pdata = pdev->dev.platform_data;
+ struct ks8842_platform_data *pdata = platform_get_drvdata(pdev);
u16 id;
unsigned i;
diff --git a/drivers/spi/xilinx_spi.c b/drivers/spi/xilinx_spi.c
index 7adaef6..2926dec 100644
--- a/drivers/spi/xilinx_spi.c
+++ b/drivers/spi/xilinx_spi.c
@@ -474,7 +474,7 @@ static int __devinit xilinx_spi_probe(struct platform_device *dev)
struct spi_master *master;
u8 i;
- pdata = dev->dev.platform_data;
+ pdata = platform_get_drvdata(dev);
if (pdata) {
num_cs = pdata->num_chipselect;
little_endian = pdata->little_endian;
--
1.7.2.3
^ permalink raw reply related
* Re: [PATCH] tcp: Increase the initial congestion window to 10.
From: David Miller @ 2011-02-03 3:34 UTC (permalink / raw)
To: eric.dumazet; +Cc: netdev, dccp, therbert
In-Reply-To: <1296698023.4434.3.camel@edumazet-laptop>
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 03 Feb 2011 02:53:43 +0100
> Le mercredi 02 février 2011 à 17:07 -0800, David Miller a écrit :
>> Signed-off-by: David S. Miller <davem@davemloft.net>
>> ---
>>
>
> Hmm, you forgot a Changelog David ;)
Maybe, or maybe not, frankly it's quite self-evident if you ask
me. :-)
^ permalink raw reply
* Re: [PATCH] tcp: Increase the initial congestion window to 10.
From: Nandita Dukkipati @ 2011-02-03 2:25 UTC (permalink / raw)
To: David Miller; +Cc: netdev, dccp, therbert
In-Reply-To: <20110202.170750.229739784.davem@davemloft.net>
Acked-by: Nandita Dukkipati <nanditad@google.com>
On Wed, Feb 2, 2011 at 5:07 PM, David Miller <davem@davemloft.net> wrote:
>
> Signed-off-by: David S. Miller <davem@davemloft.net>
> ---
>
> I've left the DCCP code to keep using RFC3390 logic, if they
> wish to adopt this change in their code they can do so by
> simply deleting the rfc33390_bytes_to_packets() function and
> using TCP_INIT_CWND in their assignment.
>
> include/net/tcp.h | 12 +++---------
> net/dccp/ccids/ccid2.c | 9 +++++++++
> net/ipv4/tcp_input.c | 2 +-
> 3 files changed, 13 insertions(+), 10 deletions(-)
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 9179111..7118668 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -196,6 +196,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
> /* TCP thin-stream limits */
> #define TCP_THIN_LINEAR_RETRIES 6 /* After 6 linear retries, do exp. backoff */
>
> +/* TCP initial congestion window */
> +#define TCP_INIT_CWND 10
> +
> extern struct inet_timewait_death_row tcp_death_row;
>
> /* sysctl variables for tcp */
> @@ -799,15 +802,6 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk)
> /* Use define here intentionally to get WARN_ON location shown at the caller */
> #define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out)
>
> -/*
> - * Convert RFC 3390 larger initial window into an equivalent number of packets.
> - * This is based on the numbers specified in RFC 5681, 3.1.
> - */
> -static inline u32 rfc3390_bytes_to_packets(const u32 smss)
> -{
> - return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3);
> -}
> -
> extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh);
> extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst);
>
> diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
> index e96d5e8..fadecd2 100644
> --- a/net/dccp/ccids/ccid2.c
> +++ b/net/dccp/ccids/ccid2.c
> @@ -583,6 +583,15 @@ done:
> dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
> }
>
> +/*
> + * Convert RFC 3390 larger initial window into an equivalent number of packets.
> + * This is based on the numbers specified in RFC 5681, 3.1.
> + */
> +static inline u32 rfc3390_bytes_to_packets(const u32 smss)
> +{
> + return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3);
> +}
> +
> static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
> {
> struct ccid2_hc_tx_sock *hc = ccid_priv(ccid);
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index eb7f82e..2f692ce 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -817,7 +817,7 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
> __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
>
> if (!cwnd)
> - cwnd = rfc3390_bytes_to_packets(tp->mss_cache);
> + cwnd = TCP_INIT_CWND;
> return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
> }
>
> --
> 1.7.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [PATCH net-next] CHOKe flow scheduler (0.11)
From: Eric Dumazet @ 2011-02-03 1:59 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, Patrick McHardy, netdev
In-Reply-To: <20110202172110.7ea96e19@nehalam>
Le mercredi 02 février 2011 à 17:21 -0800, Stephen Hemminger a écrit :
> Subject: sched: CHOKe flow scheduler
>
> CHOKe ("CHOose and Kill" or "CHOose and Keep") is an alternative
> packet scheduler based on the Random Exponential Drop (RED) algorithm.
>
> The core idea is:
> For every packet arrival:
> Calculate Qave
> if (Qave < minth)
> Queue the new packet
> else
> Select randomly a packet from the queue
> if (both packets from same flow)
> then Drop both the packets
> else if (Qave > maxth)
> Drop packet
> else
> Admit packet with proability p (same as RED)
>
> See also:
> Rong Pan, Balaji Prabhakar, Konstantinos Psounis, "CHOKe: a stateless active
> queue management scheme for approximating fair bandwidth allocation",
> Proceeding of INFOCOM'2000, March 2000.
>
> Help from:
> Eric Dumazet <eric.dumazet@gmail.com>
> Patrick McHardy <kaber@trash.net>
>
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>
> ---
> 0.11 - incorporates Eric's change to use rxhash
>
>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Thanks Stephen !
^ permalink raw reply
* Re: [PATCH] tcp: Increase the initial congestion window to 10.
From: Eric Dumazet @ 2011-02-03 1:53 UTC (permalink / raw)
To: David Miller; +Cc: netdev, dccp, therbert
In-Reply-To: <20110202.170750.229739784.davem@davemloft.net>
Le mercredi 02 février 2011 à 17:07 -0800, David Miller a écrit :
> Signed-off-by: David S. Miller <davem@davemloft.net>
> ---
>
Hmm, you forgot a Changelog David ;)
I thought Tom and Google guys were preparing a nice one ?
^ permalink raw reply
* Re: [PATCH] sfq: deadlock in error path
From: Eric Dumazet @ 2011-02-03 1:51 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, netdev
In-Reply-To: <20110202171951.09b89d92@nehalam>
Le mercredi 02 février 2011 à 17:19 -0800, Stephen Hemminger a écrit :
> The change to allow divisor to be a parameter (in 2.6.38-rc1)
> commit 817fb15dfd988d8dda916ee04fa506f0c466b9d6
> introduced a possible deadlock caught by sparse.
>
> The scheduler tree lock was left locked in the case of an incorrect
> divisor value. Simplest fix is to move test outside of lock
> which also solves problem of partial update.
>
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Oops, thanks Stephen !
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
But please, please, please, please, please, remove dada1@cosmosbay.com,
as this address is no longer usable for me !
^ permalink raw reply
* [PATCH net-next] CHOKe flow scheduler (0.11)
From: Stephen Hemminger @ 2011-02-03 1:21 UTC (permalink / raw)
To: Eric Dumazet, David Miller; +Cc: Patrick McHardy, netdev
In-Reply-To: <1295563611.2613.41.camel@edumazet-laptop>
Subject: sched: CHOKe flow scheduler
CHOKe ("CHOose and Kill" or "CHOose and Keep") is an alternative
packet scheduler based on the Random Exponential Drop (RED) algorithm.
The core idea is:
For every packet arrival:
Calculate Qave
if (Qave < minth)
Queue the new packet
else
Select randomly a packet from the queue
if (both packets from same flow)
then Drop both the packets
else if (Qave > maxth)
Drop packet
else
Admit packet with proability p (same as RED)
See also:
Rong Pan, Balaji Prabhakar, Konstantinos Psounis, "CHOKe: a stateless active
queue management scheme for approximating fair bandwidth allocation",
Proceeding of INFOCOM'2000, March 2000.
Help from:
Eric Dumazet <eric.dumazet@gmail.com>
Patrick McHardy <kaber@trash.net>
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
---
0.11 - incorporates Eric's change to use rxhash
include/linux/pkt_sched.h | 29 +
net/sched/Kconfig | 11
net/sched/Makefile | 2
net/sched/sch_choke.c | 676 ++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 718 insertions(+)
--- a/net/sched/Kconfig 2011-01-31 09:01:35.000000000 -0800
+++ b/net/sched/Kconfig 2011-02-02 17:00:36.798764819 -0800
@@ -217,6 +217,17 @@ config NET_SCH_MQPRIO
If unsure, say N.
+config NET_SCH_CHOKE
+ tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
+ help
+ Say Y here if you want to use the CHOKe packet scheduler (CHOose
+ and Keep for responsive flows, CHOose and Kill for unresponsive
+ flows). This is a variation of RED which trys to penalize flows
+ that monopolize the queue.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_choke.
+
config NET_SCH_INGRESS
tristate "Ingress Qdisc"
depends on NET_CLS_ACT
--- a/net/sched/Makefile 2011-01-31 09:01:35.000000000 -0800
+++ b/net/sched/Makefile 2011-02-02 17:01:00.987025820 -0800
@@ -33,6 +33,8 @@ obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o
obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o
obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o
+obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
+
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
obj-$(CONFIG_NET_CLS_FW) += cls_fw.o
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ b/net/sched/sch_choke.c 2011-02-02 17:08:57.208163848 -0800
@@ -0,0 +1,676 @@
+/*
+ * net/sched/sch_choke.c CHOKE scheduler
+ *
+ * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
+ * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/reciprocal_div.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/red.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+
+/*
+ CHOKe stateless AQM for fair bandwidth allocation
+ =================================================
+
+ CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
+ unresponsive flows) is a variant of RED that penalizes misbehaving flows but
+ maintains no flow state. The difference from RED is an additional step
+ during the enqueuing process. If average queue size is over the
+ low threshold (qmin), a packet is chosen at random from the queue.
+ If both the new and chosen packet are from the same flow, both
+ are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
+ needs to access packets in queue randomly. It has a minimal class
+ interface to allow overriding the builtin flow classifier with
+ filters.
+
+ Source:
+ R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
+ Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
+ IEEE INFOCOM, 2000.
+
+ A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
+ Characteristics", IEEE/ACM Transactions on Networking, 2004
+
+ */
+
+/* Upper bound on size of sk_buff table (packets) */
+#define CHOKE_MAX_QUEUE (128*1024 - 1)
+
+struct choke_sched_data {
+/* Parameters */
+ u32 limit;
+ unsigned char flags;
+
+ struct red_parms parms;
+
+/* Variables */
+ struct tcf_proto *filter_list;
+ struct {
+ u32 prob_drop; /* Early probability drops */
+ u32 prob_mark; /* Early probability marks */
+ u32 forced_drop; /* Forced drops, qavg > max_thresh */
+ u32 forced_mark; /* Forced marks, qavg > max_thresh */
+ u32 pdrop; /* Drops due to queue limits */
+ u32 other; /* Drops due to drop() calls */
+ u32 matched; /* Drops to flow match */
+ } stats;
+
+ unsigned int head;
+ unsigned int tail;
+
+ unsigned int tab_mask; /* size - 1 */
+
+ struct sk_buff **tab;
+};
+
+/* deliver a random number between 0 and N - 1 */
+static u32 random_N(unsigned int N)
+{
+ return reciprocal_divide(random32(), N);
+}
+
+/* number of elements in queue including holes */
+static unsigned int choke_len(const struct choke_sched_data *q)
+{
+ return (q->tail - q->head) & q->tab_mask;
+}
+
+/* Is ECN parameter configured */
+static int use_ecn(const struct choke_sched_data *q)
+{
+ return q->flags & TC_RED_ECN;
+}
+
+/* Should packets over max just be dropped (versus marked) */
+static int use_harddrop(const struct choke_sched_data *q)
+{
+ return q->flags & TC_RED_HARDDROP;
+}
+
+/* Move head pointer forward to skip over holes */
+static void choke_zap_head_holes(struct choke_sched_data *q)
+{
+ do {
+ q->head = (q->head + 1) & q->tab_mask;
+ if (q->head == q->tail)
+ break;
+ } while (q->tab[q->head] == NULL);
+}
+
+/* Move tail pointer backwards to reuse holes */
+static void choke_zap_tail_holes(struct choke_sched_data *q)
+{
+ do {
+ q->tail = (q->tail - 1) & q->tab_mask;
+ if (q->head == q->tail)
+ break;
+ } while (q->tab[q->tail] == NULL);
+}
+
+/* Drop packet from queue array by creating a "hole" */
+static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb = q->tab[idx];
+
+ q->tab[idx] = NULL;
+
+ if (idx == q->head)
+ choke_zap_head_holes(q);
+ if (idx == q->tail)
+ choke_zap_tail_holes(q);
+
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ qdisc_drop(skb, sch);
+ qdisc_tree_decrease_qlen(sch, 1);
+ --sch->q.qlen;
+}
+
+/*
+ * Compare flow of two packets
+ * Returns true only if source and destination address and port match.
+ * false for special cases
+ */
+static bool choke_match_flow(struct sk_buff *skb1,
+ struct sk_buff *skb2)
+{
+ int off1, off2, poff;
+ const u32 *ports1, *ports2;
+ u8 ip_proto;
+ __u32 hash1;
+
+ if (skb1->protocol != skb2->protocol)
+ return false;
+
+ /* Use hash value as quick check
+ * Assumes that __skb_get_rxhash makes IP header and ports linear
+ */
+ hash1 = skb_get_rxhash(skb1);
+ if (!hash1 || hash1 != skb_get_rxhash(skb2))
+ return false;
+
+ /* Probably match, but be sure to avoid hash collisions */
+ off1 = skb_network_offset(skb1);
+ off2 = skb_network_offset(skb2);
+
+ switch (skb1->protocol) {
+ case __constant_htons(ETH_P_IP): {
+ const struct iphdr *ip1, *ip2;
+
+ ip1 = (const struct iphdr *) (skb1->data + off1);
+ ip2 = (const struct iphdr *) (skb2->data + off2);
+
+ ip_proto = ip1->protocol;
+ if (ip_proto != ip2->protocol ||
+ ip1->saddr != ip2->saddr || ip1->daddr != ip2->daddr)
+ return false;
+
+ if ((ip1->frag_off | ip2->frag_off) & htons(IP_MF | IP_OFFSET))
+ ip_proto = 0;
+ off1 += ip1->ihl * 4;
+ off2 += ip2->ihl * 4;
+ break;
+ }
+
+ case __constant_htons(ETH_P_IPV6): {
+ const struct ipv6hdr *ip1, *ip2;
+
+ ip1 = (const struct ipv6hdr *) (skb1->data + off1);
+ ip2 = (const struct ipv6hdr *) (skb2->data + off2);
+
+ ip_proto = ip1->nexthdr;
+ if (ip_proto != ip2->nexthdr ||
+ ipv6_addr_cmp(&ip1->saddr, &ip2->saddr) ||
+ ipv6_addr_cmp(&ip1->daddr, &ip2->daddr))
+ return false;
+ off1 += 40;
+ off2 += 40;
+ }
+
+ default: /* Maybe compare MAC header here? */
+ return false;
+ }
+
+ poff = proto_ports_offset(ip_proto);
+ if (poff < 0)
+ return true;
+
+ off1 += poff;
+ off2 += poff;
+
+ ports1 = (__force u32 *)(skb1->data + off1);
+ ports2 = (__force u32 *)(skb2->data + off2);
+ return *ports1 == *ports2;
+}
+
+static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
+{
+ *(unsigned int *)(qdisc_skb_cb(skb)->data) = classid;
+}
+
+static u16 choke_get_classid(const struct sk_buff *skb)
+{
+ return *(unsigned int *)(qdisc_skb_cb(skb)->data);
+}
+
+/*
+ * Classify flow using either:
+ * 1. pre-existing classification result in skb
+ * 2. fast internal classification
+ * 3. use TC filter based classification
+ */
+static bool choke_classify(struct sk_buff *skb,
+ struct Qdisc *sch, int *qerr)
+
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct tcf_result res;
+ int result;
+
+ result = tc_classify(skb, q->filter_list, &res);
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ case TC_ACT_SHOT:
+ return false;
+ }
+#endif
+ choke_set_classid(skb, TC_H_MIN(res.classid));
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Select a packet at random from queue
+ * HACK: since queue can have holes from previous deletion; retry several
+ * times to find a random skb but then just give up and return the head
+ * Will return NULL if queue is empty (q->head == q->tail)
+ */
+static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
+ unsigned int *pidx)
+{
+ struct sk_buff *skb;
+ int retrys = 3;
+
+ do {
+ *pidx = (q->head + random_N(choke_len(q))) & q->tab_mask;
+ skb = q->tab[*pidx];
+ if (skb)
+ return skb;
+ } while (--retrys > 0);
+
+ return q->tab[*pidx = q->head];
+}
+
+/*
+ * Compare new packet with random packet in queue
+ * returns true if matched and sets *pidx
+ */
+static bool choke_match_random(const struct choke_sched_data *q,
+ struct sk_buff *nskb,
+ unsigned int *pidx)
+{
+ struct sk_buff *oskb;
+
+ if (q->head == q->tail)
+ return false;
+
+ oskb = choke_peek_random(q, pidx);
+ if (q->filter_list)
+ return choke_get_classid(nskb) == choke_get_classid(oskb);
+
+ return choke_match_flow(oskb, nskb);
+}
+
+static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct red_parms *p = &q->parms;
+ int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+
+ if (q->filter_list) {
+ /* If using external classifiers, get result and record it. */
+ if (!choke_classify(skb, sch, &ret))
+ goto other_drop; /* Packet was eaten by filter */
+ }
+
+ /* Compute average queue usage (see RED) */
+ p->qavg = red_calc_qavg(p, sch->q.qlen);
+ if (red_is_idling(p))
+ red_end_of_idle_period(p);
+
+ /* Is queue small? */
+ if (p->qavg <= p->qth_min)
+ p->qcount = -1;
+ else {
+ unsigned int idx;
+
+ /* Draw a packet at random from queue and compare flow */
+ if (choke_match_random(q, skb, &idx)) {
+ q->stats.matched++;
+ choke_drop_by_idx(sch, idx);
+ goto congestion_drop;
+ }
+
+ /* Queue is large, always mark/drop */
+ if (p->qavg > p->qth_max) {
+ p->qcount = -1;
+
+ sch->qstats.overlimits++;
+ if (use_harddrop(q) || !use_ecn(q) ||
+ !INET_ECN_set_ce(skb)) {
+ q->stats.forced_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.forced_mark++;
+ } else if (++p->qcount) {
+ if (red_mark_probability(p, p->qavg)) {
+ p->qcount = 0;
+ p->qR = red_random(p);
+
+ sch->qstats.overlimits++;
+ if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
+ q->stats.prob_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.prob_mark++;
+ }
+ } else
+ p->qR = red_random(p);
+ }
+
+ /* Admit new packet */
+ if (sch->q.qlen < q->limit) {
+ q->tab[q->tail] = skb;
+ q->tail = (q->tail + 1) & q->tab_mask;
+ ++sch->q.qlen;
+ sch->qstats.backlog += qdisc_pkt_len(skb);
+ return NET_XMIT_SUCCESS;
+ }
+
+ q->stats.pdrop++;
+ sch->qstats.drops++;
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+
+ congestion_drop:
+ qdisc_drop(skb, sch);
+ return NET_XMIT_CN;
+
+ other_drop:
+ if (ret & __NET_XMIT_BYPASS)
+ sch->qstats.drops++;
+ kfree_skb(skb);
+ return ret;
+}
+
+static struct sk_buff *choke_dequeue(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+
+ if (q->head == q->tail) {
+ if (!red_is_idling(&q->parms))
+ red_start_of_idle_period(&q->parms);
+ return NULL;
+ }
+
+ skb = q->tab[q->head];
+ q->tab[q->head] = NULL;
+ choke_zap_head_holes(q);
+ --sch->q.qlen;
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ qdisc_bstats_update(sch, skb);
+
+ return skb;
+}
+
+static unsigned int choke_drop(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ unsigned int len;
+
+ len = qdisc_queue_drop(sch);
+ if (len > 0)
+ q->stats.other++;
+ else {
+ if (!red_is_idling(&q->parms))
+ red_start_of_idle_period(&q->parms);
+ }
+
+ return len;
+}
+
+static void choke_reset(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ red_restart(&q->parms);
+}
+
+static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
+ [TCA_CHOKE_PARMS] = { .len = sizeof(struct tc_red_qopt) },
+ [TCA_CHOKE_STAB] = { .len = RED_STAB_SIZE },
+};
+
+
+static void choke_free(void *addr)
+{
+ if (addr) {
+ if (is_vmalloc_addr(addr))
+ vfree(addr);
+ else
+ kfree(addr);
+ }
+}
+
+static int choke_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_CHOKE_MAX + 1];
+ const struct tc_red_qopt *ctl;
+ int err;
+ struct sk_buff **old = NULL;
+ unsigned int mask;
+
+ if (opt == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_CHOKE_PARMS] == NULL ||
+ tb[TCA_CHOKE_STAB] == NULL)
+ return -EINVAL;
+
+ ctl = nla_data(tb[TCA_CHOKE_PARMS]);
+
+ if (ctl->limit > CHOKE_MAX_QUEUE)
+ return -EINVAL;
+
+ mask = roundup_pow_of_two(ctl->limit + 1) - 1;
+ if (mask != q->tab_mask) {
+ struct sk_buff **ntab;
+
+ ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL);
+ if (!ntab)
+ ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
+ if (!ntab)
+ return -ENOMEM;
+
+ sch_tree_lock(sch);
+ old = q->tab;
+ if (old) {
+ unsigned int oqlen = sch->q.qlen, tail = 0;
+
+ while (q->head != q->tail) {
+ struct sk_buff *skb = q->tab[q->head];
+
+ q->head = (q->head + 1) & q->tab_mask;
+ if (!skb)
+ continue;
+ if (tail < mask) {
+ ntab[tail++] = skb;
+ continue;
+ }
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ --sch->q.qlen;
+ qdisc_drop(skb, sch);
+ }
+ qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
+ q->head = 0;
+ q->tail = tail;
+ }
+
+ q->tab_mask = mask;
+ q->tab = ntab;
+ } else
+ sch_tree_lock(sch);
+
+ q->flags = ctl->flags;
+ q->limit = ctl->limit;
+
+ red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
+ ctl->Plog, ctl->Scell_log,
+ nla_data(tb[TCA_CHOKE_STAB]));
+
+ if (q->head == q->tail)
+ red_end_of_idle_period(&q->parms);
+
+ sch_tree_unlock(sch);
+ choke_free(old);
+ return 0;
+}
+
+static int choke_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ return choke_change(sch, opt);
+}
+
+static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts = NULL;
+ struct tc_red_qopt opt = {
+ .limit = q->limit,
+ .flags = q->flags,
+ .qth_min = q->parms.qth_min >> q->parms.Wlog,
+ .qth_max = q->parms.qth_max >> q->parms.Wlog,
+ .Wlog = q->parms.Wlog,
+ .Plog = q->parms.Plog,
+ .Scell_log = q->parms.Scell_log,
+ };
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+
+ NLA_PUT(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt);
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct tc_choke_xstats st = {
+ .early = q->stats.prob_drop + q->stats.forced_drop,
+ .marked = q->stats.prob_mark + q->stats.forced_mark,
+ .pdrop = q->stats.pdrop,
+ .other = q->stats.other,
+ .matched = q->stats.matched,
+ };
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static void choke_destroy(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ tcf_destroy_chain(&q->filter_list);
+ choke_free(q->tab);
+}
+
+static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ return NULL;
+}
+
+static unsigned long choke_get(struct Qdisc *sch, u32 classid)
+{
+ return 0;
+}
+
+static void choke_put(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ return 0;
+}
+
+static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+ return &q->filter_list;
+}
+
+static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ return 0;
+}
+
+static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ if (!arg->stop) {
+ if (arg->fn(sch, 1, arg) < 0) {
+ arg->stop = 1;
+ return;
+ }
+ arg->count++;
+ }
+}
+
+static const struct Qdisc_class_ops choke_class_ops = {
+ .leaf = choke_leaf,
+ .get = choke_get,
+ .put = choke_put,
+ .tcf_chain = choke_find_tcf,
+ .bind_tcf = choke_bind,
+ .unbind_tcf = choke_put,
+ .dump = choke_dump_class,
+ .walk = choke_walk,
+};
+
+static struct sk_buff *choke_peek_head(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ return (q->head != q->tail) ? q->tab[q->head] : NULL;
+}
+
+static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
+ .id = "choke",
+ .priv_size = sizeof(struct choke_sched_data),
+
+ .enqueue = choke_enqueue,
+ .dequeue = choke_dequeue,
+ .peek = choke_peek_head,
+ .drop = choke_drop,
+ .init = choke_init,
+ .destroy = choke_destroy,
+ .reset = choke_reset,
+ .change = choke_change,
+ .dump = choke_dump,
+ .dump_stats = choke_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init choke_module_init(void)
+{
+ return register_qdisc(&choke_qdisc_ops);
+}
+
+static void __exit choke_module_exit(void)
+{
+ unregister_qdisc(&choke_qdisc_ops);
+}
+
+module_init(choke_module_init)
+module_exit(choke_module_exit)
+
+MODULE_LICENSE("GPL");
--- a/include/linux/pkt_sched.h 2011-01-31 09:01:32.000000000 -0800
+++ b/include/linux/pkt_sched.h 2011-02-02 17:00:36.802764862 -0800
@@ -247,6 +247,35 @@ struct tc_gred_sopt {
__u16 pad1;
};
+/* CHOKe section */
+
+enum {
+ TCA_CHOKE_UNSPEC,
+ TCA_CHOKE_PARMS,
+ TCA_CHOKE_STAB,
+ __TCA_CHOKE_MAX,
+};
+
+#define TCA_CHOKE_MAX (__TCA_CHOKE_MAX - 1)
+
+struct tc_choke_qopt {
+ __u32 limit; /* Hard queue length (packets) */
+ __u32 qth_min; /* Min average threshold (packets) */
+ __u32 qth_max; /* Max average threshold (packets) */
+ unsigned char Wlog; /* log(W) */
+ unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */
+ unsigned char Scell_log; /* cell size for idle damping */
+ unsigned char flags; /* see RED flags */
+};
+
+struct tc_choke_xstats {
+ __u32 early; /* Early drops */
+ __u32 pdrop; /* Drops due to queue limits */
+ __u32 other; /* Drops due to drop() calls */
+ __u32 marked; /* Marked packets */
+ __u32 matched; /* Drops due to flow match */
+};
+
/* HTB section */
#define TC_HTB_NUMPRIO 8
#define TC_HTB_MAXDEPTH 8
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox