* [patch 2/7] natsemi section fix
From: akpm @ 2008-02-08 11:11 UTC (permalink / raw)
To: davem; +Cc: jeff, netdev, akpm, sam
From: Andrew Morton <akpm@linux-foundation.org>
gcc-3.4.4 on powerpc:
drivers/net/natsemi.c:245: error: natsemi_pci_info causes a section type conflict
Cc: Jeff Garzik <jeff@garzik.org>
Cc: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
drivers/net/natsemi.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff -puN drivers/net/natsemi.c~natsemi-section-fix drivers/net/natsemi.c
--- a/drivers/net/natsemi.c~natsemi-section-fix
+++ a/drivers/net/natsemi.c
@@ -127,7 +127,7 @@ static int full_duplex[MAX_UNITS];
#define NATSEMI_RX_LIMIT 2046 /* maximum supported by hardware */
/* These identify the driver base version and may not be removed. */
-static const char version[] __devinitdata =
+static char version[] __devinitdata =
KERN_INFO DRV_NAME " dp8381x driver, version "
DRV_VERSION ", " DRV_RELDATE "\n"
KERN_INFO " originally by Donald Becker <becker@scyld.com>\n"
@@ -238,7 +238,7 @@ enum {
};
/* array of board data directly indexed by pci_tbl[x].driver_data */
-static const struct {
+static struct {
const char *name;
unsigned long flags;
unsigned int eeprom_size;
@@ -247,7 +247,7 @@ static const struct {
{ "NatSemi DP8381[56]", 0, 24 },
};
-static const struct pci_device_id natsemi_pci_tbl[] __devinitdata = {
+static struct pci_device_id natsemi_pci_tbl[] __devinitdata = {
{ PCI_VENDOR_ID_NS, 0x0020, 0x12d9, 0x000c, 0, 0, 0 },
{ PCI_VENDOR_ID_NS, 0x0020, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 1 },
{ } /* terminate list */
_
^ permalink raw reply
* [patch 1/7] typhoon section fix
From: akpm @ 2008-02-08 11:11 UTC (permalink / raw)
To: davem; +Cc: jeff, netdev, akpm, sam
From: Andrew Morton <akpm@linux-foundation.org>
gcc-3.4.4 on powerpc:
drivers/net/typhoon.c:137: error: version causes a section type conflict
Cc: Jeff Garzik <jeff@garzik.org>
Cc: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
drivers/net/typhoon.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff -puN drivers/net/typhoon.c~typhoon-section-fix drivers/net/typhoon.c
--- a/drivers/net/typhoon.c~typhoon-section-fix
+++ a/drivers/net/typhoon.c
@@ -134,7 +134,7 @@ static const int multicast_filter_limit
#include "typhoon.h"
#include "typhoon-firmware.h"
-static const char version[] __devinitdata =
+static char version[] __devinitdata =
"typhoon.c: version " DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n";
MODULE_AUTHOR("David Dillow <dave@thedillows.org>");
@@ -178,7 +178,7 @@ enum typhoon_cards {
};
/* directly indexed by enum typhoon_cards, above */
-static const struct typhoon_card_info typhoon_card_info[] __devinitdata = {
+static struct typhoon_card_info typhoon_card_info[] __devinitdata = {
{ "3Com Typhoon (3C990-TX)",
TYPHOON_CRYPTO_NONE},
{ "3Com Typhoon (3CR990-TX-95)",
_
^ permalink raw reply
* [patch 7/7] pppol2tp: fix printk warnings
From: akpm @ 2008-02-08 11:11 UTC (permalink / raw)
To: davem; +Cc: jeff, netdev, akpm
From: Andrew Morton <akpm@linux-foundation.org>
drivers/net/pppol2tp.c: In function `pppol2tp_seq_tunnel_show':
drivers/net/pppol2tp.c:2295: warning: long long unsigned int format, __u64 arg (arg 4)
drivers/net/pppol2tp.c:2295: warning: long long unsigned int format, __u64 arg (arg 5)
drivers/net/pppol2tp.c:2295: warning: long long unsigned int format, __u64 arg (arg 6)
drivers/net/pppol2tp.c:2295: warning: long long unsigned int format, __u64 arg (arg 7)
drivers/net/pppol2tp.c:2295: warning: long long unsigned int format, __u64 arg (arg 8)
drivers/net/pppol2tp.c:2295: warning: long long unsigned int format, __u64 arg (arg 9)
drivers/net/pppol2tp.c: In function `pppol2tp_seq_session_show':
drivers/net/pppol2tp.c:2328: warning: long long unsigned int format, __u64 arg (arg 5)
drivers/net/pppol2tp.c:2328: warning: long long unsigned int format, __u64 arg (arg 6)
drivers/net/pppol2tp.c:2328: warning: long long unsigned int format, __u64 arg (arg 7)
drivers/net/pppol2tp.c:2328: warning: long long unsigned int format, __u64 arg (arg 8)
drivers/net/pppol2tp.c:2328: warning: long long unsigned int format, __u64 arg (arg 9)
drivers/net/pppol2tp.c:2328: warning: long long unsigned int format, __u64 arg (arg 10)
Not all platforms implement u64 as unsigned long long, e.g. powerpc.
Cc: Jeff Garzik <jeff@garzik.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
drivers/net/pppol2tp.c | 22 ++++++++++++----------
1 file changed, 12 insertions(+), 10 deletions(-)
diff -puN drivers/net/pppol2tp.c~pppol2tp-fix-printk-warnings drivers/net/pppol2tp.c
--- a/drivers/net/pppol2tp.c~pppol2tp-fix-printk-warnings
+++ a/drivers/net/pppol2tp.c
@@ -2289,10 +2289,12 @@ static void pppol2tp_seq_tunnel_show(str
atomic_read(&tunnel->ref_count) - 1);
seq_printf(m, " %08x %llu/%llu/%llu %llu/%llu/%llu\n",
tunnel->debug,
- tunnel->stats.tx_packets, tunnel->stats.tx_bytes,
- tunnel->stats.tx_errors,
- tunnel->stats.rx_packets, tunnel->stats.rx_bytes,
- tunnel->stats.rx_errors);
+ (unsigned long long)tunnel->stats.tx_packets,
+ (unsigned long long)tunnel->stats.tx_bytes,
+ (unsigned long long)tunnel->stats.tx_errors,
+ (unsigned long long)tunnel->stats.rx_packets,
+ (unsigned long long)tunnel->stats.rx_bytes,
+ (unsigned long long)tunnel->stats.rx_errors);
}
static void pppol2tp_seq_session_show(struct seq_file *m, void *v)
@@ -2320,12 +2322,12 @@ static void pppol2tp_seq_session_show(st
jiffies_to_msecs(session->reorder_timeout));
seq_printf(m, " %hu/%hu %llu/%llu/%llu %llu/%llu/%llu\n",
session->nr, session->ns,
- session->stats.tx_packets,
- session->stats.tx_bytes,
- session->stats.tx_errors,
- session->stats.rx_packets,
- session->stats.rx_bytes,
- session->stats.rx_errors);
+ (unsigned long long)session->stats.tx_packets,
+ (unsigned long long)session->stats.tx_bytes,
+ (unsigned long long)session->stats.tx_errors,
+ (unsigned long long)session->stats.rx_packets,
+ (unsigned long long)session->stats.rx_bytes,
+ (unsigned long long)session->stats.rx_errors);
}
static int pppol2tp_seq_show(struct seq_file *m, void *v)
_
^ permalink raw reply
* [PATCH 2/2][SCTP]: Convert sctp_dbg_objcnt to seq files.
From: Pavel Emelyanov @ 2008-02-08 11:08 UTC (permalink / raw)
To: David Miller; +Cc: Vlad Yasevich, lksctp-developers, Linux Netdev List
This makes the code use a good proc API and the text ~50 bytes shorter.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
---
net/sctp/objcnt.c | 85 +++++++++++++++++++++++++++-------------------------
1 files changed, 44 insertions(+), 41 deletions(-)
diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c
index 2cf6ad6..2b9ac00 100644
--- a/net/sctp/objcnt.c
+++ b/net/sctp/objcnt.c
@@ -80,61 +80,64 @@ static sctp_dbg_objcnt_entry_t sctp_dbg_objcnt[] = {
/* Callback from procfs to read out objcount information.
* Walk through the entries in the sctp_dbg_objcnt array, dumping
* the raw object counts for each monitored type.
- *
- * This code was modified from similar code in route.c
*/
-static int sctp_dbg_objcnt_read(char *buffer, char **start, off_t offset,
- int length, int *eof, void *data)
+static int sctp_objcnt_seq_show(struct seq_file *seq, void *v)
{
- int len = 0;
- off_t pos = 0;
- int entries;
int i;
char temp[128];
- /* How many entries? */
- entries = ARRAY_SIZE(sctp_dbg_objcnt);
-
- /* Walk the entries and print out the debug information
- * for proc fs.
- */
- for (i = 0; i < entries; i++) {
- pos += 128;
-
- /* Skip ahead. */
- if (pos <= offset) {
- len = 0;
- continue;
- }
- /* Print out each entry. */
- sprintf(temp, "%s: %d",
- sctp_dbg_objcnt[i].label,
- atomic_read(sctp_dbg_objcnt[i].counter));
-
- sprintf(buffer + len, "%-127s\n", temp);
- len += 128;
- if (pos >= offset+length)
- goto done;
- }
-
-done:
- *start = buffer + len - (pos - offset);
- len = pos - offset;
- if (len > length)
- len = length;
-
- return len;
+ i = (int)*(loff_t *)v;
+ sprintf(temp, "%s: %d", sctp_dbg_objcnt[i].label,
+ atomic_read(sctp_dbg_objcnt[i].counter));
+ seq_printf(seq, "%-127s\n", temp);
+ return 0;
+}
+
+static void *sctp_objcnt_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return (*pos >= ARRAY_SIZE(sctp_dbg_objcnt)) ? NULL : (void *)pos;
+}
+
+static void sctp_objcnt_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static void * sctp_objcnt_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ return (*pos >= ARRAY_SIZE(sctp_dbg_objcnt)) ? NULL : (void *)pos;
}
+static const struct seq_operations sctp_objcnt_seq_ops = {
+ .start = sctp_objcnt_seq_start,
+ .next = sctp_objcnt_seq_next,
+ .stop = sctp_objcnt_seq_stop,
+ .show = sctp_objcnt_seq_show,
+};
+
+static int sctp_objcnt_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &sctp_objcnt_seq_ops);
+}
+
+static const struct file_operations sctp_objcnt_ops = {
+ .open = sctp_objcnt_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
/* Initialize the objcount in the proc filesystem. */
void sctp_dbg_objcnt_init(void)
{
struct proc_dir_entry *ent;
- ent = create_proc_read_entry("sctp_dbg_objcnt", 0, proc_net_sctp,
- sctp_dbg_objcnt_read, NULL);
+
+ ent = create_proc_entry("sctp_dbg_objcnt", 0, proc_net_sctp);
if (!ent)
printk(KERN_WARNING
"sctp_dbg_objcnt: Unable to create /proc entry.\n");
+ else
+ ent->proc_fops = &sctp_objcnt_ops;
}
/* Cleanup the objcount entry in the proc filesystem. */
--
1.5.3.4
^ permalink raw reply related
* [PATCH 1/2][SCTP]: Use snmp_fold_field instead of a homebrew analogue.
From: Pavel Emelyanov @ 2008-02-08 11:04 UTC (permalink / raw)
To: David Miller; +Cc: Vlad Yasevich, lksctp-developers, Linux Netdev List
SCTP already depends on INET, so this doesn't create additional
dependencies.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
---
net/sctp/proc.c | 23 ++---------------------
1 files changed, 2 insertions(+), 21 deletions(-)
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 2499732..974350b 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -38,6 +38,7 @@
#include <linux/seq_file.h>
#include <linux/init.h>
#include <net/sctp/sctp.h>
+#include <net/ip.h> /* for snmp_fold_field */
static struct snmp_mib sctp_snmp_list[] = {
SNMP_MIB_ITEM("SctpCurrEstab", SCTP_MIB_CURRESTAB),
@@ -75,26 +76,6 @@ static struct snmp_mib sctp_snmp_list[] = {
SNMP_MIB_SENTINEL
};
-/* Return the current value of a particular entry in the mib by adding its
- * per cpu counters.
- */
-static unsigned long
-fold_field(void *mib[], int nr)
-{
- unsigned long res = 0;
- int i;
-
- for_each_possible_cpu(i) {
- res +=
- *((unsigned long *) (((void *) per_cpu_ptr(mib[0], i)) +
- sizeof (unsigned long) * nr));
- res +=
- *((unsigned long *) (((void *) per_cpu_ptr(mib[1], i)) +
- sizeof (unsigned long) * nr));
- }
- return res;
-}
-
/* Display sctp snmp mib statistics(/proc/net/sctp/snmp). */
static int sctp_snmp_seq_show(struct seq_file *seq, void *v)
{
@@ -102,7 +83,7 @@ static int sctp_snmp_seq_show(struct seq_file *seq, void *v)
for (i = 0; sctp_snmp_list[i].name != NULL; i++)
seq_printf(seq, "%-32s\t%ld\n", sctp_snmp_list[i].name,
- fold_field((void **)sctp_statistics,
+ snmp_fold_field((void **)sctp_statistics,
sctp_snmp_list[i].entry));
return 0;
--
1.5.3.4
^ permalink raw reply related
* Re: oops with ipcomp
From: Beschorner Daniel @ 2008-02-08 10:45 UTC (permalink / raw)
To: Herbert Xu; +Cc: netdev
In-Reply-To: <20080208085836.GC14650@gondor.apana.org.au>
> No I meant the exact output of ip x p and ip x s.
I know, but as I end up every time with a tainted kernel on our
production server I didn't turn ipcomp on, but now I got it.
src Net_B dst A
dir in priority 2088
tmpl src B dst A
proto comp reqid 16394 mode tunnel
level use
tmpl src 0.0.0.0 dst 0.0.0.0
proto esp reqid 16393 mode transport
src Net_B dst Net_A
dir in priority 2344
tmpl src B dst A
proto comp reqid 16390 mode tunnel
level use
tmpl src 0.0.0.0 dst 0.0.0.0
proto esp reqid 16389 mode transport
src A dst Net_B
dir out priority 2088
tmpl src A dst B
proto comp reqid 16394 mode tunnel
tmpl src 0.0.0.0 dst 0.0.0.0
proto esp reqid 16393 mode transport
src Net_A dst Net_B
dir out priority 2344
tmpl src A dst B
proto comp reqid 16390 mode tunnel
tmpl src 0.0.0.0 dst 0.0.0.0
proto esp reqid 16389 mode transport
src Net_B dst A
dir fwd priority 2088
tmpl src B dst A
proto comp reqid 16394 mode tunnel
level use
tmpl src 0.0.0.0 dst 0.0.0.0
proto esp reqid 16393 mode transport
src Net_B dst Net_A
dir fwd priority 2344
tmpl src B dst A
proto comp reqid 16390 mode tunnel
level use
tmpl src 0.0.0.0 dst 0.0.0.0
proto esp reqid 16389 mode transport
src A dst B
proto comp spi 0x0000427e reqid 16390 mode tunnel
replay-window 0
comp deflate 0x
sel src 0.0.0.0/0 dst 0.0.0.0/0
src B dst A
proto comp spi 0x0000ecf0 reqid 16390 mode tunnel
replay-window 0
comp deflate 0x
sel src 0.0.0.0/0 dst 0.0.0.0/0
src A dst B
proto esp spi 0x53f15e96 reqid 16389 mode transport
replay-window 32
auth hmac(sha1) 0x...
enc cbc(aes) 0x...
sel src 0.0.0.0/0 dst 0.0.0.0/0
src B dst A
proto esp spi 0x7b329066 reqid 16389 mode transport
replay-window 32
auth hmac(sha1) 0...
enc cbc(aes) 0x...
sel src 0.0.0.0/0 dst 0.0.0.0/0
src A dst B
proto (null) spi 0x53ec987a reqid 0 mode tunnel
replay-window 0
sel src 0.0.0.0/0 dst 0.0.0.0/0
src B dst A
proto (null) spi 0xc19ef67c reqid 0 mode tunnel
replay-window 0
sel src 0.0.0.0/0 dst 0.0.0.0/0
src A dst B
proto comp spi 0x00001314 reqid 16394 mode tunnel
replay-window 0
comp deflate 0x
sel src 0.0.0.0/0 dst 0.0.0.0/0
src B dst A
proto comp spi 0x000032ff reqid 16394 mode tunnel
replay-window 0
comp deflate 0x
sel src 0.0.0.0/0 dst 0.0.0.0/0
src A dst B
proto esp spi 0xec7d12de reqid 16393 mode transport
replay-window 32
auth hmac(sha1) 0x...
enc cbc(aes) 0x...
sel src 0.0.0.0/0 dst 0.0.0.0/0
src B dst A
proto esp spi 0x75016d2d reqid 16393 mode transport
replay-window 32
auth hmac(sha1) 0x...
enc cbc(aes) 0x...
sel src 0.0.0.0/0 dst 0.0.0.0/0
^ permalink raw reply
* Re: [PATCHv2] net: sh_eth: Add support for Renesas SuperH Ethernet
From: Yoshihiro Shimoda @ 2008-02-08 10:40 UTC (permalink / raw)
To: Andrew Morton; +Cc: jgarzik, netdev, David Brownell
In-Reply-To: <20080207010529.868e931b.akpm@linux-foundation.org>
Andrew Morton wrote:
> On Thu, 07 Feb 2008 17:39:23 +0900 Yoshihiro Shimoda <shimoda.yoshihiro@renesas.com> wrote:
>
>> Add support for Renesas SuperH Ethernet controller.
>> This driver supported SH7710 and SH7712.
>>
>
> Nice looking driver.
>
> Quick comments:
Thank you very much for your comment.
>> +static void __init update_mac_address(struct net_device *ndev)
>>
>> --- snip ---
>>
>> +static void __init read_mac_address(struct net_device *ndev)
>
> Both the above functions are called from non-__init code and hence cannot
> be __init. sh_eth_tsu_init() is wrong too. Please check all section
> annotations in the driver.
I understood it. I will modify it.
>> +struct bb_info {
>> + struct mdiobb_ctrl ctrl;
>> + u32 addr;
>> + u32 mmd_msk;/* MMD */
>> + u32 mdo_msk;
>> + u32 mdi_msk;
>> + u32 mdc_msk;
>> +};
>
> Please cc David Brownell on updates to this driver - perhaps he will find
> time to review the bit-banging interface usage.
>
>> +/* PHY bit set */
>> +static void bb_set(u32 addr, u32 msk)
>> +{
>> + ctrl_outl(ctrl_inl(addr) | msk, addr);
>> +}
>> +
>> +/* PHY bit clear */
>> +static void bb_clr(u32 addr, u32 msk)
>> +{
>> + ctrl_outl((ctrl_inl(addr) & ~msk), addr);
>> +}
>> +
>> +/* PHY bit read */
>> +static int bb_read(u32 addr, u32 msk)
>> +{
>> + return (ctrl_inl(addr) & msk) != 0;
>> +}
>> +
>> +/* Data I/O pin control */
>> +static inline void sh__mmd_ctrl(struct mdiobb_ctrl *ctrl, int bit)
>> +{
>> + struct bb_info *bitbang = container_of(ctrl, struct bb_info, ctrl);
>> + if (bit)
>> + bb_set(bitbang->addr, bitbang->mmd_msk);
>> + else
>> + bb_clr(bitbang->addr, bitbang->mmd_msk);
>> +}
>> +
>> +/* Set bit data*/
>> +static inline void sh__set_mdio(struct mdiobb_ctrl *ctrl, int bit)
>> +{
>> + struct bb_info *bitbang = container_of(ctrl, struct bb_info, ctrl);
>> +
>> + if (bit)
>> + bb_set(bitbang->addr, bitbang->mdo_msk);
>> + else
>> + bb_clr(bitbang->addr, bitbang->mdo_msk);
>> +}
>> +
>> +/* Get bit data*/
>> +static inline int sh__get_mdio(struct mdiobb_ctrl *ctrl)
>> +{
>> + struct bb_info *bitbang = container_of(ctrl, struct bb_info, ctrl);
>> + return bb_read(bitbang->addr, bitbang->mdi_msk);
>> +}
>
> There seems to be a fairly random mixture of inline and non-inline here.
> I'd suggest that you just remove all the `inline's. The compiler does a
> pretty good job of doing this for you.
I understood it. I will remove inline. I will not use inline in future
unless there is a special reason.
>> +/* MDC pin control */
>> +static inline void sh__mdc_ctrl(struct mdiobb_ctrl *ctrl, int bit)
>> +{
>> + struct bb_info *bitbang = container_of(ctrl, struct bb_info, ctrl);
>> +
>> + if (bit)
>> + bb_set(bitbang->addr, bitbang->mdc_msk);
>> + else
>> + bb_clr(bitbang->addr, bitbang->mdc_msk);
>> +}
>> +
>> +/* mdio bus control struct */
>> +static struct mdiobb_ops bb_ops = {
>> + .owner = THIS_MODULE,
>> + .set_mdc = sh__mdc_ctrl,
>> + .set_mdio_dir = sh__mmd_ctrl,
>> + .set_mdio_data = sh__set_mdio,
>> + .get_mdio_data = sh__get_mdio,
>> +};
>
> It's particularly inappropriate that sh__mdc_ctrl() was inlined - it is
> only ever called via a function pointer and hence will never be inlined!
I understood it.
>> ...
>>
>> +static void sh_eth_timer(unsigned long data)
>> +{
>> + struct net_device *ndev = (struct net_device *)data;
>> + struct sh_eth_private *mdp = netdev_priv(ndev);
>> + int next_tick = 10 * HZ;
>> +
>> + /* We could do something here... nah. */
>> + mdp->timer.expires = jiffies + next_tick;
>> + add_timer(&mdp->timer);
>
> mod_timer() would be neater here.
>
>> +}
>>
>> --- snip ---
>>
>> + /* Set the timer to check for link beat. */
>> + init_timer(&mdp->timer);
>> + mdp->timer.expires = (jiffies + (24 * HZ)) / 10;/* 2.4 sec. */
>> + mdp->timer.data = (u32) ndev;
>> + mdp->timer.function = sh_eth_timer; /* timer handler */
>
> setup_timer()
I understood it. I will modify these.
>> +}
>> +
>>
>> +#ifdef __LITTLE_ENDIAN__
>> +static inline void swaps(char *src, int len)
>> +{
>> + u32 *p = (u32 *)src;
>> + u32 *maxp;
>> + maxp = p + ((len + sizeof(u32) - 1) / sizeof(u32));
>> +
>> + for (; p < maxp; p++)
>> + *p = swab32(*p);
>> +}
>> +#else
>> +#define swaps(x, y)
>> +#endif
>> +
>
> I'd say that the big-endian version of swaps() should be a C function
> rather than a macro. It's nicer to look at, consistent, provides typechecking,
> can help avoid unused-variable warnings (an inline function provides a
> reference to the arguments whereas a macro does not).
>
> The little-endian version of this function is too large to be inlined.
>
> This function looks fairly generic. Are we sure there isn't some library
> function which does this?
>
I looked in lib/, include/linux/ and include/linux/byteorder/, but
no such function was found. So I will modify swaps().
Thanks,
Yoshihiro Shimoda
^ permalink raw reply
* tg3 kernel BUG at include/linux/netdevice.h:918!
From: Frank van Maarseveen @ 2008-02-08 10:06 UTC (permalink / raw)
To: netdev
FWIW,
kernel 2.6.22.10, tainted with nvidia and vmware. BUG triggered twice
now when a large number of processes (unrelated to vmware) tried to send
lots of TCP data to other linux boxes (real, not virtual). tg3 TSO has
been disabled with ethtool.
kernel BUG at include/linux/netdevice.h:918!
invalid opcode: 0000 [#1]
SMP
Modules linked in: nvidia(P) vmnetfilter vmnet(P) vmmon(P) vmthrottle sysprof
CPU: 0
EIP: 0060:[<c0344b61>] Tainted: P VLI
EFLAGS: 00010046 (2.6.22.10-x168 #1)
EIP is at tg3_poll+0x161/0x1c0
eax: 00000006 ebx: f7445000 ecx: 00000000 edx: f7532000
esi: f7445600 edi: 00000202 ebp: e1f55bcc esp: e1f55bb4
ds: 007b es: 007b fs: 00d8 gs: 0033 ss: 0068
Process fcp (pid: 13068, ti=e1f54000 task=dc6fb400 task.ti=e1f54000)
Stack: e1f55bcc 00000001 e1f55bdc f7445000 c2b3aa80 c2a22d2c e1f55bec c04b8e4d
0ae106dd c2a22d00 0000012c 00000005 c0796b18 c07fb820 e1f55c08 c0128de8
00000000 0000000a 00000246 f7445000 c7a179ac e1f55c14 c0128eac f7445200
Call Trace:
[<c01054aa>] show_trace_log_lvl+0x1a/0x30
[<c010557a>] show_stack_log_lvl+0x9a/0xc0
[<c01057d6>] show_registers+0x1d6/0x2e0
[<c0105a46>] die+0x106/0x240
[<c0105c11>] do_trap+0x91/0xd0
[<c0105eb9>] do_invalid_op+0x89/0xa0
[<c0575b42>] error_code+0x72/0x80
[<c04b8e4d>] net_rx_action+0x8d/0x170
[<c0128de8>] __do_softirq+0x78/0x100
[<c0128eac>] do_softirq+0x3c/0x40
[<c0128cf0>] local_bh_enable+0x80/0xb0
[<c04b85f2>] dev_queue_xmit+0x222/0x310
[<c04f0de4>] ip_output+0x1d4/0x350
[<c04f13bc>] ip_queue_xmit+0x45c/0x480
[<c0502e06>] tcp_transmit_skb+0x2b6/0x490
[<c050461a>] tcp_write_xmit+0x18a/0x260
[<c0504707>] __tcp_push_pending_frames+0x17/0x80
[<c04f88c0>] tcp_sendmsg+0x780/0xbe0
[<c04acb67>] do_sock_write+0x97/0xb0
[<c04acbea>] sock_aio_write+0x6a/0x80
[<c0173b47>] do_sync_write+0xc7/0x120
[<c0173cbf>] vfs_write+0x11f/0x130
[<c0173d7d>] sys_write+0x3d/0x70
[<c0104132>] syscall_call+0x7/0xb
=======================
Code: e0 fd 83 c8 01 89 07 8d 83 04 06 00 00 89 45 e8 e8 c5 0a 23 00 31 d2 89 f0 e8 7c f4 ff ff 8b 45 e8 e8 c4 0b 23 00 e9 c9 fe ff ff <0f> 0b eb fe 9c 5f fa 8b 43 2c a8 20 74 40 8d 8b 80 01 00 00 8b
EIP: [<c0344b61>] tg3_poll+0x161/0x1c0 SS:ESP 0068:e1f55bb4
Kernel panic - not syncing: Fatal exception in interrupt
913 /* same as netif_rx_complete, except that local_irq_save(flags)
914 * has already been issued
915 */
916 static inline void __netif_rx_complete(struct net_device *dev)
917 {
>918 BUG_ON(!test_bit(__LINK_STATE_RX_SCHED, &dev->state));
919 list_del(&dev->poll_list);
920 smp_mb__before_clear_bit();
921 clear_bit(__LINK_STATE_RX_SCHED, &dev->state);
922 }
--
Frank
^ permalink raw reply
* [PATCH] [IPV6] Replace using the magic constant "1024" with IP6_RT_PRIO_USER for fc_metric.
From: Rami Rosen @ 2008-02-08 10:14 UTC (permalink / raw)
To: David Miller, netdev
[-- Attachment #1: Type: text/plain, Size: 184 bytes --]
Hi,
This patch replaces the explicit usage of the magic constant "1024"
with IP6_RT_PRIO_USER in the IPV6 tree.
Regards,
Rami Rosen
Signed-off-by: Rami Rosen <ramirose@gmail.com>
[-- Attachment #2: patch.txt --]
[-- Type: text/plain, Size: 799 bytes --]
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 513f72e..6e7b56e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1620,7 +1620,7 @@ static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixle
{
struct fib6_config cfg = {
.fc_table = RT6_TABLE_INFO,
- .fc_metric = 1024,
+ .fc_metric = IP6_RT_PRIO_USER,
.fc_ifindex = ifindex,
.fc_dst_len = prefixlen,
.fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
@@ -1670,7 +1670,7 @@ struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
{
struct fib6_config cfg = {
.fc_table = RT6_TABLE_DFLT,
- .fc_metric = 1024,
+ .fc_metric = IP6_RT_PRIO_USER,
.fc_ifindex = dev->ifindex,
.fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
^ permalink raw reply related
* [PATCH] smc91x: Add MigoR board support
From: Magnus Damm @ 2008-02-08 9:49 UTC (permalink / raw)
To: netdev; +Cc: Magnus Damm, linux-sh
This patch adds MigoR board support to the smc91x driver.
Signed-off-by: Magnus Damm <damm@igel.co.jp>
---
drivers/net/smc91x.h | 14 ++++++++++++++
1 file changed, 14 insertions(+)
--- 0001/drivers/net/smc91x.h
+++ work/drivers/net/smc91x.h 2008-02-06 23:13:55.000000000 +0900
@@ -306,6 +306,20 @@ SMC_outw(u16 val, void __iomem *ioaddr,
#define SMC_insw(a, r, p, l) insw((a) + (r), p, l)
#define SMC_outsw(a, r, p, l) outsw((a) + (r), p, l)
+#elif defined(CONFIG_SH_MIGOR)
+
+#define SMC_IRQ_FLAGS (-1)
+#define SMC_CAN_USE_8BIT 0
+#define SMC_CAN_USE_16BIT 1
+#define SMC_CAN_USE_32BIT 0
+#define SMC_IO_SHIFT 0
+#define SMC_NOWAIT 1
+
+#define SMC_inw(a, r) inw((a) + (r))
+#define SMC_outw(v, a, r) outw(v, (a) + (r))
+#define SMC_insw(a, r, p, l) insw((a) + (r), p, l)
+#define SMC_outsw(a, r, p, l) outsw((a) + (r), p, l)
+
#else /* BOARDS */
#define SMC_CAN_USE_8BIT 1
^ permalink raw reply
* Re: ipcomp regression in 2.6.24
From: David Miller @ 2008-02-08 9:51 UTC (permalink / raw)
To: pupilla; +Cc: netdev
In-Reply-To: <BAY103-DAV366423029E541B55E047CB22F0@phx.gbl>
From: "Marco Berizzi" <pupilla@hotmail.com>
Date: Fri, 8 Feb 2008 10:12:25 +0100
> I haven't seen this patch in Greg 2.6.24-stable
> review message.
Due to having just returned from LCA08 I haven't
made any -stable submissions in a while, and I
notified Greg of this tonight.
The networking fixes will make it into the next
2.6.24.x release.
^ permalink raw reply
* Re: ipcomp regression in 2.6.24
From: Marco Berizzi @ 2008-02-08 9:12 UTC (permalink / raw)
To: David Miller; +Cc: netdev
In-Reply-To: <20080129.211227.160323820.davem@davemloft.net>
David Miller wrote:
> From: Herbert Xu <herbert@gondor.apana.org.au>
> Date: Wed, 30 Jan 2008 14:15:33 +1100
>
> > Marco Berizzi <pupilla@hotmail.com> wrote:
> > >
> > >> > With 2.6.24 IPSEC/ESP tunnels to older kernels establish fine, data
> > >> > flows in both directions, but no data comes out of the tunnel.
> > >> > Needed to disable ipcomp.
> > >
> > > Same problem here: linux 2.6.24 driven by openswan 2.4.11
> > > on Slackware 11.0
> >
> > My bad. This patch should fix it.
> >
> > [IPCOMP]: Fetch nexthdr before ipch is destroyed
> >
> > When I moved the nexthdr setting out of IPComp I accidentally moved
> > the reading of ipch->nexthdr after the decompression. Unfortunately
> > this means that we'd be reading from a stale ipch pointer which
> > doesn't work very well.
> >
> > This patch moves the reading up so that we get the correct nexthdr
> > value.
> >
> > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
>
> Applied, and queued for -stable, thanks!
Hi David,
I haven't seen this patch in Greg 2.6.24-stable
review message.
^ permalink raw reply
* Re: oops with ipcomp
From: Herbert Xu @ 2008-02-08 8:58 UTC (permalink / raw)
To: Beschorner Daniel; +Cc: netdev
In-Reply-To: <3C59DB883F7B0B4D8096010D45ACCD134F20CE@exch.facton.local>
On Thu, Feb 07, 2008 at 07:01:32PM +0100, Beschorner Daniel wrote:
>
> > Could you show me the exact policies/SAs of the tunnel involved
> > in the crash?
>
> esp/cbc(aes128)/hmac(sha1)
No I meant the exact output of ip x p and ip x s.
Thanks,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu 许志壬 <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply
* [PATCH][NET_SCHED] sch_htb: htb_requeue fix
From: Jarek Poplawski @ 2008-02-08 8:57 UTC (permalink / raw)
To: David Miller; +Cc: netdev
htb_requeue() enqueues skbs for which htb_classify() returns NULL.
This is wrong because such skbs could be handled by NET_CLS_ACT code,
and the decision could be different than earlier in htb_enqueue().
So htb_requeue() is changed to work and look more like htb_enqueue().
Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
---
diff -Nurp 2.6.24-mm1-/net/sched/sch_htb.c 2.6.24-mm1+/net/sched/sch_htb.c
--- 2.6.24-mm1-/net/sched/sch_htb.c 2008-02-05 07:45:48.000000000 +0000
+++ 2.6.24-mm1+/net/sched/sch_htb.c 2008-02-08 08:19:25.000000000 +0000
@@ -609,14 +609,14 @@ static int htb_enqueue(struct sk_buff *s
/* TODO: requeuing packet charges it to policers again !! */
static int htb_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
+ int ret;
struct htb_sched *q = qdisc_priv(sch);
- int ret = NET_XMIT_SUCCESS;
struct htb_class *cl = htb_classify(skb, sch, &ret);
struct sk_buff *tskb;
- if (cl == HTB_DIRECT || !cl) {
+ if (cl == HTB_DIRECT) {
/* enqueue to helper queue */
- if (q->direct_queue.qlen < q->direct_qlen && cl) {
+ if (q->direct_queue.qlen < q->direct_qlen) {
__skb_queue_head(&q->direct_queue, skb);
} else {
__skb_queue_head(&q->direct_queue, skb);
@@ -625,6 +625,13 @@ static int htb_requeue(struct sk_buff *s
sch->qstats.drops++;
return NET_XMIT_CN;
}
+#ifdef CONFIG_NET_CLS_ACT
+ } else if (!cl) {
+ if (ret == NET_XMIT_BYPASS)
+ sch->qstats.drops++;
+ kfree_skb(skb);
+ return ret;
+#endif
} else if (cl->un.leaf.q->ops->requeue(skb, cl->un.leaf.q) !=
NET_XMIT_SUCCESS) {
sch->qstats.drops++;
^ permalink raw reply
* Re: [NET_SCHED] Traffic Control subsystem Notifier for Kernel Modules
From: David Miller @ 2008-02-08 8:24 UTC (permalink / raw)
To: yashpal.dutta; +Cc: netdev, kim.phillips
In-Reply-To: <1202455182-13379-1-git-send-email-yashpal.dutta@freescale.com>
From: Yashpal Dutta <yashpal.dutta@freescale.com>
Date: Fri, 8 Feb 2008 12:49:42 +0530
> The patch adds raw notifiers in Traffic Control subsystem for communicating
> Queue Disciplines and Classifiers added by user via TC application.
> Interested kernel modules will have to register to the TC subsystem
> to be able to get new QDisc notifications asynchronously.
>
> Signed-off-by: Yashpal Dutta <yashpal.dutta@freescale.com>
> Signed-off-by: Kim Phillips <kim.phillips@freescale.com>
You can simply listen on the appropriate netlink socket for these
events already. There is no reason I can see to provide them over a
different framework like this.
^ permalink raw reply
* [NET_SCHED] Traffic Control subsystem Notifier for Kernel Modules
From: Yashpal Dutta @ 2008-02-08 7:19 UTC (permalink / raw)
To: netdev; +Cc: Yashpal Dutta, Kim Phillips
The patch adds raw notifiers in Traffic Control subsystem for communicating
Queue Disciplines and Classifiers added by user via TC application.
Interested kernel modules will have to register to the TC subsystem
to be able to get new QDisc notifications asynchronously.
Signed-off-by: Yashpal Dutta <yashpal.dutta@freescale.com>
Signed-off-by: Kim Phillips <kim.phillips@freescale.com>
---
include/net/sch_generic.h | 11 ++++++-
include/net/sch_notify.h | 66 +++++++++++++++++++++++++++++++++++++++++++
net/sched/cls_api.c | 42 +++++++++++++++++++++++++++
net/sched/cls_fw.c | 16 ++++++++++
net/sched/sch_api.c | 34 ++++++++++++++++++++++
net/sched/sch_cbq.c | 68 +++++++++++++++++++++++++++++++++++++++++++++
net/sched/sch_prio.c | 22 ++++++++++++++-
net/sched/sch_red.c | 29 ++++++++++++++++++-
net/sched/sch_sfq.c | 15 ++++++++++
net/sched/sch_tbf.c | 29 ++++++++++++++++++-
10 files changed, 328 insertions(+), 4 deletions(-)
create mode 100644 include/net/sch_notify.h
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index ab502ec..7e31279 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -9,6 +9,7 @@
#include <linux/pkt_cls.h>
#include <net/gen_stats.h>
#include <net/rtnetlink.h>
+#include <net/sch_notify.h>
struct Qdisc_ops;
struct qdisc_walker;
@@ -79,6 +80,9 @@ struct Qdisc_class_ops
/* rtnetlink specific */
int (*dump)(struct Qdisc *, unsigned long,
struct sk_buff *skb, struct tcmsg*);
+ /* CLS Notifier */
+ void (*notify_cls)(struct Qdisc *, unsigned long,
+ struct tc_cls_notify_s *);
int (*dump_stats)(struct Qdisc *, unsigned long,
struct gnet_dump *);
};
@@ -101,6 +105,9 @@ struct Qdisc_ops
int (*change)(struct Qdisc *, struct nlattr *arg);
int (*dump)(struct Qdisc *, struct sk_buff *);
+ /* QDISC Notifier */
+ void (*notify_qdisc)(struct Qdisc *,
+ struct qdisc_notify_s *);
int (*dump_stats)(struct Qdisc *, struct gnet_dump *);
struct module *owner;
@@ -134,7 +141,9 @@ struct tcf_proto_ops
/* rtnetlink specific */
int (*dump)(struct tcf_proto*, unsigned long,
struct sk_buff *skb, struct tcmsg*);
-
+ /* Classifier Notifier */
+ void (*notify_tcf)(struct tcf_proto *,
+ unsigned long, struct qdisc_notify_s *);
struct module *owner;
};
diff --git a/include/net/sch_notify.h b/include/net/sch_notify.h
new file mode 100644
index 0000000..cffdf0c
--- /dev/null
+++ b/include/net/sch_notify.h
@@ -0,0 +1,66 @@
+#ifndef __SCH_NOTIFY_H__
+#define __SCH_NOTIFY_H__
+
+/*
+ * Payload structures and (un)registration hooks for the traffic-control
+ * notifier chains.
+ */
+
+#include <linux/types.h>
+#include <linux/notifier.h>	/* struct notifier_block */
+#include <linux/pkt_sched.h>	/* struct tc_*_qopt, struct tc_cbq_* */
+#include <linux/rtnetlink.h>	/* struct tcmsg */
+
+/* Only used through a pointer below, so a forward declaration suffices. */
+struct tc_action;
+
+/*
+ * CBQ notification payload: copies of the CBQ option blocks of one class.
+ * Which members are filled in depends on the callback that produced it.
+ */
+struct tc_cbq_notify_s {
+	struct tc_cbq_lssopt lssopt;
+	struct tc_cbq_wrropt wrropt;
+	struct tc_cbq_ovl ovlopt;
+#ifdef CONFIG_NET_CLS_ACT
+	struct tc_cbq_police policeopt;
+#endif
+	struct tc_cbq_fopt fopt;
+};
+
+/* Class notification payload, filled by Qdisc_class_ops->notify_cls() */
+struct tc_cls_notify_s {
+	uint32_t type;		/* one of TC_EVENT_* */
+	struct tcmsg tcm;	/* copy of the tcmsg header built for the class */
+	union {
+		struct tc_cbq_notify_s tc_cbq_notifier;
+	} u;
+};
+
+/*
+ * "fw" classifier notification payload.
+ * NOTE(review): action is a bare pointer into the filter; presumably it is
+ * only valid while the notifier callback runs - confirm.
+ */
+struct tc_fw_notify_s {
+	uint32_t classid;
+	uint32_t mask;
+	uint32_t handle;
+	struct tc_action *action;
+};
+
+/*
+ * Main TC notification payload, used for both qdisc and filter events.
+ * 'type' is one of TC_EVENT_* and identifies the producer that filled 'u'.
+ */
+struct qdisc_notify_s {
+	uint32_t type;
+	uint32_t parent;
+	uint32_t handle;
+	uint32_t ifindex;
+	union {
+		struct tc_fw_notify_s tc_fw_notifier;
+		struct tc_cbq_notify_s tc_cbq_notifier;
+		struct tc_tbf_qopt tbf_qdisc_opt;
+		struct tc_red_qopt red_qdisc_opt;
+		struct tc_sfq_qopt sfq_qdisc_opt;
+		struct tc_prio_qopt prio_qdisc_opt;	/* Prio Qdisc Options */
+	} u;
+};
+
+/*
+ * Notifier register/unregister functions for qdiscs.
+ * NOTE(review): these return uint32_t although they forward the result of
+ * raw_notifier_chain_{,un}register(); if that result can be a negative errno
+ * it will show up as a large unsigned value - confirm and consider int.
+ */
+extern uint32_t register_qdisc_notifier(struct notifier_block *nb);
+extern uint32_t unregister_qdisc_notifier(struct notifier_block *nb);
+
+/* Notifier register/unregister functions for TC filters (same caveat) */
+extern uint32_t register_tcf_notifier(struct notifier_block *nb);
+extern uint32_t unregister_tcf_notifier(struct notifier_block *nb);
+
+/* Traffic control notify event types (stored in the 'type' member) */
+#define TC_EVENT_PRIO	0x00000001
+#define TC_EVENT_CBQ	0x00000002
+#define TC_EVENT_RED	0x00000003
+#define TC_EVENT_SFQ	0x00000004
+#define TC_EVENT_TBF	0x00000005
+#define TC_EVENT_FW	0x00000006
+#endif
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 0fbedca..a2dc1c1 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -29,6 +29,7 @@
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
+#include <net/sch_notify.h>
/* The list of all installed classifier types */
@@ -37,6 +38,9 @@ static struct tcf_proto_ops *tcf_proto_base __read_mostly;
/* Protects list of registered TC modules. It is pure SMP lock. */
static DEFINE_RWLOCK(cls_mod_lock);
+/*
+ * Traffic Classifier Notifier Chain Head.
+ * Listeners are added and removed with register_tcf_notifier() and
+ * unregister_tcf_notifier(); tfilter_notifier() calls the chain.
+ * NOTE(review): a raw notifier chain presumably leaves all locking to its
+ * users - confirm every register/unregister/call site is serialized.
+ */
+static RAW_NOTIFIER_HEAD(tcf_notifier);
+
/* Find classifier type by string name */
static struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind)
@@ -57,6 +61,19 @@ static struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind)
return t;
}
+/*
+ * Add @nb to the traffic classifier notifier chain.
+ * Returns whatever raw_notifier_chain_register() reports, converted to
+ * uint32_t.
+ */
+uint32_t register_tcf_notifier(struct notifier_block *nb)
+{
+	uint32_t ret;
+
+	ret = raw_notifier_chain_register(&tcf_notifier, nb);
+	return ret;
+}
+EXPORT_SYMBOL(register_tcf_notifier);
+
+/*
+ * Remove @nb from the traffic classifier notifier chain.
+ * Returns whatever raw_notifier_chain_unregister() reports, converted to
+ * uint32_t.
+ */
+uint32_t unregister_tcf_notifier(struct notifier_block *nb)
+{
+	uint32_t ret;
+
+	ret = raw_notifier_chain_unregister(&tcf_notifier, nb);
+	return ret;
+}
+EXPORT_SYMBOL(unregister_tcf_notifier);
+
/* Register(unregister) new classifier type */
int register_tcf_proto_ops(struct tcf_proto_ops *ops)
@@ -114,6 +131,29 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp)
return first;
}
+/*
+ * Traffic filter notify function.
+ *
+ * Asks the classifier behind @tp to describe filter @fh through its optional
+ * notify_tcf() hook, then publishes the result on the tcf_notifier chain with
+ * @event (the rtnetlink message type) as the notifier action.  Events other
+ * than RTM_NEWTFILTER, RTM_DELTFILTER and RTM_GETTFILTER are ignored.
+ *
+ * NOTE(review): tp, tp->q->dev and (inside the hook) the filter behind fh
+ * are dereferenced here, so for RTM_DELTFILTER the caller has to run this
+ * before either of them is freed - confirm at the call site.
+ * NOTE(review): the generic 'handle' is fh truncated to 32 bits - confirm
+ * that this is what listeners expect.
+ */
+static void tfilter_notifier(struct tcf_proto *tp, unsigned long fh, int event)
+{
+	struct qdisc_notify_s qdisc_notify;
+
+	switch (event) {
+	case RTM_NEWTFILTER:
+	case RTM_DELTFILTER:
+	case RTM_GETTFILTER:
+		if (!tp->ops->notify_tcf) {
+			printk(KERN_DEBUG "%s : TF Notification for %s Not supported\n"
+				, __func__, tp->ops->kind);
+			break;
+		}
+		/*
+		 * Hooks fill only their own union member; zero the rest so
+		 * listeners never see stale stack contents.
+		 */
+		memset(&qdisc_notify, 0, sizeof(qdisc_notify));
+		tp->ops->notify_tcf(tp, fh, &qdisc_notify);
+		qdisc_notify.parent = tp->classid;
+		qdisc_notify.handle = fh;
+		qdisc_notify.ifindex = tp->q->dev->ifindex;
+		raw_notifier_call_chain(&tcf_notifier, event,
+					&qdisc_notify);
+		break;
+	default:
+		break;
+	}
+}
+
/* Add/change/delete/get a filter node */
static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
@@ -316,6 +356,8 @@ replay:
errout:
if (cl)
cops->put(q, cl);
+ if (err == 0)
+ tfilter_notifier(tp, fh, n->nlmsg_type);
if (err == -EAGAIN)
/* Replay the request. */
goto replay;
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index b0f90e5..39c775a 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -27,6 +27,7 @@
#include <net/netlink.h>
#include <net/act_api.h>
#include <net/pkt_cls.h>
+#include <net/sch_notify.h>
#define HTSIZE (PAGE_SIZE/sizeof(struct fw_filter *))
@@ -327,6 +328,20 @@ static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
}
}
+/*
+ * FW mark/action notifier: describe filter @fh of classifier @tp in
+ * @qdisc_notify (type TC_EVENT_FW plus the tc_fw_notifier union member).
+ *
+ * A zero @fh or an absent head yields zeroed fields instead of a NULL
+ * dereference.  The action pointer is only available when the kernel is
+ * built with CONFIG_NET_CLS_ACT; otherwise NULL is reported.
+ */
+static void fw_notifier(struct tcf_proto *tp, unsigned long fh,
+			struct qdisc_notify_s *qdisc_notify)
+{
+	struct fw_head *head = (struct fw_head *)tp->root;
+	struct fw_filter *f = (struct fw_filter *)fh;
+	struct tc_fw_notify_s *fw = &qdisc_notify->u.tc_fw_notifier;
+
+	qdisc_notify->type = TC_EVENT_FW;
+	fw->classid = f ? f->res.classid : 0;
+	fw->mask = head ? head->mask : 0;
+	fw->handle = f ? f->id : 0;
+#ifdef CONFIG_NET_CLS_ACT
+	fw->action = f ? f->exts.action : NULL;
+#else
+	fw->action = NULL;
+#endif
+}
+
static int fw_dump(struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
@@ -382,6 +397,7 @@ static struct tcf_proto_ops cls_fw_ops __read_mostly = {
.delete = fw_delete,
.walk = fw_walk,
.dump = fw_dump,
+ .notify_tcf = fw_notifier,
.owner = THIS_MODULE,
};
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 7e3c048..674301d 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -32,12 +32,15 @@
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/sch_notify.h>
static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
struct Qdisc *q, unsigned long cl, int event);
+/*
+ * QDisc Notifier Head.
+ * Listeners are added and removed with register_qdisc_notifier() and
+ * unregister_qdisc_notifier(); tc_fill_qdisc() and tc_fill_tclass() call
+ * the chain.
+ * NOTE(review): a raw notifier chain presumably leaves all locking to its
+ * users - confirm every register/unregister/call site is serialized.
+ */
+static RAW_NOTIFIER_HEAD(qdisc_notifier);
/*
Short review.
@@ -818,6 +821,19 @@ graft:
return 0;
}
+/*
+ * Add @nb to the qdisc notifier chain.
+ * Returns whatever raw_notifier_chain_register() reports, converted to
+ * uint32_t.
+ */
+uint32_t register_qdisc_notifier(struct notifier_block *nb)
+{
+	uint32_t ret;
+
+	ret = raw_notifier_chain_register(&qdisc_notifier, nb);
+	return ret;
+}
+EXPORT_SYMBOL(register_qdisc_notifier);
+
+/*
+ * Remove @nb from the qdisc notifier chain.
+ * Returns whatever raw_notifier_chain_unregister() reports, converted to
+ * uint32_t.
+ */
+uint32_t unregister_qdisc_notifier(struct notifier_block *nb)
+{
+	uint32_t ret;
+
+	ret = raw_notifier_chain_unregister(&qdisc_notifier, nb);
+	return ret;
+}
+EXPORT_SYMBOL(unregister_qdisc_notifier);
+
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
u32 pid, u32 seq, u16 flags, int event)
{
@@ -825,6 +841,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
struct nlmsghdr *nlh;
unsigned char *b = skb_tail_pointer(skb);
struct gnet_dump d;
+ struct qdisc_notify_s qdisc_notify;
nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
tcm = NLMSG_DATA(nlh);
@@ -838,6 +855,16 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
if (q->ops->dump && q->ops->dump(q, skb) < 0)
goto nla_put_failure;
+	/*
+	 * Send QDisc Notification: let the qdisc describe itself, add the
+	 * generic identification and hand the payload to the listeners.
+	 */
+	if (q->ops->notify_qdisc) {
+		/* callbacks fill only their own union member */
+		memset(&qdisc_notify, 0, sizeof(qdisc_notify));
+		q->ops->notify_qdisc(q, &qdisc_notify);
+		qdisc_notify.parent = clid;
+		qdisc_notify.handle = q->handle;
+		qdisc_notify.ifindex = q->dev->ifindex;
+		/* pass the payload, not the chain head, to the listeners */
+		raw_notifier_call_chain(&qdisc_notifier, event,
+					&qdisc_notify);
+	}
+
q->qstats.qlen = q->q.qlen;
if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
@@ -1073,6 +1100,7 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
unsigned char *b = skb_tail_pointer(skb);
struct gnet_dump d;
const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
+ struct tc_cls_notify_s tc_cls_notify;
nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
tcm = NLMSG_DATA(nlh);
@@ -1084,6 +1112,12 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
goto nla_put_failure;
+	/*
+	 * Notify class listeners.  The payload is zeroed first because the
+	 * notify_cls() callbacks fill only part of it.
+	 * NOTE(review): this uses the qdisc chain with a tc_cls_notify_s
+	 * payload; listeners presumably tell the two payload types apart by
+	 * the event value - confirm.
+	 */
+	if (cl_ops->notify_cls) {
+		memset(&tc_cls_notify, 0, sizeof(tc_cls_notify));
+		cl_ops->notify_cls(q, cl, &tc_cls_notify);
+		memcpy(&tc_cls_notify.tcm, tcm, sizeof(struct tcmsg));
+		raw_notifier_call_chain(&qdisc_notifier, event, &tc_cls_notify);
+	}
if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
TCA_XSTATS, q->stats_lock, &d) < 0)
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 09969c1..4d9855e 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -18,6 +18,7 @@
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/sch_notify.h>
/* Class-Based Queueing (CBQ) algorithm.
@@ -1567,6 +1568,50 @@ static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
return 0;
}
+/*
+ * CBQ qdisc notification fill.
+ *
+ * Describes the qdisc's built-in link class (q->link) in the tc_cbq_notifier
+ * member of @qdisc_notify and tags the event as TC_EVENT_CBQ.  The CBQ
+ * payload is zeroed first, so option blocks that do not apply (no police
+ * setting, no split/defmap) and reserved fields read as zero instead of
+ * being left undefined.
+ */
+static void cbq_notify_qdisc(struct Qdisc *sch,
+			     struct qdisc_notify_s *qdisc_notify)
+{
+	struct cbq_sched_data *q = qdisc_priv(sch);
+	struct cbq_class *cl = &q->link;
+	struct tc_cbq_notify_s *cbq_notify = &qdisc_notify->u.tc_cbq_notifier;
+
+	qdisc_notify->type = TC_EVENT_CBQ;
+	memset(cbq_notify, 0, sizeof(*cbq_notify));
+
+	/* link-sharing options */
+	if (cl->borrow == NULL)
+		cbq_notify->lssopt.flags |= TCF_CBQ_LSS_BOUNDED;
+	if (cl->share == NULL)
+		cbq_notify->lssopt.flags |= TCF_CBQ_LSS_ISOLATED;
+	cbq_notify->lssopt.ewma_log = cl->ewma_log;
+	cbq_notify->lssopt.level = cl->level;
+	cbq_notify->lssopt.avpkt = cl->avpkt;
+	cbq_notify->lssopt.maxidle = cl->maxidle;
+	cbq_notify->lssopt.minidle = (u32)(-cl->minidle);
+	cbq_notify->lssopt.offtime = cl->offtime;
+	cbq_notify->lssopt.change = ~0;
+
+	/* weighted round-robin options; priorities are reported plus one */
+	cbq_notify->wrropt.allot = cl->allot;
+	cbq_notify->wrropt.priority = cl->priority+1;
+	cbq_notify->wrropt.cpriority = cl->cpriority+1;
+	cbq_notify->wrropt.weight = cl->weight;
+
+	/* overlimit strategy */
+	cbq_notify->ovlopt.strategy = cl->ovl_strategy;
+	cbq_notify->ovlopt.priority2 = cl->priority2+1;
+	cbq_notify->ovlopt.penalty = cl->penalty;
+#ifdef CONFIG_NET_CLS_ACT
+	if (cl->police)
+		cbq_notify->policeopt.police = cl->police;
+#endif
+	if (cl->split || cl->defmap) {
+		cbq_notify->fopt.split = cl->split ? cl->split->classid : 0;
+		cbq_notify->fopt.defmap = cl->defmap;
+		cbq_notify->fopt.defchange = ~0;
+	}
+}
+
static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct cbq_sched_data *q = qdisc_priv(sch);
@@ -1594,6 +1639,27 @@ cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
}
+/*
+ * CBQ class notification fill.
+ *
+ * Reports the link-sharing options of class @arg in the tc_cbq_notifier
+ * member of @cls_notify and tags the event as TC_EVENT_CBQ.  The remaining
+ * CBQ option blocks are zeroed rather than left undefined.
+ * NOTE(review): unlike cbq_notify_qdisc() the wrr/overlimit/police/filter
+ * options are not reported for classes - confirm this is intended.
+ */
+static void cbq_notify_cls(struct Qdisc *sch, unsigned long arg,
+			   struct tc_cls_notify_s *cls_notify)
+{
+	struct cbq_class *cl = (struct cbq_class *)arg;
+	struct tc_cbq_notify_s *cbq_notify = &cls_notify->u.tc_cbq_notifier;
+
+	cls_notify->type = TC_EVENT_CBQ;
+	memset(cbq_notify, 0, sizeof(*cbq_notify));
+
+	if (cl->borrow == NULL)
+		cbq_notify->lssopt.flags |= TCF_CBQ_LSS_BOUNDED;
+	if (cl->share == NULL)
+		cbq_notify->lssopt.flags |= TCF_CBQ_LSS_ISOLATED;
+	cbq_notify->lssopt.ewma_log = cl->ewma_log;
+	cbq_notify->lssopt.level = cl->level;
+	cbq_notify->lssopt.avpkt = cl->avpkt;
+	cbq_notify->lssopt.maxidle = cl->maxidle;
+	cbq_notify->lssopt.minidle = (u32)(-cl->minidle);
+	cbq_notify->lssopt.offtime = cl->offtime;
+	cbq_notify->lssopt.change = ~0;
+}
+
static int
cbq_dump_class(struct Qdisc *sch, unsigned long arg,
struct sk_buff *skb, struct tcmsg *tcm)
@@ -2045,6 +2111,7 @@ static const struct Qdisc_class_ops cbq_class_ops = {
.bind_tcf = cbq_bind_filter,
.unbind_tcf = cbq_unbind_filter,
.dump = cbq_dump_class,
+ .notify_cls = cbq_notify_cls,
.dump_stats = cbq_dump_class_stats,
};
@@ -2062,6 +2129,7 @@ static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
.destroy = cbq_destroy,
.change = NULL,
.dump = cbq_dump,
+ .notify_qdisc = cbq_notify_qdisc,
.dump_stats = cbq_dump_stats,
.owner = THIS_MODULE,
};
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 4aa2b45..4dadf0e 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -19,7 +19,7 @@
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
-
+#include <net/sch_notify.h>
struct prio_sched_data
{
@@ -318,6 +318,18 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt)
return 0;
}
+/*
+ * Prio qdisc notifier fill: report the band count and the priority-to-band
+ * map of @sch in the prio_qdisc_opt member, tagged TC_EVENT_PRIO.
+ */
+static void prio_notify_qdisc(struct Qdisc *sch,
+			      struct qdisc_notify_s *qdisc_notify)
+{
+	struct tc_prio_qopt *opt = &qdisc_notify->u.prio_qdisc_opt;
+	struct prio_sched_data *priv = qdisc_priv(sch);
+
+	memcpy(&opt->priomap, priv->prio2band, TC_PRIO_MAX+1);
+	opt->bands = priv->bands;
+	qdisc_notify->type = TC_EVENT_PRIO;
+}
+
static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct prio_sched_data *q = qdisc_priv(sch);
@@ -417,6 +429,12 @@ static int prio_delete(struct Qdisc *sch, unsigned long cl)
return 0;
}
+/*
+ * Prio Classifier Notifier.
+ * Only tags the notification as TC_EVENT_PRIO; sch and cl are unused and no
+ * per-class payload is written.
+ */
+static void prio_notify_cls(struct Qdisc *sch, unsigned long cl,
+ struct tc_cls_notify_s *cls_notify)
+{
+ cls_notify->type = TC_EVENT_PRIO;
+}
static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb,
struct tcmsg *tcm)
@@ -488,6 +506,7 @@ static const struct Qdisc_class_ops prio_class_ops = {
.unbind_tcf = prio_put,
.dump = prio_dump_class,
.dump_stats = prio_dump_class_stats,
+ .notify_cls = prio_notify_cls,
};
static struct Qdisc_ops prio_qdisc_ops __read_mostly = {
@@ -504,6 +523,7 @@ static struct Qdisc_ops prio_qdisc_ops __read_mostly = {
.destroy = prio_destroy,
.change = prio_tune,
.dump = prio_dump,
+ .notify_qdisc = prio_notify_qdisc,
.owner = THIS_MODULE,
};
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 3dcd493..4542f88 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -21,7 +21,7 @@
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/red.h>
-
+#include <net/sch_notify.h>
/* Parameters, settable by user:
-----------------------------
@@ -297,6 +297,31 @@ static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
return gnet_stats_copy_app(d, &st, sizeof(st));
}
+/*
+ * RED qdisc notifier fill: report the RED parameters of @sch in the
+ * red_qdisc_opt member, tagged TC_EVENT_RED.  The two thresholds are
+ * reported shifted right by Wlog.
+ */
+static void red_notify_qdisc(struct Qdisc *sch,
+			     struct qdisc_notify_s *qdisc_notify)
+{
+	struct tc_red_qopt *opt = &qdisc_notify->u.red_qdisc_opt;
+	struct red_sched_data *priv = qdisc_priv(sch);
+
+	qdisc_notify->type = TC_EVENT_RED;
+
+	opt->limit = priv->limit;
+	opt->flags = priv->flags;
+
+	opt->qth_min = priv->parms.qth_min >> priv->parms.Wlog;
+	opt->qth_max = priv->parms.qth_max >> priv->parms.Wlog;
+
+	opt->Wlog = priv->parms.Wlog;
+	opt->Plog = priv->parms.Plog;
+	opt->Scell_log = priv->parms.Scell_log;
+}
+
+/*
+ * RED Classifier Notifier dump.
+ * Only tags the notification as TC_EVENT_RED; sch and cl are unused and no
+ * per-class payload is written.
+ */
+static void red_notify_cls(struct Qdisc *sch, unsigned long cl,
+ struct tc_cls_notify_s *cls_notify)
+{
+ cls_notify->type = TC_EVENT_RED;
+}
+
static int red_dump_class(struct Qdisc *sch, unsigned long cl,
struct sk_buff *skb, struct tcmsg *tcm)
{
@@ -379,6 +404,7 @@ static const struct Qdisc_class_ops red_class_ops = {
.walk = red_walk,
.tcf_chain = red_find_tcf,
.dump = red_dump_class,
+ .notify_cls = red_notify_cls,
};
static struct Qdisc_ops red_qdisc_ops __read_mostly = {
@@ -394,6 +420,7 @@ static struct Qdisc_ops red_qdisc_ops __read_mostly = {
.destroy = red_destroy,
.change = red_change,
.dump = red_dump,
+ .notify_qdisc = red_notify_qdisc,
.dump_stats = red_dump_stats,
.owner = THIS_MODULE,
};
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index a20e2ef..b7c4df7 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -23,6 +23,7 @@
#include <net/ip.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/sch_notify.h>
/* Stochastic Fairness Queuing algorithm.
@@ -524,6 +525,19 @@ static void sfq_destroy(struct Qdisc *sch)
del_timer(&q->perturb_timer);
}
+/*
+ * SFQ qdisc notifier fill: report the SFQ parameters of @sch in the
+ * sfq_qdisc_opt member, tagged TC_EVENT_SFQ.  The perturbation period is
+ * divided by HZ, and both 'limit' and 'flows' carry q->limit.
+ */
+static void sfq_notify_qdisc(struct Qdisc *sch,
+			     struct qdisc_notify_s *qdisc_notify)
+{
+	struct tc_sfq_qopt *opt = &qdisc_notify->u.sfq_qdisc_opt;
+	struct sfq_sched_data *priv = qdisc_priv(sch);
+
+	qdisc_notify->type = TC_EVENT_SFQ;
+
+	opt->quantum = priv->quantum;
+	opt->perturb_period = priv->perturb_period/HZ;
+	opt->divisor = SFQ_HASH_DIVISOR;
+	opt->limit = priv->limit;
+	opt->flows = priv->limit;
+}
+
static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct sfq_sched_data *q = qdisc_priv(sch);
@@ -630,6 +644,7 @@ static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
.destroy = sfq_destroy,
.change = NULL,
.dump = sfq_dump,
+ .notify_qdisc = sfq_notify_qdisc,
.owner = THIS_MODULE,
};
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 0b7d78f..0d1d2d6 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -20,7 +20,7 @@
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
-
+#include <net/sch_notify.h>
/* Simple Token Bucket Filter.
=======================================
@@ -377,6 +377,31 @@ static void tbf_destroy(struct Qdisc *sch)
qdisc_destroy(q->qdisc);
}
+/*
+ * TBF class notifier.
+ * Only tags the notification as TC_EVENT_TBF; sch and cl are unused and no
+ * per-class payload is written.
+ */
+static void tbf_notify_cls(struct Qdisc *sch, unsigned long cl,
+ struct tc_cls_notify_s *cls_notify)
+{
+ cls_notify->type = TC_EVENT_TBF;
+}
+
+/*
+ * TBF qdisc notifier fill: report limit, rate, peak rate, mtu and buffer of
+ * @sch in the tbf_qdisc_opt member, tagged TC_EVENT_TBF.  Without a peak
+ * rate table the peak rate is reported as all zeroes.
+ */
+static void tbf_notify_qdisc(struct Qdisc *sch,
+			     struct qdisc_notify_s *qdisc_notify)
+{
+	struct tc_tbf_qopt *opt = &qdisc_notify->u.tbf_qdisc_opt;
+	struct tbf_sched_data *priv = qdisc_priv(sch);
+
+	qdisc_notify->type = TC_EVENT_TBF;
+
+	opt->limit = priv->limit;
+	opt->mtu = priv->mtu;
+	opt->buffer = priv->buffer;
+	opt->rate = priv->R_tab->rate;
+
+	if (!priv->P_tab)
+		memset(&opt->peakrate, 0, sizeof(opt->peakrate));
+	else
+		opt->peakrate = priv->P_tab->rate;
+}
+
static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct tbf_sched_data *q = qdisc_priv(sch);
@@ -490,6 +515,7 @@ static const struct Qdisc_class_ops tbf_class_ops =
.walk = tbf_walk,
.tcf_chain = tbf_find_tcf,
.dump = tbf_dump_class,
+ .notify_cls = tbf_notify_cls,
};
static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
@@ -506,6 +532,7 @@ static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
.destroy = tbf_destroy,
.change = tbf_change,
.dump = tbf_dump,
+ .notify_qdisc = tbf_notify_qdisc,
.owner = THIS_MODULE,
};
--
1.5.3.3
^ permalink raw reply related
* RE: cgroup: limit network bandwidth
From: Denis V. Lunev @ 2008-02-08 7:52 UTC (permalink / raw)
To: righiandr
Cc: Linux Netdev List, Balbir Singh, Naveen Gupta, Paul Menage,
Linux Containers
Hello, Andrea!
I happened to see your patch on LWN (I missed it on netdev@) and
have a few words about it. Maybe this is not too late. I missed my
entire mailbox yesterday and have not followed the discussion. Please
forgive me.
Rate-limiting message receive is not good at all. First, if we talk
about i386, the most important resource is low memory. There is no more
than 1 GB of it. You suggest keeping it in use for longer than usual, and
this will not reduce network traffic to the node in the UDP case.
For TCP the situation is slightly better, but not by much. For the
case of a rather slow group with big traffic you will just eat 64 KB *
Nsockets of receive buffers.
So resource usage simply increases in this case, which is unfortunate. To
rate-limit properly you need to account for the memory used:
- drop incoming packets early for UDP
- manage the TCP window based on the buffer memory used by the cgroup
Regards,
Den
^ permalink raw reply
* Re: [IPSEC] flow: reorder "struct flow_cache_entry" and remove SLAB_HWCACHE_ALIGN
From: David Miller @ 2008-02-08 7:31 UTC (permalink / raw)
To: dada1; +Cc: netdev
In-Reply-To: <47AC00F1.9090308@cosmosbay.com>
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Fri, 08 Feb 2008 08:12:49 +0100
> 1) We can shrink sizeof(struct flow_cache_entry) by 8 bytes on 64bit arches.
> 2) No need to align these structures to hardware cache lines, this only wastes
> RAM for very little gain.
>
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Applied, thanks Eric.
^ permalink raw reply
* Re: [DECNET] ROUTE: remove unecessary alignment
From: David Miller @ 2008-02-08 7:30 UTC (permalink / raw)
To: dada1; +Cc: netdev
In-Reply-To: <47AC02E2.1070506@cosmosbay.com>
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Fri, 08 Feb 2008 08:21:06 +0100
> Same alignment requirement was removed on IP route cache in the past.
>
> This alignment actually has a bad effect on 32-bit arches, uniprocessor, since
> sizeof(dn_rt_hash_bucket) is forced to 8 bytes instead of 4.
>
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Applied, anyone who shows love for DecNET deserves some love
in return :-)
^ permalink raw reply
* [DECNET] ROUTE: remove unecessary alignment
From: Eric Dumazet @ 2008-02-08 7:21 UTC (permalink / raw)
To: David S. Miller; +Cc: Linux Netdev List
[-- Attachment #1: Type: text/plain, Size: 261 bytes --]
Same alignment requirement was removed on IP route cache in the past.
This alignment actually has a bad effect on 32-bit arches, uniprocessor, since
sizeof(dn_rt_hash_bucket) is forced to 8 bytes instead of 4.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
[-- Attachment #2: dn_route.patch --]
[-- Type: text/plain, Size: 323 bytes --]
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 31be29b..9dc0abb 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -94,7 +94,7 @@ struct dn_rt_hash_bucket
{
struct dn_route *chain;
spinlock_t lock;
-} __attribute__((__aligned__(8)));
+};
extern struct neigh_table dn_neigh_table;
^ permalink raw reply related
* Re: [Bugme-new] [Bug 9914] New: bnx2 driver of latest kernel 2.6.24 not working with Cisco catalyst 650x Switch
From: Andrew Morton @ 2008-02-08 7:18 UTC (permalink / raw)
To: Michael Chan, netdev; +Cc: bugme-daemon
In-Reply-To: <bug-9914-10286@http.bugzilla.kernel.org/>
On Thu, 7 Feb 2008 23:06:55 -0800 (PST) bugme-daemon@bugzilla.kernel.org wrote:
> http://bugzilla.kernel.org/show_bug.cgi?id=9914
>
> Summary: bnx2 driver of latest kernel 2.6.24 not working with
> Cisco catalyst 650x Switch
> Product: Drivers
> Version: 2.5
> KernelVersion: 2.6.24
> Platform: All
> OS/Version: Linux
> Tree: Mainline
> Status: NEW
> Severity: normal
> Priority: P1
> Component: Network
> AssignedTo: jgarzik@pobox.com
> ReportedBy: kaccountsend@gmail.com
>
>
> Latest working kernel version: (Works with Redhat 4 kernel)
> Earliest failing kernel version: 2.6.24
> Distribution: Self, kernel.org's kernel version 2.6.24
> Hardware Environment: Dell 2950
> Software Environment: Linux from Scratch, kernel version 2.6.24
>
> Problem Description:
> I am using the latest kernel 2.6.24. The hardware unit is a 2950 Dell box.
> The firmware version of bnx2 is 2.9.1. The bnx2 driver does work with other
> flavors of Cisco Switches like 290x. It is having problems at a customer site
> who has a Cisco 650x.
>
> The customer tried the RHEL 4 and does seem to work fine. The RHEL4 comes with
> the version of bnx2: 1.4.38
>
> The version on 2.6.24 is 1.6.9.
>
> Can anyone give me pointers as to why this is broken? Where can I get a patch?
>
> Any help is appreciated.
>
>
>
>
> Steps to reproduce:
> The driver does not auto-negotiate, neither does it work with a fixed link
> speed.
>
^ permalink raw reply
* [IPSEC] flow: reorder "struct flow_cache_entry" and remove SLAB_HWCACHE_ALIGN
From: Eric Dumazet @ 2008-02-08 7:12 UTC (permalink / raw)
To: David S. Miller; +Cc: Linux Netdev List
[-- Attachment #1: Type: text/plain, Size: 235 bytes --]
1) We can shrink sizeof(struct flow_cache_entry) by 8 bytes on 64bit arches.
2) No need to align these structures to hardware cache lines, this only wastes
RAM for very little gain.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
[-- Attachment #2: flow.patch --]
[-- Type: text/plain, Size: 610 bytes --]
diff --git a/net/core/flow.c b/net/core/flow.c
index 9cfe845..a77531c 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -30,8 +30,8 @@ struct flow_cache_entry {
struct flow_cache_entry *next;
u16 family;
u8 dir;
- struct flowi key;
u32 genid;
+ struct flowi key;
void *object;
atomic_t *object_ref;
};
@@ -346,7 +346,7 @@ static int __init flow_cache_init(void)
flow_cachep = kmem_cache_create("flow_cache",
sizeof(struct flow_cache_entry),
- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ 0, SLAB_PANIC,
NULL);
flow_hash_shift = 10;
flow_lwm = 2 * flow_hash_size;
^ permalink raw reply related
* Re: [PATCH] Add IPv6 support to TCP SYN cookies
From: Glenn Griffin @ 2008-02-08 5:49 UTC (permalink / raw)
To: Glenn Griffin
Cc: Eric Dumazet, Evgeniy Polyakov, Alan Cox, Andi Kleen, netdev,
linux-kernel
In-Reply-To: <47abe7b1.20588c0a.7013.75b7@mx.google.com>
Updated to incorporate Eric's suggestion of using a per cpu buffer
rather than allocating on the stack. Just a two-line change, but I will
resend it in its entirety.
Signed-off-by: Glenn Griffin <ggriffin.kernel@gmail.com>
---
include/net/tcp.h | 8 ++
net/ipv4/syncookies.c | 7 +-
net/ipv4/tcp_input.c | 1 +
net/ipv4/tcp_minisocks.c | 2 +
net/ipv4/tcp_output.c | 1 +
net/ipv6/Makefile | 1 +
net/ipv6/syncookies.c | 267 ++++++++++++++++++++++++++++++++++++++++++++++
net/ipv6/tcp_ipv6.c | 77 ++++++++++----
8 files changed, 338 insertions(+), 26 deletions(-)
create mode 100644 net/ipv6/syncookies.c
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7de4ea3..c428ec7 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -29,6 +29,7 @@
#include <linux/skbuff.h>
#include <linux/dmaengine.h>
#include <linux/crypto.h>
+#include <linux/cryptohash.h>
#include <net/inet_connection_sock.h>
#include <net/inet_timewait_sock.h>
@@ -434,11 +435,17 @@ extern int tcp_disconnect(struct sock *sk, int flags);
extern void tcp_unhash(struct sock *sk);
/* From syncookies.c */
+extern __u32 syncookie_secret[2][16-3+SHA_DIGEST_WORDS];
extern struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
struct ip_options *opt);
extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb,
__u16 *mss);
+/* From net/ipv6/syncookies.c */
+extern struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb);
+extern __u32 cookie_v6_init_sequence(struct sock *sk, struct sk_buff *skb,
+ __u16 *mss);
+
/* tcp_output.c */
extern void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
@@ -1332,6 +1339,7 @@ extern int tcp_proc_register(struct tcp_seq_afinfo *afinfo);
extern void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo);
extern struct request_sock_ops tcp_request_sock_ops;
+extern struct request_sock_ops tcp6_request_sock_ops;
extern int tcp_v4_destroy_sock(struct sock *sk);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index f470fe4..cc6637b 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -10,8 +10,6 @@
* 2 of the License, or (at your option) any later version.
*
* $Id: syncookies.c,v 1.18 2002/02/01 22:01:04 davem Exp $
- *
- * Missing: IPv6 support.
*/
#include <linux/tcp.h>
@@ -23,14 +21,15 @@
extern int sysctl_tcp_syncookies;
-static __u32 syncookie_secret[2][16-3+SHA_DIGEST_WORDS];
+__u32 syncookie_secret[2][16-3+SHA_DIGEST_WORDS];
+EXPORT_SYMBOL(syncookie_secret);
static __init int init_syncookies(void)
{
get_random_bytes(syncookie_secret, sizeof(syncookie_secret));
return 0;
}
-module_init(init_syncookies);
+__initcall(init_syncookies);
#define COOKIEBITS 24 /* Upper bits store count */
#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 19c449f..93e128c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5326,6 +5326,7 @@ discard:
EXPORT_SYMBOL(sysctl_tcp_ecn);
EXPORT_SYMBOL(sysctl_tcp_reordering);
+EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
EXPORT_SYMBOL(tcp_parse_options);
EXPORT_SYMBOL(tcp_rcv_established);
EXPORT_SYMBOL(tcp_rcv_state_process);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b61b768..0f494cd 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -35,6 +35,8 @@
#endif
int sysctl_tcp_syncookies __read_mostly = SYNC_INIT;
+EXPORT_SYMBOL(sysctl_tcp_syncookies);
+
int sysctl_tcp_abort_on_overflow __read_mostly;
struct inet_timewait_death_row tcp_death_row = {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index ed750f9..cbfef8b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2560,6 +2560,7 @@ void tcp_send_probe0(struct sock *sk)
}
}
+EXPORT_SYMBOL(tcp_select_initial_window);
EXPORT_SYMBOL(tcp_connect);
EXPORT_SYMBOL(tcp_make_synack);
EXPORT_SYMBOL(tcp_simple_retransmit);
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 24f3aa0..ae14617 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -16,6 +16,7 @@ ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \
ipv6-$(CONFIG_NETFILTER) += netfilter.o
ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o
ipv6-$(CONFIG_PROC_FS) += proc.o
+ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o
ipv6-objs += $(ipv6-y)
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
new file mode 100644
index 0000000..063dade
--- /dev/null
+++ b/net/ipv6/syncookies.c
@@ -0,0 +1,267 @@
+/*
+ * IPv6 Syncookies implementation for the Linux kernel
+ *
+ * Authors:
+ * Glenn Griffin <ggriffin.kernel@gmail.com>
+ *
+ * Based on IPv4 implementation by Andi Kleen
+ * linux/net/ipv4/syncookies.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/tcp.h>
+#include <linux/random.h>
+#include <linux/cryptohash.h>
+#include <linux/kernel.h>
+#include <net/ipv6.h>
+#include <net/tcp.h>
+
+extern int sysctl_tcp_syncookies;
+extern __u32 syncookie_secret[2][16-3+SHA_DIGEST_WORDS];
+
+#define COOKIEBITS 24 /* Upper bits store count */
+#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
+
+/*
+ * This table has to be sorted and terminated with (__u16)-1.
+ * XXX generate a better table.
+ * Unresolved Issues: HIPPI with a 64k MSS is not well supported.
+ *
+ * Taken directly from ipv4 implementation.
+ * Should this list be modified for ipv6 use or is it close enough?
+ * rfc 2460 8.3 suggests mss values 20 bytes less than ipv4 counterpart
+ */
+static __u16 const msstab[] = {
+ 64 - 1,
+ 256 - 1,
+ 512 - 1,
+ 536 - 1,
+ 1024 - 1,
+ 1440 - 1,
+ 1460 - 1,
+ 4312 - 1,
+ (__u16)-1
+};
+/* The number doesn't include the -1 terminator */
+#define NUM_MSS (ARRAY_SIZE(msstab) - 1)
+
+/*
+ * This (misnamed) value is the age of syncookie which is permitted.
+ * Its ideal value should be dependent on TCP_TIMEOUT_INIT and
+ * sysctl_tcp_retries1. It's a rather complicated formula (exponential
+ * backoff) to compute at runtime so it's currently hardcoded here.
+ */
+#define COUNTER_TRIES 4
+
+static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct sock *child;
+
+ child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
+ if (child)
+ inet_csk_reqsk_queue_add(sk, req, child);
+ else
+ reqsk_free(req);
+
+ return child;
+}
+
+static DEFINE_PER_CPU(__u32, cookie_scratch)[16 + 5 + SHA_WORKSPACE_WORDS];
+
+static u32 cookie_hash(struct in6_addr *saddr, struct in6_addr *daddr,
+ __be16 sport, __be16 dport, u32 count, int c)
+{
+ __u32 *tmp = __get_cpu_var(cookie_scratch);
+
+ /*
+ * we have 320 bits of information to hash, copy in the remaining
+ * 192 bits required for sha_transform, from the syncookie_secret
+ * and overwrite the digest with the secret
+ */
+ memcpy(tmp + 10, syncookie_secret[c], 44);
+ memcpy(tmp, saddr, 16);
+ memcpy(tmp + 4, daddr, 16);
+ tmp[8] = ((__force u32)sport << 16) + (__force u32)dport;
+ tmp[9] = count;
+ sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
+
+ return tmp[17];
+}
+
+static __u32 secure_tcp_syn_cookie(struct in6_addr *saddr, struct in6_addr *daddr,
+ __be16 sport, __be16 dport, __u32 sseq,
+ __u32 count, __u32 data)
+{
+ return (cookie_hash(saddr, daddr, sport, dport, 0, 0) +
+ sseq + (count << COOKIEBITS) +
+ ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data)
+ & COOKIEMASK));
+}
+
+static __u32 check_tcp_syn_cookie(__u32 cookie, struct in6_addr *saddr,
+ struct in6_addr *daddr, __be16 sport,
+ __be16 dport, __u32 sseq, __u32 count,
+ __u32 maxdiff)
+{
+ __u32 diff;
+
+ cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq;
+
+ diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS);
+ if (diff >= maxdiff)
+ return (__u32)-1;
+
+ return (cookie -
+ cookie_hash(saddr, daddr, sport, dport, count - diff, 1))
+ & COOKIEMASK;
+}
+
+__u32 cookie_v6_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
+{
+ struct ipv6hdr *iph = ipv6_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
+ int mssind;
+ const __u16 mss = *mssp;
+
+ tcp_sk(sk)->last_synq_overflow = jiffies;
+
+ for (mssind = 0; mss > msstab[mssind + 1]; mssind++)
+ ;
+ *mssp = msstab[mssind] + 1;
+
+ NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESSENT);
+
+ return secure_tcp_syn_cookie(&iph->saddr, &iph->daddr, th->source,
+ th->dest, ntohl(th->seq),
+ jiffies / (HZ * 60), mssind);
+}
+
+static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
+{
+ struct ipv6hdr *iph = ipv6_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
+ __u32 seq = ntohl(th->seq) - 1;
+ __u32 mssind = check_tcp_syn_cookie(cookie, &iph->saddr, &iph->daddr,
+ th->source, th->dest, seq,
+ jiffies / (HZ * 60), COUNTER_TRIES);
+
+ return mssind < NUM_MSS ? msstab[mssind] + 1 : 0;
+}
+
+struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
+{
+ struct inet_request_sock *ireq;
+ struct inet6_request_sock *ireq6;
+ struct tcp_request_sock *treq;
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcphdr *th = tcp_hdr(skb);
+ __u32 cookie = ntohl(th->ack_seq) - 1;
+ struct sock *ret = sk;
+ struct request_sock *req;
+ int mss;
+ struct dst_entry *dst;
+ __u8 rcv_wscale;
+
+ if (!sysctl_tcp_syncookies || !th->ack)
+ goto out;
+
+ if (time_after(jiffies, tp->last_synq_overflow + TCP_TIMEOUT_INIT) ||
+ (mss = cookie_check(skb, cookie)) == 0) {
+ NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESFAILED);
+ goto out;
+ }
+
+ NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESRECV);
+
+ ret = NULL;
+ req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
+ if (!req)
+ goto out;
+
+ ireq = inet_rsk(req);
+ ireq6 = inet6_rsk(req);
+ treq = tcp_rsk(req);
+ ireq6->pktopts = NULL;
+
+ if (security_inet_conn_request(sk, skb, req)) {
+ reqsk_free(req);
+ goto out;
+ }
+
+ req->mss = mss;
+ ireq->rmt_port = th->source;
+ ipv6_addr_copy(&ireq6->rmt_addr, &ipv6_hdr(skb)->saddr);
+ ipv6_addr_copy(&ireq6->loc_addr, &ipv6_hdr(skb)->daddr);
+ if (ipv6_opt_accepted(sk, skb) ||
+ np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
+ np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
+ atomic_inc(&skb->users);
+ ireq6->pktopts = skb;
+ }
+
+ ireq6->iif = sk->sk_bound_dev_if;
+ /* So that link locals have meaning */
+ if (!sk->sk_bound_dev_if &&
+ ipv6_addr_type(&ireq6->rmt_addr) & IPV6_ADDR_LINKLOCAL)
+ ireq6->iif = inet6_iif(skb);
+
+ req->expires = 0UL;
+ req->retrans = 0;
+ ireq->snd_wscale = ireq->rcv_wscale = ireq->tstamp_ok = 0;
+ ireq->wscale_ok = ireq->sack_ok = 0;
+ treq->rcv_isn = ntohl(th->seq) - 1;
+ treq->snt_isn = cookie;
+
+ /*
+ * We need to lookup the dst_entry to get the correct window size.
+ * This is taken from tcp_v6_syn_recv_sock. Somebody please enlighten
+ * me if there is a preferred way.
+ */
+ {
+ struct in6_addr *final_p = NULL, final;
+ struct flowi fl;
+ memset(&fl, 0, sizeof(fl));
+ fl.proto = IPPROTO_TCP;
+ ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
+ if (np->opt && np->opt->srcrt) {
+ struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
+ ipv6_addr_copy(&final, &fl.fl6_dst);
+ ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
+ final_p = &final;
+ }
+ ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
+ fl.oif = sk->sk_bound_dev_if;
+ fl.fl_ip_dport = inet_rsk(req)->rmt_port;
+ fl.fl_ip_sport = inet_sk(sk)->sport;
+ security_req_classify_flow(req, &fl);
+ if (ip6_dst_lookup(sk, &dst, &fl)) {
+ reqsk_free(req);
+ goto out;
+ }
+ if (final_p)
+ ipv6_addr_copy(&fl.fl6_dst, final_p);
+ if ((xfrm_lookup(&dst, &fl, sk, 0)) < 0)
+ goto out;
+ }
+
+ req->window_clamp = dst_metric(dst, RTAX_WINDOW);
+ tcp_select_initial_window(tcp_full_space(sk), req->mss,
+ &req->rcv_wnd, &req->window_clamp,
+ 0, &rcv_wscale);
+
+ ireq->rcv_wscale = rcv_wscale;
+
+ ret = get_cookie_sock(sk, skb, req, dst);
+
+out: return ret;
+}
+
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 12750f2..1f4e544 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -514,6 +514,20 @@ done:
return err;
}
+static inline void syn_flood_warning(struct sk_buff *skb)
+{
+#ifdef CONFIG_SYN_COOKIES
+ if (sysctl_tcp_syncookies)
+ printk(KERN_INFO
+ "TCPv6: Possible SYN flooding on port %d. "
+ "Sending cookies.\n", ntohs(tcp_hdr(skb)->dest));
+ else
+#endif
+ printk(KERN_INFO
+ "TCPv6: Possible SYN flooding on port %d. "
+ "Dropping request.\n", ntohs(tcp_hdr(skb)->dest));
+}
+
static void tcp_v6_reqsk_destructor(struct request_sock *req)
{
if (inet6_rsk(req)->pktopts)
@@ -917,7 +931,7 @@ done_opts:
}
#endif
-static struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
+struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
.family = AF_INET6,
.obj_size = sizeof(struct tcp6_request_sock),
.rtx_syn_ack = tcp_v6_send_synack,
@@ -1215,9 +1229,9 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
return NULL;
}
-#if 0 /*def CONFIG_SYN_COOKIES*/
+#ifdef CONFIG_SYN_COOKIES
if (!th->rst && !th->syn && th->ack)
- sk = cookie_v6_check(sk, skb, &(IPCB(skb)->opt));
+ sk = cookie_v6_check(sk, skb);
#endif
return sk;
}
@@ -1233,6 +1247,11 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
struct tcp_sock *tp = tcp_sk(sk);
struct request_sock *req = NULL;
__u32 isn = TCP_SKB_CB(skb)->when;
+#ifdef CONFIG_SYN_COOKIES
+ int want_cookie = 0;
+#else
+#define want_cookie 0
+#endif
if (skb->protocol == htons(ETH_P_IP))
return tcp_v4_conn_request(sk, skb);
@@ -1240,12 +1259,14 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
if (!ipv6_unicast_destination(skb))
goto drop;
- /*
- * There are no SYN attacks on IPv6, yet...
- */
if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
if (net_ratelimit())
- printk(KERN_INFO "TCPv6: dropping request, synflood is possible\n");
+ syn_flood_warning(skb);
+#ifdef CONFIG_SYN_COOKIES
+ if (sysctl_tcp_syncookies)
+ want_cookie = 1;
+ else
+#endif
goto drop;
}
@@ -1266,29 +1287,39 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
tcp_parse_options(skb, &tmp_opt, 0);
+ if (want_cookie) {
+ tcp_clear_options(&tmp_opt);
+ tmp_opt.saw_tstamp = 0;
+ }
+
tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
tcp_openreq_init(req, &tmp_opt, skb);
treq = inet6_rsk(req);
ipv6_addr_copy(&treq->rmt_addr, &ipv6_hdr(skb)->saddr);
ipv6_addr_copy(&treq->loc_addr, &ipv6_hdr(skb)->daddr);
- TCP_ECN_create_request(req, tcp_hdr(skb));
treq->pktopts = NULL;
- if (ipv6_opt_accepted(sk, skb) ||
- np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
- np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
- atomic_inc(&skb->users);
- treq->pktopts = skb;
- }
- treq->iif = sk->sk_bound_dev_if;
+ if (!want_cookie)
+ TCP_ECN_create_request(req, tcp_hdr(skb));
+
+ if (want_cookie) {
+ isn = cookie_v6_init_sequence(sk, skb, &req->mss);
+ } else if (!isn) {
+ if (ipv6_opt_accepted(sk, skb) ||
+ np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
+ np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
+ atomic_inc(&skb->users);
+ treq->pktopts = skb;
+ }
+ treq->iif = sk->sk_bound_dev_if;
- /* So that link locals have meaning */
- if (!sk->sk_bound_dev_if &&
- ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL)
- treq->iif = inet6_iif(skb);
+ /* So that link locals have meaning */
+ if (!sk->sk_bound_dev_if &&
+ ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL)
+ treq->iif = inet6_iif(skb);
- if (isn == 0)
isn = tcp_v6_init_sequence(skb);
+ }
tcp_rsk(req)->snt_isn = isn;
@@ -1297,8 +1328,10 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
if (tcp_v6_send_synack(sk, req, NULL))
goto drop;
- inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
- return 0;
+ if (!want_cookie) {
+ inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+ return 0;
+ }
drop:
if (req)
--
1.5.3.4
^ permalink raw reply related
* Re: [PATCH 2/3] partial checksum and GSO support for tun/tap.
From: Max Krasnyansky @ 2008-02-08 5:39 UTC (permalink / raw)
To: Rusty Russell; +Cc: netdev, Herbert Xu, virtualization
In-Reply-To: <200801240110.45178.rusty@rustcorp.com.au>
Rusty Russell wrote:
> (Changes since last time: we how have explicit IFF_RECV_CSUM and
> IFF_RECV_GSO bits, and some renaming of virtio_net hdr)
>
> We use the virtio_net_hdr: it is an ABI already and designed to
> encapsulate such metadata as GSO and partial checksums.
>
> IFF_VIRTIO_HDR means you will write and read a 'struct virtio_net_hdr'
> at the start of each packet. You can always write packets with
> partial checksum and gso to the tap device using this header.
>
> IFF_RECV_CSUM means you can handle reading packets with partial
> checksums. If IFF_RECV_GSO is also set, it means you can handle
> reading (all types of) GSO packets.
>
> Note that there is no easy way to detect if these flags are supported:
> see next patch.
Again sorry for delay in replying. Here are my thoughts on this.
I like the approach in general. Certainly the part that creates skbs out of the user-space
pages looks good. And it fits nicely into the existing TUN driver model.
However I actually wanted to change the model :). In particular I'm talking about
"syscall per packet"
After messing around with things like libe1000.sf.net I'd like to make the TUN/TAP driver look
more like modern NICs to user-space. In other words I'm thinking about introducing RX and
TX rings that user-space can then mmap() and write/read packet descriptors to/from.
That will reduce the number of system calls that the user-space app needs to do. That by
itself saves a lot of overhead; combined with GSO it'll be lightning fast.
I'm going to send you a version that I cooked up awhile ago in a private email. Do not want
to spam netdev :). It's not quite the RX/TX ring model but it'll give you an idea.
I did some profiling, and the PPS (packets per second) numbers that user-space can handle literally
skyrocketed.
btw We had a long discussion with Eugeniy Polakov on mapping user-pages vs mmap()ing a large
kernel buffer and doing a normal memcpy() (i.e. instead of copy_to/from_user()) in the kernel.
On small packets the overhead of get_user_pages() eats up all the benefits. So we should think
of some scheme that nicely combines the two. Kind of like the "copy break" that the latest net
drivers do these days.
Also btw why call it VIRTIO? For example I'm actually interested in speeding up tunneling
and general network apps. We have wireless basestation apps here that need to handle packets
in user-space. Those kinds of things have nothing to do with virtualization.
Max
^ permalink raw reply
* Re: [Bugme-new] [Bug 9888] New: tun device without protocol info header fails under IPv6
From: Max Krasnyansky @ 2008-02-08 4:58 UTC (permalink / raw)
To: Andrew Morton; +Cc: steve.zabele, bugme-daemon, netdev
In-Reply-To: <20080204145304.f4921bf4.akpm@linux-foundation.org>
Andrew Morton wrote:
> On Mon, 4 Feb 2008 13:46:13 -0800 (PST)
> bugme-daemon@bugzilla.kernel.org wrote:
>>
>> Open a tun device as type TUN, set the TUN_NO_PI flag, and try sending an IPv6
>> packet. The packet appears at the interface under tcpdumps, but propagates no
>> further. This is because the default protocol info used for tun devices where
>> the TUN_NO_PI flag is set assumes IPv4 as can be seen by the initialization at
>> the top of the tun_get_user function in drivers/net/tun.c file given by
>>
>> struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) };
>>
>> This can easily be fixed by adding a quick check at the top of tun_get_user.
>> Basically the code that used to read
>>
>> if (!(tun->flags & TUN_NO_PI)) {
>> if ((len -= sizeof(pi)) > count)
>> return -EINVAL;
>>
>> if(memcpy_fromiovec((void *)&pi, iv, sizeof(pi)))
>> return -EFAULT;
>> }
>>
>> when changed to read
>>
>> if (!(tun->flags & TUN_NO_PI)) {
>> if ((len -= sizeof(pi)) > count)
>> return -EINVAL;
>>
>> if(memcpy_fromiovec((void *)&pi, iv, sizeof(pi)))
>> return -EFAULT;
>> }
>> else {
>> /* Fixup default pi if IPv6 rather than IPv4 */
>> if (((tun->flags & TUN_TYPE_MASK) == TUN_TUN_DEV) &&
>> (*(char *)(iv->iov_base) == 0x60)) {
>> pi.proto = __constant_htons(ETH_P_IPV6);
>> }
>> }
>>
>> fixes the problem.
>>
>> How do we get this in as part of the maintained codebase??
>>
>
> Please email a tested patch prepared as described in
>
> Documentation/SubmittingPatches
> Documentation/SubmitChecklist
> http://www.zip.com.au/~akpm/linux/patches/stuff/tpp.txt
>
> to
>
> Maxim Krasnyansky <maxk@qualcomm.com>
> "David S. Miller" <davem@davemloft.net>
> Andrew Morton <akpm@linux-foundation.org>
> netdev@vger.kernel.org
btw I'd be ok with this fix. But I guess the question is why not use
struct tun_pi in the apps instead?
Max
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox