* Re: [PATCH] vlan_dev: VLAN 0 should be treated as "no vlan tag" (802.1p packet)
From: Pedro Garcia @ 2010-07-01 18:47 UTC (permalink / raw)
To: netdev; +Cc: Patrick McHardy, Ben Hutchings, Eric Dumazet, David Miller
In-Reply-To: <20100630.131616.233403329.davem@davemloft.net>
The patch with the modifications suggested by David.
- Without the 8021q module loaded in the kernel, all 802.1p packets
(VLAN 0 but QoS tagging) are silently discarded (as expected, as
the protocol is not loaded).
- Without this patch in 8021q module, these packets are forwarded to
the module, but they are discarded also if VLAN 0 is not configured,
which should not be the default behaviour, as VLAN 0 is not really
a VLANed packet but a 802.1p packet. Defining VLAN 0 makes it almost
impossible to communicate with mixed 802.1p and non 802.1p devices on
the same network due to arp table issues.
- Changed logic to skip vlan specific code in vlan_skb_recv if VLAN
is 0 and we have not defined a VLAN with ID 0, but we accept the
packet with the encapsulated proto and pass it later to netif_rx.
- In the vlan device event handler, added some logic to add VLAN 0
to HW filter in devices that support it (this prevented any traffic
in VLAN 0 to reach the stack in e1000e with HW filter under 2.6.35,
and probably also with other HW filtered cards, so we fix it here).
- In the vlan unregister logic, prevent the elimination of VLAN 0
in devices with HW filter.
- The default behaviour is to ignore the VLAN 0 tagging and accept
the packet as if it was not tagged, but we can still define a
VLAN 0 if desired (so it is backwards compatible).
Signed-off-by: Pedro Garcia <pedro.netdev@dondevamos.com>
--
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 3c1c8c1..a2ad152 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -155,9 +155,10 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
BUG_ON(!grp);
/* Take it out of our own structures, but be sure to interlock with
- * HW accelerating devices or SW vlan input packet processing.
+ * HW accelerating devices or SW vlan input packet processing if
+ * VLAN is not 0 (leave it there for 802.1p).
*/
- if (real_dev->features & NETIF_F_HW_VLAN_FILTER)
+ if (vlan_id && (real_dev->features & NETIF_F_HW_VLAN_FILTER))
ops->ndo_vlan_rx_kill_vid(real_dev, vlan_id);
grp->nr_vlans--;
@@ -419,6 +420,14 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
if (is_vlan_dev(dev))
__vlan_device_event(dev, event);
+ if ((event == NETDEV_UP) &&
+ (dev->features & NETIF_F_HW_VLAN_FILTER) &&
+ dev->netdev_ops->ndo_vlan_rx_add_vid) {
+ pr_info("8021q: adding VLAN 0 to HW filter on device %s\n",
+ dev->name);
+ dev->netdev_ops->ndo_vlan_rx_add_vid(dev, 0);
+ }
+
grp = __vlan_find_group(dev);
if (!grp)
goto out;
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 50f58f5..0f91f46 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -8,6 +8,9 @@
int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
u16 vlan_tci, int polling)
{
+ struct net_device *vlan_dev;
+ u16 vlan_id;
+
if (netpoll_rx(skb))
return NET_RX_DROP;
@@ -16,9 +19,12 @@ int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
skb->skb_iif = skb->dev->ifindex;
__vlan_hwaccel_put_tag(skb, vlan_tci);
- skb->dev = vlan_group_get_device(grp, vlan_tci & VLAN_VID_MASK);
+ vlan_id = vlan_tci & VLAN_VID_MASK;
+ vlan_dev = vlan_group_get_device(grp, vlan_id);
- if (!skb->dev)
+ if (vlan_dev)
+ skb->dev = vlan_dev;
+ else if (vlan_id)
goto drop;
return (polling ? netif_receive_skb(skb) : netif_rx(skb));
@@ -82,15 +88,20 @@ vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp,
unsigned int vlan_tci, struct sk_buff *skb)
{
struct sk_buff *p;
+ struct net_device *vlan_dev;
+ u16 vlan_id;
if (skb_bond_should_drop(skb, ACCESS_ONCE(skb->dev->master)))
skb->deliver_no_wcard = 1;
skb->skb_iif = skb->dev->ifindex;
__vlan_hwaccel_put_tag(skb, vlan_tci);
- skb->dev = vlan_group_get_device(grp, vlan_tci & VLAN_VID_MASK);
+ vlan_id = vlan_tci & VLAN_VID_MASK;
+ vlan_dev = vlan_group_get_device(grp, vlan_id);
- if (!skb->dev)
+ if (vlan_dev)
+ skb->dev = vlan_dev;
+ else if (vlan_id)
goto drop;
for (p = napi->gro_list; p; p = p->next) {
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 5298426..21f7229 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -142,6 +142,7 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
{
struct vlan_hdr *vhdr;
struct vlan_rx_stats *rx_stats;
+ struct net_device *vlan_dev;
u16 vlan_id;
u16 vlan_tci;
@@ -157,53 +158,69 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
vlan_id = vlan_tci & VLAN_VID_MASK;
rcu_read_lock();
- skb->dev = __find_vlan_dev(dev, vlan_id);
- if (!skb->dev) {
- pr_debug("%s: ERROR: No net_device for VID: %u on dev: %s\n",
- __func__, vlan_id, dev->name);
- goto err_unlock;
- }
-
- rx_stats = per_cpu_ptr(vlan_dev_info(skb->dev)->vlan_rx_stats,
- smp_processor_id());
- rx_stats->rx_packets++;
- rx_stats->rx_bytes += skb->len;
-
- skb_pull_rcsum(skb, VLAN_HLEN);
-
- skb->priority = vlan_get_ingress_priority(skb->dev, vlan_tci);
+ vlan_dev = __find_vlan_dev(dev, vlan_id);
- pr_debug("%s: priority: %u for TCI: %hu\n",
- __func__, skb->priority, vlan_tci);
-
- switch (skb->pkt_type) {
- case PACKET_BROADCAST: /* Yeah, stats collect these together.. */
- /* stats->broadcast ++; // no such counter :-( */
- break;
-
- case PACKET_MULTICAST:
- rx_stats->multicast++;
- break;
+ /* If the VLAN device is defined, we use it.
+ * If not, and the VID is 0, it is a 802.1p packet (not
+ * really a VLAN), so we will just netif_rx it later to the
+ * original interface, but with the skb->proto set to the
+ * wrapped proto: we do nothing here.
+ */
- case PACKET_OTHERHOST:
- /* Our lower layer thinks this is not local, let's make sure.
- * This allows the VLAN to have a different MAC than the
- * underlying device, and still route correctly.
- */
- if (!compare_ether_addr(eth_hdr(skb)->h_dest,
- skb->dev->dev_addr))
- skb->pkt_type = PACKET_HOST;
- break;
- default:
- break;
+ if (!vlan_dev) {
+ if (vlan_id) {
+ pr_debug("%s: ERROR: No net_device for VID: %u on dev: %s\n",
+ __func__, vlan_id, dev->name);
+ goto err_unlock;
+ }
+ rx_stats = NULL;
+ } else {
+ skb->dev = vlan_dev;
+
+ rx_stats = per_cpu_ptr(vlan_dev_info(skb->dev)->vlan_rx_stats,
+ smp_processor_id());
+ rx_stats->rx_packets++;
+ rx_stats->rx_bytes += skb->len;
+
+ skb->priority = vlan_get_ingress_priority(skb->dev, vlan_tci);
+
+ pr_debug("%s: priority: %u for TCI: %hu\n",
+ __func__, skb->priority, vlan_tci);
+
+ switch (skb->pkt_type) {
+ case PACKET_BROADCAST:
+ /* Yeah, stats collect these together.. */
+ /* stats->broadcast ++; // no such counter :-( */
+ break;
+
+ case PACKET_MULTICAST:
+ rx_stats->multicast++;
+ break;
+
+ case PACKET_OTHERHOST:
+ /* Our lower layer thinks this is not local, let's make
+ * sure.
+ * This allows the VLAN to have a different MAC than the
+ * underlying device, and still route correctly.
+ */
+ if (!compare_ether_addr(eth_hdr(skb)->h_dest,
+ skb->dev->dev_addr))
+ skb->pkt_type = PACKET_HOST;
+ break;
+ default:
+ break;
+ }
}
+ skb_pull_rcsum(skb, VLAN_HLEN);
vlan_set_encap_proto(skb, vhdr);
- skb = vlan_check_reorder_header(skb);
- if (!skb) {
- rx_stats->rx_errors++;
- goto err_unlock;
+ if (vlan_dev) {
+ skb = vlan_check_reorder_header(skb);
+ if (!skb) {
+ rx_stats->rx_errors++;
+ goto err_unlock;
+ }
}
netif_rx(skb);
^ permalink raw reply related
* Re: [PATCH] vlan_dev: VLAN 0 should be treated as "no vlan tag" (802.1p packet)
From: Eric Dumazet @ 2010-07-01 20:19 UTC (permalink / raw)
To: Pedro Garcia; +Cc: netdev, Patrick McHardy, Ben Hutchings, David Miller
In-Reply-To: <a372414c0d74e9ab599d095627353d94@dondevamos.com>
Le jeudi 01 juillet 2010 à 20:47 +0200, Pedro Garcia a écrit :
> The patch with the modifications suggested by David.
>
> - Without the 8021q module loaded in the kernel, all 802.1p packets
> (VLAN 0 but QoS tagging) are silently discarded (as expected, as
> the protocol is not loaded).
>
> - Without this patch in 8021q module, these packets are forwarded to
> the module, but they are discarded also if VLAN 0 is not configured,
> which should not be the default behaviour, as VLAN 0 is not really
> a VLANed packet but a 802.1p packet. Defining VLAN 0 makes it almost
> impossible to communicate with mixed 802.1p and non 802.1p devices on
> the same network due to arp table issues.
>
> - Changed logic to skip vlan specific code in vlan_skb_recv if VLAN
> is 0 and we have not defined a VLAN with ID 0, but we accept the
> packet with the encapsulated proto and pass it later to netif_rx.
>
> - In the vlan device event handler, added some logic to add VLAN 0
> to HW filter in devices that support it (this prevented any traffic
> in VLAN 0 to reach the stack in e1000e with HW filter under 2.6.35,
> and probably also with other HW filtered cards, so we fix it here).
>
> - In the vlan unregister logic, prevent the elimination of VLAN 0
> in devices with HW filter.
>
> - The default behaviour is to ignore the VLAN 0 tagging and accept
> the packet as if it was not tagged, but we can still define a
> VLAN 0 if desired (so it is backwards compatible).
>
> Signed-off-by: Pedro Garcia <pedro.netdev@dondevamos.com>
Seems fine but you need to respin your patch against latest net-next-2.6
tree.
Check your tree got commit 9618e2ffd78aaa (vlan: 64 bit rx counters)
Thanks !
^ permalink raw reply
* Re: [net-next-2.6 PATCH v2] x86: Align skb w/ start of cacheline on newer core 2/Xeon Arch
From: Alexander Duyck @ 2010-07-01 20:37 UTC (permalink / raw)
To: Andi Kleen
Cc: Kirsher, Jeffrey T, davem@davemloft.net, netdev@vger.kernel.org,
gospo@redhat.com, bphilips@novell.com, Thomas Gleixner,
Ingo Molnar, H. Peter Anvin, x86@kernel.org
In-Reply-To: <87tyojzbjm.fsf@basil.nowhere.org>
Andi Kleen wrote:
> Jeff Kirsher <jeffrey.t.kirsher@intel.com> writes:
>
>
> Sorry for the late comment.
>
>>
>> +#ifdef CONFIG_MCORE2
>> +/*
>> + * We handle most unaligned accesses in hardware. On the other hand
>> + * unaligned DMA can be quite expensive on some Nehalem processors.
>> + *
>> + * Based on this we disable the IP header alignment in network drivers.
>> + */
>> +#define NET_IP_ALIGN 0
>> +#endif
>> #endif /* _ASM_X86_SYSTEM_H */
>
> The ifdef should be imho dropped and the option be made unconditional
> for all x86. I am not aware of any x86 core where unalignment is really
> slow. This would increase the chance of it actually working on many
> configurations which do not necessarily optimize for Core2.
>
> -Andi
Seems to make sense to me. I will see about generating a patch that
drops the check for CONFIG_MCORE2.
Thanks,
Alex
^ permalink raw reply
* Re: [net-next-2.6 PATCH v2] x86: Align skb w/ start of cacheline on newer core 2/Xeon Arch
From: H. Peter Anvin @ 2010-07-01 20:41 UTC (permalink / raw)
To: Alexander Duyck
Cc: Andi Kleen, Kirsher, Jeffrey T, davem@davemloft.net,
netdev@vger.kernel.org, gospo@redhat.com, bphilips@novell.com,
Thomas Gleixner, Ingo Molnar, x86@kernel.org
In-Reply-To: <4C2CFC87.6020003@intel.com>
On 07/01/2010 01:37 PM, Alexander Duyck wrote:
> Andi Kleen wrote:
>> Jeff Kirsher <jeffrey.t.kirsher@intel.com> writes:
>>
>>
>> Sorry for the late comment.
>>
>>>
>>> +#ifdef CONFIG_MCORE2
>>> +/*
>>> + * We handle most unaligned accesses in hardware. On the other hand
>>> + * unaligned DMA can be quite expensive on some Nehalem processors.
>>> + *
>>> + * Based on this we disable the IP header alignment in network drivers.
>>> + */
>>> +#define NET_IP_ALIGN 0
>>> +#endif
>>> #endif /* _ASM_X86_SYSTEM_H */
>>
>> The ifdef should be imho dropped and the option be made unconditional
>> for all x86. I am not aware of any x86 core where unalignment is really
>> slow. This would increase the chance of it actually working on many
>> configurations which do not necessarily optimize for Core2.
>>
>> -Andi
>
> Seems to make sense to me. I will see about generating a patch that
> drops the check for CONFIG_MCORE2.
>
Just drop the #ifdef ... #endif.
-hpa
^ permalink raw reply
* Re: [Bugme-new] [Bug 16293] New: 82545GM in newest kernel not working.
From: Andrew Morton @ 2010-07-01 21:00 UTC (permalink / raw)
To: netdev, Jeff Kirsher, Jesse Brandeburg, Bruce Allan,
Alex Duyck <alexander.h.duyck
Cc: bugzilla-daemon, bugme-daemon, lukas.valach
In-Reply-To: <bug-16293-10286@https.bugzilla.kernel.org/>
(switched to email. Please respond via emailed reply-to-all, not via the
bugzilla web interface).
On Fri, 25 Jun 2010 17:48:43 GMT
bugzilla-daemon@bugzilla.kernel.org wrote:
> https://bugzilla.kernel.org/show_bug.cgi?id=16293
>
> Summary: 82545GM in newest kernel not working.
> Product: Drivers
> Version: 2.5
> Kernel Version: 2.6.32.15
> Platform: All
> OS/Version: Linux
> Tree: Mainline
> Status: NEW
> Severity: high
> Priority: P1
> Component: Network
> AssignedTo: drivers_network@kernel-bugs.osdl.org
> ReportedBy: lukas.valach@zoznam.sk
> Regression: No
>
>
> lspci:
>
> 01:01.0 Ethernet controller: Intel Corporation 82545GM Gigabit Ethernet
> Controller (rev 04)
> 01:02.0 Ethernet controller: Intel Corporation 82545GM Gigabit Ethernet
> Controller (rev 04)
> 02:01.0 Ethernet controller: Intel Corporation 82546GB Gigabit Ethernet
> Controller (rev 03)
> 02:01.1 Ethernet controller: Intel Corporation 82546GB Gigabit Ethernet
> Controller (rev 03)
>
> All interface are correctly configured. In kernel <2.6.26 get all OK, in newest
> kernels network interfaces are not working. Problem is in driver e1000. Tested
> on 2.6.31,2.6.32,2.6.34 and 2.6.35rc.
A somewhat old e1000 regression?
^ permalink raw reply
* RE: [Bugme-new] [Bug 16293] New: 82545GM in newest kernel not working.
From: Tantilov, Emil S @ 2010-07-01 21:40 UTC (permalink / raw)
To: Andrew Morton, netdev@vger.kernel.org, Kirsher, Jeffrey T,
"Brandeburg, Jesse" <jesse.brandeburg
Cc: bugzilla-daemon@bugzilla.kernel.org,
bugme-daemon@bugzilla.kernel.org, lukas.valach@zoznam.sk
In-Reply-To: <20100701140041.df2d147c.akpm@linux-foundation.org>
Andrew Morton wrote:
> (switched to email. Please respond via emailed reply-to-all, not via
> the bugzilla web interface).
>
> On Fri, 25 Jun 2010 17:48:43 GMT
> bugzilla-daemon@bugzilla.kernel.org wrote:
>
>> https://bugzilla.kernel.org/show_bug.cgi?id=16293
>>
>> Summary: 82545GM in newest kernel not working.
>> Product: Drivers Version: 2.5
>> Kernel Version: 2.6.32.15
>> Platform: All
>> OS/Version: Linux
>> Tree: Mainline
>> Status: NEW
>> Severity: high
>> Priority: P1
>> Component: Network
>> AssignedTo: drivers_network@kernel-bugs.osdl.org
>> ReportedBy: lukas.valach@zoznam.sk
>> Regression: No
>>
>>
>> lspci:
>>
>> 01:01.0 Ethernet controller: Intel Corporation 82545GM Gigabit
>> Ethernet Controller (rev 04) 01:02.0 Ethernet controller: Intel
>> Corporation 82545GM Gigabit Ethernet Controller (rev 04) 02:01.0
>> Ethernet controller: Intel Corporation 82546GB Gigabit Ethernet
>> Controller (rev 03) 02:01.1 Ethernet controller: Intel Corporation
>> 82546GB Gigabit Ethernet Controller (rev 03)
>>
>> All interface are correctly configured. In kernel <2.6.26 get all
>> OK, in newest kernels network interfaces are not working. Problem is
>> in driver e1000. Tested on 2.6.31,2.6.32,2.6.34 and 2.6.35rc.
>
> A somewhat old e1000 regression?
I ran a quick test on net-next with 82545GM NICs and was able to pass traffic without issues.
Would it be possible to get more information about the system/setup:
1. full lspci output (lspci -vvv)
2. ethtool -i ethX
3. ethtool -d ethX
4. ethtool -e ethX
5. dmesg - after loading the driver and configuring the interfaces
6. kernel config
A better description of the issue would help as well. What exactly fails - ping?
Thanks,
Emil
^ permalink raw reply
* [GIT PULL rcu/urgent] revert ->br_port fix, obsoleted by netdev commit
From: Paul E. McKenney @ 2010-07-01 21:56 UTC (permalink / raw)
To: mingo; +Cc: linux-kernel, arnd, davem, shemminger, netdev, eric.dumazet, sfr
Hello, Ingo,
This pull request reverts commit 81bdf5bd, which has been obsoleted
by commit f350a0a87374 from the netdev folks. This conflict was caught
in the -next tree.
git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-2.6-rcu.git rcu/next
Thanx, Paul
------------------>
Paul E. McKenney (1):
Revert "net: Make accesses to ->br_port safe for sparse RCU"
include/linux/if_bridge.h | 3 ---
net/bridge/br_fdb.c | 2 +-
net/bridge/br_private.h | 5 -----
net/bridge/netfilter/ebt_redirect.c | 2 +-
net/bridge/netfilter/ebt_ulog.c | 4 ++--
net/bridge/netfilter/ebtables.c | 4 ++--
net/netfilter/nfnetlink_log.c | 4 ++--
net/netfilter/nfnetlink_queue.c | 4 ++--
8 files changed, 10 insertions(+), 18 deletions(-)
^ permalink raw reply
* Re: [linux-pm] [PATCH 3/3] pm_qos: get rid of the allocation in pm_qos_add_request()
From: Rafael J. Wysocki @ 2010-07-01 22:23 UTC (permalink / raw)
To: James Bottomley; +Cc: linux-pm, markgross, Takashi Iwai, netdev
In-Reply-To: <20100629043954.GB6250@gvim.org>
On Tuesday, June 29, 2010, mark gross wrote:
> On Mon, Jun 28, 2010 at 12:44:48PM -0500, James Bottomley wrote:
> > Since every caller has to squirrel away the returned pointer anyway,
> > they might as well supply the memory area. This fixes a bug in a few of
> > the call sites where the returned pointer was dereferenced without
> > checking it for NULL (which gets returned if the kzalloc failed).
> >
> > I'd like to hear how sound and netdev feels about this: it will add
> > about two more pointers worth of data to struct netdev and struct
> > snd_pcm_substream .. but I think it's worth it. If you're OK, I'll add
> > your acks and send through the pm tree.
> >
> > This also looks to me like an android independent clean up (even though
> > it renders the request_add atomically callable). I also added include
> > guards to include/linux/pm_qos_params.h
> >
> > cc: netdev@vger.kernel.org
> > cc: Takashi Iwai <tiwai@suse.de>
> > Signed-off-by: James Bottomley <James.Bottomley@suse.de>
> Thank you for doing this!, I'll integrate it into some testing targets
> in the morning!
>
> Signed-off-by: mark gross <markgross@thegnar.org>
I would apply this one too, but I need a final changelog for it. Care to send?
Rafael
^ permalink raw reply
* Re: [linux-pm] [PATCH 3/3] pm_qos: get rid of the allocation in pm_qos_add_request()
From: James Bottomley @ 2010-07-01 22:30 UTC (permalink / raw)
To: Rafael J. Wysocki; +Cc: linux-pm, markgross, Takashi Iwai, netdev
In-Reply-To: <201007020023.13815.rjw@sisk.pl>
On Fri, 2010-07-02 at 00:23 +0200, Rafael J. Wysocki wrote:
> I would apply this one too, but I need a final changelog for it. Care to send?
How about:
All current users of pm_qos_add_request() have the ability to supply the
memory required by the pm_qos routines, so make them do this and
eliminate the kmalloc() with pm_qos_add_request(). This has the double
benefit of making the call never fail and allowing it to be called from
atomic context.
+ signoffs
James
^ permalink raw reply
* Re: [linux-pm] [PATCH 3/3] pm_qos: get rid of the allocation in pm_qos_add_request()
From: Rafael J. Wysocki @ 2010-07-01 22:38 UTC (permalink / raw)
To: James Bottomley; +Cc: linux-pm, markgross, Takashi Iwai, netdev
In-Reply-To: <1278023439.2813.388.camel@mulgrave.site>
On Friday, July 02, 2010, James Bottomley wrote:
> On Fri, 2010-07-02 at 00:23 +0200, Rafael J. Wysocki wrote:
> > I would apply this one too, but I need a final changelog for it. Care to send?
>
> How about:
>
> All current users of pm_qos_add_request() have the ability to supply the
> memory required by the pm_qos routines, so make them do this and
> eliminate the kmalloc() with pm_qos_add_request(). This has the double
> benefit of making the call never fail and allowing it to be called from
> atomic context.
>
> + signoffs
OK
I'll apply it shortly.
Rafael
^ permalink raw reply
* [PATCH] linux/net.h: fix kernel-doc warnings
From: Randy Dunlap @ 2010-07-01 23:18 UTC (permalink / raw)
To: netdev; +Cc: davem
From: Randy Dunlap <randy.dunlap@oracle.com>
Fix kernel-doc warnings in linux/net.h:
Warning(include/linux/net.h:151): No description found for parameter 'wq'
Warning(include/linux/net.h:151): Excess struct/union/enum/typedef member 'fasync_list' description in 'socket'
Warning(include/linux/net.h:151): Excess struct/union/enum/typedef member 'wait' description in 'socket'
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
---
include/linux/net.h | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
--- linux-2.6.35-rc3-git5.orig/include/linux/net.h
+++ linux-2.6.35-rc3-git5/include/linux/net.h
@@ -129,10 +129,9 @@ struct socket_wq {
* @type: socket type (%SOCK_STREAM, etc)
* @flags: socket flags (%SOCK_ASYNC_NOSPACE, etc)
* @ops: protocol specific socket operations
- * @fasync_list: Asynchronous wake up list
* @file: File back pointer for gc
* @sk: internal networking protocol agnostic socket representation
- * @wait: wait queue for several uses
+ * @wq: wait queue for several uses
*/
struct socket {
socket_state state;
^ permalink raw reply
* [net-2.6 PATCH 1/2] sched: qdisc_reset_all_tx is calling qdisc_reset without qdisc_lock
From: Jeff Kirsher @ 2010-07-01 23:21 UTC (permalink / raw)
To: davem; +Cc: netdev, gospo, bphilips, John Fastabend, Jeff Kirsher
From: John Fastabend <john.r.fastabend@intel.com>
When calling qdisc_reset() the qdisc lock needs to be held. In
this case there is at least one driver i4l which is using this
without holding the lock. Add the locking here.
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
include/net/sch_generic.h | 12 ++++++++++--
1 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 03ca5d8..ba749be 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -317,8 +317,16 @@ extern void tcf_destroy_chain(struct tcf_proto **fl);
static inline void qdisc_reset_all_tx(struct net_device *dev)
{
unsigned int i;
- for (i = 0; i < dev->num_tx_queues; i++)
- qdisc_reset(netdev_get_tx_queue(dev, i)->qdisc);
+ struct Qdisc *qdisc;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ qdisc = netdev_get_tx_queue(dev, i)->qdisc;
+ if (qdisc) {
+ spin_lock_bh(qdisc_lock(qdisc));
+ qdisc_reset(qdisc);
+ spin_unlock_bh(qdisc_lock(qdisc));
+ }
+ }
}
/* Are all TX queues of the device empty? */
^ permalink raw reply related
* [net-2.6 PATCH 2/2] net: decreasing real_num_tx_queues needs to flush qdisc
From: Jeff Kirsher @ 2010-07-01 23:21 UTC (permalink / raw)
To: davem; +Cc: netdev, gospo, bphilips, John Fastabend, Jeff Kirsher
In-Reply-To: <20100701232103.15685.48453.stgit@localhost.localdomain>
From: John Fastabend <john.r.fastabend@intel.com>
Reducing real_num_queues needs to flush the qdisc otherwise
skbs with queue_mappings greater then real_num_tx_queues can
be sent to the underlying driver.
The flow for this is,
dev_queue_xmit()
dev_pick_tx()
skb_tx_hash() => hash using real_num_tx_queues
skb_set_queue_mapping()
...
qdisc_enqueue_root() => enqueue skb on txq from hash
...
dev->real_num_tx_queues -= n
...
sch_direct_xmit()
dev_hard_start_xmit()
ndo_start_xmit(skb,dev) => skb queue set with old hash
skbs are enqueued on the qdisc with skb->queue_mapping set
0 < queue_mappings < real_num_tx_queues. When the driver
decreases real_num_tx_queues skb's may be dequeued from the
qdisc with a queue_mapping greater then real_num_tx_queues.
This fixes a case in ixgbe where this was occurring with DCB
and FCoE. Because the driver is using queue_mapping to map
skbs to tx descriptor rings we can potentially map skbs to
rings that no longer exist.
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Tested-by: Ross Brattain <ross.b.brattain@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
drivers/net/ixgbe/ixgbe_main.c | 2 +-
include/linux/netdevice.h | 3 +++
include/net/sch_generic.h | 12 ++++++++----
net/core/dev.c | 18 ++++++++++++++++++
4 files changed, 30 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index a0b3316..7b5d976 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -4001,7 +4001,7 @@ static void ixgbe_set_num_queues(struct ixgbe_adapter *adapter)
done:
/* Notify the stack of the (possibly) reduced Tx Queue count. */
- adapter->netdev->real_num_tx_queues = adapter->num_tx_queues;
+ netif_set_real_num_tx_queues(adapter->netdev, adapter->num_tx_queues);
}
static void ixgbe_acquire_msix_vectors(struct ixgbe_adapter *adapter,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 40291f3..5e6188d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1656,6 +1656,9 @@ static inline int netif_is_multiqueue(const struct net_device *dev)
return (dev->num_tx_queues > 1);
}
+extern void netif_set_real_num_tx_queues(struct net_device *dev,
+ unsigned int txq);
+
/* Use this variant when it is known for sure that it
* is executing from hardware interrupt context or with hardware interrupts
* disabled.
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index ba749be..433604b 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -313,13 +313,12 @@ extern void qdisc_calculate_pkt_len(struct sk_buff *skb,
extern void tcf_destroy(struct tcf_proto *tp);
extern void tcf_destroy_chain(struct tcf_proto **fl);
-/* Reset all TX qdiscs of a device. */
-static inline void qdisc_reset_all_tx(struct net_device *dev)
+/* Reset all TX qdiscs greater then index of a device. */
+static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i)
{
- unsigned int i;
struct Qdisc *qdisc;
- for (i = 0; i < dev->num_tx_queues; i++) {
+ for (; i < dev->num_tx_queues; i++) {
qdisc = netdev_get_tx_queue(dev, i)->qdisc;
if (qdisc) {
spin_lock_bh(qdisc_lock(qdisc));
@@ -329,6 +328,11 @@ static inline void qdisc_reset_all_tx(struct net_device *dev)
}
}
+static inline void qdisc_reset_all_tx(struct net_device *dev)
+{
+ qdisc_reset_all_tx_gt(dev, 0);
+}
+
/* Are all TX queues of the device empty? */
static inline bool qdisc_all_tx_empty(const struct net_device *dev)
{
diff --git a/net/core/dev.c b/net/core/dev.c
index 2b3bf53..723a347 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1553,6 +1553,24 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
rcu_read_unlock();
}
+/*
+ * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
+ * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
+ */
+void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
+{
+ unsigned int real_num = dev->real_num_tx_queues;
+
+ if (unlikely(txq > dev->num_tx_queues))
+ ;
+ else if (txq > real_num)
+ dev->real_num_tx_queues = txq;
+ else if (txq < real_num) {
+ dev->real_num_tx_queues = txq;
+ qdisc_reset_all_tx_gt(dev, txq);
+ }
+}
+EXPORT_SYMBOL(netif_set_real_num_tx_queues);
static inline void __netif_reschedule(struct Qdisc *q)
{
^ permalink raw reply related
* [net-next-2.6 PATCH] x86: Drop CONFIG_MCORE2 check around setting of NET_IP_ALIGN
From: Jeff Kirsher @ 2010-07-01 23:28 UTC (permalink / raw)
To: davem
Cc: netdev, gospo, bphilips, Andi Kleen, Thomas Gleixner, Ingo Molnar,
H. Peter Anvin, x86, Alexander Duyck, Jeff Kirsher
From: Alexander Duyck <alexander.h.duyck@intel.com>
This patch removes the CONFIG_MCORE2 check from around NET_IP_ALIGN. It is
based on a suggestion from Andi Kleen. The assumption is that there are
not any x86 cores where unaligned access is really slow, and this change
would allow for a performance improvement to still exist on configurations
that are not necessarily optimized for Core 2.
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
arch/x86/include/asm/system.h | 2 --
1 files changed, 0 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index b4293fc..1db9bd2 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -457,7 +457,6 @@ static inline void rdtsc_barrier(void)
alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
}
-#ifdef CONFIG_MCORE2
/*
* We handle most unaligned accesses in hardware. On the other hand
* unaligned DMA can be quite expensive on some Nehalem processors.
@@ -465,5 +464,4 @@ static inline void rdtsc_barrier(void)
* Based on this we disable the IP header alignment in network drivers.
*/
#define NET_IP_ALIGN 0
-#endif
#endif /* _ASM_X86_SYSTEM_H */
^ permalink raw reply related
* [net-next-2.6 PATCH 1/5] igb: fix PHY config access on 82580
From: Jeff Kirsher @ 2010-07-01 23:37 UTC (permalink / raw)
To: davem; +Cc: netdev, gospo, bphilips, Nicholas Nunley, Jeff Kirsher
From: Nick Nunley <nicholasx.d.nunley@intel.com>
82580 NICs can have up to 4 functions. This fixes phy accesses
to use the correct locks for functions 2 and 3.
Signed-off-by: Nicholas Nunley <nicholasx.d.nunley@intel.com>
Tested-by: Jeff Pieper <jeffrey.e.pieper@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
drivers/net/igb/e1000_82575.c | 8 ++++++++
drivers/net/igb/e1000_defines.h | 2 ++
2 files changed, 10 insertions(+), 0 deletions(-)
diff --git a/drivers/net/igb/e1000_82575.c b/drivers/net/igb/e1000_82575.c
index 86438b5..06251a9 100644
--- a/drivers/net/igb/e1000_82575.c
+++ b/drivers/net/igb/e1000_82575.c
@@ -295,6 +295,10 @@ static s32 igb_acquire_phy_82575(struct e1000_hw *hw)
if (hw->bus.func == E1000_FUNC_1)
mask = E1000_SWFW_PHY1_SM;
+ else if (hw->bus.func == E1000_FUNC_2)
+ mask = E1000_SWFW_PHY2_SM;
+ else if (hw->bus.func == E1000_FUNC_3)
+ mask = E1000_SWFW_PHY3_SM;
return igb_acquire_swfw_sync_82575(hw, mask);
}
@@ -312,6 +316,10 @@ static void igb_release_phy_82575(struct e1000_hw *hw)
if (hw->bus.func == E1000_FUNC_1)
mask = E1000_SWFW_PHY1_SM;
+ else if (hw->bus.func == E1000_FUNC_2)
+ mask = E1000_SWFW_PHY2_SM;
+ else if (hw->bus.func == E1000_FUNC_3)
+ mask = E1000_SWFW_PHY3_SM;
igb_release_swfw_sync_82575(hw, mask);
}
diff --git a/drivers/net/igb/e1000_defines.h b/drivers/net/igb/e1000_defines.h
index 24d9be6..90bc29d 100644
--- a/drivers/net/igb/e1000_defines.h
+++ b/drivers/net/igb/e1000_defines.h
@@ -164,6 +164,8 @@
#define E1000_SWFW_EEP_SM 0x1
#define E1000_SWFW_PHY0_SM 0x2
#define E1000_SWFW_PHY1_SM 0x4
+#define E1000_SWFW_PHY2_SM 0x20
+#define E1000_SWFW_PHY3_SM 0x40
/* FACTPS Definitions */
/* Device Control */
^ permalink raw reply related
* [net-next-2.6 PATCH 2/5] igb: Use only a single Tx queue in SR-IOV mode
From: Jeff Kirsher @ 2010-07-01 23:38 UTC (permalink / raw)
To: davem; +Cc: netdev, gospo, bphilips, stable, Greg Rose, Jeff Kirsher
In-Reply-To: <20100701233733.16171.4629.stgit@localhost.localdomain>
From: Greg Rose <gregory.v.rose@intel.com>
The 82576 expects the second rx queue in any pool to receive L2 switch
loop back packets sent from the second tx queue in another pool. The
82576 VF driver does not enable the second rx queue so if the PF driver
sends packets destined to a VF from its second tx queue then the VF
driver will never see them. In SR-IOV mode limit the number of tx queues
used by the PF driver to one. This patch fixes a bug reported in which
the PF cannot communciate with the VF and should be considered for 2.6.34
stable.
CC: stable@kernel.org
Signed-off-by: Greg Rose <gregory.v.rose@intel.com>
Tested-by: Jeff Pieper <jeffrey.e.pieper@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
drivers/net/igb/igb_main.c | 8 ++++----
1 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index 3881918..e79689e 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -630,9 +630,6 @@ static void igb_cache_ring_register(struct igb_adapter *adapter)
for (; i < adapter->rss_queues; i++)
adapter->rx_ring[i]->reg_idx = rbase_offset +
Q_IDX_82576(i);
- for (; j < adapter->rss_queues; j++)
- adapter->tx_ring[j]->reg_idx = rbase_offset +
- Q_IDX_82576(j);
}
case e1000_82575:
case e1000_82580:
@@ -996,7 +993,10 @@ static void igb_set_interrupt_capability(struct igb_adapter *adapter)
/* Number of supported queues. */
adapter->num_rx_queues = adapter->rss_queues;
- adapter->num_tx_queues = adapter->rss_queues;
+ if (adapter->vfs_allocated_count)
+ adapter->num_tx_queues = 1;
+ else
+ adapter->num_tx_queues = adapter->rss_queues;
/* start with one vector for every rx queue */
numvecs = adapter->num_rx_queues;
^ permalink raw reply related
* [net-next-2.6 PATCH 3/5] igb: Fix Tx hangs seen when loading igb with max_vfs > 7.
From: Jeff Kirsher @ 2010-07-01 23:38 UTC (permalink / raw)
To: davem; +Cc: netdev, gospo, bphilips, Emil Tantilov, Jeff Kirsher
In-Reply-To: <20100701233733.16171.4629.stgit@localhost.localdomain>
From: Emil Tantilov <emil.s.tantilov@intel.com>
Check the value of max_vfs at the time of assignment of vfs_allocated_count.
The previous check in igb_probe_vfs was too late as by that time the rx/tx
rings were initialized with the wrong offset.
Signed-off-by: Emil Tantilov <emil.s.tantilov@intel.com>
Tested-by: Jeff Pieper <jeffrey.e.pieper@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
drivers/net/igb/igb_main.c | 5 +----
1 files changed, 1 insertions(+), 4 deletions(-)
diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index e79689e..d811462 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -2091,9 +2091,6 @@ static void __devinit igb_probe_vfs(struct igb_adapter * adapter)
#ifdef CONFIG_PCI_IOV
struct pci_dev *pdev = adapter->pdev;
- if (adapter->vfs_allocated_count > 7)
- adapter->vfs_allocated_count = 7;
-
if (adapter->vfs_allocated_count) {
adapter->vf_data = kcalloc(adapter->vfs_allocated_count,
sizeof(struct vf_data_storage),
@@ -2258,7 +2255,7 @@ static int __devinit igb_sw_init(struct igb_adapter *adapter)
#ifdef CONFIG_PCI_IOV
if (hw->mac.type == e1000_82576)
- adapter->vfs_allocated_count = max_vfs;
+ adapter->vfs_allocated_count = (max_vfs > 7) ? 7 : max_vfs;
#endif /* CONFIG_PCI_IOV */
adapter->rss_queues = min_t(u32, IGB_MAX_RX_QUEUES, num_online_cpus());
^ permalink raw reply related
* [net-next-2.6 PATCH 4/5] igb: correct link test not being run when link is down
From: Jeff Kirsher @ 2010-07-01 23:39 UTC (permalink / raw)
To: davem; +Cc: netdev, gospo, bphilips, Alexander Duyck, Jeff Kirsher
In-Reply-To: <20100701233733.16171.4629.stgit@localhost.localdomain>
From: Alexander Duyck <alexander.h.duyck@intel.com>
The igb online link test was always reporting pass because instead of
checking for if_running it was checking for netif_carrier_ok.
This change corrects the test so that it is run if the interface is running
instead of checking for netif carrier ok.
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Emil Tantilov <emil.s.tantilov@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
drivers/net/igb/igb_ethtool.c | 8 +++-----
1 files changed, 3 insertions(+), 5 deletions(-)
diff --git a/drivers/net/igb/igb_ethtool.c b/drivers/net/igb/igb_ethtool.c
index f2ebf92..26bf6a1 100644
--- a/drivers/net/igb/igb_ethtool.c
+++ b/drivers/net/igb/igb_ethtool.c
@@ -1823,12 +1823,10 @@ static void igb_diag_test(struct net_device *netdev,
dev_info(&adapter->pdev->dev, "online testing starting\n");
/* PHY is powered down when interface is down */
- if (!netif_carrier_ok(netdev)) {
+ if (if_running && igb_link_test(adapter, &data[4]))
+ eth_test->flags |= ETH_TEST_FL_FAILED;
+ else
data[4] = 0;
- } else {
- if (igb_link_test(adapter, &data[4]))
- eth_test->flags |= ETH_TEST_FL_FAILED;
- }
/* Online tests aren't run; pass by default */
data[0] = 0;
^ permalink raw reply related
* [net-next-2.6 PATCH 5/5] igb: Add comment
From: Jeff Kirsher @ 2010-07-01 23:39 UTC (permalink / raw)
To: davem; +Cc: netdev, gospo, bphilips, Greg Rose, Jeff Kirsher
In-Reply-To: <20100701233733.16171.4629.stgit@localhost.localdomain>
From: Greg Rose <gregory.v.rose@intel.com>
Add explanatory comment to avoid confusion when a pointer is set
to the second word of an array instead of the customary cast of a
pointer to the beginning of the array.
Signed-off-by: Greg Rose <gregory.v.rose@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
drivers/net/igb/igb_main.c | 4 ++++
1 files changed, 4 insertions(+), 0 deletions(-)
diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index d811462..9cb04e2 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -4974,6 +4974,10 @@ static void igb_vf_reset_msg(struct igb_adapter *adapter, u32 vf)
static int igb_set_vf_mac_addr(struct igb_adapter *adapter, u32 *msg, int vf)
{
+ /*
+ * The VF MAC Address is stored in a packed array of bytes
+ * starting at the second 32 bit word of the msg array
+ */
unsigned char *addr = (char *)&msg[1];
int err = -1;
^ permalink raw reply related
* [net-next-2.6 PATCH] ixgbe: use NETIF_F_LRO
From: Jeff Kirsher @ 2010-07-01 23:58 UTC (permalink / raw)
To: davem
Cc: netdev, gospo, bphilips, Stanislaw Gruszka, Don Skidmore,
Jeff Kirsher
From: Stanislaw Gruszka <sgruszka@redhat.com>
Both ETH_FLAG_LRO and NETIF_F_LRO have the same value, but NETIF_F_LRO
is intended to use with netdev->features.
Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Acked-by: Don Skidmore <donald.c.skidmore@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
drivers/net/ixgbe/ixgbe_ethtool.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/drivers/net/ixgbe/ixgbe_ethtool.c b/drivers/net/ixgbe/ixgbe_ethtool.c
index b50b5ea..5275e9c 100644
--- a/drivers/net/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ixgbe/ixgbe_ethtool.c
@@ -2237,7 +2237,7 @@ static int ixgbe_set_flags(struct net_device *netdev, u32 data)
break;
}
} else if (!adapter->rx_itr_setting) {
- netdev->features &= ~ETH_FLAG_LRO;
+ netdev->features &= ~NETIF_F_LRO;
if (data & ETH_FLAG_LRO)
e_info("rx-usecs set to 0, "
"LRO/RSC cannot be enabled.\n");
^ permalink raw reply related
* Re: [net-next-2.6 PATCH] x86: Drop CONFIG_MCORE2 check around setting of NET_IP_ALIGN
From: Stephen Hemminger @ 2010-07-02 0:26 UTC (permalink / raw)
To: Jeff Kirsher
Cc: davem, netdev, gospo, bphilips, Andi Kleen, Thomas Gleixner,
Ingo Molnar, H. Peter Anvin, x86, Alexander Duyck
In-Reply-To: <20100701232742.15934.49030.stgit@localhost.localdomain>
On Thu, 01 Jul 2010 16:28:27 -0700
Jeff Kirsher <jeffrey.t.kirsher@intel.com> wrote:
> From: Alexander Duyck <alexander.h.duyck@intel.com>
>
> This patch removes the CONFIG_MCORE2 check from around NET_IP_ALIGN. It is
> based on a suggestion from Andi Kleen. The assumption is that there are
> not any x86 cores where unaligned access is really slow, and this change
> would allow for a performance improvement to still exist on configurations
> that are not necessarily optimized for Core 2.
>
> Cc: Andi Kleen <ak@linux.intel.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: x86@kernel.org
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> ---
This is a good idea, but warnig it may end up masking broken
hardware. Developers of new drivers will end up never
exercising unaligned DMA, resulting in hardware
that doesn't work on platforms that have NET_IP_ALIGN set
to 2.
--
^ permalink raw reply
* Re: pull request: wireless-next-2.6 2010-07-01
From: David Miller @ 2010-07-02 0:35 UTC (permalink / raw)
To: linville; +Cc: linux-wireless, netdev, linux-kernel
In-Reply-To: <20100701181526.GA2356@tuxdriver.com>
From: "John W. Linville" <linville@tuxdriver.com>
Date: Thu, 1 Jul 2010 14:15:27 -0400
> Two weeks since the last request, plenty of new stuff intended for
> 2.6.36...
>
> Included are the usual bunch of driver updates, including a big
> dump from the rt2x00 team. This also includes cfg80211 support
> for libertas, a flurry of (mostly trivial) stuff from me, and a
> wireless-2.6 pull to resolve some patch dependencies.
>
> Please let me know if there are problems!
This failed to pull cleanly, I got a conflict in
drivers/net/wireless/libertas/host.h It was the
usual "__packed" vs. "__attribute__((packed))" thing.
I resolved it but I wonder if this happened because you
did the wireless-2.6 --> wireless-next-2.6 merge here.
^ permalink raw reply
* Re: [net-next-2.6 PATCH] x86: Drop CONFIG_MCORE2 check around setting of NET_IP_ALIGN
From: H. Peter Anvin @ 2010-07-02 1:15 UTC (permalink / raw)
To: Jeff Kirsher
Cc: davem, netdev, gospo, bphilips, Andi Kleen, Thomas Gleixner,
Ingo Molnar, x86, Alexander Duyck
In-Reply-To: <20100701232742.15934.49030.stgit@localhost.localdomain>
On 07/01/2010 04:28 PM, Jeff Kirsher wrote:
> From: Alexander Duyck <alexander.h.duyck@intel.com>
>
> This patch removes the CONFIG_MCORE2 check from around NET_IP_ALIGN. It is
> based on a suggestion from Andi Kleen. The assumption is that there are
> not any x86 cores where unaligned access is really slow, and this change
> would allow for a performance improvement to still exist on configurations
> that are not necessarily optimized for Core 2.
>
> Cc: Andi Kleen <ak@linux.intel.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: x86@kernel.org
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Acked-by: H. Peter Anvin <hpa@zytor.com>
-hpa
^ permalink raw reply
* Fwd: Possible bug in net/ipv4/route.c?
From: YOSHIFUJI Hideaki @ 2010-07-02 2:49 UTC (permalink / raw)
To: netdev@vger.kernel.org, linux-kernel
Switch to netdev.
--yoshfuji
-------- Original Message --------
Subject: Possible bug in net/ipv4/route.c?
Date: Thu, 1 Jul 2010 16:00:29 -0700
From: Sol Kavy <skavy@ubicom.com>
To: <linux-kernel@vger.kernel.org>
CC: Greg Ren <gren@ubicom.com>, Guojun Jin <gjin@ubicom.com>, Murat Sezgin <msezgin@ubicom.com>, Sener Ilgen <silgen@ubicom.com>
Found Linux: 2.6.28
Arch: Ubicom32 <not yet pushed>
Project: uCLinux based Router
Test: Bit torrent Stress Test
Note: The top of Linus git net/ipv4/route.c appears to have the same issue.
The following is a patch for clearing out IP options area in an input skb during link failure processing. Without this patch, the icmp_send() can result in a call to ip_options_echo() where the common buffer area of the skb is incorrectly interpreted. Depending on the previous use of the skb->cb[], the interpreted option length values can cause stack corruption by copying more than 40 bytes to the output options.
In our case, a driver is using the skb->cb[] area to hold driver specific data. The driver is not zeroing out the area after use. I can see three basic solutions:
1) Drivers are not allowed to use the skb->cb[] area at all. Ubicom should modify the driver to use a different approach.
2) The layer using skb->cb[] should clear this area after use and before handing the skb to another layer. Ubicom should modify the driver to clear the skb->cb[] area before sending it up the line.
3) Any layer that "uses" the skb->cb[] area must clear the area before use. In which case, the proposed patch would fix the problem for the ipv4_link_failure(). I believe that this is the correct fix because I see ip_rcv() clears the skb->cb[] before using it.
Can someone confirm that this is the appropriate fix? If this is documented somewhere, please direct me to the documentation.
Please send email to sol@ubicom.com in addition to posting your response.
Thanks,
Sol Kavy/Murat Sezgin
Ubicom, Inc.
Patch:
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 125ee64..d13805f 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1606,6 +1606,14 @@ static void ipv4_link_failure(struct sk_buff *skb)
{
struct rtable *rt;
+ /*
+ * Since link failure can be called with skbs from many layers (see arp)
+ * the cb area of the skb must be cleared before use. Because the cb area
+ * can be formatted according to the caller layer's cb area format and it may cause
+ * corruptions when it is handled in a different network layer.
+ */
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
rt = skb->rtable;
The packet is enqueud by:
do_IRQ()->do_softirq()->__do_softirq()->net_rx_action()->ubi32_eth_napi_poll()->ubi32_eth_receive()->__vlan_hwaccel_rx()->netif_receive_skb()->br_handle_frame()->nf_hook_slow()->br_nf_pre_routing_finish()->br_nfr_pre_routing_finish_bridge()->neight_resolve_output()->__neigh_event_send().
The packet is then dequeued by:
do_IRQ() -> irq_exit() -> do_softirq() -> run_timer_softirq() -> neigh_timer_handler() -> arp_error_report() -> ipv4_link_failure() -> icmp_send() -> ip_options_echo().
Because the Ubicom Ethernet driver overwrites the common buffer area, the enqueued packet contains garbage when casted as an IP options data structure. This results in ip_options_echo() miss reading the option length information and overwriting memory. By clearing the skb->cb[] before processing the icmp_send() against the packet, we ensure that ip_options_echo() does not corrupt memory.
^ permalink raw reply related
* [PATCH v3] netfilter: xtables target SYNPROXY
From: Changli Gao @ 2010-07-02 4:19 UTC (permalink / raw)
To: Patrick McHardy
Cc: David S. Miller, Alexey Kuznetsov, Jan Engelhardt,
Jozsef Kadlecsik, Pekka Savola (ipv6), James Morris,
Hideaki YOSHIFUJI, netfilter-devel, netdev, Changli Gao
v3:
fix the bug it can't work with bridge.
netfilter: xtables target SYNPROXY.
This patch implements an xtables target SYNPROXY. As the connection to the
TCP server won't be established until the ACK from the client is received, it
can protect the TCP server from the SYN-flood attacks.
It works in the raw table of the PREROUTING chain, before conntracking system.
Syncookies is used, so no new state is introduced into the conntracking system.
In fact, until the first connection is established, conntracking system doesn't
see any packets. So when there is a SYN-flood attack, conntracking system won't
be busy on finding and deleting the un-assured ct.
As the SYN-packet of the second connection request is sent locally, the DNAT
rules which are in the PREROUTING chain should be moved to the OUTPUT chain.
Signed-off-by: Changli Gao <xiaosuo@gmail.com>
----
include/net/netfilter/nf_conntrack.h | 10
include/net/netfilter/nf_conntrack_core.h | 21
include/net/netfilter/nf_conntrack_extend.h | 2
include/net/tcp.h | 7
net/ipv4/syncookies.c | 22
net/ipv4/tcp_ipv4.c | 9
net/netfilter/Kconfig | 17
net/netfilter/Makefile | 1
net/netfilter/nf_conntrack_core.c | 45 +
net/netfilter/xt_SYNPROXY.c | 679 ++++++++++++++++++++++++++++
10 files changed, 794 insertions(+), 19 deletions(-)
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index e624dae..5e6d8e4 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -311,5 +311,15 @@ do { \
#define MODULE_ALIAS_NFCT_HELPER(helper) \
MODULE_ALIAS("nfct-helper-" helper)
+#if defined(CONFIG_NETFILTER_XT_TARGET_SYNPROXY) || \
+ defined(CONFIG_NETFILTER_XT_TARGET_SYNPROXY_MODULE)
+extern unsigned int (*syn_proxy_pre_hook)(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo);
+
+extern unsigned int (*syn_proxy_post_hook)(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo);
+#endif
#endif /* __KERNEL__ */
#endif /* _NF_CONNTRACK_H */
diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index aced085..637b404 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -54,6 +54,23 @@ nf_conntrack_find_get(struct net *net, u16 zone,
extern int __nf_conntrack_confirm(struct sk_buff *skb);
+static inline unsigned int syn_proxy_post_call(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ unsigned int ret = NF_ACCEPT;
+#if defined(CONFIG_NETFILTER_XT_TARGET_SYNPROXY) || \
+ defined(CONFIG_NETFILTER_XT_TARGET_SYNPROXY_MODULE)
+ unsigned int (*syn_proxy)(struct sk_buff *, struct nf_conn *,
+ enum ip_conntrack_info);
+ syn_proxy = rcu_dereference(syn_proxy_post_hook);
+ if (syn_proxy)
+ ret = syn_proxy(skb, ct, ctinfo);
+#endif
+
+ return ret;
+}
+
/* Confirm a connection: returns NF_DROP if packet must be dropped. */
static inline int nf_conntrack_confirm(struct sk_buff *skb)
{
@@ -63,8 +80,10 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb)
if (ct && !nf_ct_is_untracked(ct)) {
if (!nf_ct_is_confirmed(ct))
ret = __nf_conntrack_confirm(skb);
- if (likely(ret == NF_ACCEPT))
+ if (likely(ret == NF_ACCEPT)) {
nf_ct_deliver_cached_events(ct);
+ ret = syn_proxy_post_call(skb, ct, skb->nfctinfo);
+ }
}
return ret;
}
diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h
index 32d15bd..b2ae7e9 100644
--- a/include/net/netfilter/nf_conntrack_extend.h
+++ b/include/net/netfilter/nf_conntrack_extend.h
@@ -11,6 +11,7 @@ enum nf_ct_ext_id {
NF_CT_EXT_ACCT,
NF_CT_EXT_ECACHE,
NF_CT_EXT_ZONE,
+ NF_CT_EXT_SYNPROXY,
NF_CT_EXT_NUM,
};
@@ -19,6 +20,7 @@ enum nf_ct_ext_id {
#define NF_CT_EXT_ACCT_TYPE struct nf_conn_counter
#define NF_CT_EXT_ECACHE_TYPE struct nf_conntrack_ecache
#define NF_CT_EXT_ZONE_TYPE struct nf_conntrack_zone
+#define NF_CT_EXT_SYNPROXY_TYPE struct syn_proxy_state
/* Extensions: optional stuff which isn't permanently in struct. */
struct nf_ct_ext {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index c2f96c2..06f28d3 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -460,8 +460,11 @@ extern int tcp_disconnect(struct sock *sk, int flags);
extern __u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS];
extern struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
struct ip_options *opt);
-extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb,
- __u16 *mss);
+extern __u32 __cookie_v4_init_sequence(__be32 saddr, __be32 daddr,
+ __be16 sport, __be16 dport, __u32 seq,
+ __u16 *mssp);
+extern int cookie_v4_check_sequence(const struct iphdr *iph,
+ const struct tcphdr *th, __u32 cookie);
extern __u32 cookie_init_timestamp(struct request_sock *req);
extern bool cookie_check_timestamp(struct tcp_options_received *opt, bool *);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 650cace..3adcba3 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -159,26 +159,21 @@ static __u16 const msstab[] = {
* Generate a syncookie. mssp points to the mss, which is returned
* rounded down to the value encoded in the cookie.
*/
-__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
+__u32 __cookie_v4_init_sequence(__be32 saddr, __be32 daddr, __be16 sport,
+ __be16 dport, __u32 seq, __u16 *mssp)
{
- const struct iphdr *iph = ip_hdr(skb);
- const struct tcphdr *th = tcp_hdr(skb);
int mssind;
const __u16 mss = *mssp;
- tcp_synq_overflow(sk);
-
for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--)
if (mss >= msstab[mssind])
break;
*mssp = msstab[mssind];
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
-
- return secure_tcp_syn_cookie(iph->saddr, iph->daddr,
- th->source, th->dest, ntohl(th->seq),
+ return secure_tcp_syn_cookie(saddr, daddr, sport, dport, seq,
jiffies / (HZ * 60), mssind);
}
+EXPORT_SYMBOL(__cookie_v4_init_sequence);
/*
* This (misnamed) value is the age of syncookie which is permitted.
@@ -191,10 +186,9 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
* Check if a ack sequence number is a valid syncookie.
* Return the decoded mss if it is, or 0 if not.
*/
-static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
+int cookie_v4_check_sequence(const struct iphdr *iph, const struct tcphdr *th,
+ __u32 cookie)
{
- const struct iphdr *iph = ip_hdr(skb);
- const struct tcphdr *th = tcp_hdr(skb);
__u32 seq = ntohl(th->seq) - 1;
__u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr,
th->source, th->dest, seq,
@@ -203,6 +197,7 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
}
+EXPORT_SYMBOL(cookie_v4_check_sequence);
static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
@@ -282,7 +277,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
goto out;
if (tcp_synq_no_recent_overflow(sk) ||
- (mss = cookie_check(skb, cookie)) == 0) {
+ (mss = cookie_v4_check_sequence(ip_hdr(skb), tcp_hdr(skb),
+ cookie)) == 0) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
goto out;
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 8fa32f5..3b094c7 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1332,7 +1332,14 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
TCP_ECN_create_request(req, tcp_hdr(skb));
if (want_cookie) {
- isn = cookie_v4_init_sequence(sk, skb, &req->mss);
+ struct tcphdr *th;
+
+ tcp_synq_overflow(sk);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
+ th = tcp_hdr(skb);
+ isn = __cookie_v4_init_sequence(saddr, daddr, th->source,
+ th->dest, ntohl(th->seq),
+ &req->mss);
req->cookie_ts = tmp_opt.tstamp_ok;
} else if (!isn) {
struct inet_peer *peer = NULL;
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 413ed24..fd8ad8c 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -560,6 +560,23 @@ config NETFILTER_XT_TARGET_SECMARK
To compile it as a module, choose M here. If unsure, say N.
+config NETFILTER_XT_TARGET_SYNPROXY
+ tristate '"SYNPROXY" target support (EXPERIMENTAL)'
+ depends on EXPERIMENTAL
+ depends on SYN_COOKIES
+ depends on IP_NF_RAW
+ depends on NF_CONNTRACK
+ depends on NETFILTER_ADVANCED
+ help
+ The SYNPROXY target allows a raw rule to specify that some TCP
+ connections are relayed to protect the TCP servers from the SYN-flood
+ DoS attacks. Syn cookies is used to save the initial state, so no
+ conntrack is needed until the client side connection is established.
+ It frees the connection tracking system from creating/deleting
+ conntracks when SYN-flood DoS attack acts.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NETFILTER_XT_TARGET_TCPMSS
tristate '"TCPMSS" target support'
depends on (IPV6 || IPV6=n)
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index e28420a..4e32834 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -62,6 +62,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP) += xt_TCPOPTSTRIP.o
obj-$(CONFIG_NETFILTER_XT_TARGET_TEE) += xt_TEE.o
obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o
obj-$(CONFIG_NETFILTER_XT_TARGET_IDLETIMER) += xt_IDLETIMER.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_SYNPROXY) += xt_SYNPROXY.o
# matches
obj-$(CONFIG_NETFILTER_XT_MATCH_CLUSTER) += xt_cluster.o
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 16b41b4..dd85d6f 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -800,6 +800,26 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
return ct;
}
+static inline unsigned int syn_proxy_pre_call(int protonum, struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ unsigned int ret = NF_ACCEPT;
+#if defined(CONFIG_NETFILTER_XT_TARGET_SYNPROXY) || \
+ defined(CONFIG_NETFILTER_XT_TARGET_SYNPROXY_MODULE)
+ unsigned int (*syn_proxy)(struct sk_buff *, struct nf_conn *,
+ enum ip_conntrack_info);
+
+ if (protonum == IPPROTO_TCP) {
+ syn_proxy = rcu_dereference(syn_proxy_pre_hook);
+ if (syn_proxy)
+ ret = syn_proxy(skb, ct, ctinfo);
+ }
+#endif
+
+ return ret;
+}
+
unsigned int
nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
struct sk_buff *skb)
@@ -855,8 +875,9 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
l3proto, l4proto, &set_reply, &ctinfo);
if (!ct) {
/* Not valid part of a connection */
- NF_CT_STAT_INC_ATOMIC(net, invalid);
- ret = NF_ACCEPT;
+ ret = syn_proxy_pre_call(protonum, skb, NULL, ctinfo);
+ if (ret == NF_ACCEPT)
+ NF_CT_STAT_INC_ATOMIC(net, invalid);
goto out;
}
@@ -869,6 +890,9 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
NF_CT_ASSERT(skb->nfct);
+ ret = syn_proxy_pre_call(protonum, skb, ct, ctinfo);
+ if (ret != NF_ACCEPT)
+ goto out;
ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum);
if (ret <= 0) {
/* Invalid: inverse of the return code tells
@@ -1476,6 +1500,17 @@ s16 (*nf_ct_nat_offset)(const struct nf_conn *ct,
u32 seq);
EXPORT_SYMBOL_GPL(nf_ct_nat_offset);
+#if defined(CONFIG_NETFILTER_XT_TARGET_SYNPROXY) || \
+ defined(CONFIG_NETFILTER_XT_TARGET_SYNPROXY_MODULE)
+unsigned int (*syn_proxy_pre_hook)(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo);
+EXPORT_SYMBOL(syn_proxy_pre_hook);
+
+unsigned int (*syn_proxy_post_hook)(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo);
+EXPORT_SYMBOL(syn_proxy_post_hook);
+#endif
+
int nf_conntrack_init(struct net *net)
{
int ret;
@@ -1496,6 +1531,12 @@ int nf_conntrack_init(struct net *net)
/* Howto get NAT offsets */
rcu_assign_pointer(nf_ct_nat_offset, NULL);
+
+#if defined(CONFIG_NETFILTER_XT_TARGET_SYNPROXY) || \
+ defined(CONFIG_NETFILTER_XT_TARGET_SYNPROXY_MODULE)
+ rcu_assign_pointer(syn_proxy_pre_hook, NULL);
+ rcu_assign_pointer(syn_proxy_post_hook, NULL);
+#endif
}
return 0;
diff --git a/net/netfilter/xt_SYNPROXY.c b/net/netfilter/xt_SYNPROXY.c
new file mode 100644
index 0000000..1a55f33
--- /dev/null
+++ b/net/netfilter/xt_SYNPROXY.c
@@ -0,0 +1,679 @@
+/* (C) 2010- Changli Gao <xiaosuo@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * It bases on ipt_REJECT.c
+ */
+#define pr_fmt(fmt) "SYNPROXY: " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <linux/unaligned/access_ok.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/route.h>
+#include <net/dst.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Changli Gao <xiaosuo@gmail.com>");
+MODULE_DESCRIPTION("Xtables: \"SYNPROXY\" target for IPv4");
+MODULE_ALIAS("ipt_SYNPROXY");
+
+enum {
+ TCP_SEND_FLAG_NOTRACE = 0x1,
+ TCP_SEND_FLAG_SYNCOOKIE = 0x2,
+ TCP_SEND_FLAG_ACK2SYN = 0x4,
+};
+
+struct syn_proxy_state {
+ u16 seq_inited;
+ __be16 window;
+ u32 seq_diff;
+};
+
+static int get_mtu(const struct dst_entry *dst)
+{
+ int mtu;
+
+ mtu = dst_mtu(dst);
+ if (mtu)
+ return mtu;
+
+ return dst->dev ? dst->dev->mtu : 0;
+}
+
+static int get_advmss(const struct dst_entry *dst)
+{
+ int advmss;
+
+ advmss = dst_metric(dst, RTAX_ADVMSS);
+ if (advmss)
+ return advmss;
+ advmss = get_mtu(dst);
+ if (advmss)
+ return advmss - (sizeof(struct iphdr) + sizeof(struct tcphdr));
+
+ return TCP_MSS_DEFAULT;
+}
+
+static int syn_proxy_route(struct sk_buff *skb, struct net *net, u16 *pmss)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct rtable *rt;
+ struct flowi fl = {};
+ unsigned int type;
+ int flags = 0;
+ int err;
+ u16 mss;
+
+ type = inet_addr_type(net, iph->saddr);
+ if (type != RTN_LOCAL) {
+ type = inet_addr_type(net, iph->daddr);
+ if (type == RTN_LOCAL)
+ flags |= FLOWI_FLAG_ANYSRC;
+ }
+
+ if (type == RTN_LOCAL) {
+ fl.nl_u.ip4_u.daddr = iph->daddr;
+ fl.nl_u.ip4_u.saddr = iph->saddr;
+ fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
+ fl.flags = flags;
+ err = ip_route_output_key(net, &rt, &fl);
+ if (err)
+ goto out;
+
+ skb_dst_set(skb, &rt->dst);
+ } else {
+ /* non-local src, find valid iif to satisfy
+ * rp-filter when calling ip_route_input. */
+ fl.nl_u.ip4_u.daddr = iph->saddr;
+ err = ip_route_output_key(net, &rt, &fl);
+ if (err)
+ goto out;
+
+ err = ip_route_input(skb, iph->daddr, iph->saddr,
+ RT_TOS(iph->tos), rt->dst.dev);
+ if (err) {
+ dst_release(&rt->dst);
+ goto out;
+ }
+ if (pmss) {
+ mss = get_advmss(&rt->dst);
+ if (*pmss > mss)
+ *pmss = mss;
+ }
+ dst_release(&rt->dst);
+ }
+
+ err = skb_dst(skb)->error;
+ if (!err && pmss) {
+ mss = get_advmss(skb_dst(skb));
+ if (*pmss > mss)
+ *pmss = mss;
+ }
+
+out:
+ return err;
+}
+
+static int tcp_send(__be32 src, __be32 dst, __be16 sport, __be16 dport,
+ u32 seq, u32 ack_seq, __be16 window, u16 mss, u8 tcp_flags,
+ u8 tos, struct net_device *dev, int flags,
+ struct sk_buff *oskb)
+{
+ struct sk_buff *skb;
+ struct iphdr *iph;
+ struct tcphdr *th;
+ int err, len;
+
+ len = sizeof(*th);
+ if (mss)
+ len += TCPOLEN_MSS;
+
+ skb = NULL;
+ /* caller must give me a large enough oskb */
+ if (oskb) {
+ unsigned char *odata = oskb->data;
+
+ if (skb_recycle_check(oskb, 0)) {
+ oskb->data = odata;
+ skb_reset_tail_pointer(oskb);
+ skb = oskb;
+ pr_debug("recycle skb\n");
+ }
+ }
+ if (!skb) {
+ skb = alloc_skb(LL_MAX_HEADER + sizeof(*iph) + len, GFP_ATOMIC);
+ if (!skb) {
+ err = -ENOMEM;
+ goto out;
+ }
+ skb_reserve(skb, LL_MAX_HEADER);
+ }
+
+ skb_reset_network_header(skb);
+ if (!(flags & TCP_SEND_FLAG_ACK2SYN) || skb != oskb) {
+ iph = (struct iphdr *)skb_put(skb, sizeof(*iph));
+ iph->version = 4;
+ iph->ihl = sizeof(*iph) / 4;
+ iph->tos = tos;
+ /* tot_len is set in ip_local_out() */
+ iph->id = 0;
+ iph->frag_off = htons(IP_DF);
+ iph->protocol = IPPROTO_TCP;
+ iph->saddr = src;
+ iph->daddr = dst;
+ th = (struct tcphdr *)skb_put(skb, len);
+ th->source = sport;
+ th->dest = dport;
+ } else {
+ iph = (struct iphdr *)skb->data;
+ iph->id = 0;
+ iph->frag_off = htons(IP_DF);
+ skb_put(skb, iph->ihl * 4 + len);
+ th = (struct tcphdr *)(skb->data + iph->ihl * 4);
+ }
+
+ th->seq = htonl(seq);
+ th->ack_seq = htonl(ack_seq);
+ tcp_flag_byte(th) = tcp_flags;
+ th->doff = len / 4;
+ th->window = window;
+ th->urg_ptr = 0;
+
+ skb->protocol = htons(ETH_P_IP);
+ if ((flags & TCP_SEND_FLAG_SYNCOOKIE) && mss)
+ err = syn_proxy_route(skb, dev_net(dev), &mss);
+ else
+ err = syn_proxy_route(skb, dev_net(dev), NULL);
+ if (err)
+ goto err_out;
+
+ if ((flags & TCP_SEND_FLAG_SYNCOOKIE)) {
+ if (mss) {
+ th->seq = htonl(__cookie_v4_init_sequence(dst, src,
+ dport, sport,
+ ack_seq - 1,
+ &mss));
+ } else {
+ mss = TCP_MSS_DEFAULT;
+ th->seq = htonl(__cookie_v4_init_sequence(dst, src,
+ dport, sport,
+ ack_seq - 1,
+ &mss));
+ mss = 0;
+ }
+ }
+
+ if (mss)
+ * (__force __be32 *)(th + 1) = htonl((TCPOPT_MSS << 24) |
+ (TCPOLEN_MSS << 16) |
+ mss);
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ th->check = ~tcp_v4_check(len, src, dst, 0);
+ skb->csum_start = (unsigned char *)th - skb->head;
+ skb->csum_offset = offsetof(struct tcphdr, check);
+
+ if (!(flags & TCP_SEND_FLAG_ACK2SYN) || skb != oskb)
+ iph->ttl = dst_metric(skb_dst(skb), RTAX_HOPLIMIT);
+
+ if (skb->len > get_mtu(skb_dst(skb))) {
+ if (printk_ratelimit())
+ pr_warning("%s has smaller mtu: %d\n",
+ skb_dst(skb)->dev->name,
+ get_mtu(skb_dst(skb)));
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ if ((flags & TCP_SEND_FLAG_NOTRACE)) {
+ skb->nfct = &nf_ct_untracked_get()->ct_general;
+ skb->nfctinfo = IP_CT_NEW;
+ nf_conntrack_get(skb->nfct);
+ }
+
+ pr_debug("ip_local_out: %pI4n:%hu -> %pI4n:%hu (seq=%u, "
+ "ack_seq=%u mss=%hu flags=%hhx)\n", &src, ntohs(th->source),
+ &dst, ntohs(th->dest), ntohl(th->seq), ack_seq, mss,
+ tcp_flags);
+
+ err = ip_local_out(skb);
+ if (err > 0)
+ err = net_xmit_errno(err);
+
+ pr_debug("ip_local_out: return with %d\n", err);
+out:
+ if (oskb && oskb != skb)
+ kfree_skb(oskb);
+
+ return err;
+
+err_out:
+ kfree_skb(skb);
+ goto out;
+}
+
+static int get_mss(u8 *data, int len)
+{
+ u8 olen;
+
+ while (len >= TCPOLEN_MSS) {
+ switch (data[0]) {
+ case TCPOPT_EOL:
+ return 0;
+ case TCPOPT_NOP:
+ data++;
+ len--;
+ break;
+ case TCPOPT_MSS:
+ if (data[1] != TCPOLEN_MSS)
+ return -EINVAL;
+ return get_unaligned_be16(data + 2);
+ default:
+ olen = data[1];
+ if (olen < 2 || olen > len)
+ return -EINVAL;
+ data += olen;
+ len -= olen;
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static DEFINE_PER_CPU(struct syn_proxy_state, syn_proxy_state);
+
+/* syn_proxy_pre isn't under the protection of nf_conntrack_proto_tcp.c */
+static unsigned int syn_proxy_pre(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ struct syn_proxy_state *state;
+ struct iphdr *iph;
+ struct tcphdr *th, _th;
+
+ /* only support IPv4 now */
+ iph = ip_hdr(skb);
+ if (iph->version != 4)
+ return NF_ACCEPT;
+
+ th = skb_header_pointer(skb, iph->ihl * 4, sizeof(_th), &_th);
+ if (th == NULL)
+ return NF_DROP;
+
+ if (!ct || !nf_ct_is_confirmed(ct)) {
+ int ret;
+
+ if (!th->syn && th->ack) {
+ u16 mss;
+ struct sk_buff *rec_skb;
+
+ mss = cookie_v4_check_sequence(iph, th,
+ ntohl(th->ack_seq) - 1);
+ if (!mss)
+ return NF_ACCEPT;
+
+ pr_debug("%pI4n:%hu -> %pI4n:%hu(mss=%hu)\n",
+ &iph->saddr, ntohs(th->source),
+ &iph->daddr, ntohs(th->dest), mss);
+
+ if (skb_tailroom(skb) < TCPOLEN_MSS &&
+ skb->len < iph->ihl * 4 + sizeof(*th) + TCPOLEN_MSS)
+ rec_skb = NULL;
+ else
+ rec_skb = skb;
+
+ local_bh_disable();
+ state = &__get_cpu_var(syn_proxy_state);
+ state->seq_inited = 1;
+ state->window = th->window;
+ state->seq_diff = ntohl(th->ack_seq) - 1;
+ if (rec_skb)
+ tcp_send(iph->saddr, iph->daddr, 0, 0,
+ ntohl(th->seq) - 1, 0, th->window,
+ mss, TCPHDR_SYN, 0, skb->dev,
+ TCP_SEND_FLAG_ACK2SYN, rec_skb);
+ else
+ tcp_send(iph->saddr, iph->daddr, th->source,
+ th->dest, ntohl(th->seq) - 1, 0,
+ th->window, mss, TCPHDR_SYN,
+ iph->tos, skb->dev, 0, NULL);
+ state->seq_inited = 0;
+ local_bh_enable();
+
+ if (!rec_skb)
+ kfree_skb(skb);
+
+ return NF_STOLEN;
+ }
+
+ if (!ct || !th->syn || th->ack)
+ return NF_ACCEPT;
+
+ ret = NF_ACCEPT;
+ local_bh_disable();
+ state = &__get_cpu_var(syn_proxy_state);
+ if (state->seq_inited) {
+ struct syn_proxy_state *nstate;
+
+ nstate = nf_ct_ext_add(ct, NF_CT_EXT_SYNPROXY,
+ GFP_ATOMIC);
+ if (nstate != NULL) {
+ nstate->seq_inited = 0;
+ nstate->window = state->window;
+ nstate->seq_diff = state->seq_diff;
+ pr_debug("seq_diff: %u\n", nstate->seq_diff);
+ } else {
+ ret = NF_DROP;
+ }
+ }
+ local_bh_enable();
+
+ return ret;
+ }
+
+ state = nf_ct_ext_find(ct, NF_CT_EXT_SYNPROXY);
+ if (!state)
+ return NF_ACCEPT;
+
+ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
+ __be32 newack;
+
+ /* don't need to mangle duplicate SYN packets */
+ if (th->syn && !th->ack)
+ return NF_ACCEPT;
+ if (!skb_make_writable(skb, ip_hdrlen(skb) + sizeof(*th)))
+ return NF_DROP;
+ th = (struct tcphdr *)(skb->data + ip_hdrlen(skb));
+ newack = htonl(ntohl(th->ack_seq) - state->seq_diff);
+ inet_proto_csum_replace4(&th->check, skb, th->ack_seq, newack,
+ 0);
+ pr_debug("alter ack seq: %u -> %u\n",
+ ntohl(th->ack_seq), ntohl(newack));
+ th->ack_seq = newack;
+ } else {
+ /* Simultaneous open ? Oh, no. The connection between
+ * client and us is established. */
+ if (th->syn && !th->ack)
+ return NF_DROP;
+ }
+
+ return NF_ACCEPT;
+}
+
+static unsigned int syn_proxy_mangle_pkt(struct sk_buff *skb, struct iphdr *iph,
+ struct tcphdr *th, u32 seq_diff)
+{
+ __be32 new;
+ int olen;
+
+ if (skb->len < (iph->ihl + th->doff) * 4)
+ return NF_DROP;
+ if (!skb_make_writable(skb, (iph->ihl + th->doff) * 4))
+ return NF_DROP;
+ iph = (struct iphdr *)(skb->data);
+ th = (struct tcphdr *)(skb->data + iph->ihl * 4);
+
+ new = tcp_flag_word(th) & (~TCP_FLAG_SYN);
+ inet_proto_csum_replace4(&th->check, skb, tcp_flag_word(th), new, 0);
+ tcp_flag_word(th) = new;
+
+ new = htonl(ntohl(th->seq) + seq_diff);
+ inet_proto_csum_replace4(&th->check, skb, th->seq, new, 0);
+ pr_debug("alter seq: %u -> %u\n", ntohl(th->seq), ntohl(new));
+ th->seq = new;
+
+ olen = th->doff - sizeof(*th) / 4;
+ if (olen) {
+ __be32 *opt;
+
+ opt = (__force __be32 *)(th + 1);
+#define TCPOPT_EOL_WORD ((TCPOPT_EOL << 24) + (TCPOPT_EOL << 16) + \
+ (TCPOPT_EOL << 8) + TCPOPT_EOL)
+ inet_proto_csum_replace4(&th->check, skb, *opt, TCPOPT_EOL_WORD,
+ 0);
+ *opt = TCPOPT_EOL_WORD;
+ }
+
+ return NF_ACCEPT;
+}
+
+static unsigned int syn_proxy_post(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ struct syn_proxy_state *state;
+ struct iphdr *iph;
+ struct tcphdr *th;
+
+ /* untraced packets don't have NF_CT_EXT_SYNPROXY ext, as they don't
+ * enter syn_proxy_pre() */
+ state = nf_ct_ext_find(ct, NF_CT_EXT_SYNPROXY);
+ if (state == NULL)
+ return NF_ACCEPT;
+
+ iph = ip_hdr(skb);
+ if (!skb_make_writable(skb, iph->ihl * 4 + sizeof(*th)))
+ return NF_DROP;
+ th = (struct tcphdr *)(skb->data + iph->ihl * 4);
+ if (!state->seq_inited) {
+ if (th->syn) {
+ /* It must be from original direction, as the ones
+ * from the other side are dropped in function
+ * syn_proxy_pre() */
+ if (!th->ack)
+ return NF_ACCEPT;
+
+ pr_debug("SYN-ACK %pI4n:%hu -> %pI4n:%hu "
+ "(seq=%u ack_seq=%u)\n",
+ &iph->saddr, ntohs(th->source), &iph->daddr,
+ ntohs(th->dest), ntohl(th->seq),
+ ntohl(th->ack_seq));
+
+ /* SYN-ACK from reply direction with the protection
+ * of conntrack */
+ spin_lock_bh(&ct->lock);
+ if (!state->seq_inited) {
+ state->seq_inited = 1;
+ pr_debug("update seq_diff %u -> %u\n",
+ state->seq_diff,
+ state->seq_diff - ntohl(th->seq));
+ state->seq_diff -= ntohl(th->seq);
+ }
+ spin_unlock_bh(&ct->lock);
+ tcp_send(iph->daddr, iph->saddr, th->dest, th->source,
+ ntohl(th->ack_seq),
+ ntohl(th->seq) + 1 + state->seq_diff,
+ state->window, 0, TCPHDR_ACK, iph->tos,
+ skb->dev, 0, NULL);
+
+ return syn_proxy_mangle_pkt(skb, iph, th,
+ state->seq_diff + 1);
+ } else {
+ __be32 newseq;
+
+ if (!th->rst)
+ return NF_ACCEPT;
+ newseq = htonl(state->seq_diff + 1);
+ inet_proto_csum_replace4(&th->check, skb, th->seq,
+ newseq, 0);
+ pr_debug("alter RST seq: %u -> %u\n",
+ ntohl(th->seq), ntohl(newseq));
+ th->seq = newseq;
+
+ return NF_ACCEPT;
+ }
+ }
+
+ /* ct should be in ESTABLISHED state, but if the ack packets from
+ * us are lost. */
+ if (th->syn) {
+ if (!th->ack)
+ return NF_ACCEPT;
+
+ tcp_send(iph->daddr, iph->saddr, th->dest, th->source,
+ ntohl(th->ack_seq),
+ ntohl(th->seq) + 1 + state->seq_diff,
+ state->window, 0, TCPHDR_ACK, iph->tos,
+ skb->dev, 0, NULL);
+
+ return syn_proxy_mangle_pkt(skb, iph, th, state->seq_diff + 1);
+ }
+
+ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) {
+ __be32 newseq;
+
+ newseq = htonl(ntohl(th->seq) + state->seq_diff);
+ inet_proto_csum_replace4(&th->check, skb, th->seq, newseq, 0);
+ pr_debug("alter seq: %u -> %u\n", ntohl(th->seq),
+ ntohl(newseq));
+ th->seq = newseq;
+ }
+
+ return NF_ACCEPT;
+}
+
+static unsigned int tcp_process(struct sk_buff *skb)
+{
+ const struct iphdr *iph;
+ const struct tcphdr *th;
+ int err;
+ u16 mss;
+
+ iph = ip_hdr(skb);
+ if (iph->frag_off & htons(IP_OFFSET))
+ goto out;
+ if (!pskb_may_pull(skb, iph->ihl * 4 + sizeof(*th)))
+ goto out;
+ th = (const struct tcphdr *)(skb->data + iph->ihl * 4);
+ if ((tcp_flag_byte(th) &
+ (TCPHDR_FIN | TCPHDR_RST | TCPHDR_ACK | TCPHDR_SYN)) != TCPHDR_SYN)
+ goto out;
+
+ if (nf_ip_checksum(skb, NF_INET_PRE_ROUTING, iph->ihl * 4, IPPROTO_TCP))
+ goto out;
+ mss = 0;
+ if (th->doff > sizeof(*th) / 4) {
+ if (!pskb_may_pull(skb, (iph->ihl + th->doff) * 4))
+ goto out;
+ err = get_mss((u8 *)(th + 1), th->doff * 4 - sizeof(*th));
+ if (err < 0)
+ goto out;
+ if (err != 0)
+ mss = err;
+ } else if (th->doff != sizeof(*th) / 4)
+ goto out;
+
+ tcp_send(iph->daddr, iph->saddr, th->dest, th->source, 0,
+ ntohl(th->seq) + 1, 0, mss, TCPHDR_SYN | TCPHDR_ACK,
+ iph->tos, skb->dev,
+ TCP_SEND_FLAG_NOTRACE | TCP_SEND_FLAG_SYNCOOKIE, skb);
+
+ return NF_STOLEN;
+
+out:
+ return NF_DROP;
+}
+
+static unsigned int synproxy_tg(struct sk_buff *skb,
+ const struct xt_action_param *par)
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ int ret;
+
+ /* received from lo */
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct)
+ return IPT_CONTINUE;
+
+ local_bh_disable();
+ if (!__get_cpu_var(syn_proxy_state).seq_inited)
+ ret = tcp_process(skb);
+ else
+ ret = IPT_CONTINUE;
+ local_bh_enable();
+
+ return ret;
+}
+
+static int synproxy_tg_check(const struct xt_tgchk_param *par)
+{
+ int ret;
+
+ ret = nf_ct_l3proto_try_module_get(par->family);
+ if (ret < 0)
+ pr_info("cannot load conntrack support for proto=%u\n",
+ par->family);
+
+ return ret;
+}
+
+static void synproxy_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_ct_l3proto_module_put(par->family);
+}
+
+static struct xt_target synproxy_tg_reg __read_mostly = {
+ .name = "SYNPROXY",
+ .family = NFPROTO_IPV4,
+ .target = synproxy_tg,
+ .table = "raw",
+ .hooks = 1 << NF_INET_PRE_ROUTING,
+ .proto = IPPROTO_TCP,
+ .checkentry = synproxy_tg_check,
+ .destroy = synproxy_tg_destroy,
+ .me = THIS_MODULE,
+};
+
+static struct nf_ct_ext_type syn_proxy_state_ext __read_mostly = {
+ .len = sizeof(struct syn_proxy_state),
+ .align = __alignof__(struct syn_proxy_state),
+ .id = NF_CT_EXT_SYNPROXY,
+};
+
+static int __init synproxy_tg_init(void)
+{
+ int err;
+
+ rcu_assign_pointer(syn_proxy_pre_hook, syn_proxy_pre);
+ rcu_assign_pointer(syn_proxy_post_hook, syn_proxy_post);
+ err = nf_ct_extend_register(&syn_proxy_state_ext);
+ if (err)
+ goto err_out;
+ err = xt_register_target(&synproxy_tg_reg);
+ if (err)
+ goto err_out2;
+
+ return err;
+
+err_out2:
+ nf_ct_extend_unregister(&syn_proxy_state_ext);
+err_out:
+ rcu_assign_pointer(syn_proxy_post_hook, NULL);
+ rcu_assign_pointer(syn_proxy_pre_hook, NULL);
+ rcu_barrier();
+
+ return err;
+}
+
+static void __exit synproxy_tg_exit(void)
+{
+ xt_unregister_target(&synproxy_tg_reg);
+ nf_ct_extend_unregister(&syn_proxy_state_ext);
+ rcu_assign_pointer(syn_proxy_post_hook, NULL);
+ rcu_assign_pointer(syn_proxy_pre_hook, NULL);
+ rcu_barrier();
+}
+
+module_init(synproxy_tg_init);
+module_exit(synproxy_tg_exit);
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox