* [PATCHv3] extensions: libxt_CHECKSUM extension
From: Michael S. Tsirkin @ 2010-07-11 15:14 UTC (permalink / raw)
To: Patrick McHardy, David S. Miller, Jan Engelhardt, Randy Dunlap,
netfilter-devel
In-Reply-To: <20100711150623.GA7420@redhat.com>
This adds a `CHECKSUM' target, which can be used in the iptables mangle
table.
You can use this target to compute and fill in the checksum in
a packet that lacks a checksum. This is particularly useful,
if you need to work around old applications such as dhcp clients,
that do not work well with checksum offloads, but don't want to disable
checksum offload in your device.
The problem happens in the field with virtualized applications.
For reference, see Red Hat bz 605555, as well as
http://www.spinics.net/lists/kvm/msg37660.html
Typical expected use (helps old dhclient binary running in a VM):
iptables -A POSTROUTING -t mangle -p udp --dport bootpc \
-j CHECKSUM --checksum-fill
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
Correction in the documentation. Sorry about the noise.
Changes from v2:
updated man file
Changes from v1:
switched from ipt to xt
extensions/libxt_CHECKSUM.c | 99 +++++++++++++++++++++++++++++++++++++++++
extensions/libxt_CHECKSUM.man | 8 +++
2 files changed, 107 insertions(+), 0 deletions(-)
create mode 100644 extensions/libxt_CHECKSUM.c
create mode 100644 extensions/libxt_CHECKSUM.man
diff --git a/extensions/libxt_CHECKSUM.c b/extensions/libxt_CHECKSUM.c
new file mode 100644
index 0000000..00fbd8f
--- /dev/null
+++ b/extensions/libxt_CHECKSUM.c
@@ -0,0 +1,99 @@
+/* Shared library add-on to xtables for CHECKSUM
+ *
+ * (C) 2002 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2010 by Red Hat, Inc
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This program is distributed under the terms of GNU GPL v2, 1991
+ *
+ * libxt_CHECKSUM.c borrowed some bits from libipt_ECN.c
+ *
+ * $Id$
+ */
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <getopt.h>
+
+#include <xtables.h>
+#include <linux/netfilter/xt_CHECKSUM.h>
+
+static void CHECKSUM_help(void)
+{
+ printf(
+"CHECKSUM target options\n"
+" --checksum-fill Fill in packet checksum.\n");
+}
+
+static const struct option CHECKSUM_opts[] = {
+ { "checksum-fill", 0, NULL, 'F' },
+ { .name = NULL }
+};
+
+static int CHECKSUM_parse(int c, char **argv, int invert, unsigned int *flags,
+ const void *entry, struct xt_entry_target **target)
+{
+ struct xt_CHECKSUM_info *einfo
+ = (struct xt_CHECKSUM_info *)(*target)->data;
+
+ switch (c) {
+ case 'F':
+ if (*flags)
+ xtables_error(PARAMETER_PROBLEM,
+ "CHECKSUM target: Only use --checksum-fill ONCE!");
+ einfo->operation = XT_CHECKSUM_OP_FILL;
+ *flags |= XT_CHECKSUM_OP_FILL;
+ break;
+ default:
+ return 0;
+ }
+
+ return 1;
+}
+
+static void CHECKSUM_check(unsigned int flags)
+{
+ if (!flags)
+ xtables_error(PARAMETER_PROBLEM,
+ "CHECKSUM target: Parameter --checksum-fill is required");
+}
+
+static void CHECKSUM_print(const void *ip, const struct xt_entry_target *target,
+ int numeric)
+{
+ const struct xt_CHECKSUM_info *einfo =
+ (const struct xt_CHECKSUM_info *)target->data;
+
+ printf("CHECKSUM ");
+
+ if (einfo->operation & XT_CHECKSUM_OP_FILL)
+ printf("fill ");
+}
+
+static void CHECKSUM_save(const void *ip, const struct xt_entry_target *target)
+{
+ const struct xt_CHECKSUM_info *einfo =
+ (const struct xt_CHECKSUM_info *)target->data;
+
+ if (einfo->operation & XT_CHECKSUM_OP_FILL)
+ printf("--checksum-fill ");
+}
+
+static struct xtables_target checksum_tg_reg = {
+ .name = "CHECKSUM",
+ .version = XTABLES_VERSION,
+ .family = NFPROTO_UNSPEC,
+ .size = XT_ALIGN(sizeof(struct xt_CHECKSUM_info)),
+ .userspacesize = XT_ALIGN(sizeof(struct xt_CHECKSUM_info)),
+ .help = CHECKSUM_help,
+ .parse = CHECKSUM_parse,
+ .final_check = CHECKSUM_check,
+ .print = CHECKSUM_print,
+ .save = CHECKSUM_save,
+ .extra_opts = CHECKSUM_opts,
+};
+
+void _init(void)
+{
+ xtables_register_target(&checksum_tg_reg);
+}
diff --git a/extensions/libxt_CHECKSUM.man b/extensions/libxt_CHECKSUM.man
new file mode 100644
index 0000000..92ae700
--- /dev/null
+++ b/extensions/libxt_CHECKSUM.man
@@ -0,0 +1,8 @@
+This target allows to selectively work around broken/old applications.
+It can only be used in the mangle table.
+.TP
+\fB\-\-checksum\-fill\fP
+Compute and fill in the checksum in a packet that lacks a checksum.
+This is particularly useful, if you need to work around old applications
+such as dhcp clients, that do not work well with checksum offloads,
+but don't want to disable checksum offload in your device.
--
1.7.2.rc0.14.g41c1c
^ permalink raw reply related
* Re: oops in tcp_xmit_retransmit_queue() w/ v2.6.32.15
From: Ilpo Järvinen @ 2010-07-11 16:09 UTC (permalink / raw)
To: Tejun Heo
Cc: David S. Miller, lkml, netdev@vger.kernel.org, Fehrmann, Henning,
Carsten Aulbert, Eric Dumazet
In-Reply-To: <4C358AAA.9080400@kernel.org>
On Thu, 8 Jul 2010, Tejun Heo wrote:
> We've been seeing oops in tcp_xmit_retransmit_queue() w/ 2.6.32.15.
> Please see the attached photoshoot. This is happening on a HPC
> cluster and very interestingly caused by one particular job. How long
> it takes isn't clear yet (at least more than a day) but when it
> happens it happens on a lot of machines in relatively short time.
>
> With a bit of disassemblying, I've found that the oops is happening
> during tcp_for_write_queue_from() because the skb->next points to
> NULL.
>
> void tcp_xmit_retransmit_queue(struct sock *sk)
> {
> ...
> if (tp->retransmit_skb_hint) {
> skb = tp->retransmit_skb_hint;
> last_lost = TCP_SKB_CB(skb)->end_seq;
> if (after(last_lost, tp->retransmit_high))
> last_lost = tp->retransmit_high;
> } else {
> skb = tcp_write_queue_head(sk);
> last_lost = tp->snd_una;
> }
>
> => tcp_for_write_queue_from(skb, sk) {
> __u8 sacked = TCP_SKB_CB(skb)->sacked;
>
> if (skb == tcp_send_head(sk))
> break;
> /* we could do better than to assign each time */
> if (hole == NULL)
>
> This can happen for one of the following reasons,
>
> 1. tp->retransmit_skb_hint is NULL and tcp_write_queue_head() is NULL
> too. ie. tcp_xmit_retransmit_queue() is called on an empty write
> queue for some reason.
>
> 2. tp->retransmit_skb_hint is pointing to a skb which is not on the
> write_queue. ie. somebody forgot to update hint while removing the
> skb from the write queue.
Once again I've read the unlinkers through, and only thing that could
cause this is tcp_send_synack (others do deal with the hints) but I think
Eric already proposed a patch to that but we never got anywhere due to
some counterargument why it wouldn't take place (too far away for me to
remember, see archives about the discussions). ...But if you want be dead
sure some WARN_ON there might not hurt. Also the purging of the whole
queue was a similar suspect I then came across (but that would only
materialize with sk reuse happening e.g., with nfs which the other guys
weren't using).
> 3. The hint is pointing to a skb on the list but the list itself is
> corrupt.
>
> I added some debug code and the crash is happening when
> tp->retransmit_skb_hint is not NULL but tp->retransmit_skb_hint->next
> is NULL. So, #1 is out; unfortunately, I didn't have debug code in
> place to discern between #2 and #3.
>
> Does anything ring a bell? This is a production system and debugging
> affects quite a number of people. I can put debug code in to discern
> between #2 and #3 but I'm basically shooting in the dark and it would
> be great if someone has a better idea.
Thanks for taking this up. I've been kind of waiting somebody to show up
who actually has some way of reproducing it. Once I had one guy in the
hook but his ability to reproduce was for some reason lost when he tried
with a debug patch [1].
I now realize that the debug patch should probably also print the write
queue too when the problem is caught in order to discern the cases you
mention.
Something along these lines:
tcp_for_write_queue(skb, sk) {
printk("skb %p (%u-%u) next %p prev %p sacked %u\n", ...);
}
Anyway, my debugging patch should be such that in a lucky case it avoids
crashing the system too, though price to pay might then be a stuck
connection. In case #3 I'd expect the box to die elsewhere in TCP code
pretty soon anyway so it depends whether avoiding oops is really so
useful, but if you're lucky other mechanism in TCP will recover
the lost one for you (basically RTO driven retransmission).
--
i.
[1] http://marc.info/?l=linux-kernel&m=126624014117610&w=2
^ permalink raw reply
* RE: [PATCH] bnx2x: add support for receive hashing
From: Eric Dumazet @ 2010-07-11 16:45 UTC (permalink / raw)
To: Vladislav Zolotarov
Cc: David Miller, therbert@google.com, netdev@vger.kernel.org
In-Reply-To: <8628FE4E7912BF47A96AE7DD7BAC0AADDDE64704FA@SJEXCHCCR02.corp.ad.broadcom.com>
Le dimanche 11 juillet 2010 à 06:16 -0700, Vladislav Zolotarov a écrit :
> Dave, it's obvious that there a demand for a new HW/FW configuration
> from our side - "rx hash enable" which is currently tightly coupled
> with the RSS capability. As long as RSS flow in our FW includes a few
> more things apart from just creating an RSS hash and as long as there
> are flows (even hypothetical) that demand the RSS hash regardless the
> RSS itself we started to work on separation of these two features from
> FW perspective. This of course will demand a new FW version but once
> we have it we'll be able to be more specific in HW configuration and
> have a cleaner code.
>
> Technically, our FW may provide the Rx HASH always and in a current
> driver configuration this is what it does.
> I wonder if the driver always provides the HW RX HASH in the
> skb->rxhash regardless the value of NETIF_F_RXHASH bit in a
> netdev->features will it cause any harm? If not we can get rid of two
> extra conditionals in the bnx2x_rx_int().
Hi
Please dont top-post on netdev, thanks.
NETIF_RX_HASH bit is needed so that we can disable skb->rxhash from a
particular NIC if we know the hardware provided rxhash is not fulfilling
our needs (We prefer spend some cpu cycles to recompute a software
rxhash).
Software one for example deliver same rxhash values for both ways of a
TCP flow, it can help conntracking for example.
The conditional in driver rx is cheap, since the condition is the same
for all packets and CPU can predicts the branch.
^ permalink raw reply
* Re: oops in tcp_xmit_retransmit_queue() w/ v2.6.32.15
From: Eric Dumazet @ 2010-07-11 17:06 UTC (permalink / raw)
To: Ilpo Järvinen
Cc: Tejun Heo, David S. Miller, lkml, netdev@vger.kernel.org,
Fehrmann, Henning, Carsten Aulbert
In-Reply-To: <alpine.DEB.2.00.1007111825510.15736@melkinpaasi.cs.helsinki.fi>
Le dimanche 11 juillet 2010 à 19:09 +0300, Ilpo Järvinen a écrit :
> On Thu, 8 Jul 2010, Tejun Heo wrote:
>
> > We've been seeing oops in tcp_xmit_retransmit_queue() w/ 2.6.32.15.
> > Please see the attached photoshoot. This is happening on a HPC
> > cluster and very interestingly caused by one particular job. How long
> > it takes isn't clear yet (at least more than a day) but when it
> > happens it happens on a lot of machines in relatively short time.
> >
> > With a bit of disassemblying, I've found that the oops is happening
> > during tcp_for_write_queue_from() because the skb->next points to
> > NULL.
> >
> > void tcp_xmit_retransmit_queue(struct sock *sk)
> > {
> > ...
> > if (tp->retransmit_skb_hint) {
> > skb = tp->retransmit_skb_hint;
> > last_lost = TCP_SKB_CB(skb)->end_seq;
> > if (after(last_lost, tp->retransmit_high))
> > last_lost = tp->retransmit_high;
> > } else {
> > skb = tcp_write_queue_head(sk);
> > last_lost = tp->snd_una;
> > }
> >
> > => tcp_for_write_queue_from(skb, sk) {
> > __u8 sacked = TCP_SKB_CB(skb)->sacked;
> >
> > if (skb == tcp_send_head(sk))
> > break;
> > /* we could do better than to assign each time */
> > if (hole == NULL)
> >
> > This can happen for one of the following reasons,
> >
> > 1. tp->retransmit_skb_hint is NULL and tcp_write_queue_head() is NULL
> > too. ie. tcp_xmit_retransmit_queue() is called on an empty write
> > queue for some reason.
> >
> > 2. tp->retransmit_skb_hint is pointing to a skb which is not on the
> > write_queue. ie. somebody forgot to update hint while removing the
> > skb from the write queue.
>
> Once again I've read the unlinkers through, and only thing that could
> cause this is tcp_send_synack (others do deal with the hints) but I think
> Eric already proposed a patch to that but we never got anywhere due to
> some counterargument why it wouldn't take place (too far away for me to
> remember, see archives about the discussions). ...But if you want be dead
> sure some WARN_ON there might not hurt. Also the purging of the whole
> queue was a similar suspect I then came across (but that would only
> materialize with sk reuse happening e.g., with nfs which the other guys
> weren't using).
>
Hmm.
This sounds familiar to me, but I cannot remember the discussion you
mention or the patch.
Or maybe it was the TCP transaction thing ? (including data in SYN or
SYN-ACK packet)
> > 3. The hint is pointing to a skb on the list but the list itself is
> > corrupt.
> >
> > I added some debug code and the crash is happening when
> > tp->retransmit_skb_hint is not NULL but tp->retransmit_skb_hint->next
> > is NULL. So, #1 is out; unfortunately, I didn't have debug code in
> > place to discern between #2 and #3.
> >
> > Does anything ring a bell? This is a production system and debugging
> > affects quite a number of people. I can put debug code in to discern
> > between #2 and #3 but I'm basically shooting in the dark and it would
> > be great if someone has a better idea.
>
> Thanks for taking this up. I've been kind of waiting somebody to show up
> who actually has some way of reproducing it. Once I had one guy in the
> hook but his ability to reproduce was for some reason lost when he tried
> with a debug patch [1].
>
> I now realize that the debug patch should probably also print the write
> queue too when the problem is caught in order to discern the cases you
> mention.
>
> Something along these lines:
>
> tcp_for_write_queue(skb, sk) {
> printk("skb %p (%u-%u) next %p prev %p sacked %u\n", ...);
> }
>
> Anyway, my debugging patch should be such that in a lucky case it avoids
> crashing the system too, though price to pay might then be a stuck
> connection. In case #3 I'd expect the box to die elsewhere in TCP code
> pretty soon anyway so it depends whether avoiding oops is really so
> useful, but if you're lucky other mechanism in TCP will recover
> the lost one for you (basically RTO driven retransmission).
>
^ permalink raw reply
* Re: [REGRESSION,BISECTED] Panic on ifup
From: George Spelvin @ 2010-07-11 17:09 UTC (permalink / raw)
To: linux, timo.teras; +Cc: davem, netdev
In-Reply-To: <4C39D834.8080206@iki.fi>
> On 07/11/2010 03:38 PM, George Spelvin wrote:
>> No, although /etc/ipec-tools.conf runs setkey. As I said,
>> I was mostly running in single-user mode; a ps axf
>> output is appended.
>>
>> Ah! A discovery! There are rules for the gateway!
>>
>> add <my_ip> <gw_ip> esp 0x200 -E aes-cbc
>> <key>;
>> add <gw_ip> <my_ip> esp 0x300 -E aes-cbc
>> <key>;
>> spdadd <gw_ip> <my_ip> any -P in ipsec
>> esp/transport//use;
>> spdadd <my_ip> <gw_ip> any -P out ipsec
>> esp/transport//use;
> This means optional encryption. Probably something goes wrong
> constructing the bundle which results in encryption not being applied.
> And it would look like xfrm_resolve_and_create_bundle() does not take
> this into account properly. I need to fix it with optional policies.
>
> In the mean while, could confirm if everything works if you change the
> last line to:
> esp/transport//require;
Will do.
This might lead to no traffic if there's something else broken, however
it should not crash. This change is needed only for the 'out' policy, as
the bundles are used only on xmit code paths.
> yup, xfrm_resolve_and_create_bundle looks to be the culprit. I'll try to
> figure out a patch for testing.
> Ok, this is basically what setkey did for you. Looks like it was ran
> twice and you are missing flush and spdflush from setkey, so you get
> duplicates here. Otherwise it's ok.
Um, I am *not* missing flush and spdflush. The entire file, with comments
and blank lines stripped, and some details censored, is:
#!/usr/sbin/setkey -f
flush;
spdflush;
add $LOCALNET.1 $LOCALNET.62 esp 0x200 -E aes-cbc <key redacted>;
add $LOCALNET.62 $LOCALNET.1 esp 0x300 -E aes-cbc <key redacted>;
add $LOCALNET.3 $LOCALNET.62 esp 0x400 -E aes-cbc <key redacted>;
add $LOCALNET.62 $LOCALNET.3 esp 0x500 -E aes-cbc <key redacted>;
spdadd $LOCALNET.1 $LOCALNET.62 any -P in ipsec esp/transport//use;
spdadd $LOCALNET.62 $LOCALNET.1 any -P out ipsec esp/transport//use;
spdadd $LOCALNET.3 $LOCALNET.62 any -P in ipsec esp/transport//use;
spdadd $LOCALNET.62 $LOCALNET.3 any -P out ipsec esp/transport//use;
Anyway, thank you very much!
^ permalink raw reply
* RE: [PATCH] bnx2x: add support for receive hashing
From: Vladislav Zolotarov @ 2010-07-11 17:22 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, therbert@google.com, netdev@vger.kernel.org
In-Reply-To: <1278866713.2538.148.camel@edumazet-laptop>
> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org] On
> Behalf Of Eric Dumazet
> Sent: Sunday, July 11, 2010 7:45 PM
> To: Vladislav Zolotarov
> Cc: David Miller; therbert@google.com; netdev@vger.kernel.org
> Subject: RE: [PATCH] bnx2x: add support for receive hashing
>
> Le dimanche 11 juillet 2010 à 06:16 -0700, Vladislav Zolotarov a écrit :
> > Dave, it's obvious that there a demand for a new HW/FW configuration
> > from our side - "rx hash enable" which is currently tightly coupled
> > with the RSS capability. As long as RSS flow in our FW includes a few
> > more things apart from just creating an RSS hash and as long as there
> > are flows (even hypothetical) that demand the RSS hash regardless the
> > RSS itself we started to work on separation of these two features from
> > FW perspective. This of course will demand a new FW version but once
> > we have it we'll be able to be more specific in HW configuration and
> > have a cleaner code.
> >
> > Technically, our FW may provide the Rx HASH always and in a current
> > driver configuration this is what it does.
> > I wonder if the driver always provides the HW RX HASH in the
> > skb->rxhash regardless the value of NETIF_F_RXHASH bit in a
> > netdev->features will it cause any harm? If not we can get rid of two
> > extra conditionals in the bnx2x_rx_int().
>
> Hi
>
> Please dont top-post on netdev, thanks.
This discussion is directly related to Tom's patch that's why I'm posting on this thread.
>
> NETIF_RX_HASH bit is needed so that we can disable skb->rxhash from a
> particular NIC if we know the hardware provided rxhash is not fulfilling
> our needs (We prefer spend some cpu cycles to recompute a software
> rxhash).
>
> Software one for example deliver same rxhash values for both ways of a
> TCP flow, it can help conntracking for example.
I understand that, in that case the proper implementation would be to check the netdev->features when u decide to calculate the SW rxhash, isn't it?
>
> The conditional in driver rx is cheap, since the condition is the same
> for all packets and CPU can predicts the branch.
Not exactly. Our FW/HW won't provide the rxhash for none-IP packets setting the hash CQE field to zero and clearing the ETH_FAST_PATH_RX_CQE_RSS_HASH_FLG in the CQE statsu_flags (for ARPs for instance). Branch prediction is nice but considering my previous remark why do we need this branch at all?
>
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* FW: NET_NS: unregister_netdevice: waiting for lo to become free (after using openvpn) (was Re: sysfs bug when using tun with network namespaces)
From: Michael Leun @ 2010-07-11 17:29 UTC (permalink / raw)
To: netdev; +Cc: davem, linux-kernel
In-Reply-To: <20100710235323.5336f627@xenia.leun.net>
Hi,
On Sat, 10 Jul 2010 16:52:08 +0200
Michael Leun <lkml20100708@newton.leun.net> wrote:
> # tunctl -u ml -t tap1
> works as expected, but
> # unshare -n /bin/bash
> # tunctl -u ml -t tap1
[bug (no sysfs support for net namespaces at all) solved in 2.6.35-rcX
- I used 2.6.34.1]
Now that we have solved that last one I've another glitch (this time using 2.6.35-rc4):
In an network namespace I can use an tun/tap tunnel through ssh and
when closing that namespace then eveything is fine.
But when using openvpn (also tunnel trough tun/tap) in an network
namespace and then closing that namespace I get:
unregister_netdevice: waiting for lo to become free
[repeated]
Please see the following two examples showing that difference:
# > unshare -n /bin/bash
# > # how to setup veth device pair to get connectivity into namespace not shown here
# > tunctl -u ml -t tap1
# > ssh -o Tunnel=Ethernet -w 1:1 somewhere
[ running some traffic over tap1 not shown here ]
^d # logging out from somewhere
# > tunctl -d tap1
# > exit # logging out from shell in network namespace
Now the veth device pair used automagically vanishes and nothing
from that different network namespace remains - very well.
but
# > unshare -n /bin/bash
# > # how to setup veth device pair to get connectivity into namespace not shown here
# > openvpn --config some.config
[ running some traffic over vpn device not shown here ]
^c # stopping openvpn
# > lsof -i
# > netstat -an
Active Internet connections (servers and established)
Proto Recv-Q Send-Q Local Address Foreign Address State
Active UNIX domain sockets (servers and established)
Proto RefCnt Flags Type State I-Node Path
# > ps ax|grep openvpn|grep -v grep
# > # cannot find anything that suggests there is anything left from that openvpn session
# > exit # logging out from shell in network namespace
Now I get
Jul 10 20:02:36 doris kernel: unregister_netdevice: waiting for lo to become free. Usage count = 3
[repeated]
Now one might say it is fault of openvpn (used OpenVPN 2.1_rc20
i586-suse-linux - the one in openSuSE 11.2 package), openvpn didn't
close some ressource and ssh does fine.
But: should'nt kernel clean up after process when it exits?
And/or: Should'nt kernel clean up if last process in network namespace
exits - there is nothing left which might use that interface?!
Greg KH <greg@kroah.com> wrote:
> Yes, you are correct. Care to resend all of this to the
> network-namespace developer(s) and the netdev mailing list so that the
> correct people are notified so they can fix it all?
[X] done - hopefully, cannot find a particular network namespace
developer in MAINTAINERS or source files. If such a one exists, please
forward.
Thanks.
--
MfG,
Michael Leun
^ permalink raw reply
* RE: [PATCH] bnx2x: add support for receive hashing
From: Eric Dumazet @ 2010-07-11 17:41 UTC (permalink / raw)
To: Vladislav Zolotarov
Cc: David Miller, therbert@google.com, netdev@vger.kernel.org
In-Reply-To: <8628FE4E7912BF47A96AE7DD7BAC0AADDDE6470521@SJEXCHCCR02.corp.ad.broadcom.com>
Le dimanche 11 juillet 2010 à 10:22 -0700, Vladislav Zolotarov a écrit :
>
> > -----Original Message-----
> > From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org] On
> > Behalf Of Eric Dumazet
> > Sent: Sunday, July 11, 2010 7:45 PM
> > To: Vladislav Zolotarov
> > Cc: David Miller; therbert@google.com; netdev@vger.kernel.org
> > Subject: RE: [PATCH] bnx2x: add support for receive hashing
> >
> > Le dimanche 11 juillet 2010 à 06:16 -0700, Vladislav Zolotarov a écrit :
> > > Dave, it's obvious that there a demand for a new HW/FW configuration
> > > from our side - "rx hash enable" which is currently tightly coupled
> > > with the RSS capability. As long as RSS flow in our FW includes a few
> > > more things apart from just creating an RSS hash and as long as there
> > > are flows (even hypothetical) that demand the RSS hash regardless the
> > > RSS itself we started to work on separation of these two features from
> > > FW perspective. This of course will demand a new FW version but once
> > > we have it we'll be able to be more specific in HW configuration and
> > > have a cleaner code.
> > >
> > > Technically, our FW may provide the Rx HASH always and in a current
> > > driver configuration this is what it does.
> > > I wonder if the driver always provides the HW RX HASH in the
> > > skb->rxhash regardless the value of NETIF_F_RXHASH bit in a
> > > netdev->features will it cause any harm? If not we can get rid of two
> > > extra conditionals in the bnx2x_rx_int().
> >
> > Hi
> >
> > Please dont top-post on netdev, thanks.
>
> This discussion is directly related to Tom's patch that's why I'm posting on this thread.
>
Thats not the question. I am saying "dont top-post", not "dont change
the subject"
> >
> > NETIF_RX_HASH bit is needed so that we can disable skb->rxhash from a
> > particular NIC if we know the hardware provided rxhash is not fulfilling
> > our needs (We prefer spend some cpu cycles to recompute a software
> > rxhash).
> >
> > Software one for example deliver same rxhash values for both ways of a
> > TCP flow, it can help conntracking for example.
>
> I understand that, in that case the proper implementation would be to check the netdev->features when u decide to calculate the SW rxhash, isn't it?
>
Nope, please check get_rps_cpus()
We only do :
if (skb->rxhash)
goto got_hash; /* Skip hash computation on packet header */
That means if skb->rxhash is set, we dont force a recompute.
Your suggestion would move a test from device driver into
get_rps_cpus() ?
Given RPS is more targeted to old devices (not able to provide rxhash),
I am not sure it brings anything.
> >
> > The conditional in driver rx is cheap, since the condition is the same
> > for all packets and CPU can predicts the branch.
>
> Not exactly. Our FW/HW won't provide the rxhash for none-IP packets
> setting the hash CQE field to zero and clearing the
> ETH_FAST_PATH_RX_CQE_RSS_HASH_FLG in the CQE statsu_flags (for ARPs
> for instance). Branch prediction is nice but considering my previous
> remark why do we need this branch at all?
Please limit your lines to 70 chars
We want drivers to set skb->rxhash only if allowed to.
If you feel this is bad for your firmware/hardware/driver, dont set
skb->rxhash.
^ permalink raw reply
* Re: oops in tcp_xmit_retransmit_queue() w/ v2.6.32.15
From: Eric Dumazet @ 2010-07-11 17:46 UTC (permalink / raw)
To: Ilpo Järvinen
Cc: Tejun Heo, David S. Miller, lkml, netdev@vger.kernel.org,
Fehrmann, Henning, Carsten Aulbert
In-Reply-To: <1278867977.2538.167.camel@edumazet-laptop>
Le dimanche 11 juillet 2010 à 19:06 +0200, Eric Dumazet a écrit :
> Le dimanche 11 juillet 2010 à 19:09 +0300, Ilpo Järvinen a écrit :
> > On Thu, 8 Jul 2010, Tejun Heo wrote:
> >
> > > We've been seeing oops in tcp_xmit_retransmit_queue() w/ 2.6.32.15.
> > > Please see the attached photoshoot. This is happening on a HPC
> > > cluster and very interestingly caused by one particular job. How long
> > > it takes isn't clear yet (at least more than a day) but when it
> > > happens it happens on a lot of machines in relatively short time.
> > >
> > > With a bit of disassemblying, I've found that the oops is happening
> > > during tcp_for_write_queue_from() because the skb->next points to
> > > NULL.
> > >
> > > void tcp_xmit_retransmit_queue(struct sock *sk)
> > > {
> > > ...
> > > if (tp->retransmit_skb_hint) {
> > > skb = tp->retransmit_skb_hint;
> > > last_lost = TCP_SKB_CB(skb)->end_seq;
> > > if (after(last_lost, tp->retransmit_high))
> > > last_lost = tp->retransmit_high;
> > > } else {
> > > skb = tcp_write_queue_head(sk);
> > > last_lost = tp->snd_una;
> > > }
> > >
> > > => tcp_for_write_queue_from(skb, sk) {
> > > __u8 sacked = TCP_SKB_CB(skb)->sacked;
> > >
> > > if (skb == tcp_send_head(sk))
> > > break;
> > > /* we could do better than to assign each time */
> > > if (hole == NULL)
> > >
> > > This can happen for one of the following reasons,
> > >
> > > 1. tp->retransmit_skb_hint is NULL and tcp_write_queue_head() is NULL
> > > too. ie. tcp_xmit_retransmit_queue() is called on an empty write
> > > queue for some reason.
> > >
> > > 2. tp->retransmit_skb_hint is pointing to a skb which is not on the
> > > write_queue. ie. somebody forgot to update hint while removing the
> > > skb from the write queue.
> >
> > Once again I've read the unlinkers through, and only thing that could
> > cause this is tcp_send_synack (others do deal with the hints) but I think
> > Eric already proposed a patch to that but we never got anywhere due to
> > some counterargument why it wouldn't take place (too far away for me to
> > remember, see archives about the discussions). ...But if you want be dead
> > sure some WARN_ON there might not hurt. Also the purging of the whole
> > queue was a similar suspect I then came across (but that would only
> > materialize with sk reuse happening e.g., with nfs which the other guys
> > weren't using).
> >
>
> Hmm.
>
> This sounds familiar to me, but I cannot remember the discussion you
> mention or the patch.
>
> Or maybe it was the TCP transaction thing ? (including data in SYN or
> SYN-ACK packet)
Hmm, I cannot find where we reset restransmit_skb_hint in
tcp_mtu_probe(), if we call tcp_unlink_write_queue().
if (skb->len <= copy) {
/* We've eaten all the data from this skb.
* Throw it away. */
TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
<<>> tcp_unlink_write_queue(skb, sk);
sk_wmem_free_skb(sk, skb);
} else {
Sorry if this was already discussed. We might add a comment here in anycase ;)
^ permalink raw reply
* RE: [PATCH] bnx2x: add support for receive hashing
From: Vladislav Zolotarov @ 2010-07-11 18:18 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, therbert@google.com, netdev@vger.kernel.org
In-Reply-To: <1278870093.2538.175.camel@edumazet-laptop>
> -----Original Message-----
> From: Eric Dumazet [mailto:eric.dumazet@gmail.com]
> Sent: Sunday, July 11, 2010 8:42 PM
> To: Vladislav Zolotarov
> Cc: David Miller; therbert@google.com; netdev@vger.kernel.org
> Subject: RE: [PATCH] bnx2x: add support for receive hashing
>
> Le dimanche 11 juillet 2010 à 10:22 -0700, Vladislav Zolotarov a écrit :
> >
> > > -----Original Message-----
> > > From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org]
> On
> > > Behalf Of Eric Dumazet
> > > Sent: Sunday, July 11, 2010 7:45 PM
> > > To: Vladislav Zolotarov
> > > Cc: David Miller; therbert@google.com; netdev@vger.kernel.org
> > > Subject: RE: [PATCH] bnx2x: add support for receive hashing
> > >
> > > Le dimanche 11 juillet 2010 à 06:16 -0700, Vladislav Zolotarov a écrit :
> > > > Dave, it's obvious that there a demand for a new HW/FW configuration
> > > > from our side - "rx hash enable" which is currently tightly coupled
> > > > with the RSS capability. As long as RSS flow in our FW includes a few
> > > > more things apart from just creating an RSS hash and as long as there
> > > > are flows (even hypothetical) that demand the RSS hash regardless the
> > > > RSS itself we started to work on separation of these two features from
> > > > FW perspective. This of course will demand a new FW version but once
> > > > we have it we'll be able to be more specific in HW configuration and
> > > > have a cleaner code.
> > > >
> > > > Technically, our FW may provide the Rx HASH always and in a current
> > > > driver configuration this is what it does.
> > > > I wonder if the driver always provides the HW RX HASH in the
> > > > skb->rxhash regardless the value of NETIF_F_RXHASH bit in a
> > > > netdev->features will it cause any harm? If not we can get rid of two
> > > > extra conditionals in the bnx2x_rx_int().
> > >
> > > Hi
> > >
> > > Please dont top-post on netdev, thanks.
> >
> > This discussion is directly related to Tom's patch that's why I'm posting
> on this thread.
> >
>
> Thats not the question. I am saying "dont top-post", not "dont change
> the subject"
>
> > >
> > > NETIF_RX_HASH bit is needed so that we can disable skb->rxhash from a
> > > particular NIC if we know the hardware provided rxhash is not fulfilling
> > > our needs (We prefer spend some cpu cycles to recompute a software
> > > rxhash).
> > >
> > > Software one for example deliver same rxhash values for both ways of a
> > > TCP flow, it can help conntracking for example.
> >
> > I understand that, in that case the proper implementation would be to check
> the netdev->features when u decide to calculate the SW rxhash, isn't it?
> >
>
> Nope, please check get_rps_cpus()
>
> We only do :
>
> if (skb->rxhash)
> goto got_hash; /* Skip hash computation on packet header */
>
> That means if skb->rxhash is set, we dont force a recompute.
Ok. No, prob. In that case we just need to check the netdev->feature in
the bnx2x_rx_int(). Checking on CQE flags is a not needed.
I'll post a patch shortly.
Thanks,
vlad
>
> Your suggestion would move a test from device driver into
> get_rps_cpus() ?
>
> Given RPS is more targeted to old devices (not able to provide rxhash),
> I am not sure it brings anything.
>
> > >
> > > The conditional in driver rx is cheap, since the condition is the same
> > > for all packets and CPU can predicts the branch.
> >
> > Not exactly. Our FW/HW won't provide the rxhash for none-IP packets
> > setting the hash CQE field to zero and clearing the
> > ETH_FAST_PATH_RX_CQE_RSS_HASH_FLG in the CQE statsu_flags (for ARPs
> > for instance). Branch prediction is nice but considering my previous
> > remark why do we need this branch at all?
>
> Please limit your lines to 70 chars
>
> We want drivers to set skb->rxhash only if allowed to.
>
> If you feel this is bad for your firmware/hardware/driver, dont set
skb->rxhash.
>
>
>
^ permalink raw reply
* Re: oops in tcp_xmit_retransmit_queue() w/ v2.6.32.15
From: Eric Dumazet @ 2010-07-11 18:29 UTC (permalink / raw)
To: Ilpo Järvinen
Cc: Tejun Heo, David S. Miller, lkml, netdev@vger.kernel.org,
Fehrmann, Henning, Carsten Aulbert
In-Reply-To: <1278870382.2538.180.camel@edumazet-laptop>
Le dimanche 11 juillet 2010 à 19:46 +0200, Eric Dumazet a écrit :
> Le dimanche 11 juillet 2010 à 19:06 +0200, Eric Dumazet a écrit :
> > Le dimanche 11 juillet 2010 à 19:09 +0300, Ilpo Järvinen a écrit :
> > > On Thu, 8 Jul 2010, Tejun Heo wrote:
> > >
> > > > We've been seeing oops in tcp_xmit_retransmit_queue() w/ 2.6.32.15.
> > > > Please see the attached photoshoot. This is happening on a HPC
> > > > cluster and very interestingly caused by one particular job. How long
> > > > it takes isn't clear yet (at least more than a day) but when it
> > > > happens it happens on a lot of machines in relatively short time.
> > > >
> > > > With a bit of disassemblying, I've found that the oops is happening
> > > > during tcp_for_write_queue_from() because the skb->next points to
> > > > NULL.
> > > >
> > > > void tcp_xmit_retransmit_queue(struct sock *sk)
> > > > {
> > > > ...
> > > > if (tp->retransmit_skb_hint) {
> > > > skb = tp->retransmit_skb_hint;
> > > > last_lost = TCP_SKB_CB(skb)->end_seq;
> > > > if (after(last_lost, tp->retransmit_high))
> > > > last_lost = tp->retransmit_high;
> > > > } else {
> > > > skb = tcp_write_queue_head(sk);
> > > > last_lost = tp->snd_una;
> > > > }
> > > >
> > > > => tcp_for_write_queue_from(skb, sk) {
> > > > __u8 sacked = TCP_SKB_CB(skb)->sacked;
> > > >
> > > > if (skb == tcp_send_head(sk))
> > > > break;
> > > > /* we could do better than to assign each time */
> > > > if (hole == NULL)
> > > >
> > > > This can happen for one of the following reasons,
> > > >
> > > > 1. tp->retransmit_skb_hint is NULL and tcp_write_queue_head() is NULL
> > > > too. ie. tcp_xmit_retransmit_queue() is called on an empty write
> > > > queue for some reason.
> > > >
> > > > 2. tp->retransmit_skb_hint is pointing to a skb which is not on the
> > > > write_queue. ie. somebody forgot to update hint while removing the
> > > > skb from the write queue.
> > >
> > > Once again I've read the unlinkers through, and only thing that could
> > > cause this is tcp_send_synack (others do deal with the hints) but I think
> > > Eric already proposed a patch to that but we never got anywhere due to
> > > some counterargument why it wouldn't take place (too far away for me to
> > > remember, see archives about the discussions). ...But if you want be dead
> > > sure some WARN_ON there might not hurt. Also the purging of the whole
> > > queue was a similar suspect I then came across (but that would only
> > > materialize with sk reuse happening e.g., with nfs which the other guys
> > > weren't using).
> > >
> >
> > Hmm.
> >
> > This sounds familiar to me, but I cannot remember the discussion you
> > mention or the patch.
> >
> > Or maybe it was the TCP transaction thing ? (including data in SYN or
> > SYN-ACK packet)
>
> Hmm, I cannot find where we reset restransmit_skb_hint in
> tcp_mtu_probe(), if we call tcp_unlink_write_queue().
>
> if (skb->len <= copy) {
> /* We've eaten all the data from this skb.
> * Throw it away. */
> TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
> <<>> tcp_unlink_write_queue(skb, sk);
> sk_wmem_free_skb(sk, skb);
> } else {
>
>
> Sorry if this was already discussed. We might add a comment here in anycase ;)
>
Just in case, here is a patch for this issue, if Tejun wants to try it.
Thanks
[PATCH] tcp: tcp_mtu_probe() and retransmit hints
When removing an skb from tcp write queue, we must take care of various
hints that could be kept on this skb. tcp_mtu_probe() misses this
cleanup.
lkml reference : http://lkml.org/lkml/2010/7/8/63
Reported-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b4ed957..187453f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1666,6 +1666,9 @@ static int tcp_mtu_probe(struct sock *sk)
* Throw it away. */
TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
tcp_unlink_write_queue(skb, sk);
+ tcp_clear_retrans_hints_partial(tp);
+ if (skb == tp->retransmit_skb_hint)
+ tp->retransmit_skb_hint = nskb;
sk_wmem_free_skb(sk, skb);
} else {
TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
^ permalink raw reply related
* Re: oops in tcp_xmit_retransmit_queue() w/ v2.6.32.15
From: Ilpo Järvinen @ 2010-07-11 19:22 UTC (permalink / raw)
To: Eric Dumazet
Cc: Tejun Heo, David S. Miller, lkml, netdev@vger.kernel.org,
Fehrmann, Henning, Carsten Aulbert
In-Reply-To: <1278872990.2538.189.camel@edumazet-laptop>
[-- Attachment #1: Type: TEXT/PLAIN, Size: 5020 bytes --]
On Sun, 11 Jul 2010, Eric Dumazet wrote:
> Le dimanche 11 juillet 2010 à 19:46 +0200, Eric Dumazet a écrit :
> > Le dimanche 11 juillet 2010 à 19:06 +0200, Eric Dumazet a écrit :
> > > Le dimanche 11 juillet 2010 à 19:09 +0300, Ilpo Järvinen a écrit :
> > > > On Thu, 8 Jul 2010, Tejun Heo wrote:
> > > >
> > > > > We've been seeing oops in tcp_xmit_retransmit_queue() w/ 2.6.32.15.
> > > > > Please see the attached photoshoot. This is happening on a HPC
> > > > > cluster and very interestingly caused by one particular job. How long
> > > > > it takes isn't clear yet (at least more than a day) but when it
> > > > > happens it happens on a lot of machines in relatively short time.
> > > > >
> > > > > With a bit of disassemblying, I've found that the oops is happening
> > > > > during tcp_for_write_queue_from() because the skb->next points to
> > > > > NULL.
> > > > >
> > > > > void tcp_xmit_retransmit_queue(struct sock *sk)
> > > > > {
> > > > > ...
> > > > > if (tp->retransmit_skb_hint) {
> > > > > skb = tp->retransmit_skb_hint;
> > > > > last_lost = TCP_SKB_CB(skb)->end_seq;
> > > > > if (after(last_lost, tp->retransmit_high))
> > > > > last_lost = tp->retransmit_high;
> > > > > } else {
> > > > > skb = tcp_write_queue_head(sk);
> > > > > last_lost = tp->snd_una;
> > > > > }
> > > > >
> > > > > => tcp_for_write_queue_from(skb, sk) {
> > > > > __u8 sacked = TCP_SKB_CB(skb)->sacked;
> > > > >
> > > > > if (skb == tcp_send_head(sk))
> > > > > break;
> > > > > /* we could do better than to assign each time */
> > > > > if (hole == NULL)
> > > > >
> > > > > This can happen for one of the following reasons,
> > > > >
> > > > > 1. tp->retransmit_skb_hint is NULL and tcp_write_queue_head() is NULL
> > > > > too. ie. tcp_xmit_retransmit_queue() is called on an empty write
> > > > > queue for some reason.
> > > > >
> > > > > 2. tp->retransmit_skb_hint is pointing to a skb which is not on the
> > > > > write_queue. ie. somebody forgot to update hint while removing the
> > > > > skb from the write queue.
> > > >
> > > > Once again I've read the unlinkers through, and only thing that could
> > > > cause this is tcp_send_synack (others do deal with the hints) but I think
> > > > Eric already proposed a patch to that but we never got anywhere due to
> > > > some counterargument why it wouldn't take place (too far away for me to
> > > > remember, see archives about the discussions). ...But if you want be dead
> > > > sure some WARN_ON there might not hurt. Also the purging of the whole
> > > > queue was a similar suspect I then came across (but that would only
> > > > materialize with sk reuse happening e.g., with nfs which the other guys
> > > > weren't using).
> > > >
> > >
> > > Hmm.
> > >
> > > This sounds familiar to me, but I cannot remember the discussion you
> > > mention or the patch.
> > >
> > > Or maybe it was the TCP transaction thing ? (including data in SYN or
> > > SYN-ACK packet)
No. That's another thing. ...I've already found it with google today but
cannot seem to find it again. I thought I used tcp_make_synack eric but
for some reason I only get these transaction fix hits. I'll keep looking.
> > Hmm, I cannot find where we reset restransmit_skb_hint in
> > tcp_mtu_probe(), if we call tcp_unlink_write_queue().
> >
> > if (skb->len <= copy) {
> > /* We've eaten all the data from this skb.
> > * Throw it away. */
> > TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
> > <<>> tcp_unlink_write_queue(skb, sk);
> > sk_wmem_free_skb(sk, skb);
> > } else {
> >
> >
> > Sorry if this was already discussed. We might add a comment here in anycase ;)
> >
>
> Just in case, here is a patch for this issue, if Tejun wants to try it.
>
> Thanks
>
> [PATCH] tcp: tcp_mtu_probe() and retransmit hints
>
> When removing an skb from tcp write queue, we must take care of various
> hints that could be kept on this skb. tcp_mtu_probe() misses this
> cleanup.
>
> lkml reference : http://lkml.org/lkml/2010/7/8/63
>
> Reported-by: Tejun Heo <tj@kernel.org>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index b4ed957..187453f 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -1666,6 +1666,9 @@ static int tcp_mtu_probe(struct sock *sk)
> * Throw it away. */
> TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
> tcp_unlink_write_queue(skb, sk);
> + tcp_clear_retrans_hints_partial(tp);
> + if (skb == tp->retransmit_skb_hint)
> + tp->retransmit_skb_hint = nskb;
...I think we've not sent that skb just yet, so this'll never be true (and
the rexmit loop terminates at send_head without setting it so we should
be safe, I'll still need to check that no other code can accidently move
it to the send_head but I doubt it happens).
> sk_wmem_free_skb(sk, skb);
> } else {
> TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
>
>
>
--
i.
^ permalink raw reply
* Re: oops in tcp_xmit_retransmit_queue() w/ v2.6.32.15
From: Ilpo Järvinen @ 2010-07-11 19:25 UTC (permalink / raw)
To: Eric Dumazet
Cc: Tejun Heo, David S. Miller, lkml, netdev@vger.kernel.org,
Fehrmann, Henning, Carsten Aulbert
In-Reply-To: <alpine.DEB.2.00.1007112214250.15736@melkinpaasi.cs.helsinki.fi>
[-- Attachment #1: Type: TEXT/PLAIN, Size: 3511 bytes --]
On Sun, 11 Jul 2010, Ilpo Järvinen wrote:
> On Sun, 11 Jul 2010, Eric Dumazet wrote:
>
> > Le dimanche 11 juillet 2010 à 19:46 +0200, Eric Dumazet a écrit :
> > > Le dimanche 11 juillet 2010 à 19:06 +0200, Eric Dumazet a écrit :
> > > > Le dimanche 11 juillet 2010 à 19:09 +0300, Ilpo Järvinen a écrit :
> > > > > On Thu, 8 Jul 2010, Tejun Heo wrote:
> > > > >
> > > > > > We've been seeing oops in tcp_xmit_retransmit_queue() w/ 2.6.32.15.
> > > > > > Please see the attached photoshoot. This is happening on a HPC
> > > > > > cluster and very interestingly caused by one particular job. How long
> > > > > > it takes isn't clear yet (at least more than a day) but when it
> > > > > > happens it happens on a lot of machines in relatively short time.
> > > > > >
> > > > > > With a bit of disassemblying, I've found that the oops is happening
> > > > > > during tcp_for_write_queue_from() because the skb->next points to
> > > > > > NULL.
> > > > > >
> > > > > > void tcp_xmit_retransmit_queue(struct sock *sk)
> > > > > > {
> > > > > > ...
> > > > > > if (tp->retransmit_skb_hint) {
> > > > > > skb = tp->retransmit_skb_hint;
> > > > > > last_lost = TCP_SKB_CB(skb)->end_seq;
> > > > > > if (after(last_lost, tp->retransmit_high))
> > > > > > last_lost = tp->retransmit_high;
> > > > > > } else {
> > > > > > skb = tcp_write_queue_head(sk);
> > > > > > last_lost = tp->snd_una;
> > > > > > }
> > > > > >
> > > > > > => tcp_for_write_queue_from(skb, sk) {
> > > > > > __u8 sacked = TCP_SKB_CB(skb)->sacked;
> > > > > >
> > > > > > if (skb == tcp_send_head(sk))
> > > > > > break;
> > > > > > /* we could do better than to assign each time */
> > > > > > if (hole == NULL)
> > > > > >
> > > > > > This can happen for one of the following reasons,
> > > > > >
> > > > > > 1. tp->retransmit_skb_hint is NULL and tcp_write_queue_head() is NULL
> > > > > > too. ie. tcp_xmit_retransmit_queue() is called on an empty write
> > > > > > queue for some reason.
> > > > > >
> > > > > > 2. tp->retransmit_skb_hint is pointing to a skb which is not on the
> > > > > > write_queue. ie. somebody forgot to update hint while removing the
> > > > > > skb from the write queue.
> > > > >
> > > > > Once again I've read the unlinkers through, and only thing that could
> > > > > cause this is tcp_send_synack (others do deal with the hints) but I think
> > > > > Eric already proposed a patch to that but we never got anywhere due to
> > > > > some counterargument why it wouldn't take place (too far away for me to
> > > > > remember, see archives about the discussions). ...But if you want be dead
> > > > > sure some WARN_ON there might not hurt. Also the purging of the whole
> > > > > queue was a similar suspect I then came across (but that would only
> > > > > materialize with sk reuse happening e.g., with nfs which the other guys
> > > > > weren't using).
> > > > >
> > > >
> > > > Hmm.
> > > >
> > > > This sounds familiar to me, but I cannot remember the discussion you
> > > > mention or the patch.
> > > >
> > > > Or maybe it was the TCP transaction thing ? (including data in SYN or
> > > > SYN-ACK packet)
>
> No. That's another thing. ...I've already found it with google today but
> cannot seem to find it again. I thought I used tcp_make_synack eric but
> for some reason I only get these transaction fix hits. I'll keep looking.
Right, this one:
http://kerneltrap.org/mailarchive/linux-netdev/2009/10/29/6259073
--
i.
^ permalink raw reply
* [PATCH net-next 4/8] tg3: Relax 5717 serdes restriction
From: Matt Carlson @ 2010-07-11 19:31 UTC (permalink / raw)
To: davem; +Cc: netdev, andy, mcarlson
tg3 is coded to refuse to attach to 5717 serdes devices. Now that the
hardware is better supported, we can relax this restriction. This patch
also fixes a recently introduced bug which will cause serdes parallel
detection not to work with 5780 class devices.
Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
Reviewed-by: Michael Chan <mchan@broadcom.com>
---
drivers/net/tg3.c | 5 ++---
1 files changed, 2 insertions(+), 3 deletions(-)
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index efac448..c91049b 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -8551,7 +8551,7 @@ static void tg3_timer(unsigned long __opaque)
tg3_setup_phy(tp, 0);
}
} else if ((tp->tg3_flags2 & TG3_FLG2_MII_SERDES) &&
- !(tp->tg3_flags2 & TG3_FLG2_5780_CLASS)) {
+ (tp->tg3_flags2 & TG3_FLG2_5780_CLASS)) {
tg3_serdes_parallel_detect(tp);
}
@@ -13427,8 +13427,7 @@ static int __devinit tg3_get_invariants(struct tg3 *tp)
return err;
if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5717 &&
- (tp->pci_chip_rev_id != CHIPREV_ID_5717_A0 ||
- (tp->tg3_flags2 & TG3_FLG2_MII_SERDES)))
+ tp->pci_chip_rev_id != CHIPREV_ID_5717_A0)
return -ENOTSUPP;
/* Initialize data/descriptor byte/word swapping. */
--
1.6.4.4
^ permalink raw reply related
* [PATCH net-next 1/8] tg3: Revert RSS indir tbl setup change
From: Matt Carlson @ 2010-07-11 19:31 UTC (permalink / raw)
To: davem; +Cc: netdev, andy, mcarlson
This patch reverts commit 2601d8a0049c8b5d29cd5adb844a305a804e505f. A
spectacular set of coincidences made it look as though the table was
setup incorrectly. The original version was correct.
Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
Reviewed-by: Michael Chan <mchan@broadcom.com>
---
drivers/net/tg3.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index aa2d64f..89aa486 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -8237,7 +8237,7 @@ static int tg3_reset_hw(struct tg3 *tp, int reset_phy)
for (i = 0; i < TG3_RSS_INDIR_TBL_SIZE; i++) {
int idx = i % sizeof(val);
- ent[idx] = (i % (tp->irq_cnt - 1)) + 1;
+ ent[idx] = i % (tp->irq_cnt - 1);
if (idx == sizeof(val) - 1) {
tw32(reg, val);
reg += 4;
--
1.6.4.4
^ permalink raw reply related
* [PATCH net-next 0/8] tg3 bugfixes
From: Matt Carlson @ 2010-07-11 19:31 UTC (permalink / raw)
To: davem; +Cc: netdev, andy, mcarlson
This patchset fixes a few MSI-X related problems and performs a
few driver enhancements.
^ permalink raw reply
* [PATCH net-next 2/8] tg3: Fix single MSI-X vector coalescing
From: Matt Carlson @ 2010-07-11 19:31 UTC (permalink / raw)
To: davem; +Cc: netdev, andy, mcarlson
The interrupt coalescing setup code used the TG3_FLG2_USING_MSIX flag to
determine whether or not to configure the rx coalescing parameters.
This is incorrect for the single MSI-X vector case. This patch changes
the code to look at the TG3_FLG3_ENABLE_RSS instead.
Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
Reviewed-by: Michael Chan <mchan@broadcom.com>
---
drivers/net/tg3.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 89aa486..57dba79 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -7447,7 +7447,7 @@ static void __tg3_set_coalesce(struct tg3 *tp, struct ethtool_coalesce *ec)
tw32(HOSTCC_TXCOAL_MAXF_INT, 0);
}
- if (!(tp->tg3_flags2 & TG3_FLG2_USING_MSIX)) {
+ if (!(tp->tg3_flags3 & TG3_FLG3_ENABLE_RSS)) {
tw32(HOSTCC_RXCOL_TICKS, ec->rx_coalesce_usecs);
tw32(HOSTCC_RXMAX_FRAMES, ec->rx_max_coalesced_frames);
tw32(HOSTCC_RXCOAL_MAXF_INT, ec->rx_max_coalesced_frames_irq);
--
1.6.4.4
^ permalink raw reply related
* [PATCH net-next 3/8] tg3: Fix IPv6 TSO code in tg3_start_xmit_dma_bug()
From: Matt Carlson @ 2010-07-11 19:31 UTC (permalink / raw)
To: davem; +Cc: netdev, andy, mcarlson
The tg3_start_xmit_dma_bug() function was missing code to process IPv6
TSO packets. This patch adds the missing support.
Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
---
drivers/net/tg3.c | 20 ++++++++++++++------
1 files changed, 14 insertions(+), 6 deletions(-)
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 57dba79..efac448 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -5779,7 +5779,7 @@ static netdev_tx_t tg3_start_xmit_dma_bug(struct sk_buff *skb,
if ((mss = skb_shinfo(skb)->gso_size) != 0) {
struct iphdr *iph;
- u32 tcp_opt_len, ip_tcp_len, hdr_len;
+ u32 tcp_opt_len, hdr_len;
if (skb_header_cloned(skb) &&
pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) {
@@ -5787,10 +5787,21 @@ static netdev_tx_t tg3_start_xmit_dma_bug(struct sk_buff *skb,
goto out_unlock;
}
+ iph = ip_hdr(skb);
tcp_opt_len = tcp_optlen(skb);
- ip_tcp_len = ip_hdrlen(skb) + sizeof(struct tcphdr);
- hdr_len = ip_tcp_len + tcp_opt_len;
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) {
+ hdr_len = skb_headlen(skb) - ETH_HLEN;
+ } else {
+ u32 ip_tcp_len;
+
+ ip_tcp_len = ip_hdrlen(skb) + sizeof(struct tcphdr);
+ hdr_len = ip_tcp_len + tcp_opt_len;
+
+ iph->check = 0;
+ iph->tot_len = htons(mss + hdr_len);
+ }
+
if (unlikely((ETH_HLEN + hdr_len) > 80) &&
(tp->tg3_flags2 & TG3_FLG2_TSO_BUG))
return tg3_tso_bug(tp, skb);
@@ -5798,9 +5809,6 @@ static netdev_tx_t tg3_start_xmit_dma_bug(struct sk_buff *skb,
base_flags |= (TXD_FLAG_CPU_PRE_DMA |
TXD_FLAG_CPU_POST_DMA);
- iph = ip_hdr(skb);
- iph->check = 0;
- iph->tot_len = htons(mss + hdr_len);
if (tp->tg3_flags2 & TG3_FLG2_HW_TSO) {
tcp_hdr(skb)->check = 0;
base_flags &= ~TXD_FLAG_TCPUDP_CSUM;
--
1.6.4.4
^ permalink raw reply related
* [PATCH net-next 6/8] tg3: Revert PCIe tx glitch fix
From: Matt Carlson @ 2010-07-11 19:31 UTC (permalink / raw)
To: davem; +Cc: netdev, andy, mcarlson
This patch reverts commit 52cdf8526fe24f11d300b75458ddee017f3f4c88,
entitled "tg3: Prevent a PCIe tx glitch". The problem does not have
any visible side-effects and happens too early for the driver to do
anything about it. The proper place for this code is within the
device's bootcode.
Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
Reviewed-by: Michael Chan <mchan@broadcom.com>
---
drivers/net/tg3.c | 24 ------------------------
drivers/net/tg3.h | 22 ----------------------
2 files changed, 0 insertions(+), 46 deletions(-)
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index e6da16a..f61a4d8 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -7072,30 +7072,6 @@ static int tg3_chip_reset(struct tg3 *tp)
tg3_mdio_start(tp);
- if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_57780) {
- u8 phy_addr;
-
- phy_addr = tp->phy_addr;
- tp->phy_addr = TG3_PHY_PCIE_ADDR;
-
- tg3_writephy(tp, TG3_PCIEPHY_BLOCK_ADDR,
- TG3_PCIEPHY_TXB_BLK << TG3_PCIEPHY_BLOCK_SHIFT);
- val = TG3_PCIEPHY_TX0CTRL1_TXOCM | TG3_PCIEPHY_TX0CTRL1_RDCTL |
- TG3_PCIEPHY_TX0CTRL1_TXCMV | TG3_PCIEPHY_TX0CTRL1_TKSEL |
- TG3_PCIEPHY_TX0CTRL1_NB_EN;
- tg3_writephy(tp, TG3_PCIEPHY_TX0CTRL1, val);
- udelay(10);
-
- tg3_writephy(tp, TG3_PCIEPHY_BLOCK_ADDR,
- TG3_PCIEPHY_XGXS_BLK1 << TG3_PCIEPHY_BLOCK_SHIFT);
- val = TG3_PCIEPHY_PWRMGMT4_LOWPWR_EN |
- TG3_PCIEPHY_PWRMGMT4_L1PLLPD_EN;
- tg3_writephy(tp, TG3_PCIEPHY_PWRMGMT4, val);
- udelay(10);
-
- tp->phy_addr = phy_addr;
- }
-
if ((tp->tg3_flags2 & TG3_FLG2_PCI_EXPRESS) &&
tp->pci_chip_rev_id != CHIPREV_ID_5750_A0 &&
GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5785 &&
diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h
index be7ab91..0432399 100644
--- a/drivers/net/tg3.h
+++ b/drivers/net/tg3.h
@@ -2032,31 +2032,9 @@
/* Currently this is fixed. */
-#define TG3_PHY_PCIE_ADDR 0x00
#define TG3_PHY_MII_ADDR 0x01
-/*** Tigon3 specific PHY PCIE registers. ***/
-
-#define TG3_PCIEPHY_BLOCK_ADDR 0x1f
-#define TG3_PCIEPHY_XGXS_BLK1 0x0801
-#define TG3_PCIEPHY_TXB_BLK 0x0861
-#define TG3_PCIEPHY_BLOCK_SHIFT 4
-
-/* TG3_PCIEPHY_TXB_BLK */
-#define TG3_PCIEPHY_TX0CTRL1 0x15
-#define TG3_PCIEPHY_TX0CTRL1_TXOCM 0x0003
-#define TG3_PCIEPHY_TX0CTRL1_RDCTL 0x0008
-#define TG3_PCIEPHY_TX0CTRL1_TXCMV 0x0030
-#define TG3_PCIEPHY_TX0CTRL1_TKSEL 0x0040
-#define TG3_PCIEPHY_TX0CTRL1_NB_EN 0x0400
-
-/* TG3_PCIEPHY_XGXS_BLK1 */
-#define TG3_PCIEPHY_PWRMGMT4 0x1a
-#define TG3_PCIEPHY_PWRMGMT4_L1PLLPD_EN 0x0038
-#define TG3_PCIEPHY_PWRMGMT4_LOWPWR_EN 0x4000
-
-
/*** Tigon3 specific PHY MII registers. ***/
#define TG3_BMCR_SPEED1000 0x0040
--
1.6.4.4
^ permalink raw reply related
* [PATCH net-next 5/8] tg3: Report driver version to firmware
From: Matt Carlson @ 2010-07-11 19:31 UTC (permalink / raw)
To: davem; +Cc: netdev, andy, mcarlson
This patch changes the code so that the driver version can be reported
to the firmware in addition to the current use.
Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
Reviewed-by: Michael Chan <mchan@broadcom.com>
---
drivers/net/tg3.c | 8 ++++++--
drivers/net/tg3.h | 4 +++-
2 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index c91049b..e6da16a 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -18,6 +18,7 @@
#include <linux/module.h>
#include <linux/moduleparam.h>
+#include <linux/stringify.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/compiler.h>
@@ -67,7 +68,10 @@
#include "tg3.h"
#define DRV_MODULE_NAME "tg3"
-#define DRV_MODULE_VERSION "3.111"
+#define TG3_MAJ_NUM 3
+#define TG3_MIN_NUM 111
+#define DRV_MODULE_VERSION \
+ __stringify(TG3_MAJ_NUM) "." __stringify(TG3_MIN_NUM)
#define DRV_MODULE_RELDATE "June 5, 2010"
#define TG3_DEF_MAC_MODE 0
@@ -6629,7 +6633,7 @@ static void tg3_ape_driver_state_change(struct tg3 *tp, int kind)
apedata = tg3_ape_read32(tp, TG3_APE_HOST_INIT_COUNT);
tg3_ape_write32(tp, TG3_APE_HOST_INIT_COUNT, ++apedata);
tg3_ape_write32(tp, TG3_APE_HOST_DRIVER_ID,
- APE_HOST_DRIVER_ID_MAGIC);
+ APE_HOST_DRIVER_ID_MAGIC(TG3_MAJ_NUM, TG3_MIN_NUM));
tg3_ape_write32(tp, TG3_APE_HOST_BEHAVIOR,
APE_HOST_BEHAV_NO_PHYLOCK);
diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h
index b72ac52..be7ab91 100644
--- a/drivers/net/tg3.h
+++ b/drivers/net/tg3.h
@@ -2193,7 +2193,9 @@
#define APE_HOST_SEG_LEN_MAGIC 0x0000001c
#define TG3_APE_HOST_INIT_COUNT 0x4208
#define TG3_APE_HOST_DRIVER_ID 0x420c
-#define APE_HOST_DRIVER_ID_MAGIC 0xf0035100
+#define APE_HOST_DRIVER_ID_LINUX 0xf0000000
+#define APE_HOST_DRIVER_ID_MAGIC(maj, min) \
+ (APE_HOST_DRIVER_ID_LINUX | (maj & 0xff) << 16 | (min & 0xff) << 8)
#define TG3_APE_HOST_BEHAVIOR 0x4210
#define APE_HOST_BEHAV_NO_PHYLOCK 0x00000001
#define TG3_APE_HOST_HEARTBEAT_INT_MS 0x4214
--
1.6.4.4
^ permalink raw reply related
* [PATCH net-next 8/8] tg3: Update version to 3.112
From: Matt Carlson @ 2010-07-11 19:31 UTC (permalink / raw)
To: davem; +Cc: netdev, andy, mcarlson
This patch updates the tg3 version to 3.112.
Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
Reviewed-by: Michael Chan <mchan@broadcom.com>
---
drivers/net/tg3.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 7116a47..5769e15 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -69,10 +69,10 @@
#define DRV_MODULE_NAME "tg3"
#define TG3_MAJ_NUM 3
-#define TG3_MIN_NUM 111
+#define TG3_MIN_NUM 112
#define DRV_MODULE_VERSION \
__stringify(TG3_MAJ_NUM) "." __stringify(TG3_MIN_NUM)
-#define DRV_MODULE_RELDATE "June 5, 2010"
+#define DRV_MODULE_RELDATE "July 11, 2010"
#define TG3_DEF_MAC_MODE 0
#define TG3_DEF_RX_MODE 0
--
1.6.4.4
^ permalink raw reply related
* [PATCH net-next 7/8] tg3: Fix some checkpatch errors
From: Matt Carlson @ 2010-07-11 19:31 UTC (permalink / raw)
To: davem; +Cc: netdev, andy, mcarlson
This patch fixes the following checkpatch errors:
ERROR: do not use assignment in if condition
+ if ((mss = skb_shinfo(skb)->gso_size) != 0) {
ERROR: do not use assignment in if condition
+ if ((mss = skb_shinfo(skb)->gso_size) != 0) {
ERROR: space prohibited after that '!' (ctx:BxW)
+ if (! netif_carrier_ok(tp->dev) &&
^
ERROR: space required after that ',' (ctx:VxV)
+#define GET_REG32_LOOP(base,len) \
^
ERROR: "(foo*)" should be "(foo *)"
+ memcpy(data, ((char*)&val) + b_offset, b_count);
ERROR: do not use assignment in if condition
+ if ((err = tg3_do_mem_test(tp, mem_tbl[i].offset,
Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
Reviewed-by: Michael Chan <mchan@broadcom.com>
---
drivers/net/tg3.c | 17 +++++++++--------
1 files changed, 9 insertions(+), 8 deletions(-)
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index f61a4d8..7116a47 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -5574,8 +5574,8 @@ static netdev_tx_t tg3_start_xmit(struct sk_buff *skb,
entry = tnapi->tx_prod;
base_flags = 0;
- mss = 0;
- if ((mss = skb_shinfo(skb)->gso_size) != 0) {
+ mss = skb_shinfo(skb)->gso_size;
+ if (mss) {
int tcp_opt_len, ip_tcp_len;
u32 hdrlen;
@@ -5781,7 +5781,8 @@ static netdev_tx_t tg3_start_xmit_dma_bug(struct sk_buff *skb,
if (skb->ip_summed == CHECKSUM_PARTIAL)
base_flags |= TXD_FLAG_TCPUDP_CSUM;
- if ((mss = skb_shinfo(skb)->gso_size) != 0) {
+ mss = skb_shinfo(skb)->gso_size;
+ if (mss) {
struct iphdr *iph;
u32 tcp_opt_len, hdr_len;
@@ -8514,7 +8515,7 @@ static void tg3_timer(unsigned long __opaque)
(mac_stat & MAC_STATUS_LNKSTATE_CHANGED)) {
need_setup = 1;
}
- if (! netif_carrier_ok(tp->dev) &&
+ if (!netif_carrier_ok(tp->dev) &&
(mac_stat & (MAC_STATUS_PCS_SYNCED |
MAC_STATUS_SIGNAL_DET))) {
need_setup = 1;
@@ -9372,7 +9373,7 @@ static void tg3_get_regs(struct net_device *dev,
tg3_full_lock(tp, 0);
#define __GET_REG32(reg) (*(p)++ = tr32(reg))
-#define GET_REG32_LOOP(base,len) \
+#define GET_REG32_LOOP(base, len) \
do { p = (u32 *)(orig_p + (base)); \
for (i = 0; i < len; i += 4) \
__GET_REG32((base) + i); \
@@ -9465,7 +9466,7 @@ static int tg3_get_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom,
ret = tg3_nvram_read_be32(tp, offset-b_offset, &val);
if (ret)
return ret;
- memcpy(data, ((char*)&val) + b_offset, b_count);
+ memcpy(data, ((char *)&val) + b_offset, b_count);
len -= b_count;
offset += b_count;
eeprom->len += b_count;
@@ -10585,8 +10586,8 @@ static int tg3_test_memory(struct tg3 *tp)
mem_tbl = mem_tbl_570x;
for (i = 0; mem_tbl[i].offset != 0xffffffff; i++) {
- if ((err = tg3_do_mem_test(tp, mem_tbl[i].offset,
- mem_tbl[i].len)) != 0)
+ err = tg3_do_mem_test(tp, mem_tbl[i].offset, mem_tbl[i].len);
+ if (err)
break;
}
--
1.6.4.4
^ permalink raw reply related
* Re: oops in tcp_xmit_retransmit_queue() w/ v2.6.32.15
From: Ilpo Järvinen @ 2010-07-11 19:44 UTC (permalink / raw)
To: Tejun Heo
Cc: Eric Dumazet, David S. Miller, lkml, netdev@vger.kernel.org,
Fehrmann, Henning, Carsten Aulbert
In-Reply-To: <alpine.DEB.2.00.1007112224110.15736@melkinpaasi.cs.helsinki.fi>
[-- Attachment #1: Type: TEXT/PLAIN, Size: 4203 bytes --]
On Sun, 11 Jul 2010, Ilpo Järvinen wrote:
> On Sun, 11 Jul 2010, Ilpo Järvinen wrote:
>
> > On Sun, 11 Jul 2010, Eric Dumazet wrote:
> >
> > > Le dimanche 11 juillet 2010 à 19:46 +0200, Eric Dumazet a écrit :
> > > > Le dimanche 11 juillet 2010 à 19:06 +0200, Eric Dumazet a écrit :
> > > > > Le dimanche 11 juillet 2010 à 19:09 +0300, Ilpo Järvinen a écrit :
> > > > > > On Thu, 8 Jul 2010, Tejun Heo wrote:
> > > > > >
> > > > > > > We've been seeing oops in tcp_xmit_retransmit_queue() w/ 2.6.32.15.
> > > > > > > Please see the attached photoshoot. This is happening on a HPC
> > > > > > > cluster and very interestingly caused by one particular job. How long
> > > > > > > it takes isn't clear yet (at least more than a day) but when it
> > > > > > > happens it happens on a lot of machines in relatively short time.
> > > > > > >
> > > > > > > With a bit of disassemblying, I've found that the oops is happening
> > > > > > > during tcp_for_write_queue_from() because the skb->next points to
> > > > > > > NULL.
> > > > > > >
> > > > > > > void tcp_xmit_retransmit_queue(struct sock *sk)
> > > > > > > {
> > > > > > > ...
> > > > > > > if (tp->retransmit_skb_hint) {
> > > > > > > skb = tp->retransmit_skb_hint;
> > > > > > > last_lost = TCP_SKB_CB(skb)->end_seq;
> > > > > > > if (after(last_lost, tp->retransmit_high))
> > > > > > > last_lost = tp->retransmit_high;
> > > > > > > } else {
> > > > > > > skb = tcp_write_queue_head(sk);
> > > > > > > last_lost = tp->snd_una;
> > > > > > > }
> > > > > > >
> > > > > > > => tcp_for_write_queue_from(skb, sk) {
> > > > > > > __u8 sacked = TCP_SKB_CB(skb)->sacked;
> > > > > > >
> > > > > > > if (skb == tcp_send_head(sk))
> > > > > > > break;
> > > > > > > /* we could do better than to assign each time */
> > > > > > > if (hole == NULL)
> > > > > > >
> > > > > > > This can happen for one of the following reasons,
> > > > > > >
> > > > > > > 1. tp->retransmit_skb_hint is NULL and tcp_write_queue_head() is NULL
> > > > > > > too. ie. tcp_xmit_retransmit_queue() is called on an empty write
> > > > > > > queue for some reason.
> > > > > > >
> > > > > > > 2. tp->retransmit_skb_hint is pointing to a skb which is not on the
> > > > > > > write_queue. ie. somebody forgot to update hint while removing the
> > > > > > > skb from the write queue.
> > > > > >
> > > > > > Once again I've read the unlinkers through, and only thing that could
> > > > > > cause this is tcp_send_synack (others do deal with the hints) but I think
> > > > > > Eric already proposed a patch to that but we never got anywhere due to
> > > > > > some counterargument why it wouldn't take place (too far away for me to
> > > > > > remember, see archives about the discussions). ...But if you want be dead
> > > > > > sure some WARN_ON there might not hurt. Also the purging of the whole
> > > > > > queue was a similar suspect I then came across (but that would only
> > > > > > materialize with sk reuse happening e.g., with nfs which the other guys
> > > > > > weren't using).
> > > > > >
> > > > >
> > > > > Hmm.
> > > > >
> > > > > This sounds familiar to me, but I cannot remember the discussion you
> > > > > mention or the patch.
> > > > >
> > > > > Or maybe it was the TCP transaction thing ? (including data in SYN or
> > > > > SYN-ACK packet)
> >
> > No. That's another thing. ...I've already found it with google today but
> > cannot seem to find it again. I thought I used tcp_make_synack eric but
> > for some reason I only get these transaction fix hits. I'll keep looking.
>
> Right, this one:
>
> http://kerneltrap.org/mailarchive/linux-netdev/2009/10/29/6259073
Hmm, another idea... It might be useful to try to disable
tcp_retrans_try_collapse in tcp_retransmit_skb as a test. I think that it
might be possible that tcp_xmit_retransmit_queue might end up holding
a stale reference either in hole or skb. Kind of shot into the dark still,
no actual theory on how that could happen but that
tcp_xmit_retransmit_queue logic is rather tricky because of returning to
the first hole if such exists so that I couldn't immediately rule out the
possibility.
--
i.
^ permalink raw reply
* [PATCH 00/12] autoconvert trivial BKL users to private mutex
From: Arnd Bergmann @ 2010-07-11 21:18 UTC (permalink / raw)
To: linux-kernel
Cc: devel, Jesper Nilsson, linux-usb, Arnd Bergmann, Corey Minyard,
linux-cris-kernel, Frederic Weisbecker, Greg Kroah-Hartman,
Karsten Keil, Mauro Carvalho Chehab, linux-scsi, linuxppc-dev,
James E.J. Bottomley, linux-mtd, John Kacur, netdev, linux-media,
openipmi-developer, David S. Miller, David Woodhouse
This is a repost of an earlier patch to remove
those users of the big kernel lock that can be
converted to a mutex using a simple script.
The only use of the BKL is in file operations
that are called without any other lock, so the
new mutex is the top-level serialization
and cannot introduce any AB-BA deadlock.
Please apply to the respective maintainer trees
if the patches look good.
Arnd Bergmann (12):
staging: autoconvert trivial BKL users to private mutex
isdn: autoconvert trivial BKL users to private mutex
scsi: autoconvert trivial BKL users to private mutex
media: autoconvert trivial BKL users to private mutex
usb: autoconvert trivial BKL users to private mutex
net: autoconvert trivial BKL users to private mutex
cris: autoconvert trivial BKL users to private mutex
sbus: autoconvert trivial BKL users to private mutex
mtd: autoconvert trivial BKL users to private mutex
mac: autoconvert trivial BKL users to private mutex
ipmi: autoconvert trivial BKL users to private mutex
drivers: autoconvert trivial BKL users to private mutex
arch/cris/arch-v10/drivers/eeprom.c | 2 -
arch/cris/arch-v10/drivers/i2c.c | 2 -
arch/cris/arch-v32/drivers/cryptocop.c | 2 -
arch/cris/arch-v32/drivers/i2c.c | 12 ++++----
drivers/block/paride/pg.c | 7 ++--
drivers/block/paride/pt.c | 19 ++++++------
drivers/char/apm-emulation.c | 11 ++++---
drivers/char/applicom.c | 9 +++--
drivers/char/ds1302.c | 15 +++++----
drivers/char/ds1620.c | 8 ++--
drivers/char/dsp56k.c | 27 +++++++++--------
drivers/char/dtlk.c | 8 ++--
drivers/char/generic_nvram.c | 7 ++--
drivers/char/genrtc.c | 13 ++++----
drivers/char/i8k.c | 7 ++--
drivers/char/ip2/ip2main.c | 8 ++--
drivers/char/ipmi/ipmi_devintf.c | 14 ++++----
drivers/char/ipmi/ipmi_watchdog.c | 8 ++--
drivers/char/lp.c | 15 +++++----
drivers/char/mbcs.c | 8 ++--
drivers/char/mmtimer.c | 7 ++--
drivers/char/mwave/mwavedd.c | 44 ++++++++++++++--------------
drivers/char/nvram.c | 11 ++++---
drivers/char/nwflash.c | 12 ++++----
drivers/char/pcmcia/cm4000_cs.c | 11 ++++---
drivers/char/pcmcia/cm4040_cs.c | 7 ++--
drivers/char/ppdev.c | 8 ++--
drivers/char/rio/rio_linux.c | 7 ++--
drivers/char/snsc.c | 9 +++--
drivers/char/toshiba.c | 9 +++--
drivers/char/viotape.c | 11 ++++---
drivers/char/xilinx_hwicap/xilinx_hwicap.c | 6 ++--
drivers/hwmon/fschmd.c | 6 ++--
drivers/hwmon/w83793.c | 6 ++--
drivers/input/misc/hp_sdc_rtc.c | 7 ++--
drivers/isdn/capi/capi.c | 6 ++--
drivers/isdn/divert/divert_procfs.c | 7 ++--
drivers/isdn/hardware/eicon/divamnt.c | 7 ++--
drivers/isdn/hardware/eicon/divasi.c | 2 -
drivers/isdn/hardware/eicon/divasmain.c | 2 -
drivers/isdn/hysdn/hysdn_procconf.c | 21 +++++++------
drivers/isdn/hysdn/hysdn_proclog.c | 15 +++++----
drivers/isdn/i4l/isdn_common.c | 27 +++++++++--------
drivers/isdn/mISDN/timerdev.c | 7 ++--
drivers/macintosh/adb.c | 10 +++---
drivers/macintosh/smu.c | 6 ++--
drivers/macintosh/via-pmu.c | 11 ++++---
drivers/media/dvb/bt8xx/dst_ca.c | 7 ++--
drivers/media/video/cx88/cx88-blackbird.c | 13 ++++----
drivers/media/video/dabusb.c | 18 ++++++------
drivers/media/video/se401.c | 9 +++--
drivers/media/video/stradis.c | 9 +++--
drivers/media/video/usbvideo/vicam.c | 14 ++++----
drivers/message/fusion/mptctl.c | 15 +++++----
drivers/message/i2o/i2o_config.c | 23 +++++++-------
drivers/misc/phantom.c | 11 ++++---
drivers/mtd/mtdchar.c | 15 +++++----
drivers/net/ppp_generic.c | 19 ++++++------
drivers/net/wan/cosa.c | 10 +++---
drivers/pci/hotplug/cpqphp_sysfs.c | 13 ++++----
drivers/rtc/rtc-m41t80.c | 13 ++++----
drivers/sbus/char/display7seg.c | 8 ++--
drivers/sbus/char/envctrl.c | 2 -
drivers/sbus/char/flash.c | 15 +++++----
drivers/sbus/char/openprom.c | 15 +++++----
drivers/sbus/char/uctrl.c | 7 ++--
drivers/scsi/3w-9xxx.c | 7 ++--
drivers/scsi/3w-sas.c | 7 ++--
drivers/scsi/3w-xxxx.c | 9 ++---
drivers/scsi/aacraid/linit.c | 15 +++++----
drivers/scsi/ch.c | 8 ++--
drivers/scsi/dpt_i2o.c | 18 ++++++------
drivers/scsi/gdth.c | 11 ++++---
drivers/scsi/megaraid.c | 8 ++--
drivers/scsi/megaraid/megaraid_mm.c | 8 ++--
drivers/scsi/megaraid/megaraid_sas.c | 2 -
drivers/scsi/mpt2sas/mpt2sas_ctl.c | 11 ++++---
drivers/scsi/osst.c | 15 +++++----
drivers/scsi/scsi_tgt_if.c | 2 -
drivers/scsi/sg.c | 11 ++++---
drivers/staging/crystalhd/crystalhd_lnx.c | 9 +++--
drivers/staging/dt3155/dt3155_drv.c | 6 ++-
drivers/staging/vme/devices/vme_user.c | 7 ++--
drivers/telephony/ixj.c | 7 ++--
drivers/usb/gadget/printer.c | 7 ++--
drivers/usb/misc/iowarrior.c | 15 +++++----
drivers/usb/misc/rio500.c | 15 +++++----
drivers/usb/misc/usblcd.c | 16 +++++-----
drivers/watchdog/cpwd.c | 15 +++++----
fs/hfsplus/ioctl.c | 11 ++++---
net/wanrouter/wanmain.c | 7 ++--
net/wanrouter/wanproc.c | 7 ++--
92 files changed, 505 insertions(+), 469 deletions(-)
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Corey Minyard <minyard@acm.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: David Woodhouse <David.Woodhouse@intel.com>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: "James E.J. Bottomley" <James.Bottomley@suse.de>
Cc: Jesper Nilsson <jesper.nilsson@axis.com>
Cc: Karsten Keil <isdn@linux-pingi.de>
Cc: Mauro Carvalho Chehab <mchehab@infradead.org>
Cc: netdev@vger.kernel.org
Cc: openipmi-developer@lists.sourceforge.net
Cc: devel@driverdev.osuosl.org
Cc: linux-cris-kernel@axis.com
Cc: linux-media@vger.kernel.org
Cc: linux-mtd@lists.infradead.org
Cc: linuxppc-dev@ozlabs.org
Cc: linux-scsi@vger.kernel.org
Cc: linux-usb@vger.kernel.org
^ permalink raw reply
* [PATCH 02/12] isdn: autoconvert trivial BKL users to private mutex
From: Arnd Bergmann @ 2010-07-11 21:18 UTC (permalink / raw)
To: linux-kernel
Cc: John Kacur, Frederic Weisbecker, Arnd Bergmann, Karsten Keil,
netdev
In-Reply-To: <1278883143-29035-1-git-send-email-arnd@arndb.de>
All these files use the big kernel lock in a trivial
way to serialize their private file operations,
typically resulting from an earlier semi-automatic
pushdown from VFS.
None of these drivers appears to want to lock against
other code, and they all use the BKL as the top-level
lock in their file operations, meaning that there
is no lock-order inversion problem.
Consequently, we can remove the BKL completely,
replacing it with a per-file mutex in every case.
Using a scripted approach means we can avoid
typos.
file=$1
name=$2
if grep -q lock_kernel ${file} ; then
if grep -q 'include.*linux.mutex.h' ${file} ; then
sed -i '/include.*<linux\/smp_lock.h>/d' ${file}
else
sed -i 's/include.*<linux\/smp_lock.h>.*$/include <linux\/mutex.h>/g' ${file}
fi
sed -i ${file} \
-e "/^#include.*linux.mutex.h/,$ {
1,/^\(static\|int\|long\)/ {
/^\(static\|int\|long\)/istatic DEFINE_MUTEX(${name}_mutex);
} }" \
-e "s/\(un\)*lock_kernel\>[ ]*()/mutex_\1lock(\&${name}_mutex)/g" \
-e '/[ ]*cycle_kernel_lock();/d'
else
sed -i -e '/include.*\<smp_lock.h\>/d' ${file} \
-e '/cycle_kernel_lock()/d'
fi
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Karsten Keil <isdn@linux-pingi.de>
Cc: netdev@vger.kernel.org
---
drivers/isdn/capi/capi.c | 6 +++---
drivers/isdn/divert/divert_procfs.c | 7 ++++---
drivers/isdn/hardware/eicon/divamnt.c | 7 ++++---
drivers/isdn/hardware/eicon/divasi.c | 2 --
drivers/isdn/hardware/eicon/divasmain.c | 2 --
drivers/isdn/hysdn/hysdn_procconf.c | 21 +++++++++++----------
drivers/isdn/hysdn/hysdn_proclog.c | 15 ++++++++-------
drivers/isdn/i4l/isdn_common.c | 27 ++++++++++++++-------------
drivers/isdn/mISDN/timerdev.c | 7 ++++---
9 files changed, 48 insertions(+), 46 deletions(-)
diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c
index 0cabe31..b0a4a69 100644
--- a/drivers/isdn/capi/capi.c
+++ b/drivers/isdn/capi/capi.c
@@ -20,7 +20,6 @@
#include <linux/signal.h>
#include <linux/mutex.h>
#include <linux/mm.h>
-#include <linux/smp_lock.h>
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/tty.h>
@@ -50,6 +49,7 @@ MODULE_LICENSE("GPL");
/* -------- driver information -------------------------------------- */
+static DEFINE_MUTEX(capi_mutex);
static struct class *capi_class;
static int capi_major = 68; /* allocated */
@@ -985,9 +985,9 @@ capi_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
int ret;
- lock_kernel();
+ mutex_lock(&capi_mutex);
ret = capi_ioctl(file, cmd, arg);
- unlock_kernel();
+ mutex_unlock(&capi_mutex);
return ret;
}
diff --git a/drivers/isdn/divert/divert_procfs.c b/drivers/isdn/divert/divert_procfs.c
index c53e241..33ec9e4 100644
--- a/drivers/isdn/divert/divert_procfs.c
+++ b/drivers/isdn/divert/divert_procfs.c
@@ -20,7 +20,7 @@
#include <linux/sched.h>
#include <linux/isdnif.h>
#include <net/net_namespace.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include "isdn_divert.h"
@@ -28,6 +28,7 @@
/* Variables for interface queue */
/*********************************/
ulong if_used = 0; /* number of interface users */
+static DEFINE_MUTEX(isdn_divert_mutex);
static struct divert_info *divert_info_head = NULL; /* head of queue */
static struct divert_info *divert_info_tail = NULL; /* pointer to last entry */
static DEFINE_SPINLOCK(divert_info_lock);/* lock for queue */
@@ -261,9 +262,9 @@ static long isdn_divert_ioctl(struct file *file, uint cmd, ulong arg)
{
long ret;
- lock_kernel();
+ mutex_lock(&isdn_divert_mutex);
ret = isdn_divert_ioctl_unlocked(file, cmd, arg);
- unlock_kernel();
+ mutex_unlock(&isdn_divert_mutex);
return ret;
}
diff --git a/drivers/isdn/hardware/eicon/divamnt.c b/drivers/isdn/hardware/eicon/divamnt.c
index 1e85f74..f1d464f 100644
--- a/drivers/isdn/hardware/eicon/divamnt.c
+++ b/drivers/isdn/hardware/eicon/divamnt.c
@@ -14,7 +14,7 @@
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/poll.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <asm/uaccess.h>
#include "platform.h"
@@ -22,6 +22,7 @@
#include "divasync.h"
#include "debug_if.h"
+static DEFINE_MUTEX(maint_mutex);
static char *main_revision = "$Revision: 1.32.6.10 $";
static int major;
@@ -130,7 +131,7 @@ static int maint_open(struct inode *ino, struct file *filep)
{
int ret;
- lock_kernel();
+ mutex_lock(&maint_mutex);
/* only one open is allowed, so we test
it atomically */
if (test_and_set_bit(0, &opened))
@@ -139,7 +140,7 @@ static int maint_open(struct inode *ino, struct file *filep)
filep->private_data = NULL;
ret = nonseekable_open(ino, filep);
}
- unlock_kernel();
+ mutex_unlock(&maint_mutex);
return ret;
}
diff --git a/drivers/isdn/hardware/eicon/divasi.c b/drivers/isdn/hardware/eicon/divasi.c
index f577719..42d3b83 100644
--- a/drivers/isdn/hardware/eicon/divasi.c
+++ b/drivers/isdn/hardware/eicon/divasi.c
@@ -18,7 +18,6 @@
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/seq_file.h>
-#include <linux/smp_lock.h>
#include <asm/uaccess.h>
#include "platform.h"
@@ -402,7 +401,6 @@ static unsigned int um_idi_poll(struct file *file, poll_table * wait)
static int um_idi_open(struct inode *inode, struct file *file)
{
- cycle_kernel_lock();
return (0);
}
diff --git a/drivers/isdn/hardware/eicon/divasmain.c b/drivers/isdn/hardware/eicon/divasmain.c
index fbbcb27..16a874b 100644
--- a/drivers/isdn/hardware/eicon/divasmain.c
+++ b/drivers/isdn/hardware/eicon/divasmain.c
@@ -21,7 +21,6 @@
#include <linux/list.h>
#include <linux/poll.h>
#include <linux/kmod.h>
-#include <linux/smp_lock.h>
#include "platform.h"
#undef ID_MASK
@@ -581,7 +580,6 @@ xdi_copy_from_user(void *os_handle, void *dst, const void __user *src, int lengt
*/
static int divas_open(struct inode *inode, struct file *file)
{
- cycle_kernel_lock();
return (0);
}
diff --git a/drivers/isdn/hysdn/hysdn_procconf.c b/drivers/isdn/hysdn/hysdn_procconf.c
index 8096646..96b3e39 100644
--- a/drivers/isdn/hysdn/hysdn_procconf.c
+++ b/drivers/isdn/hysdn/hysdn_procconf.c
@@ -17,11 +17,12 @@
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <net/net_namespace.h>
#include "hysdn_defs.h"
+static DEFINE_MUTEX(hysdn_conf_mutex);
static char *hysdn_procconf_revision = "$Revision: 1.8.6.4 $";
#define INFO_OUT_LEN 80 /* length of info line including lf */
@@ -234,7 +235,7 @@ hysdn_conf_open(struct inode *ino, struct file *filep)
char *cp, *tmp;
/* now search the addressed card */
- lock_kernel();
+ mutex_lock(&hysdn_conf_mutex);
card = card_root;
while (card) {
pd = card->procconf;
@@ -243,7 +244,7 @@ hysdn_conf_open(struct inode *ino, struct file *filep)
card = card->next; /* search next entry */
}
if (!card) {
- unlock_kernel();
+ mutex_unlock(&hysdn_conf_mutex);
return (-ENODEV); /* device is unknown/invalid */
}
if (card->debug_flags & (LOG_PROC_OPEN | LOG_PROC_ALL))
@@ -255,7 +256,7 @@ hysdn_conf_open(struct inode *ino, struct file *filep)
/* write only access -> write boot file or conf line */
if (!(cnf = kmalloc(sizeof(struct conf_writedata), GFP_KERNEL))) {
- unlock_kernel();
+ mutex_unlock(&hysdn_conf_mutex);
return (-EFAULT);
}
cnf->card = card;
@@ -267,7 +268,7 @@ hysdn_conf_open(struct inode *ino, struct file *filep)
/* read access -> output card info data */
if (!(tmp = kmalloc(INFO_OUT_LEN * 2 + 2, GFP_KERNEL))) {
- unlock_kernel();
+ mutex_unlock(&hysdn_conf_mutex);
return (-EFAULT); /* out of memory */
}
filep->private_data = tmp; /* start of string */
@@ -301,10 +302,10 @@ hysdn_conf_open(struct inode *ino, struct file *filep)
*cp++ = '\n';
*cp = 0; /* end of string */
} else { /* simultaneous read/write access forbidden ! */
- unlock_kernel();
+ mutex_unlock(&hysdn_conf_mutex);
return (-EPERM); /* no permission this time */
}
- unlock_kernel();
+ mutex_unlock(&hysdn_conf_mutex);
return nonseekable_open(ino, filep);
} /* hysdn_conf_open */
@@ -319,7 +320,7 @@ hysdn_conf_close(struct inode *ino, struct file *filep)
int retval = 0;
struct proc_dir_entry *pd;
- lock_kernel();
+ mutex_lock(&hysdn_conf_mutex);
/* search the addressed card */
card = card_root;
while (card) {
@@ -329,7 +330,7 @@ hysdn_conf_close(struct inode *ino, struct file *filep)
card = card->next; /* search next entry */
}
if (!card) {
- unlock_kernel();
+ mutex_unlock(&hysdn_conf_mutex);
return (-ENODEV); /* device is unknown/invalid */
}
if (card->debug_flags & (LOG_PROC_OPEN | LOG_PROC_ALL))
@@ -352,7 +353,7 @@ hysdn_conf_close(struct inode *ino, struct file *filep)
kfree(filep->private_data); /* release memory */
}
- unlock_kernel();
+ mutex_unlock(&hysdn_conf_mutex);
return (retval);
} /* hysdn_conf_close */
diff --git a/drivers/isdn/hysdn/hysdn_proclog.c b/drivers/isdn/hysdn/hysdn_proclog.c
index e83f6fd..37a9dd3 100644
--- a/drivers/isdn/hysdn/hysdn_proclog.c
+++ b/drivers/isdn/hysdn/hysdn_proclog.c
@@ -15,13 +15,14 @@
#include <linux/proc_fs.h>
#include <linux/sched.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include "hysdn_defs.h"
/* the proc subdir for the interface is defined in the procconf module */
extern struct proc_dir_entry *hysdn_proc_entry;
+static DEFINE_MUTEX(hysdn_log_mutex);
static void put_log_buffer(hysdn_card * card, char *cp);
/*************************************************/
@@ -251,7 +252,7 @@ hysdn_log_open(struct inode *ino, struct file *filep)
struct procdata *pd = NULL;
unsigned long flags;
- lock_kernel();
+ mutex_lock(&hysdn_log_mutex);
card = card_root;
while (card) {
pd = card->proclog;
@@ -260,7 +261,7 @@ hysdn_log_open(struct inode *ino, struct file *filep)
card = card->next; /* search next entry */
}
if (!card) {
- unlock_kernel();
+ mutex_unlock(&hysdn_log_mutex);
return (-ENODEV); /* device is unknown/invalid */
}
filep->private_data = card; /* remember our own card */
@@ -278,10 +279,10 @@ hysdn_log_open(struct inode *ino, struct file *filep)
filep->private_data = &pd->log_head;
spin_unlock_irqrestore(&card->hysdn_lock, flags);
} else { /* simultaneous read/write access forbidden ! */
- unlock_kernel();
+ mutex_unlock(&hysdn_log_mutex);
return (-EPERM); /* no permission this time */
}
- unlock_kernel();
+ mutex_unlock(&hysdn_log_mutex);
return nonseekable_open(ino, filep);
} /* hysdn_log_open */
@@ -300,7 +301,7 @@ hysdn_log_close(struct inode *ino, struct file *filep)
hysdn_card *card;
int retval = 0;
- lock_kernel();
+ mutex_lock(&hysdn_log_mutex);
if ((filep->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_WRITE) {
/* write only access -> write debug level written */
retval = 0; /* success */
@@ -339,7 +340,7 @@ hysdn_log_close(struct inode *ino, struct file *filep)
kfree(inf);
}
} /* read access */
- unlock_kernel();
+ mutex_unlock(&hysdn_log_mutex);
return (retval);
} /* hysdn_log_close */
diff --git a/drivers/isdn/i4l/isdn_common.c b/drivers/isdn/i4l/isdn_common.c
index a44cdb4..15632bd 100644
--- a/drivers/isdn/i4l/isdn_common.c
+++ b/drivers/isdn/i4l/isdn_common.c
@@ -17,7 +17,7 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/isdn.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include "isdn_common.h"
#include "isdn_tty.h"
#include "isdn_net.h"
@@ -42,6 +42,7 @@ MODULE_LICENSE("GPL");
isdn_dev *dev;
+static DEFINE_MUTEX(isdn_mutex);
static char *isdn_revision = "$Revision: 1.1.2.3 $";
extern char *isdn_net_revision;
@@ -1070,7 +1071,7 @@ isdn_read(struct file *file, char __user *buf, size_t count, loff_t * off)
int retval;
char *p;
- lock_kernel();
+ mutex_lock(&isdn_mutex);
if (minor == ISDN_MINOR_STATUS) {
if (!file->private_data) {
if (file->f_flags & O_NONBLOCK) {
@@ -1163,7 +1164,7 @@ isdn_read(struct file *file, char __user *buf, size_t count, loff_t * off)
#endif
retval = -ENODEV;
out:
- unlock_kernel();
+ mutex_unlock(&isdn_mutex);
return retval;
}
@@ -1180,7 +1181,7 @@ isdn_write(struct file *file, const char __user *buf, size_t count, loff_t * off
if (!dev->drivers)
return -ENODEV;
- lock_kernel();
+ mutex_lock(&isdn_mutex);
if (minor <= ISDN_MINOR_BMAX) {
printk(KERN_WARNING "isdn_write minor %d obsolete!\n", minor);
drvidx = isdn_minor2drv(minor);
@@ -1225,7 +1226,7 @@ isdn_write(struct file *file, const char __user *buf, size_t count, loff_t * off
#endif
retval = -ENODEV;
out:
- unlock_kernel();
+ mutex_unlock(&isdn_mutex);
return retval;
}
@@ -1236,7 +1237,7 @@ isdn_poll(struct file *file, poll_table * wait)
unsigned int minor = iminor(file->f_path.dentry->d_inode);
int drvidx = isdn_minor2drv(minor - ISDN_MINOR_CTRL);
- lock_kernel();
+ mutex_lock(&isdn_mutex);
if (minor == ISDN_MINOR_STATUS) {
poll_wait(file, &(dev->info_waitq), wait);
/* mask = POLLOUT | POLLWRNORM; */
@@ -1266,7 +1267,7 @@ isdn_poll(struct file *file, poll_table * wait)
#endif
mask = POLLERR;
out:
- unlock_kernel();
+ mutex_unlock(&isdn_mutex);
return mask;
}
@@ -1727,9 +1728,9 @@ isdn_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
int ret;
- lock_kernel();
+ mutex_lock(&isdn_mutex);
ret = isdn_ioctl(file, cmd, arg);
- unlock_kernel();
+ mutex_unlock(&isdn_mutex);
return ret;
}
@@ -1745,7 +1746,7 @@ isdn_open(struct inode *ino, struct file *filep)
int chidx;
int retval = -ENODEV;
- lock_kernel();
+ mutex_lock(&isdn_mutex);
if (minor == ISDN_MINOR_STATUS) {
infostruct *p;
@@ -1796,7 +1797,7 @@ isdn_open(struct inode *ino, struct file *filep)
#endif
out:
nonseekable_open(ino, filep);
- unlock_kernel();
+ mutex_unlock(&isdn_mutex);
return retval;
}
@@ -1805,7 +1806,7 @@ isdn_close(struct inode *ino, struct file *filep)
{
uint minor = iminor(ino);
- lock_kernel();
+ mutex_lock(&isdn_mutex);
if (minor == ISDN_MINOR_STATUS) {
infostruct *p = dev->infochain;
infostruct *q = NULL;
@@ -1839,7 +1840,7 @@ isdn_close(struct inode *ino, struct file *filep)
#endif
out:
- unlock_kernel();
+ mutex_unlock(&isdn_mutex);
return 0;
}
diff --git a/drivers/isdn/mISDN/timerdev.c b/drivers/isdn/mISDN/timerdev.c
index 81048b8..de43c8c 100644
--- a/drivers/isdn/mISDN/timerdev.c
+++ b/drivers/isdn/mISDN/timerdev.c
@@ -24,9 +24,10 @@
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mISDNif.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include "core.h"
+static DEFINE_MUTEX(mISDN_mutex);
static u_int *debug;
@@ -224,7 +225,7 @@ mISDN_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
if (*debug & DEBUG_TIMER)
printk(KERN_DEBUG "%s(%p, %x, %lx)\n", __func__,
filep, cmd, arg);
- lock_kernel();
+ mutex_lock(&mISDN_mutex);
switch (cmd) {
case IMADDTIMER:
if (get_user(tout, (int __user *)arg)) {
@@ -256,7 +257,7 @@ mISDN_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
default:
ret = -EINVAL;
}
- unlock_kernel();
+ mutex_unlock(&mISDN_mutex);
return ret;
}
--
1.7.1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox