* [PATCH 5/5] netfilter: hashlimit: byte-based limit mode
From: pablo @ 2012-05-09 11:33 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1336563188-6720-1-git-send-email-pablo@netfilter.org>
From: Florian Westphal <fw@strlen.de>
can be used e.g. for ingress traffic policing or
to detect when a host/port consumes more bandwidth than expected.
This is done by optionally making cost to mean
"cost per 16-byte-chunk-of-data" instead of "cost per packet".
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
include/linux/netfilter/xt_hashlimit.h | 10 ++-
net/netfilter/xt_hashlimit.c | 116 ++++++++++++++++++++++++++------
2 files changed, 106 insertions(+), 20 deletions(-)
diff --git a/include/linux/netfilter/xt_hashlimit.h b/include/linux/netfilter/xt_hashlimit.h
index b1925b5..05fe799 100644
--- a/include/linux/netfilter/xt_hashlimit.h
+++ b/include/linux/netfilter/xt_hashlimit.h
@@ -6,7 +6,11 @@
/* timings are in milliseconds. */
#define XT_HASHLIMIT_SCALE 10000
/* 1/10,000 sec period => max of 10,000/sec. Min rate is then 429490
- seconds, or one every 59 hours. */
+ * seconds, or one packet every 59 hours.
+ */
+
+/* packet length accounting is done in 16-byte steps */
+#define XT_HASHLIMIT_BYTE_SHIFT 4
/* details of this structure hidden by the implementation */
struct xt_hashlimit_htable;
@@ -17,6 +21,10 @@ enum {
XT_HASHLIMIT_HASH_SIP = 1 << 2,
XT_HASHLIMIT_HASH_SPT = 1 << 3,
XT_HASHLIMIT_INVERT = 1 << 4,
+ XT_HASHLIMIT_BYTES = 1 << 5,
+#ifdef __KERNEL__
+ XT_HASHLIMIT_MAX = 1 << 6,
+#endif
};
struct hashlimit_cfg {
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index b6bbd06..d0424f9 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -388,6 +388,18 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
+/* in byte mode, the lowest possible rate is one packet/second.
+ * credit_cap is used as a counter that tells us how many times we can
+ * refill the "credits available" counter when it becomes empty.
+ */
+#define MAX_CPJ_BYTES (0xFFFFFFFF / HZ)
+#define CREDITS_PER_JIFFY_BYTES POW2_BELOW32(MAX_CPJ_BYTES)
+
+static u32 xt_hashlimit_len_to_chunks(u32 len)
+{
+ return (len >> XT_HASHLIMIT_BYTE_SHIFT) + 1;
+}
+
/* Precision saver. */
static u32 user2credits(u32 user)
{
@@ -399,21 +411,53 @@ static u32 user2credits(u32 user)
return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE;
}
-static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now)
+static u32 user2credits_byte(u32 user)
{
- dh->rateinfo.credit += (now - dh->rateinfo.prev) * CREDITS_PER_JIFFY;
- if (dh->rateinfo.credit > dh->rateinfo.credit_cap)
- dh->rateinfo.credit = dh->rateinfo.credit_cap;
+ u64 us = user;
+ us *= HZ * CREDITS_PER_JIFFY_BYTES;
+ return (u32) (us >> 32);
+}
+
+static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode)
+{
+ unsigned long delta = now - dh->rateinfo.prev;
+ u32 cap;
+
+ if (delta == 0)
+ return;
+
dh->rateinfo.prev = now;
+
+ if (mode & XT_HASHLIMIT_BYTES) {
+ u32 tmp = dh->rateinfo.credit;
+ dh->rateinfo.credit += CREDITS_PER_JIFFY_BYTES * delta;
+ cap = CREDITS_PER_JIFFY_BYTES * HZ;
+ if (tmp >= dh->rateinfo.credit) {/* overflow */
+ dh->rateinfo.credit = cap;
+ return;
+ }
+ } else {
+ dh->rateinfo.credit += delta * CREDITS_PER_JIFFY;
+ cap = dh->rateinfo.credit_cap;
+ }
+ if (dh->rateinfo.credit > cap)
+ dh->rateinfo.credit = cap;
}
static void rateinfo_init(struct dsthash_ent *dh,
struct xt_hashlimit_htable *hinfo)
{
dh->rateinfo.prev = jiffies;
- dh->rateinfo.credit = user2credits(hinfo->cfg.avg * hinfo->cfg.burst);
- dh->rateinfo.cost = user2credits(hinfo->cfg.avg);
- dh->rateinfo.credit_cap = dh->rateinfo.credit;
+ if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) {
+ dh->rateinfo.credit = CREDITS_PER_JIFFY_BYTES * HZ;
+ dh->rateinfo.cost = user2credits_byte(hinfo->cfg.avg);
+ dh->rateinfo.credit_cap = hinfo->cfg.burst;
+ } else {
+ dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
+ hinfo->cfg.burst);
+ dh->rateinfo.cost = user2credits(hinfo->cfg.avg);
+ dh->rateinfo.credit_cap = dh->rateinfo.credit;
+ }
}
static inline __be32 maskl(__be32 a, unsigned int l)
@@ -519,6 +563,21 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
return 0;
}
+static u32 hashlimit_byte_cost(unsigned int len, struct dsthash_ent *dh)
+{
+ u64 tmp = xt_hashlimit_len_to_chunks(len);
+ tmp = tmp * dh->rateinfo.cost;
+
+ if (unlikely(tmp > CREDITS_PER_JIFFY_BYTES * HZ))
+ tmp = CREDITS_PER_JIFFY_BYTES * HZ;
+
+ if (dh->rateinfo.credit < tmp && dh->rateinfo.credit_cap) {
+ dh->rateinfo.credit_cap--;
+ dh->rateinfo.credit = CREDITS_PER_JIFFY_BYTES * HZ;
+ }
+ return (u32) tmp;
+}
+
static bool
hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
@@ -527,6 +586,7 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
unsigned long now = jiffies;
struct dsthash_ent *dh;
struct dsthash_dst dst;
+ u32 cost;
if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0)
goto hotdrop;
@@ -544,12 +604,17 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
} else {
/* update expiration timeout */
dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
- rateinfo_recalc(dh, now);
+ rateinfo_recalc(dh, now, hinfo->cfg.mode);
}
- if (dh->rateinfo.credit >= dh->rateinfo.cost) {
+ if (info->cfg.mode & XT_HASHLIMIT_BYTES)
+ cost = hashlimit_byte_cost(skb->len, dh);
+ else
+ cost = dh->rateinfo.cost;
+
+ if (dh->rateinfo.credit >= cost) {
/* below the limit */
- dh->rateinfo.credit -= dh->rateinfo.cost;
+ dh->rateinfo.credit -= cost;
spin_unlock(&dh->lock);
rcu_read_unlock_bh();
return !(info->cfg.mode & XT_HASHLIMIT_INVERT);
@@ -571,14 +636,6 @@ static int hashlimit_mt_check(const struct xt_mtchk_param *par)
struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
int ret;
- /* Check for overflow. */
- if (info->cfg.burst == 0 ||
- user2credits(info->cfg.avg * info->cfg.burst) <
- user2credits(info->cfg.avg)) {
- pr_info("overflow, try lower: %u/%u\n",
- info->cfg.avg, info->cfg.burst);
- return -ERANGE;
- }
if (info->cfg.gc_interval == 0 || info->cfg.expire == 0)
return -EINVAL;
if (info->name[sizeof(info->name)-1] != '\0')
@@ -591,6 +648,26 @@ static int hashlimit_mt_check(const struct xt_mtchk_param *par)
return -EINVAL;
}
+ if (info->cfg.mode >= XT_HASHLIMIT_MAX) {
+ pr_info("Unknown mode mask %X, kernel too old?\n",
+ info->cfg.mode);
+ return -EINVAL;
+ }
+
+ /* Check for overflow. */
+ if (info->cfg.mode & XT_HASHLIMIT_BYTES) {
+ if (user2credits_byte(info->cfg.avg) == 0) {
+ pr_info("overflow, rate too high: %u\n", info->cfg.avg);
+ return -EINVAL;
+ }
+ } else if (info->cfg.burst == 0 ||
+ user2credits(info->cfg.avg * info->cfg.burst) <
+ user2credits(info->cfg.avg)) {
+ pr_info("overflow, try lower: %u/%u\n",
+ info->cfg.avg, info->cfg.burst);
+ return -ERANGE;
+ }
+
mutex_lock(&hashlimit_mutex);
info->hinfo = htable_find_get(net, info->name, par->family);
if (info->hinfo == NULL) {
@@ -683,10 +760,11 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
struct seq_file *s)
{
int res;
+ const struct xt_hashlimit_htable *ht = s->private;
spin_lock(&ent->lock);
/* recalculate to show accurate numbers */
- rateinfo_recalc(ent, jiffies);
+ rateinfo_recalc(ent, jiffies, ht->cfg.mode);
switch (family) {
case NFPROTO_IPV4:
--
1.7.9.5
^ permalink raw reply related
* [PATCH 0/5] netfilter updates for net-next (upcoming 3.5), batch 2
From: pablo @ 2012-05-09 11:33 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
From: Pablo Neira Ayuso <pablo@netfilter.org>
Hi David,
This is a second batch of netfilter updates for net-next, they contain:
* The new HMARK target from Hans Schillstrom. It took lots of spins
to get this into shape. This target provides a hash-based packet / flow
pre-classifier for iptables that can be used to distribute packets
/ flows between uplinks and backend servers. It provides to modes, one
that relies on conntrack, and one that is stateless per-packet.
* Byte-based cost calculation for the hashlimit match, to detect when
a host consumes more bandwidth than expected. This patch from Florian
Westphal.
You can pull these changes from:
git://1984.lsi.us.es/net-next
Thanks!
Florian Westphal (3):
netfilter: limit, hashlimit: avoid duplicated inline
netfilter: hashlimit: move rateinfo initialization to helper
netfilter: hashlimit: byte-based limit mode
Hans Schillstrom (2):
netfilter: ip6_tables: add flags parameter to ipv6_find_hdr()
netfilter: add xt_hmark target for hash-based skb marking
include/linux/netfilter/xt_HMARK.h | 45 ++++
include/linux/netfilter/xt_hashlimit.h | 10 +-
include/linux/netfilter_ipv6/ip6_tables.h | 7 +-
net/ipv6/netfilter/ip6_tables.c | 36 ++-
net/ipv6/netfilter/ip6t_ah.c | 4 +-
net/ipv6/netfilter/ip6t_frag.c | 4 +-
net/ipv6/netfilter/ip6t_hbh.c | 4 +-
net/ipv6/netfilter/ip6t_rt.c | 4 +-
net/netfilter/Kconfig | 15 ++
net/netfilter/Makefile | 1 +
net/netfilter/xt_HMARK.c | 362 +++++++++++++++++++++++++++++
net/netfilter/xt_TPROXY.c | 4 +-
net/netfilter/xt_hashlimit.c | 129 ++++++++--
net/netfilter/xt_limit.c | 5 +-
net/netfilter/xt_socket.c | 4 +-
15 files changed, 588 insertions(+), 46 deletions(-)
create mode 100644 include/linux/netfilter/xt_HMARK.h
create mode 100644 net/netfilter/xt_HMARK.c
--
1.7.9.5
^ permalink raw reply
* [PATCH 4/5] netfilter: hashlimit: move rateinfo initialization to helper
From: pablo @ 2012-05-09 11:33 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1336563188-6720-1-git-send-email-pablo@netfilter.org>
From: Florian Westphal <fw@strlen.de>
followup patch would bloat main match function too much.
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/xt_hashlimit.c | 15 ++++++++++-----
1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 2195eb0..b6bbd06 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -407,6 +407,15 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now)
dh->rateinfo.prev = now;
}
+static void rateinfo_init(struct dsthash_ent *dh,
+ struct xt_hashlimit_htable *hinfo)
+{
+ dh->rateinfo.prev = jiffies;
+ dh->rateinfo.credit = user2credits(hinfo->cfg.avg * hinfo->cfg.burst);
+ dh->rateinfo.cost = user2credits(hinfo->cfg.avg);
+ dh->rateinfo.credit_cap = dh->rateinfo.credit;
+}
+
static inline __be32 maskl(__be32 a, unsigned int l)
{
return l ? htonl(ntohl(a) & ~0 << (32 - l)) : 0;
@@ -531,11 +540,7 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
goto hotdrop;
}
dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
- dh->rateinfo.prev = jiffies;
- dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
- hinfo->cfg.burst);
- dh->rateinfo.credit_cap = dh->rateinfo.credit;
- dh->rateinfo.cost = user2credits(hinfo->cfg.avg);
+ rateinfo_init(dh, hinfo);
} else {
/* update expiration timeout */
dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
--
1.7.9.5
^ permalink raw reply related
* Re: [PATCH net-next] netfilter: remove two dropwatch false positives
From: Pablo Neira Ayuso @ 2012-05-09 11:36 UTC (permalink / raw)
To: Eric Dumazet; +Cc: netdev, netfilter-devel, David Miller
In-Reply-To: <1335942790.22133.55.camel@edumazet-glaptop>
On Wed, May 02, 2012 at 09:13:10AM +0200, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> After skb_copy_expand() call, we want to switch skbs without dropping
> original one. Call consume_skb() to avoid drop_monitor sending a drop
> event.
We have removed ip_queue in 3.5. That's why I cannot take this patch.
^ permalink raw reply
* Re: [Lksctp-developers] Bug: sctp packets are dropped after IPSEC rekeying (route cache issue)
From: Nicolas Dichtel @ 2012-05-09 12:04 UTC (permalink / raw)
To: Vladislav Yasevich
Cc: David Miller, babu.srinivasan, lksctp-developers, linux-sctp,
netdev, michael.kreuzer
In-Reply-To: <CAGCdqXFjyWst22DJgxNfakz3-JwRq0Z+WZVj-XT=qRQFY5w3rw@mail.gmail.com>
Le 04/05/2012 20:15, Vladislav Yasevich a écrit :
> On Fri, May 4, 2012 at 11:24 AM, Nicolas Dichtel
> <nicolas.dichtel@6wind.com>wrote:
>
>> Le 04/05/2012 16:48, David Miller a écrit :
>>
>> From: Nicolas Dichtel<nicolas.dichtel@6wind.**com<nicolas.dichtel@6wind.com>
>>>>
>>> Date: Fri, 04 May 2012 12:07:38 +0200
>>>
>>> Finally, this patch was never integrated into the mainline. Should I
>>>> rebase it on the head?
It applies cleanly on linus tree, so I will not rebase it.
Regards,
Nicolas
>>>>
>>>> I've attach the last approved patch.
>>>>
>>>> Here is the original thread:
>>>> http://sourceforge.net/**mailarchive/message.php?msg_**id=25786006<http://sourceforge.net/mailarchive/message.php?msg_id=25786006>
>>>>
>>>
>>> Vlad no longer works for HP so your email likely will bounce, and
>>> he will not see it.
>>>
>> Right.
>>
>>
>>
>>> His new email address is vyasevich@gmail.com, as per the MAINTAINERS
>>> file.
>>>
>>
>> I put the right email address now. I attach the patch again, for Vlad.
>>
>>
>> Thank you,
>> Nicolas
>>
>
>
> Right. This looks fine.
>
> Acked-by: Vlad Yaseivch<vyasevich@gmail.com>
^ permalink raw reply
* [PATCH] ip.7: Improve explanation about calling listen or connect
From: Flavio Leitner @ 2012-05-09 12:30 UTC (permalink / raw)
To: mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w
Cc: linux-man-u79uwXL29TY76Z2rM5mHXA, netdev, Flavio Leitner
Signed-off-by: Flavio Leitner <fbl-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
---
man7/ip.7 | 15 +++++++++------
1 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/man7/ip.7 b/man7/ip.7
index 9f560df..84fe32d 100644
--- a/man7/ip.7
+++ b/man7/ip.7
@@ -69,12 +69,11 @@ For
you may specify a valid IANA IP protocol defined in
RFC\ 1700 assigned numbers.
.PP
-.\" FIXME ip current does an autobind in listen, but I'm not sure
-.\" if that should be documented.
When a process wants to receive new incoming packets or connections, it
should bind a socket to a local interface address using
.BR bind (2).
-Only one IP socket may be bound to any given local (address, port) pair.
+In this case, only one IP socket may be bound to any given local
+(address, port) pair.
When
.B INADDR_ANY
is specified in the bind call, the socket will be bound to
@@ -82,10 +81,14 @@ is specified in the bind call, the socket will be bound to
local interfaces.
When
.BR listen (2)
-or
+is called on an unbound socket, the socket is automatically bound
+to a random free port with the local address set to
+.BR INADDR_ANY .
+When
.BR connect (2)
-are called on an unbound socket, it is automatically bound to a
-random free port with the local address set to
+is called on an unbound socket, the socket is automatically bound
+to a random free port or an usable shared port with the local address
+set to
.BR INADDR_ANY .
A TCP local socket address that has been bound is unavailable for
--
1.7.7.1
--
To unsubscribe from this list: send the line "unsubscribe linux-man" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* Re: [PATCH net-next] netfilter: remove two dropwatch false positives
From: Eric Dumazet @ 2012-05-09 13:00 UTC (permalink / raw)
To: Pablo Neira Ayuso; +Cc: netdev, netfilter-devel, David Miller
In-Reply-To: <20120509113647.GA23243@1984>
Le mercredi 09 mai 2012 à 13:36 +0200, Pablo Neira Ayuso a écrit :
> On Wed, May 02, 2012 at 09:13:10AM +0200, Eric Dumazet wrote:
> > From: Eric Dumazet <edumazet@google.com>
> >
> > After skb_copy_expand() call, we want to switch skbs without dropping
> > original one. Call consume_skb() to avoid drop_monitor sending a drop
> > event.
>
> We have removed ip_queue in 3.5. That's why I cannot take this patch.
So far, so good !
Thanks
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [PATCH 0/5] netfilter updates for net-next (upcoming 3.5), batch 2
From: Eric Dumazet @ 2012-05-09 13:04 UTC (permalink / raw)
To: pablo, Deng-Cheng Zhu; +Cc: netfilter-devel, davem, netdev
In-Reply-To: <1336563188-6720-1-git-send-email-pablo@netfilter.org>
Le mercredi 09 mai 2012 à 13:33 +0200, pablo@netfilter.org a écrit :
> From: Pablo Neira Ayuso <pablo@netfilter.org>
>
> Hi David,
>
> This is a second batch of netfilter updates for net-next, they contain:
>
> * The new HMARK target from Hans Schillstrom. It took lots of spins
> to get this into shape. This target provides a hash-based packet / flow
> pre-classifier for iptables that can be used to distribute packets
> / flows between uplinks and backend servers. It provides to modes, one
> that relies on conntrack, and one that is stateless per-packet.
It seems this might be a good base for Deng-Cheng Zhu concerns.
(subject on netdev : RPS: Sparse connection optimizations )
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* RE: [PATCH] pch_gbe: Adding read memory barriers
From: Ben Hutchings @ 2012-05-09 13:18 UTC (permalink / raw)
To: David Laight; +Cc: Erwan Velu, netdev, linux-kernel, tshimizu818
In-Reply-To: <AE90C24D6B3A694183C094C60CF0A2F602CD83CB@saturn3.aculab.com>
On Wed, 2012-05-09 at 11:47 +0100, David Laight wrote:
> > Under a strong incoming packet stream and/or high cpu usage,
> > the pch_gbe driver reports "Receive CRC Error" and discards packet.
> >
> > It occurred on an Intel ATOM E620T while running a
> > 300mbit/sec multicast
> > network stream leading to a ~100% cpu usage.
> >
> > Adding rmb() calls before considering the network card's status solve
> > this issue.
> >
> > Getting it into stable would be perfect as it solves
> > reliability issues.
> >
> > Signed-off-by: Erwan Velu <erwan.velu@zodiacaerospace.com>
> > ---
> > .../net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c | 3 +++
> > 1 files changed, 3 insertions(+), 0 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
> > b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
> > index 8035e5f..ace3654 100644
> > --- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
> > +++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
> > @@ -1413,6 +1413,7 @@ static irqreturn_t pch_gbe_intr(int
> > irq, void *data)
> > pch_gbe_mac_set_pause_packet(hw);
> > }
> > }
> > + smp_rmb(); /* prevent any other reads before*/
>
> Under the assumption that your memory references are uncached,
> you only need to stop gcc reordering the object code,
> Rather than actually adding one of the 'fence' instructions.
Also, the usual MMIO functions already include compiler barriers.
> So you should only need: asm volatile(:::"memory")
> NFI which define generates that, the defines in the copy of
> sysdep.h I just looked at always include one of the fences.
This is barrier(). But I think this must be intended to control
ordering of fields that are written elsewhere.
Really, this needs a much more specific comment to explain the intent.
Then reviewers can work out whether it actually achieves the intent!
Ben.
--
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
^ permalink raw reply
* Re: [PATCH RFC 0/6] copy aside frags with destructors (was [PATCH 7/9] net: add skb_orphan_frags to copy aside frags with destructors)
From: Ian Campbell @ 2012-05-09 13:18 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: David Miller, netdev@vger.kernel.org, eric.dumazet@gmail.com
In-Reply-To: <cover.1336397823.git.mst@redhat.com>
On Mon, 2012-05-07 at 14:53 +0100, Michael S. Tsirkin wrote:
> On Fri, May 04, 2012 at 11:51:31AM +0100, Ian Campbell wrote:
> > On Fri, 2012-05-04 at 11:03 +0100, Michael S. Tsirkin wrote:
> > > On Fri, May 04, 2012 at 02:54:33AM -0400, David Miller wrote:
> > > > From: "Michael S. Tsirkin" <mst@redhat.com>
> > > > Date: Fri, 4 May 2012 00:10:24 +0300
> > > >
> > > > > Hmm we orphan skbs when we loop them back so how about reusing the
> > > > > skb->destructor for this?
> > > >
> > > > That's one possibility.
>
> So originally I thought it would work: destructor would wake the
> original owner which would do data copy and modify the fragments. But
> then I unfortunately realized that would be racy: the new owner could be
By "new owner" you mean the owner of some other skb which refernences
the same shinfo (so either clone or the original if we currently hold a
clone).
> using the old frags and there appears no way for us to
> make sure it doesn't so that we can put the original page.
Right, the key issue is, I think, that cloning etc take an extra dataref
on the shinfo but do not take references on the frags. This is obviously
sane but does cause the issue above.
> And the same logic applies to modifying the frags at any
> other time if the skb is cloned. So it seems we must copy if we
> want to clone the skb.
Right. I had been hoping that the frag destructors could avoid this but
it seems like this is not going to be the case.
Just as a thought experiment would taking a frag ref on clone help us?
(lets assume for now that, iff there are destructors, this is cheaper
than the copy). I think this doesn't work -- since although the ref will
mean that the page doesn't go away under anyone elses feet after we've
orphaned it we will have lost the pointer to the original page by the
time that other ref is dropped, so freeing it will be tricky.
So following that train of thought a step further -- would creating a
new shinfo entirely on frag orphan buy us anything (and at what cost)?
Obviously it an extra allocation -- but we are already allocating N
pages of new frag. The others skb's referencing the shared shinfo would
still continue to see the old shinfo, pages and refs until they
themselves needed to orphan the frags (if they do, they might avoid it).
I think that could work, but I'm not sure it is worth the complexity.
Basically all it means that if you are bridging + flooding then you
would potentially save some number of a copies depending on the types of
devices on each port.
[...]
> Second option is what macvtap zero copy uses and it already
> does copy on clone too. So I hacked that to make it support tcp/udp
> used by sunrpc.
This does seem like it is the preferable solution.
I haven't read the patches yet, so maybe you already support this, but
it would be good to allow the creator of an skb to declare that it
doesn't want this behaviour, e.g. because it has some other mechanism
for dealing with this copy out for the case where an skb gets held
somewhere.
Ian.
> > > >
> > > > But I fear we're about to toss Ian into yet another rabbit hole. :-)
> > > >
> > > > Let's try to converge on something quickly as I think integration of
> > > > his work has been delayed enough as-is.
>
> ...
>
> > > It's weekend here, I'll work on a patch like this
> > > Sunday.
> >
> > Thanks, I was starting to feel my nose twitching and my ears beginning
> > to elongate ;-)
> >
> > Ian.
>
> Here's a first stub at a fix. Basically to be able to modify frags on
> the fly we must make sure the skb isn't cloned, so the moment someone
> clones the skb we need to trigger the frag copy logic. And this is
> exactly what happens with SKBTX_DEV_ZEROCOPY so it seems to make sense
> to reuse that logic.
>
> The below patchset replaces patch 7
> ([PATCH 7/9] net: add skb_orphan_frags to copy aside frags with destructors)
> in Ian's patchset and needs to be applied there.
>
>
> Compiled only but I'd like to hear what people think all the same
> because it does add a couple of branches on fast path. On the other
> hand this makes it generic so the same logic will be reusable for packet
> sockets (which IIRC are currently buggy in the same way as sunrpc) and
> for adding zero copy support to tun.
>
> Please comment,
> Thanks!
>
> --
> MST
>
>
> Michael S. Tsirkin (6):
> skbuff: support per-page destructors in copy_ubufs
> skbuff: add an api to orphan frags
> skbuff: convert to skb_orphan_frags
> tun: orphan frags on xmit
> net: orphan frags on receive
> skbuff: set zerocopy flag on frag destructor
>
> drivers/net/tun.c | 2 ++
> include/linux/skbuff.h | 41 +++++++++++++++++++++++++++++++++++++++++
> net/core/dev.c | 2 ++
> net/core/skbuff.c | 43 +++++++++++++++++++++++++------------------
> 4 files changed, 70 insertions(+), 18 deletions(-)
>
^ permalink raw reply
* Re: [v12 PATCH 2/3] NETFILTER module xt_hmark, new target for HASH based fwmark
From: Hans Schillstrom @ 2012-05-09 13:36 UTC (permalink / raw)
To: Pablo Neira Ayuso
Cc: kaber@trash.net, jengelh@medozas.de,
netfilter-devel@vger.kernel.org, netdev@vger.kernel.org,
hans@schillstrom.com
In-Reply-To: <20120509103820.GA22608@1984>
On Wednesday 09 May 2012 12:38:20 Pablo Neira Ayuso wrote:
> On Tue, May 08, 2012 at 09:37:35AM +0200, Hans Schillstrom wrote:
> > From d5065af3988cc7561a02f30bae8342e1a89126a4 Mon Sep 17 00:00:00 2001
> > From: Hans Schillstrom <hans.schillstrom@ericsson.com>
> > Date: Wed, 2 May 2012 07:49:47 +0000
> > Subject: netfilter: add xt_hmark target for hash-based skb
> > marking
[snip]
>
> I have applied this, I'm going to pass it to davem.
Thanks Pablo
--
Regards
Hans Schillstrom <hans.schillstrom@ericsson.com>
^ permalink raw reply
* Re: [PATCH RFC 0/6] copy aside frags with destructors (was [PATCH 7/9] net: add skb_orphan_frags to copy aside frags with destructors)
From: Michael S. Tsirkin @ 2012-05-09 13:52 UTC (permalink / raw)
To: Ian Campbell; +Cc: David Miller, netdev@vger.kernel.org, eric.dumazet@gmail.com
In-Reply-To: <1336569529.25514.109.camel@zakaz.uk.xensource.com>
On Wed, May 09, 2012 at 02:18:49PM +0100, Ian Campbell wrote:
> On Mon, 2012-05-07 at 14:53 +0100, Michael S. Tsirkin wrote:
> > On Fri, May 04, 2012 at 11:51:31AM +0100, Ian Campbell wrote:
> > > On Fri, 2012-05-04 at 11:03 +0100, Michael S. Tsirkin wrote:
> > > > On Fri, May 04, 2012 at 02:54:33AM -0400, David Miller wrote:
> > > > > From: "Michael S. Tsirkin" <mst@redhat.com>
> > > > > Date: Fri, 4 May 2012 00:10:24 +0300
> > > > >
> > > > > > Hmm we orphan skbs when we loop them back so how about reusing the
> > > > > > skb->destructor for this?
> > > > >
> > > > > That's one possibility.
> >
> > So originally I thought it would work: destructor would wake the
> > original owner which would do data copy and modify the fragments. But
> > then I unfortunately realized that would be racy: the new owner could be
>
> By "new owner" you mean the owner of some other skb which refernences
> the same shinfo (so either clone or the original if we currently hold a
> clone).
>
> > using the old frags and there appears no way for us to
> > make sure it doesn't so that we can put the original page.
>
> Right, the key issue is, I think, that cloning etc take an extra dataref
> on the shinfo but do not take references on the frags. This is obviously
> sane but does cause the issue above.
>
> > And the same logic applies to modifying the frags at any
> > other time if the skb is cloned. So it seems we must copy if we
> > want to clone the skb.
>
> Right. I had been hoping that the frag destructors could avoid this but
> it seems like this is not going to be the case.
>
> Just as a thought experiment would taking a frag ref on clone help us?
> (lets assume for now that, iff there are destructors, this is cheaper
> than the copy). I think this doesn't work -- since although the ref will
> mean that the page doesn't go away under anyone elses feet after we've
> orphaned it we will have lost the pointer to the original page by the
> time that other ref is dropped, so freeing it will be tricky.
>
> So following that train of thought a step further -- would creating a
> new shinfo entirely on frag orphan buy us anything (and at what cost)?
> Obviously it an extra allocation -- but we are already allocating N
> pages of new frag. The others skb's referencing the shared shinfo would
> still continue to see the old shinfo, pages and refs until they
> themselves needed to orphan the frags (if they do, they might avoid it).
So it would address the second problem (cloned skb) but
not the first one (original owner racing with the first one).
> I think that could work, but I'm not sure it is worth the complexity.
> Basically all it means that if you are bridging + flooding then you
> would potentially save some number of a copies depending on the types of
> devices on each port.
>
> [...]
>
> > Second option is what macvtap zero copy uses and it already
> > does copy on clone too. So I hacked that to make it support tcp/udp
> > used by sunrpc.
>
> This does seem like it is the preferable solution.
>
> I haven't read the patches yet, so maybe you already support this, but
> it would be good to allow the creator of an skb to declare that it
> doesn't want this behaviour, e.g. because it has some other mechanism
> for dealing with this copy out for the case where an skb gets held
> somewhere.
>
> Ian.
Sure, if we find a way to do that we just don't set ZEROCOPY skb flag.
So I'll give people a bit more time to review, hope
you find the time too.
> > > > >
> > > > > But I fear we're about to toss Ian into yet another rabbit hole. :-)
> > > > >
> > > > > Let's try to converge on something quickly as I think integration of
> > > > > his work has been delayed enough as-is.
> >
> > ...
> >
> > > > It's weekend here, I'll work on a patch like this
> > > > Sunday.
> > >
> > > Thanks, I was starting to feel my nose twitching and my ears beginning
> > > to elongate ;-)
> > >
> > > Ian.
> >
> > Here's a first stub at a fix. Basically to be able to modify frags on
> > the fly we must make sure the skb isn't cloned, so the moment someone
> > clones the skb we need to trigger the frag copy logic. And this is
> > exactly what happens with SKBTX_DEV_ZEROCOPY so it seems to make sense
> > to reuse that logic.
> >
> > The below patchset replaces patch 7
> > ([PATCH 7/9] net: add skb_orphan_frags to copy aside frags with destructors)
> > in Ian's patchset and needs to be applied there.
> >
> >
> > Compiled only but I'd like to hear what people think all the same
> > because it does add a couple of branches on fast path. On the other
> > hand this makes it generic so the same logic will be reusable for packet
> > sockets (which IIRC are currently buggy in the same way as sunrpc) and
> > for adding zero copy support to tun.
> >
> > Please comment,
> > Thanks!
> >
> > --
> > MST
> >
> >
> > Michael S. Tsirkin (6):
> > skbuff: support per-page destructors in copy_ubufs
> > skbuff: add an api to orphan frags
> > skbuff: convert to skb_orphan_frags
> > tun: orphan frags on xmit
> > net: orphan frags on receive
> > skbuff: set zerocopy flag on frag destructor
> >
> > drivers/net/tun.c | 2 ++
> > include/linux/skbuff.h | 41 +++++++++++++++++++++++++++++++++++++++++
> > net/core/dev.c | 2 ++
> > net/core/skbuff.c | 43 +++++++++++++++++++++++++------------------
> > 4 files changed, 70 insertions(+), 18 deletions(-)
> >
>
^ permalink raw reply
* Re: [PATCH RFC 5/6] net: orphan frags on receive
From: Ian Campbell @ 2012-05-09 13:54 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: David Miller, netdev@vger.kernel.org, eric.dumazet@gmail.com
In-Reply-To: <a3353f6235d877c6b2d300eb5a914dbc9b78824d.1336397823.git.mst@redhat.com>
On Mon, 2012-05-07 at 14:54 +0100, Michael S. Tsirkin wrote:
> zero copy packets are normally sent to the outside
> network, but bridging, tun etc might loop them
> back to host networking stack. If this happens
> destructors will never be called, so orphan
> the frags immediately on receive.
I think this deceptively simply patch is actually the meat of the
series.
It's been a long time since I dug into the bridging code. Am I right
that this function is only reached when an SKB is going to be delivered
locally and not when forwarding to another port on the bridge?
Ian.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
> net/core/dev.c | 2 ++
> 1 files changed, 2 insertions(+), 0 deletions(-)
>
> diff --git a/net/core/dev.c b/net/core/dev.c
> index a2be59f..c0cdc00 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -1630,6 +1630,8 @@ static inline int deliver_skb(struct sk_buff *skb,
> struct packet_type *pt_prev,
> struct net_device *orig_dev)
> {
> + if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
> + return -ENOMEM;
> atomic_inc(&skb->users);
> return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
> }
^ permalink raw reply
* Re: [PATCH RFC 0/6] copy aside frags with destructors (was [PATCH 7/9] net: add skb_orphan_frags to copy aside frags with destructors)
From: Ian Campbell @ 2012-05-09 14:01 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: David Miller, netdev@vger.kernel.org, eric.dumazet@gmail.com
In-Reply-To: <20120509135240.GA19246@redhat.com>
On Wed, 2012-05-09 at 14:52 +0100, Michael S. Tsirkin wrote:
> On Wed, May 09, 2012 at 02:18:49PM +0100, Ian Campbell wrote:
> > On Mon, 2012-05-07 at 14:53 +0100, Michael S. Tsirkin wrote:
> > > On Fri, May 04, 2012 at 11:51:31AM +0100, Ian Campbell wrote:
> > > > On Fri, 2012-05-04 at 11:03 +0100, Michael S. Tsirkin wrote:
> > > > > On Fri, May 04, 2012 at 02:54:33AM -0400, David Miller wrote:
> > > > > > From: "Michael S. Tsirkin" <mst@redhat.com>
> > > > > > Date: Fri, 4 May 2012 00:10:24 +0300
> > > > > >
> > > > > > > Hmm we orphan skbs when we loop them back so how about reusing the
> > > > > > > skb->destructor for this?
> > > > > >
> > > > > > That's one possibility.
> > >
> > > So originally I thought it would work: destructor would wake the
> > > original owner which would do data copy and modify the fragments. But
> > > then I unfortunately realized that would be racy: the new owner could be
> >
> > By "new owner" you mean the owner of some other skb which refernences
> > the same shinfo (so either clone or the original if we currently hold a
> > clone).
> >
> > > using the old frags and there appears no way for us to
> > > make sure it doesn't so that we can put the original page.
> >
> > Right, the key issue is, I think, that cloning etc take an extra dataref
> > on the shinfo but do not take references on the frags. This is obviously
> > sane but does cause the issue above.
> >
> > > And the same logic applies to modifying the frags at any
> > > other time if the skb is cloned. So it seems we must copy if we
> > > want to clone the skb.
> >
> > Right. I had been hoping that the frag destructors could avoid this but
> > it seems like this is not going to be the case.
> >
> > Just as a thought experiment would taking a frag ref on clone help us?
> > (lets assume for now that, iff there are destructors, this is cheaper
> > than the copy). I think this doesn't work -- since although the ref will
> > mean that the page doesn't go away under anyone elses feet after we've
> > orphaned it we will have lost the pointer to the original page by the
> > time that other ref is dropped, so freeing it will be tricky.
> >
> > So following that train of thought a step further -- would creating a
> > new shinfo entirely on frag orphan buy us anything (and at what cost)?
> > Obviously it an extra allocation -- but we are already allocating N
> > pages of new frag. The others skb's referencing the shared shinfo would
> > still continue to see the old shinfo, pages and refs until they
> > themselves needed to orphan the frags (if they do, they might avoid it).
>
> So it would address the second problem (cloned skb) but
> not the first one (original owner racing with the first one).
Right.
> > I think that could work, but I'm not sure it is worth the complexity.
> > Basically all it means that if you are bridging + flooding then you
> > would potentially save some number of a copies depending on the types of
> > devices on each port.
> >
> > [...]
> >
> > > Second option is what macvtap zero copy uses and it already
> > > does copy on clone too. So I hacked that to make it support tcp/udp
> > > used by sunrpc.
> >
> > This does seem like it is the preferable solution.
> >
> > I haven't read the patches yet, so maybe you already support this, but
> > it would be good to allow the creator of an skb to declare that it
> > doesn't want this behaviour, e.g. because it has some other mechanism
> > for dealing with this copy out for the case where an skb gets held
> > somewhere.
> >
> > Ian.
>
> Sure, if we find a way to do that we just don't set ZEROCOPY skb flag.
After reading "net: orphan frags on receive" I'm not sure how necessary
it will be, since the main case I was worried about was lots of
copying/orphaning on bridge forwarding, but I convinced myself (I think)
that the series only effects delivery and not forwarding.
Still, no harm in offering the option.
> So I'll give people a bit more time to review, hope
> you find the time too.
I took a look at a fairly high level and it all looked sensible, I've
not looked closely at the details, not run it yet, I hope I can do that
shortly.
BTW, I never said thanks for looking into this, so thanks ;-)
^ permalink raw reply
* Re: [net-next 5/9] e1000e: Disable ASPM L1 on 82574
From: Nix @ 2012-05-09 14:02 UTC (permalink / raw)
To: Wyborny, Carolyn
Cc: Matthew Garrett, Kirsher, Jeffrey T, davem@davemloft.net,
Chris Boot, netdev@vger.kernel.org, gospo@redhat.com,
sassmann@redhat.com
In-Reply-To: <87fwbea8pi.fsf@spindle.srvr.nix>
On 5 May 2012, nix@esperi.org.uk outgrape:
> The question here is how to fix it. It appears that the motherboard or
> BIOS on this machine does not grant _OSC control even (especially?) if
> you have turned on PCIe ASPM in the BIOS. But perhaps even if _OSC is
> not granted you should permit PCIe to be *disabled* by drivers, just not
> enabled? (The BIOS appears to be buggy in this area: if you turn off
> ASPM, save, and go back into setup, ASPM has turned itself back on
> again!)
This turned out to be me not knowing how to drive the BIOS's deeply
unintuitive configuration program. If I turn PCIe ASPM off in the BIOS,
the kernel does exactly the same as it does in my previous message (i.e.
decides that ASPM is disabled due to the failure of an _OSC request,
then refuses to change the ASPM link state of the e1000e), but since the
BIOS has already disabled ASPM, the card is not in crash-happy mode and
I don't need to force anything off by hand.
But if I turn ASPM on, as reported in my previous message the kernel
promptly bans itself from changing any PCIe ASPM link states whatsoever,
and the e1000e locks up about an hour later.
I presume that
May 5 17:06:53 spindle info: [ 0.629699] pci0000:00: Requesting ACPI _OSC control (0x1d)
May 5 17:06:53 spindle info: [ 0.629941] pci0000:00: ACPI _OSC request failed (AE_NOT_FOUND), returned control mask: 0x1d
May 5 17:06:53 spindle info: [ 0.630373] ACPI _OSC control for PCIe not granted, disabling ASPM
is reporting some sort of BIOS bug, but it is at best confusing to have
the boot messages reporting that ASPM is disabled: better perhaps to
describe this as 'leaving ASPM on all devices how the BIOS set it' and
have the e1000e driver emit a giant flaming warning if it spots this
happening on an 82574L with ASPM turned on. (Or, alternatively, permit
ASPM to be turned off when the system is in this state, but not on,
whereupon the existing code in the e1000e driver will do the right
thing. But I don't know if that will break any laptops. This machine is
very much not a laptop.)
> I'm not sure what the right thing to do is here: I don't know enough
> about this area. But it does seem very strange that the only way I have
> to turn off PCIe ASPM reliably on this device is to tell the kernel to
> forcibly turn it *on*!
This is still strange, though it seems that turning ASPM completely off
in the BIOS will also serve.
^ permalink raw reply
* RE: [PATCH] pch_gbe: Adding read memory barriers
From: David Laight @ 2012-05-09 14:44 UTC (permalink / raw)
To: Erwan Velu, Ben Hutchings; +Cc: netdev, linux-kernel, tshimizu818
In-Reply-To: <CAL2JzuxirjAnmbcV+R=MJFAi1+ayrRnM6i0Dz5poqMuG7Cxwxg@mail.gmail.com>
> So I've been reading how does others drivers are working regarding
this type of message.
> While reading many drivers, I found that many does rmb() calls at this
points.
> As I'm not a linux kernel expert, I did mimic them to the result
> and it was good since the errors disappeared.
> So do you mean the patch is wrong or shall I rework only the comments
?
The problem is that barriers are not understood at all well
by most driver writers - so a lot of code is sub-optimal.
It isn't helped by the readability of the cpu documentation,
never mind some old docs that described something that an
engineer thought they might want to allow - rather than
describing what any cpus actually did/do.
For x86 cpus (amd and intel at least) reads and writes to
uncached memory and io are guaranteed to happen in instruction
order. For other cpus (sparc and ppc) such reads can
overtake writes - requiring something to force the write to
complete (eg a synchronising instruction, or maybe a readeback).
One problem is that the barrier instructions are generally
slow - so you don't want to use them where unnecessary,
however whether you need them is somewhat hardware specific
and the typical OS lists of barriers doesn't cover all the
cases - even for things that are actually compile time knowable.
Eg: what you need between a write to 'dma coherent' memory
and a write to a control register depends on whether 'dma
coherency' is implemented by disabling the cache.
For SMP there may be additional synchronisation, especially
for cached data, but for most drivers that is handled by
mutex/lock operations.
The other, and separate, issue is making sure that the compiler
itself doesn't reorder of optimise away memory cycles.
By far the best way is to mark the data (not the pointer to the
data) 'volatile', this forces the compiler to generate object
code for every variable reference in the source, and in the right
order.
Sometimes that is inappropiate, gcc's asm volatile (:::"memory")
tells the compiler that this asm statement might modify all
of memory (even though there are actually no instructions!).
This forces it to avoid having anything in a register that is
a copy of something in memory - thus enforcing the order of
memory accesses.
It can also be used to reduce register pressure within the
compiler.
In your case I suspect that the compiler has re-ordered the
reads - adding almost anything between them will change the
order.
David
^ permalink raw reply
* [PATCH] netlink: connector: implement cn_netlink_reply
From: Alban Crequy @ 2012-05-09 14:37 UTC (permalink / raw)
To: Evgeniy Polyakov
Cc: netdev, Vincent Sanders, Javier Martinez Canillas, Alban Crequy,
Rodrigo Moya, Evgeniy Polyakov
In a connector callback, it was not possible to reply to a message only to a
sender. This patch implements cn_netlink_reply(). It uses the connector socket
to send an unicast netlink message back to the sender.
The following pseudo-code can be used from a connector callback:
struct cn_msg *cn_reply;
cn_reply = kzalloc(sizeof(struct cn_msg)
+ sizeof(struct ..._nl_cfg_reply), GFP_KERNEL);
cn_reply->id = msg->id;
cn_reply->seq = msg->seq;
cn_reply->ack = msg->ack + 1;
cn_reply->len = sizeof(struct ..._nl_cfg_reply);
cn_reply->flags = 0;
rr = cn_netlink_reply(cn_reply, nsp->pid, GFP_KERNEL);
Signed-off-by: Alban Crequy <alban.crequy@collabora.co.uk>
Reviewed-by: Vincent Sanders <vincent.sanders@collabora.co.uk>
Acked-by: Javier Martinez Canillas <javier.martinez@collabora.co.uk>
CC: Rodrigo Moya <rodrigo.moya@collabora.co.uk>
CC: Evgeniy Polyakov <zbr@ioremap.net>
---
drivers/connector/connector.c | 32 ++++++++++++++++++++++++++++++++
include/linux/connector.h | 1 +
2 files changed, 33 insertions(+), 0 deletions(-)
diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c
index dde6a0f..1cb488d 100644
--- a/drivers/connector/connector.c
+++ b/drivers/connector/connector.c
@@ -118,6 +118,38 @@ nlmsg_failure:
EXPORT_SYMBOL_GPL(cn_netlink_send);
/*
+ * Send an unicast reply from a connector callback
+ *
+ */
+int cn_netlink_reply(struct cn_msg *msg, u32 pid, gfp_t gfp_mask)
+{
+ unsigned int size;
+ struct sk_buff *skb;
+ struct nlmsghdr *nlh;
+ struct cn_msg *data;
+ struct cn_dev *dev = &cdev;
+
+ size = NLMSG_SPACE(sizeof(*msg) + msg->len);
+
+ skb = alloc_skb(size, gfp_mask);
+ if (!skb)
+ return -ENOMEM;
+
+ nlh = NLMSG_PUT(skb, 0, msg->seq, NLMSG_DONE, size - sizeof(*nlh));
+
+ data = NLMSG_DATA(nlh);
+
+ memcpy(data, msg, sizeof(*data) + msg->len);
+
+ return netlink_unicast(dev->nls, skb, pid, 1);
+
+nlmsg_failure:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(cn_netlink_reply);
+
+/*
* Callback helper - queues work and setup destructor for given data.
*/
static int cn_call_callback(struct sk_buff *skb)
diff --git a/include/linux/connector.h b/include/linux/connector.h
index 7638407..c27be60 100644
--- a/include/linux/connector.h
+++ b/include/linux/connector.h
@@ -125,6 +125,7 @@ int cn_add_callback(struct cb_id *id, const char *name,
void (*callback)(struct cn_msg *, struct netlink_skb_parms *));
void cn_del_callback(struct cb_id *);
int cn_netlink_send(struct cn_msg *, u32, gfp_t);
+int cn_netlink_reply(struct cn_msg *, u32, gfp_t);
int cn_queue_add_callback(struct cn_queue_dev *dev, const char *name,
struct cb_id *id,
--
1.7.2.5
^ permalink raw reply related
* Re: [PATCH RFC 0/6] copy aside frags with destructors (was [PATCH 7/9] net: add skb_orphan_frags to copy aside frags with destructors)
From: Michael S. Tsirkin @ 2012-05-09 14:56 UTC (permalink / raw)
To: Ian Campbell; +Cc: David Miller, netdev@vger.kernel.org, eric.dumazet@gmail.com
In-Reply-To: <20120509135240.GA19246@redhat.com>
On Wed, May 09, 2012 at 04:52:40PM +0300, Michael S. Tsirkin wrote:
> On Wed, May 09, 2012 at 02:18:49PM +0100, Ian Campbell wrote:
> > On Mon, 2012-05-07 at 14:53 +0100, Michael S. Tsirkin wrote:
> > > On Fri, May 04, 2012 at 11:51:31AM +0100, Ian Campbell wrote:
> > > > On Fri, 2012-05-04 at 11:03 +0100, Michael S. Tsirkin wrote:
> > > > > On Fri, May 04, 2012 at 02:54:33AM -0400, David Miller wrote:
> > > > > > From: "Michael S. Tsirkin" <mst@redhat.com>
> > > > > > Date: Fri, 4 May 2012 00:10:24 +0300
> > > > > >
> > > > > > > Hmm we orphan skbs when we loop them back so how about reusing the
> > > > > > > skb->destructor for this?
> > > > > >
> > > > > > That's one possibility.
> > >
> > > So originally I thought it would work: destructor would wake the
> > > original owner which would do data copy and modify the fragments. But
> > > then I unfortunately realized that would be racy: the new owner could be
> >
> > By "new owner" you mean the owner of some other skb which refernences
> > the same shinfo (so either clone or the original if we currently hold a
> > clone).
Sorry didn't clarify this well enough. We often do something like
the following
orphan
set owner
new owner is what is set afterwards.
> > > using the old frags and there appears no way for us to
> > > make sure it doesn't so that we can put the original page.
> >
> > Right, the key issue is, I think, that cloning etc take an extra dataref
> > on the shinfo but do not take references on the frags. This is obviously
> > sane but does cause the issue above.
> >
> > > And the same logic applies to modifying the frags at any
> > > other time if the skb is cloned. So it seems we must copy if we
> > > want to clone the skb.
> >
> > Right. I had been hoping that the frag destructors could avoid this but
> > it seems like this is not going to be the case.
> >
> > Just as a thought experiment would taking a frag ref on clone help us?
> > (lets assume for now that, iff there are destructors, this is cheaper
> > than the copy). I think this doesn't work -- since although the ref will
> > mean that the page doesn't go away under anyone elses feet after we've
> > orphaned it we will have lost the pointer to the original page by the
> > time that other ref is dropped, so freeing it will be tricky.
> >
> > So following that train of thought a step further -- would creating a
> > new shinfo entirely on frag orphan buy us anything (and at what cost)?
> > Obviously it an extra allocation -- but we are already allocating N
> > pages of new frag. The others skb's referencing the shared shinfo would
> > still continue to see the old shinfo, pages and refs until they
> > themselves needed to orphan the frags (if they do, they might avoid it).
>
> So it would address the second problem (cloned skb) but
> not the first one (original owner racing with the first one).
Oh and there's another issue: some code assumes that clones
are always identical. For example tcpdump
might show wrong packet content if the page is modified
while packet gets through networking core, so a malicious
application might confuse anything that relies on
this data to do e.g. security filtering.
I don't know whether anyone cares about tcpdump output
being always correct but it seems nicer to side-step the issue.
> > I think that could work, but I'm not sure it is worth the complexity.
> > Basically all it means that if you are bridging + flooding then you
> > would potentially save some number of a copies depending on the types of
> > devices on each port.
> >
> > [...]
> >
> > > Second option is what macvtap zero copy uses and it already
> > > does copy on clone too. So I hacked that to make it support tcp/udp
> > > used by sunrpc.
> >
> > This does seem like it is the preferable solution.
> >
> > I haven't read the patches yet, so maybe you already support this, but
> > it would be good to allow the creator of an skb to declare that it
> > doesn't want this behaviour, e.g. because it has some other mechanism
> > for dealing with this copy out for the case where an skb gets held
> > somewhere.
> >
> > Ian.
>
> Sure, if we find a way to do that we just don't set ZEROCOPY skb flag.
>
> So I'll give people a bit more time to review, hope
> you find the time too.
>
> > > > > >
> > > > > > But I fear we're about to toss Ian into yet another rabbit hole. :-)
> > > > > >
> > > > > > Let's try to converge on something quickly as I think integration of
> > > > > > his work has been delayed enough as-is.
> > >
> > > ...
> > >
> > > > > It's weekend here, I'll work on a patch like this
> > > > > Sunday.
> > > >
> > > > Thanks, I was starting to feel my nose twitching and my ears beginning
> > > > to elongate ;-)
> > > >
> > > > Ian.
> > >
> > > Here's a first stub at a fix. Basically to be able to modify frags on
> > > the fly we must make sure the skb isn't cloned, so the moment someone
> > > clones the skb we need to trigger the frag copy logic. And this is
> > > exactly what happens with SKBTX_DEV_ZEROCOPY so it seems to make sense
> > > to reuse that logic.
> > >
> > > The below patchset replaces patch 7
> > > ([PATCH 7/9] net: add skb_orphan_frags to copy aside frags with destructors)
> > > in Ian's patchset and needs to be applied there.
> > >
> > >
> > > Compiled only but I'd like to hear what people think all the same
> > > because it does add a couple of branches on fast path. On the other
> > > hand this makes it generic so the same logic will be reusable for packet
> > > sockets (which IIRC are currently buggy in the same way as sunrpc) and
> > > for adding zero copy support to tun.
> > >
> > > Please comment,
> > > Thanks!
> > >
> > > --
> > > MST
> > >
> > >
> > > Michael S. Tsirkin (6):
> > > skbuff: support per-page destructors in copy_ubufs
> > > skbuff: add an api to orphan frags
> > > skbuff: convert to skb_orphan_frags
> > > tun: orphan frags on xmit
> > > net: orphan frags on receive
> > > skbuff: set zerocopy flag on frag destructor
> > >
> > > drivers/net/tun.c | 2 ++
> > > include/linux/skbuff.h | 41 +++++++++++++++++++++++++++++++++++++++++
> > > net/core/dev.c | 2 ++
> > > net/core/skbuff.c | 43 +++++++++++++++++++++++++------------------
> > > 4 files changed, 70 insertions(+), 18 deletions(-)
> > >
> >
^ permalink raw reply
* Re: [PATCH 04/13] bridge: netfilter: Convert compare_ether_addr to ether_addr_equal
From: Stephen Hemminger @ 2012-05-09 15:07 UTC (permalink / raw)
To: Joe Perches
Cc: David S. Miller, Bart De Schuymer, Pablo Neira Ayuso,
Patrick McHardy, netfilter-devel, netfilter, coreteam, bridge,
netdev, linux-kernel
In-Reply-To: <5a91d328db3393320170723cdfc9395ad59182c6.1336538938.git.joe@perches.com>
On Tue, 8 May 2012 21:56:48 -0700
Joe Perches <joe@perches.com> wrote:
> Use the new bool function ether_addr_equal to add
> some clarity and reduce the likelihood for misuse
> of compare_ether_addr for sorting.
>
> Done via cocci script:
>
> $ cat compare_ether_addr.cocci
> @@
> expression a,b;
> @@
> - !compare_ether_addr(a, b)
> + ether_addr_equal(a, b)
>
> @@
> expression a,b;
> @@
> - compare_ether_addr(a, b)
> + !ether_addr_equal(a, b)
>
> @@
> expression a,b;
> @@
> - !ether_addr_equal(a, b) == 0
> + ether_addr_equal(a, b)
>
> @@
> expression a,b;
> @@
> - !ether_addr_equal(a, b) != 0
> + !ether_addr_equal(a, b)
>
> @@
> expression a,b;
> @@
> - ether_addr_equal(a, b) == 0
> + !ether_addr_equal(a, b)
>
> @@
> expression a,b;
> @@
> - ether_addr_equal(a, b) != 0
> + ether_addr_equal(a, b)
>
> @@
> expression a,b;
> @@
> - !!ether_addr_equal(a, b)
> + ether_addr_equal(a, b)
>
> Signed-off-by: Joe Perches <joe@perches.com>
> ---
> net/bridge/netfilter/ebt_stp.c | 4 ++--
> 1 files changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c
> index 5b33a2e..071d872 100644
> --- a/net/bridge/netfilter/ebt_stp.c
> +++ b/net/bridge/netfilter/ebt_stp.c
> @@ -164,8 +164,8 @@ static int ebt_stp_mt_check(const struct xt_mtchk_param *par)
> !(info->bitmask & EBT_STP_MASK))
> return -EINVAL;
> /* Make sure the match only receives stp frames */
> - if (compare_ether_addr(e->destmac, bridge_ula) ||
> - compare_ether_addr(e->destmsk, msk) || !(e->bitmask & EBT_DESTMAC))
> + if (!ether_addr_equal(e->destmac, bridge_ula) ||
> + !ether_addr_equal(e->destmsk, msk) || !(e->bitmask & EBT_DESTMAC))
> return -EINVAL;
>
> return 0;
All look good.
Acked-by: Stephen Hemminger <shemminger@vyatta.com>
^ permalink raw reply
* Re: [PATCH RFC 5/6] net: orphan frags on receive
From: Michael S. Tsirkin @ 2012-05-09 15:12 UTC (permalink / raw)
To: Ian Campbell; +Cc: David Miller, netdev@vger.kernel.org, eric.dumazet@gmail.com
In-Reply-To: <1336571692.25514.122.camel@zakaz.uk.xensource.com>
On Wed, May 09, 2012 at 02:54:52PM +0100, Ian Campbell wrote:
> On Mon, 2012-05-07 at 14:54 +0100, Michael S. Tsirkin wrote:
> > zero copy packets are normally sent to the outside
> > network, but bridging, tun etc might loop them
> > back to host networking stack. If this happens
> > destructors will never be called, so orphan
> > the frags immediately on receive.
>
> I think this deceptively simply patch is actually the meat of the
> series.
> It's been a long time since I dug into the bridging code. Am I right
> that this function is only reached when an SKB is going to be delivered
> locally and not when forwarding to another port on the bridge?
>
> Ian.
I think so - bridge uses netdev_rx_handler_unregister
so it does not go through packet handlers.
> >
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > ---
> > net/core/dev.c | 2 ++
> > 1 files changed, 2 insertions(+), 0 deletions(-)
> >
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index a2be59f..c0cdc00 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -1630,6 +1630,8 @@ static inline int deliver_skb(struct sk_buff *skb,
> > struct packet_type *pt_prev,
> > struct net_device *orig_dev)
> > {
> > + if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
> > + return -ENOMEM;
> > atomic_inc(&skb->users);
> > return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
> > }
>
^ permalink raw reply
* Re: [PATCH] net: skb_set_dev do not unconditionally drop ref to dst
From: Stephen Hemminger @ 2012-05-09 15:16 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Frank Blaschka, davem, netdev, linux-s390, Arnd Bergmann
In-Reply-To: <1335765093.2296.4.camel@edumazet-glaptop>
On Mon, 30 Apr 2012 07:51:33 +0200
Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Mon, 2012-04-30 at 07:38 +0200, Frank Blaschka wrote:
> > From: Frank Blaschka <frank.blaschka@de.ibm.com>
> >
> > commit 8a83a00b0735190384a348156837918271034144 unconditionally
> > drops dst reference when skb->dev is set. This causes a regression
> > with VLAN and the qeth_l3 network driver. qeth_l3 can not get gw
> > information from the skb coming from the vlan driver. It is only
> > valid to drop the dst in case of different name spaces.
> >
> > Signed-off-by: Frank Blaschka <frank.blaschka@de.ibm.com>
> > ---
> > net/core/dev.c | 2 +-
> > 1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -1881,8 +1881,8 @@ EXPORT_SYMBOL(netif_device_attach);
> > #ifdef CONFIG_NET_NS
> > void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
> > {
> > - skb_dst_drop(skb);
> > if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
> > + skb_dst_drop(skb);
> > secpath_reset(skb);
> > nf_reset(skb);
> > skb_init_secmark(skb);
> >
>
> You forgot CC Arnd Bergmann <arnd@arndb.de> ?
>
> But we do want to do the skb_dst_drop() in dev_forward_skb()
>
> Your patch breaks dev_forward_skb() then.
>
> But apparently this forward path was alredy broken in Arnd patch...
Is this related to why, PMTU discover is broken now over GRE.
The simple case of doing a TCP transfer from local host over
GRE tunnel hangs.
What happens is that skb_dst(skb) is null in ip_tunnel_xmit.
Which leads to the MTU not being updated, and the ICMP_FRAG_NEEDED
is never sent.
^ permalink raw reply
* Re: [PATCH RFC 0/6] copy aside frags with destructors (was [PATCH 7/9] net: add skb_orphan_frags to copy aside frags with destructors)
From: Michael S. Tsirkin @ 2012-05-09 15:27 UTC (permalink / raw)
To: Ian Campbell; +Cc: David Miller, netdev@vger.kernel.org, eric.dumazet@gmail.com
In-Reply-To: <1336572107.25514.127.camel@zakaz.uk.xensource.com>
On Wed, May 09, 2012 at 03:01:47PM +0100, Ian Campbell wrote:
> I took a look at a fairly high level and it all looked sensible, I've
> not looked closely at the details, not run it yet, I hope I can do that
> shortly.
In particular you can trace skb_copy_ubufs to verify that
it's called in the scenario where we want a copybreak
and not called where unexpected e.g. in bridge forwarding.
^ permalink raw reply
* Re: [PATCH] pch_gbe: Adding read memory barriers
From: Ben Hutchings @ 2012-05-09 15:39 UTC (permalink / raw)
To: Erwan Velu; +Cc: David Laight, netdev, linux-kernel, tshimizu818
In-Reply-To: <CAL2JzuxawnesC+=Fqyu71Hkwi5gnRughe_NYM4k1c=zqDoNGRg@mail.gmail.com>
On Wed, 2012-05-09 at 17:20 +0200, Erwan Velu wrote:
> Thanks for this very clear description.
It might be clear, but it's not very relevant: you need to understand
the Linux kernel barrier APIs (which cover all architectures). See
Documentation/memory-barriers.txt.
> As I'm pretty new to this kind of integration in the kernel, do you
> request me to test this volatile option ?
> Does the patch tends to be rejected for this issue ?
> What should be the next step for my patch ?
[...]
Explain what kind of reordering you are trying to avoid.
Ben.
--
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
^ permalink raw reply
* Re: [PATCH 3.3.0] phy:icplus:fix Auto Power Saving in ip101a_config_init.
From: David Miller @ 2012-05-09 15:57 UTC (permalink / raw)
To: jrnieder; +Cc: srinivas.kandagatla, netdev, peppe.cavallaro
In-Reply-To: <20120509072537.GA437@burratino>
From: Jonathan Nieder <jrnieder@gmail.com>
Date: Wed, 9 May 2012 02:25:37 -0500
> Hi Dave,
>
> David Miller wrote:
>> From: Srinivas KANDAGATLA <srinivas.kandagatla@st.com>
>
>>> This patch fixes Auto Power Saving configuration in ip101a_config_init
>>> which was broken as there is no phy register write followed after
>>> setting IP101A_APS_ON flag.
>>>
>>> This patch also fixes the return value of ip101a_config_init.
>>>
>>> Without this patch ip101a_config_init returns 2 which is not an error
>>> accroding to IS_ERR and the mac driver will continue accessing 2 as
>>> valid pointer to phy_dev resulting in memory fault.
>>>
>>> Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@st.com>
>>
>> Applied and queued up for -stable, thanks.
>
> I'm guessing 3.2.y needs this, too, since it also includes
> v3.2-rc1~129^2~247 ("net/phy: add IC+ IP101A and support APS").
>
> Here's a quick backport made by adjusting the context line to
> use IP101A_APS_ON instead of IP101A_G_APS_ON.
>
> Please include it in some later stable update if appropriate.
Please submit this directly to the stable list.
^ permalink raw reply
* Re: [net-next] e1000e: Fix merge conflict (net->net-next)
From: David Miller @ 2012-05-09 16:07 UTC (permalink / raw)
To: jeffrey.t.kirsher; +Cc: netdev, gospo, sassmann
In-Reply-To: <1336555426-13934-1-git-send-email-jeffrey.t.kirsher@intel.com>
From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Wed, 9 May 2012 02:23:46 -0700
> During merge of net to net-next the changes in patch:
>
> e1000e: Fix default interrupt throttle rate not set in NIC HW
>
> got munged in param.c of the e1000e driver. This rectifies the
> merge issues.
>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Applied, thanks Jeff.
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox