Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH 1/1] netfilter: xt_recent: Add optional mask option for xt_recent
From: Pablo Neira Ayuso @ 2012-05-02  1:01 UTC (permalink / raw)
  To: Denys Fedoryshchenko
  Cc: Patrick McHardy, David S. Miller, netfilter-devel, netfilter,
	coreteam, linux-kernel, netdev
In-Reply-To: <1331033084-23803-1-git-send-email-denys@visp.net.lb>

On Tue, Mar 06, 2012 at 01:24:44PM +0200, Denys Fedoryshchenko wrote:
> Use case for this feature:
> 1)In some occasions if you need to allow,block,match specific subnet.
> 2)I can use recent as a trigger when netfilter rule matches, with mask 0.0.0.0
> 
> Example:
> 
> If you ping 8.8.8.8, after that you can't ping 2.2.2.10
> 
> Tested for backward compatibility:
> )old (userspace) iptables, new kernel
> )old kernel, new iptables
> )new kernel, new iptables
> 
> Signed-off-by: Denys Fedoryshchenko <denys@visp.net.lb>
> ---
>  include/linux/netfilter.h           |   11 +++++
>  include/linux/netfilter/xt_recent.h |   10 +++++
>  net/netfilter/xt_recent.c           |   70 +++++++++++++++++++++++++++++++----
>  3 files changed, 83 insertions(+), 8 deletions(-)
> 
> diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
> index b809265..c132de6 100644
> --- a/include/linux/netfilter.h
> +++ b/include/linux/netfilter.h
> @@ -85,6 +85,17 @@ static inline int NF_DROP_GETERR(int verdict)
> 	return -(verdict >> NF_VERDICT_QBITS);
>  }
> 
> +
> +static inline void nf_inet_addr_mask(const union nf_inet_addr *a1,
> +				    union nf_inet_addr *result,
> +				    const union nf_inet_addr *mask)
> +{
> +	result->all[0] = a1->all[0] & mask->all[0];
> +	result->all[1] = a1->all[1] & mask->all[1];
> +	result->all[2] = a1->all[2] & mask->all[2];
> +	result->all[3] = a1->all[3] & mask->all[3];
> +}

Please, put this function in the xt_recent. I prefer leave it there
until more clients of it show up.

> +
>  static inline int nf_inet_addr_cmp(const union nf_inet_addr *a1,
> 				   const union nf_inet_addr *a2)
>  {
> diff --git a/include/linux/netfilter/xt_recent.h b/include/linux/netfilter/xt_recent.h
> index 83318e0..6ef36c1 100644
> --- a/include/linux/netfilter/xt_recent.h
> +++ b/include/linux/netfilter/xt_recent.h
> @@ -32,4 +32,14 @@ struct xt_recent_mtinfo {
> 	__u8 side;
>  };
> 
> +struct xt_recent_mtinfo_v1 {
> +	__u32 seconds;
> +	__u32 hit_count;
> +	__u8 check_set;
> +	__u8 invert;
> +	char name[XT_RECENT_NAME_LEN];
> +	__u8 side;
> +	union nf_inet_addr mask;
> +};
> +
>  #endif /* _LINUX_NETFILTER_XT_RECENT_H */
> diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
> index d2ff15a..bdc38fa 100644
> --- a/net/netfilter/xt_recent.c
> +++ b/net/netfilter/xt_recent.c
> @@ -75,6 +75,7 @@ struct recent_entry {
>  struct recent_table {
> 	struct list_head	list;
> 	char			name[XT_RECENT_NAME_LEN];
> +	union nf_inet_addr	mask;
> 	unsigned int		refcnt;
> 	unsigned int		entries;
> 	struct list_head	lru_list;
> @@ -228,10 +229,11 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
>  {
> 	struct net *net = dev_net(par->in ? par->in : par->out);
> 	struct recent_net *recent_net = recent_pernet(net);
> -	const struct xt_recent_mtinfo *info = par->matchinfo;
> +	const struct xt_recent_mtinfo_v1 *info = par->matchinfo;
> 	struct recent_table *t;
> 	struct recent_entry *e;
> 	union nf_inet_addr addr = {};
> +	union nf_inet_addr addr_masked;
> 	u_int8_t ttl;
> 	bool ret = info->invert;
> 
> @@ -261,12 +263,15 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
> 
> 	spin_lock_bh(&recent_lock);
> 	t = recent_table_lookup(recent_net, info->name);
> -	e = recent_entry_lookup(t, &addr, par->family,
> +
> +	nf_inet_addr_mask(&addr, &addr_masked, &t->mask);
> +
> +	e = recent_entry_lookup(t, &addr_masked, par->family,
> 				(info->check_set & XT_RECENT_TTL) ? ttl : 0);
> 	if (e == NULL) {
> 		if (!(info->check_set & XT_RECENT_SET))
> 			goto out;
> -		e = recent_entry_init(t, &addr, par->family, ttl);
> +		e = recent_entry_init(t, &addr_masked, par->family, ttl);
> 		if (e == NULL)
> 			par->hotdrop = true;
> 		ret = !ret;
> @@ -306,10 +311,10 @@ out:
> 	return ret;
>  }
> 
> -static int recent_mt_check(const struct xt_mtchk_param *par)
> +static int recent_mt_check(const struct xt_mtchk_param *par,
> +	const struct xt_recent_mtinfo_v1 *info)
>  {
> 	struct recent_net *recent_net = recent_pernet(par->net);
> -	const struct xt_recent_mtinfo *info = par->matchinfo;
> 	struct recent_table *t;
>  #ifdef CONFIG_PROC_FS
> 	struct proc_dir_entry *pde;
> @@ -361,6 +366,8 @@ static int recent_mt_check(const struct xt_mtchk_param *par)
> 		goto out;
> 	}
> 	t->refcnt = 1;
> +
> +	memcpy(&t->mask, &info->mask, sizeof(t->mask));
> 	strcpy(t->name, info->name);
> 	INIT_LIST_HEAD(&t->lru_list);
> 	for (i = 0; i < ip_list_hash_size; i++)
> @@ -385,10 +392,37 @@ out:
> 	return ret;
>  }
> 
> +static int recent_mt_check_v0(const struct xt_mtchk_param *par)
> +{
> +	const struct xt_recent_mtinfo_v0 *info_v0 = par->matchinfo;
> +	struct xt_recent_mtinfo_v1 *info_v1 = par->matchinfo;
> +	int ret;
> +
> +	info_v1 = kzalloc(sizeof(struct xt_recent_mtinfo_v1),
> +		    GFP_KERNEL);

Better allocate this in the stack. It's fairly small and it is used
temporarily.

> +	if (info_v1 == NULL)
> +		return -ENOMEM;
> +
> +
> +	/* Copy old data */
> +	memcpy(info_v1, info_v0, sizeof(struct xt_recent_mtinfo));
> +	/* Default mask will make same behavior as old recent */
> +	memset(info_v1->mask.all, 0xFF, sizeof(info_v1->mask.all));
> +	ret = recent_mt_check(par, info_v1);
> +
> +	kfree(info_v1);
> +	return ret;
> +}
> +
> +static int recent_mt_check_v1(const struct xt_mtchk_param *par)
> +{
> +	return recent_mt_check(par, par->matchinfo);
> +}
> +
>  static void recent_mt_destroy(const struct xt_mtdtor_param *par)
>  {
> 	struct recent_net *recent_net = recent_pernet(par->net);
> -	const struct xt_recent_mtinfo *info = par->matchinfo;
> +	const struct xt_recent_mtinfo_v1 *info = par->matchinfo;
> 	struct recent_table *t;
> 
> 	mutex_lock(&recent_mutex);
> @@ -625,7 +659,7 @@ static struct xt_match recent_mt_reg[] __read_mostly = {
> 		.family     = NFPROTO_IPV4,
> 		.match      = recent_mt,
> 		.matchsize  = sizeof(struct xt_recent_mtinfo),
> -		.checkentry = recent_mt_check,
> +		.checkentry = recent_mt_check_v0,
> 		.destroy    = recent_mt_destroy,
> 		.me         = THIS_MODULE,
> 	},
> @@ -635,10 +669,30 @@ static struct xt_match recent_mt_reg[] __read_mostly = {
> 		.family     = NFPROTO_IPV6,
> 		.match      = recent_mt,
> 		.matchsize  = sizeof(struct xt_recent_mtinfo),
> -		.checkentry = recent_mt_check,
> +		.checkentry = recent_mt_check_v0,
> 		.destroy    = recent_mt_destroy,
> 		.me         = THIS_MODULE,
> 	},
> +	{
> +		.name       = "recent",
> +		.revision   = 1,
> +		.family     = NFPROTO_IPV4,
> +		.match      = recent_mt,
> +		.matchsize  = sizeof(struct xt_recent_mtinfo_v1),
> +		.checkentry = recent_mt_check_v1,
> +		.destroy    = recent_mt_destroy,
> +		.me         = THIS_MODULE,
> +	},
> +	{
> +		.name       = "recent",
> +		.revision   = 1,
> +		.family     = NFPROTO_IPV6,
> +		.match      = recent_mt,
> +		.matchsize  = sizeof(struct xt_recent_mtinfo_v1),
> +		.checkentry = recent_mt_check_v1,
> +		.destroy    = recent_mt_destroy,
> +		.me         = THIS_MODULE,
> +	}
>  };
> 
>  static int __init recent_mt_init(void)
> --
> 1.7.3.4

^ permalink raw reply

* Re: [PATCH 1/1] netfilter: xt_recent: Add optional mask option for xt_recent
From: Pablo Neira Ayuso @ 2012-05-02  0:56 UTC (permalink / raw)
  To: Denys Fedoryshchenko
  Cc: Patrick McHardy, David S. Miller, netfilter-devel, netfilter,
	coreteam, linux-kernel, netdev
In-Reply-To: <d8de90a8f08a092730109e72827264fe@visp.net.lb>

On Thu, Apr 12, 2012 at 12:00:03PM +0300, Denys Fedoryshchenko wrote:
[...]
> For me personally it is useful, because i have around 140 NAS
> servers, and i give each of them /24 "gray" subnets, and in some
> cases i need to handle bad users, that are changing dynamic ip and
> attacking from new ip each time. I just block non-critical service
> for whole subnet then, till technician on duty will solve issue
> completely. And sure if attack are stopped, subnet will be unblocked
> "automagically".

OK, if you need this, I'm fine with it.

> Sure this feature not critical, or "a must", and if code are not
> good, it is up to you, if it should be added or not.

I didn't say anything about the code yet. E-mail reviewing this will
follow.

^ permalink raw reply

* Re: [PATCH RFC 2/2] netfilter: conntrack: replace mutex with cmpxchg
From: Pablo Neira Ayuso @ 2012-05-02  0:51 UTC (permalink / raw)
  To: Benjamin Poirier
  Cc: netdev, Patrick McHardy, David S. Miller, Andrew Morton,
	Eric Dumazet, Mike Frysinger, Arun Sharma, netfilter-devel,
	netfilter, coreteam, linux-kernel, Paul E. McKenney
In-Reply-To: <1335551333-6103-2-git-send-email-bpoirier@suse.de>

On Fri, Apr 27, 2012 at 02:28:53PM -0400, Benjamin Poirier wrote:
> This mutex protects a single pointer.

You seem to be using an old Linux kernel tree. This doesn't apply.

> ---
>  net/netfilter/nf_conntrack_ecache.c |   38 +++++++++-------------------------
>  1 files changed, 10 insertions(+), 28 deletions(-)
> 
> diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
> index 0134009..603eb69 100644
> --- a/net/netfilter/nf_conntrack_ecache.c
> +++ b/net/netfilter/nf_conntrack_ecache.c
> @@ -25,8 +25,6 @@
>  #include <net/netfilter/nf_conntrack_core.h>
>  #include <net/netfilter/nf_conntrack_extend.h>
>  
> -static DEFINE_MUTEX(nf_ct_ecache_mutex);
> -
>  /* deliver cached events and clear cache entry - must be called with locally
>   * disabled softirqs */
>  void nf_ct_deliver_cached_events(struct nf_conn *ct)
> @@ -80,52 +78,36 @@ EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
>  int nf_conntrack_register_notifier(struct net *net,
>  				   struct nf_ct_event_notifier *new)
>  {
> -	int ret = 0;
> -
> -	mutex_lock(&nf_ct_ecache_mutex);
> -	if (net->ct.nf_conntrack_event_cb != NULL)
> -		ret = -EBUSY;
> +	if (cmpxchg(&net->ct.nf_conntrack_event_cb, NULL, new) != NULL)
> +		return -EBUSY;
>  	else
> -		net->ct.nf_conntrack_event_cb = new;
> -	mutex_unlock(&nf_ct_ecache_mutex);
> -
> -	return ret;
> +		return 0;
>  }
>  EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier);
>  
>  void nf_conntrack_unregister_notifier(struct net *net,
>  				      struct nf_ct_event_notifier *new)
>  {
> -	mutex_lock(&nf_ct_ecache_mutex);
> -	BUG_ON(net->ct.nf_conntrack_event_cb != new);
> -	net->ct.nf_conntrack_event_cb = NULL;
> -	mutex_unlock(&nf_ct_ecache_mutex);
> +	if (xchg(&net->ct.nf_conntrack_event_cb, NULL) != new)
> +		BUG();
>  }
>  EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
>  
>  int nf_ct_expect_register_notifier(struct net *net,
>  				   struct nf_exp_event_notifier *new)
>  {
> -	int ret = 0;
> -
> -	mutex_lock(&nf_ct_ecache_mutex);
> -	if (net->ct.nf_expect_event_cb != NULL)
> -		ret = -EBUSY;
> +	if (cmpxchg(&net->ct.nf_expect_event_cb, NULL, new) != NULL)
> +		return -EBUSY;
>  	else
> -		net->ct.nf_expect_event_cb = new;
> -	mutex_unlock(&nf_ct_ecache_mutex);
> -
> -	return ret;
> +		return 0;
>  }
>  EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier);
>  
>  void nf_ct_expect_unregister_notifier(struct net *net,
>  				      struct nf_exp_event_notifier *new)
>  {
> -	mutex_lock(&nf_ct_ecache_mutex);
> -	BUG_ON(net->ct.nf_expect_event_cb != new);
> -	net->ct.nf_expect_event_cb = NULL;
> -	mutex_unlock(&nf_ct_ecache_mutex);
> +	if (xchg(&net->ct.nf_expect_event_cb, NULL) != new)
> +		BUG();
>  }
>  EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier);
>  
> -- 
> 1.7.7
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v2 00/17] netfilter: add namespace support for netfilter protos
From: Pablo Neira Ayuso @ 2012-05-02  0:40 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Gao feng, netfilter-devel, netdev, serge.hallyn, dlezcano
In-Reply-To: <m1zk9rwxf2.fsf@fess.ebiederm.org>

Hi Eric,

On Tue, May 01, 2012 at 11:47:45AM -0700, Eric W. Biederman wrote:
> Gao feng <gaofeng@cn.fujitsu.com> writes:
> 
> > Currently the sysctl of netfilter proto is not isolated, so when
> > changing proto's sysctl in container will cause the host's sysctl
> > be changed too. it's not expected.
> >
> > This patch set adds the namespace support for netfilter protos.
> >
> > impletement four pernet_operations to register sysctl and initial
> > pernet data for proto.
> >
> > -ipv4_net_ops is used to register tcp4(compat),
> >  udp4(compat),icmp(compat),ipv4(compat).
> > -ipv6_net_ops is used to register tcp6,udp6 and icmpv6.
> > -sctp_net_ops is used to register sctp4(compat) and sctp6.
> > -udplite_net_ops is used to register udplite4 and udplite6
> >
> > extern l[3,4]proto (sysctl) register functions to make them support
> > namespace.
> >
> > finailly add namespace support for cttimeout.
> 
> I am a bit out of it this week so I could not look at these patches
> in the detail that I would like.  However skimming through it looks
> like you addressed your review comments, and the changes look like
> the kind of changes I would expect from something like this.
> 
> I assume you have tested to make certain your code actually works.
> 
> So on that basis for the patchset:
> Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
> 
> Anyone else want to chime in or does everyone else figure
> that this code is ready to be merged and no additional comments
> are necessary?

I also want to see this code in this round of net-next.

Yet, I'd like to have it a closer look to the patches. Please, be
patient.

^ permalink raw reply

* Re: [v12 PATCH 2/3] NETFILTER module xt_hmark, new target for HASH based fwmark
From: Pablo Neira Ayuso @ 2012-05-02  0:34 UTC (permalink / raw)
  To: Hans Schillstrom; +Cc: kaber, jengelh, netfilter-devel, netdev, hans
In-Reply-To: <1335188128-23645-3-git-send-email-hans.schillstrom@ericsson.com>

[-- Attachment #1: Type: text/plain, Size: 1084 bytes --]

Hi Hans,

I have decided to take your patch and give it one spin today.

Please, find it attached. The main things I've done are:

* splitting the code into smaller functions, thus, it becomes more
  maintainable.

* try to put common code into functions, eg. the layer 4 protocol
  parsing to obtain the ports is the same for both IPv4 and IPv6.

* adding the hmark_tuple abstraction, cleaner than using several
  variables to set the address, ports, and so on. Thus, we only pass
  one single pointer to it.

* I have removed most of the comments, they bloat the file and most
  information can be extracted by reading the code. I only left the
  comments that clarify "strange" things.

Regarding ICMP traffic, I think we can use the ID field for the
hashing as well. Thus, we handle ICMP like other protocols.

Please, I'd appreciate if you can test and spot issues after my
rework. I have slightly tested here.

I may make some minor cleanup on it before submission but, in that
case, in that case, I'll post the patch. I would not expect more major
changes in it.

Let me know.

[-- Attachment #2: 0001-netfilter-add-xt_hmark-target-for-hash-based-skb-mar.patch --]
[-- Type: text/x-diff, Size: 15964 bytes --]

>From 2aaa13cb2020d7cd8fe7f30b54e083fecbff9975 Mon Sep 17 00:00:00 2001
From: Hans Schillstrom <hans.schillstrom@ericsson.com>
Date: Mon, 23 Apr 2012 03:35:27 +0000
Subject: [PATCH] netfilter: add xt_hmark target for hash-based skb marking

The target allows you to create rules in the "raw" and "mangle" tables
which set the skbuff mark by means of hash calculation within a given
range. The nfmark can influence the routing method (see "Use netfilter
MARK value as routing key") and can also be used by other subsystems to
change their behaviour.

Some examples:

* Default rule handles all TCP, UDP, SCTP, ESP & AH

 iptables -t mangle -A PREROUTING -m state --state NEW,ESTABLISHED,RELATED \
	-j HMARK --hmark-offset 10000 --hmark-mod 10

* Handle SCTP and hash dest port only and produce a nfmark between 100-119.

 iptables -t mangle -A PREROUTING -p SCTP -j HMARK --src-mask 0 --dst-mask 0 \
	--sp-mask 0 --offset 100 --mod 20

* Fragment safe Layer 3 only, that keep a class C network flow together

 iptables -t mangle -A PREROUTING -j HMARK --method L3 \
	--src-mask 24 --mod 20 --offset 100

[ Many code of this patch has been refactorized by Pablo Neira Ayuso ]

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
---
 include/linux/netfilter/xt_HMARK.h |   62 ++++++
 net/netfilter/Kconfig              |   15 ++
 net/netfilter/Makefile             |    1 +
 net/netfilter/xt_HMARK.c           |  391 ++++++++++++++++++++++++++++++++++++
 4 files changed, 469 insertions(+)
 create mode 100644 include/linux/netfilter/xt_HMARK.h
 create mode 100644 net/netfilter/xt_HMARK.c

diff --git a/include/linux/netfilter/xt_HMARK.h b/include/linux/netfilter/xt_HMARK.h
new file mode 100644
index 0000000..cdf4a8f
--- /dev/null
+++ b/include/linux/netfilter/xt_HMARK.h
@@ -0,0 +1,62 @@
+#ifndef XT_HMARK_H_
+#define XT_HMARK_H_
+
+#include <linux/types.h>
+
+enum {
+	XT_HMARK_NONE,
+	XT_HMARK_SADR_AND,
+	XT_HMARK_DADR_AND,
+	XT_HMARK_SPI_AND,
+	XT_HMARK_SPI_OR,
+	XT_HMARK_SPORT_AND,
+	XT_HMARK_DPORT_AND,
+	XT_HMARK_SPORT_OR,
+	XT_HMARK_DPORT_OR,
+	XT_HMARK_PROTO_AND,
+	XT_HMARK_RND,
+	XT_HMARK_MODULUS,
+	XT_HMARK_OFFSET,
+	XT_HMARK_CT,
+	XT_HMARK_METHOD_L3,
+	XT_HMARK_METHOD_L3_4,
+	XT_F_HMARK_SADR_AND    = 1 << XT_HMARK_SADR_AND,
+	XT_F_HMARK_DADR_AND    = 1 << XT_HMARK_DADR_AND,
+	XT_F_HMARK_SPI_AND     = 1 << XT_HMARK_SPI_AND,
+	XT_F_HMARK_SPI_OR      = 1 << XT_HMARK_SPI_OR,
+	XT_F_HMARK_SPORT_AND   = 1 << XT_HMARK_SPORT_AND,
+	XT_F_HMARK_DPORT_AND   = 1 << XT_HMARK_DPORT_AND,
+	XT_F_HMARK_SPORT_OR    = 1 << XT_HMARK_SPORT_OR,
+	XT_F_HMARK_DPORT_OR    = 1 << XT_HMARK_DPORT_OR,
+	XT_F_HMARK_PROTO_AND   = 1 << XT_HMARK_PROTO_AND,
+	XT_F_HMARK_RND         = 1 << XT_HMARK_RND,
+	XT_F_HMARK_MODULUS     = 1 << XT_HMARK_MODULUS,
+	XT_F_HMARK_OFFSET      = 1 << XT_HMARK_OFFSET,
+	XT_F_HMARK_CT          = 1 << XT_HMARK_CT,
+	XT_F_HMARK_METHOD_L3   = 1 << XT_HMARK_METHOD_L3,
+	XT_F_HMARK_METHOD_L3_4 = 1 << XT_HMARK_METHOD_L3_4,
+};
+
+union hmark_ports {
+	struct {
+		__u16	src;
+		__u16	dst;
+	} p16;
+	__u32	v32;
+};
+
+struct xt_hmark_info {
+	union nf_inet_addr	src_mask;	/* Source address mask */
+	union nf_inet_addr	dst_mask;	/* Dest address mask */
+	union hmark_ports	port_mask;
+	union hmark_ports	port_set;
+	__u32			spi_mask;
+	__u32			spi_set;
+	__u32			flags;		/* Print out only */
+	__u16			proto_mask;	/* L4 Proto mask */
+	__u32			hashrnd;
+	__u32			hmodulus;	/* Modulus */
+	__u32			hoffset;	/* Offset */
+};
+
+#endif /* XT_HMARK_H_ */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index d3f583e..cd5668e 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -517,6 +517,21 @@ config NETFILTER_XT_TARGET_HL
 	since you can easily create immortal packets that loop
 	forever on the network.
 
+config NETFILTER_XT_TARGET_HMARK
+	tristate '"HMARK" target support'
+	depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
+	depends on NETFILTER_ADVANCED
+	---help---
+	This option adds the "HMARK" target.
+
+	The target allows you to create rules in the "raw" and "mangle" tables
+	which set the skbuff mark by means of hash calculation within a given
+	range. The nfmark can influence the routing method (see "Use netfilter
+	MARK value as routing key") and can also be used by other subsystems to
+	change their behaviour.
+
+	To compile it as a module, choose M here. If unsure, say N.
+
 config NETFILTER_XT_TARGET_IDLETIMER
 	tristate  "IDLETIMER target support"
 	depends on NETFILTER_ADVANCED
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 78b8591..2f3bc0f 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -60,6 +60,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_HMARK) += xt_HMARK.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_LOG) += xt_LOG.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_NFLOG) += xt_NFLOG.o
diff --git a/net/netfilter/xt_HMARK.c b/net/netfilter/xt_HMARK.c
new file mode 100644
index 0000000..df743bd
--- /dev/null
+++ b/net/netfilter/xt_HMARK.c
@@ -0,0 +1,391 @@
+/*
+ * xt_HMARK - Netfilter module to set mark as hash value
+ *
+ * (C) 2012 by Hans Schillstrom <hans.schillstrom@ericsson.com>
+ * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * Description:
+ *
+ * This module calculates a hash value that can be modified by modulus and an
+ * offset, i.e. it is possible to produce a skb->mark within a range The hash
+ * value is based on a direction independent five tuple: src & dst addr src &
+ * dst ports and protocol.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_HMARK.h>
+
+#include <net/ip.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+#include <net/ipv6.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#endif
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Hans Schillstrom <hans.schillstrom@ericsson.com>");
+MODULE_DESCRIPTION("Xtables: packet marking using hash calculation");
+MODULE_ALIAS("ipt_HMARK");
+MODULE_ALIAS("ip6t_HMARK");
+
+struct hmark_tuple {
+	u32			src;
+	u32			dst;
+	union hmark_ports	uports;
+	uint8_t			proto;
+};
+
+static inline u32
+hmark_hash(const struct hmark_tuple *t, const struct xt_hmark_info *info)
+{
+	u32 hash;
+
+	hash = jhash_3words(t->src, t->dst, t->uports.v32, info->hashrnd);
+	hash = hash ^ (t->proto & info->proto_mask);
+
+	return (hash % info->hmodulus) + info->hoffset;
+}
+
+static void
+hmark_set_tuple_ports(const struct sk_buff *skb, unsigned int nhoff,
+		      struct hmark_tuple *t, const struct xt_hmark_info *info)
+{
+	int protoff;
+
+	protoff = proto_ports_offset(t->proto);
+	if (protoff < 0)
+		return;
+
+	nhoff += protoff;
+	if (skb_copy_bits(skb, nhoff, &t->uports, sizeof(t->uports)) < 0)
+		return;
+
+	if (t->proto == IPPROTO_ESP || t->proto == IPPROTO_AH)
+		t->uports.v32 = (t->uports.v32 & info->spi_mask) |
+				info->spi_set;
+	else {
+		t->uports.v32 = (t->uports.v32 & info->port_mask.v32) |
+				info->port_set.v32;
+
+		if (t->uports.p16.dst < t->uports.p16.src)
+			swap(t->uports.p16.dst, t->uports.p16.src);
+	}
+}
+
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+static int get_inner6_hdr(const struct sk_buff *skb, int *offset)
+{
+	struct icmp6hdr *icmp6h, _ih6;
+
+	icmp6h = skb_header_pointer(skb, *offset, sizeof(_ih6), &_ih6);
+	if (icmp6h == NULL)
+		return 0;
+
+	if (icmp6h->icmp6_type && icmp6h->icmp6_type < 128) {
+		*offset += sizeof(struct icmp6hdr);
+		return 1;
+	}
+	return 0;
+}
+
+static int
+hmark_ct_set_htuple_ipv6(const struct sk_buff *skb, struct hmark_tuple *t,
+			 const struct xt_hmark_info *info)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct nf_conntrack_tuple *otuple;
+	struct nf_conntrack_tuple *rtuple;
+
+	if (ct == NULL || nf_ct_is_untracked(ct))
+		return -1;
+
+	otuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+	rtuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+
+	t->src = (__force u32)
+		(otuple->src.u3.in6.s6_addr32[0] &
+			info->src_mask.in6.s6_addr32[0]) ^
+		(otuple->src.u3.in6.s6_addr32[1] &
+			info->src_mask.in6.s6_addr32[1]) ^
+		(otuple->src.u3.in6.s6_addr32[2] &
+			info->src_mask.in6.s6_addr32[2]) ^
+		(otuple->src.u3.in6.s6_addr32[3] &
+			info->src_mask.in6.s6_addr32[3]);
+	t->dst = (__force u32)
+		(otuple->src.u3.in6.s6_addr32[0] &
+			info->dst_mask.in6.s6_addr32[0]) ^
+		(otuple->src.u3.in6.s6_addr32[1] &
+			info->dst_mask.in6.s6_addr32[1]) ^
+		(otuple->src.u3.in6.s6_addr32[2] &
+			info->dst_mask.in6.s6_addr32[2]) ^
+		(otuple->src.u3.in6.s6_addr32[3] &
+			info->dst_mask.in6.s6_addr32[3]);
+
+	t->proto = nf_ct_protonum(ct);
+	if (t->proto != IPPROTO_ICMP) {
+		t->uports.p16.src = (otuple->src.u.all & info->port_mask.v32) |
+					info->port_set.v32;
+		t->uports.p16.dst = (rtuple->src.u.all & info->port_mask.v32) |
+					info->port_set.v32;
+	}
+
+	return 0;
+#else
+	return -1;
+#endif
+}
+
+static int
+hmark_pkt_set_htuple_ipv6(const struct sk_buff *skb, struct hmark_tuple *t,
+			  const struct xt_hmark_info *info)
+{
+	struct ipv6hdr *ip6, _ip6;
+	int flag = IP6T_FH_F_AUTH; /* Ports offset, find_hdr flags */
+	unsigned int nhoff = 0;
+	u16 fragoff = 0;
+	u8 nexthdr;
+
+	ip6 = (struct ipv6hdr *) (skb->data + skb_network_offset(skb));
+	nexthdr = ipv6_find_hdr(skb, &nhoff, -1, &fragoff, &flag);
+	if (nexthdr < 0)
+		return 0;
+	/* No need to check for icmp errors on fragments */
+	if ((flag & IP6T_FH_F_FRAG) || (nexthdr != IPPROTO_ICMPV6))
+		goto noicmp;
+	/* if an icmp error, use the inner header */
+	if (get_inner6_hdr(skb, &nhoff)) {
+		ip6 = skb_header_pointer(skb, nhoff, sizeof(_ip6), &_ip6);
+		if (ip6 == NULL)
+			return XT_CONTINUE;
+		/* Treat AH as ESP, use SPI nothing else. */
+		flag = IP6T_FH_F_AUTH;
+		nexthdr = ipv6_find_hdr(skb, &nhoff, -1, &fragoff, &flag);
+		if (nexthdr < 0)
+			return XT_CONTINUE;
+	}
+noicmp:
+	t->src = (__force u32)
+		(ip6->saddr.s6_addr32[0] & info->src_mask.in6.s6_addr32[0]) ^
+		(ip6->saddr.s6_addr32[1] & info->src_mask.in6.s6_addr32[1]) ^
+		(ip6->saddr.s6_addr32[2] & info->src_mask.in6.s6_addr32[2]) ^
+		(ip6->saddr.s6_addr32[3] & info->src_mask.in6.s6_addr32[3]);
+	t->dst = (__force u32)
+		(ip6->daddr.s6_addr32[0] & info->dst_mask.in6.s6_addr32[0]) ^
+		(ip6->daddr.s6_addr32[1] & info->dst_mask.in6.s6_addr32[1]) ^
+		(ip6->daddr.s6_addr32[2] & info->dst_mask.in6.s6_addr32[2]) ^
+		(ip6->daddr.s6_addr32[3] & info->dst_mask.in6.s6_addr32[3]);
+
+	t->proto = nexthdr;
+
+	if (t->proto == IPPROTO_ICMPV6)
+		return 0;
+
+	if (flag & IP6T_FH_F_FRAG)
+		return 0;
+
+	if (!(info->flags & XT_F_HMARK_METHOD_L3))
+		hmark_set_tuple_ports(skb, nhoff, t, info);
+
+	return 0;
+}
+
+static unsigned int
+hmark_tg_v6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_hmark_info *info = par->targinfo;
+	struct hmark_tuple t;
+
+	memset(&t, 0, sizeof(struct hmark_tuple));
+
+	if (info->flags & XT_F_HMARK_CT) {
+		if (hmark_ct_set_htuple_ipv6(skb, &t, info) < 0)
+			return XT_CONTINUE;
+	} else {
+		if (hmark_pkt_set_htuple_ipv6(skb, &t, info) < 0)
+			return XT_CONTINUE;
+	}
+
+	skb->mark = hmark_hash(&t, info);
+	return XT_CONTINUE;
+}
+#endif
+
+static int get_inner_hdr(const struct sk_buff *skb, int iphsz, int *nhoff)
+{
+	const struct icmphdr *icmph;
+	struct icmphdr _ih;
+
+	/* Not enough header? */
+	icmph = skb_header_pointer(skb, *nhoff + iphsz, sizeof(_ih), &_ih);
+	if (icmph == NULL && icmph->type > NR_ICMP_TYPES)
+		return 0;
+
+	/* Error message? */
+	if (icmph->type != ICMP_DEST_UNREACH &&
+	    icmph->type != ICMP_SOURCE_QUENCH &&
+	    icmph->type != ICMP_TIME_EXCEEDED &&
+	    icmph->type != ICMP_PARAMETERPROB &&
+	    icmph->type != ICMP_REDIRECT)
+		return 0;
+
+	*nhoff += iphsz + sizeof(_ih);
+	return 1;
+}
+
+static int
+hmark_ct_set_htuple_ipv4(const struct sk_buff *skb, struct hmark_tuple *t,
+			 const struct xt_hmark_info *info)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct nf_conntrack_tuple *otuple;
+	struct nf_conntrack_tuple *rtuple;
+
+	if (ct == NULL || nf_ct_is_untracked(ct))
+		return -1;
+
+	otuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+	rtuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+
+	t->src = (__force u32) otuple->src.u3.in.s_addr &
+			info->src_mask.in.s_addr;
+	t->dst = (__force u32) rtuple->src.u3.in.s_addr &
+			info->dst_mask.in.s_addr;
+
+	t->proto = nf_ct_protonum(ct);
+	if (t->proto != IPPROTO_ICMP) {
+		t->uports.p16.src = (otuple->src.u.all & info->port_mask.v32) |
+					info->port_set.v32;
+		t->uports.p16.dst = (rtuple->src.u.all & info->port_mask.v32) |
+					info->port_set.v32;
+	}
+	return 0;
+#else
+	return -1;
+#endif
+}
+
+static int
+hmark_pkt_set_htuple_ipv4(const struct sk_buff *skb, struct hmark_tuple *t,
+			  const struct xt_hmark_info *info)
+{
+	struct iphdr *ip, _ip;
+	int nhoff = skb_network_offset(skb);
+
+	ip = (struct iphdr *) (skb->data + nhoff);
+	if (ip->protocol == IPPROTO_ICMP) {
+		/* use inner header in case of ICMP errors */
+		if (get_inner_hdr(skb, ip->ihl * 4, &nhoff)) {
+			ip = skb_header_pointer(skb, nhoff, sizeof(_ip), &_ip);
+			if (ip == NULL)
+				return 0;
+		}
+	}
+
+	t->src = (__force u32) ip->saddr;
+	t->dst = (__force u32) ip->daddr;
+
+	/* this ensures consistent hashing for both directions */
+	if (t->dst < t->src)
+		swap(t->src, t->dst);
+
+	t->src &= info->src_mask.ip;
+	t->dst &= info->dst_mask.ip;
+
+	t->proto = ip->protocol;
+
+	/* ICMP has no ports, skip */
+	if (t->proto == IPPROTO_ICMP)
+		return 0;
+
+	/* follow-up fragments don't contain ports, skip */
+	if (ip->frag_off & htons(IP_MF | IP_OFFSET))
+		return 0;
+
+	if (!(info->flags & XT_F_HMARK_METHOD_L3))
+		hmark_set_tuple_ports(skb, ip->ihl * 4, t, info);
+
+	return 0;
+}
+
+static unsigned int
+hmark_tg_v4(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_hmark_info *info = par->targinfo;
+	struct hmark_tuple t;
+
+	memset(&t, 0, sizeof(struct hmark_tuple));
+
+	if (info->flags & XT_F_HMARK_CT) {
+		if (hmark_ct_set_htuple_ipv4(skb, &t, info) < 0)
+			return XT_CONTINUE;
+	} else {
+		if (hmark_pkt_set_htuple_ipv4(skb, &t, info) < 0)
+			return XT_CONTINUE;
+	}
+
+	skb->mark = hmark_hash(&t, info);
+	return XT_CONTINUE;
+}
+
+static int hmark_tg_check(const struct xt_tgchk_param *par)
+{
+	const struct xt_hmark_info *info = par->targinfo;
+
+	if (!info->hmodulus) {
+		pr_info("xt_HMARK: hash modulus can't be zero\n");
+		return -EINVAL;
+	}
+	if (info->proto_mask && (info->flags & XT_F_HMARK_METHOD_L3)) {
+		pr_info("xt_HMARK: proto mask must be zero with L3 mode\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static struct xt_target hmark_tg_reg[] __read_mostly = {
+	{
+		.name		= "HMARK",
+		.family		= NFPROTO_IPV4,
+		.target		= hmark_tg_v4,
+		.targetsize	= sizeof(struct xt_hmark_info),
+		.checkentry	= hmark_tg_check,
+		.me		= THIS_MODULE,
+	},
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+	{
+		.name		= "HMARK",
+		.family		= NFPROTO_IPV6,
+		.target		= hmark_tg_v6,
+		.targetsize	= sizeof(struct xt_hmark_info),
+		.checkentry	= hmark_tg_check,
+		.me		= THIS_MODULE,
+	},
+#endif
+};
+
+static int __init hmark_tg_init(void)
+{
+	return xt_register_targets(hmark_tg_reg, ARRAY_SIZE(hmark_tg_reg));
+}
+
+static void __exit hmark_tg_exit(void)
+{
+	xt_unregister_targets(hmark_tg_reg, ARRAY_SIZE(hmark_tg_reg));
+}
+
+module_init(hmark_tg_init);
+module_exit(hmark_tg_exit);
-- 
1.7.9.5


^ permalink raw reply related

* Re: [PATCH 3/4 v2 net-next] net: make GRO aware of skb->head_frag
From: Alexander Duyck @ 2012-05-01 23:10 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Alexander Duyck, David Miller, netdev, Neal Cardwell, Tom Herbert,
	Jeff Kirsher, Michael Chan, Matt Carlson, Herbert Xu,
	Ben Hutchings, Ilpo Järvinen, Maciej Żenczykowski
In-Reply-To: <4FA06A94.8050704@intel.com>

On 05/01/2012 03:58 PM, Alexander Duyck wrote:
> On 05/01/2012 10:04 AM, Eric Dumazet wrote:
>> On Tue, 2012-05-01 at 09:17 -0700, Alexander Duyck wrote:
>>> On 04/30/2012 11:39 PM, Eric Dumazet wrote:
>>>> On Mon, 2012-04-30 at 22:33 -0700, Alexander Duyck wrote:
>>>>
>>>>> The question I had was more specific to GRO.  As long as we have
>>>>> skb->users == 1 and the skb isn't cloned we should be fine.   It just
>>>>> hadn't occurred to me before that napi_gro_receive had the extra
>>>>> requirement that the skb couldn't be cloned.
>>>>>
>>>> OK
>>>>
>>>> By the way, even if skb was cloned, we would be allowed to steal
>>>> skb->head.
>>>>
>>>> When we clone an oskb we :
>>>>
>>>> 1) allocate a struct nskb sk_buff (or use the shadow in case of TCP)
>>>> 2) increment dataref
>>> The problem I have is with this piece right here.  So you increment
>>> dataref.  Now you have an skb that is still pointing to the shared info
>>> on this page and dataref is 2.  What about the side that is stealing the
>>> head?  Is it going to be tracking the dataref as well and decrementing
>>> it before put_page or does it just assume that dataref is 1 and call
>>> put_page directly?  I am guessing the latter since I didn't see anything
>>> that allowed for tracking the dataref of stolen heads.
>> The only changed thing is the kfree() replaced by put_page()
>>
>> This kfree() was done when last reference to dataref was released.
>>
>> If we had a problem before, we have same problem after my patch.
>>
>> Truth is : In TCP (coalesce and splice()) and GRO, we owns skbs.
>>
>> (See the various __kfree_skb(skb) calls in net/ipv4/tcp_input.c
>> There is one exception in ipv6 / treq->pktopts ) but its for SYN packet
>> and this wont be merged with a previous packet.
> Eric,
>
> I think I have found a bug, although it is not specific to this patch
> but it is related.  It looks like the TCP coalesce code is causing
> tcpdump to fail when using frags.  Based on the comments in the patch I
> am assuming you have an ixgbe adapter to test with as that is what I
> reproduced this on.  To reproduce the issue all you need to do is run
> "tcpdump -i ethX > /dev/null" on one console, and on a second console
> run a netperf TCP_MAERTS test to some other server.  Tcpdump will exit
> out with a message about bad address like this:
>
> tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
> listening on eth1, link-type EN10MB (Ethernet), capture size 96 bytes
> tcpdump: pcap_loop: recvfrom: Bad address
> 682 packets captured
> 2357 packets received by filter
> 1543 packets dropped by kernel
>
>
> A bisect of the issue tracked it down to:
>
> 1402d366019fedaa2b024f2bac06b7cc9a8782e1 is first bad commit
> commit 1402d366019fedaa2b024f2bac06b7cc9a8782e1
> Author: Eric Dumazet <edumazet@google.com>
> Date:   Mon Apr 23 07:11:42 2012 +0000
>
>     tcp: introduce tcp_try_coalesce
>     
>     commit c8628155ece3 (tcp: reduce out_of_order memory use) took care of
>     coalescing tcp segments provided by legacy devices (linear skbs)
>     
>     We extend this idea to fragged skbs, as their truesize can be heavy.
>     
>     ixgbe for example uses 256+1024+PAGE_SIZE/2 = 3328 bytes per segment.
>     
>     Use this coalescing strategy for receive queue too.
>     
>     This contributes to reduce number of tcp collapses, at minimal cost, and
>     reduces memory overhead and packets drops.
>     
>     Signed-off-by: Eric Dumazet <edumazet@google.com>
>     Cc: Neal Cardwell <ncardwell@google.com>
>     Cc: Tom Herbert <therbert@google.com>
>     Cc: Maciej Å»enczykowski <maze@google.com>
>     Cc: Ilpo JÃ¤rvinen <ilpo.jarvinen@helsinki.fi>
>     Acked-by: Neal Cardwell <ncardwell@google.com>
>     Signed-off-by: David S. Miller <davem@davemloft.net>
>
> :040000 040000 8ca3e0b4e6c6a8f375fd800069d24203880623f3 2576d34c5c9cfc717a11e2ebe054143956716b93 M	net
>
> I suspect we are dealing with either a shared or cloned skb in this
> case, though I haven't verified which it is yet.
>
> Thanks,
>
> Alex
>
One additional note.  It looks like LRO and GRO need to be disabled to
trigger the bug.  If either of them are enabled it doesn't seem to
occur.  Likely due to the fact that they are doing the coalescing before
it gets up to the tcp_try_coalesce call.

Thanks,

Alex

^ permalink raw reply

* Re: [PATCH 3/4 v2 net-next] net: make GRO aware of skb->head_frag
From: Alexander Duyck @ 2012-05-01 22:58 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Alexander Duyck, David Miller, netdev, Neal Cardwell, Tom Herbert,
	Jeff Kirsher, Michael Chan, Matt Carlson, Herbert Xu,
	Ben Hutchings, Ilpo Järvinen, Maciej Żenczykowski
In-Reply-To: <1335891892.22133.23.camel@edumazet-glaptop>

On 05/01/2012 10:04 AM, Eric Dumazet wrote:
> On Tue, 2012-05-01 at 09:17 -0700, Alexander Duyck wrote:
>> On 04/30/2012 11:39 PM, Eric Dumazet wrote:
>>> On Mon, 2012-04-30 at 22:33 -0700, Alexander Duyck wrote:
>>>
>>>> The question I had was more specific to GRO.  As long as we have
>>>> skb->users == 1 and the skb isn't cloned we should be fine.   It just
>>>> hadn't occurred to me before that napi_gro_receive had the extra
>>>> requirement that the skb couldn't be cloned.
>>>>
>>> OK
>>>
>>> By the way, even if skb was cloned, we would be allowed to steal
>>> skb->head.
>>>
>>> When we clone an oskb we :
>>>
>>> 1) allocate a struct nskb sk_buff (or use the shadow in case of TCP)
>>> 2) increment dataref
>> The problem I have is with this piece right here.  So you increment
>> dataref.  Now you have an skb that is still pointing to the shared info
>> on this page and dataref is 2.  What about the side that is stealing the
>> head?  Is it going to be tracking the dataref as well and decrementing
>> it before put_page or does it just assume that dataref is 1 and call
>> put_page directly?  I am guessing the latter since I didn't see anything
>> that allowed for tracking the dataref of stolen heads.
> The only changed thing is the kfree() replaced by put_page()
>
> This kfree() was done when last reference to dataref was released.
>
> If we had a problem before, we have same problem after my patch.
>
> Truth is : In TCP (coalesce and splice()) and GRO, we owns skbs.
>
> (See the various __kfree_skb(skb) calls in net/ipv4/tcp_input.c
> There is one exception in ipv6 / treq->pktopts ) but its for SYN packet
> and this wont be merged with a previous packet.
Eric,

I think I have found a bug, although it is not specific to this patch
but it is related.  It looks like the TCP coalesce code is causing
tcpdump to fail when using frags.  Based on the comments in the patch I
am assuming you have an ixgbe adapter to test with as that is what I
reproduced this on.  To reproduce the issue all you need to do is run
"tcpdump -i ethX > /dev/null" on one console, and on a second console
run a netperf TCP_MAERTS test to some other server.  Tcpdump will exit
out with a message about bad address like this:

tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth1, link-type EN10MB (Ethernet), capture size 96 bytes
tcpdump: pcap_loop: recvfrom: Bad address
682 packets captured
2357 packets received by filter
1543 packets dropped by kernel


A bisect of the issue tracked it down to:

1402d366019fedaa2b024f2bac06b7cc9a8782e1 is first bad commit
commit 1402d366019fedaa2b024f2bac06b7cc9a8782e1
Author: Eric Dumazet <edumazet@google.com>
Date:   Mon Apr 23 07:11:42 2012 +0000

    tcp: introduce tcp_try_coalesce
    
    commit c8628155ece3 (tcp: reduce out_of_order memory use) took care of
    coalescing tcp segments provided by legacy devices (linear skbs)
    
    We extend this idea to fragged skbs, as their truesize can be heavy.
    
    ixgbe for example uses 256+1024+PAGE_SIZE/2 = 3328 bytes per segment.
    
    Use this coalescing strategy for receive queue too.
    
    This contributes to reduce number of tcp collapses, at minimal cost, and
    reduces memory overhead and packets drops.
    
    Signed-off-by: Eric Dumazet <edumazet@google.com>
    Cc: Neal Cardwell <ncardwell@google.com>
    Cc: Tom Herbert <therbert@google.com>
    Cc: Maciej Å»enczykowski <maze@google.com>
    Cc: Ilpo JÃ¤rvinen <ilpo.jarvinen@helsinki.fi>
    Acked-by: Neal Cardwell <ncardwell@google.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>

:040000 040000 8ca3e0b4e6c6a8f375fd800069d24203880623f3 2576d34c5c9cfc717a11e2ebe054143956716b93 M	net

I suspect we are dealing with either a shared or cloned skb in this
case, though I haven't verified which it is yet.

Thanks,

Alex

^ permalink raw reply

* Re: [PATCH 05/11] mm: swap: Implement generic handler for swap_activate
From: Andrew Morton @ 2012-05-01 22:57 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Linux-MM, Linux-Netdev, Linux-NFS, LKML, David Miller,
	Trond Myklebust, Neil Brown, Christoph Hellwig, Peter Zijlstra,
	Mike Christie, Eric B Munson
In-Reply-To: <1334578675-23445-6-git-send-email-mgorman@suse.de>

On Mon, 16 Apr 2012 13:17:49 +0100
Mel Gorman <mgorman@suse.de> wrote:

> The version of swap_activate introduced is sufficient for swap-over-NFS
> but would not provide enough information to implement a generic handler.
> This patch shuffles things slightly to ensure the same information is
> available for aops->swap_activate() as is available to the core.
> 
> No functionality change.
> 
> ...
>
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -587,6 +587,8 @@ typedef struct {
>  typedef int (*read_actor_t)(read_descriptor_t *, struct page *,
>  		unsigned long, unsigned long);
>  
> +struct swap_info_struct;

Please put forward declarations at top-of-file.  To prevent accidental
duplication later on.

>  struct address_space_operations {
>  	int (*writepage)(struct page *page, struct writeback_control *wbc);
>  	int (*readpage)(struct file *, struct page *);
>
> ...
>
> --- a/mm/page_io.c
> +++ b/mm/page_io.c

Have you tested all this code with CONFIG_SWAP=n?

Have you sought to minimise additional new code when CONFIG_SWAP=n?


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH 04/11] mm: Add support for a filesystem to activate swap files and use direct_IO for writing swap pages
From: Andrew Morton @ 2012-05-01 22:53 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Linux-MM, Linux-Netdev, Linux-NFS, LKML, David Miller,
	Trond Myklebust, Neil Brown, Christoph Hellwig, Peter Zijlstra,
	Mike Christie, Eric B Munson
In-Reply-To: <1334578675-23445-5-git-send-email-mgorman-l3A5Bk7waGM@public.gmane.org>

On Mon, 16 Apr 2012 13:17:48 +0100
Mel Gorman <mgorman-l3A5Bk7waGM@public.gmane.org> wrote:

> Currently swapfiles are managed entirely by the core VM by using ->bmap
> to allocate space and write to the blocks directly. This effectively
> ensures that the underlying blocks are allocated and avoids the need
> for the swap subsystem to locate what physical blocks store offsets
> within a file.
> 
> If the swap subsystem is to use the filesystem information to locate
> the blocks, it is critical that information such as block groups,
> block bitmaps and the block descriptor table that map the swap file
> were resident in memory. This patch adds address_space_operations that
> the VM can call when activating or deactivating swap backed by a file.
> 
>   int swap_activate(struct file *);
>   int swap_deactivate(struct file *);
> 
> The ->swap_activate() method is used to communicate to the
> file that the VM relies on it, and the address_space should take
> adequate measures such as reserving space in the underlying device,
> reserving memory for mempools and pinning information such as the
> block descriptor table in memory. The ->swap_deactivate() method is
> called on sys_swapoff() if ->swap_activate() returned success.
> 
> After a successful swapfile ->swap_activate, the swapfile
> is marked SWP_FILE and swapper_space.a_ops will proxy to
> sis->swap_file->f_mappings->a_ops using ->direct_io to write swapcache
> pages and ->readpage to read.
> 
> It is perfectly possible that direct_IO be used to read the swap
> pages but it is an unnecessary complication. Similarly, it is possible
> that ->writepage be used instead of direct_io to write the pages but
> filesystem developers have stated that calling writepage from the VM
> is undesirable for a variety of reasons and using direct_IO opens up
> the possibility of writing back batches of swap pages in the future.

This all seems a bit odd.  And abusive.

Yes, it would be more pleasing if direct-io was used for reading as
well.  How much more complication would it add?

If I understand correctly, on the read path we're taking a fresh page
which is destined for swapcache and then pretending that it is a
pagecache page for the purpose of the I/O?  If there already existed a
pagecache page for that file offset then we let it just sit there and
bypass it?

I'm surprised that this works at all - I guess nothing under
->readpage() goes poking around in the address_space.  For NFS, at
least!

>
> ...
>
> @@ -93,11 +94,38 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
>  {
>  	struct bio *bio;
>  	int ret = 0, rw = WRITE;
> +	struct swap_info_struct *sis = page_swap_info(page);
>  
>  	if (try_to_free_swap(page)) {
>  		unlock_page(page);
>  		goto out;
>  	}
> +
> +	if (sis->flags & SWP_FILE) {
> +		struct kiocb kiocb;
> +		struct file *swap_file = sis->swap_file;
> +		struct address_space *mapping = swap_file->f_mapping;
> +		struct iovec iov = {
> +			.iov_base = page_address(page),

Didn't we need to kmap the page?

> +			.iov_len  = PAGE_SIZE,
> +		};
> +
> +		init_sync_kiocb(&kiocb, swap_file);
> +		kiocb.ki_pos = page_file_offset(page);
> +		kiocb.ki_left = PAGE_SIZE;
> +		kiocb.ki_nbytes = PAGE_SIZE;
> +
> +		unlock_page(page);
> +		ret = mapping->a_ops->direct_IO(KERNEL_WRITE,
> +						&kiocb, &iov,
> +						kiocb.ki_pos, 1);

I wonder if there's any point in setting PG_writeback around the IO.  I
can't think of a reason.

> +		if (ret == PAGE_SIZE) {
> +			count_vm_event(PSWPOUT);
> +			ret = 0;
> +		}
> +		return ret;
> +	}
> +
>  	bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
>  	if (bio == NULL) {
>  		set_page_dirty(page);
> @@ -119,9 +147,21 @@ int swap_readpage(struct page *page)
>  {
>  	struct bio *bio;
>  	int ret = 0;
> +	struct swap_info_struct *sis = page_swap_info(page);
>  
>  	VM_BUG_ON(!PageLocked(page));
>  	VM_BUG_ON(PageUptodate(page));
> +
> +	if (sis->flags & SWP_FILE) {
> +		struct file *swap_file = sis->swap_file;
> +		struct address_space *mapping = swap_file->f_mapping;
> +
> +		ret = mapping->a_ops->readpage(swap_file, page);
> +		if (!ret)
> +			count_vm_event(PSWPIN);
> +		return ret;
> +	}

Confused.  Where did we set up page->index with the file offset?

>  	bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
>  	if (bio == NULL) {
>  		unlock_page(page);
> @@ -133,3 +173,15 @@ int swap_readpage(struct page *page)
>  out:
>  	return ret;
>  }
> +
> +int swap_set_page_dirty(struct page *page)
> +{
> +	struct swap_info_struct *sis = page_swap_info(page);
> +
> +	if (sis->flags & SWP_FILE) {
> +		struct address_space *mapping = sis->swap_file->f_mapping;
> +		return mapping->a_ops->set_page_dirty(page);
> +	} else {
> +		return __set_page_dirty_nobuffers(page);
> +	}
> +}

More confused.  This is a swapcache page, not a pagecache page?  Why
are we running set_page_dirty() against it?

And what are we doing on the !SWP_FILE path?  Newly setting PG_dirty
against block-device-backed swapcache pages?  Why?  Where does it get
cleared again?

> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 9d3dd37..c25b9cf 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -26,7 +26,7 @@
>   */
>  static const struct address_space_operations swap_aops = {
>  	.writepage	= swap_writepage,
> -	.set_page_dirty	= __set_page_dirty_nobuffers,
> +	.set_page_dirty	= swap_set_page_dirty,
>  	.migratepage	= migrate_page,
>  };
>
> ...
>
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [E1000-devel] e1000e tx queue timeout in 3.3.0 (bisected to BQL support for e1000e)
From: Jeff Kirsher @ 2012-05-01 22:52 UTC (permalink / raw)
  To: David Miller; +Cc: greearb, john.r.fastabend, netdev, e1000-devel, therbert
In-Reply-To: <20120501.184650.450772649940152534.davem@davemloft.net>

[-- Attachment #1: Type: text/plain, Size: 490 bytes --]

On Tue, 2012-05-01 at 18:46 -0400, David Miller wrote:
> From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> Date: Tue, 01 May 2012 15:42:28 -0700
> 
> > I will take care of the stable submission's Ben.
> 
> Actually, Jeff, hold off on that.
> 
> I misread Ben's email and didn't see that these were not
> driver-specific changes.
> 
> Therefore, I'll take care of the -stable submissions and you don't
> need to worry about it.
> 
> Thanks everyone.

Ok, sounds good Dave.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* Re: e1000e tx queue timeout in 3.3.0 (bisected to BQL support for e1000e)
From: David Miller @ 2012-05-01 22:46 UTC (permalink / raw)
  To: jeffrey.t.kirsher; +Cc: john.r.fastabend, netdev, therbert, e1000-devel
In-Reply-To: <1335912148.2656.11.camel@jtkirshe-mobl>

From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Tue, 01 May 2012 15:42:28 -0700

> I will take care of the stable submission's Ben.

Actually, Jeff, hold off on that.

I misread Ben's email and didn't see that these were not
driver-specific changes.

Therefore, I'll take care of the -stable submissions and you don't
need to worry about it.

Thanks everyone.

------------------------------------------------------------------------------
Live Security Virtual Conference
Exclusive live event will cover all the ways today's security and 
threat landscape has changed and how IT managers can respond. Discussions 
will include endpoint security, mobile security and the latest in malware 
threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/
_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel&#174; Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply

* Re: [E1000-devel] e1000e tx queue timeout in 3.3.0 (bisected to BQL support for e1000e)
From: Jeff Kirsher @ 2012-05-01 22:42 UTC (permalink / raw)
  To: Ben Greear; +Cc: David Miller, john.r.fastabend, netdev, e1000-devel, therbert
In-Reply-To: <4FA05ED2.3030608@candelatech.com>

[-- Attachment #1: Type: text/plain, Size: 1059 bytes --]

On Tue, 2012-05-01 at 15:08 -0700, Ben Greear wrote:
> On 05/01/2012 02:49 PM, David Miller wrote:
> > From: Ben Greear<greearb@candelatech.com>
> > Date: Tue, 01 May 2012 14:10:43 -0700
> >
> >> On 04/20/2012 02:56 PM, Ben Greear wrote:
> >>> On 04/20/2012 02:21 PM, Tom Herbert wrote:
> >>>> Thanks John for pointers to those.  Ben, are you running a kernel
> with
> >>>> these patches?
> >>>
> >>> I just tested this on my e1000e and igb machine.  With these
> patches,
> >>> I can no longer reproduce the problem.
> >>>
> >>> So, please make sure those are queued up for 3.3 stable!
> >>
> >> Dave:  I think these patches below should go to 3.3 stable.
> >>
> >> They are not queued for stable yet as far as I can tell.
> >
> > I let the Intel developers handle -stable submissions for their
> > drivers.
> 
> I don't think this is specific to their drivers.
> 
> But, as long as _someone_ is pushing it to stable,
> it is fine by me.
> 
> Thanks,
> Ben 

I will take care of the stable submission's Ben.

Cheers,
Jeff

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* Re: [PATCH 00/16] Swap-over-NBD without deadlocking V9
From: Andrew Morton @ 2012-05-01 22:28 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Neil Brown,
	Peter Zijlstra, Mike Christie, Eric B Munson
In-Reply-To: <1334578624-23257-1-git-send-email-mgorman@suse.de>


This patchset is far less ghastly than I feared/remembered/dreamed ;)

The mm parts, anyway.  Are the net guys on board with it all?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH 15/16] mm: Throttle direct reclaimers if PF_MEMALLOC reserves are low and swap is backed by network storage
From: Andrew Morton @ 2012-05-01 22:24 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Neil Brown,
	Peter Zijlstra, Mike Christie, Eric B Munson
In-Reply-To: <1334578624-23257-16-git-send-email-mgorman@suse.de>

On Mon, 16 Apr 2012 13:17:02 +0100
Mel Gorman <mgorman@suse.de> wrote:

> If swap is backed by network storage such as NBD, there is a risk
> that a large number of reclaimers can hang the system by consuming
> all PF_MEMALLOC reserves. To avoid these hangs, the administrator
> must tune min_free_kbytes in advance which is a bit fragile.
> 
> This patch throttles direct reclaimers if half the PF_MEMALLOC reserves
> are in use. If the system is routinely getting throttled the system
> administrator can increase min_free_kbytes so degradation is smoother
> but the system will keep running.
> 
>
> ...
>
> +static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
> +{
> +	struct zone *zone;
> +	unsigned long pfmemalloc_reserve = 0;
> +	unsigned long free_pages = 0;
> +	int i;
> +	bool wmark_ok;
> +
> +	for (i = 0; i <= ZONE_NORMAL; i++) {
> +		zone = &pgdat->node_zones[i];
> +		pfmemalloc_reserve += min_wmark_pages(zone);
> +		free_pages += zone_page_state(zone, NR_FREE_PAGES);
> +	}
> +
> +	wmark_ok = (free_pages > pfmemalloc_reserve / 2) ? true : false;

	wmark_ok = free_pages > pfmemalloc_reserve / 2;

> +
> +	/* kswapd must be awake if processes are being throttled */
> +	if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
> +		pgdat->classzone_idx = min(pgdat->classzone_idx,
> +						(enum zone_type)ZONE_NORMAL);
> +		wake_up_interruptible(&pgdat->kswapd_wait);
> +	}
> +
> +	return wmark_ok;
> +}
> +
> +/*
> + * Throttle direct reclaimers if backing storage is backed by the network
> + * and the PFMEMALLOC reserve for the preferred node is getting dangerously
> + * depleted. kswapd will continue to make progress and wake the processes
> + * when the low watermark is reached
> + */
> +static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
> +					nodemask_t *nodemask)
> +{
> +	struct zone *zone;
> +	int high_zoneidx = gfp_zone(gfp_mask);
> +	pg_data_t *pgdat;
> +
> +	/* Kernel threads such as kjournald should not be throttled */

The comment should explain "why", not "what".  Particularly when the
"what" was bleedin obvious ;)

Also...   why?

> +	if (current->flags & PF_KTHREAD)
> +		return;
> +
> +	/* Check if the pfmemalloc reserves are ok */
> +	first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
> +	pgdat = zone->zone_pgdat;
> +	if (pfmemalloc_watermark_ok(pgdat))
> +		return;
> +
> +	/*
> +	 * If the caller cannot enter the filesystem, it's possible that it
> +	 * is processing a journal transaction. In this case, it is not safe
> +	 * to block on pfmemalloc_wait as kswapd could also be blocked waiting
> +	 * to start a transaction. Instead, throttle for up to a second before
> +	 * the reclaim must continue.
> +	 */

I suppose this applies to fs locks in general, not just to
journal_start()?

> +	if (!(gfp_mask & __GFP_FS)) {
> +		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
> +			pfmemalloc_watermark_ok(pgdat), HZ);
> +		return;
> +	}
> +
> +	/* Throttle until kswapd wakes the process */
> +	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
> +		pfmemalloc_watermark_ok(pgdat));
> +}
> +
>  unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
>  				gfp_t gfp_mask, nodemask_t *nodemask)
>  {
>
> ...
>
> @@ -2610,6 +2686,20 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
>  	if (remaining)
>  		return true;
>  
> +	/*
> +	 * There is a potential race between when kswapd checks it watermarks

"its"

> +	 * and a process gets throttled. There is also a potential race if
> +	 * processes get throttled, kswapd wakes, a large process exits therby
> +	 * balancing the zones that causes kswapd to miss a wakeup. If kswapd
> +	 * is going to sleep, no process should be sleeping on pfmemalloc_wait
> +	 * so wake them now if necessary. If necessary, processes will wake
> +	 * kswapd and get throttled again
> +	 */

Yes, the possibility for missed wakeups here worried me.  There's no
synchronization and it would be easy to leave holes.

It's good that there is no timeout on the throttling - a timeout would
cover up rare races most nastily.

> +	if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
> +		wake_up(&pgdat->pfmemalloc_wait);
> +		return true;
> +	}

A bool-returning function called "sleeping_prematurely" should have no
side-effects.  But it now performs wakeups.  Wanna see if there is a
way of making this nicer?

>  	/* Check the watermark levels */
>  	for (i = 0; i <= classzone_idx; i++) {
>  		struct zone *zone = pgdat->node_zones + i;
> @@ -2871,6 +2961,12 @@ loop_again:
>  			}
>  
>  		}
> +
> +		/* Wake throttled direct reclaimers if low watermark is met */

s/"what"/"why"/ !

> +		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
> +				pfmemalloc_watermark_ok(pgdat))
> +			wake_up(&pgdat->pfmemalloc_wait);
> +
>  		if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
>  			break;		/* kswapd: all done */
>  		/*
>
> ...
>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: e1000e tx queue timeout in 3.3.0 (bisected to BQL support for e1000e)
From: Ben Greear @ 2012-05-01 22:08 UTC (permalink / raw)
  To: David Miller; +Cc: john.r.fastabend, netdev, e1000-devel, therbert
In-Reply-To: <20120501.174914.654247970871851432.davem@davemloft.net>

On 05/01/2012 02:49 PM, David Miller wrote:
> From: Ben Greear<greearb@candelatech.com>
> Date: Tue, 01 May 2012 14:10:43 -0700
>
>> On 04/20/2012 02:56 PM, Ben Greear wrote:
>>> On 04/20/2012 02:21 PM, Tom Herbert wrote:
>>>> Thanks John for pointers to those.  Ben, are you running a kernel with
>>>> these patches?
>>>
>>> I just tested this on my e1000e and igb machine.  With these patches,
>>> I can no longer reproduce the problem.
>>>
>>> So, please make sure those are queued up for 3.3 stable!
>>
>> Dave:  I think these patches below should go to 3.3 stable.
>>
>> They are not queued for stable yet as far as I can tell.
>
> I let the Intel developers handle -stable submissions for their
> drivers.

I don't think this is specific to their drivers.

But, as long as _someone_ is pushing it to stable,
it is fine by me.

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com


------------------------------------------------------------------------------
Live Security Virtual Conference
Exclusive live event will cover all the ways today's security and 
threat landscape has changed and how IT managers can respond. Discussions 
will include endpoint security, mobile security and the latest in malware 
threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/
_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel&#174; Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply

* Re: [PATCH 05/16] mm: allow PF_MEMALLOC from softirq context
From: Andrew Morton @ 2012-05-01 22:08 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Neil Brown,
	Peter Zijlstra, Mike Christie, Eric B Munson
In-Reply-To: <1334578624-23257-6-git-send-email-mgorman@suse.de>

On Mon, 16 Apr 2012 13:16:52 +0100
Mel Gorman <mgorman@suse.de> wrote:

> This is needed to allow network softirq packet processing to make
> use of PF_MEMALLOC.

hm, why?  You just added __GFP_MEMALLOC so we don't need to futz with
PF_MEMALLOC?

> Currently softirq context cannot use PF_MEMALLOC due to it not being
> associated with a task, and therefore not having task flags to fiddle
> with - thus the gfp to alloc flag mapping ignores the task flags when
> in interrupts (hard or soft) context.
> 
> Allowing softirqs to make use of PF_MEMALLOC therefore requires some
> trickery.  We basically borrow the task flags from whatever process
> happens to be preempted by the softirq.
> 
> So we modify the gfp to alloc flags mapping to not exclude task flags
> in softirq context, and modify the softirq code to save, clear and
> restore the PF_MEMALLOC flag.
> 
> The save and clear, ensures the preempted task's PF_MEMALLOC flag
> doesn't leak into the softirq. The restore ensures a softirq's
> PF_MEMALLOC flag cannot leak back into the preempted process.
> 
> ...
>
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1913,6 +1913,13 @@ static inline void rcu_copy_process(struct task_struct *p)
>  
>  #endif
>  
> +static inline void tsk_restore_flags(struct task_struct *p,
> +				     unsigned long pflags, unsigned long mask)

The naming is poor.

p -> "tsk" or "task"
pflags -> "old_flags"
mask -> "flags"

> +{
> +	p->flags &= ~mask;
> +	p->flags |= pflags & mask;
> +}
> +
>  #ifdef CONFIG_SMP
>  extern void do_set_cpus_allowed(struct task_struct *p,
>  			       const struct cpumask *new_mask);
> diff --git a/kernel/softirq.c b/kernel/softirq.c
> index 671f959..d349caa 100644
> --- a/kernel/softirq.c
> +++ b/kernel/softirq.c
> @@ -210,6 +210,8 @@ asmlinkage void __do_softirq(void)
>  	__u32 pending;
>  	int max_restart = MAX_SOFTIRQ_RESTART;
>  	int cpu;
> +	unsigned long pflags = current->flags;

"old_flags"

> +	current->flags &= ~PF_MEMALLOC;

The line before this one would be a suitable place for a comment!

>  	pending = local_softirq_pending();
>  	account_system_vtime(current);
> @@ -265,6 +267,7 @@ restart:
>  
>  	account_system_vtime(current);
>  	__local_bh_enable(SOFTIRQ_OFFSET);
> +	tsk_restore_flags(current, pflags, PF_MEMALLOC);
>  }
>  
> ...
>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: e1000e tx queue timeout in 3.3.0 (bisected to BQL support for e1000e)
From: David Miller @ 2012-05-01 21:49 UTC (permalink / raw)
  To: greearb; +Cc: john.r.fastabend, netdev, e1000-devel, therbert
In-Reply-To: <4FA05153.1010903@candelatech.com>

From: Ben Greear <greearb@candelatech.com>
Date: Tue, 01 May 2012 14:10:43 -0700

> On 04/20/2012 02:56 PM, Ben Greear wrote:
>> On 04/20/2012 02:21 PM, Tom Herbert wrote:
>>> Thanks John for pointers to those.  Ben, are you running a kernel with
>>> these patches?
>>
>> I just tested this on my e1000e and igb machine.  With these patches,
>> I can no longer reproduce the problem.
>>
>> So, please make sure those are queued up for 3.3 stable!
> 
> Dave:  I think these patches below should go to 3.3 stable.
> 
> They are not queued for stable yet as far as I can tell.

I let the Intel developers handle -stable submissions for their
drivers.

------------------------------------------------------------------------------
Live Security Virtual Conference
Exclusive live event will cover all the ways today's security and 
threat landscape has changed and how IT managers can respond. Discussions 
will include endpoint security, mobile security and the latest in malware 
threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/
_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel&#174; Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply

* Re: [E1000-devel] e1000e tx queue timeout in 3.3.0 (bisected to BQL support for e1000e)
From: Ben Greear @ 2012-05-01 21:10 UTC (permalink / raw)
  To: Tom Herbert, David Miller; +Cc: John Fastabend, netdev, e1000-devel list
In-Reply-To: <4F91DB7B.7060601@candelatech.com>

On 04/20/2012 02:56 PM, Ben Greear wrote:
> On 04/20/2012 02:21 PM, Tom Herbert wrote:
>> Thanks John for pointers to those.  Ben, are you running a kernel with
>> these patches?
>
> I just tested this on my e1000e and igb machine.  With these patches,
> I can no longer reproduce the problem.
>
> So, please make sure those are queued up for 3.3 stable!

Dave:  I think these patches below should go to 3.3 stable.

They are not queued for stable yet as far as I can tell.

Thanks,
Ben

>
> Thanks,
> Ben
>
>>
>> Tom
>>
>>>
>>> Tom, did you see these two patches? Maybe this is resolved by
>>> the second patch.
>>>
>>> We needed these to fixup ixgbe and igb (i didn't test e1000e)
>>> looks like we might want to push these at stable. I don't
>>> believe they are in 3.3.
>>>
>>> commit b37c0fbe3f6dfba1f8ad2aed47fb40578a254635
>>> Author: Alexander Duyck<alexander.h.duyck@intel.com>
>>> Date:   Tue Feb 7 02:29:06 2012 +0000
>>>
>>>      net: Add memory barriers to prevent possible race in byte queue limits
>>>
>>>      This change adds a memory barrier to the byte queue limit code to address a
>>>      possible race as has been seen in the past with the
>>>      netif_stop_queue/netif_wake_queue logic.
>>>
>>>      Signed-off-by: Alexander Duyck<alexander.h.duyck@intel.com>
>>>      Tested-by: Stephen Ko<stephen.s.ko@intel.com>
>>>      Signed-off-by: Jeff Kirsher<jeffrey.t.kirsher@intel.com>
>>>
>>>
>>> http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=b37c0fbe3f6dfba1f8ad2aed47fb40578a254635
>>>
>>>
>>> commit 5c4903549c05bbb373479e0ce2992573c120654a
>>> Author: Alexander Duyck<alexander.h.duyck@intel.com>
>>> Date:   Tue Feb 7 02:29:01 2012 +0000
>>>
>>>      net: Fix issue with netdev_tx_reset_queue not resetting queue from XOFF state
>>>
>>>      We are seeing dev_watchdog hangs on several drivers.  I suspect this is due
>>>      to the __QUEUE_STATE_STACK_XOFF bit being set prior to a reset for link
>>>      change, and then not being cleared by netdev_tx_reset_queue.  This change
>>>      corrects that.
>>>
>>>      In addition we were seeing dev_watchdog hangs on igb after running the
>>>      ethtool tests.  We found this to be due to the fact that the ethtool test
>>>      runs the same logic as ndo_start_xmit, but we were never clearing the XOFF
>>>      flag since the loopback test in ethtool does not do byte queue accounting.
>>>
>>>      Signed-off-by: Alexander Duyck<alexander.h.duyck@intel.com>
>>>      Tested-by: Stephen Ko<stephen.s.ko@intel.com>
>>>      Signed-off-by: Jeff Kirsher<jeffrey.t.kirsher@intel.com>
>>>
>>> http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=5c4903549c05bbb373479e0ce2992573c120654a
>>>
>>>
>
>


-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply

* Re: 3.4-rc: NETDEV WATCHDOG: eth0 (r8169): transmit queue 0 timed out
From: Francois Romieu @ 2012-05-01 20:04 UTC (permalink / raw)
  To: Stefan Richter; +Cc: netdev
In-Reply-To: <20120501200423.6804dcf0@stein>

Stefan Richter <stefanr@s5r6.in-berlin.de> :
[...]
> # dmesg | grep XID
> r8169 0000:0b:00.0: eth0: RTL8168c/8111c at 0xffffc9000005e000,
> 00:23:54:91:8a:2b, XID 1c4000c0 IRQ 49

I have no idea what has gone wrong with this chipset during the
3.3 to 3.4-rc5 move.

Can you apply the patch below on top of current ethtool
(git://git.kernel.org/pub/scm/network/ethtool/ethtool.git) and see
if it is enough to compare the register dumps (ethtool -d eth0).

diff --git a/realtek.c b/realtek.c
index c3d7ae5..ac83829 100644
--- a/realtek.c
+++ b/realtek.c
@@ -33,6 +33,7 @@ enum chip_type {
 	RTL8101Ebc,
 	RTL8100E1,
 	RTL8100E2,
+	RTL8168c0,
 };

 enum {
@@ -65,6 +66,7 @@ static struct chip_info {
 	{ "RTL-8101Ebc",	HW_REVID(0, 0, 1, 1, 0, 1, 0, 0) },
 	{ "RTL-8100E(1)",	HW_REVID(0, 0, 1, 1, 0, 0, 1, 0) },
 	{ "RTL-8100E(2)",	HW_REVID(0, 0, 1, 1, 1, 0, 1, 0) },
+	{ "RTL-8168c",		HW_REVID(0, 0, 1, 1, 1, 1, 0, 1) },
 	{ }
 };

> 
> # ethtool eth0
> Settings for eth0:
[...]
>                                drv probe ifdown ifup
>         Link detected: yes

Everything seems normal and we should be able to see both link up and down
messages.

> # ethtool -i eth0
[...]

Ok. This is a firmware free chipset anyway. Nothing strange in the interface
stats (ethtool -S eth0) ?

You may have to narrow things. Can you check if the r8169.c at
036dafa28da1e2565a8529de2ae663c37b7a0060 behaves the same ?

-- 
Ueimor

^ permalink raw reply related

* Re: [PATCH 3/4 v2 net-next] net: make GRO aware of skb->head_frag
From: Alexander Duyck @ 2012-05-01 19:45 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Alexander Duyck, David Miller, netdev, Neal Cardwell, Tom Herbert,
	Jeff Kirsher, Michael Chan, Matt Carlson, Herbert Xu,
	Ben Hutchings, Ilpo Järvinen, Maciej Żenczykowski
In-Reply-To: <1335891892.22133.23.camel@edumazet-glaptop>

On 05/01/2012 10:04 AM, Eric Dumazet wrote:
> On Tue, 2012-05-01 at 09:17 -0700, Alexander Duyck wrote:
>> On 04/30/2012 11:39 PM, Eric Dumazet wrote:
>>> On Mon, 2012-04-30 at 22:33 -0700, Alexander Duyck wrote:
>>>
>>>> The question I had was more specific to GRO.  As long as we have
>>>> skb->users == 1 and the skb isn't cloned we should be fine.   It just
>>>> hadn't occurred to me before that napi_gro_receive had the extra
>>>> requirement that the skb couldn't be cloned.
>>>>
>>> OK
>>>
>>> By the way, even if skb was cloned, we would be allowed to steal
>>> skb->head.
>>>
>>> When we clone an oskb we :
>>>
>>> 1) allocate a struct nskb sk_buff (or use the shadow in case of TCP)
>>> 2) increment dataref
>> The problem I have is with this piece right here.  So you increment
>> dataref.  Now you have an skb that is still pointing to the shared info
>> on this page and dataref is 2.  What about the side that is stealing the
>> head?  Is it going to be tracking the dataref as well and decrementing
>> it before put_page or does it just assume that dataref is 1 and call
>> put_page directly?  I am guessing the latter since I didn't see anything
>> that allowed for tracking the dataref of stolen heads.
> The only changed thing is the kfree() replaced by put_page()
>
> This kfree() was done when last reference to dataref was released.
>
> If we had a problem before, we have same problem after my patch.
>
> Truth is : In TCP (coalesce and splice()) and GRO, we owns skbs.
>
> (See the various __kfree_skb(skb) calls in net/ipv4/tcp_input.c
> There is one exception in ipv6 / treq->pktopts ) but its for SYN packet
> and this wont be merged with a previous packet.
I wasn't worried about the kfree vs put_page, I was worried about the
coalesce case.  However, it looks like you are correct and I am not
seeing any issues so everything seems to be working fine.

I have a hacked together ixgbe up and running now with the new build_skb
logic and RSC/LRO disabled.  It looks like it is giving me a 5%
performance boost for small packet routing, but I am using more CPU for
netperf TCP receive tests and I was wondering if you had seen anything
similar on the tg3 driver?

Thanks,

Alex

^ permalink raw reply

* RE: [PATCH 02/13 v4] usb/net: rndis: break out <linux/rndis.h> defines
From: Haiyang Zhang @ 2012-05-01 19:42 UTC (permalink / raw)
  To: Linus Walleij
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-usb-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	Greg Kroah-Hartman, David S. Miller, Felipe Balbi,
	Jussi Kivilinna, Wei Yongjun, Ben Hutchings
In-Reply-To: <CACRpkdZLg5f9uWrT1hYSWhiL-i-=NSUtuPAGKrWcC4+LZs5ACA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>



> -----Original Message-----
> From: Linus Walleij [mailto:linus.walleij-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org]
> Sent: Tuesday, May 01, 2012 3:09 PM
> To: Haiyang Zhang
> Cc: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; linux-usb-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; Greg Kroah-Hartman;
> David S. Miller; Felipe Balbi; Jussi Kivilinna; Wei Yongjun; Ben Hutchings
> Subject: Re: [PATCH 02/13 v4] usb/net: rndis: break out <linux/rndis.h>
> defines
> 
> On Tue, May 1, 2012 at 8:54 PM, Haiyang Zhang <haiyangz-0li6OtcxBFHby3iVrkZq2A@public.gmane.org>
> wrote:
> 
> >> ChangeLog v2->v3:
> >> - Squashed subsequent patch removing duplicate entries into
> >>   this patch so it's bisectable without any compilation warnings.
> >
> > I have not reviewed this version yet.
> >
> > Also, I think you should merge the "ChangeLog v2->v3" into the patch
> > comments. Don't need to say "The compilation screams about double-
> defines
> > all over the place" any more.
> 
> This one is completely bogus, please review the v4 versions of the
> patches instead. I was halfway on this one and screwed it up
> somehow, I've had better days :-(

Yes, I'm replying to your v4 patch. It contains the "ChangeLog v2->v3", and
still says "The compilation screams about double-defines all over the place"
in the comments.

Thanks,
- Haiyang


--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH net-next] netem: add ECN capability
From: David Miller @ 2012-05-01 19:13 UTC (permalink / raw)
  To: dave.taht; +Cc: shemminger, eric.dumazet, netdev, therbert, ncardwell, hagen
In-Reply-To: <CAA93jw7r8UTTEy9KjmQekbuJ6U7hVVURtr0z9pw9L8wa67BUZA@mail.gmail.com>

From: Dave Taht <dave.taht@gmail.com>
Date: Tue, 1 May 2012 11:49:51 -0700

> As I think about it, I think he could handle the ninja problem, tho.

Eric is the Networking Ninja.

^ permalink raw reply

* Re: [PATCH 02/13 v4] usb/net: rndis: break out <linux/rndis.h> defines
From: Linus Walleij @ 2012-05-01 19:09 UTC (permalink / raw)
  To: Haiyang Zhang
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-usb-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	Greg Kroah-Hartman, David S. Miller, Felipe Balbi,
	Jussi Kivilinna, Wei Yongjun, Ben Hutchings
In-Reply-To: <A1F3067C9B68744AA19F6802BAB8FFDC0DD3CAAA-Jfd81uAzPQthtgIzimG1dFir+X/St4rqwBk/1ggFUS45P9zcU8sUGwC/G2K4zDHf@public.gmane.org>

On Tue, May 1, 2012 at 8:54 PM, Haiyang Zhang <haiyangz-0li6OtcxBFHby3iVrkZq2A@public.gmane.org> wrote:

>> ChangeLog v2->v3:
>> - Squashed subsequent patch removing duplicate entries into
>>   this patch so it's bisectable without any compilation warnings.
>
> I have not reviewed this version yet.
>
> Also, I think you should merge the "ChangeLog v2->v3" into the patch
> comments. Don't need to say "The compilation screams about double-defines
> all over the place" any more.

This one is completely bogus, please review the v4 versions of the
patches instead. I was halfway on this one and screwed it up
somehow, I've had better days :-(

Thanks,
Linus Walleij
--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* RE: [PATCH 02/13 v4] usb/net: rndis: break out <linux/rndis.h> defines
From: Haiyang Zhang @ 2012-05-01 18:54 UTC (permalink / raw)
  To: Linus Walleij, netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-usb-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	Greg Kroah-Hartman, David S. Miller, Felipe Balbi
  Cc: Jussi Kivilinna, Wei Yongjun, Ben Hutchings
In-Reply-To: <1335896546-13315-1-git-send-email-linus.walleij-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>



> -----Original Message-----
> From: Linus Walleij [mailto:linus.walleij-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org]
> Sent: Tuesday, May 01, 2012 2:22 PM
> To: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; linux-usb-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; Greg Kroah-Hartman;
> David S. Miller; Felipe Balbi
> Cc: Jussi Kivilinna; Haiyang Zhang; Wei Yongjun; Ben Hutchings; Linus
> Walleij
> Subject: [PATCH 02/13 v4] usb/net: rndis: break out <linux/rndis.h>
> defines
> 
> As a first step to consolidate the RNDIS implementations, break out
> a common file with all the #defines and move it to <linux/rndis.h>.
> The compilation screams about double-defines all over the place
> (as could be expected, this is why we're consolidating) but it
> compiles cleanly still.
> 
> This also deletes the immediate duplicated defines in the
> <linux/rndis.h> file that yields a lot of compilation warnings.
> 
> Reviewed-by: Haiyang Zhang <haiyangz-0li6OtcxBFHby3iVrkZq2A@public.gmane.org>
> Signed-off-by: Linus Walleij <linus.walleij-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>
> ---
> ChangeLog v2->v3:
> - Squashed subsequent patch removing duplicate entries into
>   this patch so it's bisectable without any compilation warnings.

I have not reviewed this version yet.

Also, I think you should merge the "ChangeLog v2->v3" into the patch
comments. Don't need to say "The compilation screams about double-defines
all over the place" any more.

Thanks,
- Haiyang


--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH net-next] netem: add ECN capability
From: Dave Taht @ 2012-05-01 18:49 UTC (permalink / raw)
  To: David Miller; +Cc: shemminger, eric.dumazet, netdev, therbert, ncardwell, hagen
In-Reply-To: <20120501.131750.636505003036346376.davem@davemloft.net>

On Tue, May 1, 2012 at 10:17 AM, David Miller <davem@davemloft.net> wrote:
> From: Stephen Hemminger <shemminger@vyatta.com>
> Date: Tue, 1 May 2012 09:59:38 -0700
>
>> On Tue, 01 May 2012 09:40:51 -0400 (EDT)
>> David Miller <davem@davemloft.net> wrote:
>>
>>> From: Eric Dumazet <eric.dumazet@gmail.com>
>>> Date: Tue, 01 May 2012 11:11:05 +0200
>>>
>>> > From: Eric Dumazet <edumazet@google.com>
>>> >
>>> > Add ECN (Explicit Congestion Notification) marking capability to netem
>>> >
>>> > tc qdisc add dev eth0 root netem drop 0.5 ecn
>>> >
>>> > Instead of dropping packets, try to ECN mark them.
>>> >
>>> > Signed-off-by: Eric Dumazet <edumazet@google.com>
>>>
>>> Applied.
>>
>> At least give a day of review, rather than premature acceptance
>
> Yeah, because Eric Dumazet is going to fall off the face of the
> planet and not fix whatever problems you have with his work.
>
> Get real Stephen.

I'd be more concerned about wayward buses, or a team of elite
ninja's hired by outraged BSD and windows hackers sent to slow
eric down.

As I think about it, I think he could handle the ninja problem, tho.



-- 
Dave Täht
SKYPE: davetaht
US Tel: 1-239-829-5608
http://www.bufferbloat.net

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox