Netdev List
 help / color / mirror / Atom feed
* Re: [RFC Patch net-next 1/4] net_sched: switch filter list to list_head
From: John Fastabend @ 2014-01-10  3:48 UTC (permalink / raw)
  To: Cong Wang; +Cc: netdev, Eric Dumazet, David S. Miller, Jamal Hadi Salim
In-Reply-To: <1389291593-2494-2-git-send-email-xiyou.wangcong@gmail.com>

On 01/09/2014 10:19 AM, Cong Wang wrote:
> So that it could be potentially traversed with RCU.
>
> Cc: John Fastabend <john.fastabend@gmail.com>
> Cc: Eric Dumazet <eric.dumazet@gmail.com>
> Cc: David S. Miller <davem@davemloft.net>
> Cc: Jamal Hadi Salim <jhs@mojatatu.com>
> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
> ---
>   include/net/pkt_sched.h   |  4 ++--
>   include/net/sch_generic.h |  9 ++++++---
>   net/sched/cls_api.c       | 36 ++++++++++++++++++++++++------------
>   net/sched/sch_api.c       | 35 ++++++++++++++++++++++-------------
>   net/sched/sch_atm.c       | 14 +++++++-------
>   net/sched/sch_cbq.c       |  9 +++++----
>   net/sched/sch_choke.c     | 11 ++++++-----
>   net/sched/sch_drr.c       |  7 ++++---
>   net/sched/sch_dsmark.c    |  7 ++++---
>   net/sched/sch_fq_codel.c  |  9 +++++----
>   net/sched/sch_hfsc.c      | 15 +++++++++------
>   net/sched/sch_htb.c       | 20 ++++++++++++--------
>   net/sched/sch_ingress.c   | 14 +++++++++++---
>   net/sched/sch_multiq.c    |  7 ++++---
>   net/sched/sch_prio.c      |  9 +++++----
>   net/sched/sch_qfq.c       |  7 ++++---
>   net/sched/sch_sfb.c       |  9 +++++----
>   net/sched/sch_sfq.c       | 11 ++++++-----
>   18 files changed, 141 insertions(+), 92 deletions(-)
>
> diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
> index 891d80d..27a1efa 100644
> --- a/include/net/pkt_sched.h
> +++ b/include/net/pkt_sched.h
> @@ -109,9 +109,9 @@ static inline void qdisc_run(struct Qdisc *q)
>   		__qdisc_run(q);
>   }
>
> -int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
> +int tc_classify_compat(struct sk_buff *skb, const struct list_head *head,
>   		       struct tcf_result *res);
> -int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
> +int tc_classify(struct sk_buff *skb, const struct list_head *head,
>   		struct tcf_result *res);
>
>   /* Calculate maximal size of packet seen by hard_start_xmit
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index 013d96d..97123cc 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -6,6 +6,7 @@
>   #include <linux/rcupdate.h>
>   #include <linux/pkt_sched.h>
>   #include <linux/pkt_cls.h>
> +#include <linux/list.h>
>   #include <net/gen_stats.h>
>   #include <net/rtnetlink.h>
>
> @@ -143,7 +144,7 @@ struct Qdisc_class_ops {
>   	void			(*walk)(struct Qdisc *, struct qdisc_walker * arg);
>
>   	/* Filter manipulation */
> -	struct tcf_proto **	(*tcf_chain)(struct Qdisc *, unsigned long);
> +	struct list_head *	(*tcf_chain)(struct Qdisc *, unsigned long);

I'm not sure, I thought it was just fine the way it was. That pattern
exists in a few other places inside the networking stack and we don't
go around converting them to lists. But that is just my opinion.

>   	unsigned long		(*bind_tcf)(struct Qdisc *, unsigned long,
>   					u32 classid);
>   	void			(*unbind_tcf)(struct Qdisc *, unsigned long);
> @@ -184,6 +185,8 @@ struct tcf_result {
>   	u32		classid;
>   };
>
> +struct tcf_proto;
> +
>   struct tcf_proto_ops {
>   	struct list_head	head;
>   	char			kind[IFNAMSIZ];
> @@ -212,7 +215,7 @@ struct tcf_proto_ops {
>
>   struct tcf_proto {
>   	/* Fast access part */
> -	struct tcf_proto	*next;
> +	struct list_head	head;
>   	void			*root;
>   	int			(*classify)(struct sk_buff *,
>   					    const struct tcf_proto *,
> @@ -376,7 +379,7 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
>   void __qdisc_calculate_pkt_len(struct sk_buff *skb,
>   			       const struct qdisc_size_table *stab);
>   void tcf_destroy(struct tcf_proto *tp);
> -void tcf_destroy_chain(struct tcf_proto **fl);
> +void tcf_destroy_chain(struct list_head *fl);
>
>   /* Reset all TX qdiscs greater then index of a device.  */
>   static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i)
> diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
> index d8c42b1..bcc9987 100644
> --- a/net/sched/cls_api.c
> +++ b/net/sched/cls_api.c
> @@ -125,8 +125,9 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
>   	u32 parent;
>   	struct net_device *dev;
>   	struct Qdisc  *q;
> -	struct tcf_proto **back, **chain;
> -	struct tcf_proto *tp;
> +	struct tcf_proto *back;
> +	struct list_head *chain;
> +	struct tcf_proto *tp, *res = NULL;
>   	const struct tcf_proto_ops *tp_ops;
>   	const struct Qdisc_class_ops *cops;
>   	unsigned long cl;
> @@ -196,21 +197,27 @@ replay:
>   		goto errout;
>
>   	/* Check the chain for existence of proto-tcf with this priority */
> -	for (back = chain; (tp = *back) != NULL; back = &tp->next) {
> +	rcu_read_lock();

why rcu_read_lock() this is under rtnl lock.

> +	list_for_each_entry_rcu(tp, chain, head) {

No need for the rcu you are under the writer lock.

> +		back = list_next_entry(tp, head);
>   		if (tp->prio >= prio) {
>   			if (tp->prio == prio) {
>   				if (!nprio ||
> -				    (tp->protocol != protocol && protocol))
> +				    (tp->protocol != protocol && protocol)) {
> +					rcu_read_unlock();
>   					goto errout;
> +				}
> +				res = tp;
>   			} else
> -				tp = NULL;
> +				res = NULL;
>   			break;
>   		}
>   	}
> +	rcu_read_unlock();

ditto

>
>   	root_lock = qdisc_root_sleeping_lock(q);
>
> -	if (tp == NULL) {
> +	if ((tp = res) == NULL) {
>   		/* Proto-tcf does not exist, create new one */
>
>   		if (tca[TCA_KIND] == NULL || !protocol)
> @@ -228,6 +235,7 @@ replay:
>   		tp = kzalloc(sizeof(*tp), GFP_KERNEL);
>   		if (tp == NULL)
>   			goto errout;
> +		INIT_LIST_HEAD(&tp->head);
>   		err = -ENOENT;
>   		tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND]);
>   		if (tp_ops == NULL) {
> @@ -258,7 +266,7 @@ replay:
>   		}
>   		tp->ops = tp_ops;
>   		tp->protocol = protocol;
> -		tp->prio = nprio ? : TC_H_MAJ(tcf_auto_prio(*back));
> +		tp->prio = nprio ? : TC_H_MAJ(tcf_auto_prio(back));
>   		tp->q = q;
>   		tp->classify = tp_ops->classify;
>   		tp->classid = parent;
> @@ -280,7 +288,7 @@ replay:
>   	if (fh == 0) {
>   		if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
>   			spin_lock_bh(root_lock);
> -			*back = tp->next;
> +			list_del_rcu(&tp->head);
>   			spin_unlock_bh(root_lock);
>
>   			tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER);
> @@ -321,8 +329,7 @@ replay:
>   	if (err == 0) {
>   		if (tp_created) {
>   			spin_lock_bh(root_lock);
> -			tp->next = *back;
> -			*back = tp;
> +			list_add_rcu(&tp->head, chain);
>   			spin_unlock_bh(root_lock);
>   		}
>   		tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER);
> @@ -417,7 +424,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
>   	int s_t;
>   	struct net_device *dev;
>   	struct Qdisc *q;
> -	struct tcf_proto *tp, **chain;
> +	struct tcf_proto *tp;
> +	struct list_head *chain;
>   	struct tcmsg *tcm = nlmsg_data(cb->nlh);
>   	unsigned long cl = 0;
>   	const struct Qdisc_class_ops *cops;
> @@ -451,7 +459,9 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
>
>   	s_t = cb->args[0];
>
> -	for (tp = *chain, t = 0; tp; tp = tp->next, t++) {
> +	t = 0;
> +	rcu_read_lock();

same under rtnl right?

> +	list_for_each_entry_rcu(tp, chain, head) {

... so the _rcu can be dropped.

>   		if (t < s_t)
>   			continue;
>   		if (TC_H_MAJ(tcm->tcm_info) &&
> @@ -482,7 +492,9 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
>   		cb->args[1] = arg.w.count + 1;
>   		if (arg.w.stop)
>   			break;
> +		t++;
>   	}
> +	rcu_read_unlock();
>
>   	cb->args[0] = t;
>
> diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
> index 1313145..df86686 100644
> --- a/net/sched/sch_api.c
> +++ b/net/sched/sch_api.c
> @@ -29,6 +29,7 @@
>   #include <linux/hrtimer.h>
>   #include <linux/lockdep.h>
>   #include <linux/slab.h>
> +#include <linux/rculist.h>
>
>   #include <net/net_namespace.h>
>   #include <net/sock.h>
> @@ -1772,13 +1773,15 @@ done:
>    * to this qdisc, (optionally) tests for protocol and asks
>    * specific classifiers.
>    */
> -int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
> +int tc_classify_compat(struct sk_buff *skb, const struct list_head *head,

notice you don't check for a null head value here so it better not be
null.

>   		       struct tcf_result *res)
>   {
> +	struct tcf_proto *tp;
>   	__be16 protocol = skb->protocol;
> -	int err;
> +	int err = -1;
>
> -	for (; tp; tp = tp->next) {
> +	WARN_ON_ONCE(!rcu_read_lock_held());

its just rfc code but no need for the warn on.

> +	list_for_each_entry_rcu(tp, head, head) {
>   		if (tp->protocol != protocol &&
>   		    tp->protocol != htons(ETH_P_ALL))
>   			continue;
> @@ -1789,23 +1792,25 @@ int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
>   			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
>   				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
>   #endif
> -			return err;
> +			break;
>   		}
>   	}
> -	return -1;
> +
> +	return err;
>   }
>   EXPORT_SYMBOL(tc_classify_compat);
>
> -int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
> +int tc_classify(struct sk_buff *skb, const struct list_head *head,
>   		struct tcf_result *res)
>   {
>   	int err = 0;
>   #ifdef CONFIG_NET_CLS_ACT
> -	const struct tcf_proto *otp = tp;
> +	const struct tcf_proto *tp;
> +	const struct tcf_proto *otp = list_first_entry(head, struct tcf_proto, head);
>   reclassify:
>   #endif
>
> -	err = tc_classify_compat(skb, tp, res);
> +	err = tc_classify_compat(skb, head, res);
>   #ifdef CONFIG_NET_CLS_ACT
>   	if (err == TC_ACT_RECLASSIFY) {
>   		u32 verd = G_TC_VERD(skb->tc_verd);
> @@ -1830,16 +1835,20 @@ void tcf_destroy(struct tcf_proto *tp)
>   {
>   	tp->ops->destroy(tp);
>   	module_put(tp->ops->owner);
> +	synchronize_rcu();

maybe call_rcu we don't wan't to syncronize during a destroy chain
operation.

>   	kfree(tp);
>   }
>
> -void tcf_destroy_chain(struct tcf_proto **fl)
> +void tcf_destroy_chain(struct list_head *fl)
>   {
> -	struct tcf_proto *tp;
> +	struct tcf_proto *tp, *n;
> +	LIST_HEAD(list);
> +	list_splice_init_rcu(fl, &list, synchronize_rcu);
>
> -	while ((tp = *fl) != NULL) {
> -		*fl = tp->next;
> -		tcf_destroy(tp);
> +	list_for_each_entry_safe(tp, n, &list, head) {
> +		tp->ops->destroy(tp);
> +		module_put(tp->ops->owner);
> +		kfree(tp);

fix the sync above and then there is no reason as far as I can tell
to change this code. Or at least that should be kfree_rcu().

>   	}
>   }
>   EXPORT_SYMBOL(tcf_destroy_chain);
> diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
> index 1f9c314..20a07c2 100644
> --- a/net/sched/sch_atm.c
> +++ b/net/sched/sch_atm.c
> @@ -41,7 +41,7 @@
>
>   struct atm_flow_data {
>   	struct Qdisc		*q;	/* FIFO, TBF, etc. */
> -	struct tcf_proto	*filter_list;
> +	struct list_head 	filter_list;
>   	struct atm_vcc		*vcc;	/* VCC; NULL if VCC is closed */
>   	void			(*old_pop)(struct atm_vcc *vcc,
>   					   struct sk_buff *skb); /* chaining */
> @@ -273,7 +273,7 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
>   		error = -ENOBUFS;
>   		goto err_out;
>   	}
> -	flow->filter_list = NULL;
> +	INIT_LIST_HEAD(&flow->filter_list);
>   	flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
>   	if (!flow->q)
>   		flow->q = &noop_qdisc;
> @@ -311,7 +311,7 @@ static int atm_tc_delete(struct Qdisc *sch, unsigned long arg)
>   	pr_debug("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
>   	if (list_empty(&flow->list))
>   		return -EINVAL;
> -	if (flow->filter_list || flow == &p->link)
> +	if (!list_empty(&flow->filter_list) || flow == &p->link)
>   		return -EBUSY;
>   	/*
>   	 * Reference count must be 2: one for "keepalive" (set at class
> @@ -345,7 +345,7 @@ static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker)
>   	}
>   }
>
> -static struct tcf_proto **atm_tc_find_tcf(struct Qdisc *sch, unsigned long cl)
> +static struct list_head *atm_tc_find_tcf(struct Qdisc *sch, unsigned long cl)
>   {
>   	struct atm_qdisc_data *p = qdisc_priv(sch);
>   	struct atm_flow_data *flow = (struct atm_flow_data *)cl;
> @@ -370,9 +370,9 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
>   	if (TC_H_MAJ(skb->priority) != sch->handle ||
>   	    !(flow = (struct atm_flow_data *)atm_tc_get(sch, skb->priority))) {
>   		list_for_each_entry(flow, &p->flows, list) {
> -			if (flow->filter_list) {
> +			if (!list_empty(&flow->filter_list)) {
>   				result = tc_classify_compat(skb,
> -							    flow->filter_list,
> +							    &flow->filter_list,
>   							    &res);
>   				if (result < 0)
>   					continue;
> @@ -544,7 +544,7 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt)
>   	if (!p->link.q)
>   		p->link.q = &noop_qdisc;
>   	pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q);
> -	p->link.filter_list = NULL;
> +	INIT_LIST_HEAD(&p->link.filter_list);
>   	p->link.vcc = NULL;
>   	p->link.sock = NULL;
>   	p->link.classid = sch->handle;
> diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
> index 2f80d01..a5914ff 100644
> --- a/net/sched/sch_cbq.c
> +++ b/net/sched/sch_cbq.c
> @@ -133,7 +133,7 @@ struct cbq_class {
>   	struct gnet_stats_rate_est64 rate_est;
>   	struct tc_cbq_xstats	xstats;
>
> -	struct tcf_proto	*filter_list;
> +	struct list_head	filter_list;
>
>   	int			refcnt;
>   	int			filters;
> @@ -239,8 +239,8 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
>   		/*
>   		 * Step 2+n. Apply classifier.
>   		 */
> -		if (!head->filter_list ||
> -		    (result = tc_classify_compat(skb, head->filter_list, &res)) < 0)
> +		if (list_empty(&head->filter_list) ||
> +		    (result = tc_classify_compat(skb, &head->filter_list, &res)) < 0)

there is a race here, if the list is not empty when you check it but
emptied by the time you call tc_classify_compat.

Also notice you accessed a list without an _rcu function. If you grep
for list_empty_rcu() iirc its not even defined to stop this sort of
error.


>   			goto fallback;
>
>   		cl = (void *)res.class;
> @@ -1881,6 +1881,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
>   		}
>   	}
>
> +	INIT_LIST_HEAD(&cl->filter_list);
>   	cl->R_tab = rtab;
>   	rtab = NULL;
>   	cl->refcnt = 1;
> @@ -1976,7 +1977,7 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg)
>   	return 0;
>   }
>
> -static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
> +static struct list_head *cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
>   {
>   	struct cbq_sched_data *q = qdisc_priv(sch);
>   	struct cbq_class *cl = (struct cbq_class *)arg;
> diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
> index ddd73cb..9b36830 100644
> --- a/net/sched/sch_choke.c
> +++ b/net/sched/sch_choke.c
> @@ -58,7 +58,7 @@ struct choke_sched_data {
>
>   /* Variables */
>   	struct red_vars  vars;
> -	struct tcf_proto *filter_list;
> +	struct list_head filter_list;
>   	struct {
>   		u32	prob_drop;	/* Early probability drops */
>   		u32	prob_mark;	/* Early probability marks */
> @@ -202,7 +202,7 @@ static bool choke_classify(struct sk_buff *skb,
>   	struct tcf_result res;
>   	int result;
>
> -	result = tc_classify(skb, q->filter_list, &res);
> +	result = tc_classify(skb, &q->filter_list, &res);

hmm q->filter_list passed to tc_classify without rcu_deref.

>   	if (result >= 0) {
>   #ifdef CONFIG_NET_CLS_ACT
>   		switch (result) {
> @@ -256,7 +256,7 @@ static bool choke_match_random(const struct choke_sched_data *q,
>   		return false;
>
>   	oskb = choke_peek_random(q, pidx);
> -	if (q->filter_list)
> +	if (!list_empty(&q->filter_list))
>   		return choke_get_classid(nskb) == choke_get_classid(oskb);

same class of list_empty bug?

>
>   	return choke_match_flow(oskb, nskb);
> @@ -268,7 +268,7 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
>   	const struct red_parms *p = &q->parms;
>   	int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
>
> -	if (q->filter_list) {
> +	if (!list_empty(&q->filter_list)) {
>   		/* If using external classifiers, get result and record it. */
>   		if (!choke_classify(skb, sch, &ret))
>   			goto other_drop;	/* Packet was eaten by filter */
> @@ -476,6 +476,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt)
>
>   	q->flags = ctl->flags;
>   	q->limit = ctl->limit;
> +	INIT_LIST_HEAD(&q->filter_list);
>
>   	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
>   		      ctl->Plog, ctl->Scell_log,
> @@ -566,7 +567,7 @@ static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
>   	return 0;
>   }
>
> -static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl)
> +static struct list_head *choke_find_tcf(struct Qdisc *sch, unsigned long cl)
>   {
>   	struct choke_sched_data *q = qdisc_priv(sch);
>
> diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
> index 8302717..62f45ac 100644
> --- a/net/sched/sch_drr.c
> +++ b/net/sched/sch_drr.c
> @@ -35,7 +35,7 @@ struct drr_class {
>
>   struct drr_sched {
>   	struct list_head		active;
> -	struct tcf_proto		*filter_list;
> +	struct list_head		filter_list;

also all of this needs to be annotated.

>   	struct Qdisc_class_hash		clhash;
>   };
>
> @@ -184,7 +184,7 @@ static void drr_put_class(struct Qdisc *sch, unsigned long arg)
>   		drr_destroy_class(sch, cl);
>   }
>
> -static struct tcf_proto **drr_tcf_chain(struct Qdisc *sch, unsigned long cl)
> +static struct list_head *drr_tcf_chain(struct Qdisc *sch, unsigned long cl)
>   {
>   	struct drr_sched *q = qdisc_priv(sch);
>
> @@ -328,7 +328,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch,
>   	}
>
>   	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
> -	result = tc_classify(skb, q->filter_list, &res);
> +	result = tc_classify(skb, &q->filter_list, &res);

ditto no rcu_deref.

>   	if (result >= 0) {
>   #ifdef CONFIG_NET_CLS_ACT
>   		switch (result) {
> @@ -443,6 +443,7 @@ static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
>   	if (err < 0)
>   		return err;
>   	INIT_LIST_HEAD(&q->active);
> +	INIT_LIST_HEAD(&q->filter_list);
>   	return 0;
>   }
>
> diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
> index 49d6ef3..4489ffe 100644
> --- a/net/sched/sch_dsmark.c
> +++ b/net/sched/sch_dsmark.c
> @@ -37,7 +37,7 @@
>
>   struct dsmark_qdisc_data {
>   	struct Qdisc		*q;
> -	struct tcf_proto	*filter_list;
> +	struct list_head	filter_list;

annotate and let smatch catch a lot of bugs for you. Same comment
for all the above instantiates I didn't comment on.

>   	u8			*mask;	/* "owns" the array */
>   	u8			*value;
>   	u16			indices;
> @@ -186,7 +186,7 @@ ignore:
>   	}
>   }
>
> -static inline struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch,
> +static inline struct list_head *dsmark_find_tcf(struct Qdisc *sch,
>   						 unsigned long cl)
>   {
>   	struct dsmark_qdisc_data *p = qdisc_priv(sch);
> @@ -229,7 +229,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
>   		skb->tc_index = TC_H_MIN(skb->priority);
>   	else {
>   		struct tcf_result res;
> -		int result = tc_classify(skb, p->filter_list, &res);
> +		int result = tc_classify(skb, &p->filter_list, &res);

rcu deref.

[...]

OK now I'm just repeating myself. You see the trend.

Additionally I don't see how any of the cls_*c change routines work in
your series. For example look at basic_change. Even with the rtnl lock
you need to do a read, copy, update (RCU namesake) you can't just modify
it in place like you have done.

I'll send a fixed up series out in a few minutes it should illustrate
my point.

.John

-- 
John Fastabend         Intel Corporation

^ permalink raw reply

* Re: Use of 'SIOCDEVPRIVATE' in ethernet drivers.
From: Giri Reddy @ 2014-01-10  3:45 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: Stephen Hemminger, David Miller, netdev
In-Reply-To: <1389301775.2025.43.camel@bwh-desktop.uk.level5networks.com>

On 1/9/14 1:09 PM, Ben Hutchings wrote:
> On Wed, 2014-01-08 at 16:02 +0000, Giri Reddy wrote:
>> Thanks for the input. I will explore the 'ethtool' option. We will have
>> multiple control frames exchanged between user space and kernel for
>> every flash update operation, most of this will be proprietary stuff
>> that will get exchanged - I will explore how to provide a generic
>> interface for that.
> [...]
>
> What you need to do beyond read, write, erase and reset?

Thanks for detailed note about various options for firmware update in
the other email. Aside form the payload we also want control frames
(proprietary) to be exchanged between our firmware and management
application.

Giri

>
> Ben.
>


________________________________

This message and any attached documents contain information from QLogic Corporation or its wholly-owned subsidiaries that may be confidential. If you are not the intended recipient, you may not read, copy, distribute, or use this information. If you have received this transmission in error, please notify the sender immediately by reply e-mail and then delete this message.

^ permalink raw reply

* RE: [PATCH 3/6] [v5] phylib: turn genphy_driver to an array
From: Shaohui Xie @ 2014-01-10  3:24 UTC (permalink / raw)
  To: David Miller, shh.xie@gmail.com
  Cc: jg1.han@samsung.com, mugunthanvnm@ti.com, f.fainelli@gmail.com,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <20140109.214447.242793673746496044.davem@davemloft.net>

> -----Original Message-----
> From: David Miller [mailto:davem@davemloft.net]
> Sent: Friday, January 10, 2014 10:45 AM
> To: shh.xie@gmail.com
> Cc: jg1.han@samsung.com; mugunthanvnm@ti.com; f.fainelli@gmail.com;
> netdev@vger.kernel.org; linux-kernel@vger.kernel.org; Xie Shaohui-B21989
> Subject: Re: [PATCH 3/6] [v5] phylib: turn genphy_driver to an array
> 
> 
> Many of these patches give rejects when I try to apply them to net-next,
> you will need to respin this series.
> 
[S.H] These patches were based on Linus's tree. I'll rebase them to the net-next.
I'm Sorry for inconvenient!

Thank you!

Best Regards, 
Shaohui Xie

^ permalink raw reply

* Re: [RFC Patch net-next 2/4] net_sched: avoid holding qdisc lock for filters
From: John Fastabend @ 2014-01-10  3:13 UTC (permalink / raw)
  To: Cong Wang; +Cc: netdev, Eric Dumazet, David S. Miller, Jamal Hadi Salim
In-Reply-To: <1389291593-2494-3-git-send-email-xiyou.wangcong@gmail.com>

On 01/09/2014 10:19 AM, Cong Wang wrote:
> Cc: John Fastabend <john.fastabend@gmail.com>
> Cc: Eric Dumazet <eric.dumazet@gmail.com>
> Cc: David S. Miller <davem@davemloft.net>
> Cc: Jamal Hadi Salim <jhs@mojatatu.com>
> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
> ---
>   include/net/pkt_cls.h     | 25 ++++---------------------
>   include/net/sch_generic.h |  5 +++--
>   net/sched/cls_api.c       | 12 +++++-------
>   3 files changed, 12 insertions(+), 30 deletions(-)
>
> diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
> index 50ea079..9afb15b 100644
> --- a/include/net/pkt_cls.h
> +++ b/include/net/pkt_cls.h
> @@ -18,26 +18,9 @@ int register_tcf_proto_ops(struct tcf_proto_ops *ops);
>   int unregister_tcf_proto_ops(struct tcf_proto_ops *ops);
>
>   static inline unsigned long
> -__cls_set_class(unsigned long *clp, unsigned long cl)
> +cls_set_class(unsigned long *clp, unsigned long cl)
>   {

commit message should mention this conversion to xchg.

> -	unsigned long old_cl;
> -
> -	old_cl = *clp;
> -	*clp = cl;
> -	return old_cl;
> -}
> -
> -static inline unsigned long
> -cls_set_class(struct tcf_proto *tp, unsigned long *clp,
> -	unsigned long cl)
> -{
> -	unsigned long old_cl;
> -	
> -	tcf_tree_lock(tp);
> -	old_cl = __cls_set_class(clp, cl);
> -	tcf_tree_unlock(tp);
> -
> -	return old_cl;
> +	return xchg(clp, cl);
>   }
>
>   static inline void
> @@ -46,7 +29,7 @@ tcf_bind_filter(struct tcf_proto *tp, struct tcf_result *r, unsigned long base)
>   	unsigned long cl;
>
>   	cl = tp->q->ops->cl_ops->bind_tcf(tp->q, base, r->classid);
> -	cl = cls_set_class(tp, &r->class, cl);
> +	cl = cls_set_class(&r->class, cl);
>   	if (cl)
>   		tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
>   }
> @@ -56,7 +39,7 @@ tcf_unbind_filter(struct tcf_proto *tp, struct tcf_result *r)
>   {
>   	unsigned long cl;
>
> -	if ((cl = __cls_set_class(&r->class, 0)) != 0)
> +	if ((cl = cls_set_class(&r->class, 0)) != 0)
>   		tp->q->ops->cl_ops->unbind_tcf(tp->q, cl);
>   }
>
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index 97123cc..000ce54 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -215,6 +215,7 @@ struct tcf_proto_ops {
>
>   struct tcf_proto {
>   	/* Fast access part */
> +	spinlock_t		lock;

[...]

Th lock can just be dropped all these operations are initiated via
rtnetlink and hence have rtnl held.

.John
-- 
John Fastabend         Intel Corporation

^ permalink raw reply

* [PATCH 1/1] drivers: net: silence compiler warning in smc91x.c
From: Pankaj Dubey @ 2014-01-10  3:04 UTC (permalink / raw)
  To: linux-samsung-soc, linux-arm-kernel
  Cc: Pankaj Dubey, David S. Miller, Jingoo Han, netdev

If used 64 bit compiler GCC warns that:

drivers/net/ethernet/smsc/smc91x.c:1897:7:
warning: cast from pointer to integer of different
size [-Wpointer-to-int-cast]

This patch fixes this by changing typecase from "unsigned int" to "unsigned long"

CC: "David S. Miller" <davem@davemloft.net>
CC: Jingoo Han <jg1.han@samsung.com> 
CC: netdev@vger.kernel.org
Signed-off-by: Pankaj Dubey <pankaj.dubey@samsung.com>
---
 drivers/net/ethernet/smsc/smc91x.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/smsc/smc91x.c b/drivers/net/ethernet/smsc/smc91x.c
index 0c9b5d9..42e79b4 100644
--- a/drivers/net/ethernet/smsc/smc91x.c
+++ b/drivers/net/ethernet/smsc/smc91x.c
@@ -1894,7 +1894,7 @@ static int smc_probe(struct net_device *dev, void __iomem *ioaddr,
 	SMC_SELECT_BANK(lp, 1);
 	val = SMC_GET_BASE(lp);
 	val = ((val & 0x1F00) >> 3) << SMC_IO_SHIFT;
-	if (((unsigned int)ioaddr & (0x3e0 << SMC_IO_SHIFT)) != val) {
+	if (((unsigned long)ioaddr & (0x3e0 << SMC_IO_SHIFT)) != val) {
 		netdev_warn(dev, "%s: IOADDR %p doesn't match configuration (%x).\n",
 			    CARDNAME, ioaddr, val);
 	}
-- 
1.7.10.4

^ permalink raw reply related

* Re: [PATCH net-next] net: xfrm6: silence sparse warning
From: Ben Hutchings @ 2014-01-10  2:53 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, ying.xue, steffen.klassert, dborkman, netdev
In-Reply-To: <20140109150451.30b57b2b@nehalam.linuxnetplumber.net>

On Thu, 2014-01-09 at 15:04 -0800, Stephen Hemminger wrote:
> On Wed, 08 Jan 2014 00:56:56 -0500 (EST)
> David Miller <davem@davemloft.net> wrote:
> 
> > From: Ying Xue <ying.xue@windriver.com>
> > Date: Wed, 8 Jan 2014 13:55:09 +0800
> > 
> > > 3. Just drop the patch
> > 
> > This is the only suitable thing to do.
> > --
> > To unsubscribe from this list: send the line "unsubscribe netdev" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> There was some patches floating around to eliminate uses of variable
> length array in the kernel since they aren't supported by CLANG.

I think you're mixing up two different things there.  VLAs are part of
C99 so I'm quite sure Clang supports them (in general).  But they seem
to be deprecated in the kernel anyway, maybe because they make it easy
to introduce a stack overflow.

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* Re: [PATCH net] bnx2x: prevent WARN during driver unload
From: David Miller @ 2014-01-10  2:46 UTC (permalink / raw)
  To: yuvalmin; +Cc: netdev, dmitry, ariele
In-Reply-To: <979A8436335E3744ADCD3A9F2A2B68A52AF240A3@SJEXCHMB10.corp.ad.broadcom.com>

From: Yuval Mintz <yuvalmin@broadcom.com>
Date: Wed, 8 Jan 2014 07:05:36 +0000

> True; but we didn't reach this bit of intuitive code on our own -
> this is practically a porting of 27d9ce3f "ixgbe: fix qv_lock_napi
> call in ixgbe_napi_disable_all" into the bnx2x driver.

Fair enough, patch applied, thanks.

^ permalink raw reply

* Re: [PATCH 3/6] [v5] phylib: turn genphy_driver to an array
From: David Miller @ 2014-01-10  2:44 UTC (permalink / raw)
  To: shh.xie
  Cc: jg1.han, mugunthanvnm, f.fainelli, netdev, linux-kernel,
	Shaohui.Xie
In-Reply-To: <1389155240-15996-1-git-send-email-shh.xie@gmail.com>


Many of these patches give rejects when I try to apply them to
net-next, you will need to respin this series.

^ permalink raw reply

* Re: [PATCH 00/23] nf_tables updates for net-next
From: David Miller @ 2014-01-10  2:36 UTC (permalink / raw)
  To: pablo; +Cc: netfilter-devel, netdev
In-Reply-To: <1389314142-17969-1-git-send-email-pablo@netfilter.org>

From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 10 Jan 2014 01:35:19 +0100

> The following patchset contains the following nf_tables updates,
> mostly updates from Patrick McHardy, they are:
...
>   git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nftables.git master

Pulled, thanks Pablo.

^ permalink raw reply

* Re: [PATCH net-next] net: gro: change GRO overflow strategy
From: Neal Cardwell @ 2014-01-10  2:28 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev, Jerry Chu
In-Reply-To: <1389305539.31367.65.camel@edumazet-glaptop2.roam.corp.google.com>

On Thu, Jan 9, 2014 at 5:12 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> From: Eric Dumazet <edumazet@google.com>
...
> This patch changes strategy to simply evict the oldest flow of
> the list. This works better because of the nature of packet
> trains for which GRO is efficient. This also has the effect
> of lowering the GRO latency if many flows are competing.
...
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Jerry Chu <hkchu@google.com>
> Cc: Neal Cardwell <ncardwell@google.com>
> ---
>  net/core/dev.c |   17 +++++++++++++++--
>  1 file changed, 15 insertions(+), 2 deletions(-)

Acked-by: Neal Cardwell <ncardwell@google.com>

Nice!

neal

^ permalink raw reply

* Re: [PATCH net-next] gre_offload: simplify GRE header length calculation in gre_gso_segment()
From: Eric Dumazet @ 2014-01-10  2:09 UTC (permalink / raw)
  To: Neal Cardwell
  Cc: David Miller, netdev, Eric Dumazet, H.K. Jerry Chu,
	Pravin B Shelar
In-Reply-To: <1389318437-17744-1-git-send-email-ncardwell@google.com>

On Thu, 2014-01-09 at 20:47 -0500, Neal Cardwell wrote:
> Simplify the GRE header length calculation in gre_gso_segment().
> Switch to an approach that is simpler, faster, and more general. The
> new approach will continue to be correct even if we add support for
> the optional variable-length routing info that may be present in a GRE
> header.
> 
> Signed-off-by: Neal Cardwell <ncardwell@google.com>
> Cc: Eric Dumazet <edumazet@google.com>
> Cc: H.K. Jerry Chu <hkchu@google.com>
> Cc: Pravin B Shelar <pshelar@nicira.com>
> ---

Acked-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply

* [PATCH net-next] gre_offload: simplify GRE header length calculation in gre_gso_segment()
From: Neal Cardwell @ 2014-01-10  1:47 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, Neal Cardwell, Eric Dumazet, H.K. Jerry Chu,
	Pravin B Shelar

Simplify the GRE header length calculation in gre_gso_segment().
Switch to an approach that is simpler, faster, and more general. The
new approach will continue to be correct even if we add support for
the optional variable-length routing info that may be present in a GRE
header.

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: H.K. Jerry Chu <hkchu@google.com>
Cc: Pravin B Shelar <pshelar@nicira.com>
---
 net/ipv4/gre_offload.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 746a7b1..f6604b6 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -26,7 +26,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	netdev_features_t enc_features;
-	int ghl = GRE_HEADER_SECTION;
+	int ghl;
 	struct gre_base_hdr *greh;
 	u16 mac_offset = skb->mac_header;
 	int mac_len = skb->mac_len;
@@ -49,15 +49,11 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 
 	greh = (struct gre_base_hdr *)skb_transport_header(skb);
 
-	if (greh->flags & GRE_KEY)
-		ghl += GRE_HEADER_SECTION;
-	if (greh->flags & GRE_SEQ)
-		ghl += GRE_HEADER_SECTION;
-	if (greh->flags & GRE_CSUM) {
-		ghl += GRE_HEADER_SECTION;
-		csum = true;
-	} else
-		csum = false;
+	ghl = skb_inner_network_header(skb) - skb_transport_header(skb);
+	if (unlikely(ghl < sizeof(*greh)))
+		goto out;
+
+	csum = !!(greh->flags & GRE_CSUM);
 
 	if (unlikely(!pskb_may_pull(skb, ghl)))
 		goto out;
-- 
1.8.5.1

^ permalink raw reply related

* Re: [RFC Patch net-next 4/4] net_sched: make ingress qdisc lockless
From: Cong Wang @ 2014-01-10  1:21 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Linux Kernel Network Developers, John Fastabend, David S. Miller,
	Jamal Hadi Salim
In-Reply-To: <1389316283.31367.79.camel@edumazet-glaptop2.roam.corp.google.com>

On Thu, Jan 9, 2014 at 5:11 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> Well, there is one qdisc, and if your NIC is multiqueue, with for
> example 32 queues, you can have 32 cpu happily using this qdisc at once.

I did see this, but still don't see the problem.

>
> Thats why you need the spinlock.
>

It looks like you are saying we queue the packets somewhere
in qdisc_enqueue_root() therefore needs a spinlock, but looking
at the code:

static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
        qdisc_calculate_pkt_len(skb, sch);
        return sch->enqueue(skb, sch);
}

static inline int qdisc_enqueue_root(struct sk_buff *skb, struct Qdisc *sch)
{
        qdisc_skb_cb(skb)->pkt_len = skb->len;
        return qdisc_enqueue(skb, sch) & NET_XMIT_MASK;
}

so it almost equals to calling ->enqueue directly, for ingress, which is
ingress_enqueue(). Except updating some stats, the only thing
it does is calling tc_classify().

So where is the problem at qdisc layer? I must miss something too obvious...

^ permalink raw reply

* Re: [RFC Patch net-next 4/4] net_sched: make ingress qdisc lockless
From: Eric Dumazet @ 2014-01-10  1:11 UTC (permalink / raw)
  To: Cong Wang
  Cc: Linux Kernel Network Developers, John Fastabend, David S. Miller,
	Jamal Hadi Salim
In-Reply-To: <CAM_iQpUj9kg=15K3yyOWuMEaxWQC2TUStAMjHrn+ixbGyYHK0Q@mail.gmail.com>

On Thu, 2014-01-09 at 16:30 -0800, Cong Wang wrote:
> On Thu, Jan 9, 2014 at 4:21 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> >
> > Really, you'll have to explain in the changelog why you think this is
> > safe, because I really do not see how this can be valid.
> >
> > I think I already said it was not safe at all...
> >
> > You could try a multiqueue NIC for some interesting effects.
> >
> 
> There is only one ingress queue, that is dev->ingress_queue, right ?

Yes. And you can have multiple cpus trying to use it at the same time.

> And since on ingress, the only possible qdisc is sch_ingress,
> looking at ingress_enqueue(), I don't see anything so dangerous.
> 
> As I said in the cover letter, I may still miss something in the qdisc
> layer, but doesn't look like related with multiqueue. Mind to be more
> specific?

Well, there is one qdisc, and if your NIC is multiqueue, with for
example 32 queues, you can have 32 cpu happily using this qdisc at once.

Thats why you need the spinlock.

I am very afraid seeing you changing this path without understanding
how it is used.

^ permalink raw reply

* Re: [RFC Patch net-next 4/4] net_sched: make ingress qdisc lockless
From: Cong Wang @ 2014-01-10  1:09 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Eric Dumazet, Linux Kernel Network Developers, John Fastabend,
	David S. Miller, Jamal Hadi Salim
In-Reply-To: <20140109164936.4b17cf38@nehalam.linuxnetplumber.net>

On Thu, Jan 9, 2014 at 4:49 PM, Stephen Hemminger
<stephen@networkplumber.org> wrote:
>
> I think what Eric is saying is that on a multi-queue NIC, multiple
> queues can be receiving packets and then sending them on to the ingress
> queue discipline. Up until your patch that code itself was protected
> by qdisc_lock and did not have to worry about any SMP issues. Now, any
> qdisc attached on ingress could run in parallel. This would break all
> the code in those queue disciplines. Think of the simplest case
> of policing.

I noticed that we have multiqueue for rx, which is dev->_rx[], so they
still share the name dev->ingress_queue. Packets on different CPU's
from the same device still need to run through the same ingress qdisc.

Except for ifb, ingress qdisc can only do filtering (with some actions),
therefore I still don't see the problem.

ifb device switches to its "egress" to do policing, which is safe since it
does not use ->ingress_queue any more.

^ permalink raw reply

* Re: [PATCH net-next] net: gro: change GRO overflow strategy
From: Eric Dumazet @ 2014-01-10  1:08 UTC (permalink / raw)
  To: Cong Wang; +Cc: David Miller, netdev, Neal Cardwell, Jerry Chu
In-Reply-To: <CAHA+R7M1wYwLA32fw=jzOUTR1+_zy8K9ugJohLmv1Tr1UHdhTw@mail.gmail.com>

On Thu, 2014-01-09 at 16:44 -0800, Cong Wang wrote:
> On Thu, Jan 9, 2014 at 2:12 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > +               /* locate the end of the list to select the 'oldest' flow */
> > +               while (nskb->next) {
> > +                       pp = &nskb->next;
> > +                       nskb = *pp;
> > +               }
> > +               *pp = NULL;
> > +               nskb->next = NULL;
> 
> How long could this list be?
> 

#define MAX_GRO_SKBS 8

> I think it's time to use doubly linked list for gro,
> although definitely not in this patch.

Its not needed, and it would be slower.

In fast path, we do not need to get the previous pointers.

^ permalink raw reply

* Re: [RFC Patch net-next 4/4] net_sched: make ingress qdisc lockless
From: John Fastabend @ 2014-01-10  1:06 UTC (permalink / raw)
  To: Cong Wang
  Cc: Stephen Hemminger, Eric Dumazet, Linux Kernel Network Developers,
	John Fastabend, David S. Miller, Jamal Hadi Salim
In-Reply-To: <20140109164936.4b17cf38@nehalam.linuxnetplumber.net>

On 1/9/2014 4:49 PM, Stephen Hemminger wrote:
> On Thu, 9 Jan 2014 16:30:12 -0800
> Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
>> On Thu, Jan 9, 2014 at 4:21 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>>
>>>
>>> Really, you'll have to explain in the changelog why you think this is
>>> safe, because I really do not see how this can be valid.
>>>
>>> I think I already said it was not safe at all...
>>>
>>> You could try a multiqueue NIC for some interesting effects.
>>>
>>
>> There is only one ingress queue, that is dev->ingress_queue, right?
>>
>> And since on ingress, the only possible qdisc is sch_ingress,
>> looking at ingress_enqueue(), I don't see anything so dangerous.
>>
>> As I said in the cover letter, I may still miss something in the qdisc
>> layer, but doesn't look like related with multiqueue. Mind to be more
>> specific?
>> --
>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
> I think what Eric is saying is that on a multi-queue NIC, multiple
> queues can be receiving packets and then sending them on to the ingress
> queue discipline. Up until your patch that code itself was protected
> by qdisc_lock and did not have to worry about any SMP issues. Now, any
> qdisc attached on ingress could run in parallel. This would break all
> the code in those queue disciplines. Think of the simplest case
> of policing.
> --

Just to re-iterate you need to go through each and every qdisc,
classifier, action and verify it is safe to run in parallel. Take
a look at how the skb lists are managed in the qdiscs. If we want
to do this we need to make these changes in some coherent way
because it touches lots of pieces.

Also your stats are going to get hosed none of the bstats, qstats
supports this. I'll send out the classifier set later tonight
if you want. I got stalled going through the actions.

Finally any global state in those qdiscs is going to drive performance
down so many of them would likely need to be redesigned.

.John

^ permalink raw reply

* Re: [RFC Patch net-next 4/4] net_sched: make ingress qdisc lockless
From: Stephen Hemminger @ 2014-01-10  0:49 UTC (permalink / raw)
  To: Cong Wang
  Cc: Eric Dumazet, Linux Kernel Network Developers, John Fastabend,
	David S. Miller, Jamal Hadi Salim
In-Reply-To: <CAM_iQpUj9kg=15K3yyOWuMEaxWQC2TUStAMjHrn+ixbGyYHK0Q@mail.gmail.com>

On Thu, 9 Jan 2014 16:30:12 -0800
Cong Wang <xiyou.wangcong@gmail.com> wrote:

> On Thu, Jan 9, 2014 at 4:21 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> >
> > Really, you'll have to explain in the changelog why you think this is
> > safe, because I really do not see how this can be valid.
> >
> > I think I already said it was not safe at all...
> >
> > You could try a multiqueue NIC for some interesting effects.
> >
> 
> There is only one ingress queue, that is dev->ingress_queue, right?
> 
> And since on ingress, the only possible qdisc is sch_ingress,
> looking at ingress_enqueue(), I don't see anything so dangerous.
> 
> As I said in the cover letter, I may still miss something in the qdisc
> layer, but doesn't look like related with multiqueue. Mind to be more
> specific?
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

I think what Eric is saying is that on a multi-queue NIC, multiple
queues can be receiving packets and then sending them on to the ingress
queue discipline. Up until your patch that code itself was protected
by qdisc_lock and did not have to worry about any SMP issues. Now, any
qdisc attached on ingress could run in parallel. This would break all
the code in those queue disciplines. Think of the simplest case
of policing.

^ permalink raw reply

* Re: [PATCH net-next] net: gro: change GRO overflow strategy
From: Cong Wang @ 2014-01-10  0:44 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev, Neal Cardwell, Jerry Chu
In-Reply-To: <1389305539.31367.65.camel@edumazet-glaptop2.roam.corp.google.com>

On Thu, Jan 9, 2014 at 2:12 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> +               /* locate the end of the list to select the 'oldest' flow */
> +               while (nskb->next) {
> +                       pp = &nskb->next;
> +                       nskb = *pp;
> +               }
> +               *pp = NULL;
> +               nskb->next = NULL;

How long could this list be?

I think it's time to use doubly linked list for gro,
although definitely not in this patch.

^ permalink raw reply

* Re: [PATCH v2 4/4] dma debug: introduce debug_dma_assert_idle()
From: Andrew Morton @ 2014-01-10  0:38 UTC (permalink / raw)
  To: Dan Williams
  Cc: dmaengine, Vinod Koul, netdev, Joerg Roedel, linux-kernel,
	James Bottomley, Russell King
In-Reply-To: <20140109201657.28381.9305.stgit@viggo.jf.intel.com>

On Thu, 09 Jan 2014 12:17:26 -0800 Dan Williams <dan.j.williams@intel.com> wrote:

> Record actively mapped pages and provide an api for asserting a given
> page is dma inactive before execution proceeds.  Placing
> debug_dma_assert_idle() in cow_user_page() flagged the violation of the
> dma-api in the NET_DMA implementation (see commit 77873803363c "net_dma:
> mark broken").
> 
> --- a/include/linux/dma-debug.h
> +++ b/include/linux/dma-debug.h
> @@ -185,4 +185,10 @@ static inline void debug_dma_dump_mappings(struct device *dev)
>  
>  #endif /* CONFIG_DMA_API_DEBUG */
>  
> +#ifdef CONFIG_DMA_VS_CPU_DEBUG
> +extern void debug_dma_assert_idle(struct page *page);
> +#else
> +static inline void debug_dma_assert_idle(struct page *page) { }

Surely this breaks the build when CONFIG_DMA_VS_CPU_DEBUG=n? 
lib/dma-debug.c is missing the necessary "#ifdef
CONFIG_DMA_VS_CPU_DEBUG"s.

Do we really need this config setting anyway?  What goes bad if we
permanently enable this subfeature when dma debugging is enabled?

>
> ...
>
> index d87a17a819d0..f67ae111cd2f 100644
> --- a/lib/dma-debug.c
> +++ b/lib/dma-debug.c
> @@ -57,7 +57,8 @@ struct dma_debug_entry {
>  	struct list_head list;
>  	struct device    *dev;
>  	int              type;
> -	phys_addr_t      paddr;
> +	unsigned long	 pfn;
> +	size_t		 offset;

Some documentation for the fields would be nice.  offset of what
relative to what, in what units?

>  	u64              dev_addr;
>  	u64              size;
>  	int              direction;
> @@ -372,6 +373,11 @@ static void hash_bucket_del(struct dma_debug_entry *entry)
>  	list_del(&entry->list);
>  }
>  
>
> ...
>
>  
> +/* memory usage is constrained by the maximum number of available
> + * dma-debug entries
> + */

A brief design overview would be useful.  What goes in tree, how is it
indexed, when and why do we add/remove/test items, etc.

> +static RADIX_TREE(dma_active_pfn, GFP_NOWAIT);
> +static DEFINE_SPINLOCK(radix_lock);
> +
> +static void __active_pfn_inc_overlap(struct dma_debug_entry *entry)
> +{
> +	unsigned long pfn = entry->pfn;
> +	int i;
> +
> +	for (i = 0; i < RADIX_TREE_MAX_TAGS; i++)
> +		if (radix_tree_tag_get(&dma_active_pfn, pfn, i) == 0) {
> +			radix_tree_tag_set(&dma_active_pfn, pfn, i);
> +			return;
> +		}
> +	pr_debug("DMA-API: max overlap count (%d) reached for pfn 0x%lx\n",
> +		 RADIX_TREE_MAX_TAGS, pfn);
> +}
> +
>
> ...
>
> +void debug_dma_assert_idle(struct page *page)
> +{
> +	unsigned long flags;
> +	struct dma_debug_entry *entry;
> +
> +	if (!page)
> +		return;
> +
> +	spin_lock_irqsave(&radix_lock, flags);
> +	entry = radix_tree_lookup(&dma_active_pfn, page_to_pfn(page));
> +	spin_unlock_irqrestore(&radix_lock, flags);
> +
> +	if (!entry)
> +		return;
> +
> +	err_printk(entry->dev, entry,
> +		   "DMA-API: cpu touching an active dma mapped page "
> +		   "[pfn=0x%lx]\n", entry->pfn);
> +}
> +EXPORT_SYMBOL_GPL(debug_dma_assert_idle);

The export isn't needed for mm/memory.c

>
> ...
>

^ permalink raw reply

* [PATCH 23/23] netfilter: nf_tables: fix error path in the init functions
From: Pablo Neira Ayuso @ 2014-01-10  0:35 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1389314142-17969-1-git-send-email-pablo@netfilter.org>

We have to unregister chain type if this fails to register netns.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_tables_ipv4.c |    8 +++++++-
 net/ipv6/netfilter/nf_tables_ipv6.c |    8 +++++++-
 net/netfilter/nf_tables_inet.c      |    8 +++++++-
 3 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index fec163a..6820c8c 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -105,8 +105,14 @@ static const struct nf_chain_type filter_ipv4 = {
 
 static int __init nf_tables_ipv4_init(void)
 {
+	int ret;
+
 	nft_register_chain_type(&filter_ipv4);
-	return register_pernet_subsys(&nf_tables_ipv4_net_ops);
+	ret = register_pernet_subsys(&nf_tables_ipv4_net_ops);
+	if (ret < 0)
+		nft_unregister_chain_type(&filter_ipv4);
+
+	return ret;
 }
 
 static void __exit nf_tables_ipv4_exit(void)
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index 59a43b4..0d812b3 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -104,8 +104,14 @@ static const struct nf_chain_type filter_ipv6 = {
 
 static int __init nf_tables_ipv6_init(void)
 {
+	int ret;
+
 	nft_register_chain_type(&filter_ipv6);
-	return register_pernet_subsys(&nf_tables_ipv6_net_ops);
+	ret = register_pernet_subsys(&nf_tables_ipv6_net_ops);
+	if (ret < 0)
+		nft_unregister_chain_type(&filter_ipv6);
+
+	return ret;
 }
 
 static void __exit nf_tables_ipv6_exit(void)
diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c
index 84478de..9dd2d21 100644
--- a/net/netfilter/nf_tables_inet.c
+++ b/net/netfilter/nf_tables_inet.c
@@ -80,8 +80,14 @@ static const struct nf_chain_type filter_inet = {
 
 static int __init nf_tables_inet_init(void)
 {
+	int ret;
+
 	nft_register_chain_type(&filter_inet);
-	return register_pernet_subsys(&nf_tables_inet_net_ops);
+	ret = register_pernet_subsys(&nf_tables_inet_net_ops);
+	if (ret < 0)
+		nft_unregister_chain_type(&filter_inet);
+
+	return ret;
 }
 
 static void __exit nf_tables_inet_exit(void)
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 21/23] netfilter: nf_tables: prohibit deletion of a table with existing sets
From: Pablo Neira Ayuso @ 2014-01-10  0:35 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1389314142-17969-1-git-send-email-pablo@netfilter.org>

From: Patrick McHardy <kaber@trash.net>

We currently leak the set memory when deleting a table that still has
sets in it. Return EBUSY when attempting to delete a table with sets.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index c352614..36add31 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -467,7 +467,7 @@ static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	if (table->use)
+	if (!list_empty(&table->chains) || !list_empty(&table->sets))
 		return -EBUSY;
 
 	list_del(&table->list);
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 17/23] netfilter: nf_tables: constify chain type definitions and pointers
From: Pablo Neira Ayuso @ 2014-01-10  0:35 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1389314142-17969-1-git-send-email-pablo@netfilter.org>

From: Patrick McHardy <kaber@trash.net>

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h         |    6 +++---
 net/bridge/netfilter/nf_tables_bridge.c   |    2 +-
 net/ipv4/netfilter/nf_tables_arp.c        |    2 +-
 net/ipv4/netfilter/nf_tables_ipv4.c       |    2 +-
 net/ipv4/netfilter/nft_chain_nat_ipv4.c   |    2 +-
 net/ipv4/netfilter/nft_chain_route_ipv4.c |    2 +-
 net/ipv6/netfilter/nf_tables_ipv6.c       |    2 +-
 net/ipv6/netfilter/nft_chain_nat_ipv6.c   |    2 +-
 net/ipv6/netfilter/nft_chain_route_ipv6.c |    2 +-
 net/netfilter/nf_tables_api.c             |   14 +++++++-------
 net/netfilter/nf_tables_inet.c            |    2 +-
 11 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index e9b9786..d3f7053 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -436,7 +436,7 @@ struct nft_stats {
  */
 struct nft_base_chain {
 	struct nf_hook_ops		ops[NFT_HOOK_OPS_MAX];
-	struct nf_chain_type		*type;
+	const struct nf_chain_type	*type;
 	u8				policy;
 	struct nft_stats __percpu	*stats;
 	struct nft_chain		chain;
@@ -507,8 +507,8 @@ struct nf_chain_type {
 	int			family;
 };
 
-int nft_register_chain_type(struct nf_chain_type *);
-void nft_unregister_chain_type(struct nf_chain_type *);
+int nft_register_chain_type(const struct nf_chain_type *);
+void nft_unregister_chain_type(const struct nf_chain_type *);
 
 int nft_register_expr(struct nft_expr_type *);
 void nft_unregister_expr(struct nft_expr_type *);
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index f97222e..283658d 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -68,7 +68,7 @@ static struct pernet_operations nf_tables_bridge_net_ops = {
 	.exit	= nf_tables_bridge_exit_net,
 };
 
-static struct nf_chain_type filter_bridge = {
+static const struct nf_chain_type filter_bridge = {
 	.family		= NFPROTO_BRIDGE,
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
index 228df00..8af01a5 100644
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -68,7 +68,7 @@ static struct pernet_operations nf_tables_arp_net_ops = {
 	.exit   = nf_tables_arp_exit_net,
 };
 
-static struct nf_chain_type filter_arp = {
+static const struct nf_chain_type filter_arp = {
 	.family		= NFPROTO_ARP,
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index d6fc1b4..cec7805 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -91,7 +91,7 @@ static struct pernet_operations nf_tables_ipv4_net_ops = {
 	.exit	= nf_tables_ipv4_exit_net,
 };
 
-static struct nf_chain_type filter_ipv4 = {
+static const struct nf_chain_type filter_ipv4 = {
 	.family		= NFPROTO_IPV4,
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index cf2c792..9e535c2 100644
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -164,7 +164,7 @@ static unsigned int nf_nat_output(const struct nf_hook_ops *ops,
 	return ret;
 }
 
-static struct nf_chain_type nft_chain_nat_ipv4 = {
+static const struct nf_chain_type nft_chain_nat_ipv4 = {
 	.family		= NFPROTO_IPV4,
 	.name		= "nat",
 	.type		= NFT_CHAIN_T_NAT,
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
index 4e6bf9a..2dd2eea 100644
--- a/net/ipv4/netfilter/nft_chain_route_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -61,7 +61,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
 	return ret;
 }
 
-static struct nf_chain_type nft_chain_route_ipv4 = {
+static const struct nf_chain_type nft_chain_route_ipv4 = {
 	.family		= NFPROTO_IPV4,
 	.name		= "route",
 	.type		= NFT_CHAIN_T_ROUTE,
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index a340276..758a32b 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -90,7 +90,7 @@ static struct pernet_operations nf_tables_ipv6_net_ops = {
 	.exit	= nf_tables_ipv6_exit_net,
 };
 
-static struct nf_chain_type filter_ipv6 = {
+static const struct nf_chain_type filter_ipv6 = {
 	.family		= NFPROTO_IPV6,
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
index e86dcd7..efd1d57 100644
--- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c
+++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
@@ -170,7 +170,7 @@ static unsigned int nf_nat_ipv6_output(const struct nf_hook_ops *ops,
 	return ret;
 }
 
-static struct nf_chain_type nft_chain_nat_ipv6 = {
+static const struct nf_chain_type nft_chain_nat_ipv6 = {
 	.family		= NFPROTO_IPV6,
 	.name		= "nat",
 	.type		= NFT_CHAIN_T_NAT,
diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c
index 3fe40f0..3620f88 100644
--- a/net/ipv6/netfilter/nft_chain_route_ipv6.c
+++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c
@@ -59,7 +59,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
 	return ret;
 }
 
-static struct nf_chain_type nft_chain_route_ipv6 = {
+static const struct nf_chain_type nft_chain_route_ipv6 = {
 	.family		= NFPROTO_IPV6,
 	.name		= "route",
 	.type		= NFT_CHAIN_T_ROUTE,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 7d6a226..acdd9d6 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -124,9 +124,9 @@ static inline u64 nf_tables_alloc_handle(struct nft_table *table)
 	return ++table->hgenerator;
 }
 
-static struct nf_chain_type *chain_type[AF_MAX][NFT_CHAIN_T_MAX];
+static const struct nf_chain_type *chain_type[AF_MAX][NFT_CHAIN_T_MAX];
 
-static struct nf_chain_type *
+static const struct nf_chain_type *
 __nf_tables_chain_type_lookup(int family, const struct nlattr *nla)
 {
 	int i;
@@ -139,12 +139,12 @@ __nf_tables_chain_type_lookup(int family, const struct nlattr *nla)
 	return NULL;
 }
 
-static struct nf_chain_type *
+static const struct nf_chain_type *
 nf_tables_chain_type_lookup(const struct nft_af_info *afi,
 			    const struct nlattr *nla,
 			    bool autoload)
 {
-	struct nf_chain_type *type;
+	const struct nf_chain_type *type;
 
 	type = __nf_tables_chain_type_lookup(afi->family, nla);
 	if (type != NULL)
@@ -475,7 +475,7 @@ static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb,
 	return 0;
 }
 
-int nft_register_chain_type(struct nf_chain_type *ctype)
+int nft_register_chain_type(const struct nf_chain_type *ctype)
 {
 	int err = 0;
 
@@ -491,7 +491,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(nft_register_chain_type);
 
-void nft_unregister_chain_type(struct nf_chain_type *ctype)
+void nft_unregister_chain_type(const struct nf_chain_type *ctype)
 {
 	nfnl_lock(NFNL_SUBSYS_NFTABLES);
 	chain_type[ctype->family][ctype->type] = NULL;
@@ -900,7 +900,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 		return -EOVERFLOW;
 
 	if (nla[NFTA_CHAIN_HOOK]) {
-		struct nf_chain_type *type;
+		const struct nf_chain_type *type;
 		struct nf_hook_ops *ops;
 		nf_hookfn *hookfn;
 		u32 hooknum, priority;
diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c
index 280d3a2..ee29ba2 100644
--- a/net/netfilter/nf_tables_inet.c
+++ b/net/netfilter/nf_tables_inet.c
@@ -66,7 +66,7 @@ static struct pernet_operations nf_tables_inet_net_ops = {
 	.exit	= nf_tables_inet_exit_net,
 };
 
-static struct nf_chain_type filter_inet = {
+static const struct nf_chain_type filter_inet = {
 	.family		= NFPROTO_INET,
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 16/23] netfilter: nf_tables: replay request after dropping locks to load chain type
From: Pablo Neira Ayuso @ 2014-01-10  0:35 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1389314142-17969-1-git-send-email-pablo@netfilter.org>

From: Patrick McHardy <kaber@trash.net>

To avoid races, we need to replay to request after dropping the nfnl_mutex
to auto-load the chain type module.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c |   12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index d913fb0..7d6a226 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -147,16 +147,20 @@ nf_tables_chain_type_lookup(const struct nft_af_info *afi,
 	struct nf_chain_type *type;
 
 	type = __nf_tables_chain_type_lookup(afi->family, nla);
+	if (type != NULL)
+		return type;
 #ifdef CONFIG_MODULES
-	if (type == NULL && autoload) {
+	if (autoload) {
 		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
 		request_module("nft-chain-%u-%*.s", afi->family,
 			       nla_len(nla)-1, (const char *)nla_data(nla));
 		nfnl_lock(NFNL_SUBSYS_NFTABLES);
 		type = __nf_tables_chain_type_lookup(afi->family, nla);
+		if (type != NULL)
+			return ERR_PTR(-EAGAIN);
 	}
 #endif
-	return type;
+	return ERR_PTR(-ENOENT);
 }
 
 static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = {
@@ -906,8 +910,8 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 			type = nf_tables_chain_type_lookup(afi,
 							   nla[NFTA_CHAIN_TYPE],
 							   create);
-			if (type == NULL)
-				return -ENOENT;
+			if (IS_ERR(type))
+				return PTR_ERR(type);
 		}
 
 		err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK],
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 15/23] netfilter: nf_tables: add missing module references to chain types
From: Pablo Neira Ayuso @ 2014-01-10  0:35 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1389314142-17969-1-git-send-email-pablo@netfilter.org>

From: Patrick McHardy <kaber@trash.net>

In some cases we neither take a reference to the AF info nor to the
chain type, allowing the module to be unloaded while in use.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/nf_tables_bridge.c |    1 +
 net/ipv4/netfilter/nf_tables_arp.c      |    1 +
 net/ipv4/netfilter/nf_tables_ipv4.c     |    1 +
 net/ipv6/netfilter/nf_tables_ipv6.c     |    1 +
 net/netfilter/nf_tables_inet.c          |    1 +
 5 files changed, 5 insertions(+)

diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index 003c1e9..f97222e 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -72,6 +72,7 @@ static struct nf_chain_type filter_bridge = {
 	.family		= NFPROTO_BRIDGE,
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
+	.me		= THIS_MODULE,
 	.hook_mask	= (1 << NF_BR_LOCAL_IN) |
 			  (1 << NF_BR_FORWARD) |
 			  (1 << NF_BR_LOCAL_OUT),
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
index 36d27fc..228df00 100644
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -72,6 +72,7 @@ static struct nf_chain_type filter_arp = {
 	.family		= NFPROTO_ARP,
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
+	.me		= THIS_MODULE,
 	.hook_mask	= (1 << NF_ARP_IN) |
 			  (1 << NF_ARP_OUT) |
 			  (1 << NF_ARP_FORWARD),
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index da927dc..d6fc1b4 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -95,6 +95,7 @@ static struct nf_chain_type filter_ipv4 = {
 	.family		= NFPROTO_IPV4,
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
+	.me		= THIS_MODULE,
 	.hook_mask	= (1 << NF_INET_LOCAL_IN) |
 			  (1 << NF_INET_LOCAL_OUT) |
 			  (1 << NF_INET_FORWARD) |
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index 025e7f4..a340276 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -94,6 +94,7 @@ static struct nf_chain_type filter_ipv6 = {
 	.family		= NFPROTO_IPV6,
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
+	.me		= THIS_MODULE,
 	.hook_mask	= (1 << NF_INET_LOCAL_IN) |
 			  (1 << NF_INET_LOCAL_OUT) |
 			  (1 << NF_INET_FORWARD) |
diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c
index ac0edcb..280d3a2 100644
--- a/net/netfilter/nf_tables_inet.c
+++ b/net/netfilter/nf_tables_inet.c
@@ -70,6 +70,7 @@ static struct nf_chain_type filter_inet = {
 	.family		= NFPROTO_INET,
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
+	.me		= THIS_MODULE,
 	.hook_mask	= (1 << NF_INET_LOCAL_IN) |
 			  (1 << NF_INET_LOCAL_OUT) |
 			  (1 << NF_INET_FORWARD) |
-- 
1.7.10.4

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox