Netdev List
 help / color / mirror / Atom feed
* [RFC PATCH 08/12] net: sched: RCU cls_tcindex
From: John Fastabend @ 2014-01-10  9:42 UTC (permalink / raw)
  To: xiyou.wangcong, jhs, eric.dumazet; +Cc: netdev, davem
In-Reply-To: <20140110092041.7193.5952.stgit@nitbit.x32>

Make cls_tcindex RCU safe.

This patch addds a new RCU routine rcu_dereference_bh_rtnl() to check
caller either holds the rcu read lock or RTNL. This is needed to
handle the case where tcindex_lookup() is being called in both cases.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 include/linux/rtnetlink.h |   10 +
 net/sched/cls_tcindex.c   |  327 ++++++++++++++++++++++++++-------------------
 2 files changed, 202 insertions(+), 135 deletions(-)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 939428a..6008b22 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -37,6 +37,16 @@ extern int lockdep_rtnl_is_held(void);
 	rcu_dereference_check(p, lockdep_rtnl_is_held())
 
 /**
+ * rcu_dereference_bh_rtnl - rcu_dereference_bh with debug checking
+ * @p: The pointer to read, prior to dereference
+ *
+ * Do an rcu_dereference_bh(p), but check caller either holds rcu_read_lock_bh()
+ * or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference_bh()
+ */
+#define rcu_dereference_bh_rtnl(p)				\
+	rcu_dereference_bh_check(p, lockdep_rtnl_is_held())
+
+/**
  * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL
  * @p: The pointer to read, prior to dereferencing
  *
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index ffad187..19eb24e 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -24,7 +24,7 @@
 #define DEFAULT_HASH_SIZE	64	/* optimized for diffserv */
 
 
-#define	PRIV(tp)	((struct tcindex_data *) (tp)->root)
+#define	PRIV(tp)	((struct tcindex_data *) rtnl_dereference((tp)->root))
 
 
 struct tcindex_filter_result {
@@ -35,19 +35,22 @@ struct tcindex_filter_result {
 struct tcindex_filter {
 	u16 key;
 	struct tcindex_filter_result result;
-	struct tcindex_filter *next;
+	struct tcindex_filter __rcu *next;
+	struct rcu_head rcu;
 };
 
 
 struct tcindex_data {
 	struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */
-	struct tcindex_filter **h; /* imperfect hash; only used if !perfect;
-				      NULL if unused */
+	struct tcindex_filter __rcu **h; /* imperfect hash; only used if
+					    !perfect NULL if unused */
+	struct rcu_head rcu;
+	struct tcf_proto *tp;
 	u16 mask;		/* AND key with mask */
-	int shift;		/* shift ANDed key to the right */
-	int hash;		/* hash table size; 0 if undefined */
-	int alloc_hash;		/* allocated size */
-	int fall_through;	/* 0: only classify if explicit match */
+	u32 shift;		/* shift ANDed key to the right */
+	u32 hash;		/* hash table size; 0 if undefined */
+	u32 alloc_hash;		/* allocated size */
+	u32 fall_through;	/* 0: only classify if explicit match */
 };
 
 static inline int
@@ -59,13 +62,18 @@ tcindex_filter_is_set(struct tcindex_filter_result *r)
 static struct tcindex_filter_result *
 tcindex_lookup(struct tcindex_data *p, u16 key)
 {
-	struct tcindex_filter *f;
+	if (p->perfect) {
+		struct tcindex_filter_result *f = p->perfect + key;
+
+		return tcindex_filter_is_set(f) ? f : NULL;
+	} else if (p->h) {
+		struct tcindex_filter __rcu **fp;
+		struct tcindex_filter *f;
 
-	if (p->perfect)
-		return tcindex_filter_is_set(p->perfect + key) ?
-			p->perfect + key : NULL;
-	else if (p->h) {
-		for (f = p->h[key % p->hash]; f; f = f->next)
+		fp = &p->h[key % p->hash];
+		for (f = rcu_dereference_bh_rtnl(*fp);
+		     f;
+		     fp = &f->next, f = rcu_dereference_bh_rtnl(*fp))
 			if (f->key == key)
 				return &f->result;
 	}
@@ -77,7 +85,7 @@ tcindex_lookup(struct tcindex_data *p, u16 key)
 static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 			    struct tcf_result *res)
 {
-	struct tcindex_data *p = PRIV(tp);
+	struct tcindex_data *p = rcu_dereference(tp->root);
 	struct tcindex_filter_result *f;
 	int key = (skb->tc_index & p->mask) >> p->shift;
 
@@ -132,49 +140,113 @@ static int tcindex_init(struct tcf_proto *tp)
 	p->hash = DEFAULT_HASH_SIZE;
 	p->fall_through = 1;
 
-	tp->root = p;
+	rcu_assign_pointer(tp->root, p);
 	return 0;
 }
 
+static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker)
+{
+	struct tcindex_data *p = PRIV(tp);
+	struct tcindex_filter *f, *next;
+	int i;
+
+	pr_debug("tcindex_walk(tp %p,walker %p),p %p\n", tp, walker, p);
+	if (p->perfect) {
+		for (i = 0; i < p->hash; i++) {
+			if (!p->perfect[i].res.class)
+				continue;
+			if (walker->count >= walker->skip) {
+				if (walker->fn(tp,
+				    (unsigned long) (p->perfect+i), walker)
+				     < 0) {
+					walker->stop = 1;
+					return;
+				}
+			}
+			walker->count++;
+		}
+	}
+	if (!p->h)
+		return;
+	for (i = 0; i < p->hash; i++) {
+		for (f = rtnl_dereference(p->h[i]); f; f = next) {
+			next = rtnl_dereference(f->next);
+			if (walker->count >= walker->skip) {
+				if (walker->fn(tp, (unsigned long) &f->result,
+				    walker) < 0) {
+					walker->stop = 1;
+					return;
+				}
+			}
+			walker->count++;
+		}
+	}
+}
 
 static int
-__tcindex_delete(struct tcf_proto *tp, unsigned long arg, int lock)
+tcindex_delete(struct tcf_proto *tp, unsigned long arg)
 {
 	struct tcindex_data *p = PRIV(tp);
 	struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg;
+	struct tcindex_filter __rcu **walk;
 	struct tcindex_filter *f = NULL;
 
-	pr_debug("tcindex_delete(tp %p,arg 0x%lx),p %p,f %p\n", tp, arg, p, f);
+	pr_debug("tcindex_delete(tp %p,arg 0x%lx),p %p\n", tp, arg, p);
 	if (p->perfect) {
 		if (!r->res.class)
 			return -ENOENT;
 	} else {
 		int i;
-		struct tcindex_filter **walk = NULL;
 
-		for (i = 0; i < p->hash; i++)
-			for (walk = p->h+i; *walk; walk = &(*walk)->next)
-				if (&(*walk)->result == r)
+		for (i = 0; i < p->hash; i++) {
+			walk = p->h + i;
+			for (f = rtnl_dereference(*walk); f;
+			     walk = &f->next, f = rtnl_dereference(*walk)) {
+				if (&f->result == r)
 					goto found;
+			}
+		}
 		return -ENOENT;
 
 found:
-		f = *walk;
-		if (lock)
-			tcf_tree_lock(tp);
-		*walk = f->next;
-		if (lock)
-			tcf_tree_unlock(tp);
+		rcu_assign_pointer(*walk, rtnl_dereference(f->next));
 	}
 	tcf_unbind_filter(tp, &r->res);
 	tcf_exts_destroy(tp, &r->exts);
-	kfree(f);
+	if (f)
+		kfree_rcu(f, rcu);
 	return 0;
 }
 
-static int tcindex_delete(struct tcf_proto *tp, unsigned long arg)
+static int tcindex_destroy_element(struct tcf_proto *tp,
+				   unsigned long arg,
+				   struct tcf_walker *walker)
+{
+	return tcindex_delete(tp, arg);
+}
+
+static void __tcindex_destroy(struct rcu_head *head)
 {
-	return __tcindex_delete(tp, arg, 1);
+	struct tcindex_data *p = container_of(head, struct tcindex_data, rcu);
+
+	kfree(p->perfect);
+	kfree(p->h);
+	kfree(p);
+}
+
+static void tcindex_destroy(struct tcf_proto *tp)
+{
+	struct tcindex_data *p = PRIV(tp);
+	struct tcf_walker walker;
+
+	pr_debug("%s(tp %p),p %p\n", __func__, tp, p);
+	walker.count = 0;
+	walker.skip = 0;
+	walker.fn = &tcindex_destroy_element;
+	tcindex_walk(tp, &walker);
+
+	rcu_assign_pointer(tp->root, NULL);
+	call_rcu(&p->rcu, __tcindex_destroy);
 }
 
 static inline int
@@ -191,6 +263,14 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = {
 	[TCA_TCINDEX_CLASSID]		= { .type = NLA_U32 },
 };
 
+static void __tcindex_partial_destroy(struct rcu_head *head)
+{
+	struct tcindex_data *p = container_of(head, struct tcindex_data, rcu);
+
+	kfree(p->perfect);
+	kfree(p);
+}
+
 static int
 tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
 		  u32 handle, struct tcindex_data *p,
@@ -200,7 +280,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
 	int err, balloc = 0;
 	struct tcindex_filter_result new_filter_result, *old_r = r;
 	struct tcindex_filter_result cr;
-	struct tcindex_data cp;
+	struct tcindex_data *cp, *oldp;
 	struct tcindex_filter *f = NULL; /* make gcc behave */
 	struct tcf_exts e;
 
@@ -209,7 +289,27 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
 	if (err < 0)
 		return err;
 
-	memcpy(&cp, p, sizeof(cp));
+	/* tcindex_data attributes must look atomic to classifier/lookup so
+	 * allocate new tcindex data and RCU assign it onto root. Keeping
+	 * perfect hash and hash pointers from old data.
+	 */
+	cp = kzalloc(sizeof(cp), GFP_KERNEL);
+	if (!cp)
+		return -ENOMEM;
+
+	cp->mask = p->mask;
+	cp->shift = p->shift;
+	cp->hash = p->hash;
+	cp->alloc_hash = p->alloc_hash;
+	cp->fall_through = p->fall_through;
+	cp->tp = tp;
+
+	if (p->perfect) {
+		cp->perfect = kcalloc(cp->hash, sizeof(*r), GFP_KERNEL);
+		memcpy(cp->perfect, p->perfect, sizeof(*r) * cp->hash);
+	}
+	cp->h = p->h;
+
 	memset(&new_filter_result, 0, sizeof(new_filter_result));
 	tcf_exts_init(&new_filter_result.exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
 
@@ -221,71 +321,82 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
 	}
 
 	if (tb[TCA_TCINDEX_HASH])
-		cp.hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
+		cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
 
 	if (tb[TCA_TCINDEX_MASK])
-		cp.mask = nla_get_u16(tb[TCA_TCINDEX_MASK]);
+		cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]);
 
 	if (tb[TCA_TCINDEX_SHIFT])
-		cp.shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]);
+		cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]);
 
 	err = -EBUSY;
+
 	/* Hash already allocated, make sure that we still meet the
 	 * requirements for the allocated hash.
 	 */
-	if (cp.perfect) {
-		if (!valid_perfect_hash(&cp) ||
-		    cp.hash > cp.alloc_hash)
+	if (cp->perfect) {
+		if (!valid_perfect_hash(cp) ||
+		    cp->hash > cp->alloc_hash)
 			goto errout;
-	} else if (cp.h && cp.hash != cp.alloc_hash)
+	} else if (cp->h && cp->hash != cp->alloc_hash)
 		goto errout;
 
 	err = -EINVAL;
 	if (tb[TCA_TCINDEX_FALL_THROUGH])
-		cp.fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]);
+		cp->fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]);
 
-	if (!cp.hash) {
+	if (!cp->hash) {
 		/* Hash not specified, use perfect hash if the upper limit
 		 * of the hashing index is below the threshold.
 		 */
-		if ((cp.mask >> cp.shift) < PERFECT_HASH_THRESHOLD)
-			cp.hash = (cp.mask >> cp.shift) + 1;
+		if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD)
+			cp->hash = (cp->mask >> cp->shift) + 1;
 		else
-			cp.hash = DEFAULT_HASH_SIZE;
+			cp->hash = DEFAULT_HASH_SIZE;
 	}
 
-	if (!cp.perfect && !cp.h)
-		cp.alloc_hash = cp.hash;
+	if (!cp->perfect && cp->h)
+		cp->alloc_hash = cp->hash;
 
 	/* Note: this could be as restrictive as if (handle & ~(mask >> shift))
 	 * but then, we'd fail handles that may become valid after some future
 	 * mask change. While this is extremely unlikely to ever matter,
 	 * the check below is safer (and also more backwards-compatible).
 	 */
-	if (cp.perfect || valid_perfect_hash(&cp))
-		if (handle >= cp.alloc_hash)
+	if (cp->perfect || valid_perfect_hash(cp))
+		if (handle >= cp->alloc_hash)
 			goto errout;
 
 
 	err = -ENOMEM;
-	if (!cp.perfect && !cp.h) {
-		if (valid_perfect_hash(&cp)) {
-			cp.perfect = kcalloc(cp.hash, sizeof(*r), GFP_KERNEL);
-			if (!cp.perfect)
+	if (!cp->perfect && !cp->h) {
+		if (valid_perfect_hash(cp)) {
+			struct tcindex_filter_result *perfect;
+
+			perfect = kcalloc(cp->hash, sizeof(*r), GFP_KERNEL);
+			if (!perfect)
 				goto errout;
+			cp->perfect = perfect;
 			balloc = 1;
 		} else {
-			cp.h = kcalloc(cp.hash, sizeof(f), GFP_KERNEL);
-			if (!cp.h)
+			struct tcindex_filter __rcu **hash;
+
+			hash = kcalloc(cp->hash,
+				       sizeof(struct tcindex_filter *),
+				       GFP_KERNEL);
+
+			if (!hash)
 				goto errout;
+
+			cp->h = hash;
 			balloc = 2;
 		}
 	}
 
-	if (cp.perfect)
-		r = cp.perfect + handle;
+	if (cp->perfect)
+		r = cp->perfect + handle;
 	else
-		r = tcindex_lookup(&cp, handle) ? : &new_filter_result;
+		r = tcindex_lookup(cp, handle) ? : &new_filter_result;
 
 	if (r == &new_filter_result) {
 		f = kzalloc(sizeof(*f), GFP_KERNEL);
@@ -300,33 +411,41 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
 
 	tcf_exts_change(tp, &cr.exts, &e);
 
-	tcf_tree_lock(tp);
 	if (old_r && old_r != r)
 		memset(old_r, 0, sizeof(*old_r));
 
-	memcpy(p, &cp, sizeof(cp));
+	oldp = p;
 	memcpy(r, &cr, sizeof(cr));
+	rcu_assign_pointer(tp->root, cp);
 
 	if (r == &new_filter_result) {
-		struct tcindex_filter **fp;
+		struct tcindex_filter *nfp;
+		struct tcindex_filter __rcu **fp;
 
 		f->key = handle;
 		f->result = new_filter_result;
 		f->next = NULL;
-		for (fp = p->h+(handle % p->hash); *fp; fp = &(*fp)->next)
-			/* nothing */;
-		*fp = f;
+
+		fp = p->h + (handle % p->hash);
+		for (nfp = rtnl_dereference(*fp);
+		     nfp;
+		     fp = &nfp->next, nfp = rtnl_dereference(*fp))
+				/* nothing */;
+
+		rcu_assign_pointer(*fp, f);
 	}
-	tcf_tree_unlock(tp);
 
+	if (oldp)
+		call_rcu(&oldp->rcu, __tcindex_partial_destroy);
 	return 0;
 
 errout_alloc:
 	if (balloc == 1)
-		kfree(cp.perfect);
+		kfree(cp->perfect);
 	else if (balloc == 2)
-		kfree(cp.h);
+		kfree(cp->h);
 errout:
+	kfree(cp);
 	tcf_exts_destroy(tp, &e);
 	return err;
 }
@@ -357,71 +476,6 @@ tcindex_change(struct net *net, struct sk_buff *in_skb,
 				 tca[TCA_RATE]);
 }
 
-
-static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker)
-{
-	struct tcindex_data *p = PRIV(tp);
-	struct tcindex_filter *f, *next;
-	int i;
-
-	pr_debug("tcindex_walk(tp %p,walker %p),p %p\n", tp, walker, p);
-	if (p->perfect) {
-		for (i = 0; i < p->hash; i++) {
-			if (!p->perfect[i].res.class)
-				continue;
-			if (walker->count >= walker->skip) {
-				if (walker->fn(tp,
-				    (unsigned long) (p->perfect+i), walker)
-				     < 0) {
-					walker->stop = 1;
-					return;
-				}
-			}
-			walker->count++;
-		}
-	}
-	if (!p->h)
-		return;
-	for (i = 0; i < p->hash; i++) {
-		for (f = p->h[i]; f; f = next) {
-			next = f->next;
-			if (walker->count >= walker->skip) {
-				if (walker->fn(tp, (unsigned long) &f->result,
-				    walker) < 0) {
-					walker->stop = 1;
-					return;
-				}
-			}
-			walker->count++;
-		}
-	}
-}
-
-
-static int tcindex_destroy_element(struct tcf_proto *tp,
-    unsigned long arg, struct tcf_walker *walker)
-{
-	return __tcindex_delete(tp, arg, 0);
-}
-
-
-static void tcindex_destroy(struct tcf_proto *tp)
-{
-	struct tcindex_data *p = PRIV(tp);
-	struct tcf_walker walker;
-
-	pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p);
-	walker.count = 0;
-	walker.skip = 0;
-	walker.fn = &tcindex_destroy_element;
-	tcindex_walk(tp, &walker);
-	kfree(p->perfect);
-	kfree(p->h);
-	kfree(p);
-	tp->root = NULL;
-}
-
-
 static int tcindex_dump(struct tcf_proto *tp, unsigned long fh,
     struct sk_buff *skb, struct tcmsg *t)
 {
@@ -448,15 +502,18 @@ static int tcindex_dump(struct tcf_proto *tp, unsigned long fh,
 		nla_nest_end(skb, nest);
 	} else {
 		if (p->perfect) {
-			t->tcm_handle = r-p->perfect;
+			t->tcm_handle = r - p->perfect;
 		} else {
 			struct tcindex_filter *f;
+			struct tcindex_filter __rcu **fp;
 			int i;
 
 			t->tcm_handle = 0;
 			for (i = 0; !t->tcm_handle && i < p->hash; i++) {
-				for (f = p->h[i]; !t->tcm_handle && f;
-				     f = f->next) {
+				fp = &p->h[i];
+				for (f = rtnl_dereference(*fp);
+				     !t->tcm_handle && f;
+				     fp = &f->next, f = rtnl_dereference(*fp)) {
 					if (&f->result == r)
 						t->tcm_handle = f->key;
 				}

^ permalink raw reply related

* [RFC PATCH 07/12] net: sched: RCU cls_route
From: John Fastabend @ 2014-01-10  9:41 UTC (permalink / raw)
  To: xiyou.wangcong, jhs, eric.dumazet; +Cc: netdev, davem
In-Reply-To: <20140110092041.7193.5952.stgit@nitbit.x32>

RCUify the route classifier. For now however spinlock's are used to
protect fastmap cache.

The issue here is the fastmap may be read by one CPU while the
cache is being updated by another. An array of pointers could be
one possible solution.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 net/sched/cls_route.c |  218 ++++++++++++++++++++++++++++---------------------
 1 file changed, 125 insertions(+), 93 deletions(-)

diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 2473953..6ba8f43 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -29,25 +29,26 @@
  *    are mutually  exclusive.
  * 3. "to TAG from ANY" has higher priority, than "to ANY from XXX"
  */
-
 struct route4_fastmap {
-	struct route4_filter	*filter;
-	u32			id;
-	int			iif;
+	struct route4_filter		*filter;
+	u32				id;
+	int				iif;
 };
 
 struct route4_head {
-	struct route4_fastmap	fastmap[16];
-	struct route4_bucket	*table[256 + 1];
+	struct route4_fastmap		fastmap[16];
+	struct route4_bucket __rcu	*table[256 + 1];
+	struct rcu_head			rcu;
 };
 
 struct route4_bucket {
 	/* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */
-	struct route4_filter	*ht[16 + 16 + 1];
+	struct route4_filter __rcu	*ht[16 + 16 + 1];
+	struct rcu_head			rcu;
 };
 
 struct route4_filter {
-	struct route4_filter	*next;
+	struct route4_filter __rcu	*next;
 	u32			id;
 	int			iif;
 
@@ -55,6 +56,8 @@ struct route4_filter {
 	struct tcf_exts		exts;
 	u32			handle;
 	struct route4_bucket	*bkt;
+	struct tcf_proto	*tp;
+	struct rcu_head		rcu;
 };
 
 #define ROUTE4_FAILURE ((struct route4_filter *)(-1L))
@@ -64,14 +67,13 @@ static inline int route4_fastmap_hash(u32 id, int iif)
 	return id & 0xF;
 }
 
+static DEFINE_SPINLOCK(fastmap_lock);
 static void
-route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id)
+route4_reset_fastmap(struct route4_head *head)
 {
-	spinlock_t *root_lock = qdisc_root_sleeping_lock(q);
-
-	spin_lock_bh(root_lock);
+	spin_lock(&fastmap_lock);
 	memset(head->fastmap, 0, sizeof(head->fastmap));
-	spin_unlock_bh(root_lock);
+	spin_unlock(&fastmap_lock);
 }
 
 static void
@@ -80,9 +82,12 @@ route4_set_fastmap(struct route4_head *head, u32 id, int iif,
 {
 	int h = route4_fastmap_hash(id, iif);
 
+	/* fastmap updates must look atomic to aling id, iff, filter */
+	spin_lock(&fastmap_lock);
 	head->fastmap[h].id = id;
 	head->fastmap[h].iif = iif;
 	head->fastmap[h].filter = f;
+	spin_unlock(&fastmap_lock);
 }
 
 static inline int route4_hash_to(u32 id)
@@ -123,7 +128,7 @@ static inline int route4_hash_wild(void)
 static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 			   struct tcf_result *res)
 {
-	struct route4_head *head = (struct route4_head *)tp->root;
+	struct route4_head *head = (struct route4_head *)rcu_dereference_bh(tp->root);
 	struct dst_entry *dst;
 	struct route4_bucket *b;
 	struct route4_filter *f;
@@ -141,32 +146,43 @@ static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 	iif = inet_iif(skb);
 
 	h = route4_fastmap_hash(id, iif);
+
+	spin_lock(&fastmap_lock);
 	if (id == head->fastmap[h].id &&
 	    iif == head->fastmap[h].iif &&
 	    (f = head->fastmap[h].filter) != NULL) {
-		if (f == ROUTE4_FAILURE)
+		if (f == ROUTE4_FAILURE) {
+			spin_unlock(&fastmap_lock);
 			goto failure;
+		}
 
 		*res = f->res;
+		spin_unlock(&fastmap_lock);
 		return 0;
 	}
+	spin_unlock(&fastmap_lock);
 
 	h = route4_hash_to(id);
 
 restart:
-	b = head->table[h];
+	b = rcu_dereference_bh(head->table[h]);
 	if (b) {
-		for (f = b->ht[route4_hash_from(id)]; f; f = f->next)
+		for (f = rcu_dereference_bh(b->ht[route4_hash_from(id)]);
+		     f;
+		     f = rcu_dereference_bh(f->next))
 			if (f->id == id)
 				ROUTE4_APPLY_RESULT();
 
-		for (f = b->ht[route4_hash_iif(iif)]; f; f = f->next)
+		for (f = rcu_dereference_bh(b->ht[route4_hash_iif(iif)]);
+		     f;
+		     f = rcu_dereference_bh(f->next))
 			if (f->iif == iif)
 				ROUTE4_APPLY_RESULT();
 
-		for (f = b->ht[route4_hash_wild()]; f; f = f->next)
+		for (f = rcu_dereference_bh(b->ht[route4_hash_wild()]);
+		     f;
+		     f = rcu_dereference_bh(f->next))
 			ROUTE4_APPLY_RESULT();
-
 	}
 	if (h < 256) {
 		h = 256;
@@ -213,7 +229,7 @@ static inline u32 from_hash(u32 id)
 
 static unsigned long route4_get(struct tcf_proto *tp, u32 handle)
 {
-	struct route4_head *head = (struct route4_head *)tp->root;
+	struct route4_head *head = (struct route4_head *)rtnl_dereference(tp->root);
 	struct route4_bucket *b;
 	struct route4_filter *f;
 	unsigned int h1, h2;
@@ -229,9 +245,11 @@ static unsigned long route4_get(struct tcf_proto *tp, u32 handle)
 	if (h2 > 32)
 		return 0;
 
-	b = head->table[h1];
+	b = rtnl_dereference(head->table[h1]);
 	if (b) {
-		for (f = b->ht[h2]; f; f = f->next)
+		for (f = rtnl_dereference(b->ht[h2]);
+		     f;
+		     f = rtnl_dereference(f->next))
 			if (f->handle == handle)
 				return (unsigned long)f;
 	}
@@ -248,8 +266,11 @@ static int route4_init(struct tcf_proto *tp)
 }
 
 static void
-route4_delete_filter(struct tcf_proto *tp, struct route4_filter *f)
+route4_delete_filter(struct rcu_head *head)
 {
+	struct route4_filter *f = container_of(head, struct route4_filter, rcu);
+	struct tcf_proto *tp = f->tp;
+
 	tcf_unbind_filter(tp, &f->res);
 	tcf_exts_destroy(tp, &f->exts);
 	kfree(f);
@@ -257,7 +278,7 @@ route4_delete_filter(struct tcf_proto *tp, struct route4_filter *f)
 
 static void route4_destroy(struct tcf_proto *tp)
 {
-	struct route4_head *head = tp->root;
+	struct route4_head *head = rtnl_dereference(tp->root);
 	int h1, h2;
 
 	if (head == NULL)
@@ -266,26 +287,29 @@ static void route4_destroy(struct tcf_proto *tp)
 	for (h1 = 0; h1 <= 256; h1++) {
 		struct route4_bucket *b;
 
-		b = head->table[h1];
+		b = rtnl_dereference(head->table[h1]);
 		if (b) {
 			for (h2 = 0; h2 <= 32; h2++) {
 				struct route4_filter *f;
 
-				while ((f = b->ht[h2]) != NULL) {
-					b->ht[h2] = f->next;
-					route4_delete_filter(tp, f);
+				while ((f = rtnl_dereference(b->ht[h2])) != NULL) {
+					rcu_assign_pointer(b->ht[h2], rtnl_dereference(f->next));
+					call_rcu(&f->rcu, route4_delete_filter);
 				}
 			}
-			kfree(b);
+			rcu_assign_pointer(head->table[h1], NULL);
+			kfree_rcu(b, rcu);
 		}
 	}
-	kfree(head);
+	rcu_assign_pointer(tp->root, NULL);
+	kfree_rcu(head, rcu);
 }
 
 static int route4_delete(struct tcf_proto *tp, unsigned long arg)
 {
-	struct route4_head *head = (struct route4_head *)tp->root;
-	struct route4_filter **fp, *f = (struct route4_filter *)arg;
+	struct route4_head *head = (struct route4_head *)rtnl_dereference(tp->root);
+	struct route4_filter __rcu **fp;
+	struct route4_filter *nf, *f = (struct route4_filter *)arg;
 	unsigned int h = 0;
 	struct route4_bucket *b;
 	int i;
@@ -296,27 +320,34 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg)
 	h = f->handle;
 	b = f->bkt;
 
-	for (fp = &b->ht[from_hash(h >> 16)]; *fp; fp = &(*fp)->next) {
-		if (*fp == f) {
-			tcf_tree_lock(tp);
+	fp = &b->ht[from_hash(h >> 16)];
+	for (nf = rtnl_dereference(*fp); nf;
+	     fp = &nf->next, nf = rtnl_dereference(*fp)) {
+		if (nf == f) {
+			/* unlink it */
 			*fp = f->next;
-			tcf_tree_unlock(tp);
 
-			route4_reset_fastmap(tp->q, head, f->id);
-			route4_delete_filter(tp, f);
+			/* Remove any fastmap lookups that might ref filter
+			 * notice we unlink'd the filter so we can't get it
+			 * back in the fastmap.
+			 */
+			route4_reset_fastmap(head);
 
-			/* Strip tree */
+			/* Delete it */
+			call_rcu(&f->rcu, route4_delete_filter);
 
-			for (i = 0; i <= 32; i++)
-				if (b->ht[i])
+			/* Strip RTNL protected tree */
+			for (i = 0; i <= 32; i++) {
+				struct route4_filter *rt = rtnl_dereference(b->ht[i]);
+
+				if (rt)
 					return 0;
+			}
 
 			/* OK, session has no flows */
-			tcf_tree_lock(tp);
-			head->table[to_hash(h)] = NULL;
-			tcf_tree_unlock(tp);
+			rcu_assign_pointer(head->table[to_hash(h)], NULL);
+			kfree_rcu(b, rcu);
 
-			kfree(b);
 			return 0;
 		}
 	}
@@ -379,26 +410,25 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp,
 	}
 
 	h1 = to_hash(nhandle);
-	b = head->table[h1];
+	b = rtnl_dereference(head->table[h1]);
 	if (!b) {
 		err = -ENOBUFS;
 		b = kzalloc(sizeof(struct route4_bucket), GFP_KERNEL);
 		if (b == NULL)
 			goto errout;
 
-		tcf_tree_lock(tp);
-		head->table[h1] = b;
-		tcf_tree_unlock(tp);
+		rcu_assign_pointer(head->table[h1], b);
 	} else {
 		unsigned int h2 = from_hash(nhandle >> 16);
 
 		err = -EEXIST;
-		for (fp = b->ht[h2]; fp; fp = fp->next)
+		for (fp = rtnl_dereference(b->ht[h2]);
+		     fp;
+		     fp = rtnl_dereference(fp->next))
 			if (fp->handle == f->handle)
 				goto errout;
 	}
 
-	tcf_tree_lock(tp);
 	if (tb[TCA_ROUTE4_TO])
 		f->id = to;
 
@@ -409,7 +439,7 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp,
 
 	f->handle = nhandle;
 	f->bkt = b;
-	tcf_tree_unlock(tp);
+	f->tp = tp;
 
 	if (tb[TCA_ROUTE4_CLASSID]) {
 		f->res.classid = nla_get_u32(tb[TCA_ROUTE4_CLASSID]);
@@ -430,14 +460,15 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
 		       struct nlattr **tca,
 		       unsigned long *arg)
 {
-	struct route4_head *head = tp->root;
-	struct route4_filter *f, *f1, **fp;
+	struct route4_head *head = rtnl_dereference(tp->root);
+	struct route4_filter __rcu **fp;
+	struct route4_filter *fold, *f1, *pfp, *f = NULL;
 	struct route4_bucket *b;
 	struct nlattr *opt = tca[TCA_OPTIONS];
 	struct nlattr *tb[TCA_ROUTE4_MAX + 1];
 	unsigned int h, th;
-	u32 old_handle = 0;
 	int err;
+	bool new = true;
 
 	if (opt == NULL)
 		return handle ? -EINVAL : 0;
@@ -446,70 +477,69 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
 	if (err < 0)
 		return err;
 
-	f = (struct route4_filter *)*arg;
-	if (f) {
-		if (f->handle != handle && handle)
+	fold = (struct route4_filter *)*arg;
+	if (fold && handle && fold->handle != handle)
 			return -EINVAL;
-
-		if (f->bkt)
-			old_handle = f->handle;
-
-		err = route4_set_parms(net, tp, base, f, handle, head, tb,
-			tca[TCA_RATE], 0);
-		if (err < 0)
-			return err;
-
-		goto reinsert;
-	}
-
 	err = -ENOBUFS;
 	if (head == NULL) {
 		head = kzalloc(sizeof(struct route4_head), GFP_KERNEL);
 		if (head == NULL)
 			goto errout;
-
-		tcf_tree_lock(tp);
-		tp->root = head;
-		tcf_tree_unlock(tp);
+		rcu_assign_pointer(tp->root, head);
 	}
 
 	f = kzalloc(sizeof(struct route4_filter), GFP_KERNEL);
-	if (f == NULL)
+	if (!f)
 		goto errout;
 
 	tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
+	if (fold) {
+		f->id = fold->id;
+		f->iif = fold->iif;
+		f->res = fold->res;
+		f->handle = fold->handle;
+
+		f->tp = fold->tp;
+		f->bkt = fold->bkt;
+		new = false;
+	}
+
 	err = route4_set_parms(net, tp, base, f, handle, head, tb,
-		tca[TCA_RATE], 1);
+			       tca[TCA_RATE], new);
 	if (err < 0)
 		goto errout;
 
-reinsert:
 	h = from_hash(f->handle >> 16);
-	for (fp = &f->bkt->ht[h]; (f1 = *fp) != NULL; fp = &f1->next)
+	fp = &f->bkt->ht[h];
+	for (pfp = rtnl_dereference(*fp);
+	     (f1 = rtnl_dereference(*fp)) != NULL;
+	     fp = &f1->next)
 		if (f->handle < f1->handle)
 			break;
 
-	f->next = f1;
-	tcf_tree_lock(tp);
-	*fp = f;
+	rcu_assign_pointer(f->next, f1);
+	rcu_assign_pointer(*fp, f);
 
-	if (old_handle && f->handle != old_handle) {
-		th = to_hash(old_handle);
-		h = from_hash(old_handle >> 16);
-		b = head->table[th];
+	if (fold->handle && f->handle != fold->handle) {
+		th = to_hash(fold->handle);
+		h = from_hash(fold->handle >> 16);
+		b = rtnl_dereference(head->table[th]);
 		if (b) {
-			for (fp = &b->ht[h]; *fp; fp = &(*fp)->next) {
-				if (*fp == f) {
+			fp = &b->ht[h];
+			for (pfp = rtnl_dereference(*fp); pfp;
+			     fp = &pfp->next, pfp = rtnl_dereference(*fp)) {
+				if (pfp == f) {
 					*fp = f->next;
 					break;
 				}
 			}
 		}
 	}
-	tcf_tree_unlock(tp);
 
-	route4_reset_fastmap(tp->q, head, f->id);
+	route4_reset_fastmap(head);
 	*arg = (unsigned long)f;
+	if (fold)
+		call_rcu(&fold->rcu, route4_delete_filter);
 	return 0;
 
 errout:
@@ -519,7 +549,7 @@ errout:
 
 static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
 {
-	struct route4_head *head = tp->root;
+	struct route4_head *head = rtnl_dereference(tp->root);
 	unsigned int h, h1;
 
 	if (head == NULL)
@@ -529,13 +559,15 @@ static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
 		return;
 
 	for (h = 0; h <= 256; h++) {
-		struct route4_bucket *b = head->table[h];
+		struct route4_bucket *b = rtnl_dereference(head->table[h]);
 
 		if (b) {
 			for (h1 = 0; h1 <= 32; h1++) {
 				struct route4_filter *f;
 
-				for (f = b->ht[h1]; f; f = f->next) {
+				for (f = rtnl_dereference(b->ht[h1]);
+				     f;
+				     f = rtnl_dereference(f->next)) {
 					if (arg->count < arg->skip) {
 						arg->count++;
 						continue;

^ permalink raw reply related

* [RFC PATCH 06/12] net: sched: fw use RCU
From: John Fastabend @ 2014-01-10  9:39 UTC (permalink / raw)
  To: xiyou.wangcong, jhs, eric.dumazet; +Cc: netdev, davem
In-Reply-To: <20140110092041.7193.5952.stgit@nitbit.x32>

RCU'ify fw classifier.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 net/sched/cls_fw.c |  112 ++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 77 insertions(+), 35 deletions(-)

diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 3f9cece..6f7680f 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -32,18 +32,21 @@
 #define HTSIZE (PAGE_SIZE/sizeof(struct fw_filter *))
 
 struct fw_head {
-	struct fw_filter *ht[HTSIZE];
+	struct rcu_head	rcu;
+	struct fw_filter __rcu *ht[HTSIZE];
 	u32 mask;
 };
 
 struct fw_filter {
-	struct fw_filter	*next;
+	struct fw_filter __rcu	*next;
+	struct rcu_head		rcu;
 	u32			id;
 	struct tcf_result	res;
 #ifdef CONFIG_NET_CLS_IND
 	char			indev[IFNAMSIZ];
 #endif /* CONFIG_NET_CLS_IND */
 	struct tcf_exts		exts;
+	struct tcf_proto	*tp;
 };
 
 static inline int fw_hash(u32 handle)
@@ -75,14 +78,16 @@ static inline int fw_hash(u32 handle)
 static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 			  struct tcf_result *res)
 {
-	struct fw_head *head = (struct fw_head *)tp->root;
+	struct fw_head *head = (struct fw_head *)rcu_dereference_bh(tp->root);
 	struct fw_filter *f;
 	int r;
 	u32 id = skb->mark;
 
 	if (head != NULL) {
 		id &= head->mask;
-		for (f = head->ht[fw_hash(id)]; f; f = f->next) {
+
+		for (f = rcu_dereference_bh(head->ht[fw_hash(id)]); f;
+		     f = rcu_dereference_bh(f->next)) {
 			if (f->id == id) {
 				*res = f->res;
 #ifdef CONFIG_NET_CLS_IND
@@ -111,13 +116,14 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 
 static unsigned long fw_get(struct tcf_proto *tp, u32 handle)
 {
-	struct fw_head *head = (struct fw_head *)tp->root;
+	struct fw_head *head = (struct fw_head *)rtnl_dereference(tp->root);
 	struct fw_filter *f;
 
 	if (head == NULL)
 		return 0;
 
-	for (f = head->ht[fw_hash(handle)]; f; f = f->next) {
+	f = rtnl_dereference(head->ht[fw_hash(handle)]);
+	for (; f; f = rtnl_dereference(f->next)) {
 		if (f->id == handle)
 			return (unsigned long)f;
 	}
@@ -133,8 +139,11 @@ static int fw_init(struct tcf_proto *tp)
 	return 0;
 }
 
-static void fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f)
+static void fw_delete_filter(struct rcu_head *head)
 {
+	struct fw_filter *f = container_of(head, struct fw_filter, rcu);
+	struct tcf_proto *tp = f->tp;
+
 	tcf_unbind_filter(tp, &f->res);
 	tcf_exts_destroy(tp, &f->exts);
 	kfree(f);
@@ -142,7 +151,7 @@ static void fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f)
 
 static void fw_destroy(struct tcf_proto *tp)
 {
-	struct fw_head *head = tp->root;
+	struct fw_head *head = rtnl_dereference(tp->root);
 	struct fw_filter *f;
 	int h;
 
@@ -150,29 +159,32 @@ static void fw_destroy(struct tcf_proto *tp)
 		return;
 
 	for (h = 0; h < HTSIZE; h++) {
-		while ((f = head->ht[h]) != NULL) {
-			head->ht[h] = f->next;
-			fw_delete_filter(tp, f);
+		while ((f = rtnl_dereference(head->ht[h])) != NULL) {
+			rcu_assign_pointer(head->ht[h],
+					   rtnl_dereference(f->next));
+			call_rcu(&f->rcu, fw_delete_filter);
 		}
 	}
-	kfree(head);
+	rcu_assign_pointer(tp->root, NULL);
+	kfree_rcu(head, rcu);
 }
 
 static int fw_delete(struct tcf_proto *tp, unsigned long arg)
 {
-	struct fw_head *head = (struct fw_head *)tp->root;
-	struct fw_filter *f = (struct fw_filter *)arg;
-	struct fw_filter **fp;
+	struct fw_head *head = (struct fw_head *)rtnl_dereference(tp->root);
+	struct fw_filter *pfp, *f = (struct fw_filter *)arg;
+	struct fw_filter __rcu **fp;
 
 	if (head == NULL || f == NULL)
 		goto out;
 
-	for (fp = &head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) {
-		if (*fp == f) {
-			tcf_tree_lock(tp);
-			*fp = f->next;
-			tcf_tree_unlock(tp);
-			fw_delete_filter(tp, f);
+	fp = &head->ht[fw_hash(f->id)];
+
+	for (pfp = rtnl_dereference(*fp); pfp;
+	     fp = &pfp->next, pfp = rtnl_dereference(*fp)) {
+		if (pfp == f) {
+			rcu_assign_pointer(*fp, rtnl_dereference(f->next));
+			call_rcu(&f->rcu, fw_delete_filter);
 			return 0;
 		}
 	}
@@ -190,7 +202,7 @@ static int
 fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f,
 	struct nlattr **tb, struct nlattr **tca, unsigned long base)
 {
-	struct fw_head *head = (struct fw_head *)tp->root;
+	struct fw_head *head = (struct fw_head *)rtnl_dereference(tp->root);
 	struct tcf_exts e;
 	u32 mask;
 	int err;
@@ -235,7 +247,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
 		     struct nlattr **tca,
 		     unsigned long *arg)
 {
-	struct fw_head *head = (struct fw_head *)tp->root;
+	struct fw_head *head = (struct fw_head *)rtnl_dereference(tp->root);
 	struct fw_filter *f = (struct fw_filter *) *arg;
 	struct nlattr *opt = tca[TCA_OPTIONS];
 	struct nlattr *tb[TCA_FW_MAX + 1];
@@ -248,10 +260,42 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
 	if (err < 0)
 		return err;
 
-	if (f != NULL) {
+	if (f) {
+		struct fw_filter *pfp, *fnew;
+		struct fw_filter __rcu **fp;
+
 		if (f->id != handle && handle)
 			return -EINVAL;
-		return fw_change_attrs(net, tp, f, tb, tca, base);
+
+		fnew = kzalloc(sizeof(struct fw_filter), GFP_KERNEL);
+		if (!fnew)
+			return -ENOBUFS;
+
+		fnew->id = f->id;
+		fnew->res = f->res;
+#ifdef CONFIG_NET_CLS_IND
+		strcpy(fnew->indev, f->indev);
+#endif /* CONFIG_NET_CLS_IND */
+		fnew->tp = f->tp;
+
+		err = fw_change_attrs(net, tp, fnew, tb, tca, base);
+		if (err < 0) {
+			kfree(fnew);
+			return err;
+		}
+
+		fp = &head->ht[fw_hash(fnew->id)];
+		for (pfp = rtnl_dereference(*fp); pfp;
+		     fp = &pfp->next, pfp = rtnl_dereference(*fp))
+			if (pfp == f)
+				break;
+
+		rcu_assign_pointer(fnew->next, rtnl_dereference(pfp->next));
+		rcu_assign_pointer(*fp, fnew);
+		call_rcu(&f->rcu, fw_delete_filter);
+
+		*arg = (unsigned long)fnew;
+		return err;
 	}
 
 	if (!handle)
@@ -267,9 +311,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
 			return -ENOBUFS;
 		head->mask = mask;
 
-		tcf_tree_lock(tp);
-		tp->root = head;
-		tcf_tree_unlock(tp);
+		rcu_assign_pointer(tp->root, head);
 	}
 
 	f = kzalloc(sizeof(struct fw_filter), GFP_KERNEL);
@@ -278,15 +320,14 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
 
 	tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE);
 	f->id = handle;
+	f->tp = tp;
 
 	err = fw_change_attrs(net, tp, f, tb, tca, base);
 	if (err < 0)
 		goto errout;
 
-	f->next = head->ht[fw_hash(handle)];
-	tcf_tree_lock(tp);
-	head->ht[fw_hash(handle)] = f;
-	tcf_tree_unlock(tp);
+	rcu_assign_pointer(f->next, head->ht[fw_hash(handle)]);
+	rcu_assign_pointer(head->ht[fw_hash(handle)], f);
 
 	*arg = (unsigned long)f;
 	return 0;
@@ -298,7 +339,7 @@ errout:
 
 static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
 {
-	struct fw_head *head = (struct fw_head *)tp->root;
+	struct fw_head *head = (struct fw_head *)rtnl_dereference(tp->root);
 	int h;
 
 	if (head == NULL)
@@ -310,7 +351,8 @@ static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
 	for (h = 0; h < HTSIZE; h++) {
 		struct fw_filter *f;
 
-		for (f = head->ht[h]; f; f = f->next) {
+		for (f = rtnl_dereference(head->ht[h]); f;
+		     f = rtnl_dereference(f->next)) {
 			if (arg->count < arg->skip) {
 				arg->count++;
 				continue;
@@ -327,7 +369,7 @@ static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
 static int fw_dump(struct tcf_proto *tp, unsigned long fh,
 		   struct sk_buff *skb, struct tcmsg *t)
 {
-	struct fw_head *head = (struct fw_head *)tp->root;
+	struct fw_head *head = (struct fw_head *)rtnl_dereference(tp->root);
 	struct fw_filter *f = (struct fw_filter *)fh;
 	unsigned char *b = skb_tail_pointer(skb);
 	struct nlattr *nest;

^ permalink raw reply related

* [RFC PATCH 05/12] net: sched: cls_flow use RCU
From: John Fastabend @ 2014-01-10  9:39 UTC (permalink / raw)
  To: xiyou.wangcong, jhs, eric.dumazet; +Cc: netdev, davem
In-Reply-To: <20140110092041.7193.5952.stgit@nitbit.x32>

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 net/sched/cls_flow.c |  145 +++++++++++++++++++++++++++++---------------------
 1 file changed, 84 insertions(+), 61 deletions(-)

diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index dfd18a5..632bf48 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -34,12 +34,15 @@
 
 struct flow_head {
 	struct list_head	filters;
+	struct rcu_head		rcu;
 };
 
 struct flow_filter {
 	struct list_head	list;
+	struct rcu_head		rcu;
 	struct tcf_exts		exts;
 	struct tcf_ematch_tree	ematches;
+	struct tcf_proto	*tp;
 	struct timer_list	perturb_timer;
 	u32			perturb_period;
 	u32			handle;
@@ -276,14 +279,14 @@ static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow)
 static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 			 struct tcf_result *res)
 {
-	struct flow_head *head = tp->root;
+	struct flow_head *head = rcu_dereference_bh(tp->root);
 	struct flow_filter *f;
 	u32 keymask;
 	u32 classid;
 	unsigned int n, key;
 	int r;
 
-	list_for_each_entry(f, &head->filters, list) {
+	list_for_each_entry_rcu(f, &head->filters, list) {
 		u32 keys[FLOW_KEY_MAX + 1];
 		struct flow_keys flow_keys;
 
@@ -346,13 +349,23 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = {
 	[TCA_FLOW_PERTURB]	= { .type = NLA_U32 },
 };
 
+static void flow_destroy_filter(struct rcu_head *head)
+{
+	struct flow_filter *f = container_of(head, struct flow_filter, rcu);
+
+	del_timer_sync(&f->perturb_timer);
+	tcf_exts_destroy(f->tp, &f->exts);
+	tcf_em_tree_destroy(f->tp, &f->ematches);
+	kfree(f);
+}
+
 static int flow_change(struct net *net, struct sk_buff *in_skb,
 		       struct tcf_proto *tp, unsigned long base,
 		       u32 handle, struct nlattr **tca,
 		       unsigned long *arg)
 {
-	struct flow_head *head = tp->root;
-	struct flow_filter *f;
+	struct flow_head *head = rtnl_dereference(tp->root);
+	struct flow_filter *fold, *fnew;
 	struct nlattr *opt = tca[TCA_OPTIONS];
 	struct nlattr *tb[TCA_FLOW_MAX + 1];
 	struct tcf_exts e;
@@ -401,20 +414,42 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
 	if (err < 0)
 		goto err1;
 
-	f = (struct flow_filter *)*arg;
-	if (f != NULL) {
+	err = -ENOBUFS;
+	fnew = kzalloc(sizeof(*fnew), GFP_KERNEL);
+	if (!fnew)
+		goto err2;
+
+	fold = (struct flow_filter *)*arg;
+	if (fold) {
 		err = -EINVAL;
-		if (f->handle != handle && handle)
+		if (fold->handle != handle && handle)
 			goto err2;
 
-		mode = f->mode;
+		/* Copy fold into fnew */
+		fnew->handle = fold->handle;
+		fnew->keymask = fold->keymask;
+		fnew->tp = fold->tp;
+
+		fnew->handle = fold->handle;
+		fnew->nkeys = fold->nkeys;
+		fnew->keymask = fold->keymask;
+		fnew->mode = fold->mode;
+		fnew->mask = fold->mask;
+		fnew->xor = fold->xor;
+		fnew->rshift = fold->rshift;
+		fnew->addend = fold->addend;
+		fnew->divisor = fold->divisor;
+		fnew->baseclass = fold->baseclass;
+		fnew->hashrnd = fold->hashrnd;
+
+		mode = fold->mode;
 		if (tb[TCA_FLOW_MODE])
 			mode = nla_get_u32(tb[TCA_FLOW_MODE]);
 		if (mode != FLOW_MODE_HASH && nkeys > 1)
 			goto err2;
 
 		if (mode == FLOW_MODE_HASH)
-			perturb_period = f->perturb_period;
+			perturb_period = fold->perturb_period;
 		if (tb[TCA_FLOW_PERTURB]) {
 			if (mode != FLOW_MODE_HASH)
 				goto err2;
@@ -444,83 +479,70 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
 		if (TC_H_MIN(baseclass) == 0)
 			baseclass = TC_H_MAKE(baseclass, 1);
 
-		err = -ENOBUFS;
-		f = kzalloc(sizeof(*f), GFP_KERNEL);
-		if (f == NULL)
-			goto err2;
-
-		f->handle = handle;
-		f->mask	  = ~0U;
-		tcf_exts_init(&f->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE);
-
-		get_random_bytes(&f->hashrnd, 4);
-		f->perturb_timer.function = flow_perturbation;
-		f->perturb_timer.data = (unsigned long)f;
-		init_timer_deferrable(&f->perturb_timer);
+		fnew->handle = handle;
+		fnew->mask  = ~0U;
+		fnew->tp = tp;
+		get_random_bytes(&fnew->hashrnd, 4);
+		tcf_exts_init(&fnew->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE);
 	}
 
-	tcf_exts_change(tp, &f->exts, &e);
-	tcf_em_tree_change(tp, &f->ematches, &t);
+	fnew->perturb_timer.function = flow_perturbation;
+	fnew->perturb_timer.data = (unsigned long)fnew;
+	init_timer_deferrable(&fnew->perturb_timer);
 
-	tcf_tree_lock(tp);
+	tcf_exts_change(tp, &fnew->exts, &e);
+	tcf_em_tree_change(tp, &fnew->ematches, &t);
 
 	if (tb[TCA_FLOW_KEYS]) {
-		f->keymask = keymask;
-		f->nkeys   = nkeys;
+		fnew->keymask = keymask;
+		fnew->nkeys   = nkeys;
 	}
 
-	f->mode = mode;
+	fnew->mode = mode;
 
 	if (tb[TCA_FLOW_MASK])
-		f->mask = nla_get_u32(tb[TCA_FLOW_MASK]);
+		fnew->mask = nla_get_u32(tb[TCA_FLOW_MASK]);
 	if (tb[TCA_FLOW_XOR])
-		f->xor = nla_get_u32(tb[TCA_FLOW_XOR]);
+		fnew->xor = nla_get_u32(tb[TCA_FLOW_XOR]);
 	if (tb[TCA_FLOW_RSHIFT])
-		f->rshift = nla_get_u32(tb[TCA_FLOW_RSHIFT]);
+		fnew->rshift = nla_get_u32(tb[TCA_FLOW_RSHIFT]);
 	if (tb[TCA_FLOW_ADDEND])
-		f->addend = nla_get_u32(tb[TCA_FLOW_ADDEND]);
+		fnew->addend = nla_get_u32(tb[TCA_FLOW_ADDEND]);
 
 	if (tb[TCA_FLOW_DIVISOR])
-		f->divisor = nla_get_u32(tb[TCA_FLOW_DIVISOR]);
+		fnew->divisor = nla_get_u32(tb[TCA_FLOW_DIVISOR]);
 	if (baseclass)
-		f->baseclass = baseclass;
+		fnew->baseclass = baseclass;
 
-	f->perturb_period = perturb_period;
-	del_timer(&f->perturb_timer);
+	fnew->perturb_period = perturb_period;
 	if (perturb_period)
-		mod_timer(&f->perturb_timer, jiffies + perturb_period);
+		mod_timer(&fnew->perturb_timer, jiffies + perturb_period);
 
 	if (*arg == 0)
-		list_add_tail(&f->list, &head->filters);
+		list_add_tail_rcu(&fnew->list, &head->filters);
+	else
+		list_replace_rcu(&fnew->list, &fold->list);
 
-	tcf_tree_unlock(tp);
+	*arg = (unsigned long)fnew;
 
-	*arg = (unsigned long)f;
+	if (fold)
+		call_rcu(&fold->rcu, flow_destroy_filter);
 	return 0;
 
 err2:
 	tcf_em_tree_destroy(tp, &t);
+	kfree(fnew);
 err1:
 	tcf_exts_destroy(tp, &e);
 	return err;
 }
 
-static void flow_destroy_filter(struct tcf_proto *tp, struct flow_filter *f)
-{
-	del_timer_sync(&f->perturb_timer);
-	tcf_exts_destroy(tp, &f->exts);
-	tcf_em_tree_destroy(tp, &f->ematches);
-	kfree(f);
-}
-
 static int flow_delete(struct tcf_proto *tp, unsigned long arg)
 {
 	struct flow_filter *f = (struct flow_filter *)arg;
 
-	tcf_tree_lock(tp);
-	list_del(&f->list);
-	tcf_tree_unlock(tp);
-	flow_destroy_filter(tp, f);
+	list_del_rcu(&f->list);
+	call_rcu(&f->rcu, flow_destroy_filter);
 	return 0;
 }
 
@@ -532,28 +554,29 @@ static int flow_init(struct tcf_proto *tp)
 	if (head == NULL)
 		return -ENOBUFS;
 	INIT_LIST_HEAD(&head->filters);
-	tp->root = head;
+	rcu_assign_pointer(tp->root, head);
 	return 0;
 }
 
 static void flow_destroy(struct tcf_proto *tp)
 {
-	struct flow_head *head = tp->root;
+	struct flow_head *head = rtnl_dereference(tp->root);
 	struct flow_filter *f, *next;
 
 	list_for_each_entry_safe(f, next, &head->filters, list) {
-		list_del(&f->list);
-		flow_destroy_filter(tp, f);
+		list_del_rcu(&f->list);
+		call_rcu(&f->rcu, flow_destroy_filter);
 	}
-	kfree(head);
+	rcu_assign_pointer(tp->root, NULL);
+	kfree_rcu(head, rcu);
 }
 
 static unsigned long flow_get(struct tcf_proto *tp, u32 handle)
 {
-	struct flow_head *head = tp->root;
+	struct flow_head *head = rtnl_dereference(tp->root);
 	struct flow_filter *f;
 
-	list_for_each_entry(f, &head->filters, list)
+	list_for_each_entry_rcu(f, &head->filters, list)
 		if (f->handle == handle)
 			return (unsigned long)f;
 	return 0;
@@ -626,10 +649,10 @@ nla_put_failure:
 
 static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg)
 {
-	struct flow_head *head = tp->root;
+	struct flow_head *head = rtnl_dereference(tp->root);
 	struct flow_filter *f;
 
-	list_for_each_entry(f, &head->filters, list) {
+	list_for_each_entry_rcu(f, &head->filters, list) {
 		if (arg->count < arg->skip)
 			goto skip;
 		if (arg->fn(tp, (unsigned long)f, arg) < 0) {

^ permalink raw reply related

* [RFC PATCH 04/12] net: sched: cls_cgroup use RCU
From: John Fastabend @ 2014-01-10  9:38 UTC (permalink / raw)
  To: xiyou.wangcong, jhs, eric.dumazet; +Cc: netdev, davem
In-Reply-To: <20140110092041.7193.5952.stgit@nitbit.x32>

Make cgroup classifier safe for RCU.

Also drops the calls in the classify routine that were doing a
rcu_read_lock()/rcu_read_unlock(). If the rcu_read_lock() isn't held
entering this routine we have issues with deleting the classifier
chain so remove the unnecessary rcu_read_lock()/rcu_read_unlock()
pair noting all paths AFAIK hold rcu_read_lock.

If there is a case where classify is called without the rcu read lock
then an rcu splat will occur and we can correct it.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 net/sched/cls_cgroup.c |   65 ++++++++++++++++++++++++++++++------------------
 1 file changed, 41 insertions(+), 24 deletions(-)

diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index f9d21258..7644912 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -118,17 +118,17 @@ struct cls_cgroup_head {
 	u32			handle;
 	struct tcf_exts		exts;
 	struct tcf_ematch_tree	ematches;
+	struct rcu_head		rcu;
+	struct tcf_proto	*tp;
 };
 
 static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 			       struct tcf_result *res)
 {
-	struct cls_cgroup_head *head = tp->root;
+	struct cls_cgroup_head *head = rcu_dereference_bh(tp->root);
 	u32 classid;
 
-	rcu_read_lock();
 	classid = task_cls_state(current)->classid;
-	rcu_read_unlock();
 
 	/*
 	 * Due to the nature of the classifier it is required to ignore all
@@ -176,13 +176,27 @@ static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = {
 	[TCA_CGROUP_EMATCHES]	= { .type = NLA_NESTED },
 };
 
+static void cls_cgroup_destroy_rcu(struct rcu_head *root)
+{
+	struct cls_cgroup_head *head = container_of(root,
+						    struct cls_cgroup_head,
+						    rcu);
+
+	if (head) {
+		tcf_exts_destroy(head->tp, &head->exts);
+		tcf_em_tree_destroy(head->tp, &head->ematches);
+		kfree(head);
+	}
+}
+
 static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
 			     struct tcf_proto *tp, unsigned long base,
 			     u32 handle, struct nlattr **tca,
 			     unsigned long *arg)
 {
 	struct nlattr *tb[TCA_CGROUP_MAX + 1];
-	struct cls_cgroup_head *head = tp->root;
+	struct cls_cgroup_head *head = rtnl_dereference(tp->root);
+	struct cls_cgroup_head *new;
 	struct tcf_ematch_tree t;
 	struct tcf_exts e;
 	int err;
@@ -190,25 +204,24 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
 	if (!tca[TCA_OPTIONS])
 		return -EINVAL;
 
-	if (head == NULL) {
-		if (!handle)
-			return -EINVAL;
+	if (!head && !handle)
+		return -EINVAL;
 
-		head = kzalloc(sizeof(*head), GFP_KERNEL);
-		if (head == NULL)
-			return -ENOBUFS;
+	if (head && handle != head->handle)
+		return -ENOENT;
 
-		tcf_exts_init(&head->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
-		head->handle = handle;
+	new = kzalloc(sizeof(*head), GFP_KERNEL);
+	if (!new)
+		return -ENOBUFS;
 
-		tcf_tree_lock(tp);
-		tp->root = head;
-		tcf_tree_unlock(tp);
+	if (head) {
+		new->handle = head->handle;
+	} else {
+		tcf_exts_init(&new->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
+		new->handle = handle;
 	}
 
-	if (handle != head->handle)
-		return -ENOENT;
-
+	new->tp = tp;
 	err = nla_parse_nested(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS],
 			       cgroup_policy);
 	if (err < 0)
@@ -223,20 +236,24 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
 	if (err < 0)
 		return err;
 
-	tcf_exts_change(tp, &head->exts, &e);
-	tcf_em_tree_change(tp, &head->ematches, &t);
+	tcf_exts_change(tp, &new->exts, &e);
+	tcf_em_tree_change(tp, &new->ematches, &t);
 
+	rcu_assign_pointer(tp->root, new);
+	if (head)
+		call_rcu(&head->rcu, cls_cgroup_destroy_rcu);
 	return 0;
 }
 
 static void cls_cgroup_destroy(struct tcf_proto *tp)
 {
-	struct cls_cgroup_head *head = tp->root;
+	struct cls_cgroup_head *head = rtnl_dereference(tp->root);
 
 	if (head) {
 		tcf_exts_destroy(tp, &head->exts);
 		tcf_em_tree_destroy(tp, &head->ematches);
-		kfree(head);
+		rcu_assign_pointer(tp->root, NULL);
+		kfree_rcu(head, rcu);
 	}
 }
 
@@ -247,7 +264,7 @@ static int cls_cgroup_delete(struct tcf_proto *tp, unsigned long arg)
 
 static void cls_cgroup_walk(struct tcf_proto *tp, struct tcf_walker *arg)
 {
-	struct cls_cgroup_head *head = tp->root;
+	struct cls_cgroup_head *head = rtnl_dereference(tp->root);
 
 	if (arg->count < arg->skip)
 		goto skip;
@@ -263,7 +280,7 @@ skip:
 static int cls_cgroup_dump(struct tcf_proto *tp, unsigned long fh,
 			   struct sk_buff *skb, struct tcmsg *t)
 {
-	struct cls_cgroup_head *head = tp->root;
+	struct cls_cgroup_head *head = rtnl_dereference(tp->root);
 	unsigned char *b = skb_tail_pointer(skb);
 	struct nlattr *nest;
 

^ permalink raw reply related

* [RFC PATCH 03/12] net: sched: cls_basic use RCU
From: John Fastabend @ 2014-01-10  9:38 UTC (permalink / raw)
  To: xiyou.wangcong, jhs, eric.dumazet; +Cc: netdev, davem
In-Reply-To: <20140110092041.7193.5952.stgit@nitbit.x32>

Enable basic classifier for RCU.

Dereferencing tp->root may look a bit strange here but it is needed
by my accounting because it is allocated at init time and needs to
be kfree'd at destroy time. However because it may be referenced in
the classify() path we must wait an RCU grace period before free'ing
it. We use kfree_rcu() and rcu_ APIs to enforce this. This pattern
is used in all the classifiers.

Also the hgenerator can be incremented without concern because it
is always incremented under RTNL.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 net/sched/cls_basic.c |   82 ++++++++++++++++++++++++++++---------------------
 1 file changed, 47 insertions(+), 35 deletions(-)

diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index b655203..47c1da2 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -24,6 +24,7 @@
 struct basic_head {
 	u32			hgenerator;
 	struct list_head	flist;
+	struct rcu_head		rcu;
 };
 
 struct basic_filter {
@@ -31,17 +32,20 @@ struct basic_filter {
 	struct tcf_exts		exts;
 	struct tcf_ematch_tree	ematches;
 	struct tcf_result	res;
+	struct tcf_proto	*tp;
 	struct list_head	link;
+	struct rcu_head		rcu;
 };
 
 static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 			  struct tcf_result *res)
 {
 	int r;
-	struct basic_head *head = (struct basic_head *) tp->root;
+	struct basic_head *head =
+			(struct basic_head *) rcu_dereference_bh(tp->root);
 	struct basic_filter *f;
 
-	list_for_each_entry(f, &head->flist, link) {
+	list_for_each_entry_rcu(f, &head->flist, link) {
 		if (!tcf_em_tree_match(skb, &f->ematches, NULL))
 			continue;
 		*res = f->res;
@@ -56,7 +60,8 @@ static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 static unsigned long basic_get(struct tcf_proto *tp, u32 handle)
 {
 	unsigned long l = 0UL;
-	struct basic_head *head = (struct basic_head *) tp->root;
+	struct basic_head *head =
+			(struct basic_head *) rtnl_dereference(tp->root);
 	struct basic_filter *f;
 
 	if (head == NULL)
@@ -81,12 +86,15 @@ static int basic_init(struct tcf_proto *tp)
 	if (head == NULL)
 		return -ENOBUFS;
 	INIT_LIST_HEAD(&head->flist);
-	tp->root = head;
+	rcu_assign_pointer(tp->root, head);
 	return 0;
 }
 
-static void basic_delete_filter(struct tcf_proto *tp, struct basic_filter *f)
+static void basic_delete_filter(struct rcu_head *head)
 {
+	struct basic_filter *f = container_of(head, struct basic_filter, rcu);
+	struct tcf_proto *tp = f->tp;
+
 	tcf_unbind_filter(tp, &f->res);
 	tcf_exts_destroy(tp, &f->exts);
 	tcf_em_tree_destroy(tp, &f->ematches);
@@ -95,27 +103,26 @@ static void basic_delete_filter(struct tcf_proto *tp, struct basic_filter *f)
 
 static void basic_destroy(struct tcf_proto *tp)
 {
-	struct basic_head *head = tp->root;
+	struct basic_head *head = rtnl_dereference(tp->root);
 	struct basic_filter *f, *n;
 
 	list_for_each_entry_safe(f, n, &head->flist, link) {
-		list_del(&f->link);
-		basic_delete_filter(tp, f);
+		list_del_rcu(&f->link);
+		call_rcu(&f->rcu, basic_delete_filter);
 	}
-	kfree(head);
+	rcu_assign_pointer(tp->root, NULL);
+	kfree_rcu(head, rcu);
 }
 
 static int basic_delete(struct tcf_proto *tp, unsigned long arg)
 {
-	struct basic_head *head = (struct basic_head *) tp->root;
+	struct basic_head *head = rtnl_dereference(tp->root);
 	struct basic_filter *t, *f = (struct basic_filter *) arg;
 
 	list_for_each_entry(t, &head->flist, link)
 		if (t == f) {
-			tcf_tree_lock(tp);
-			list_del(&t->link);
-			tcf_tree_unlock(tp);
-			basic_delete_filter(tp, t);
+			list_del_rcu(&t->link);
+			call_rcu(&t->rcu, basic_delete_filter);
 			return 0;
 		}
 
@@ -152,6 +159,7 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp,
 
 	tcf_exts_change(tp, &f->exts, &e);
 	tcf_em_tree_change(tp, &f->ematches, &t);
+	f->tp = tp;
 
 	return 0;
 errout:
@@ -164,9 +172,10 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
 			struct nlattr **tca, unsigned long *arg)
 {
 	int err;
-	struct basic_head *head = (struct basic_head *) tp->root;
+	struct basic_head *head = rtnl_dereference(tp->root);
 	struct nlattr *tb[TCA_BASIC_MAX + 1];
-	struct basic_filter *f = (struct basic_filter *) *arg;
+	struct basic_filter *fold = (struct basic_filter *) *arg;
+	struct basic_filter *fnew;
 
 	if (tca[TCA_OPTIONS] == NULL)
 		return -EINVAL;
@@ -176,22 +185,23 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
 	if (err < 0)
 		return err;
 
-	if (f != NULL) {
-		if (handle && f->handle != handle)
+	if (fold != NULL) {
+		if (handle && fold->handle != handle)
 			return -EINVAL;
-		return basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE]);
 	}
 
 	err = -ENOBUFS;
-	f = kzalloc(sizeof(*f), GFP_KERNEL);
-	if (f == NULL)
+	fnew = kzalloc(sizeof(*fnew), GFP_KERNEL);
+	if (fnew == NULL)
 		goto errout;
 
-	tcf_exts_init(&f->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE);
+	tcf_exts_init(&fnew->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE);
 	err = -EINVAL;
-	if (handle)
-		f->handle = handle;
-	else {
+	if (handle) {
+		fnew->handle = handle;
+	} else if (fold) {
+		fnew->handle = fold->handle;
+	} else {
 		unsigned int i = 0x80000000;
 		do {
 			if (++head->hgenerator == 0x7FFFFFFF)
@@ -203,29 +213,31 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
 			goto errout;
 		}
 
-		f->handle = head->hgenerator;
+		fnew->handle = head->hgenerator;
 	}
 
-	err = basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE]);
+	err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE]);
 	if (err < 0)
 		goto errout;
 
-	tcf_tree_lock(tp);
-	list_add(&f->link, &head->flist);
-	tcf_tree_unlock(tp);
-	*arg = (unsigned long) f;
+	*arg = (unsigned long) fnew;
+
+	if (fold) {
+		list_replace_rcu(&fold->link, &fnew->link);
+		call_rcu(&fold->rcu, basic_delete_filter);
+	} else {
+		list_add_rcu(&fnew->link, &head->flist);
+	}
 
 	return 0;
 errout:
-	if (*arg == 0UL && f)
-		kfree(f);
-
+	kfree(fnew);
 	return err;
 }
 
 static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg)
 {
-	struct basic_head *head = (struct basic_head *) tp->root;
+	struct basic_head *head = rtnl_dereference(tp->root);
 	struct basic_filter *f;
 
 	list_for_each_entry(f, &head->flist, link) {

^ permalink raw reply related

* [RFC PATCH 02/12] net: rcu-ify tcf_proto
From: John Fastabend @ 2014-01-10  9:37 UTC (permalink / raw)
  To: xiyou.wangcong, jhs, eric.dumazet; +Cc: netdev, davem
In-Reply-To: <20140110092041.7193.5952.stgit@nitbit.x32>

rcu'ify tcf_proto this allows calling tc_classify() without holding
any locks. Updaters are protected by RTNL.

This patch prepares the core net_sched infrastracture for running
the classifier/action chains without holding the qdisc lock however
it does nothing to ensure cls_xxx and act_xxx types also work without
locking. Additional patches are required to address the fall out.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 include/net/sch_generic.h |    5 +++--
 net/sched/cls_api.c       |   29 ++++++++++++++---------------
 net/sched/sch_api.c       |    6 +++---
 net/sched/sch_atm.c       |   30 +++++++++++++++++++-----------
 net/sched/sch_cbq.c       |   21 +++++++++++++++------
 net/sched/sch_choke.c     |   18 +++++++++++++-----
 net/sched/sch_drr.c       |   10 +++++++---
 net/sched/sch_dsmark.c    |    8 +++++---
 net/sched/sch_fq_codel.c  |   11 +++++++----
 net/sched/sch_hfsc.c      |   17 +++++++++++------
 net/sched/sch_htb.c       |   23 +++++++++++++++--------
 net/sched/sch_ingress.c   |    8 +++++---
 net/sched/sch_multiq.c    |    8 +++++---
 net/sched/sch_prio.c      |   11 +++++++----
 net/sched/sch_qfq.c       |    9 ++++++---
 net/sched/sch_sfb.c       |   15 +++++++++------
 net/sched/sch_sfq.c       |   11 +++++++----
 17 files changed, 151 insertions(+), 89 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 3e10a2a..11b878b 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -212,8 +212,8 @@ struct tcf_proto_ops {
 
 struct tcf_proto {
 	/* Fast access part */
-	struct tcf_proto	*next;
-	void			*root;
+	struct tcf_proto __rcu	*next;
+	void __rcu		*root;
 	int			(*classify)(struct sk_buff *,
 					    const struct tcf_proto *,
 					    struct tcf_result *);
@@ -225,6 +225,7 @@ struct tcf_proto {
 	struct Qdisc		*q;
 	void			*data;
 	const struct tcf_proto_ops	*ops;
+	struct rcu_head		rcu;
 };
 
 struct qdisc_skb_cb {
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 12e882e..86c3678 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -117,7 +117,6 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
 {
 	struct net *net = sock_net(skb->sk);
 	struct nlattr *tca[TCA_MAX + 1];
-	spinlock_t *root_lock;
 	struct tcmsg *t;
 	u32 protocol;
 	u32 prio;
@@ -125,7 +124,8 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
 	u32 parent;
 	struct net_device *dev;
 	struct Qdisc  *q;
-	struct tcf_proto **back, **chain;
+	struct tcf_proto __rcu **back;
+	struct tcf_proto __rcu **chain;
 	struct tcf_proto *tp;
 	const struct tcf_proto_ops *tp_ops;
 	const struct Qdisc_class_ops *cops;
@@ -196,7 +196,9 @@ replay:
 		goto errout;
 
 	/* Check the chain for existence of proto-tcf with this priority */
-	for (back = chain; (tp = *back) != NULL; back = &tp->next) {
+	for (back = chain;
+	     (tp = rtnl_dereference(*back)) != NULL;
+	     back = &tp->next) {
 		if (tp->prio >= prio) {
 			if (tp->prio == prio) {
 				if (!nprio ||
@@ -208,8 +210,6 @@ replay:
 		}
 	}
 
-	root_lock = qdisc_root_sleeping_lock(q);
-
 	if (tp == NULL) {
 		/* Proto-tcf does not exist, create new one */
 
@@ -258,7 +258,7 @@ replay:
 		}
 		tp->ops = tp_ops;
 		tp->protocol = protocol;
-		tp->prio = nprio ? : TC_H_MAJ(tcf_auto_prio(*back));
+		tp->prio = nprio ? : TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back)));
 		tp->q = q;
 		tp->classify = tp_ops->classify;
 		tp->classid = parent;
@@ -279,9 +279,9 @@ replay:
 
 	if (fh == 0) {
 		if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
-			spin_lock_bh(root_lock);
-			*back = tp->next;
-			spin_unlock_bh(root_lock);
+			struct tcf_proto *next = rtnl_dereference(tp->next);
+
+			rcu_assign_pointer(*back, next);
 
 			tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER);
 			tcf_destroy(tp);
@@ -320,10 +320,8 @@ replay:
 	err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh);
 	if (err == 0) {
 		if (tp_created) {
-			spin_lock_bh(root_lock);
-			tp->next = *back;
-			*back = tp;
-			spin_unlock_bh(root_lock);
+			rcu_assign_pointer(tp->next, rtnl_dereference(*back));
+			rcu_assign_pointer(*back, tp);
 		}
 		tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER);
 	} else {
@@ -417,7 +415,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
 	int s_t;
 	struct net_device *dev;
 	struct Qdisc *q;
-	struct tcf_proto *tp, **chain;
+	struct tcf_proto *tp, __rcu **chain;
 	struct tcmsg *tcm = nlmsg_data(cb->nlh);
 	unsigned long cl = 0;
 	const struct Qdisc_class_ops *cops;
@@ -451,7 +449,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
 
 	s_t = cb->args[0];
 
-	for (tp = *chain, t = 0; tp; tp = tp->next, t++) {
+	for (tp = rtnl_dereference(*chain), t = 0;
+	     tp; tp = rtnl_dereference(tp->next), t++) {
 		if (t < s_t)
 			continue;
 		if (TC_H_MAJ(tcm->tcm_info) &&
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 1313145..774f61c 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1778,7 +1778,7 @@ int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
 	__be16 protocol = skb->protocol;
 	int err;
 
-	for (; tp; tp = tp->next) {
+	for (; tp; tp = rcu_dereference_bh(tp->next)) {
 		if (tp->protocol != protocol &&
 		    tp->protocol != htons(ETH_P_ALL))
 			continue;
@@ -1830,7 +1830,7 @@ void tcf_destroy(struct tcf_proto *tp)
 {
 	tp->ops->destroy(tp);
 	module_put(tp->ops->owner);
-	kfree(tp);
+	kfree_rcu(tp, rcu);
 }
 
 void tcf_destroy_chain(struct tcf_proto **fl)
@@ -1838,7 +1838,7 @@ void tcf_destroy_chain(struct tcf_proto **fl)
 	struct tcf_proto *tp;
 
 	while ((tp = *fl) != NULL) {
-		*fl = tp->next;
+		*fl = rtnl_dereference(tp->next);
 		tcf_destroy(tp);
 	}
 }
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 1f9c314..a4953c6 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -41,7 +41,7 @@
 
 struct atm_flow_data {
 	struct Qdisc		*q;	/* FIFO, TBF, etc. */
-	struct tcf_proto	*filter_list;
+	struct tcf_proto __rcu	*filter_list;
 	struct atm_vcc		*vcc;	/* VCC; NULL if VCC is closed */
 	void			(*old_pop)(struct atm_vcc *vcc,
 					   struct sk_buff *skb); /* chaining */
@@ -134,6 +134,7 @@ static void atm_tc_put(struct Qdisc *sch, unsigned long cl)
 {
 	struct atm_qdisc_data *p = qdisc_priv(sch);
 	struct atm_flow_data *flow = (struct atm_flow_data *)cl;
+	struct tcf_proto *fl;
 
 	pr_debug("atm_tc_put(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
 	if (--flow->ref)
@@ -142,7 +143,10 @@ static void atm_tc_put(struct Qdisc *sch, unsigned long cl)
 	list_del_init(&flow->list);
 	pr_debug("atm_tc_put: qdisc %p\n", flow->q);
 	qdisc_destroy(flow->q);
-	tcf_destroy_chain(&flow->filter_list);
+
+	fl = rtnl_dereference(flow->filter_list);
+	tcf_destroy_chain(&fl);
+
 	if (flow->sock) {
 		pr_debug("atm_tc_put: f_count %ld\n",
 			file_count(flow->sock->file));
@@ -273,7 +277,7 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
 		error = -ENOBUFS;
 		goto err_out;
 	}
-	flow->filter_list = NULL;
+	rcu_assign_pointer(flow->filter_list, NULL);
 	flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
 	if (!flow->q)
 		flow->q = &noop_qdisc;
@@ -311,7 +315,7 @@ static int atm_tc_delete(struct Qdisc *sch, unsigned long arg)
 	pr_debug("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
 	if (list_empty(&flow->list))
 		return -EINVAL;
-	if (flow->filter_list || flow == &p->link)
+	if (rtnl_dereference(flow->filter_list) || flow == &p->link)
 		return -EBUSY;
 	/*
 	 * Reference count must be 2: one for "keepalive" (set at class
@@ -369,11 +373,12 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	flow = NULL;
 	if (TC_H_MAJ(skb->priority) != sch->handle ||
 	    !(flow = (struct atm_flow_data *)atm_tc_get(sch, skb->priority))) {
+		struct tcf_proto *fl;
+
 		list_for_each_entry(flow, &p->flows, list) {
-			if (flow->filter_list) {
-				result = tc_classify_compat(skb,
-							    flow->filter_list,
-							    &res);
+			fl = rcu_dereference_bh(flow->filter_list);
+			if (fl) {
+				result = tc_classify_compat(skb, fl, &res);
 				if (result < 0)
 					continue;
 				flow = (struct atm_flow_data *)res.class;
@@ -544,7 +549,7 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt)
 	if (!p->link.q)
 		p->link.q = &noop_qdisc;
 	pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q);
-	p->link.filter_list = NULL;
+	rcu_assign_pointer(p->link.filter_list, NULL);
 	p->link.vcc = NULL;
 	p->link.sock = NULL;
 	p->link.classid = sch->handle;
@@ -568,10 +573,13 @@ static void atm_tc_destroy(struct Qdisc *sch)
 {
 	struct atm_qdisc_data *p = qdisc_priv(sch);
 	struct atm_flow_data *flow, *tmp;
+	struct tcf_proto *fl;
 
 	pr_debug("atm_tc_destroy(sch %p,[qdisc %p])\n", sch, p);
-	list_for_each_entry(flow, &p->flows, list)
-		tcf_destroy_chain(&flow->filter_list);
+	list_for_each_entry(flow, &p->flows, list) {
+		fl = rtnl_dereference(flow->filter_list);
+		tcf_destroy_chain(&fl);
+	}
 
 	list_for_each_entry_safe(flow, tmp, &p->flows, list) {
 		if (flow->ref > 1)
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 2f80d01..61a51d0 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -133,7 +133,7 @@ struct cbq_class {
 	struct gnet_stats_rate_est64 rate_est;
 	struct tc_cbq_xstats	xstats;
 
-	struct tcf_proto	*filter_list;
+	struct tcf_proto __rcu	*filter_list;
 
 	int			refcnt;
 	int			filters;
@@ -222,6 +222,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 	struct cbq_class **defmap;
 	struct cbq_class *cl = NULL;
 	u32 prio = skb->priority;
+	struct tcf_proto *fl;
 	struct tcf_result res;
 
 	/*
@@ -234,13 +235,15 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	for (;;) {
 		int result = 0;
+
 		defmap = head->defaults;
 
+		fl = rcu_dereference_bh(head->filter_list);
 		/*
 		 * Step 2+n. Apply classifier.
 		 */
-		if (!head->filter_list ||
-		    (result = tc_classify_compat(skb, head->filter_list, &res)) < 0)
+		result = tc_classify_compat(skb, fl, &res);
+		if (!fl || result < 0)
 			goto fallback;
 
 		cl = (void *)res.class;
@@ -1685,10 +1688,13 @@ static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
 {
 	struct cbq_sched_data *q = qdisc_priv(sch);
+	struct tcf_proto *fl;
 
 	WARN_ON(cl->filters);
 
-	tcf_destroy_chain(&cl->filter_list);
+	fl = rtnl_dereference(cl->filter_list);
+	tcf_destroy_chain(&fl);
+
 	qdisc_destroy(cl->q);
 	qdisc_put_rtab(cl->R_tab);
 	gen_kill_estimator(&cl->bstats, &cl->rate_est);
@@ -1701,6 +1707,7 @@ static void cbq_destroy(struct Qdisc *sch)
 	struct cbq_sched_data *q = qdisc_priv(sch);
 	struct hlist_node *next;
 	struct cbq_class *cl;
+	struct tcf_proto *fl;
 	unsigned int h;
 
 #ifdef CONFIG_NET_CLS_ACT
@@ -1712,8 +1719,10 @@ static void cbq_destroy(struct Qdisc *sch)
 	 * be bound to classes which have been destroyed already. --TGR '04
 	 */
 	for (h = 0; h < q->clhash.hashsize; h++) {
-		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode)
-			tcf_destroy_chain(&cl->filter_list);
+		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
+			fl = rtnl_dereference(cl->filter_list);
+			tcf_destroy_chain(&fl);
+		}
 	}
 	for (h = 0; h < q->clhash.hashsize; h++) {
 		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index ddd73cb..60eb0e5 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -58,7 +58,7 @@ struct choke_sched_data {
 
 /* Variables */
 	struct red_vars  vars;
-	struct tcf_proto *filter_list;
+	struct tcf_proto __rcu *filter_list;
 	struct {
 		u32	prob_drop;	/* Early probability drops */
 		u32	prob_mark;	/* Early probability marks */
@@ -200,9 +200,11 @@ static bool choke_classify(struct sk_buff *skb,
 {
 	struct choke_sched_data *q = qdisc_priv(sch);
 	struct tcf_result res;
+	struct tcf_proto *fl;
 	int result;
 
-	result = tc_classify(skb, q->filter_list, &res);
+	fl = rcu_dereference_bh(q->filter_list);
+	result = tc_classify(skb, fl, &res);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
@@ -251,12 +253,14 @@ static bool choke_match_random(const struct choke_sched_data *q,
 			       unsigned int *pidx)
 {
 	struct sk_buff *oskb;
+	struct tcf_proto *fl;
 
 	if (q->head == q->tail)
 		return false;
 
 	oskb = choke_peek_random(q, pidx);
-	if (q->filter_list)
+	fl = rcu_dereference_bh(q->filter_list);
+	if (fl)
 		return choke_get_classid(nskb) == choke_get_classid(oskb);
 
 	return choke_match_flow(oskb, nskb);
@@ -266,9 +270,11 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct choke_sched_data *q = qdisc_priv(sch);
 	const struct red_parms *p = &q->parms;
+	struct tcf_proto *fl;
 	int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 
-	if (q->filter_list) {
+	fl = rcu_dereference_bh(q->filter_list);
+	if (fl) {
 		/* If using external classifiers, get result and record it. */
 		if (!choke_classify(skb, sch, &ret))
 			goto other_drop;	/* Packet was eaten by filter */
@@ -541,8 +547,10 @@ static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 static void choke_destroy(struct Qdisc *sch)
 {
 	struct choke_sched_data *q = qdisc_priv(sch);
+	struct tcf_proto *fl;
 
-	tcf_destroy_chain(&q->filter_list);
+	fl = rtnl_dereference(q->filter_list);
+	tcf_destroy_chain(&fl);
 	choke_free(q->tab);
 }
 
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 8302717..899026f 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -35,7 +35,7 @@ struct drr_class {
 
 struct drr_sched {
 	struct list_head		active;
-	struct tcf_proto		*filter_list;
+	struct tcf_proto __rcu		*filter_list;
 	struct Qdisc_class_hash		clhash;
 };
 
@@ -319,6 +319,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch,
 	struct drr_sched *q = qdisc_priv(sch);
 	struct drr_class *cl;
 	struct tcf_result res;
+	struct tcf_proto *fl;
 	int result;
 
 	if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) {
@@ -328,7 +329,8 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch,
 	}
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	result = tc_classify(skb, q->filter_list, &res);
+	fl = rcu_dereference_bh(q->filter_list);
+	result = tc_classify(skb, fl, &res);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
@@ -467,9 +469,11 @@ static void drr_destroy_qdisc(struct Qdisc *sch)
 	struct drr_sched *q = qdisc_priv(sch);
 	struct drr_class *cl;
 	struct hlist_node *next;
+	struct tcf_proto *fl;
 	unsigned int i;
 
-	tcf_destroy_chain(&q->filter_list);
+	fl = rtnl_dereference(q->filter_list);
+	tcf_destroy_chain(&fl);
 
 	for (i = 0; i < q->clhash.hashsize; i++) {
 		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 49d6ef3..6b84bd4 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -37,7 +37,7 @@
 
 struct dsmark_qdisc_data {
 	struct Qdisc		*q;
-	struct tcf_proto	*filter_list;
+	struct tcf_proto __rcu	*filter_list;
 	u8			*mask;	/* "owns" the array */
 	u8			*value;
 	u16			indices;
@@ -229,7 +229,8 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		skb->tc_index = TC_H_MIN(skb->priority);
 	else {
 		struct tcf_result res;
-		int result = tc_classify(skb, p->filter_list, &res);
+		struct tcf_proto *fl = rcu_dereference_bh(p->filter_list);
+		int result = tc_classify(skb, fl, &res);
 
 		pr_debug("result %d class 0x%04x\n", result, res.classid);
 
@@ -404,10 +405,11 @@ static void dsmark_reset(struct Qdisc *sch)
 static void dsmark_destroy(struct Qdisc *sch)
 {
 	struct dsmark_qdisc_data *p = qdisc_priv(sch);
+	struct tcf_proto *fl = rtnl_dereference(p->filter_list);
 
 	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
 
-	tcf_destroy_chain(&p->filter_list);
+	tcf_destroy_chain(&fl);
 	qdisc_destroy(p->q);
 	kfree(p->mask);
 }
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 5578628..bc47789 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -52,7 +52,7 @@ struct fq_codel_flow {
 }; /* please try to keep this structure <= 64 bytes */
 
 struct fq_codel_sched_data {
-	struct tcf_proto *filter_list;	/* optional external classifier */
+	struct tcf_proto __rcu *filter_list; /* optional external classifier */
 	struct fq_codel_flow *flows;	/* Flows table [flows_cnt] */
 	u32		*backlogs;	/* backlog table [flows_cnt] */
 	u32		flows_cnt;	/* number of flows */
@@ -84,6 +84,7 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
 				      int *qerr)
 {
 	struct fq_codel_sched_data *q = qdisc_priv(sch);
+	struct tcf_proto *filter;
 	struct tcf_result res;
 	int result;
 
@@ -92,11 +93,12 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
 	    TC_H_MIN(skb->priority) <= q->flows_cnt)
 		return TC_H_MIN(skb->priority);
 
-	if (!q->filter_list)
+	filter = rcu_dereference(q->filter_list);
+	if (!filter)
 		return fq_codel_hash(q, skb) + 1;
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	result = tc_classify(skb, q->filter_list, &res);
+	result = tc_classify(skb, filter, &res);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
@@ -376,8 +378,9 @@ static void fq_codel_free(void *addr)
 static void fq_codel_destroy(struct Qdisc *sch)
 {
 	struct fq_codel_sched_data *q = qdisc_priv(sch);
+	struct tcf_proto *filter = rtnl_dereference(q->filter_list);
 
-	tcf_destroy_chain(&q->filter_list);
+	tcf_destroy_chain(&filter);
 	fq_codel_free(q->backlogs);
 	fq_codel_free(q->flows);
 }
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index c407561..0a7650f 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -116,7 +116,7 @@ struct hfsc_class {
 	struct gnet_stats_queue qstats;
 	struct gnet_stats_rate_est64 rate_est;
 	unsigned int	level;		/* class level in hierarchy */
-	struct tcf_proto *filter_list;	/* filter list */
+	struct tcf_proto __rcu *filter_list; /* filter list */
 	unsigned int	filter_cnt;	/* filter count */
 
 	struct hfsc_sched *sched;	/* scheduler data */
@@ -1110,8 +1110,10 @@ static void
 hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
 {
 	struct hfsc_sched *q = qdisc_priv(sch);
+	struct tcf_proto *fl;
 
-	tcf_destroy_chain(&cl->filter_list);
+	fl = rtnl_dereference(cl->filter_list);
+	tcf_destroy_chain(&fl);
 	qdisc_destroy(cl->qdisc);
 	gen_kill_estimator(&cl->bstats, &cl->rate_est);
 	if (cl != &q->root)
@@ -1161,7 +1163,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	head = &q->root;
-	tcf = q->root.filter_list;
+	tcf = rcu_dereference_bh(q->root.filter_list);
 	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
@@ -1185,7 +1187,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 			return cl; /* hit leaf class */
 
 		/* apply inner filter chain */
-		tcf = cl->filter_list;
+		tcf = rcu_dereference_bh(cl->filter_list);
 		head = cl;
 	}
 
@@ -1540,11 +1542,14 @@ hfsc_destroy_qdisc(struct Qdisc *sch)
 	struct hfsc_sched *q = qdisc_priv(sch);
 	struct hlist_node *next;
 	struct hfsc_class *cl;
+	struct tcf_proto *fl;
 	unsigned int i;
 
 	for (i = 0; i < q->clhash.hashsize; i++) {
-		hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode)
-			tcf_destroy_chain(&cl->filter_list);
+		hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) {
+			fl = rtnl_dereference(cl->filter_list);
+			tcf_destroy_chain(&fl);
+		}
 	}
 	for (i = 0; i < q->clhash.hashsize; i++) {
 		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 0db5a6e..86df75c 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -103,7 +103,7 @@ struct htb_class {
 	u32			prio;		/* these two are used only by leaves... */
 	int			quantum;	/* but stored for parent-to-leaf return */
 
-	struct tcf_proto	*filter_list;	/* class attached filters */
+	struct tcf_proto __rcu	*filter_list;	/* class attached filters */
 	int			filter_cnt;
 	int			refcnt;		/* usage count of this class */
 
@@ -153,7 +153,7 @@ struct htb_sched {
 	int			rate2quantum;	/* quant = rate / rate2quantum */
 
 	/* filters for qdisc itself */
-	struct tcf_proto	*filter_list;
+	struct tcf_proto __rcu	*filter_list;
 
 #define HTB_WARN_TOOMANYEVENTS	0x1
 	unsigned int		warned;	/* only one warning */
@@ -223,7 +223,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
 		return cl;
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	tcf = q->filter_list;
+	tcf = rcu_dereference_bh(q->filter_list);
 	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
@@ -246,7 +246,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
 			return cl;	/* we hit leaf; return it */
 
 		/* we have got inner class; apply inner filter chain */
-		tcf = cl->filter_list;
+		tcf = rcu_dereference_bh(cl->filter_list);
 	}
 	/* classification failed; try to use default class */
 	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
@@ -1230,12 +1230,15 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
 
 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
 {
+	struct tcf_proto *fl;
+
 	if (!cl->level) {
 		WARN_ON(!cl->un.leaf.q);
 		qdisc_destroy(cl->un.leaf.q);
 	}
 	gen_kill_estimator(&cl->bstats, &cl->rate_est);
-	tcf_destroy_chain(&cl->filter_list);
+	fl = rtnl_dereference(cl->filter_list);
+	tcf_destroy_chain(&fl);
 	kfree(cl);
 }
 
@@ -1244,6 +1247,7 @@ static void htb_destroy(struct Qdisc *sch)
 	struct htb_sched *q = qdisc_priv(sch);
 	struct hlist_node *next;
 	struct htb_class *cl;
+	struct tcf_proto *fl;
 	unsigned int i;
 
 	cancel_work_sync(&q->work);
@@ -1253,11 +1257,14 @@ static void htb_destroy(struct Qdisc *sch)
 	 * because filter need its target class alive to be able to call
 	 * unbind_filter on it (without Oops).
 	 */
-	tcf_destroy_chain(&q->filter_list);
+	fl = rtnl_dereference(q->filter_list);
+	tcf_destroy_chain(&fl);
 
 	for (i = 0; i < q->clhash.hashsize; i++) {
-		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode)
-			tcf_destroy_chain(&cl->filter_list);
+		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
+			fl = rtnl_dereference(cl->filter_list);
+			tcf_destroy_chain(&fl);
+		}
 	}
 	for (i = 0; i < q->clhash.hashsize; i++) {
 		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index bce1665..f7c80d3 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -17,7 +17,7 @@
 
 
 struct ingress_qdisc_data {
-	struct tcf_proto	*filter_list;
+	struct tcf_proto __rcu	*filter_list;
 };
 
 /* ------------------------- Class/flow operations ------------------------- */
@@ -59,9 +59,10 @@ static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct ingress_qdisc_data *p = qdisc_priv(sch);
 	struct tcf_result res;
+	struct tcf_proto *fl = rcu_dereference_bh(p->filter_list);
 	int result;
 
-	result = tc_classify(skb, p->filter_list, &res);
+	result = tc_classify(skb, fl, &res);
 
 	qdisc_bstats_update(sch, skb);
 	switch (result) {
@@ -89,8 +90,9 @@ static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 static void ingress_destroy(struct Qdisc *sch)
 {
 	struct ingress_qdisc_data *p = qdisc_priv(sch);
+	struct tcf_proto *fl = rtnl_dereference(p->filter_list);
 
-	tcf_destroy_chain(&p->filter_list);
+	tcf_destroy_chain(&fl);
 }
 
 static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb)
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index afb050a..169d637 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -31,7 +31,7 @@ struct multiq_sched_data {
 	u16 bands;
 	u16 max_bands;
 	u16 curband;
-	struct tcf_proto *filter_list;
+	struct tcf_proto __rcu *filter_list;
 	struct Qdisc **queues;
 };
 
@@ -42,10 +42,11 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 	struct multiq_sched_data *q = qdisc_priv(sch);
 	u32 band;
 	struct tcf_result res;
+	struct tcf_proto *fl = rcu_dereference_bh(q->filter_list);
 	int err;
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	err = tc_classify(skb, q->filter_list, &res);
+	err = tc_classify(skb, fl, &res);
 #ifdef CONFIG_NET_CLS_ACT
 	switch (err) {
 	case TC_ACT_STOLEN:
@@ -188,8 +189,9 @@ multiq_destroy(struct Qdisc *sch)
 {
 	int band;
 	struct multiq_sched_data *q = qdisc_priv(sch);
+	struct tcf_proto *fl = rtnl_dereference(q->filter_list);
 
-	tcf_destroy_chain(&q->filter_list);
+	tcf_destroy_chain(&fl);
 	for (band = 0; band < q->bands; band++)
 		qdisc_destroy(q->queues[band]);
 
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 79359b6..43aa35d 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -24,7 +24,7 @@
 
 struct prio_sched_data {
 	int bands;
-	struct tcf_proto *filter_list;
+	struct tcf_proto __rcu *filter_list;
 	u8  prio2band[TC_PRIO_MAX+1];
 	struct Qdisc *queues[TCQ_PRIO_BANDS];
 };
@@ -36,11 +36,13 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 	struct prio_sched_data *q = qdisc_priv(sch);
 	u32 band = skb->priority;
 	struct tcf_result res;
+	struct tcf_proto *fl;
 	int err;
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	if (TC_H_MAJ(skb->priority) != sch->handle) {
-		err = tc_classify(skb, q->filter_list, &res);
+		fl = rcu_dereference_bh(q->filter_list);
+		err = tc_classify(skb, fl, &res);
 #ifdef CONFIG_NET_CLS_ACT
 		switch (err) {
 		case TC_ACT_STOLEN:
@@ -50,7 +52,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
 			return NULL;
 		}
 #endif
-		if (!q->filter_list || err < 0) {
+		if (!fl || err < 0) {
 			if (TC_H_MAJ(band))
 				band = 0;
 			return q->queues[q->prio2band[band & TC_PRIO_MAX]];
@@ -157,8 +159,9 @@ prio_destroy(struct Qdisc *sch)
 {
 	int prio;
 	struct prio_sched_data *q = qdisc_priv(sch);
+	struct tcf_proto *fl = rtnl_dereference(q->filter_list);
 
-	tcf_destroy_chain(&q->filter_list);
+	tcf_destroy_chain(&fl);
 	for (prio = 0; prio < q->bands; prio++)
 		qdisc_destroy(q->queues[prio]);
 }
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 8056fb4..cee5618 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -181,7 +181,7 @@ struct qfq_group {
 };
 
 struct qfq_sched {
-	struct tcf_proto *filter_list;
+	struct tcf_proto __rcu *filter_list;
 	struct Qdisc_class_hash clhash;
 
 	u64			oldV, V;	/* Precise virtual times. */
@@ -704,6 +704,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 	struct qfq_sched *q = qdisc_priv(sch);
 	struct qfq_class *cl;
 	struct tcf_result res;
+	struct tcf_proto *fl;
 	int result;
 
 	if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) {
@@ -714,7 +715,8 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 	}
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	result = tc_classify(skb, q->filter_list, &res);
+	fl = rcu_dereference_bh(q->filter_list);
+	result = tc_classify(skb, fl, &res);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
@@ -1524,9 +1526,10 @@ static void qfq_destroy_qdisc(struct Qdisc *sch)
 	struct qfq_sched *q = qdisc_priv(sch);
 	struct qfq_class *cl;
 	struct hlist_node *next;
+	struct tcf_proto *fl = rtnl_dereference(q->filter_list);
 	unsigned int i;
 
-	tcf_destroy_chain(&q->filter_list);
+	tcf_destroy_chain(&fl);
 
 	for (i = 0; i < q->clhash.hashsize; i++) {
 		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 30ea467..a4f1d6c 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -55,7 +55,7 @@ struct sfb_bins {
 
 struct sfb_sched_data {
 	struct Qdisc	*qdisc;
-	struct tcf_proto *filter_list;
+	struct tcf_proto __rcu *filter_list;
 	unsigned long	rehash_interval;
 	unsigned long	warmup_time;	/* double buffering warmup time in jiffies */
 	u32		max;
@@ -253,13 +253,13 @@ static bool sfb_rate_limit(struct sk_buff *skb, struct sfb_sched_data *q)
 	return false;
 }
 
-static bool sfb_classify(struct sk_buff *skb, struct sfb_sched_data *q,
+static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl,
 			 int *qerr, u32 *salt)
 {
 	struct tcf_result res;
 	int result;
 
-	result = tc_classify(skb, q->filter_list, &res);
+	result = tc_classify(skb, fl, &res);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
@@ -281,6 +281,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 
 	struct sfb_sched_data *q = qdisc_priv(sch);
 	struct Qdisc *child = q->qdisc;
+	struct tcf_proto *fl;
 	int i;
 	u32 p_min = ~0;
 	u32 minqlen = ~0;
@@ -306,9 +307,10 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		}
 	}
 
-	if (q->filter_list) {
+	fl = rcu_dereference_bh(q->filter_list);
+	if (fl) {
 		/* If using external classifiers, get result and record it. */
-		if (!sfb_classify(skb, q, &ret, &salt))
+		if (!sfb_classify(skb, fl, &ret, &salt))
 			goto other_drop;
 		keys.src = salt;
 		keys.dst = 0;
@@ -465,8 +467,9 @@ static void sfb_reset(struct Qdisc *sch)
 static void sfb_destroy(struct Qdisc *sch)
 {
 	struct sfb_sched_data *q = qdisc_priv(sch);
+	struct tcf_proto *fl = rtnl_dereference(q->filter_list);
 
-	tcf_destroy_chain(&q->filter_list);
+	tcf_destroy_chain(&fl);
 	qdisc_destroy(q->qdisc);
 }
 
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 76f01e0..da24df01 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -125,7 +125,7 @@ struct sfq_sched_data {
 	u8		cur_depth;	/* depth of longest slot */
 	u8		flags;
 	unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
-	struct tcf_proto *filter_list;
+	struct tcf_proto __rcu *filter_list;
 	sfq_index	*ht;		/* Hash table ('divisor' slots) */
 	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */
 
@@ -187,6 +187,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 {
 	struct sfq_sched_data *q = qdisc_priv(sch);
 	struct tcf_result res;
+	struct tcf_proto *fl;
 	int result;
 
 	if (TC_H_MAJ(skb->priority) == sch->handle &&
@@ -194,13 +195,14 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 	    TC_H_MIN(skb->priority) <= q->divisor)
 		return TC_H_MIN(skb->priority);
 
-	if (!q->filter_list) {
+	fl = rcu_dereference_bh(q->filter_list);
+	if (!fl) {
 		skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
 		return sfq_hash(q, skb) + 1;
 	}
 
 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
-	result = tc_classify(skb, q->filter_list, &res);
+	result = tc_classify(skb, fl, &res);
 	if (result >= 0) {
 #ifdef CONFIG_NET_CLS_ACT
 		switch (result) {
@@ -727,8 +729,9 @@ static void sfq_free(void *addr)
 static void sfq_destroy(struct Qdisc *sch)
 {
 	struct sfq_sched_data *q = qdisc_priv(sch);
+	struct tcf_proto *fl = rtnl_dereference(q->filter_list);
 
-	tcf_destroy_chain(&q->filter_list);
+	tcf_destroy_chain(&fl);
 	q->perturb_period = 0;
 	del_timer_sync(&q->perturb_timer);
 	sfq_free(q->ht);

^ permalink raw reply related

* [RFC PATCH 01/12] net: qdisc: use rcu prefix and silence sparse warnings
From: John Fastabend @ 2014-01-10  9:37 UTC (permalink / raw)
  To: xiyou.wangcong, jhs, eric.dumazet; +Cc: netdev, davem
In-Reply-To: <20140110092041.7193.5952.stgit@nitbit.x32>

Add __rcu notation to qdisc handling by doing this we can make
smatch output more legible. And anyways some of the cases should
be using rcu_dereference() see qdisc_all_tx_empty(),
qdisc_tx_chainging(), and so on.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 include/linux/netdevice.h |   41 ++++------------------------------
 include/net/sch_generic.h |   29 +++++++++++++++++++-----
 net/core/dev.c            |   54 +++++++++++++++++++++++++++++++++++++++++++--
 net/sched/sch_generic.c   |    4 ++-
 net/sched/sch_mqprio.c    |    4 ++-
 net/sched/sch_teql.c      |    9 ++++----
 6 files changed, 89 insertions(+), 52 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 88afa80..797e473 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -546,7 +546,7 @@ struct netdev_queue {
  * read mostly part
  */
 	struct net_device	*dev;
-	struct Qdisc		*qdisc;
+	struct Qdisc __rcu	*qdisc;
 	struct Qdisc		*qdisc_sleeping;
 #ifdef CONFIG_SYSFS
 	struct kobject		kobj;
@@ -1987,13 +1987,8 @@ static inline void input_queue_tail_incr_save(struct softnet_data *sd,
 
 DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 
-void __netif_schedule(struct Qdisc *q);
-
-static inline void netif_schedule_queue(struct netdev_queue *txq)
-{
-	if (!(txq->state & QUEUE_STATE_ANY_XOFF))
-		__netif_schedule(txq->qdisc);
-}
+extern void __netif_schedule(struct Qdisc *q);
+extern void netif_schedule_queue(struct netdev_queue *txq);
 
 static inline void netif_tx_schedule_all(struct net_device *dev)
 {
@@ -2029,17 +2024,7 @@ static inline void netif_tx_start_all_queues(struct net_device *dev)
 	}
 }
 
-static inline void netif_tx_wake_queue(struct netdev_queue *dev_queue)
-{
-#ifdef CONFIG_NETPOLL_TRAP
-	if (netpoll_trap()) {
-		netif_tx_start_queue(dev_queue);
-		return;
-	}
-#endif
-	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state))
-		__netif_schedule(dev_queue->qdisc);
-}
+extern void netif_tx_wake_queue(struct netdev_queue *dev_queue);
 
 /**
  *	netif_wake_queue - restart transmit
@@ -2288,23 +2273,7 @@ static inline bool netif_subqueue_stopped(const struct net_device *dev,
 	return __netif_subqueue_stopped(dev, skb_get_queue_mapping(skb));
 }
 
-/**
- *	netif_wake_subqueue - allow sending packets on subqueue
- *	@dev: network device
- *	@queue_index: sub queue index
- *
- * Resume individual transmit queue of a device with multiple transmit queues.
- */
-static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
-{
-	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
-#ifdef CONFIG_NETPOLL_TRAP
-	if (netpoll_trap())
-		return;
-#endif
-	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state))
-		__netif_schedule(txq->qdisc);
-}
+extern void netif_wake_subqueue(struct net_device *dev, u16 queue_index);
 
 #ifdef CONFIG_XPS
 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 013d96d..3e10a2a 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -259,7 +259,9 @@ static inline spinlock_t *qdisc_lock(struct Qdisc *qdisc)
 
 static inline struct Qdisc *qdisc_root(const struct Qdisc *qdisc)
 {
-	return qdisc->dev_queue->qdisc;
+	struct Qdisc *q = rcu_dereference(qdisc->dev_queue->qdisc);
+
+	return q;
 }
 
 static inline struct Qdisc *qdisc_root_sleeping(const struct Qdisc *qdisc)
@@ -384,7 +386,7 @@ static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i)
 	struct Qdisc *qdisc;
 
 	for (; i < dev->num_tx_queues; i++) {
-		qdisc = netdev_get_tx_queue(dev, i)->qdisc;
+		qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
 		if (qdisc) {
 			spin_lock_bh(qdisc_lock(qdisc));
 			qdisc_reset(qdisc);
@@ -402,13 +404,18 @@ static inline void qdisc_reset_all_tx(struct net_device *dev)
 static inline bool qdisc_all_tx_empty(const struct net_device *dev)
 {
 	unsigned int i;
+
+	rcu_read_lock();
 	for (i = 0; i < dev->num_tx_queues; i++) {
 		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
-		const struct Qdisc *q = txq->qdisc;
+		const struct Qdisc *q = rcu_dereference(txq->qdisc);
 
-		if (q->q.qlen)
+		if (q->q.qlen) {
+			rcu_read_unlock();
 			return false;
+		}
 	}
+	rcu_read_unlock();
 	return true;
 }
 
@@ -416,11 +423,16 @@ static inline bool qdisc_all_tx_empty(const struct net_device *dev)
 static inline bool qdisc_tx_changing(const struct net_device *dev)
 {
 	unsigned int i;
+
+	rcu_read_lock();
 	for (i = 0; i < dev->num_tx_queues; i++) {
 		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
-		if (txq->qdisc != txq->qdisc_sleeping)
+		if (rcu_dereference(txq->qdisc) != txq->qdisc_sleeping) {
+			rcu_read_unlock();
 			return true;
+		}
 	}
+	rcu_read_unlock();
 	return false;
 }
 
@@ -428,11 +440,16 @@ static inline bool qdisc_tx_changing(const struct net_device *dev)
 static inline bool qdisc_tx_is_noop(const struct net_device *dev)
 {
 	unsigned int i;
+
+	rcu_read_lock();
 	for (i = 0; i < dev->num_tx_queues; i++) {
 		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
-		if (txq->qdisc != &noop_qdisc)
+		if (rcu_dereference(txq->qdisc) != &noop_qdisc) {
+			rcu_read_unlock();
 			return false;
+		}
 	}
+	rcu_read_unlock();
 	return true;
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index cc9ab80..9f761e7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2154,6 +2154,56 @@ static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
 	return (struct dev_kfree_skb_cb *)skb->cb;
 }
 
+void netif_schedule_queue(struct netdev_queue *txq)
+{
+	rcu_read_lock();
+	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
+		struct Qdisc *q = rcu_dereference(txq->qdisc);
+
+		__netif_schedule(q);
+	}
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL(netif_schedule_queue);
+
+/**
+ *	netif_wake_subqueue - allow sending packets on subqueue
+ *	@dev: network device
+ *	@queue_index: sub queue index
+ *
+ * Resume individual transmit queue of a device with multiple transmit queues.
+ */
+void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
+{
+	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
+#ifdef CONFIG_NETPOLL_TRAP
+	if (netpoll_trap())
+		return;
+#endif
+	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
+		struct Qdisc *q = rcu_dereference(txq->qdisc);
+
+		__netif_schedule(q);
+	}
+}
+EXPORT_SYMBOL(netif_wake_subqueue);
+
+void netif_tx_wake_queue(struct netdev_queue *dev_queue)
+{
+#ifdef CONFIG_NETPOLL_TRAP
+	if (netpoll_trap()) {
+		netif_tx_start_queue(dev_queue);
+		return;
+	}
+#endif
+	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
+		struct Qdisc *q = rcu_dereference(dev_queue->qdisc);
+
+		__netif_schedule(q);
+	}
+}
+EXPORT_SYMBOL(netif_tx_wake_queue);
+
 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
 {
 	unsigned long flags;
@@ -3375,7 +3425,7 @@ static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
 
-	q = rxq->qdisc;
+	q = rcu_dereference(rxq->qdisc);
 	if (q != &noop_qdisc) {
 		spin_lock(qdisc_lock(q));
 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
@@ -3392,7 +3442,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
 {
 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
 
-	if (!rxq || rxq->qdisc == &noop_qdisc)
+	if (!rxq || rcu_dereference_bh(rxq->qdisc) == &noop_qdisc)
 		goto out;
 
 	if (*pt_prev) {
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 32bb942..71d7cd9 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -776,7 +776,7 @@ static void dev_deactivate_queue(struct net_device *dev,
 	struct Qdisc *qdisc_default = _qdisc_default;
 	struct Qdisc *qdisc;
 
-	qdisc = dev_queue->qdisc;
+	qdisc = rtnl_dereference(dev_queue->qdisc);
 	if (qdisc) {
 		spin_lock_bh(qdisc_lock(qdisc));
 
@@ -869,7 +869,7 @@ static void dev_init_scheduler_queue(struct net_device *dev,
 {
 	struct Qdisc *qdisc = _qdisc;
 
-	dev_queue->qdisc = qdisc;
+	rcu_assign_pointer(dev_queue->qdisc, qdisc);
 	dev_queue->qdisc_sleeping = qdisc;
 }
 
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 6749e2f..e1b112f 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -231,7 +231,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
 	memset(&sch->qstats, 0, sizeof(sch->qstats));
 
 	for (i = 0; i < dev->num_tx_queues; i++) {
-		qdisc = netdev_get_tx_queue(dev, i)->qdisc;
+		qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
 		spin_lock_bh(qdisc_lock(qdisc));
 		sch->q.qlen		+= qdisc->q.qlen;
 		sch->bstats.bytes	+= qdisc->bstats.bytes;
@@ -340,7 +340,7 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 		spin_unlock_bh(d->lock);
 
 		for (i = tc.offset; i < tc.offset + tc.count; i++) {
-			qdisc = netdev_get_tx_queue(dev, i)->qdisc;
+			qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
 			spin_lock_bh(qdisc_lock(qdisc));
 			bstats.bytes      += qdisc->bstats.bytes;
 			bstats.packets    += qdisc->bstats.packets;
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 4741671..b35ba70 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -100,7 +100,8 @@ teql_dequeue(struct Qdisc *sch)
 	skb = __skb_dequeue(&dat->q);
 	dat_queue = netdev_get_tx_queue(dat->m->dev, 0);
 	if (skb == NULL) {
-		struct net_device *m = qdisc_dev(dat_queue->qdisc);
+		struct Qdisc *q = rcu_dereference_bh(dat_queue->qdisc);
+		struct net_device *m = qdisc_dev(q);
 		if (m) {
 			dat->m->slaves = sch;
 			netif_wake_queue(m);
@@ -157,9 +158,9 @@ teql_destroy(struct Qdisc *sch)
 						txq = netdev_get_tx_queue(master->dev, 0);
 						master->slaves = NULL;
 
-						root_lock = qdisc_root_sleeping_lock(txq->qdisc);
+						root_lock = qdisc_root_sleeping_lock(rtnl_dereference(txq->qdisc));
 						spin_lock_bh(root_lock);
-						qdisc_reset(txq->qdisc);
+						qdisc_reset(rtnl_dereference(txq->qdisc));
 						spin_unlock_bh(root_lock);
 					}
 				}
@@ -266,7 +267,7 @@ static inline int teql_resolve(struct sk_buff *skb,
 	struct dst_entry *dst = skb_dst(skb);
 	int res;
 
-	if (txq->qdisc == &noop_qdisc)
+	if (rcu_dereference_bh(txq->qdisc) == &noop_qdisc)
 		return -ENODEV;
 
 	if (!dev->header_ops || !dst)

^ permalink raw reply related

* [RFC PATCH 00/12] RCU'ify the net:sched classifier chains
From: John Fastabend @ 2014-01-10  9:36 UTC (permalink / raw)
  To: xiyou.wangcong, jhs, eric.dumazet; +Cc: netdev, davem

There appears to be some interest in a few topics around the qdisc
layer which could benefit from having the ability to run the
filters and actions without holding the qdisc lock.

Recently Cong Wang proposed a patch series to drop the ingress
qdisc and asked for comments. This series I think gets closer to
that goal.

The ingress qdisc is a simple qdisc which doesn't maintain any
actual list of skb's and is primarily a hook to attach filters.
Further the only qdisc that can be attached to the ingress qdisc
is sch_ingress. The qdisc lock is currently serializing two
operations (1) tc_classify which is addressed here and (2)
statistics accounting. The second point is not solved here but
it could be a matter of making the bstats and qstats per cpu
stats.

This is an RFC for now and needs some more work. Some items
I know about are (a) an audit of the ematch code paths, (b) resolving
the checpatch errors mostly due to moving code around that
generates those errors, (c) run smatch, (d) audit u32 code
for correctness, (e) do a lot more testing so far only very
basic testing has been done. I tried to put some reasonable
comments in the commit logs but yes they need more work.

Cong, if its not too much to ask can we use this as a base
set of patches for this work? I think its reasonably close to
correct as is.

Thanks! John.

---

John Fastabend (12):
      net: qdisc: use rcu prefix and silence sparse warnings
      net: rcu-ify tcf_proto
      net: sched: cls_basic use RCU
      net: sched: cls_cgroup use RCU
      net: sched: cls_flow use RCU
      net: sched: fw use RCU
      net: sched: RCU cls_route
      net: sched: RCU cls_tcindex
      net: sched: make cls_u32 lockless
      net: sched: rcu'ify cls_rsvp
      net: make cls_bpf rcu safe
      net: sched: make tc_action safe to walk under RCU


 include/linux/netdevice.h |   41 +-----
 include/linux/rtnetlink.h |   10 +
 include/net/act_api.h     |    1 
 include/net/pkt_cls.h     |   12 +-
 include/net/sch_generic.h |   34 ++++-
 net/core/dev.c            |   54 +++++++
 net/sched/act_api.c       |   18 +-
 net/sched/cls_api.c       |   44 +++---
 net/sched/cls_basic.c     |   82 ++++++-----
 net/sched/cls_bpf.c       |   79 ++++++-----
 net/sched/cls_cgroup.c    |   65 ++++++---
 net/sched/cls_flow.c      |  145 ++++++++++++--------
 net/sched/cls_fw.c        |  112 +++++++++++----
 net/sched/cls_route.c     |  218 +++++++++++++++++-------------
 net/sched/cls_rsvp.h      |  152 ++++++++++++---------
 net/sched/cls_tcindex.c   |  327 ++++++++++++++++++++++++++-------------------
 net/sched/cls_u32.c       |  258 +++++++++++++++++++++++-------------
 net/sched/sch_api.c       |    6 -
 net/sched/sch_atm.c       |   30 +++-
 net/sched/sch_cbq.c       |   21 ++-
 net/sched/sch_choke.c     |   18 ++
 net/sched/sch_drr.c       |   10 +
 net/sched/sch_dsmark.c    |    8 +
 net/sched/sch_fq_codel.c  |   11 +-
 net/sched/sch_generic.c   |    4 -
 net/sched/sch_hfsc.c      |   17 ++
 net/sched/sch_htb.c       |   23 ++-
 net/sched/sch_ingress.c   |    8 +
 net/sched/sch_mqprio.c    |    4 -
 net/sched/sch_multiq.c    |    8 +
 net/sched/sch_prio.c      |   11 +-
 net/sched/sch_qfq.c       |    9 +
 net/sched/sch_sfb.c       |   15 +-
 net/sched/sch_sfq.c       |   11 +-
 net/sched/sch_teql.c      |    9 +
 35 files changed, 1139 insertions(+), 736 deletions(-)

-- 
Signature

^ permalink raw reply

* Re: [PATCHv4 net-next] xfrm: Namespacify xfrm_policy_sk_bundles
From: Fan Du @ 2014-01-10  9:23 UTC (permalink / raw)
  To: Steffen Klassert; +Cc: Timo Teras, Eric Dumazet, davem, netdev
In-Reply-To: <20140109123803.GZ31491@secunet.com>



On 2014年01月09日 20:38, Steffen Klassert wrote:
> On Tue, Jan 07, 2014 at 10:43:38AM +0800, Fan Du wrote:
>> >
>> >  Yes, I tested sk policy with udp, when transmit, dst will be cached into sk
>> >  by sk_dst_set. Let's leave current implementation as it is.
>> >
>> >  Please kindly review if there is any concern about v4.
> Why do you want to keep the current implementation? We don't
> use the cached bundles. I'd remove this caching with the patch
> below during the next development cycle if nobody has a good
> reason why we should keep it.
>
>
> Subject: [PATCH RFC] xfrm: Remove caching of xfrm_policy_sk_bundles
>
> We currently cache socket policy bundles at xfrm_policy_sk_bundles.
> These cached bundles are never used. Instead we create and cache
> a new one whenever xfrm_lookup() is called on a socket policy.
>
> Most protocols cache the used routes to the socket, so let's
> remove the unused caching of socket policy bundles in xfrm.

Honestly speaking, I cannot think of any other obvious reason
to retain sk bundle cache for what I know about XFRM so far, though
this mysterious ancient code still puzzles me.

Acked-by: Fan Du <fan.du@windriver.com>

> Signed-off-by: Steffen Klassert<steffen.klassert@secunet.com>

-- 
浮沉随浪只记今朝笑

--fan

^ permalink raw reply

* [PATCH v3 net-next 1/3] bonding: fix bond_3ad_set_carrier() RCU usage
From: Veaceslav Falico @ 2014-01-10  9:18 UTC (permalink / raw)
  To: netdev; +Cc: Veaceslav Falico, dingtianhong, Jay Vosburgh, Andy Gospodarek
In-Reply-To: <1389345523-5497-1-git-send-email-vfalico@redhat.com>

Currently, its usage is just plainly wrong. It first gets a slave under
RCU, and, after releasing the RCU lock, continues to use it - whilst it can
be freed.

Fix this by ensuring that bond_3ad_set_carrier() holds RCU till it uses its
slave (or its agg).

Fixes: be79bd048ab ("bonding: add RCU for bond_3ad_state_machine_handler()")
CC: dingtianhong@huawei.com
CC: Jay Vosburgh <fubar@us.ibm.com>
CC: Andy Gospodarek <andy@greyhouse.net>
Signed-off-by: Veaceslav Falico <vfalico@redhat.com>
---

Notes:
    v2 -> v3:
    Just wrap RCU for the whole usage of our slave.
    
    v1 -> v2:
    Don't use _rcu primitives as we can be called under RTNL too.
    
    v1 -> v2:
    Don't use _rcu primitives as we can be called under RTNL too.

 drivers/net/bonding/bond_3ad.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index 29db1ca..9ff55eb 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -2327,32 +2327,33 @@ int bond_3ad_set_carrier(struct bonding *bond)
 {
 	struct aggregator *active;
 	struct slave *first_slave;
+	int ret = 1;
 
 	rcu_read_lock();
 	first_slave = bond_first_slave_rcu(bond);
-	rcu_read_unlock();
-	if (!first_slave)
-		return 0;
+	if (!first_slave) {
+		ret = 0;
+		goto out;
+	}
 	active = __get_active_agg(&(SLAVE_AD_INFO(first_slave).aggregator));
 	if (active) {
 		/* are enough slaves available to consider link up? */
 		if (active->num_of_ports < bond->params.min_links) {
 			if (netif_carrier_ok(bond->dev)) {
 				netif_carrier_off(bond->dev);
-				return 1;
+				goto out;
 			}
 		} else if (!netif_carrier_ok(bond->dev)) {
 			netif_carrier_on(bond->dev);
-			return 1;
+			goto out;
 		}
-		return 0;
-	}
-
-	if (netif_carrier_ok(bond->dev)) {
+	} else if (netif_carrier_ok(bond->dev)) {
 		netif_carrier_off(bond->dev);
-		return 1;
+		goto out;
 	}
-	return 0;
+out:
+	rcu_read_unlock();
+	return ret;
 }
 
 /**
-- 
1.8.4

^ permalink raw reply related

* [PATCH v3 net-next 3/3] bonding: fix __get_active_agg() RCU logic
From: Veaceslav Falico @ 2014-01-10  9:18 UTC (permalink / raw)
  To: netdev; +Cc: Veaceslav Falico, dingtianhong, Jay Vosburgh, Andy Gospodarek
In-Reply-To: <1389345523-5497-1-git-send-email-vfalico@redhat.com>

Currently, the implementation is meaningless - once again, we take the
slave structure and use it after we've exited RCU critical section.

Fix this by removing the rcu_read_lock() from __get_active_agg(), and
ensuring that all its callers are holding RCU.

Fixes: be79bd048 ("bonding: add RCU for bond_3ad_state_machine_handler()")
CC: dingtianhong@huawei.com
CC: Jay Vosburgh <fubar@us.ibm.com>
CC: Andy Gospodarek <andy@greyhouse.net>
Signed-off-by: Veaceslav Falico <vfalico@redhat.com>
---

Notes:
    v2 -> v3:
    Use the RCU primitives.
    
    v1 -> v2:
    Don't use RCU primitives as we can hold RTNL.

 drivers/net/bonding/bond_3ad.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index 27dac0e..112afa8 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -674,6 +674,8 @@ static u32 __get_agg_bandwidth(struct aggregator *aggregator)
 /**
  * __get_active_agg - get the current active aggregator
  * @aggregator: the aggregator we're looking at
+ *
+ * Caller must hold RCU lock.
  */
 static struct aggregator *__get_active_agg(struct aggregator *aggregator)
 {
@@ -681,13 +683,9 @@ static struct aggregator *__get_active_agg(struct aggregator *aggregator)
 	struct list_head *iter;
 	struct slave *slave;
 
-	rcu_read_lock();
 	bond_for_each_slave_rcu(bond, slave, iter)
-		if (SLAVE_AD_INFO(slave).aggregator.is_active) {
-			rcu_read_unlock();
+		if (SLAVE_AD_INFO(slave).aggregator.is_active)
 			return &(SLAVE_AD_INFO(slave).aggregator);
-		}
-	rcu_read_unlock();
 
 	return NULL;
 }
@@ -1495,11 +1493,11 @@ static void ad_agg_selection_logic(struct aggregator *agg)
 	struct slave *slave;
 	struct port *port;
 
+	rcu_read_lock();
 	origin = agg;
 	active = __get_active_agg(agg);
 	best = (active && agg_device_up(active)) ? active : NULL;
 
-	rcu_read_lock();
 	bond_for_each_slave_rcu(bond, slave, iter) {
 		agg = &(SLAVE_AD_INFO(slave).aggregator);
 
-- 
1.8.4

^ permalink raw reply related

* [PATCH v3 net-next 2/3] bonding: fix __get_first_agg RCU usage
From: Veaceslav Falico @ 2014-01-10  9:18 UTC (permalink / raw)
  To: netdev; +Cc: Veaceslav Falico, dingtianhong, Jay Vosburgh, Andy Gospodarek
In-Reply-To: <1389345523-5497-1-git-send-email-vfalico@redhat.com>

Currently, the RCU read lock usage is just wrong - it gets the slave struct
under RCU and continues to use it when RCU lock is released.

However, it's still safe to do this cause we didn't need the
rcu_read_lock() initially - all of the __get_first_agg() callers are either
holding RCU read lock or the RTNL lock, so that we can't sync while in it.

So, remove the useless rcu locking and add a comment.

Fixes: be79bd048 ("bonding: add RCU for bond_3ad_state_machine_handler()")
CC: dingtianhong@huawei.com
CC: Jay Vosburgh <fubar@us.ibm.com>
CC: Andy Gospodarek <andy@greyhouse.net>
Signed-off-by: Veaceslav Falico <vfalico@redhat.com>
---

Notes:
    v2 -> v3:
    Use the rcu primitives.
    
    v1 -> v2:
    Don't use RCU primitives as we can hold RTNL.

 drivers/net/bonding/bond_3ad.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index 9ff55eb..27dac0e 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -143,6 +143,7 @@ static inline struct bonding *__get_bond_by_port(struct port *port)
  *
  * Return the aggregator of the first slave in @bond, or %NULL if it can't be
  * found.
+ * The caller must hold RCU lock.
  */
 static inline struct aggregator *__get_first_agg(struct port *port)
 {
@@ -153,9 +154,7 @@ static inline struct aggregator *__get_first_agg(struct port *port)
 	if (bond == NULL)
 		return NULL;
 
-	rcu_read_lock();
 	first_slave = bond_first_slave_rcu(bond);
-	rcu_read_unlock();
 
 	return first_slave ? &(SLAVE_AD_INFO(first_slave).aggregator) : NULL;
 }
-- 
1.8.4

^ permalink raw reply related

* [PATCH v3 net-next 0/3] bonding: fix bond_3ad RCU usage
From: Veaceslav Falico @ 2014-01-10  9:18 UTC (permalink / raw)
  To: netdev; +Cc: dingtianhong, Jay Vosburgh, Andy Gospodarek, Veaceslav Falico

While digging through bond_3ad.c I've found that the RCU usage there is
just wrong - it's used as a kind of mutex/spinlock instead of RCU.

v2->v3: make bond_3ad_set_carrier() use RCU read lock for the whole
function, so that all other functions will be protected by RCU as well.
This way we can use _rcu variants everywhere.

v1->v2: use generic primitives instead of _rcu ones cause we can hold RTNL
lock without RCU one, which is still safe.

This patchset is on top of bond_3ad.c cleanup:
http://www.spinics.net/lists/netdev/msg265447.html

CC: dingtianhong@huawei.com
CC: Jay Vosburgh <fubar@us.ibm.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: netdev@vger.kernel.org
Signed-off-by: Veaceslav Falico <vfalico@redhat.com>

^ permalink raw reply

* [PATCH v3 net-next] packet: doc: describe PACKET_MMAP with one packet socket for rx and tx
From: Daniel Borkmann @ 2014-01-10  9:22 UTC (permalink / raw)
  To: davem; +Cc: netdev, Norbert van Bolhuis

From: Norbert van Bolhuis <nvbolhuis@aimvalley.nl>

Document how to use one AF_PACKET mmap socket for RX and TX.

Signed-off-by: Norbert van Bolhuis <nvbolhuis@aimvalley.nl>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
---
 v1->v2:
  - add setsockopt() calls
 v2->v3:
  - did the rebase to net-next for Norbert

 Documentation/networking/packet_mmap.txt | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt
index 723bf3d..91ffe1d 100644
--- a/Documentation/networking/packet_mmap.txt
+++ b/Documentation/networking/packet_mmap.txt
@@ -98,6 +98,11 @@ by the kernel.
 The destruction of the socket and all associated resources
 is done by a simple call to close(fd).
 
+Similarly as without PACKET_MMAP, it is possible to use one socket
+for capture and transmission. This can be done by mapping the
+allocated RX and TX buffer ring with a single mmap() call.
+See "Mapping and use of the circular buffer (ring)".
+
 Next I will describe PACKET_MMAP settings and its constraints,
 also the mapping of the circular buffer in the user process and 
 the use of this buffer.
@@ -414,6 +419,19 @@ tp_block_size/tp_frame_size frames there will be a gap between
 the frames. This is because a frame cannot be spawn across two
 blocks. 
 
+To use one socket for capture and transmission, the mapping of both the
+RX and TX buffer ring has to be done with one call to mmap:
+
+    ...
+    setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &foo, sizeof(foo));
+    setsockopt(fd, SOL_PACKET, PACKET_TX_RING, &bar, sizeof(bar));
+    ...
+    rx_ring = mmap(0, size * 2, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+    tx_ring = rx_ring + size;
+
+RX must be the first as the kernel maps the TX ring memory right
+after the RX one.
+
 At the beginning of each frame there is an status field (see 
 struct tpacket_hdr). If this field is 0 means that the frame is ready
 to be used for the kernel, If not, there is a frame the user can read 
-- 
1.7.11.7

^ permalink raw reply related

* Re: [net-next 05/16] i40e: fix long lines
From: Joe Perches @ 2014-01-10  9:07 UTC (permalink / raw)
  To: Jeff Kirsher
  Cc: davem, Mitch Williams, netdev, gospo, sassmann, Jesse Brandeburg
In-Reply-To: <1389344336-1558-6-git-send-email-jeffrey.t.kirsher@intel.com>

On Fri, 2014-01-10 at 00:58 -0800, Jeff Kirsher wrote:
> From: Mitch Williams <mitch.a.williams@intel.com>
> 
> Avoid over-length lines in order to appease checkpatch.

I think you can and should ignore long line warnings
where appropriate.

and...

> diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c
[]
> @@ -681,7 +681,8 @@ aq_add_vsi_exit:
>   * @cmd_details: pointer to command details structure or NULL
>   **/
>  i40e_status i40e_aq_set_vsi_unicast_promiscuous(struct i40e_hw *hw,
> -				u16 seid, bool set, struct i40e_asq_cmd_details *cmd_details)
> +				u16 seid, bool set,
> +				struct i40e_asq_cmd_details *cmd_details)

Another way this could be changed is to put the
return value on a separate line like:

i40e_status
i40e_aq_set_vsi_unicast_promiscuous(struct i40e_hw *hw, u16 seid, bool set,
				    struct i40e_asq_cmd_details *cmd_details)

> diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
[]
> @@ -93,9 +93,9 @@ i40e_status i40e_aq_set_vsi_broadcast(struct i40e_hw *hw,
>  				u16 vsi_id, bool set_filter,
>  				struct i40e_asq_cmd_details *cmd_details);
>  i40e_status i40e_aq_set_vsi_unicast_promiscuous(struct i40e_hw *hw,
> -				u16 vsi_id, bool set, struct i40e_asq_cmd_details *cmd_details);
> +		u16 vsi_id, bool set, struct i40e_asq_cmd_details *cmd_details);

i40e_status
i40e_aq_set_vsi_unicast_promiscuous(struct i40e_hw *hw, u16 vsi_id, bool set,
				    struct i40e_asq_cmd_details *cmd_details);

etc...

but once you use extremely long 35+ characters
function names, 80 columns gets a bit silly too.

^ permalink raw reply

* [net-next 14/16] i40e: enable PTP
From: Jeff Kirsher @ 2014-01-10  8:58 UTC (permalink / raw)
  To: davem
  Cc: Jacob Keller, netdev, gospo, sassmann, Richard Cochran,
	Jesse Brandeburg, Jeff Kirsher
In-Reply-To: <1389344336-1558-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Jacob Keller <jacob.e.keller@intel.com>

New feature: Enable PTP support in the i40e driver.

Change-ID: I6a8e799f582705191f9583afb1b9231a8db96cc8
Cc: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Kavindya Deegala <kavindya.s.deegala@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/Makefile       |   2 +
 drivers/net/ethernet/intel/i40e/i40e.h         |  24 +
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c |  33 +-
 drivers/net/ethernet/intel/i40e/i40e_main.c    |  45 +-
 drivers/net/ethernet/intel/i40e/i40e_ptp.c     | 640 +++++++++++++++++++++++++
 drivers/net/ethernet/intel/i40e/i40e_txrx.c    |  53 ++
 drivers/net/ethernet/intel/i40e/i40e_txrx.h    |   3 +
 7 files changed, 798 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/intel/i40e/i40e_ptp.c

diff --git a/drivers/net/ethernet/intel/i40e/Makefile b/drivers/net/ethernet/intel/i40e/Makefile
index 6ec1a79..2f7ccb4 100644
--- a/drivers/net/ethernet/intel/i40e/Makefile
+++ b/drivers/net/ethernet/intel/i40e/Makefile
@@ -41,3 +41,5 @@ i40e-objs := i40e_main.o \
 	i40e_diag.o	\
 	i40e_txrx.o	\
 	i40e_virtchnl_pf.o
+
+i40e-objs += i40e_ptp.o
diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index 7dab660..d73e6c9 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -55,6 +55,9 @@
 #include "i40e_virtchnl.h"
 #include "i40e_virtchnl_pf.h"
 #include "i40e_txrx.h"
+#include <linux/clocksource.h>
+#include <linux/net_tstamp.h>
+#include <linux/ptp_clock_kernel.h>
 
 /* Useful i40e defaults */
 #define I40E_BASE_PF_SEID     16
@@ -242,6 +245,7 @@ struct i40e_pf {
 #define I40E_FLAG_DCB_ENABLED                  (u64)(1 << 20)
 #define I40E_FLAG_FDIR_ENABLED                 (u64)(1 << 21)
 #define I40E_FLAG_FDIR_ATR_ENABLED             (u64)(1 << 22)
+#define I40E_FLAG_PTP                          (u64)(1 << 25)
 #define I40E_FLAG_MFP_ENABLED                  (u64)(1 << 26)
 #ifdef CONFIG_I40E_VXLAN
 #define I40E_FLAG_VXLAN_FILTER_SYNC            (u64)(1 << 27)
@@ -302,6 +306,19 @@ struct i40e_pf {
 	u32	fcoe_hmc_filt_num;
 	u32	fcoe_hmc_cntx_num;
 	struct i40e_filter_control_settings filter_settings;
+
+	struct ptp_clock *ptp_clock;
+	struct ptp_clock_info ptp_caps;
+	struct sk_buff *ptp_tx_skb;
+	struct work_struct ptp_tx_work;
+	unsigned long ptp_tx_start;
+	unsigned long last_rx_ptp_check;
+	spinlock_t tmreg_lock; /* Used to protect the device time registers. */
+	u64 ptp_base_adj;
+	u32 tx_hwtstamp_timeouts;
+	u32 rx_hwtstamp_cleared;
+	bool ptp_tx;
+	bool ptp_rx;
 };
 
 struct i40e_mac_filter {
@@ -566,4 +583,11 @@ struct i40e_mac_filter *i40e_find_mac(struct i40e_vsi *vsi, u8 *macaddr,
 				      bool is_vf, bool is_netdev);
 void i40e_vlan_stripping_enable(struct i40e_vsi *vsi);
 
+void i40e_ptp_rx_hang(struct i40e_vsi *vsi);
+void i40e_ptp_tx_hwtstamp(struct i40e_pf *pf);
+void i40e_ptp_rx_hwtstamp(struct i40e_pf *pf, struct sk_buff *skb, u8 index);
+void i40e_ptp_set_increment(struct i40e_pf *pf);
+int i40e_ptp_hwtstamp_ioctl(struct i40e_pf *pf, struct ifreq *ifr, int cmd);
+void i40e_ptp_init(struct i40e_pf *pf);
+void i40e_ptp_stop(struct i40e_pf *pf);
 #endif /* _I40E_H_ */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 342a6e1..7b87d51 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -108,6 +108,8 @@ static struct i40e_stats i40e_gstrings_stats[] = {
 	I40E_PF_STAT("rx_oversize", stats.rx_oversize),
 	I40E_PF_STAT("rx_jabber", stats.rx_jabber),
 	I40E_PF_STAT("VF_admin_queue_requests", vf_aq_requests),
+	I40E_PF_STAT("tx_hwtstamp_timeouts", tx_hwtstamp_timeouts),
+	I40E_PF_STAT("rx_hwtstamp_cleared", rx_hwtstamp_cleared),
 };
 
 #define I40E_QUEUE_STATS_LEN(n) \
@@ -748,7 +750,36 @@ static void i40e_get_strings(struct net_device *netdev, u32 stringset,
 static int i40e_get_ts_info(struct net_device *dev,
 			    struct ethtool_ts_info *info)
 {
-	return ethtool_op_get_ts_info(dev, info);
+	struct i40e_pf *pf = i40e_netdev_to_pf(dev);
+
+	info->so_timestamping = SOF_TIMESTAMPING_TX_SOFTWARE |
+				SOF_TIMESTAMPING_RX_SOFTWARE |
+				SOF_TIMESTAMPING_SOFTWARE |
+				SOF_TIMESTAMPING_TX_HARDWARE |
+				SOF_TIMESTAMPING_RX_HARDWARE |
+				SOF_TIMESTAMPING_RAW_HARDWARE;
+
+	if (pf->ptp_clock)
+		info->phc_index = ptp_clock_index(pf->ptp_clock);
+	else
+		info->phc_index = -1;
+
+	info->tx_types = (1 << HWTSTAMP_TX_OFF) | (1 << HWTSTAMP_TX_ON);
+
+	info->rx_filters = (1 << HWTSTAMP_FILTER_NONE) |
+			   (1 << HWTSTAMP_FILTER_PTP_V1_L4_SYNC) |
+			   (1 << HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ) |
+			   (1 << HWTSTAMP_FILTER_PTP_V2_EVENT) |
+			   (1 << HWTSTAMP_FILTER_PTP_V2_L2_EVENT) |
+			   (1 << HWTSTAMP_FILTER_PTP_V2_L4_EVENT) |
+			   (1 << HWTSTAMP_FILTER_PTP_V2_SYNC) |
+			   (1 << HWTSTAMP_FILTER_PTP_V2_L2_SYNC) |
+			   (1 << HWTSTAMP_FILTER_PTP_V2_L4_SYNC) |
+			   (1 << HWTSTAMP_FILTER_PTP_V2_DELAY_REQ) |
+			   (1 << HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ) |
+			   (1 << HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ);
+
+	return 0;
 }
 
 static int i40e_link_test(struct net_device *netdev, u64 *data)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 182ed18..08fe2341 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -1698,6 +1698,25 @@ static int i40e_change_mtu(struct net_device *netdev, int new_mtu)
 }
 
 /**
+ * i40e_ioctl - Access the hwtstamp interface
+ * @netdev: network interface device structure
+ * @ifr: interface request data
+ * @cmd: ioctl command
+ **/
+int i40e_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
+{
+	struct i40e_netdev_priv *np = netdev_priv(netdev);
+	struct i40e_pf *pf = np->vsi->back;
+
+	switch (cmd) {
+	case SIOCSHWTSTAMP:
+		return i40e_ptp_hwtstamp_ioctl(pf, ifr, cmd);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+/**
  * i40e_vlan_stripping_enable - Turn on vlan stripping for the VSI
  * @vsi: the vsi being adjusted
  **/
@@ -2150,7 +2169,8 @@ static int i40e_configure_tx_ring(struct i40e_ring *ring)
 	tx_ctx.base = (ring->dma / 128);
 	tx_ctx.qlen = ring->count;
 	tx_ctx.fd_ena = !!(vsi->back->flags & (I40E_FLAG_FDIR_ENABLED |
-			I40E_FLAG_FDIR_ATR_ENABLED));
+					       I40E_FLAG_FDIR_ATR_ENABLED));
+	tx_ctx.timesync_ena = !!(vsi->back->flags & I40E_FLAG_PTP);
 
 	/* As part of VSI creation/update, FW allocates certain
 	 * Tx arbitration queue sets for each TC enabled for
@@ -2488,6 +2508,7 @@ static void i40e_enable_misc_int_causes(struct i40e_hw *hw)
 	      I40E_PFINT_ICR0_ENA_GRST_MASK          |
 	      I40E_PFINT_ICR0_ENA_PCI_EXCEPTION_MASK |
 	      I40E_PFINT_ICR0_ENA_GPIO_MASK          |
+	      I40E_PFINT_ICR0_ENA_TIMESYNC_MASK      |
 	      I40E_PFINT_ICR0_ENA_STORM_DETECT_MASK  |
 	      I40E_PFINT_ICR0_ENA_HMC_ERR_MASK       |
 	      I40E_PFINT_ICR0_ENA_VFLR_MASK          |
@@ -2831,6 +2852,18 @@ static irqreturn_t i40e_intr(int irq, void *data)
 		dev_info(&pf->pdev->dev, "HMC error interrupt\n");
 	}
 
+	if (icr0 & I40E_PFINT_ICR0_TIMESYNC_MASK) {
+		u32 prttsyn_stat = rd32(hw, I40E_PRTTSYN_STAT_0);
+
+		if (prttsyn_stat & I40E_PRTTSYN_STAT_0_TXTIME_MASK) {
+			ena_mask &= ~I40E_PFINT_ICR0_ENA_TIMESYNC_MASK;
+			i40e_ptp_tx_hwtstamp(pf);
+			prttsyn_stat &= ~I40E_PRTTSYN_STAT_0_TXTIME_MASK;
+		}
+
+		wr32(hw, I40E_PRTTSYN_STAT_0, prttsyn_stat);
+	}
+
 	/* If a critical error is pending we have no choice but to reset the
 	 * device.
 	 * Report and mask out any remaining unexpected interrupts.
@@ -4304,6 +4337,9 @@ static void i40e_link_event(struct i40e_pf *pf)
 
 	if (pf->vf)
 		i40e_vc_notify_link_state(pf);
+
+	if (pf->flags & I40E_FLAG_PTP)
+		i40e_ptp_set_increment(pf);
 }
 
 /**
@@ -4385,6 +4421,8 @@ static void i40e_watchdog_subtask(struct i40e_pf *pf)
 	for (i = 0; i < I40E_MAX_VEB; i++)
 		if (pf->veb[i])
 			i40e_update_veb_stats(pf->veb[i]);
+
+	i40e_ptp_rx_hang(pf->vsi[pf->lan_vsi]);
 }
 
 /**
@@ -6033,6 +6071,7 @@ static const struct net_device_ops i40e_netdev_ops = {
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= i40e_set_mac,
 	.ndo_change_mtu		= i40e_change_mtu,
+	.ndo_do_ioctl		= i40e_ioctl,
 	.ndo_tx_timeout		= i40e_tx_timeout,
 	.ndo_vlan_rx_add_vid	= i40e_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid	= i40e_vlan_rx_kill_vid,
@@ -7299,6 +7338,8 @@ no_autoneg:
 					 ~I40E_PRTDCB_MFLCN_RFCE_MASK);
 
 fc_complete:
+	i40e_ptp_init(pf);
+
 	return ret;
 }
 
@@ -7803,6 +7844,8 @@ static void i40e_remove(struct pci_dev *pdev)
 
 	i40e_dbg_pf_exit(pf);
 
+	i40e_ptp_stop(pf);
+
 	if (pf->flags & I40E_FLAG_SRIOV_ENABLED) {
 		i40e_free_vfs(pf);
 		pf->flags &= ~I40E_FLAG_SRIOV_ENABLED;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ptp.c b/drivers/net/ethernet/intel/i40e/i40e_ptp.c
new file mode 100644
index 0000000..095091e
--- /dev/null
+++ b/drivers/net/ethernet/intel/i40e/i40e_ptp.c
@@ -0,0 +1,640 @@
+/*******************************************************************************
+ *
+ * Intel Ethernet Controller XL710 Family Linux Driver
+ * Copyright(c) 2013 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Contact Information:
+ * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
+ * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
+ *
+ ******************************************************************************/
+
+#include "i40e.h"
+#include <linux/export.h>
+#include <linux/ptp_classify.h>
+
+/* The XL710 timesync is very much like Intel's 82599 design when it comes to
+ * the fundamental clock design. However, the clock operations are much simpler
+ * in the XL710 because the device supports a full 64 bits of nanoseconds.
+ * Because the field is so wide, we can forgo the cycle counter and just
+ * operate with the nanosecond field directly without fear of overflow.
+ *
+ * Much like the 82599, the update period is dependent upon the link speed:
+ * At 40Gb link or no link, the period is 1.6ns.
+ * At 10Gb link, the period is multiplied by 2. (3.2ns)
+ * At 1Gb link, the period is multiplied by 20. (32ns)
+ * 1588 functionality is not supported at 100Mbps.
+ */
+#define I40E_PTP_40GB_INCVAL 0x0199999999ULL
+#define I40E_PTP_10GB_INCVAL 0x0333333333ULL
+#define I40E_PTP_1GB_INCVAL  0x2000000000ULL
+
+#define I40E_PRTTSYN_CTL1_TSYNTYPE_V1  (0x1 << \
+					I40E_PRTTSYN_CTL1_TSYNTYPE_SHIFT)
+#define I40E_PRTTSYN_CTL1_TSYNTYPE_V2  (0x2 << \
+					I40E_PRTTSYN_CTL1_TSYNTYPE_SHIFT)
+#define I40E_PTP_TX_TIMEOUT  (HZ * 15)
+
+/**
+ * i40e_ptp_read - Read the PHC time from the device
+ * @pf: Board private structure
+ * @ts: timespec structure to hold the current time value
+ *
+ * This function reads the PRTTSYN_TIME registers and stores them in a
+ * timespec. However, since the registers are 64 bits of nanoseconds, we must
+ * convert the result to a timespec before we can return.
+ **/
+static void i40e_ptp_read(struct i40e_pf *pf, struct timespec *ts)
+{
+	struct i40e_hw *hw = &pf->hw;
+	u32 hi, lo;
+	u64 ns;
+
+	/* The timer latches on the lowest register read. */
+	lo = rd32(hw, I40E_PRTTSYN_TIME_L);
+	hi = rd32(hw, I40E_PRTTSYN_TIME_H);
+
+	ns = (((u64)hi) << 32) | lo;
+
+	*ts = ns_to_timespec(ns);
+}
+
+/**
+ * i40e_ptp_write - Write the PHC time to the device
+ * @pf: Board private structure
+ * @ts: timespec structure that holds the new time value
+ *
+ * This function writes the PRTTSYN_TIME registers with the user value. Since
+ * we receive a timespec from the stack, we must convert that timespec into
+ * nanoseconds before programming the registers.
+ **/
+static void i40e_ptp_write(struct i40e_pf *pf, const struct timespec *ts)
+{
+	struct i40e_hw *hw = &pf->hw;
+	u64 ns = timespec_to_ns(ts);
+
+	/* The timer will not update until the high register is written, so
+	 * write the low register first.
+	 */
+	wr32(hw, I40E_PRTTSYN_TIME_L, ns & 0xFFFFFFFF);
+	wr32(hw, I40E_PRTTSYN_TIME_H, ns >> 32);
+}
+
+/**
+ * i40e_ptp_convert_to_hwtstamp - Convert device clock to system time
+ * @hwtstamps: Timestamp structure to update
+ * @timestamp: Timestamp from the hardware
+ *
+ * We need to convert the NIC clock value into a hwtstamp which can be used by
+ * the upper level timestamping functions. Since the timestamp is simply a 64-
+ * bit nanosecond value, we can call ns_to_ktime directly to handle this.
+ **/
+static void i40e_ptp_convert_to_hwtstamp(struct skb_shared_hwtstamps *hwtstamps,
+					 u64 timestamp)
+{
+	memset(hwtstamps, 0, sizeof(*hwtstamps));
+
+	hwtstamps->hwtstamp = ns_to_ktime(timestamp);
+}
+
+/**
+ * i40e_ptp_adjfreq - Adjust the PHC frequency
+ * @ptp: The PTP clock structure
+ * @ppb: Parts per billion adjustment from the base
+ *
+ * Adjust the frequency of the PHC by the indicated parts per billion from the
+ * base frequency.
+ **/
+static int i40e_ptp_adjfreq(struct ptp_clock_info *ptp, s32 ppb)
+{
+	struct i40e_pf *pf = container_of(ptp, struct i40e_pf, ptp_caps);
+	struct i40e_hw *hw = &pf->hw;
+	u64 adj, freq, diff;
+	int neg_adj = 0;
+
+	if (ppb < 0) {
+		neg_adj = 1;
+		ppb = -ppb;
+	}
+
+	smp_mb(); /* Force any pending update before accessing. */
+	adj = ACCESS_ONCE(pf->ptp_base_adj);
+
+	freq = adj;
+	freq *= ppb;
+	diff = div_u64(freq, 1000000000ULL);
+
+	if (neg_adj)
+		adj -= diff;
+	else
+		adj += diff;
+
+	wr32(hw, I40E_PRTTSYN_INC_L, adj & 0xFFFFFFFF);
+	wr32(hw, I40E_PRTTSYN_INC_H, adj >> 32);
+
+	return 0;
+}
+
+/**
+ * i40e_ptp_adjtime - Adjust the PHC time
+ * @ptp: The PTP clock structure
+ * @delta: Offset in nanoseconds to adjust the PHC time by
+ *
+ * Adjust the frequency of the PHC by the indicated parts per billion from the
+ * base frequency.
+ **/
+static int i40e_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
+{
+	struct i40e_pf *pf = container_of(ptp, struct i40e_pf, ptp_caps);
+	struct timespec now, then = ns_to_timespec(delta);
+	unsigned long flags;
+
+	spin_lock_irqsave(&pf->tmreg_lock, flags);
+
+	i40e_ptp_read(pf, &now);
+	now = timespec_add(now, then);
+	i40e_ptp_write(pf, (const struct timespec *)&now);
+
+	spin_unlock_irqrestore(&pf->tmreg_lock, flags);
+
+	return 0;
+}
+
+/**
+ * i40e_ptp_gettime - Get the time of the PHC
+ * @ptp: The PTP clock structure
+ * @ts: timespec structure to hold the current time value
+ *
+ * Read the device clock and return the correct value on ns, after converting it
+ * into a timespec struct.
+ **/
+static int i40e_ptp_gettime(struct ptp_clock_info *ptp, struct timespec *ts)
+{
+	struct i40e_pf *pf = container_of(ptp, struct i40e_pf, ptp_caps);
+	unsigned long flags;
+
+	spin_lock_irqsave(&pf->tmreg_lock, flags);
+	i40e_ptp_read(pf, ts);
+	spin_unlock_irqrestore(&pf->tmreg_lock, flags);
+
+	return 0;
+}
+
+/**
+ * i40e_ptp_settime - Set the time of the PHC
+ * @ptp: The PTP clock structure
+ * @ts: timespec structure that holds the new time value
+ *
+ * Set the device clock to the user input value. The conversion from timespec
+ * to ns happens in the write function.
+ **/
+static int i40e_ptp_settime(struct ptp_clock_info *ptp,
+			    const struct timespec *ts)
+{
+	struct i40e_pf *pf = container_of(ptp, struct i40e_pf, ptp_caps);
+	unsigned long flags;
+
+	spin_lock_irqsave(&pf->tmreg_lock, flags);
+	i40e_ptp_write(pf, ts);
+	spin_unlock_irqrestore(&pf->tmreg_lock, flags);
+
+	return 0;
+}
+
+/**
+ * i40e_ptp_tx_work
+ * @work: pointer to work struct
+ *
+ * This work function polls the PRTTSYN_STAT_0.TXTIME bit to determine when a
+ * Tx timestamp event has occurred, in order to pass the Tx timestamp value up
+ * the stack in the skb.
+ */
+void i40e_ptp_tx_work(struct work_struct *work)
+{
+	struct i40e_pf *pf = container_of(work, struct i40e_pf,
+					  ptp_tx_work);
+	struct i40e_hw *hw = &pf->hw;
+	u32 prttsyn_stat_0;
+
+	if (!pf->ptp_tx_skb)
+		return;
+
+	if (time_is_before_jiffies(pf->ptp_tx_start +
+				   I40E_PTP_TX_TIMEOUT)) {
+		dev_kfree_skb_any(pf->ptp_tx_skb);
+		pf->ptp_tx_skb = NULL;
+		pf->tx_hwtstamp_timeouts++;
+		dev_warn(&pf->pdev->dev, "clearing Tx timestamp hang");
+		return;
+	}
+
+	prttsyn_stat_0 = rd32(hw, I40E_PRTTSYN_STAT_0);
+	if (prttsyn_stat_0 & I40E_PRTTSYN_STAT_0_TXTIME_MASK)
+		i40e_ptp_tx_hwtstamp(pf);
+	else
+		schedule_work(&pf->ptp_tx_work);
+}
+
+/**
+ * i40e_ptp_enable - Enable/disable ancillary features of the PHC subsystem
+ * @ptp: The PTP clock structure
+ * @rq: The requested feature to change
+ * @on: Enable/disable flag
+ *
+ * The XL710 does not support any of the ancillary features of the PHY
+ * subsystem, so this function may just return.
+ **/
+static int i40e_ptp_enable(struct ptp_clock_info *ptp,
+			   struct ptp_clock_request *rq, int on)
+{
+	return -EOPNOTSUPP;
+}
+
+/**
+ * i40e_ptp_rx_hang - Detect error case when Rx timestamp registers are hung
+ * @vsi: The VSI with the rings relevant to 1588
+ *
+ * This watchdog task is scheduled to detect error case where hardware has
+ * dropped an Rx packet that was timestamped when the ring is full. The
+ * particular error is rare but leaves the device in a state unable to timestamp
+ * any future packets.
+ **/
+void i40e_ptp_rx_hang(struct i40e_vsi *vsi)
+{
+	struct i40e_pf *pf = vsi->back;
+	struct i40e_hw *hw = &pf->hw;
+	struct i40e_ring *rx_ring;
+	unsigned long rx_event;
+	u32 prttsyn_stat;
+	int n;
+
+	if (pf->flags & I40E_FLAG_PTP)
+		return;
+
+	prttsyn_stat = rd32(hw, I40E_PRTTSYN_STAT_1);
+
+	/* Unless all four receive timestamp registers are latched, we are not
+	 * concerned about a possible PTP Rx hang, so just update the timeout
+	 * counter and exit.
+	 */
+	if (!(prttsyn_stat & ((I40E_PRTTSYN_STAT_1_RXT0_MASK <<
+			       I40E_PRTTSYN_STAT_1_RXT0_SHIFT) |
+			      (I40E_PRTTSYN_STAT_1_RXT1_MASK <<
+			       I40E_PRTTSYN_STAT_1_RXT1_SHIFT) |
+			      (I40E_PRTTSYN_STAT_1_RXT2_MASK <<
+			       I40E_PRTTSYN_STAT_1_RXT2_SHIFT) |
+			      (I40E_PRTTSYN_STAT_1_RXT3_MASK <<
+			       I40E_PRTTSYN_STAT_1_RXT3_SHIFT)))) {
+		pf->last_rx_ptp_check = jiffies;
+		return;
+	}
+
+	/* Determine the most recent watchdog or rx_timestamp event. */
+	rx_event = pf->last_rx_ptp_check;
+	for (n = 0; n < vsi->num_queue_pairs; n++) {
+		rx_ring = vsi->rx_rings[n];
+		if (time_after(rx_ring->last_rx_timestamp, rx_event))
+			rx_event = rx_ring->last_rx_timestamp;
+	}
+
+	/* Only need to read the high RXSTMP register to clear the lock */
+	if (time_is_before_jiffies(rx_event + 5 * HZ)) {
+		rd32(hw, I40E_PRTTSYN_RXTIME_H(0));
+		rd32(hw, I40E_PRTTSYN_RXTIME_H(1));
+		rd32(hw, I40E_PRTTSYN_RXTIME_H(2));
+		rd32(hw, I40E_PRTTSYN_RXTIME_H(3));
+		pf->last_rx_ptp_check = jiffies;
+		pf->rx_hwtstamp_cleared++;
+		dev_warn(&vsi->back->pdev->dev,
+			 "%s: clearing Rx timestamp hang",
+			 __func__);
+	}
+}
+
+/**
+ * i40e_ptp_tx_hwtstamp - Utility function which returns the Tx timestamp
+ * @pf: Board private structure
+ *
+ * Read the value of the Tx timestamp from the registers, convert it into a
+ * value consumable by the stack, and store that result into the shhwtstamps
+ * struct before returning it up the stack.
+ **/
+void i40e_ptp_tx_hwtstamp(struct i40e_pf *pf)
+{
+	struct skb_shared_hwtstamps shhwtstamps;
+	struct i40e_hw *hw = &pf->hw;
+	u32 hi, lo;
+	u64 ns;
+
+	lo = rd32(hw, I40E_PRTTSYN_TXTIME_L);
+	hi = rd32(hw, I40E_PRTTSYN_TXTIME_H);
+
+	ns = (((u64)hi) << 32) | lo;
+
+	i40e_ptp_convert_to_hwtstamp(&shhwtstamps, ns);
+	skb_tstamp_tx(pf->ptp_tx_skb, &shhwtstamps);
+	dev_kfree_skb_any(pf->ptp_tx_skb);
+	pf->ptp_tx_skb = NULL;
+}
+
+/**
+ * i40e_ptp_rx_hwtstamp - Utility function which checks for an Rx timestamp
+ * @pf: Board private structure
+ * @skb: Particular skb to send timestamp with
+ * @index: Index into the receive timestamp registers for the timestamp
+ *
+ * The XL710 receives a notification in the receive descriptor with an offset
+ * into the set of RXTIME registers where the timestamp is for that skb. This
+ * function goes and fetches the receive timestamp from that offset, if a valid
+ * one exists. The RXTIME registers are in ns, so we must convert the result
+ * first.
+ **/
+void i40e_ptp_rx_hwtstamp(struct i40e_pf *pf, struct sk_buff *skb, u8 index)
+{
+	u32 prttsyn_stat, hi, lo;
+	struct i40e_hw *hw;
+	u64 ns;
+
+	/* Since we cannot turn off the Rx timestamp logic if the device is
+	 * doing Tx timestamping, check if Rx timestamping is configured.
+	 */
+	if (!pf->ptp_rx)
+		return;
+
+	hw = &pf->hw;
+
+	prttsyn_stat = rd32(hw, I40E_PRTTSYN_STAT_1);
+
+	if (!(prttsyn_stat & (1 << index)))
+		return;
+
+	lo = rd32(hw, I40E_PRTTSYN_RXTIME_L(index));
+	hi = rd32(hw, I40E_PRTTSYN_RXTIME_H(index));
+
+	ns = (((u64)hi) << 32) | lo;
+
+	i40e_ptp_convert_to_hwtstamp(skb_hwtstamps(skb), ns);
+}
+
+/**
+ * i40e_ptp_set_increment - Utility function to update clock increment rate
+ * @pf: Board private structure
+ *
+ * During a link change, the DMA frequency that drives the 1588 logic will
+ * change. In order to keep the PRTTSYN_TIME registers in units of nanoseconds,
+ * we must update the increment value per clock tick.
+ **/
+void i40e_ptp_set_increment(struct i40e_pf *pf)
+{
+	struct i40e_link_status *hw_link_info;
+	struct i40e_hw *hw = &pf->hw;
+	u64 incval;
+
+	hw_link_info = &hw->phy.link_info;
+
+	i40e_aq_get_link_info(&pf->hw, true, NULL, NULL);
+
+	switch (hw_link_info->link_speed) {
+	case I40E_LINK_SPEED_10GB:
+		incval = I40E_PTP_10GB_INCVAL;
+		break;
+	case I40E_LINK_SPEED_1GB:
+		incval = I40E_PTP_1GB_INCVAL;
+		break;
+	case I40E_LINK_SPEED_100MB:
+		dev_warn(&pf->pdev->dev,
+			 "%s: 1588 functionality is not supported at 100 Mbps. Stopping the PHC.\n",
+			 __func__);
+		incval = 0;
+		break;
+	case I40E_LINK_SPEED_40GB:
+	default:
+		incval = I40E_PTP_40GB_INCVAL;
+		break;
+	}
+
+	/* Write the new increment value into the increment register. The
+	 * hardware will not update the clock until both registers have been
+	 * written.
+	 */
+	wr32(hw, I40E_PRTTSYN_INC_L, incval & 0xFFFFFFFF);
+	wr32(hw, I40E_PRTTSYN_INC_H, incval >> 32);
+
+	/* Update the base adjustement value. */
+	ACCESS_ONCE(pf->ptp_base_adj) = incval;
+	smp_mb(); /* Force the above update. */
+}
+
+/**
+ * i40e_ptp_hwtstamp_ioctl - ioctl interface to control the HW timestamping
+ * @pf: Board private structure
+ * @ifreq: ioctl data
+ * @cmd: Particular ioctl requested
+ *
+ * Respond to the user filter requests and make the appropriate hardware
+ * changes here. The XL710 cannot support splitting of the Tx/Rx timestamping
+ * logic, so keep track in software of whether to indicate these timestamps
+ * or not.
+ *
+ * It is permissible to "upgrade" the user request to a broader filter, as long
+ * as the user receives the timestamps they care about and the user is notified
+ * the filter has been broadened.
+ **/
+int i40e_ptp_hwtstamp_ioctl(struct i40e_pf *pf, struct ifreq *ifr, int cmd)
+{
+	struct i40e_hw *hw = &pf->hw;
+	struct hwtstamp_config config;
+	u32 pf_id, tsyntype, regval;
+
+	if (copy_from_user(&config, ifr->ifr_data, sizeof(config)))
+		return -EFAULT;
+
+	/* Reserved for future extensions. */
+	if (config.flags)
+		return -EINVAL;
+
+	/* Confirm that 1588 is supported on this PF. */
+	pf_id = (rd32(hw, I40E_PRTTSYN_CTL0) & I40E_PRTTSYN_CTL0_PF_ID_MASK) >>
+		I40E_PRTTSYN_CTL0_PF_ID_SHIFT;
+	if (hw->pf_id != pf_id)
+		return -EINVAL;
+
+	switch (config.tx_type) {
+	case HWTSTAMP_TX_OFF:
+		pf->ptp_tx = false;
+		break;
+	case HWTSTAMP_TX_ON:
+		pf->ptp_tx = true;
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	switch (config.rx_filter) {
+	case HWTSTAMP_FILTER_NONE:
+		pf->ptp_rx = false;
+		tsyntype = 0;
+		break;
+	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+		pf->ptp_rx = true;
+		tsyntype = I40E_PRTTSYN_CTL1_V1MESSTYPE0_MASK |
+			   I40E_PRTTSYN_CTL1_TSYNTYPE_V1 |
+			   I40E_PRTTSYN_CTL1_UDP_ENA_MASK;
+		config.rx_filter = HWTSTAMP_FILTER_PTP_V1_L4_EVENT;
+		break;
+	case HWTSTAMP_FILTER_PTP_V2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
+		pf->ptp_rx = true;
+		tsyntype = I40E_PRTTSYN_CTL1_V2MESSTYPE0_MASK |
+			   I40E_PRTTSYN_CTL1_TSYNTYPE_V2 |
+			   I40E_PRTTSYN_CTL1_UDP_ENA_MASK;
+		config.rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT;
+		break;
+	case HWTSTAMP_FILTER_ALL:
+	default:
+		return -ERANGE;
+	}
+
+	/* Clear out all 1588-related registers to clear and unlatch them. */
+	rd32(hw, I40E_PRTTSYN_STAT_0);
+	rd32(hw, I40E_PRTTSYN_TXTIME_H);
+	rd32(hw, I40E_PRTTSYN_RXTIME_H(0));
+	rd32(hw, I40E_PRTTSYN_RXTIME_H(1));
+	rd32(hw, I40E_PRTTSYN_RXTIME_H(2));
+	rd32(hw, I40E_PRTTSYN_RXTIME_H(3));
+
+	/* Enable/disable the Tx timestamp interrupt based on user input. */
+	regval = rd32(hw, I40E_PRTTSYN_CTL0);
+	if (pf->ptp_tx)
+		regval |= I40E_PRTTSYN_CTL0_TXTIME_INT_ENA_MASK;
+	else
+		regval &= ~I40E_PRTTSYN_CTL0_TXTIME_INT_ENA_MASK;
+	wr32(hw, I40E_PRTTSYN_CTL0, regval);
+
+	regval = rd32(hw, I40E_PFINT_ICR0_ENA);
+	if (pf->ptp_tx)
+		regval |= I40E_PFINT_ICR0_ENA_TIMESYNC_MASK;
+	else
+		regval &= ~I40E_PFINT_ICR0_ENA_TIMESYNC_MASK;
+	wr32(hw, I40E_PFINT_ICR0_ENA, regval);
+
+	/* There is no simple on/off switch for Rx. To "disable" Rx support,
+	 * ignore any received timestamps, rather than turn off the clock.
+	 */
+	if (pf->ptp_rx) {
+		regval = rd32(hw, I40E_PRTTSYN_CTL1);
+		/* clear everything but the enable bit */
+		regval &= I40E_PRTTSYN_CTL1_TSYNENA_MASK;
+		/* now enable bits for desired Rx timestamps */
+		regval |= tsyntype;
+		wr32(hw, I40E_PRTTSYN_CTL1, regval);
+	}
+
+	return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ?
+		-EFAULT : 0;
+}
+
+/**
+ * i40e_ptp_init - Initialize the 1588 support and register the PHC
+ * @pf: Board private structure
+ *
+ * This function registers the device clock as a PHC. If it is successful, it
+ * starts the clock in the hardware.
+ **/
+void i40e_ptp_init(struct i40e_pf *pf)
+{
+	struct i40e_hw *hw = &pf->hw;
+	struct net_device *netdev = pf->vsi[pf->lan_vsi]->netdev;
+
+	snprintf(pf->ptp_caps.name, 16, "%pm", netdev->dev_addr);
+	pf->ptp_caps.owner = THIS_MODULE;
+	pf->ptp_caps.max_adj = 999999999;
+	pf->ptp_caps.n_ext_ts = 0;
+	pf->ptp_caps.pps = 0;
+	pf->ptp_caps.adjfreq = i40e_ptp_adjfreq;
+	pf->ptp_caps.adjtime = i40e_ptp_adjtime;
+	pf->ptp_caps.gettime = i40e_ptp_gettime;
+	pf->ptp_caps.settime = i40e_ptp_settime;
+	pf->ptp_caps.enable = i40e_ptp_enable;
+
+	/* Attempt to register the clock before enabling the hardware. */
+	pf->ptp_clock = ptp_clock_register(&pf->ptp_caps, &pf->pdev->dev);
+	if (IS_ERR(pf->ptp_clock)) {
+		pf->ptp_clock = NULL;
+		dev_err(&pf->pdev->dev, "%s: ptp_clock_register failed\n",
+			__func__);
+	} else {
+		struct timespec ts;
+		u32 regval;
+
+		spin_lock_init(&pf->tmreg_lock);
+		INIT_WORK(&pf->ptp_tx_work, i40e_ptp_tx_work);
+
+		dev_info(&pf->pdev->dev, "%s: added PHC on %s\n", __func__,
+			 netdev->name);
+		pf->flags |= I40E_FLAG_PTP;
+
+		/* Ensure the clocks are running. */
+		regval = rd32(hw, I40E_PRTTSYN_CTL0);
+		regval |= I40E_PRTTSYN_CTL0_TSYNENA_MASK;
+		wr32(hw, I40E_PRTTSYN_CTL0, regval);
+		regval = rd32(hw, I40E_PRTTSYN_CTL1);
+		regval |= I40E_PRTTSYN_CTL1_TSYNENA_MASK;
+		wr32(hw, I40E_PRTTSYN_CTL1, regval);
+
+		/* Set the increment value per clock tick. */
+		i40e_ptp_set_increment(pf);
+
+		/* Set the clock value. */
+		ts = ktime_to_timespec(ktime_get_real());
+		i40e_ptp_settime(&pf->ptp_caps, &ts);
+	}
+}
+
+/**
+ * i40e_ptp_stop - Disable the driver/hardware support and unregister the PHC
+ * @pf: Board private structure
+ *
+ * This function handles the cleanup work required from the initialization by
+ * clearing out the important information and unregistering the PHC.
+ **/
+void i40e_ptp_stop(struct i40e_pf *pf)
+{
+	pf->flags &= ~I40E_FLAG_PTP;
+	pf->ptp_tx = false;
+	pf->ptp_rx = false;
+
+	cancel_work_sync(&pf->ptp_tx_work);
+	if (pf->ptp_tx_skb) {
+		dev_kfree_skb_any(pf->ptp_tx_skb);
+		pf->ptp_tx_skb = NULL;
+	}
+
+	if (pf->ptp_clock) {
+		ptp_clock_unregister(pf->ptp_clock);
+		pf->ptp_clock = NULL;
+		dev_info(&pf->pdev->dev, "%s: removed PHC on %s\n", __func__,
+			 pf->vsi[pf->lan_vsi]->netdev->name);
+	}
+}
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 946d8b1..a089ac1 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -1088,6 +1088,13 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
 		}
 
 		skb->rxhash = i40e_rx_hash(rx_ring, rx_desc);
+		if (unlikely(rx_status & I40E_RXD_QW1_STATUS_TSYNVALID_MASK)) {
+			i40e_ptp_rx_hwtstamp(vsi->back, skb, (rx_status &
+					   I40E_RXD_QW1_STATUS_TSYNINDX_MASK) >>
+					   I40E_RXD_QW1_STATUS_TSYNINDX_SHIFT);
+			rx_ring->last_rx_timestamp = jiffies;
+		}
+
 		/* probably a little skewed due to removing CRC */
 		total_rx_bytes += skb->len;
 		total_rx_packets++;
@@ -1426,6 +1433,46 @@ static int i40e_tso(struct i40e_ring *tx_ring, struct sk_buff *skb,
 }
 
 /**
+ * i40e_tsyn - set up the tsyn context descriptor
+ * @tx_ring:  ptr to the ring to send
+ * @skb:      ptr to the skb we're sending
+ * @tx_flags: the collected send information
+ *
+ * Returns 0 if no Tx timestamp can happen and 1 if the timestamp will happen
+ **/
+static int i40e_tsyn(struct i40e_ring *tx_ring, struct sk_buff *skb,
+		     u32 tx_flags, u64 *cd_type_cmd_tso_mss)
+{
+	struct i40e_pf *pf;
+
+	if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)))
+		return 0;
+
+	/* Tx timestamps cannot be sampled when doing TSO */
+	if (tx_flags & I40E_TX_FLAGS_TSO)
+		return 0;
+
+	/* only timestamp the outbound packet if the user has requested it and
+	 * we are not already transmitting a packet to be timestamped
+	 */
+	pf = i40e_netdev_to_pf(tx_ring->netdev);
+	if (pf->ptp_tx && !pf->ptp_tx_skb) {
+		skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
+		pf->ptp_tx_skb = skb_get(skb);
+	} else {
+		return 0;
+	}
+
+	*cd_type_cmd_tso_mss |= (u64)I40E_TX_CTX_DESC_TSYN <<
+				I40E_TXD_CTX_QW1_CMD_SHIFT;
+
+	pf->ptp_tx_start = jiffies;
+	schedule_work(&pf->ptp_tx_work);
+
+	return 1;
+}
+
+/**
  * i40e_tx_enable_csum - Enable Tx checksum offloads
  * @skb: send buffer
  * @tx_flags: Tx flags currently set
@@ -1801,6 +1848,7 @@ static netdev_tx_t i40e_xmit_frame_ring(struct sk_buff *skb,
 	__be16 protocol;
 	u32 td_cmd = 0;
 	u8 hdr_len = 0;
+	int tsyn;
 	int tso;
 	if (0 == i40e_xmit_descriptor_count(skb, tx_ring))
 		return NETDEV_TX_BUSY;
@@ -1831,6 +1879,11 @@ static netdev_tx_t i40e_xmit_frame_ring(struct sk_buff *skb,
 
 	skb_tx_timestamp(skb);
 
+	tsyn = i40e_tsyn(tx_ring, skb, tx_flags, &cd_type_cmd_tso_mss);
+
+	if (tsyn)
+		tx_flags |= I40E_TX_FLAGS_TSYN;
+
 	/* always enable CRC insertion offload */
 	td_cmd |= I40E_TX_DESC_CMD_ICRC;
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index 93b8642..d534969 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -136,6 +136,7 @@ enum i40e_dyn_idx_t {
 #define I40E_TX_FLAGS_IPV6		(u32)(1 << 5)
 #define I40E_TX_FLAGS_FCCRC		(u32)(1 << 6)
 #define I40E_TX_FLAGS_FSO		(u32)(1 << 7)
+#define I40E_TX_FLAGS_TSYN		(u32)(1 << 8)
 #define I40E_TX_FLAGS_VLAN_MASK		0xffff0000
 #define I40E_TX_FLAGS_VLAN_PRIO_MASK	0xe0000000
 #define I40E_TX_FLAGS_VLAN_PRIO_SHIFT	29
@@ -248,6 +249,8 @@ struct i40e_ring {
 	u8 atr_sample_rate;
 	u8 atr_count;
 
+	unsigned long last_rx_timestamp;
+
 	bool ring_active;		/* is ring online or not */
 
 	/* stats structs */
-- 
1.8.3.1

^ permalink raw reply related

* [net-next 13/16] i40e: call clear_pxe after adminq is initialized
From: Jeff Kirsher @ 2014-01-10  8:58 UTC (permalink / raw)
  To: davem
  Cc: Shannon Nelson, netdev, gospo, sassmann, Jesse Brandeburg,
	Jeff Kirsher
In-Reply-To: <1389344336-1558-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Shannon Nelson <shannon.nelson@intel.com>

In the latest firmware the clear_pxe_mode function will use the
AdminQ request, so call this after AdminQ is set up rather than
relying on i40e_pf_reset() to clear the PXE mode.

Change-ID: Ice8cba2e9cbc3c7bde0a0bcf8eaf5009abef040b
Signed-off-by: Shannon Nelson <shannon.nelson@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Kavindya Deegala <kavindya.s.deegala@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index bc14a85..182ed18 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -7612,6 +7612,7 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		goto err_pf_reset;
 	}
 
+	i40e_clear_pxe_mode(hw);
 	err = i40e_get_capabilities(pf);
 	if (err)
 		goto err_adminq_setup;
-- 
1.8.3.1

^ permalink raw reply related

* [net-next 16/16] i40e: Bump version
From: Jeff Kirsher @ 2014-01-10  8:58 UTC (permalink / raw)
  To: davem
  Cc: Catherine Sullivan, netdev, gospo, sassmann, Jesse Brandeburg,
	Jeff Kirsher
In-Reply-To: <1389344336-1558-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Catherine Sullivan <catherine.sullivan@intel.com>

Update the driver version to 0.3.28-k.

Signed-off-by: Catherine Sullivan <catherine.sullivan@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 08fe2341..e13a3f8 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -38,7 +38,7 @@ static const char i40e_driver_string[] =
 
 #define DRV_VERSION_MAJOR 0
 #define DRV_VERSION_MINOR 3
-#define DRV_VERSION_BUILD 27
+#define DRV_VERSION_BUILD 28
 #define DRV_VERSION __stringify(DRV_VERSION_MAJOR) "." \
 	     __stringify(DRV_VERSION_MINOR) "." \
 	     __stringify(DRV_VERSION_BUILD)    DRV_KERN
-- 
1.8.3.1

^ permalink raw reply related

* [net-next 15/16] i40e: fix log message wording
From: Jeff Kirsher @ 2014-01-10  8:58 UTC (permalink / raw)
  To: davem
  Cc: Shannon Nelson, netdev, gospo, sassmann, Jesse Brandeburg,
	Jeff Kirsher
In-Reply-To: <1389344336-1558-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Shannon Nelson <shannon.nelson@intel.com>

Change the redundant "vsi VSI" to VSI.

Change-ID: Ic16ea5820a99abc7831713cde39e7d032a7ba4d3
Signed-off-by: Shannon Nelson <shannon.nelson@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Kavindya Deegala <kavindya.s.deegala@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_debugfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
index 0220b18..36a5cc8 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
@@ -1083,7 +1083,7 @@ static ssize_t i40e_dbg_command_write(struct file *filp,
 		vsi = i40e_dbg_find_vsi(pf, vsi_seid);
 		if (!vsi) {
 			dev_info(&pf->pdev->dev,
-				 "add relay: vsi VSI %d not found\n", vsi_seid);
+				 "add relay: VSI %d not found\n", vsi_seid);
 			goto command_write_done;
 		}
 
-- 
1.8.3.1

^ permalink raw reply related

* [net-next 11/16] i40e: adjust ITR max and min values
From: Jeff Kirsher @ 2014-01-10  8:58 UTC (permalink / raw)
  To: davem
  Cc: Shannon Nelson, netdev, gospo, sassmann, Jesse Brandeburg,
	Jeff Kirsher
In-Reply-To: <1389344336-1558-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Shannon Nelson <shannon.nelson@intel.com>

Set the ITR max and min values to match the hardware max value
and the recommended min value.  These values are shifted right
one bit because the register counts in 2 usec units, so leave
a comment to explain.

Change-ID: I289c27955cf6c566a6d21b95c3110b88cbb15dad
Signed-off-by: Shannon Nelson <shannon.nelson@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Kavindya Deegala <kavindya.s.deegala@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index 6f8506c..93b8642 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -29,9 +29,8 @@
 
 /* Interrupt Throttling and Rate Limiting (storm control) Goodies */
 
-#define I40E_MAX_ITR               0x07FF
-#define I40E_MIN_ITR               0x0001
-#define I40E_ITR_USEC_RESOLUTION   2
+#define I40E_MAX_ITR               0x0FF0  /* reg uses 2 usec resolution */
+#define I40E_MIN_ITR               0x0004  /* reg uses 2 usec resolution */
 #define I40E_MAX_IRATE             0x03F
 #define I40E_MIN_IRATE             0x001
 #define I40E_IRATE_USEC_RESOLUTION 4
-- 
1.8.3.1

^ permalink raw reply related

* [net-next 12/16] i40e: clear qtx_head before enabling Tx queue
From: Jeff Kirsher @ 2014-01-10  8:58 UTC (permalink / raw)
  To: davem
  Cc: Shannon Nelson, netdev, gospo, sassmann, Jesse Brandeburg,
	Jeff Kirsher
In-Reply-To: <1389344336-1558-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Shannon Nelson <shannon.nelson@intel.com>

Make sure the "new" qtx_head[q] register is cleared before
enabling the Tx queue.

Change-ID: I0c7a12815e343a5ae68807af172a35d6c6857935
Signed-off-by: Shannon Nelson <shannon.nelson@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Kavindya Deegala <kavindya.s.deegala@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 65c27cb..bc14a85 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -3008,11 +3008,13 @@ static int i40e_vsi_control_tx(struct i40e_vsi *vsi, bool enable)
 			continue;
 
 		/* turn on/off the queue */
-		if (enable)
+		if (enable) {
+			wr32(hw, I40E_QTX_HEAD(pf_q), 0);
 			tx_reg |= I40E_QTX_ENA_QENA_REQ_MASK |
 				  I40E_QTX_ENA_QENA_STAT_MASK;
-		else
+		} else {
 			tx_reg &= ~I40E_QTX_ENA_QENA_REQ_MASK;
+		}
 
 		wr32(hw, I40E_QTX_ENA(pf_q), tx_reg);
 
-- 
1.8.3.1

^ permalink raw reply related

* [net-next 10/16] i40e: check for possible incorrect ipv6 checksum
From: Jeff Kirsher @ 2014-01-10  8:58 UTC (permalink / raw)
  To: davem
  Cc: Shannon Nelson, netdev, gospo, sassmann, Jesse Brandeburg,
	Jeff Kirsher
In-Reply-To: <1389344336-1558-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Shannon Nelson <shannon.nelson@intel.com>

If the IPV6EXADD bit is set in the Rx descriptor status, there
was an optional extension header with an alternate IP address
detected.  The HW checksum offload doesn't handle the alternate
IP address correctly so likely comes up with the wrong answer.
Thus, if the bit is set we ignore the checksum offload value.

Change-ID: I70ff8d38cdcddccf44107691cae13d0c07c284c8
Signed-off-by: Shannon Nelson <shannon.nelson@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Kavindya Deegala <kavindya.s.deegala@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c | 4 ++++
 drivers/net/ethernet/intel/i40e/i40e_type.h | 4 +++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 43d88dd..946d8b1 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -892,6 +892,10 @@ static inline void i40e_rx_checksum(struct i40e_vsi *vsi,
 	      rx_status & (1 << I40E_RX_DESC_STATUS_L3L4P_SHIFT)))
 		return;
 
+	/* likely incorrect csum if alternate IP extention headers found */
+	if (rx_status & (1 << I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT))
+		return;
+
 	/* IP or L4 or outmost IP checksum error */
 	if (rx_error & ((1 << I40E_RX_DESC_ERROR_IPE_SHIFT) |
 			(1 << I40E_RX_DESC_ERROR_L4E_SHIFT) |
diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h b/drivers/net/ethernet/intel/i40e/i40e_type.h
index 072c850..80cf240 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_type.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_type.h
@@ -499,7 +499,9 @@ enum i40e_rx_desc_status_bits {
 	I40E_RX_DESC_STATUS_FLM_SHIFT		= 11,
 	I40E_RX_DESC_STATUS_FLTSTAT_SHIFT	= 12, /* 2 BITS */
 	I40E_RX_DESC_STATUS_LPBK_SHIFT		= 14,
-	I40E_RX_DESC_STATUS_UDP_0_SHIFT		= 16
+	I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT	= 15,
+	I40E_RX_DESC_STATUS_RESERVED_SHIFT	= 16, /* 2 BITS */
+	I40E_RX_DESC_STATUS_UDP_0_SHIFT		= 18
 };
 
 #define I40E_RXD_QW1_STATUS_TSYNINDX_SHIFT   I40E_RX_DESC_STATUS_TSYNINDX_SHIFT
-- 
1.8.3.1

^ permalink raw reply related

* [net-next 08/16] i40e: do not bail when disabling if Tx queue disable fails
From: Jeff Kirsher @ 2014-01-10  8:58 UTC (permalink / raw)
  To: davem
  Cc: Anjali Singhai Jain, netdev, gospo, sassmann, Jesse Brandeburg,
	Jeff Kirsher
In-Reply-To: <1389344336-1558-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Anjali Singhai Jain <anjali.singhai@intel.com>

Fix a bug where the driver was erroneously exiting the driver unload
path if one part of the unload failed.  Instead of the original way
the driver should always continue when disabling and be sure to disable
all queues.

Change-ID: Ib8c81c596bc87c31d8e9ca97ebf871168475279d
Signed-off-by: Anjali Singhai Jain <anjali.singhai@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Kavindya Deegala <kavindya.s.deegala@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 80c83eb..65c27cb 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -3113,7 +3113,7 @@ static int i40e_vsi_control_rx(struct i40e_vsi *vsi, bool enable)
  **/
 int i40e_vsi_control_rings(struct i40e_vsi *vsi, bool request)
 {
-	int ret;
+	int ret = 0;
 
 	/* do rx first for enable and last for disable */
 	if (request) {
@@ -3122,10 +3122,9 @@ int i40e_vsi_control_rings(struct i40e_vsi *vsi, bool request)
 			return ret;
 		ret = i40e_vsi_control_tx(vsi, request);
 	} else {
-		ret = i40e_vsi_control_tx(vsi, request);
-		if (ret)
-			return ret;
-		ret = i40e_vsi_control_rx(vsi, request);
+		/* Ignore return value, we need to shutdown whatever we can */
+		i40e_vsi_control_tx(vsi, request);
+		i40e_vsi_control_rx(vsi, request);
 	}
 
 	return ret;
-- 
1.8.3.1

^ permalink raw reply related

* [net-next 06/16] i40e: Cleanup Doxygen warnings
From: Jeff Kirsher @ 2014-01-10  8:58 UTC (permalink / raw)
  To: davem; +Cc: Jeff Kirsher, netdev, gospo, sassmann, Jesse Brandeburg
In-Reply-To: <1389344336-1558-1-git-send-email-jeffrey.t.kirsher@intel.com>

These changes make Doxygen/kdoc work correctly without warnings.

Change-ID: I2941f38860be805ff7548d84dae35754c83f1d62
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Kavindya Deegala <kavindya.s.deegala@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_adminq.c | 26 +++++++++++++-------------
 drivers/net/ethernet/intel/i40e/i40e_common.c | 17 ++++++++++-------
 drivers/net/ethernet/intel/i40e/i40e_hmc.h    |  3 ---
 drivers/net/ethernet/intel/i40e/i40e_nvm.c    |  1 +
 4 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq.c b/drivers/net/ethernet/intel/i40e/i40e_adminq.c
index 4e5a02c..a50e6b3 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq.c
@@ -128,7 +128,7 @@ static void i40e_free_adminq_arq(struct i40e_hw *hw)
 
 /**
  *  i40e_alloc_arq_bufs - Allocate pre-posted buffers for the receive queue
- *  @hw:     pointer to the hardware structure
+ *  @hw: pointer to the hardware structure
  **/
 static i40e_status i40e_alloc_arq_bufs(struct i40e_hw *hw)
 {
@@ -195,7 +195,7 @@ unwind_alloc_arq_bufs:
 
 /**
  *  i40e_alloc_asq_bufs - Allocate empty buffer structs for the send queue
- *  @hw:     pointer to the hardware structure
+ *  @hw: pointer to the hardware structure
  **/
 static i40e_status i40e_alloc_asq_bufs(struct i40e_hw *hw)
 {
@@ -235,7 +235,7 @@ unwind_alloc_asq_bufs:
 
 /**
  *  i40e_free_arq_bufs - Free receive queue buffer info elements
- *  @hw:     pointer to the hardware structure
+ *  @hw: pointer to the hardware structure
  **/
 static void i40e_free_arq_bufs(struct i40e_hw *hw)
 {
@@ -254,7 +254,7 @@ static void i40e_free_arq_bufs(struct i40e_hw *hw)
 
 /**
  *  i40e_free_asq_bufs - Free send queue buffer info elements
- *  @hw:     pointer to the hardware structure
+ *  @hw: pointer to the hardware structure
  **/
 static void i40e_free_asq_bufs(struct i40e_hw *hw)
 {
@@ -277,7 +277,7 @@ static void i40e_free_asq_bufs(struct i40e_hw *hw)
 
 /**
  *  i40e_config_asq_regs - configure ASQ registers
- *  @hw:     pointer to the hardware structure
+ *  @hw: pointer to the hardware structure
  *
  *  Configure base address and length registers for the transmit queue
  **/
@@ -304,7 +304,7 @@ static void i40e_config_asq_regs(struct i40e_hw *hw)
 
 /**
  *  i40e_config_arq_regs - ARQ register configuration
- *  @hw:     pointer to the hardware structure
+ *  @hw: pointer to the hardware structure
  *
  * Configure base address and length registers for the receive (event queue)
  **/
@@ -334,7 +334,7 @@ static void i40e_config_arq_regs(struct i40e_hw *hw)
 
 /**
  *  i40e_init_asq - main initialization routine for ASQ
- *  @hw:     pointer to the hardware structure
+ *  @hw: pointer to the hardware structure
  *
  *  This is the main initialization routine for the Admin Send Queue
  *  Prior to calling this function, drivers *MUST* set the following fields
@@ -391,7 +391,7 @@ init_adminq_exit:
 
 /**
  *  i40e_init_arq - initialize ARQ
- *  @hw:     pointer to the hardware structure
+ *  @hw: pointer to the hardware structure
  *
  *  The main initialization routine for the Admin Receive (Event) Queue.
  *  Prior to calling this function, drivers *MUST* set the following fields
@@ -448,7 +448,7 @@ init_adminq_exit:
 
 /**
  *  i40e_shutdown_asq - shutdown the ASQ
- *  @hw:     pointer to the hardware structure
+ *  @hw: pointer to the hardware structure
  *
  *  The main shutdown routine for the Admin Send Queue
  **/
@@ -479,7 +479,7 @@ static i40e_status i40e_shutdown_asq(struct i40e_hw *hw)
 
 /**
  *  i40e_shutdown_arq - shutdown ARQ
- *  @hw:     pointer to the hardware structure
+ *  @hw: pointer to the hardware structure
  *
  *  The main shutdown routine for the Admin Receive Queue
  **/
@@ -510,7 +510,7 @@ static i40e_status i40e_shutdown_arq(struct i40e_hw *hw)
 
 /**
  *  i40e_init_adminq - main initialization routine for Admin Queue
- *  @hw:     pointer to the hardware structure
+ *  @hw: pointer to the hardware structure
  *
  *  Prior to calling this function, drivers *MUST* set the following fields
  *  in the hw->aq structure:
@@ -607,7 +607,7 @@ init_adminq_exit:
 
 /**
  *  i40e_shutdown_adminq - shutdown routine for the Admin Queue
- *  @hw:     pointer to the hardware structure
+ *  @hw: pointer to the hardware structure
  **/
 i40e_status i40e_shutdown_adminq(struct i40e_hw *hw)
 {
@@ -626,7 +626,7 @@ i40e_status i40e_shutdown_adminq(struct i40e_hw *hw)
 
 /**
  *  i40e_clean_asq - cleans Admin send queue
- *  @asq: pointer to the adminq send ring
+ *  @hw: pointer to the hardware structure
  *
  *  returns the number of free desc
  **/
diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c
index 1bf0ec1..0b5a75c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -74,7 +74,8 @@ static i40e_status i40e_set_mac_type(struct i40e_hw *hw)
 /**
  * i40e_debug_aq
  * @hw: debug mask related to admin queue
- * @cap: pointer to adminq command descriptor
+ * @mask: debug mask
+ * @desc: pointer to admin queue descriptor
  * @buffer: pointer to command buffer
  *
  * Dumps debug log about adminq command with descriptor contents.
@@ -629,7 +630,7 @@ aq_get_link_info_exit:
 /**
  * i40e_aq_add_vsi
  * @hw: pointer to the hw struct
- * @vsi: pointer to a vsi context struct
+ * @vsi_ctx: pointer to a vsi context struct
  * @cmd_details: pointer to command details structure or NULL
  *
  * Add a VSI context to the hardware.
@@ -776,7 +777,7 @@ i40e_status i40e_aq_set_vsi_broadcast(struct i40e_hw *hw,
 /**
  * i40e_get_vsi_params - get VSI configuration info
  * @hw: pointer to the hw struct
- * @vsi: pointer to a vsi context struct
+ * @vsi_ctx: pointer to a vsi context struct
  * @cmd_details: pointer to command details structure or NULL
  **/
 i40e_status i40e_aq_get_vsi_params(struct i40e_hw *hw,
@@ -818,7 +819,7 @@ aq_get_vsi_params_exit:
 /**
  * i40e_aq_update_vsi_params
  * @hw: pointer to the hw struct
- * @vsi: pointer to a vsi context struct
+ * @vsi_ctx: pointer to a vsi context struct
  * @cmd_details: pointer to command details structure or NULL
  *
  * Update a VSI context.
@@ -921,7 +922,6 @@ i40e_status i40e_aq_get_firmware_version(struct i40e_hw *hw,
 /**
  * i40e_aq_send_driver_version
  * @hw: pointer to the hw struct
- * @event: driver event: driver ok, start or stop
  * @dv: driver's major, minor version
  * @cmd_details: pointer to command details structure or NULL
  *
@@ -1039,10 +1039,10 @@ i40e_status i40e_aq_add_veb(struct i40e_hw *hw, u16 uplink_seid,
  * @hw: pointer to the hw struct
  * @veb_seid: the SEID of the VEB to query
  * @switch_id: the uplink switch id
- * @floating_veb: set to true if the VEB is floating
+ * @floating: set to true if the VEB is floating
  * @statistic_index: index of the stats counter block for this VEB
  * @vebs_used: number of VEB's used by function
- * @vebs_unallocated: total VEB's not reserved by any function
+ * @vebs_free: total VEB's not reserved by any function
  * @cmd_details: pointer to command details structure or NULL
  *
  * This retrieves the parameters for a particular VEB, specified by
@@ -1179,6 +1179,8 @@ i40e_status i40e_aq_remove_macvlan(struct i40e_hw *hw, u16 seid,
  * i40e_aq_send_msg_to_vf
  * @hw: pointer to the hardware structure
  * @vfid: vf id to send msg
+ * @v_opcode: opcodes for VF-PF communication
+ * @v_retval: return error code
  * @msg: pointer to the msg buffer
  * @msglen: msg length
  * @cmd_details: pointer to command details
@@ -1723,6 +1725,7 @@ i40e_status i40e_aq_start_lldp(struct i40e_hw *hw,
  * @udp_port: the UDP port to add
  * @header_len: length of the tunneling header length in DWords
  * @protocol_index: protocol index type
+ * @filter_index: pointer to filter index
  * @cmd_details: pointer to command details structure or NULL
  **/
 i40e_status i40e_aq_add_udp_tunnel(struct i40e_hw *hw,
diff --git a/drivers/net/ethernet/intel/i40e/i40e_hmc.h b/drivers/net/ethernet/intel/i40e/i40e_hmc.h
index 72bf30a..0cd4701 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_hmc.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_hmc.h
@@ -116,7 +116,6 @@ struct i40e_hmc_info {
  * @hw: pointer to our hw struct
  * @pa: pointer to physical address
  * @sd_index: segment descriptor index
- * @hmc_fn_id: hmc function id
  * @type: if sd entry is direct or paged
  **/
 #define I40E_SET_PF_SD_ENTRY(hw, pa, sd_index, type)			\
@@ -138,7 +137,6 @@ struct i40e_hmc_info {
  * I40E_CLEAR_PF_SD_ENTRY - marks the sd entry as invalid in the hardware
  * @hw: pointer to our hw struct
  * @sd_index: segment descriptor index
- * @hmc_fn_id: hmc function id
  * @type: if sd entry is direct or paged
  **/
 #define I40E_CLEAR_PF_SD_ENTRY(hw, sd_index, type)			\
@@ -159,7 +157,6 @@ struct i40e_hmc_info {
  * @hw: pointer to our hw struct
  * @sd_idx: segment descriptor index
  * @pd_idx: page descriptor index
- * @hmc_fn_id: hmc function id
  **/
 #define I40E_INVALIDATE_PF_HMC_PD(hw, sd_idx, pd_idx)			\
 	wr32((hw), I40E_PFHMC_PDINV,					\
diff --git a/drivers/net/ethernet/intel/i40e/i40e_nvm.c b/drivers/net/ethernet/intel/i40e/i40e_nvm.c
index 37d66c8..e12bf07 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_nvm.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_nvm.c
@@ -244,6 +244,7 @@ i40e_status i40e_read_nvm_buffer(struct i40e_hw *hw, u16 offset,
 /**
  *  i40e_calc_nvm_checksum - Calculates and returns the checksum
  *  @hw: pointer to hardware structure
+ *  @checksum: pointer to the checksum
  *
  *  This function calculate SW Checksum that covers the whole 64kB shadow RAM
  *  except the VPD and PCIe ALT Auto-load modules. The structure and size of VPD
-- 
1.8.3.1

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox