* [PATCH net-next 11/13] net: sched: implement tcf_block_get() and tcf_block_put()
From: Vlad Buslov @ 2018-09-06 7:59 UTC (permalink / raw)
To: netdev
Cc: jhs, xiyou.wangcong, jiri, davem, stephen, ktkhai, paulmck,
nicolas.dichtel, leon, gregkh, mark.rutland, fw, dsahern,
lucien.xin, jakub.kicinski, christian.brauner, jbenc, Vlad Buslov
In-Reply-To: <1536220742-25650-1-git-send-email-vladbu@mellanox.com>
Implement get/put function for blocks that only take/release the reference
and perform deallocation. These functions are intended to be used by
unlocked rules update path to always hold reference to block while working
with it. They use on new fine-grained locking mechanisms introduced in
previous patches in this set, instead of relying on global protection
provided by rtnl lock.
Extract code that is common with tcf_block_detach_ext() into common
function __tcf_block_put().
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
net/sched/cls_api.c | 70 ++++++++++++++++++++++++++++++++++++-----------------
1 file changed, 48 insertions(+), 22 deletions(-)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index f06aa9313a58..5d9f91331d26 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -537,6 +537,19 @@ static struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index)
return idr_find(&tn->idr, block_index);
}
+static struct tcf_block *tcf_block_get(struct net *net, u32 block_index)
+{
+ struct tcf_block *block;
+
+ rcu_read_lock();
+ block = tcf_block_lookup(net, block_index);
+ if (block && !refcount_inc_not_zero(&block->refcnt))
+ block = NULL;
+ rcu_read_unlock();
+
+ return block;
+}
+
static void tcf_qdisc_put(struct Qdisc *q, bool rtnl_held)
{
if (!q)
@@ -573,6 +586,40 @@ static void tcf_block_put_all_chains(struct tcf_block *block)
}
}
+static void __tcf_block_put(struct tcf_block *block, struct Qdisc *q,
+ struct tcf_block_ext_info *ei)
+{
+ if (refcount_dec_and_test(&block->refcnt)) {
+ /* Flushing/putting all chains will cause the block to be
+ * deallocated when last chain is freed. However, if chain_list
+ * is empty, block has to be manually deallocated. After block
+ * reference counter reached 0, it is no longer possible to
+ * increment it or add new chains to block.
+ */
+ bool free_block = list_empty(&block->chain_list);
+
+ if (tcf_block_shared(block))
+ tcf_block_remove(block, block->net);
+ if (!free_block)
+ tcf_block_flush_all_chains(block);
+
+ if (q)
+ tcf_block_offload_unbind(block, q, ei);
+
+ if (free_block)
+ kfree_rcu(block, rcu);
+ else
+ tcf_block_put_all_chains(block);
+ } else if (q) {
+ tcf_block_offload_unbind(block, q, ei);
+ }
+}
+
+static void tcf_block_put(struct tcf_block *block)
+{
+ __tcf_block_put(block, NULL, NULL);
+}
+
/* Find tcf block.
* Set q, parent, cl when appropriate.
*/
@@ -835,28 +882,7 @@ void tcf_block_detach_ext(struct tcf_block *block, struct Qdisc *q,
tcf_chain0_head_change_cb_del(block, ei);
tcf_block_owner_del(block, q, ei->binder_type);
- if (refcount_dec_and_test(&block->refcnt)) {
- /* Flushing/putting all chains will cause the block to be
- * deallocated when last chain is freed. However, if chain_list
- * is empty, block has to be manually deallocated. After block
- * reference counter reached 0, it is no longer possible to
- * increment it or add new chains to block.
- */
- bool free_block = list_empty(&block->chain_list);
-
- if (tcf_block_shared(block))
- tcf_block_remove(block, block->net);
- if (!free_block)
- tcf_block_flush_all_chains(block);
- tcf_block_offload_unbind(block, q, ei);
-
- if (free_block)
- kfree_rcu(block, rcu);
- else
- tcf_block_put_all_chains(block);
- } else {
- tcf_block_offload_unbind(block, q, ei);
- }
+ __tcf_block_put(block, q, ei);
}
EXPORT_SYMBOL(tcf_block_detach_ext);
--
2.7.5
^ permalink raw reply related
* [PATCH net-next 12/13] net: sched: use reference counting for tcf blocks on rules update
From: Vlad Buslov @ 2018-09-06 7:59 UTC (permalink / raw)
To: netdev
Cc: jhs, xiyou.wangcong, jiri, davem, stephen, ktkhai, paulmck,
nicolas.dichtel, leon, gregkh, mark.rutland, fw, dsahern,
lucien.xin, jakub.kicinski, christian.brauner, jbenc, Vlad Buslov
In-Reply-To: <1536220742-25650-1-git-send-email-vladbu@mellanox.com>
In order to remove dependency on rtnl lock on rules update path, always
take reference to block while using it on rules update path. Change
tcf_block_get() error handling to properly release block with reference
counting, instead of just destroying it, in order to accommodate potential
concurrent users.
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
net/sched/cls_api.c | 37 ++++++++++++++++++++-----------------
1 file changed, 20 insertions(+), 17 deletions(-)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 5d9f91331d26..8808818a1d24 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -633,7 +633,7 @@ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q,
int err = 0;
if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
- block = tcf_block_lookup(net, block_index);
+ block = tcf_block_get(net, block_index);
if (!block) {
NL_SET_ERR_MSG(extack, "Block of given index was not found");
return ERR_PTR(-EINVAL);
@@ -713,6 +713,14 @@ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q,
err = -EOPNOTSUPP;
goto errout_qdisc;
}
+
+ /* Always take reference to block in order to support execution
+ * of rules update path of cls API without rtnl lock. Caller
+ * must release block when it is finished using it. 'if' block
+ * of this conditional obtain reference to block by calling
+ * tcf_block_get().
+ */
+ refcount_inc(&block->refcnt);
}
return block;
@@ -726,6 +734,8 @@ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q,
static void tcf_block_release(struct Qdisc *q, struct tcf_block *block)
{
+ if (!IS_ERR_OR_NULL(block))
+ tcf_block_put(block);
tcf_qdisc_put(q, true);
}
@@ -794,21 +804,16 @@ int tcf_block_attach_ext(struct tcf_block **p_block, struct Qdisc *q,
{
struct net *net = qdisc_net(q);
struct tcf_block *block = NULL;
- bool created = false;
int err;
- if (ei->block_index) {
+ if (ei->block_index)
/* block_index not 0 means the shared block is requested */
- block = tcf_block_lookup(net, ei->block_index);
- if (block)
- refcount_inc(&block->refcnt);
- }
+ block = tcf_block_get(net, ei->block_index);
if (!block) {
block = tcf_block_create(net, q, ei->block_index, extack);
if (IS_ERR(block))
return PTR_ERR(block);
- created = true;
if (tcf_block_shared(block)) {
err = tcf_block_insert(block, net, extack);
if (err)
@@ -838,14 +843,8 @@ int tcf_block_attach_ext(struct tcf_block **p_block, struct Qdisc *q,
err_chain0_head_change_cb_add:
tcf_block_owner_del(block, q, ei->binder_type);
err_block_owner_add:
- if (created) {
- if (tcf_block_shared(block))
- tcf_block_remove(block, net);
err_block_insert:
- kfree_rcu(block, rcu);
- } else {
- refcount_dec(&block->refcnt);
- }
+ tcf_block_put(block);
return err;
}
EXPORT_SYMBOL(tcf_block_attach_ext);
@@ -1738,7 +1737,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
return err;
if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
- block = tcf_block_lookup(net, tcm->tcm_block_index);
+ block = tcf_block_get(net, tcm->tcm_block_index);
if (!block)
goto out;
/* If we work with block index, q is NULL and parent value
@@ -1797,6 +1796,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
}
}
+ if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK)
+ tcf_block_put(block);
cb->args[0] = index;
out:
@@ -2061,7 +2062,7 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb)
return err;
if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
- block = tcf_block_lookup(net, tcm->tcm_block_index);
+ block = tcf_block_get(net, tcm->tcm_block_index);
if (!block)
goto out;
/* If we work with block index, q is NULL and parent value
@@ -2128,6 +2129,8 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb)
index++;
}
+ if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK)
+ tcf_block_put(block);
cb->args[0] = index;
out:
--
2.7.5
^ permalink raw reply related
* [PATCH net-next 09/13] net: sched: extend tcf_block with rcu
From: Vlad Buslov @ 2018-09-06 7:58 UTC (permalink / raw)
To: netdev
Cc: jhs, xiyou.wangcong, jiri, davem, stephen, ktkhai, paulmck,
nicolas.dichtel, leon, gregkh, mark.rutland, fw, dsahern,
lucien.xin, jakub.kicinski, christian.brauner, jbenc, Vlad Buslov
In-Reply-To: <1536220742-25650-1-git-send-email-vladbu@mellanox.com>
Extend tcf_block with rcu to allow safe deallocation when it is accessed
concurrently.
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
include/net/sch_generic.h | 1 +
net/sched/cls_api.c | 6 +++---
2 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 825e2bf6c5c3..2b87b47c49f6 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -357,6 +357,7 @@ struct tcf_block {
struct tcf_chain *chain;
struct list_head filter_chain_list;
} chain0;
+ struct rcu_head rcu;
};
static inline void tcf_block_offload_inc(struct tcf_block *block, u32 *flags)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index f11da74dd339..502b2da8a885 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -241,7 +241,7 @@ static void tcf_chain_destroy(struct tcf_chain *chain)
block->chain0.chain = NULL;
kfree(chain);
if (list_empty(&block->chain_list) && !refcount_read(&block->refcnt))
- kfree(block);
+ kfree_rcu(block, rcu);
}
static void tcf_chain_hold(struct tcf_chain *chain)
@@ -785,7 +785,7 @@ int tcf_block_attach_ext(struct tcf_block **p_block, struct Qdisc *q,
if (tcf_block_shared(block))
tcf_block_remove(block, net);
err_block_insert:
- kfree(block);
+ kfree_rcu(block, rcu);
} else {
refcount_dec(&block->refcnt);
}
@@ -841,7 +841,7 @@ void tcf_block_detach_ext(struct tcf_block *block, struct Qdisc *q,
tcf_block_offload_unbind(block, q, ei);
if (free_block)
- kfree(block);
+ kfree_rcu(block, rcu);
else
tcf_block_put_all_chains(block);
} else {
--
2.7.5
^ permalink raw reply related
* [PATCH net-next 10/13] net: sched: protect block idr with spinlock
From: Vlad Buslov @ 2018-09-06 7:58 UTC (permalink / raw)
To: netdev
Cc: jhs, xiyou.wangcong, jiri, davem, stephen, ktkhai, paulmck,
nicolas.dichtel, leon, gregkh, mark.rutland, fw, dsahern,
lucien.xin, jakub.kicinski, christian.brauner, jbenc, Vlad Buslov
In-Reply-To: <1536220742-25650-1-git-send-email-vladbu@mellanox.com>
Protect block idr access with spinlock, instead of relying on rtnl lock.
Take tn->idr_lock spinlock during block insertion and removal.
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
net/sched/cls_api.c | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 502b2da8a885..f06aa9313a58 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -473,6 +473,7 @@ tcf_chain0_head_change_cb_del(struct tcf_block *block,
}
struct tcf_net {
+ spinlock_t idr_lock; /* Protects idr */
struct idr idr;
};
@@ -482,16 +483,25 @@ static int tcf_block_insert(struct tcf_block *block, struct net *net,
struct netlink_ext_ack *extack)
{
struct tcf_net *tn = net_generic(net, tcf_net_id);
+ int err;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock(&tn->idr_lock);
+ err = idr_alloc_u32(&tn->idr, block, &block->index, block->index,
+ GFP_NOWAIT);
+ spin_unlock(&tn->idr_lock);
+ idr_preload_end();
- return idr_alloc_u32(&tn->idr, block, &block->index, block->index,
- GFP_KERNEL);
+ return err;
}
static void tcf_block_remove(struct tcf_block *block, struct net *net)
{
struct tcf_net *tn = net_generic(net, tcf_net_id);
+ spin_lock(&tn->idr_lock);
idr_remove(&tn->idr, block->index);
+ spin_unlock(&tn->idr_lock);
}
static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q,
@@ -2284,6 +2294,7 @@ static __net_init int tcf_net_init(struct net *net)
{
struct tcf_net *tn = net_generic(net, tcf_net_id);
+ spin_lock_init(&tn->idr_lock);
idr_init(&tn->idr);
return 0;
}
--
2.7.5
^ permalink raw reply related
* [PATCH net-next 13/13] net: sched: add flags to Qdisc class ops struct
From: Vlad Buslov @ 2018-09-06 7:59 UTC (permalink / raw)
To: netdev
Cc: jhs, xiyou.wangcong, jiri, davem, stephen, ktkhai, paulmck,
nicolas.dichtel, leon, gregkh, mark.rutland, fw, dsahern,
lucien.xin, jakub.kicinski, christian.brauner, jbenc, Vlad Buslov
In-Reply-To: <1536220742-25650-1-git-send-email-vladbu@mellanox.com>
Extend Qdisc_class_ops with flags. Create enum to hold possible class ops
flag values. Add first class ops flags value QDISC_CLASS_OPS_DOIT_UNLOCKED
to indicate that class ops functions can be called without taking rtnl
lock.
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
include/net/sch_generic.h | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 2b87b47c49f6..bc4082961726 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -174,6 +174,7 @@ static inline int qdisc_avail_bulklimit(const struct netdev_queue *txq)
}
struct Qdisc_class_ops {
+ unsigned int flags;
/* Child qdisc manipulation */
struct netdev_queue * (*select_queue)(struct Qdisc *, struct tcmsg *);
int (*graft)(struct Qdisc *, unsigned long cl,
@@ -205,6 +206,13 @@ struct Qdisc_class_ops {
struct gnet_dump *);
};
+/* Qdisc_class_ops flag values */
+
+/* Implements API that doesn't require rtnl lock */
+enum qdisc_class_ops_flags {
+ QDISC_CLASS_OPS_DOIT_UNLOCKED = 1,
+};
+
struct Qdisc_ops {
struct Qdisc_ops *next;
const struct Qdisc_class_ops *cl_ops;
--
2.7.5
^ permalink raw reply related
* [PATCH net-next 02/13] net: sched: rename qdisc_destroy() to qdisc_put()
From: Vlad Buslov @ 2018-09-06 7:58 UTC (permalink / raw)
To: netdev
Cc: jhs, xiyou.wangcong, jiri, davem, stephen, ktkhai, paulmck,
nicolas.dichtel, leon, gregkh, mark.rutland, fw, dsahern,
lucien.xin, jakub.kicinski, christian.brauner, jbenc, Vlad Buslov
In-Reply-To: <1536220742-25650-1-git-send-email-vladbu@mellanox.com>
Current implementation of qdisc_destroy() decrements Qdisc reference
counter and only actually destroy Qdisc if reference counter value reached
zero. Rename qdisc_destroy() to qdisc_put() in order for it to better
describe the way in which this function currently implemented and used.
Extract code that deallocates Qdisc into new private qdisc_destroy()
function. It is intended to be shared between regular qdisc_put() and its
unlocked version that is introduced in next patch in this series.
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
include/net/sch_generic.h | 2 +-
net/sched/sch_api.c | 6 +++---
net/sched/sch_atm.c | 2 +-
net/sched/sch_cbq.c | 2 +-
net/sched/sch_cbs.c | 2 +-
net/sched/sch_drr.c | 4 ++--
net/sched/sch_dsmark.c | 2 +-
net/sched/sch_fifo.c | 2 +-
net/sched/sch_generic.c | 23 ++++++++++++++---------
net/sched/sch_hfsc.c | 2 +-
net/sched/sch_htb.c | 4 ++--
net/sched/sch_mq.c | 4 ++--
net/sched/sch_mqprio.c | 4 ++--
net/sched/sch_multiq.c | 6 +++---
net/sched/sch_netem.c | 2 +-
net/sched/sch_prio.c | 6 +++---
net/sched/sch_qfq.c | 4 ++--
net/sched/sch_red.c | 4 ++--
net/sched/sch_sfb.c | 4 ++--
net/sched/sch_tbf.c | 4 ++--
20 files changed, 47 insertions(+), 42 deletions(-)
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index a6d00093f35e..18e22a5a6550 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -554,7 +554,7 @@ void dev_deactivate_many(struct list_head *head);
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
struct Qdisc *qdisc);
void qdisc_reset(struct Qdisc *qdisc);
-void qdisc_destroy(struct Qdisc *qdisc);
+void qdisc_put(struct Qdisc *qdisc);
void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, unsigned int n,
unsigned int len);
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 98541c6399db..836b32e6e8e8 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -921,7 +921,7 @@ static void notify_and_destroy(struct net *net, struct sk_buff *skb,
qdisc_notify(net, skb, n, clid, old, new);
if (old)
- qdisc_destroy(old);
+ qdisc_put(old);
}
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
@@ -974,7 +974,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
qdisc_refcount_inc(new);
if (!ingress)
- qdisc_destroy(old);
+ qdisc_put(old);
}
skip:
@@ -1568,7 +1568,7 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
if (err) {
if (q)
- qdisc_destroy(q);
+ qdisc_put(q);
return err;
}
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index cd49afca9617..d714d3747bcb 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -150,7 +150,7 @@ static void atm_tc_put(struct Qdisc *sch, unsigned long cl)
pr_debug("atm_tc_put: destroying\n");
list_del_init(&flow->list);
pr_debug("atm_tc_put: qdisc %p\n", flow->q);
- qdisc_destroy(flow->q);
+ qdisc_put(flow->q);
tcf_block_put(flow->block);
if (flow->sock) {
pr_debug("atm_tc_put: f_count %ld\n",
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index f42025d53cfe..4dc05409e3fb 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1418,7 +1418,7 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
WARN_ON(cl->filters);
tcf_block_put(cl->block);
- qdisc_destroy(cl->q);
+ qdisc_put(cl->q);
qdisc_put_rtab(cl->R_tab);
gen_kill_estimator(&cl->rate_est);
if (cl != &q->link)
diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
index e26a24017faa..e689e11b6d0f 100644
--- a/net/sched/sch_cbs.c
+++ b/net/sched/sch_cbs.c
@@ -379,7 +379,7 @@ static void cbs_destroy(struct Qdisc *sch)
cbs_disable_offload(dev, q);
if (q->qdisc)
- qdisc_destroy(q->qdisc);
+ qdisc_put(q->qdisc);
}
static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb)
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index e0b0cf8a9939..cdebaed0f8cf 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -134,7 +134,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
tca[TCA_RATE]);
if (err) {
NL_SET_ERR_MSG(extack, "Failed to replace estimator");
- qdisc_destroy(cl->qdisc);
+ qdisc_put(cl->qdisc);
kfree(cl);
return err;
}
@@ -153,7 +153,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl)
{
gen_kill_estimator(&cl->rate_est);
- qdisc_destroy(cl->qdisc);
+ qdisc_put(cl->qdisc);
kfree(cl);
}
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 049714c57075..f6f480784bc6 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -412,7 +412,7 @@ static void dsmark_destroy(struct Qdisc *sch)
pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
tcf_block_put(p->block);
- qdisc_destroy(p->q);
+ qdisc_put(p->q);
if (p->mv != p->embedded)
kfree(p->mv);
}
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index 24893d3b5d22..3809c9bf8896 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -177,7 +177,7 @@ struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops,
if (q) {
err = fifo_set_limit(q, limit);
if (err < 0) {
- qdisc_destroy(q);
+ qdisc_put(q);
q = NULL;
}
}
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 69078c82963e..bb778485ed88 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -901,7 +901,7 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
if (!ops->init || ops->init(sch, NULL, extack) == 0)
return sch;
- qdisc_destroy(sch);
+ qdisc_put(sch);
return NULL;
}
EXPORT_SYMBOL(qdisc_create_dflt);
@@ -941,15 +941,11 @@ void qdisc_free(struct Qdisc *qdisc)
kfree((char *) qdisc - qdisc->padded);
}
-void qdisc_destroy(struct Qdisc *qdisc)
+static void qdisc_destroy(struct Qdisc *qdisc)
{
const struct Qdisc_ops *ops = qdisc->ops;
struct sk_buff *skb, *tmp;
- if (qdisc->flags & TCQ_F_BUILTIN ||
- !refcount_dec_and_test(&qdisc->refcnt))
- return;
-
#ifdef CONFIG_NET_SCHED
qdisc_hash_del(qdisc);
@@ -976,7 +972,16 @@ void qdisc_destroy(struct Qdisc *qdisc)
qdisc_free(qdisc);
}
-EXPORT_SYMBOL(qdisc_destroy);
+
+void qdisc_put(struct Qdisc *qdisc)
+{
+ if (qdisc->flags & TCQ_F_BUILTIN ||
+ !refcount_dec_and_test(&qdisc->refcnt))
+ return;
+
+ qdisc_destroy(qdisc);
+}
+EXPORT_SYMBOL(qdisc_put);
/* Attach toplevel qdisc to device queue. */
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
@@ -1270,7 +1275,7 @@ static void shutdown_scheduler_queue(struct net_device *dev,
rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
dev_queue->qdisc_sleeping = qdisc_default;
- qdisc_destroy(qdisc);
+ qdisc_put(qdisc);
}
}
@@ -1279,7 +1284,7 @@ void dev_shutdown(struct net_device *dev)
netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
if (dev_ingress_queue(dev))
shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
- qdisc_destroy(dev->qdisc);
+ qdisc_put(dev->qdisc);
dev->qdisc = &noop_qdisc;
WARN_ON(timer_pending(&dev->watchdog_timer));
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 3278a76f6861..b18ec1f6de60 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1092,7 +1092,7 @@ hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
struct hfsc_sched *q = qdisc_priv(sch);
tcf_block_put(cl->block);
- qdisc_destroy(cl->qdisc);
+ qdisc_put(cl->qdisc);
gen_kill_estimator(&cl->rate_est);
if (cl != &q->root)
kfree(cl);
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 43c4bfe625a9..862a33b9e2e0 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1224,7 +1224,7 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
{
if (!cl->level) {
WARN_ON(!cl->un.leaf.q);
- qdisc_destroy(cl->un.leaf.q);
+ qdisc_put(cl->un.leaf.q);
}
gen_kill_estimator(&cl->rate_est);
tcf_block_put(cl->block);
@@ -1425,7 +1425,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
/* turn parent into inner node */
qdisc_reset(parent->un.leaf.q);
qdisc_tree_reduce_backlog(parent->un.leaf.q, qlen, backlog);
- qdisc_destroy(parent->un.leaf.q);
+ qdisc_put(parent->un.leaf.q);
if (parent->prio_activity)
htb_deactivate(q, parent);
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index d6b8ae4ed7a3..f20f3a0f8424 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -65,7 +65,7 @@ static void mq_destroy(struct Qdisc *sch)
if (!priv->qdiscs)
return;
for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++)
- qdisc_destroy(priv->qdiscs[ntx]);
+ qdisc_put(priv->qdiscs[ntx]);
kfree(priv->qdiscs);
}
@@ -119,7 +119,7 @@ static void mq_attach(struct Qdisc *sch)
qdisc = priv->qdiscs[ntx];
old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
if (old)
- qdisc_destroy(old);
+ qdisc_put(old);
#ifdef CONFIG_NET_SCHED
if (ntx < dev->real_num_tx_queues)
qdisc_hash_add(qdisc, false);
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 0e9d761cdd80..d364e63c396d 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -40,7 +40,7 @@ static void mqprio_destroy(struct Qdisc *sch)
for (ntx = 0;
ntx < dev->num_tx_queues && priv->qdiscs[ntx];
ntx++)
- qdisc_destroy(priv->qdiscs[ntx]);
+ qdisc_put(priv->qdiscs[ntx]);
kfree(priv->qdiscs);
}
@@ -300,7 +300,7 @@ static void mqprio_attach(struct Qdisc *sch)
qdisc = priv->qdiscs[ntx];
old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
if (old)
- qdisc_destroy(old);
+ qdisc_put(old);
if (ntx < dev->real_num_tx_queues)
qdisc_hash_add(qdisc, false);
}
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 1da7ea8de0ad..7410ce4d0321 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -175,7 +175,7 @@ multiq_destroy(struct Qdisc *sch)
tcf_block_put(q->block);
for (band = 0; band < q->bands; band++)
- qdisc_destroy(q->queues[band]);
+ qdisc_put(q->queues[band]);
kfree(q->queues);
}
@@ -204,7 +204,7 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt,
q->queues[i] = &noop_qdisc;
qdisc_tree_reduce_backlog(child, child->q.qlen,
child->qstats.backlog);
- qdisc_destroy(child);
+ qdisc_put(child);
}
}
@@ -228,7 +228,7 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt,
qdisc_tree_reduce_backlog(old,
old->q.qlen,
old->qstats.backlog);
- qdisc_destroy(old);
+ qdisc_put(old);
}
sch_tree_unlock(sch);
}
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index ad18a2052416..553c9f602336 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -1032,7 +1032,7 @@ static void netem_destroy(struct Qdisc *sch)
qdisc_watchdog_cancel(&q->watchdog);
if (q->qdisc)
- qdisc_destroy(q->qdisc);
+ qdisc_put(q->qdisc);
dist_free(q->delay_dist);
dist_free(q->slot_dist);
}
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 222e53d3d27a..f8af98621179 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -175,7 +175,7 @@ prio_destroy(struct Qdisc *sch)
tcf_block_put(q->block);
prio_offload(sch, NULL);
for (prio = 0; prio < q->bands; prio++)
- qdisc_destroy(q->queues[prio]);
+ qdisc_put(q->queues[prio]);
}
static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
@@ -205,7 +205,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
extack);
if (!queues[i]) {
while (i > oldbands)
- qdisc_destroy(queues[--i]);
+ qdisc_put(queues[--i]);
return -ENOMEM;
}
}
@@ -220,7 +220,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
qdisc_tree_reduce_backlog(child, child->q.qlen,
child->qstats.backlog);
- qdisc_destroy(child);
+ qdisc_put(child);
}
for (i = oldbands; i < q->bands; i++) {
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index bb1a9c11fc54..dc37c4ead439 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -526,7 +526,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
return 0;
destroy_class:
- qdisc_destroy(cl->qdisc);
+ qdisc_put(cl->qdisc);
kfree(cl);
return err;
}
@@ -537,7 +537,7 @@ static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
qfq_rm_from_agg(q, cl);
gen_kill_estimator(&cl->rate_est);
- qdisc_destroy(cl->qdisc);
+ qdisc_put(cl->qdisc);
kfree(cl);
}
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 56c181c3feeb..3ce6c0a2c493 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -181,7 +181,7 @@ static void red_destroy(struct Qdisc *sch)
del_timer_sync(&q->adapt_timer);
red_offload(sch, false);
- qdisc_destroy(q->qdisc);
+ qdisc_put(q->qdisc);
}
static const struct nla_policy red_policy[TCA_RED_MAX + 1] = {
@@ -233,7 +233,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt,
if (child) {
qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
q->qdisc->qstats.backlog);
- qdisc_destroy(q->qdisc);
+ qdisc_put(q->qdisc);
q->qdisc = child;
}
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 7cbdad8419b7..bab506b01a32 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -469,7 +469,7 @@ static void sfb_destroy(struct Qdisc *sch)
struct sfb_sched_data *q = qdisc_priv(sch);
tcf_block_put(q->block);
- qdisc_destroy(q->qdisc);
+ qdisc_put(q->qdisc);
}
static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = {
@@ -523,7 +523,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt,
qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
q->qdisc->qstats.backlog);
- qdisc_destroy(q->qdisc);
+ qdisc_put(q->qdisc);
q->qdisc = child;
q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval);
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 6f74a426f159..dd29de1418b7 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -392,7 +392,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
if (child) {
qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
q->qdisc->qstats.backlog);
- qdisc_destroy(q->qdisc);
+ qdisc_put(q->qdisc);
q->qdisc = child;
}
q->limit = qopt->limit;
@@ -438,7 +438,7 @@ static void tbf_destroy(struct Qdisc *sch)
struct tbf_sched_data *q = qdisc_priv(sch);
qdisc_watchdog_cancel(&q->watchdog);
- qdisc_destroy(q->qdisc);
+ qdisc_put(q->qdisc);
}
static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
--
2.7.5
^ permalink raw reply related
* KASAN: use-after-free Read in bpf_tcp_close (2)
From: syzbot @ 2018-09-06 8:22 UTC (permalink / raw)
To: ast, daniel, linux-kernel, netdev, syzkaller-bugs
Hello,
syzbot found the following crash on:
HEAD commit: 11f026b4e306 libbpf: Remove the duplicate checking of func..
git tree: bpf-next
console output: https://syzkaller.appspot.com/x/log.txt?x=153c55ca400000
kernel config: https://syzkaller.appspot.com/x/.config?x=4c7e83258d6e0156
dashboard link: https://syzkaller.appspot.com/bug?extid=579adfa56843da894bc5
compiler: gcc (GCC) 8.0.1 20180413 (experimental)
Unfortunately, I don't have any reproducer for this crash yet.
IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+579adfa56843da894bc5@syzkaller.appspotmail.com
==================================================================
BUG: KASAN: use-after-free in atomic_read
include/asm-generic/atomic-instrumented.h:21 [inline]
BUG: KASAN: use-after-free in virt_spin_lock
arch/x86/include/asm/qspinlock.h:65 [inline]
BUG: KASAN: use-after-free in native_queued_spin_lock_slowpath+0x189/0x1220
kernel/locking/qspinlock.c:305
Read of size 4 at addr ffff8801bfd651a0 by task syz-executor1/9753
CPU: 1 PID: 9753 Comm: syz-executor1 Not tainted 4.19.0-rc2+ #89
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113
print_address_description+0x6c/0x20b mm/kasan/report.c:256
kasan_report_error mm/kasan/report.c:354 [inline]
kasan_report.cold.7+0x242/0x30d mm/kasan/report.c:412
check_memory_region_inline mm/kasan/kasan.c:260 [inline]
check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267
kasan_check_read+0x11/0x20 mm/kasan/kasan.c:272
atomic_read include/asm-generic/atomic-instrumented.h:21 [inline]
virt_spin_lock arch/x86/include/asm/qspinlock.h:65 [inline]
native_queued_spin_lock_slowpath+0x189/0x1220
kernel/locking/qspinlock.c:305
pv_queued_spin_lock_slowpath arch/x86/include/asm/paravirt.h:679 [inline]
queued_spin_lock_slowpath arch/x86/include/asm/qspinlock.h:32 [inline]
queued_spin_lock include/asm-generic/qspinlock.h:88 [inline]
do_raw_spin_lock+0x1a7/0x200 kernel/locking/spinlock_debug.c:113
__raw_spin_lock_bh include/linux/spinlock_api_smp.h:136 [inline]
_raw_spin_lock_bh+0x39/0x40 kernel/locking/spinlock.c:168
bpf_tcp_close+0x68e/0x10d0 kernel/bpf/sockmap.c:349
inet_release+0x104/0x1f0 net/ipv4/af_inet.c:428
inet6_release+0x50/0x70 net/ipv6/af_inet6.c:457
__sock_release+0xd7/0x250 net/socket.c:579
sock_close+0x19/0x20 net/socket.c:1139
__fput+0x38a/0xa40 fs/file_table.c:278
____fput+0x15/0x20 fs/file_table.c:309
task_work_run+0x1e8/0x2a0 kernel/task_work.c:113
tracehook_notify_resume include/linux/tracehook.h:193 [inline]
exit_to_usermode_loop+0x318/0x380 arch/x86/entry/common.c:166
prepare_exit_to_usermode arch/x86/entry/common.c:197 [inline]
syscall_return_slowpath arch/x86/entry/common.c:268 [inline]
do_syscall_64+0x6be/0x820 arch/x86/entry/common.c:293
entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x410c51
Code: 75 14 b8 03 00 00 00 0f 05 48 3d 01 f0 ff ff 0f 83 34 19 00 00 c3 48
83 ec 08 e8 0a fc ff ff 48 89 04 24 b8 03 00 00 00 0f 05 <48> 8b 3c 24 48
89 c2 e8 53 fc ff ff 48 89 d0 48 83 c4 08 48 3d 01
RSP: 002b:00007ffeacb37d60 EFLAGS: 00000293 ORIG_RAX: 0000000000000003
RAX: 0000000000000000 RBX: 0000000000000008 RCX: 0000000000410c51
RDX: 0000000000000000 RSI: 0000000000731f70 RDI: 0000000000000007
RBP: 0000000000000000 R08: ffffffffffffffff R09: ffffffffffffffff
R10: 00007ffeacb37c90 R11: 0000000000000293 R12: 0000000000000010
R13: 0000000000022aa9 R14: 000000000000004d R15: badc0ffeebadface
Allocated by task 9754:
save_stack+0x43/0xd0 mm/kasan/kasan.c:448
set_track mm/kasan/kasan.c:460 [inline]
kasan_kmalloc+0xc4/0xe0 mm/kasan/kasan.c:553
kmem_cache_alloc_trace+0x152/0x730 mm/slab.c:3620
kmalloc include/linux/slab.h:513 [inline]
kzalloc include/linux/slab.h:707 [inline]
sock_map_alloc+0x209/0x430 kernel/bpf/sockmap.c:1653
find_and_alloc_map kernel/bpf/syscall.c:129 [inline]
map_create+0x3bd/0x1100 kernel/bpf/syscall.c:509
__do_sys_bpf kernel/bpf/syscall.c:2356 [inline]
__se_sys_bpf kernel/bpf/syscall.c:2333 [inline]
__x64_sys_bpf+0x303/0x510 kernel/bpf/syscall.c:2333
do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
entry_SYSCALL_64_after_hwframe+0x49/0xbe
Freed by task 13:
save_stack+0x43/0xd0 mm/kasan/kasan.c:448
set_track mm/kasan/kasan.c:460 [inline]
__kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:521
kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
__cache_free mm/slab.c:3498 [inline]
kfree+0xd9/0x210 mm/slab.c:3813
sock_map_remove_complete kernel/bpf/sockmap.c:1561 [inline]
sock_map_free+0x428/0x570 kernel/bpf/sockmap.c:1756
bpf_map_free_deferred+0xba/0xf0 kernel/bpf/syscall.c:290
process_one_work+0xc73/0x1aa0 kernel/workqueue.c:2153
worker_thread+0x189/0x13c0 kernel/workqueue.c:2296
kthread+0x35a/0x420 kernel/kthread.c:246
ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:413
The buggy address belongs to the object at ffff8801bfd65080
which belongs to the cache kmalloc-512 of size 512
The buggy address is located 288 bytes inside of
512-byte region [ffff8801bfd65080, ffff8801bfd65280)
The buggy address belongs to the page:
page:ffffea0006ff5940 count:1 mapcount:0 mapping:ffff8801dac00940
index:0xffff8801bfd65300
flags: 0x2fffc0000000100(slab)
raw: 02fffc0000000100 ffffea000702a488 ffffea00075c4148 ffff8801dac00940
raw: ffff8801bfd65300 ffff8801bfd65080 0000000100000001 0000000000000000
page dumped because: kasan: bad access detected
Memory state around the buggy address:
ffff8801bfd65080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff8801bfd65100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> ffff8801bfd65180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
^
ffff8801bfd65200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff8801bfd65280: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
==================================================================
---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.
syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#bug-status-tracking for how to communicate with
syzbot.
^ permalink raw reply
* [PATCH net-next] failover: Fix error return code in net_failover_create
From: YueHaibing @ 2018-09-06 13:04 UTC (permalink / raw)
To: davem, alexander.h.duyck, jeffrey.t.kirsher, liran.alon,
sridhar.samudrala, stephen, dan.carpenter
Cc: linux-kernel, netdev, YueHaibing
if failover_register failed, 'err' code should be set correctly
Fixes: cfc80d9a1163 ("net: Introduce net_failover driver")
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
---
drivers/net/net_failover.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c
index 192ae1c..cbcb9f2 100644
--- a/drivers/net/net_failover.c
+++ b/drivers/net/net_failover.c
@@ -761,8 +761,10 @@ struct failover *net_failover_create(struct net_device *standby_dev)
netif_carrier_off(failover_dev);
failover = failover_register(failover_dev, &net_failover_ops);
- if (IS_ERR(failover))
+ if (IS_ERR(failover)) {
+ err = PTR_ERR(failover);
goto err_failover_register;
+ }
return failover;
--
2.7.0
^ permalink raw reply related
* Re: [PATCH net-next 03/13] net: sched: extend Qdisc with rcu
From: Eric Dumazet @ 2018-09-06 8:30 UTC (permalink / raw)
To: Vlad Buslov, netdev
Cc: jhs, xiyou.wangcong, jiri, davem, stephen, ktkhai, paulmck,
nicolas.dichtel, leon, gregkh, mark.rutland, fw, dsahern,
lucien.xin, jakub.kicinski, christian.brauner, jbenc
In-Reply-To: <1536220742-25650-4-git-send-email-vladbu@mellanox.com>
On 09/06/2018 12:58 AM, Vlad Buslov wrote:
...
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index 18e22a5a6550..239c73f29471 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -90,6 +90,7 @@ struct Qdisc {
> struct gnet_stats_queue __percpu *cpu_qstats;
> int padded;
> refcount_t refcnt;
> + struct rcu_head rcu;
>
> /*
> * For performance sake on SMP, we put highly modified fields at the end
Probably better to move this at the end of struct Qdisc,
not risking unexpected performance regressions in fast path.
^ permalink raw reply
* Re: [PATCH net-next 03/13] net: sched: extend Qdisc with rcu
From: Kirill Tkhai @ 2018-09-06 8:39 UTC (permalink / raw)
To: Eric Dumazet, Vlad Buslov, netdev
Cc: jhs, xiyou.wangcong, jiri, davem, stephen, paulmck,
nicolas.dichtel, leon, gregkh, mark.rutland, fw, dsahern,
lucien.xin, jakub.kicinski, christian.brauner, jbenc
In-Reply-To: <a5b99bb3-9076-ba0f-5b8b-efe8529e93fa@gmail.com>
On 06.09.2018 11:30, Eric Dumazet wrote:
>
>
> On 09/06/2018 12:58 AM, Vlad Buslov wrote:
>
> ...
>
>> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
>> index 18e22a5a6550..239c73f29471 100644
>> --- a/include/net/sch_generic.h
>> +++ b/include/net/sch_generic.h
>> @@ -90,6 +90,7 @@ struct Qdisc {
>> struct gnet_stats_queue __percpu *cpu_qstats;
>> int padded;
>> refcount_t refcnt;
>> + struct rcu_head rcu;
>>
>> /*
>> * For performance sake on SMP, we put highly modified fields at the end
>
> Probably better to move this at the end of struct Qdisc,
> not risking unexpected performance regressions in fast path.
Do you mean regressions on UP? On SMP it looks like this field
fits in the unused gap created by:
struct sk_buff_head gso_skb ____cacheline_aligned_in_smp;
Kirill
^ permalink raw reply
* Re: [PATCH v2 01/17] asm: simd context helper API
From: Thomas Gleixner @ 2018-09-06 13:42 UTC (permalink / raw)
To: Jason A. Donenfeld
Cc: Andrew Lutomirski, LKML, Netdev, David Miller, Greg Kroah-Hartman,
Samuel Neves, linux-arch, Rik van Riel
In-Reply-To: <CAHmME9rL2omQxqMVCcX-jPMpcQDk8ftEn3YXWcFhS=AkVK8ZXw@mail.gmail.com>
On Sat, 1 Sep 2018, Jason A. Donenfeld wrote:
> On Sat, Sep 1, 2018 at 2:32 PM Andy Lutomirski <luto@kernel.org> wrote:
> > I tend to think the right approach is to merge Jason's code and then
> > make it better later. Even with a totally perfect lazy FPU restore
> > implementation on x86, we'll probably still need some way of dealing
> > with SIMD contexts. I think we're highly unlikely to ever a allow
> > SIMD usage in all NMI contexts, for example, and there will always be
> > cases where we specifically don't want to use all available SIMD
> > capabilities even if we can. For example, generating random numbers
> > does crypto, but we probably don't want to do *SIMD* crypto, since
> > that will force a save and restore and will probably fire up the
> > AVX512 unit, and that's not worth it unless we're already using it for
> > some other reason.
> >
> > Also, as Rik has discovered, lazy FPU restore is conceptually
> > straightforward but isn't entirely trivial :)
>
> Sounds good. I'll move ahead on this basis.
Fine with me.
^ permalink raw reply
* [PATCH net-next v3 1/2] netlink: ipv4 igmp join notifications
From: Patrick Ruddy @ 2018-09-06 9:10 UTC (permalink / raw)
To: netdev; +Cc: roopa, jiri, stephen
In-Reply-To: <20180830093545.29465-2-pruddy@vyatta.att-mail.com>
Some userspace applications need to know about IGMP joins from the
kernel for 2 reasons:
1. To allow the programming of multicast MAC filters in hardware
2. To form a multicast FORUS list for non link-local multicast
groups to be sent to the kernel and from there to the interested
party.
(1) can be fulfilled but simply sending the hardware multicast MAC
address to be programmed but (2) requires the L3 address to be sent
since this cannot be constructed from the MAC address whereas the
reverse translation is a standard library function.
This commit provides addition and deletion of multicast addresses
using the RTM_NEWMDB and RTM_DELMDB messages with AF_INET. It also
provides the RTM_GETMDB extension to allow multicast join state to
be read from the kernel.
Signed-off-by: Patrick Ruddy <pruddy@vyatta.att-mail.com>
---
v3 rework to use RTM_***MDB messages as per review comments.
net/ipv4/igmp.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 139 insertions(+)
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 4da39446da2d..aed819e2ea93 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -86,6 +86,7 @@
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/if_arp.h>
+#include <net/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/times.h>
#include <linux/pkt_sched.h>
@@ -1385,6 +1386,91 @@ static void ip_mc_hash_remove(struct in_device *in_dev,
}
+static int fill_addr(struct sk_buff *skb, struct net_device *dev, __be32 addr,
+ int type, unsigned int flags)
+{
+ struct nlmsghdr *nlh;
+ struct ifaddrmsg *ifm;
+
+ nlh = nlmsg_put(skb, 0, 0, type, sizeof(*ifm), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ifm = nlmsg_data(nlh);
+ ifm->ifa_family = AF_INET;
+ ifm->ifa_prefixlen = 32;
+ ifm->ifa_flags = IFA_F_PERMANENT;
+ ifm->ifa_scope = RT_SCOPE_LINK;
+ ifm->ifa_index = dev->ifindex;
+
+ if (nla_put_in_addr(skb, IFA_ADDRESS, addr))
+ goto nla_put_failure;
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static inline size_t addr_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ifaddrmsg))
+ + nla_total_size(sizeof(__be32));
+}
+
+static void ip_mc_addr_notify(struct net_device *dev, __be32 addr, int type)
+{
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(addr_nlmsg_size(), GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ err = fill_addr(skb, dev, addr, type, 0);
+ if (err < 0) {
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, 0, RTNLGRP_MDB, NULL, GFP_ATOMIC);
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_MDB, err);
+}
+
+int ip_mc_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb,
+ struct net_device *dev)
+{
+ int s_idx;
+ int idx = 0;
+ struct ip_mc_list *im;
+ struct in_device *in_dev;
+
+ ASSERT_RTNL();
+
+ s_idx = cb->args[2];
+ in_dev = __in_dev_get_rtnl(dev);
+
+ for_each_pmc_rtnl(in_dev, im) {
+ if (idx < s_idx)
+ continue;
+ if (fill_addr(skb, dev, im->multiaddr, RTM_NEWMDB,
+ NLM_F_MULTI) < 0)
+ goto done;
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+ idx++;
+ }
+
+ done:
+ cb->args[2] = idx;
+
+ return skb->len;
+}
+
/*
* A socket has joined a multicast group on device dev.
*/
@@ -1430,6 +1516,8 @@ static void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
igmpv3_del_delrec(in_dev, im);
#endif
igmp_group_added(im);
+
+ ip_mc_addr_notify(in_dev->dev, addr, RTM_NEWMDB);
if (!in_dev->dead)
ip_rt_multicast_event(in_dev);
out:
@@ -1661,6 +1749,8 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
in_dev->mc_count--;
igmp_group_dropped(i);
ip_mc_clear_src(i);
+ ip_mc_addr_notify(in_dev->dev, addr,
+ RTM_DELMDB);
if (!in_dev->dead)
ip_rt_multicast_event(in_dev);
@@ -3051,6 +3141,53 @@ static struct notifier_block igmp_notifier = {
.notifier_call = igmp_netdev_event,
};
+static int igmp_mc_dump_ifaddrs(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ int h, s_h;
+ int idx, s_idx;
+ struct net_device *dev;
+ struct in_device *in_dev;
+ struct hlist_head *head;
+
+ s_h = cb->args[0];
+ idx = cb->args[1];
+ s_idx = idx;
+
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ rcu_read_lock();
+ cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^
+ net->dev_base_seq;
+ hlist_for_each_entry_rcu(dev, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ if (h > s_h || idx > s_idx)
+ cb->args[2] = 0;
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ goto cont;
+
+ /* loop over multicast addresses */
+ if (ip_mc_dump_ifaddr(skb, cb, dev) < 0) {
+ rcu_read_unlock();
+ goto done;
+ }
+cont:
+ idx++;
+ }
+ rcu_read_unlock();
+ }
+
+done:
+ cb->args[0] = h;
+ cb->args[1] = idx;
+
+ return skb->len;
+}
+
int __init igmp_mc_init(void)
{
#if defined(CONFIG_PROC_FS)
@@ -3064,6 +3201,8 @@ int __init igmp_mc_init(void)
goto reg_notif_fail;
return 0;
+ rtnl_register(PF_INET, RTM_GETMDB, NULL, igmp_mc_dump_ifaddrs, 0);
+
reg_notif_fail:
unregister_pernet_subsys(&igmp_net_ops);
return err;
--
2.17.1
^ permalink raw reply related
* [PATCH net-next v3 2/2] netlink: ipv6 MLD join notifications
From: Patrick Ruddy @ 2018-09-06 9:10 UTC (permalink / raw)
To: netdev; +Cc: roopa, jiri, stephen
In-Reply-To: <20180906091056.21109-1-pruddy@vyatta.att-mail.com>
Some userspace applications need to know about MLD joins from the
kernel for 2 reasons:
1. To allow the programming of multicast MAC filters in hardware
2. To form a multicast FORUS list for non link-local multicast
groups to be sent to the kernel and from there to the interested
party.
(1) can be fulfilled but simply sending the hardware multicast MAC
address to be programmed but (2) requires the L3 address to be sent
since this cannot be constructed from the MAC address whereas the
reverse translation is a standard library function.
This commit provides addition and deletion of multicast addresses
using the RTM_NEWMDB and RTM_DELMDB messages with AF_INET6. It also
provides the RTM_GETMDB extension to allow multicast join state to
be read from the kernel.
Signed-off-by: Patrick Ruddy <pruddy@vyatta.att-mail.com>
---
v3 rework to use RTM_***MDB messages as per review comments.
net/ipv6/addrconf.c | 34 ++++++++++++++++-------
net/ipv6/mcast.c | 66 +++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 90 insertions(+), 10 deletions(-)
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index d51a8c0b3372..d23955c21650 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4860,6 +4860,8 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
struct nlmsghdr *nlh;
u8 scope = RT_SCOPE_UNIVERSE;
int ifindex = ifmca->idev->dev->ifindex;
+ int addr_type = (event == RTM_GETMULTICAST) ? IFA_MULTICAST :
+ IFA_ADDRESS;
if (ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE)
scope = RT_SCOPE_SITE;
@@ -4869,7 +4871,7 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
return -EMSGSIZE;
put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex);
- if (nla_put_in6_addr(skb, IFA_MULTICAST, &ifmca->mca_addr) < 0 ||
+ if (nla_put_in6_addr(skb, addr_type, &ifmca->mca_addr) < 0 ||
put_cacheinfo(skb, ifmca->mca_cstamp, ifmca->mca_tstamp,
INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0) {
nlmsg_cancel(skb, nlh);
@@ -4916,7 +4918,7 @@ enum addr_type_t {
/* called with rcu_read_lock() */
static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb,
struct netlink_callback *cb, enum addr_type_t type,
- int s_ip_idx, int *p_ip_idx)
+ int s_ip_idx, int *p_ip_idx, int msg_type)
{
struct ifmcaddr6 *ifmca;
struct ifacaddr6 *ifaca;
@@ -4935,7 +4937,7 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb,
err = inet6_fill_ifaddr(skb, ifa,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
- RTM_NEWADDR,
+ msg_type,
NLM_F_MULTI);
if (err < 0)
break;
@@ -4952,7 +4954,7 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb,
err = inet6_fill_ifmcaddr(skb, ifmca,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
- RTM_GETMULTICAST,
+ msg_type,
NLM_F_MULTI);
if (err < 0)
break;
@@ -4967,7 +4969,7 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb,
err = inet6_fill_ifacaddr(skb, ifaca,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
- RTM_GETANYCAST,
+ msg_type,
NLM_F_MULTI);
if (err < 0)
break;
@@ -4982,7 +4984,7 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb,
}
static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
- enum addr_type_t type)
+ enum addr_type_t type, int msg_type)
{
struct net *net = sock_net(skb->sk);
int h, s_h;
@@ -5012,7 +5014,7 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
goto cont;
if (in6_dump_addrs(idev, skb, cb, type,
- s_ip_idx, &ip_idx) < 0)
+ s_ip_idx, &ip_idx, msg_type) < 0)
goto done;
cont:
idx++;
@@ -5031,14 +5033,22 @@ static int inet6_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
{
enum addr_type_t type = UNICAST_ADDR;
- return inet6_dump_addr(skb, cb, type);
+ return inet6_dump_addr(skb, cb, type, RTM_NEWADDR);
}
static int inet6_dump_ifmcaddr(struct sk_buff *skb, struct netlink_callback *cb)
{
enum addr_type_t type = MULTICAST_ADDR;
- return inet6_dump_addr(skb, cb, type);
+ return inet6_dump_addr(skb, cb, type, RTM_GETMULTICAST);
+}
+
+static int inet6_mdb_dump_ifmcaddr(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ enum addr_type_t type = MULTICAST_ADDR;
+
+ return inet6_dump_addr(skb, cb, type, RTM_NEWMDB);
}
@@ -5046,7 +5056,7 @@ static int inet6_dump_ifacaddr(struct sk_buff *skb, struct netlink_callback *cb)
{
enum addr_type_t type = ANYCAST_ADDR;
- return inet6_dump_addr(skb, cb, type);
+ return inet6_dump_addr(skb, cb, type, RTM_GETANYCAST);
}
static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh,
@@ -6774,6 +6784,10 @@ int __init addrconf_init(void)
NULL, inet6_dump_ifmcaddr, 0);
if (err < 0)
goto errout;
+ err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETMDB,
+ NULL, inet6_mdb_dump_ifmcaddr, 0);
+ if (err < 0)
+ goto errout;
err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETANYCAST,
NULL, inet6_dump_ifacaddr, 0);
if (err < 0)
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 4ae54aaca373..5108dbf73516 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -880,6 +880,67 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev,
return mc;
}
+static int fill_addr(struct sk_buff *skb, struct net_device *dev,
+ const struct in6_addr *addr, int type, unsigned int flags)
+{
+ struct nlmsghdr *nlh;
+ struct ifaddrmsg *ifm;
+ u8 scope = RT_SCOPE_UNIVERSE;
+
+ if (ipv6_addr_scope(addr) & IFA_SITE)
+ scope = RT_SCOPE_SITE;
+
+ nlh = nlmsg_put(skb, 0, 0, type, sizeof(*ifm), flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ifm = nlmsg_data(nlh);
+ ifm->ifa_family = AF_INET6;
+ ifm->ifa_prefixlen = 128;
+ ifm->ifa_flags = IFA_F_PERMANENT;
+ ifm->ifa_scope = scope;
+ ifm->ifa_index = dev->ifindex;
+
+ if (nla_put_in6_addr(skb, IFA_ADDRESS, addr))
+ goto nla_put_failure;
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static inline size_t addr_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ifaddrmsg))
+ + nla_total_size(sizeof(struct in6_addr));
+}
+
+static void ipv6_mc_addr_notify(struct net_device *dev,
+ const struct in6_addr *addr, int type)
+{
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(addr_nlmsg_size(), GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ err = fill_addr(skb, dev, addr, type, 0);
+ if (err < 0) {
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+ rtnl_notify(skb, net, 0, RTNLGRP_MDB, NULL, GFP_ATOMIC);
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_MDB, err);
+}
+
/*
* device multicast group inc (add if not found)
*/
@@ -932,6 +993,9 @@ static int __ipv6_dev_mc_inc(struct net_device *dev,
mld_del_delrec(idev, mc);
igmp6_group_added(mc);
+
+ ipv6_mc_addr_notify(dev, addr, RTM_NEWMDB);
+
ma_put(mc);
return 0;
}
@@ -960,6 +1024,8 @@ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr)
igmp6_group_dropped(ma);
ip6_mc_clear_src(ma);
+ ipv6_mc_addr_notify(idev->dev, addr,
+ RTM_DELMDB);
ma_put(ma);
return 0;
}
--
2.17.1
^ permalink raw reply related
* Re: [PATCH net-next 03/13] net: sched: extend Qdisc with rcu
From: Eric Dumazet @ 2018-09-06 9:21 UTC (permalink / raw)
To: Kirill Tkhai, Eric Dumazet, Vlad Buslov, netdev
Cc: jhs, xiyou.wangcong, jiri, davem, stephen, paulmck,
nicolas.dichtel, leon, gregkh, mark.rutland, fw, dsahern,
lucien.xin, jakub.kicinski, christian.brauner, jbenc
In-Reply-To: <4a230727-1072-3bc9-1590-2e59c2162460@virtuozzo.com>
On 09/06/2018 01:39 AM, Kirill Tkhai wrote:
> On 06.09.2018 11:30, Eric Dumazet wrote:
>>
>>
>> On 09/06/2018 12:58 AM, Vlad Buslov wrote:
>>
>> ...
>>
>>> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
>>> index 18e22a5a6550..239c73f29471 100644
>>> --- a/include/net/sch_generic.h
>>> +++ b/include/net/sch_generic.h
>>> @@ -90,6 +90,7 @@ struct Qdisc {
>>> struct gnet_stats_queue __percpu *cpu_qstats;
>>> int padded;
>>> refcount_t refcnt;
>>> + struct rcu_head rcu;
>>>
>>> /*
>>> * For performance sake on SMP, we put highly modified fields at the end
>>
>> Probably better to move this at the end of struct Qdisc,
>> not risking unexpected performance regressions in fast path.
>
> Do you mean regressions on UP? On SMP it looks like this field
> fits in the unused gap created by:
>
> struct sk_buff_head gso_skb ____cacheline_aligned_in_smp;
>
> Kirill
Oh right, we might have holes there, and in the last cache line, so it does not
matter for now.
^ permalink raw reply
* Re: [PATCH net-next 03/13] net: sched: extend Qdisc with rcu
From: Vlad Buslov @ 2018-09-06 9:23 UTC (permalink / raw)
To: Kirill Tkhai
Cc: Eric Dumazet, netdev, jhs, xiyou.wangcong, jiri, davem, stephen,
paulmck, nicolas.dichtel, leon, gregkh, mark.rutland, fw, dsahern,
lucien.xin, jakub.kicinski, christian.brauner, jbenc
In-Reply-To: <4a230727-1072-3bc9-1590-2e59c2162460@virtuozzo.com>
On Thu 06 Sep 2018 at 08:39, Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
> On 06.09.2018 11:30, Eric Dumazet wrote:
>>
>>
>> On 09/06/2018 12:58 AM, Vlad Buslov wrote:
>>
>> ...
>>
>>> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
>>> index 18e22a5a6550..239c73f29471 100644
>>> --- a/include/net/sch_generic.h
>>> +++ b/include/net/sch_generic.h
>>> @@ -90,6 +90,7 @@ struct Qdisc {
>>> struct gnet_stats_queue __percpu *cpu_qstats;
>>> int padded;
>>> refcount_t refcnt;
>>> + struct rcu_head rcu;
>>>
>>> /*
>>> * For performance sake on SMP, we put highly modified fields at the end
>>
>> Probably better to move this at the end of struct Qdisc,
>> not risking unexpected performance regressions in fast path.
>
> Do you mean regressions on UP? On SMP it looks like this field
> fits in the unused gap created by:
>
> struct sk_buff_head gso_skb ____cacheline_aligned_in_smp;
>
> Kirill
Hi Eric, Kirill
I intentionally put rcu_head here in order for it not to be in same
cache line with "highly modified fields" (according to comment).
^ permalink raw reply
* Re: [PATCH net-next 03/13] net: sched: extend Qdisc with rcu
From: Eric Dumazet @ 2018-09-06 9:29 UTC (permalink / raw)
To: Vlad Buslov, Kirill Tkhai
Cc: netdev, jhs, xiyou.wangcong, jiri, davem, stephen, paulmck,
nicolas.dichtel, leon, gregkh, mark.rutland, fw, dsahern,
lucien.xin, jakub.kicinski, christian.brauner, jbenc
In-Reply-To: <vbf1sa7x9m9.fsf@reg-r-vrt-018-180.mtr.labs.mlnx>
On 09/06/2018 02:23 AM, Vlad Buslov wrote:
>
> On Thu 06 Sep 2018 at 08:39, Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
>> On 06.09.2018 11:30, Eric Dumazet wrote:
>>>
>>>
>>> On 09/06/2018 12:58 AM, Vlad Buslov wrote:
>>>
>>> ...
>>>
>>>> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
>>>> index 18e22a5a6550..239c73f29471 100644
>>>> --- a/include/net/sch_generic.h
>>>> +++ b/include/net/sch_generic.h
>>>> @@ -90,6 +90,7 @@ struct Qdisc {
>>>> struct gnet_stats_queue __percpu *cpu_qstats;
>>>> int padded;
>>>> refcount_t refcnt;
>>>> + struct rcu_head rcu;
>>>>
>>>> /*
>>>> * For performance sake on SMP, we put highly modified fields at the end
>>>
>>> Probably better to move this at the end of struct Qdisc,
>>> not risking unexpected performance regressions in fast path.
>>
>> Do you mean regressions on UP? On SMP it looks like this field
>> fits in the unused gap created by:
>>
>> struct sk_buff_head gso_skb ____cacheline_aligned_in_smp;
>>
>> Kirill
>
> Hi Eric, Kirill
>
> I intentionally put rcu_head here in order for it not to be in same
> cache line with "highly modified fields" (according to comment).
My personal preference would have been to use the last cache line
for a new 'control path field', and leave holes in the first one
for future needs in fast path.
But this is a minor detail really, either choice is fine.
^ permalink raw reply
* Re: [PATCH RFC] net/mlx5_en: switch to Toeplitz RSS hash by default
From: Konstantin Khlebnikov @ 2018-09-06 10:04 UTC (permalink / raw)
To: Saeed Mahameed
Cc: Tariq Toukan, Linux Netdev List, Saeed Mahameed, Gal Pressman,
Or Gerlitz, David S. Miller
In-Reply-To: <CALzJLG8wEHMEg-ixYPyoOCOQMcDVq03eVPo-7TSk0dR7mHM=Bg@mail.gmail.com>
On 06.09.2018 08:24, Saeed Mahameed wrote:
> On Sun, Sep 2, 2018 at 2:55 AM, Konstantin Khlebnikov
> <khlebnikov@yandex-team.ru> wrote:
>> On 02.09.2018 12:29, Tariq Toukan wrote:
>>>
>>>
>>>
>>> On 31/08/2018 2:29 PM, Konstantin Khlebnikov wrote:
>>>>
>>>> XOR (MLX5_RX_HASH_FN_INVERTED_XOR8) gives only 8 bits.
>>>> It seems not enough for RFS. All other drivers use toeplitz.
>>>>
>>>> Driver mlx4_en uses Toeplitz by default and warns if hash XOR is used
>>>> together with NETIF_F_RXHASH (enabled by default too): "Enabling both
>>>> XOR Hash function and RX Hashing can limit RPS functionality".
>>>>
>>>> XOR is default in mlx5_en since commit 2be6967cdbc9
>>>> ("net/mlx5e: Support ETH_RSS_HASH_XOR").
>>>>
>>>> Hash function could be set via ethtool. But it would be nice to have
>>>> single standard for drivers or proper description why this one is
>>>> special.
>>>>
>>>> Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
>>>> ---
>>>
>>>
>>> Hi Konstantin,
>>>
>>> Thanks for the patch.
>>>
>>> I understand the motivation.
>>>
>>> This change affects the default out-of-the-box behavior and requires a
>>> full performance cycle. We'll run performance regression tomorrow, results
>>> should be ready by EOW.
>>> > I'll update.
>>
>>
>> Ok, thank you.
>>
>> The only mention I've found in your documentation
>> http://www.mellanox.com/related-docs/prod_software/Mellanox_EN_for_Linux_User_Manual_v4_4.pdf
>>
>> is
>> ---
>> 1.1.10 RSS Support
>> 1.1.10.1 RSS Hash Function
>> The device has the ability to use XOR as the RSS distribution function,
>> instead of the default
>> Toplitz function.
>> The XOR function can be better distributed among driver's receive queues in
>> small number of
>> streams, where it distributes each TCP/UDP stream to a different queue.
>> ---
>>
>> So Toeplitz is supposed to be default hash function for all versions of
>> drivers and hardware.
>>
>> Also XOR8 seems vulnerable for ddos - hash is predictable, no random\secret
>> vector, only 8 bits.
>> So, it's easy to route all flows into one point. As we got it by accident.
>>
>> Moreover, in kernel 4.4.y hash switch via ethtool is broken and does not
>> work =)
>>
>
> is it broken in mlx5 only or for the whole kernel ?
In mlx5 only - interface works but makes no effect.
For now we have disabled RXHASH as workaround.
>
> If it is mlx5 then this might be the reason:
> commit 2d75b2bc8a8c0ce5567a6ecef52e194d117efe3f
> net/mlx5e: Add ethtool RSS configuration options
>
> was submitted to kernel 4.3
>
> and an important fix for hash function change was submitted to 4.5:
>
> commit bdfc028de1b3cd59490d5413a5c87b0fa50040c2
> Author: Tariq Toukan <tariqt@mellanox.com>
> Date: Mon Feb 29 21:17:12 2016 +0200
>
> net/mlx5e: Fix ethtool RX hash func configuration change
>
> We should modify TIRs explicitly to apply the new RSS configuration.
> The light ndo close/open calls do not "refresh" them.
>
> Fixes: 2d75b2bc8a8c ('net/mlx5e: Add ethtool RSS configuration options')
>
I think so, but this patch also seems flawed and fixed in
commit a100ff3eef193d2d79daf98dcd97a54776ffeb78
Author: Gal Pressman <galp@mellanox.com>
Date: Thu Jan 12 16:25:46 2017 +0200
net/mlx5e: Fix update of hash function/key via ethtool
Modifying TIR hash should change selected fields bitmask in addition to
the function and key.
And maybe more, there a lot of progress since v4.4
>
>>>
>>> Regards,
>>> Tariq
^ permalink raw reply
* Re: [PATCH 1/7] fix hnode refcounting
From: Jamal Hadi Salim @ 2018-09-06 10:21 UTC (permalink / raw)
To: Al Viro, netdev; +Cc: Cong Wang, Jiri Pirko, stable
In-Reply-To: <20180905190414.5477-1-viro@ZenIV.linux.org.uk>
On 2018-09-05 3:04 p.m., Al Viro wrote:
> From: Al Viro <viro@zeniv.linux.org.uk>
>
> cls_u32.c misuses refcounts for struct tc_u_hnode - it counts references via
> ->hlist and via ->tp_root together. u32_destroy() drops the former and, in
> case when there had been links, leaves the sucker on the list. As the result,
> there's nothing to protect it from getting freed once links are dropped.
> That also makes the "is it busy" check incapable of catching the root hnode -
> it *is* busy (there's a reference from tp), but we don't see it as something
> separate. "Is it our root?" check partially covers that, but the problem
> exists for others' roots as well.
>
> AFAICS, the minimal fix preserving the existing behaviour (where it doesn't
> include oopsen, that is) would be this:
> * count tp->root and tp_c->hlist as separate references. I.e.
> have u32_init() set refcount to 2, not 1.
> * in u32_destroy() we always drop the former; in u32_destroy_hnode() -
> the latter.
>
> That way we have *all* references contributing to refcount. List
> removal happens in u32_destroy_hnode() (called only when ->refcnt is 1)
> an in u32_destroy() in case of tc_u_common going away, along with everything
> reachable from it. IOW, that way we know that u32_destroy_key() won't
> free something still on the list (or pointed to by someone's ->root).
>
> Cc: stable@vger.kernel.org
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
For networking patches, subject should be reflective of tree and
subsystem. Example for this one:
"[PATCH net 1/7]:net: sched: cls_u32: fix hnode refcounting"
Also useful to have a cover letter summarizing the patchset
in 0/7. Otherwise
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
cheers,
jamal
^ permalink raw reply
* Re: [PATCH 2/7] mark root hnode explicitly
From: Jamal Hadi Salim @ 2018-09-06 10:28 UTC (permalink / raw)
To: Al Viro, netdev; +Cc: Cong Wang, Jiri Pirko
In-Reply-To: <20180905190414.5477-2-viro@ZenIV.linux.org.uk>
On 2018-09-05 3:04 p.m., Al Viro wrote:
> From: Al Viro <viro@zeniv.linux.org.uk>
>
> ... and disallow deleting or linking to such
>
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Same comment as other one in regards to subject
Since the flag space is coming from htnode which is
exposed via uapi it makes sense to keep this one here
because it is for private use; but a comment in
include/uapi/linux/pkt_cls.h that this flag or
maybe a set of bits is reserved for internal use.
Otherwise:
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
cheers,
jamal
^ permalink raw reply
* Re: [PATCH 3/7] make sure that divisor is a power of 2
From: Jamal Hadi Salim @ 2018-09-06 10:28 UTC (permalink / raw)
To: Al Viro, netdev; +Cc: Cong Wang, Jiri Pirko
In-Reply-To: <20180905190414.5477-3-viro@ZenIV.linux.org.uk>
On 2018-09-05 3:04 p.m., Al Viro wrote:
> From: Al Viro <viro@zeniv.linux.org.uk>
>
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
cheers,
jamal
^ permalink raw reply
* Re: [PATCH 2/7] mark root hnode explicitly
From: Jamal Hadi Salim @ 2018-09-06 10:34 UTC (permalink / raw)
To: Al Viro, netdev; +Cc: Cong Wang, Jiri Pirko
In-Reply-To: <8d3e140a-eb1b-3aae-d9e7-36d103a54e5e@mojatatu.com>
On 2018-09-06 6:28 a.m., Jamal Hadi Salim wrote:
> On 2018-09-05 3:04 p.m., Al Viro wrote:
>> From: Al Viro <viro@zeniv.linux.org.uk>
>>
>> ... and disallow deleting or linking to such
>>
>> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
>
> Same comment as other one in regards to subject
>
> Since the flag space is coming from htnode which is
> exposed via uapi it makes sense to keep this one here
> because it is for private use; but a comment in
> include/uapi/linux/pkt_cls.h that this flag or
> maybe a set of bits is reserved for internal use.
> Otherwise:
>
> Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Sorry, additional comment:
It makes sense to reject user space attempt to
set TCA_CLS_FLAGS_U32_ROOT
So my suggestion is to update tc_flags_valid() to
check for this.
cheers,
jamal
^ permalink raw reply
* Re: [PATCH 4/7] get rid of unused argument of u32_destroy_key()
From: Jamal Hadi Salim @ 2018-09-06 10:34 UTC (permalink / raw)
To: Al Viro, netdev; +Cc: Cong Wang, Jiri Pirko
In-Reply-To: <20180905190414.5477-4-viro@ZenIV.linux.org.uk>
On 2018-09-05 3:04 p.m., Al Viro wrote:
> From: Al Viro <viro@zeniv.linux.org.uk>
>
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
cheers,
jamal
^ permalink raw reply
* Re: [PATCH 5/7] get rid of tc_u_knode ->tp
From: Jamal Hadi Salim @ 2018-09-06 10:35 UTC (permalink / raw)
To: Al Viro, netdev; +Cc: Cong Wang, Jiri Pirko
In-Reply-To: <20180905190414.5477-5-viro@ZenIV.linux.org.uk>
On 2018-09-05 3:04 p.m., Al Viro wrote:
> From: Al Viro <viro@zeniv.linux.org.uk>
>
> not used anymore
>
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
cheers,
jamal
^ permalink raw reply
* Re: [PATCH 6/7] get rid of tc_u_common ->rcu
From: Jamal Hadi Salim @ 2018-09-06 10:36 UTC (permalink / raw)
To: Al Viro, netdev; +Cc: Cong Wang, Jiri Pirko
In-Reply-To: <20180905190414.5477-6-viro@ZenIV.linux.org.uk>
On 2018-09-05 3:04 p.m., Al Viro wrote:
> From: Al Viro <viro@zeniv.linux.org.uk>
>
> unused
>
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
cheers,
jamal
^ permalink raw reply
* Re: [PATCH 7/7] clean tc_u_common hashtable
From: Jamal Hadi Salim @ 2018-09-06 10:36 UTC (permalink / raw)
To: Al Viro, netdev; +Cc: Cong Wang, Jiri Pirko
In-Reply-To: <20180905190414.5477-7-viro@ZenIV.linux.org.uk>
On 2018-09-05 3:04 p.m., Al Viro wrote:
> From: Al Viro <viro@zeniv.linux.org.uk>
>
> * calculate key *once*, not for each hash chain element
> * let tc_u_hash() return the pointer to chain head rather than index -
> callers are cleaner that way.
>
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
cheers,
jamal
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox