* [RFC PATCH 01/12] net: qdisc: use rcu prefix and silence sparse warnings
2014-01-10 9:36 [RFC PATCH 00/12] RCU'ify the net:sched classifier chains John Fastabend
@ 2014-01-10 9:37 ` John Fastabend
2014-01-10 9:37 ` [RFC PATCH 02/12] net: rcu-ify tcf_proto John Fastabend
` (12 subsequent siblings)
13 siblings, 0 replies; 20+ messages in thread
From: John Fastabend @ 2014-01-10 9:37 UTC (permalink / raw)
To: xiyou.wangcong, jhs, eric.dumazet; +Cc: netdev, davem
Add __rcu annotation to the qdisc handling; doing so makes sparse and
smatch output more legible. In any case, some of these call sites should
already be using rcu_dereference(); see qdisc_all_tx_empty(),
qdisc_tx_changing(), and so on.
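For illustration, the pattern being applied is roughly the following (a
minimal sketch, not code from the patch; struct txq_like and the
reader()/writer() helpers are made-up names):

    #include <linux/rcupdate.h>
    #include <linux/rtnetlink.h>
    #include <net/sch_generic.h>

    struct txq_like {
        struct Qdisc __rcu *qdisc;  /* __rcu lets sparse flag unannotated accesses */
    };

    /* fast path: read the pointer inside an RCU read-side section */
    static void reader(struct txq_like *txq)
    {
        struct Qdisc *q;

        rcu_read_lock();
        q = rcu_dereference(txq->qdisc);
        if (q)
            pr_debug("qlen %u\n", q->q.qlen);  /* use q only inside the section */
        rcu_read_unlock();
    }

    /* control path: RTNL held, so rtnl_dereference()/rcu_assign_pointer() */
    static void writer(struct txq_like *txq, struct Qdisc *new)
    {
        struct Qdisc *old = rtnl_dereference(txq->qdisc);

        rcu_assign_pointer(txq->qdisc, new);  /* publish with the needed barrier */
        (void)old;  /* old may only be freed after a grace period */
    }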
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
include/linux/netdevice.h | 41 ++++------------------------------
include/net/sch_generic.h | 29 +++++++++++++++++++-----
net/core/dev.c | 54 +++++++++++++++++++++++++++++++++++++++++++--
net/sched/sch_generic.c | 4 ++-
net/sched/sch_mqprio.c | 4 ++-
net/sched/sch_teql.c | 9 ++++----
6 files changed, 89 insertions(+), 52 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 88afa80..797e473 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -546,7 +546,7 @@ struct netdev_queue {
* read mostly part
*/
struct net_device *dev;
- struct Qdisc *qdisc;
+ struct Qdisc __rcu *qdisc;
struct Qdisc *qdisc_sleeping;
#ifdef CONFIG_SYSFS
struct kobject kobj;
@@ -1987,13 +1987,8 @@ static inline void input_queue_tail_incr_save(struct softnet_data *sd,
DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
-void __netif_schedule(struct Qdisc *q);
-
-static inline void netif_schedule_queue(struct netdev_queue *txq)
-{
- if (!(txq->state & QUEUE_STATE_ANY_XOFF))
- __netif_schedule(txq->qdisc);
-}
+extern void __netif_schedule(struct Qdisc *q);
+extern void netif_schedule_queue(struct netdev_queue *txq);
static inline void netif_tx_schedule_all(struct net_device *dev)
{
@@ -2029,17 +2024,7 @@ static inline void netif_tx_start_all_queues(struct net_device *dev)
}
}
-static inline void netif_tx_wake_queue(struct netdev_queue *dev_queue)
-{
-#ifdef CONFIG_NETPOLL_TRAP
- if (netpoll_trap()) {
- netif_tx_start_queue(dev_queue);
- return;
- }
-#endif
- if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state))
- __netif_schedule(dev_queue->qdisc);
-}
+extern void netif_tx_wake_queue(struct netdev_queue *dev_queue);
/**
* netif_wake_queue - restart transmit
@@ -2288,23 +2273,7 @@ static inline bool netif_subqueue_stopped(const struct net_device *dev,
return __netif_subqueue_stopped(dev, skb_get_queue_mapping(skb));
}
-/**
- * netif_wake_subqueue - allow sending packets on subqueue
- * @dev: network device
- * @queue_index: sub queue index
- *
- * Resume individual transmit queue of a device with multiple transmit queues.
- */
-static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
-{
- struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
-#ifdef CONFIG_NETPOLL_TRAP
- if (netpoll_trap())
- return;
-#endif
- if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state))
- __netif_schedule(txq->qdisc);
-}
+extern void netif_wake_subqueue(struct net_device *dev, u16 queue_index);
#ifdef CONFIG_XPS
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 013d96d..3e10a2a 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -259,7 +259,9 @@ static inline spinlock_t *qdisc_lock(struct Qdisc *qdisc)
static inline struct Qdisc *qdisc_root(const struct Qdisc *qdisc)
{
- return qdisc->dev_queue->qdisc;
+ struct Qdisc *q = rcu_dereference(qdisc->dev_queue->qdisc);
+
+ return q;
}
static inline struct Qdisc *qdisc_root_sleeping(const struct Qdisc *qdisc)
@@ -384,7 +386,7 @@ static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i)
struct Qdisc *qdisc;
for (; i < dev->num_tx_queues; i++) {
- qdisc = netdev_get_tx_queue(dev, i)->qdisc;
+ qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
if (qdisc) {
spin_lock_bh(qdisc_lock(qdisc));
qdisc_reset(qdisc);
@@ -402,13 +404,18 @@ static inline void qdisc_reset_all_tx(struct net_device *dev)
static inline bool qdisc_all_tx_empty(const struct net_device *dev)
{
unsigned int i;
+
+ rcu_read_lock();
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
- const struct Qdisc *q = txq->qdisc;
+ const struct Qdisc *q = rcu_dereference(txq->qdisc);
- if (q->q.qlen)
+ if (q->q.qlen) {
+ rcu_read_unlock();
return false;
+ }
}
+ rcu_read_unlock();
return true;
}
@@ -416,11 +423,16 @@ static inline bool qdisc_all_tx_empty(const struct net_device *dev)
static inline bool qdisc_tx_changing(const struct net_device *dev)
{
unsigned int i;
+
+ rcu_read_lock();
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
- if (txq->qdisc != txq->qdisc_sleeping)
+ if (rcu_dereference(txq->qdisc) != txq->qdisc_sleeping) {
+ rcu_read_unlock();
return true;
+ }
}
+ rcu_read_unlock();
return false;
}
@@ -428,11 +440,16 @@ static inline bool qdisc_tx_changing(const struct net_device *dev)
static inline bool qdisc_tx_is_noop(const struct net_device *dev)
{
unsigned int i;
+
+ rcu_read_lock();
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
- if (txq->qdisc != &noop_qdisc)
+ if (rcu_dereference(txq->qdisc) != &noop_qdisc) {
+ rcu_read_unlock();
return false;
+ }
}
+ rcu_read_unlock();
return true;
}
diff --git a/net/core/dev.c b/net/core/dev.c
index cc9ab80..9f761e7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2154,6 +2154,56 @@ static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
return (struct dev_kfree_skb_cb *)skb->cb;
}
+void netif_schedule_queue(struct netdev_queue *txq)
+{
+ rcu_read_lock();
+ if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
+ struct Qdisc *q = rcu_dereference(txq->qdisc);
+
+ __netif_schedule(q);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(netif_schedule_queue);
+
+/**
+ * netif_wake_subqueue - allow sending packets on subqueue
+ * @dev: network device
+ * @queue_index: sub queue index
+ *
+ * Resume individual transmit queue of a device with multiple transmit queues.
+ */
+void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
+{
+ struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
+#ifdef CONFIG_NETPOLL_TRAP
+ if (netpoll_trap())
+ return;
+#endif
+ if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
+ struct Qdisc *q = rcu_dereference(txq->qdisc);
+
+ __netif_schedule(q);
+ }
+}
+EXPORT_SYMBOL(netif_wake_subqueue);
+
+void netif_tx_wake_queue(struct netdev_queue *dev_queue)
+{
+#ifdef CONFIG_NETPOLL_TRAP
+ if (netpoll_trap()) {
+ netif_tx_start_queue(dev_queue);
+ return;
+ }
+#endif
+ if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
+ struct Qdisc *q = rcu_dereference(dev_queue->qdisc);
+
+ __netif_schedule(q);
+ }
+}
+EXPORT_SYMBOL(netif_tx_wake_queue);
+
void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
unsigned long flags;
@@ -3375,7 +3425,7 @@ static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
- q = rxq->qdisc;
+ q = rcu_dereference(rxq->qdisc);
if (q != &noop_qdisc) {
spin_lock(qdisc_lock(q));
if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
@@ -3392,7 +3442,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
{
struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
- if (!rxq || rxq->qdisc == &noop_qdisc)
+ if (!rxq || rcu_dereference_bh(rxq->qdisc) == &noop_qdisc)
goto out;
if (*pt_prev) {
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 32bb942..71d7cd9 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -776,7 +776,7 @@ static void dev_deactivate_queue(struct net_device *dev,
struct Qdisc *qdisc_default = _qdisc_default;
struct Qdisc *qdisc;
- qdisc = dev_queue->qdisc;
+ qdisc = rtnl_dereference(dev_queue->qdisc);
if (qdisc) {
spin_lock_bh(qdisc_lock(qdisc));
@@ -869,7 +869,7 @@ static void dev_init_scheduler_queue(struct net_device *dev,
{
struct Qdisc *qdisc = _qdisc;
- dev_queue->qdisc = qdisc;
+ rcu_assign_pointer(dev_queue->qdisc, qdisc);
dev_queue->qdisc_sleeping = qdisc;
}
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 6749e2f..e1b112f 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -231,7 +231,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
memset(&sch->qstats, 0, sizeof(sch->qstats));
for (i = 0; i < dev->num_tx_queues; i++) {
- qdisc = netdev_get_tx_queue(dev, i)->qdisc;
+ qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
spin_lock_bh(qdisc_lock(qdisc));
sch->q.qlen += qdisc->q.qlen;
sch->bstats.bytes += qdisc->bstats.bytes;
@@ -340,7 +340,7 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
spin_unlock_bh(d->lock);
for (i = tc.offset; i < tc.offset + tc.count; i++) {
- qdisc = netdev_get_tx_queue(dev, i)->qdisc;
+ qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
spin_lock_bh(qdisc_lock(qdisc));
bstats.bytes += qdisc->bstats.bytes;
bstats.packets += qdisc->bstats.packets;
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 4741671..b35ba70 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -100,7 +100,8 @@ teql_dequeue(struct Qdisc *sch)
skb = __skb_dequeue(&dat->q);
dat_queue = netdev_get_tx_queue(dat->m->dev, 0);
if (skb == NULL) {
- struct net_device *m = qdisc_dev(dat_queue->qdisc);
+ struct Qdisc *q = rcu_dereference_bh(dat_queue->qdisc);
+ struct net_device *m = qdisc_dev(q);
if (m) {
dat->m->slaves = sch;
netif_wake_queue(m);
@@ -157,9 +158,9 @@ teql_destroy(struct Qdisc *sch)
txq = netdev_get_tx_queue(master->dev, 0);
master->slaves = NULL;
- root_lock = qdisc_root_sleeping_lock(txq->qdisc);
+ root_lock = qdisc_root_sleeping_lock(rtnl_dereference(txq->qdisc));
spin_lock_bh(root_lock);
- qdisc_reset(txq->qdisc);
+ qdisc_reset(rtnl_dereference(txq->qdisc));
spin_unlock_bh(root_lock);
}
}
@@ -266,7 +267,7 @@ static inline int teql_resolve(struct sk_buff *skb,
struct dst_entry *dst = skb_dst(skb);
int res;
- if (txq->qdisc == &noop_qdisc)
+ if (rcu_dereference_bh(txq->qdisc) == &noop_qdisc)
return -ENODEV;
if (!dev->header_ops || !dst)
* [RFC PATCH 02/12] net: rcu-ify tcf_proto
2014-01-10 9:36 [RFC PATCH 00/12] RCU'ify the net:sched classifier chains John Fastabend
2014-01-10 9:37 ` [RFC PATCH 01/12] net: qdisc: use rcu prefix and silence sparse warnings John Fastabend
@ 2014-01-10 9:37 ` John Fastabend
2014-01-10 9:38 ` [RFC PATCH 03/12] net: sched: cls_basic use RCU John Fastabend
` (11 subsequent siblings)
13 siblings, 0 replies; 20+ messages in thread
From: John Fastabend @ 2014-01-10 9:37 UTC (permalink / raw)
To: xiyou.wangcong, jhs, eric.dumazet; +Cc: netdev, davem
RCU'ify tcf_proto; this allows calling tc_classify() without holding
any locks. Updaters are protected by RTNL.

This patch prepares the core net_sched infrastructure for running
the classifier/action chains without holding the qdisc lock; however,
it does nothing to ensure the cls_xxx and act_xxx types also work
without locking. Additional patches are required to address the fallout.
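The resulting reader/updater split looks roughly as follows (a sketch
paraphrasing the hunks below; walk_chain() and insert_head() are
illustrative names, not functions added by the patch):

    /* data path: softirq context, caller already under rcu_read_lock_bh();
     * tp is the already-dereferenced head of the chain
     */
    static int walk_chain(struct sk_buff *skb, const struct tcf_proto *tp,
                          struct tcf_result *res)
    {
        for (; tp; tp = rcu_dereference_bh(tp->next)) {
            int err = tp->classify(skb, tp, res);

            if (err >= 0)
                return err;
        }
        return -1;  /* no match */
    }

    /* control path: RTNL held, so rtnl_dereference() is sufficient to read */
    static void insert_head(struct tcf_proto __rcu **chain, struct tcf_proto *tp)
    {
        rcu_assign_pointer(tp->next, rtnl_dereference(*chain));
        rcu_assign_pointer(*chain, tp);  /* publish the new head last */
    }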
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
include/net/sch_generic.h | 5 +++--
net/sched/cls_api.c | 29 ++++++++++++++---------------
net/sched/sch_api.c | 6 +++---
net/sched/sch_atm.c | 30 +++++++++++++++++++-----------
net/sched/sch_cbq.c | 21 +++++++++++++++------
net/sched/sch_choke.c | 18 +++++++++++++-----
net/sched/sch_drr.c | 10 +++++++---
net/sched/sch_dsmark.c | 8 +++++---
net/sched/sch_fq_codel.c | 11 +++++++----
net/sched/sch_hfsc.c | 17 +++++++++++------
net/sched/sch_htb.c | 23 +++++++++++++++--------
net/sched/sch_ingress.c | 8 +++++---
net/sched/sch_multiq.c | 8 +++++---
net/sched/sch_prio.c | 11 +++++++----
net/sched/sch_qfq.c | 9 ++++++---
net/sched/sch_sfb.c | 15 +++++++++------
net/sched/sch_sfq.c | 11 +++++++----
17 files changed, 151 insertions(+), 89 deletions(-)
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 3e10a2a..11b878b 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -212,8 +212,8 @@ struct tcf_proto_ops {
struct tcf_proto {
/* Fast access part */
- struct tcf_proto *next;
- void *root;
+ struct tcf_proto __rcu *next;
+ void __rcu *root;
int (*classify)(struct sk_buff *,
const struct tcf_proto *,
struct tcf_result *);
@@ -225,6 +225,7 @@ struct tcf_proto {
struct Qdisc *q;
void *data;
const struct tcf_proto_ops *ops;
+ struct rcu_head rcu;
};
struct qdisc_skb_cb {
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 12e882e..86c3678 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -117,7 +117,6 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
{
struct net *net = sock_net(skb->sk);
struct nlattr *tca[TCA_MAX + 1];
- spinlock_t *root_lock;
struct tcmsg *t;
u32 protocol;
u32 prio;
@@ -125,7 +124,8 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
u32 parent;
struct net_device *dev;
struct Qdisc *q;
- struct tcf_proto **back, **chain;
+ struct tcf_proto __rcu **back;
+ struct tcf_proto __rcu **chain;
struct tcf_proto *tp;
const struct tcf_proto_ops *tp_ops;
const struct Qdisc_class_ops *cops;
@@ -196,7 +196,9 @@ replay:
goto errout;
/* Check the chain for existence of proto-tcf with this priority */
- for (back = chain; (tp = *back) != NULL; back = &tp->next) {
+ for (back = chain;
+ (tp = rtnl_dereference(*back)) != NULL;
+ back = &tp->next) {
if (tp->prio >= prio) {
if (tp->prio == prio) {
if (!nprio ||
@@ -208,8 +210,6 @@ replay:
}
}
- root_lock = qdisc_root_sleeping_lock(q);
-
if (tp == NULL) {
/* Proto-tcf does not exist, create new one */
@@ -258,7 +258,7 @@ replay:
}
tp->ops = tp_ops;
tp->protocol = protocol;
- tp->prio = nprio ? : TC_H_MAJ(tcf_auto_prio(*back));
+ tp->prio = nprio ? : TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back)));
tp->q = q;
tp->classify = tp_ops->classify;
tp->classid = parent;
@@ -279,9 +279,9 @@ replay:
if (fh == 0) {
if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
- spin_lock_bh(root_lock);
- *back = tp->next;
- spin_unlock_bh(root_lock);
+ struct tcf_proto *next = rtnl_dereference(tp->next);
+
+ rcu_assign_pointer(*back, next);
tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER);
tcf_destroy(tp);
@@ -320,10 +320,8 @@ replay:
err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh);
if (err == 0) {
if (tp_created) {
- spin_lock_bh(root_lock);
- tp->next = *back;
- *back = tp;
- spin_unlock_bh(root_lock);
+ rcu_assign_pointer(tp->next, rtnl_dereference(*back));
+ rcu_assign_pointer(*back, tp);
}
tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER);
} else {
@@ -417,7 +415,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
int s_t;
struct net_device *dev;
struct Qdisc *q;
- struct tcf_proto *tp, **chain;
+ struct tcf_proto *tp, __rcu **chain;
struct tcmsg *tcm = nlmsg_data(cb->nlh);
unsigned long cl = 0;
const struct Qdisc_class_ops *cops;
@@ -451,7 +449,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
s_t = cb->args[0];
- for (tp = *chain, t = 0; tp; tp = tp->next, t++) {
+ for (tp = rtnl_dereference(*chain), t = 0;
+ tp; tp = rtnl_dereference(tp->next), t++) {
if (t < s_t)
continue;
if (TC_H_MAJ(tcm->tcm_info) &&
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 1313145..774f61c 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1778,7 +1778,7 @@ int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
__be16 protocol = skb->protocol;
int err;
- for (; tp; tp = tp->next) {
+ for (; tp; tp = rcu_dereference_bh(tp->next)) {
if (tp->protocol != protocol &&
tp->protocol != htons(ETH_P_ALL))
continue;
@@ -1830,7 +1830,7 @@ void tcf_destroy(struct tcf_proto *tp)
{
tp->ops->destroy(tp);
module_put(tp->ops->owner);
- kfree(tp);
+ kfree_rcu(tp, rcu);
}
void tcf_destroy_chain(struct tcf_proto **fl)
@@ -1838,7 +1838,7 @@ void tcf_destroy_chain(struct tcf_proto **fl)
struct tcf_proto *tp;
while ((tp = *fl) != NULL) {
- *fl = tp->next;
+ *fl = rtnl_dereference(tp->next);
tcf_destroy(tp);
}
}
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 1f9c314..a4953c6 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -41,7 +41,7 @@
struct atm_flow_data {
struct Qdisc *q; /* FIFO, TBF, etc. */
- struct tcf_proto *filter_list;
+ struct tcf_proto __rcu *filter_list;
struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */
void (*old_pop)(struct atm_vcc *vcc,
struct sk_buff *skb); /* chaining */
@@ -134,6 +134,7 @@ static void atm_tc_put(struct Qdisc *sch, unsigned long cl)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
struct atm_flow_data *flow = (struct atm_flow_data *)cl;
+ struct tcf_proto *fl;
pr_debug("atm_tc_put(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
if (--flow->ref)
@@ -142,7 +143,10 @@ static void atm_tc_put(struct Qdisc *sch, unsigned long cl)
list_del_init(&flow->list);
pr_debug("atm_tc_put: qdisc %p\n", flow->q);
qdisc_destroy(flow->q);
- tcf_destroy_chain(&flow->filter_list);
+
+ fl = rtnl_dereference(flow->filter_list);
+ tcf_destroy_chain(&fl);
+
if (flow->sock) {
pr_debug("atm_tc_put: f_count %ld\n",
file_count(flow->sock->file));
@@ -273,7 +277,7 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
error = -ENOBUFS;
goto err_out;
}
- flow->filter_list = NULL;
+ rcu_assign_pointer(flow->filter_list, NULL);
flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
if (!flow->q)
flow->q = &noop_qdisc;
@@ -311,7 +315,7 @@ static int atm_tc_delete(struct Qdisc *sch, unsigned long arg)
pr_debug("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
if (list_empty(&flow->list))
return -EINVAL;
- if (flow->filter_list || flow == &p->link)
+ if (rtnl_dereference(flow->filter_list) || flow == &p->link)
return -EBUSY;
/*
* Reference count must be 2: one for "keepalive" (set at class
@@ -369,11 +373,12 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
flow = NULL;
if (TC_H_MAJ(skb->priority) != sch->handle ||
!(flow = (struct atm_flow_data *)atm_tc_get(sch, skb->priority))) {
+ struct tcf_proto *fl;
+
list_for_each_entry(flow, &p->flows, list) {
- if (flow->filter_list) {
- result = tc_classify_compat(skb,
- flow->filter_list,
- &res);
+ fl = rcu_dereference_bh(flow->filter_list);
+ if (fl) {
+ result = tc_classify_compat(skb, fl, &res);
if (result < 0)
continue;
flow = (struct atm_flow_data *)res.class;
@@ -544,7 +549,7 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt)
if (!p->link.q)
p->link.q = &noop_qdisc;
pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q);
- p->link.filter_list = NULL;
+ rcu_assign_pointer(p->link.filter_list, NULL);
p->link.vcc = NULL;
p->link.sock = NULL;
p->link.classid = sch->handle;
@@ -568,10 +573,13 @@ static void atm_tc_destroy(struct Qdisc *sch)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
struct atm_flow_data *flow, *tmp;
+ struct tcf_proto *fl;
pr_debug("atm_tc_destroy(sch %p,[qdisc %p])\n", sch, p);
- list_for_each_entry(flow, &p->flows, list)
- tcf_destroy_chain(&flow->filter_list);
+ list_for_each_entry(flow, &p->flows, list) {
+ fl = rtnl_dereference(flow->filter_list);
+ tcf_destroy_chain(&fl);
+ }
list_for_each_entry_safe(flow, tmp, &p->flows, list) {
if (flow->ref > 1)
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 2f80d01..61a51d0 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -133,7 +133,7 @@ struct cbq_class {
struct gnet_stats_rate_est64 rate_est;
struct tc_cbq_xstats xstats;
- struct tcf_proto *filter_list;
+ struct tcf_proto __rcu *filter_list;
int refcnt;
int filters;
@@ -222,6 +222,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
struct cbq_class **defmap;
struct cbq_class *cl = NULL;
u32 prio = skb->priority;
+ struct tcf_proto *fl;
struct tcf_result res;
/*
@@ -234,13 +235,15 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
for (;;) {
int result = 0;
+
defmap = head->defaults;
+ fl = rcu_dereference_bh(head->filter_list);
/*
* Step 2+n. Apply classifier.
*/
- if (!head->filter_list ||
- (result = tc_classify_compat(skb, head->filter_list, &res)) < 0)
+ result = tc_classify_compat(skb, fl, &res);
+ if (!fl || result < 0)
goto fallback;
cl = (void *)res.class;
@@ -1685,10 +1688,13 @@ static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
{
struct cbq_sched_data *q = qdisc_priv(sch);
+ struct tcf_proto *fl;
WARN_ON(cl->filters);
- tcf_destroy_chain(&cl->filter_list);
+ fl = rtnl_dereference(cl->filter_list);
+ tcf_destroy_chain(&fl);
+
qdisc_destroy(cl->q);
qdisc_put_rtab(cl->R_tab);
gen_kill_estimator(&cl->bstats, &cl->rate_est);
@@ -1701,6 +1707,7 @@ static void cbq_destroy(struct Qdisc *sch)
struct cbq_sched_data *q = qdisc_priv(sch);
struct hlist_node *next;
struct cbq_class *cl;
+ struct tcf_proto *fl;
unsigned int h;
#ifdef CONFIG_NET_CLS_ACT
@@ -1712,8 +1719,10 @@ static void cbq_destroy(struct Qdisc *sch)
* be bound to classes which have been destroyed already. --TGR '04
*/
for (h = 0; h < q->clhash.hashsize; h++) {
- hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode)
- tcf_destroy_chain(&cl->filter_list);
+ hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
+ fl = rtnl_dereference(cl->filter_list);
+ tcf_destroy_chain(&fl);
+ }
}
for (h = 0; h < q->clhash.hashsize; h++) {
hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index ddd73cb..60eb0e5 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -58,7 +58,7 @@ struct choke_sched_data {
/* Variables */
struct red_vars vars;
- struct tcf_proto *filter_list;
+ struct tcf_proto __rcu *filter_list;
struct {
u32 prob_drop; /* Early probability drops */
u32 prob_mark; /* Early probability marks */
@@ -200,9 +200,11 @@ static bool choke_classify(struct sk_buff *skb,
{
struct choke_sched_data *q = qdisc_priv(sch);
struct tcf_result res;
+ struct tcf_proto *fl;
int result;
- result = tc_classify(skb, q->filter_list, &res);
+ fl = rcu_dereference_bh(q->filter_list);
+ result = tc_classify(skb, fl, &res);
if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
@@ -251,12 +253,14 @@ static bool choke_match_random(const struct choke_sched_data *q,
unsigned int *pidx)
{
struct sk_buff *oskb;
+ struct tcf_proto *fl;
if (q->head == q->tail)
return false;
oskb = choke_peek_random(q, pidx);
- if (q->filter_list)
+ fl = rcu_dereference_bh(q->filter_list);
+ if (fl)
return choke_get_classid(nskb) == choke_get_classid(oskb);
return choke_match_flow(oskb, nskb);
@@ -266,9 +270,11 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct choke_sched_data *q = qdisc_priv(sch);
const struct red_parms *p = &q->parms;
+ struct tcf_proto *fl;
int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
- if (q->filter_list) {
+ fl = rcu_dereference_bh(q->filter_list);
+ if (fl) {
/* If using external classifiers, get result and record it. */
if (!choke_classify(skb, sch, &ret))
goto other_drop; /* Packet was eaten by filter */
@@ -541,8 +547,10 @@ static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
static void choke_destroy(struct Qdisc *sch)
{
struct choke_sched_data *q = qdisc_priv(sch);
+ struct tcf_proto *fl;
- tcf_destroy_chain(&q->filter_list);
+ fl = rtnl_dereference(q->filter_list);
+ tcf_destroy_chain(&fl);
choke_free(q->tab);
}
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index 8302717..899026f 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -35,7 +35,7 @@ struct drr_class {
struct drr_sched {
struct list_head active;
- struct tcf_proto *filter_list;
+ struct tcf_proto __rcu *filter_list;
struct Qdisc_class_hash clhash;
};
@@ -319,6 +319,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch,
struct drr_sched *q = qdisc_priv(sch);
struct drr_class *cl;
struct tcf_result res;
+ struct tcf_proto *fl;
int result;
if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) {
@@ -328,7 +329,8 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch,
}
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
- result = tc_classify(skb, q->filter_list, &res);
+ fl = rcu_dereference_bh(q->filter_list);
+ result = tc_classify(skb, fl, &res);
if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
@@ -467,9 +469,11 @@ static void drr_destroy_qdisc(struct Qdisc *sch)
struct drr_sched *q = qdisc_priv(sch);
struct drr_class *cl;
struct hlist_node *next;
+ struct tcf_proto *fl;
unsigned int i;
- tcf_destroy_chain(&q->filter_list);
+ fl = rtnl_dereference(q->filter_list);
+ tcf_destroy_chain(&fl);
for (i = 0; i < q->clhash.hashsize; i++) {
hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 49d6ef3..6b84bd4 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -37,7 +37,7 @@
struct dsmark_qdisc_data {
struct Qdisc *q;
- struct tcf_proto *filter_list;
+ struct tcf_proto __rcu *filter_list;
u8 *mask; /* "owns" the array */
u8 *value;
u16 indices;
@@ -229,7 +229,8 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
skb->tc_index = TC_H_MIN(skb->priority);
else {
struct tcf_result res;
- int result = tc_classify(skb, p->filter_list, &res);
+ struct tcf_proto *fl = rcu_dereference_bh(p->filter_list);
+ int result = tc_classify(skb, fl, &res);
pr_debug("result %d class 0x%04x\n", result, res.classid);
@@ -404,10 +405,11 @@ static void dsmark_reset(struct Qdisc *sch)
static void dsmark_destroy(struct Qdisc *sch)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
+ struct tcf_proto *fl = rtnl_dereference(p->filter_list);
pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
- tcf_destroy_chain(&p->filter_list);
+ tcf_destroy_chain(&fl);
qdisc_destroy(p->q);
kfree(p->mask);
}
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 5578628..bc47789 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -52,7 +52,7 @@ struct fq_codel_flow {
}; /* please try to keep this structure <= 64 bytes */
struct fq_codel_sched_data {
- struct tcf_proto *filter_list; /* optional external classifier */
+ struct tcf_proto __rcu *filter_list; /* optional external classifier */
struct fq_codel_flow *flows; /* Flows table [flows_cnt] */
u32 *backlogs; /* backlog table [flows_cnt] */
u32 flows_cnt; /* number of flows */
@@ -84,6 +84,7 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
int *qerr)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
+ struct tcf_proto *filter;
struct tcf_result res;
int result;
@@ -92,11 +93,12 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
TC_H_MIN(skb->priority) <= q->flows_cnt)
return TC_H_MIN(skb->priority);
- if (!q->filter_list)
+ filter = rcu_dereference(q->filter_list);
+ if (!filter)
return fq_codel_hash(q, skb) + 1;
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
- result = tc_classify(skb, q->filter_list, &res);
+ result = tc_classify(skb, filter, &res);
if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
@@ -376,8 +378,9 @@ static void fq_codel_free(void *addr)
static void fq_codel_destroy(struct Qdisc *sch)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
+ struct tcf_proto *filter = rtnl_dereference(q->filter_list);
- tcf_destroy_chain(&q->filter_list);
+ tcf_destroy_chain(&filter);
fq_codel_free(q->backlogs);
fq_codel_free(q->flows);
}
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index c407561..0a7650f 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -116,7 +116,7 @@ struct hfsc_class {
struct gnet_stats_queue qstats;
struct gnet_stats_rate_est64 rate_est;
unsigned int level; /* class level in hierarchy */
- struct tcf_proto *filter_list; /* filter list */
+ struct tcf_proto __rcu *filter_list; /* filter list */
unsigned int filter_cnt; /* filter count */
struct hfsc_sched *sched; /* scheduler data */
@@ -1110,8 +1110,10 @@ static void
hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
{
struct hfsc_sched *q = qdisc_priv(sch);
+ struct tcf_proto *fl;
- tcf_destroy_chain(&cl->filter_list);
+ fl = rtnl_dereference(cl->filter_list);
+ tcf_destroy_chain(&fl);
qdisc_destroy(cl->qdisc);
gen_kill_estimator(&cl->bstats, &cl->rate_est);
if (cl != &q->root)
@@ -1161,7 +1163,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
head = &q->root;
- tcf = q->root.filter_list;
+ tcf = rcu_dereference_bh(q->root.filter_list);
while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
@@ -1185,7 +1187,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
return cl; /* hit leaf class */
/* apply inner filter chain */
- tcf = cl->filter_list;
+ tcf = rcu_dereference_bh(cl->filter_list);
head = cl;
}
@@ -1540,11 +1542,14 @@ hfsc_destroy_qdisc(struct Qdisc *sch)
struct hfsc_sched *q = qdisc_priv(sch);
struct hlist_node *next;
struct hfsc_class *cl;
+ struct tcf_proto *fl;
unsigned int i;
for (i = 0; i < q->clhash.hashsize; i++) {
- hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode)
- tcf_destroy_chain(&cl->filter_list);
+ hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) {
+ fl = rtnl_dereference(cl->filter_list);
+ tcf_destroy_chain(&fl);
+ }
}
for (i = 0; i < q->clhash.hashsize; i++) {
hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 0db5a6e..86df75c 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -103,7 +103,7 @@ struct htb_class {
u32 prio; /* these two are used only by leaves... */
int quantum; /* but stored for parent-to-leaf return */
- struct tcf_proto *filter_list; /* class attached filters */
+ struct tcf_proto __rcu *filter_list; /* class attached filters */
int filter_cnt;
int refcnt; /* usage count of this class */
@@ -153,7 +153,7 @@ struct htb_sched {
int rate2quantum; /* quant = rate / rate2quantum */
/* filters for qdisc itself */
- struct tcf_proto *filter_list;
+ struct tcf_proto __rcu *filter_list;
#define HTB_WARN_TOOMANYEVENTS 0x1
unsigned int warned; /* only one warning */
@@ -223,7 +223,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
return cl;
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
- tcf = q->filter_list;
+ tcf = rcu_dereference_bh(q->filter_list);
while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
@@ -246,7 +246,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
return cl; /* we hit leaf; return it */
/* we have got inner class; apply inner filter chain */
- tcf = cl->filter_list;
+ tcf = rcu_dereference_bh(cl->filter_list);
}
/* classification failed; try to use default class */
cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
@@ -1230,12 +1230,15 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
{
+ struct tcf_proto *fl;
+
if (!cl->level) {
WARN_ON(!cl->un.leaf.q);
qdisc_destroy(cl->un.leaf.q);
}
gen_kill_estimator(&cl->bstats, &cl->rate_est);
- tcf_destroy_chain(&cl->filter_list);
+ fl = rtnl_dereference(cl->filter_list);
+ tcf_destroy_chain(&fl);
kfree(cl);
}
@@ -1244,6 +1247,7 @@ static void htb_destroy(struct Qdisc *sch)
struct htb_sched *q = qdisc_priv(sch);
struct hlist_node *next;
struct htb_class *cl;
+ struct tcf_proto *fl;
unsigned int i;
cancel_work_sync(&q->work);
@@ -1253,11 +1257,14 @@ static void htb_destroy(struct Qdisc *sch)
* because filter need its target class alive to be able to call
* unbind_filter on it (without Oops).
*/
- tcf_destroy_chain(&q->filter_list);
+ fl = rtnl_dereference(q->filter_list);
+ tcf_destroy_chain(&fl);
for (i = 0; i < q->clhash.hashsize; i++) {
- hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode)
- tcf_destroy_chain(&cl->filter_list);
+ hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
+ fl = rtnl_dereference(cl->filter_list);
+ tcf_destroy_chain(&fl);
+ }
}
for (i = 0; i < q->clhash.hashsize; i++) {
hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index bce1665..f7c80d3 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -17,7 +17,7 @@
struct ingress_qdisc_data {
- struct tcf_proto *filter_list;
+ struct tcf_proto __rcu *filter_list;
};
/* ------------------------- Class/flow operations ------------------------- */
@@ -59,9 +59,10 @@ static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct ingress_qdisc_data *p = qdisc_priv(sch);
struct tcf_result res;
+ struct tcf_proto *fl = rcu_dereference_bh(p->filter_list);
int result;
- result = tc_classify(skb, p->filter_list, &res);
+ result = tc_classify(skb, fl, &res);
qdisc_bstats_update(sch, skb);
switch (result) {
@@ -89,8 +90,9 @@ static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch)
static void ingress_destroy(struct Qdisc *sch)
{
struct ingress_qdisc_data *p = qdisc_priv(sch);
+ struct tcf_proto *fl = rtnl_dereference(p->filter_list);
- tcf_destroy_chain(&p->filter_list);
+ tcf_destroy_chain(&fl);
}
static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb)
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index afb050a..169d637 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -31,7 +31,7 @@ struct multiq_sched_data {
u16 bands;
u16 max_bands;
u16 curband;
- struct tcf_proto *filter_list;
+ struct tcf_proto __rcu *filter_list;
struct Qdisc **queues;
};
@@ -42,10 +42,11 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
struct multiq_sched_data *q = qdisc_priv(sch);
u32 band;
struct tcf_result res;
+ struct tcf_proto *fl = rcu_dereference_bh(q->filter_list);
int err;
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
- err = tc_classify(skb, q->filter_list, &res);
+ err = tc_classify(skb, fl, &res);
#ifdef CONFIG_NET_CLS_ACT
switch (err) {
case TC_ACT_STOLEN:
@@ -188,8 +189,9 @@ multiq_destroy(struct Qdisc *sch)
{
int band;
struct multiq_sched_data *q = qdisc_priv(sch);
+ struct tcf_proto *fl = rtnl_dereference(q->filter_list);
- tcf_destroy_chain(&q->filter_list);
+ tcf_destroy_chain(&fl);
for (band = 0; band < q->bands; band++)
qdisc_destroy(q->queues[band]);
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 79359b6..43aa35d 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -24,7 +24,7 @@
struct prio_sched_data {
int bands;
- struct tcf_proto *filter_list;
+ struct tcf_proto __rcu *filter_list;
u8 prio2band[TC_PRIO_MAX+1];
struct Qdisc *queues[TCQ_PRIO_BANDS];
};
@@ -36,11 +36,13 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
struct prio_sched_data *q = qdisc_priv(sch);
u32 band = skb->priority;
struct tcf_result res;
+ struct tcf_proto *fl;
int err;
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
if (TC_H_MAJ(skb->priority) != sch->handle) {
- err = tc_classify(skb, q->filter_list, &res);
+ fl = rcu_dereference_bh(q->filter_list);
+ err = tc_classify(skb, fl, &res);
#ifdef CONFIG_NET_CLS_ACT
switch (err) {
case TC_ACT_STOLEN:
@@ -50,7 +52,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
return NULL;
}
#endif
- if (!q->filter_list || err < 0) {
+ if (!fl || err < 0) {
if (TC_H_MAJ(band))
band = 0;
return q->queues[q->prio2band[band & TC_PRIO_MAX]];
@@ -157,8 +159,9 @@ prio_destroy(struct Qdisc *sch)
{
int prio;
struct prio_sched_data *q = qdisc_priv(sch);
+ struct tcf_proto *fl = rtnl_dereference(q->filter_list);
- tcf_destroy_chain(&q->filter_list);
+ tcf_destroy_chain(&fl);
for (prio = 0; prio < q->bands; prio++)
qdisc_destroy(q->queues[prio]);
}
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 8056fb4..cee5618 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -181,7 +181,7 @@ struct qfq_group {
};
struct qfq_sched {
- struct tcf_proto *filter_list;
+ struct tcf_proto __rcu *filter_list;
struct Qdisc_class_hash clhash;
u64 oldV, V; /* Precise virtual times. */
@@ -704,6 +704,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
struct qfq_sched *q = qdisc_priv(sch);
struct qfq_class *cl;
struct tcf_result res;
+ struct tcf_proto *fl;
int result;
if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) {
@@ -714,7 +715,8 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
}
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
- result = tc_classify(skb, q->filter_list, &res);
+ fl = rcu_dereference_bh(q->filter_list);
+ result = tc_classify(skb, fl, &res);
if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
@@ -1524,9 +1526,10 @@ static void qfq_destroy_qdisc(struct Qdisc *sch)
struct qfq_sched *q = qdisc_priv(sch);
struct qfq_class *cl;
struct hlist_node *next;
+ struct tcf_proto *fl = rtnl_dereference(q->filter_list);
unsigned int i;
- tcf_destroy_chain(&q->filter_list);
+ tcf_destroy_chain(&fl);
for (i = 0; i < q->clhash.hashsize; i++) {
hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 30ea467..a4f1d6c 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -55,7 +55,7 @@ struct sfb_bins {
struct sfb_sched_data {
struct Qdisc *qdisc;
- struct tcf_proto *filter_list;
+ struct tcf_proto __rcu *filter_list;
unsigned long rehash_interval;
unsigned long warmup_time; /* double buffering warmup time in jiffies */
u32 max;
@@ -253,13 +253,13 @@ static bool sfb_rate_limit(struct sk_buff *skb, struct sfb_sched_data *q)
return false;
}
-static bool sfb_classify(struct sk_buff *skb, struct sfb_sched_data *q,
+static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl,
int *qerr, u32 *salt)
{
struct tcf_result res;
int result;
- result = tc_classify(skb, q->filter_list, &res);
+ result = tc_classify(skb, fl, &res);
if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
@@ -281,6 +281,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
struct sfb_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
+ struct tcf_proto *fl;
int i;
u32 p_min = ~0;
u32 minqlen = ~0;
@@ -306,9 +307,10 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
}
}
- if (q->filter_list) {
+ fl = rcu_dereference_bh(q->filter_list);
+ if (fl) {
/* If using external classifiers, get result and record it. */
- if (!sfb_classify(skb, q, &ret, &salt))
+ if (!sfb_classify(skb, fl, &ret, &salt))
goto other_drop;
keys.src = salt;
keys.dst = 0;
@@ -465,8 +467,9 @@ static void sfb_reset(struct Qdisc *sch)
static void sfb_destroy(struct Qdisc *sch)
{
struct sfb_sched_data *q = qdisc_priv(sch);
+ struct tcf_proto *fl = rtnl_dereference(q->filter_list);
- tcf_destroy_chain(&q->filter_list);
+ tcf_destroy_chain(&fl);
qdisc_destroy(q->qdisc);
}
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 76f01e0..da24df01 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -125,7 +125,7 @@ struct sfq_sched_data {
u8 cur_depth; /* depth of longest slot */
u8 flags;
unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
- struct tcf_proto *filter_list;
+ struct tcf_proto __rcu *filter_list;
sfq_index *ht; /* Hash table ('divisor' slots) */
struct sfq_slot *slots; /* Flows table ('maxflows' entries) */
@@ -187,6 +187,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
{
struct sfq_sched_data *q = qdisc_priv(sch);
struct tcf_result res;
+ struct tcf_proto *fl;
int result;
if (TC_H_MAJ(skb->priority) == sch->handle &&
@@ -194,13 +195,14 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
TC_H_MIN(skb->priority) <= q->divisor)
return TC_H_MIN(skb->priority);
- if (!q->filter_list) {
+ fl = rcu_dereference_bh(q->filter_list);
+ if (!fl) {
skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
return sfq_hash(q, skb) + 1;
}
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
- result = tc_classify(skb, q->filter_list, &res);
+ result = tc_classify(skb, fl, &res);
if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
@@ -727,8 +729,9 @@ static void sfq_free(void *addr)
static void sfq_destroy(struct Qdisc *sch)
{
struct sfq_sched_data *q = qdisc_priv(sch);
+ struct tcf_proto *fl = rtnl_dereference(q->filter_list);
- tcf_destroy_chain(&q->filter_list);
+ tcf_destroy_chain(&fl);
q->perturb_period = 0;
del_timer_sync(&q->perturb_timer);
sfq_free(q->ht);
* [RFC PATCH 03/12] net: sched: cls_basic use RCU
2014-01-10 9:36 [RFC PATCH 00/12] RCU'ify the net:sched classifier chains John Fastabend
2014-01-10 9:37 ` [RFC PATCH 01/12] net: qdisc: use rcu prefix and silence sparse warnings John Fastabend
2014-01-10 9:37 ` [RFC PATCH 02/12] net: rcu-ify tcf_proto John Fastabend
@ 2014-01-10 9:38 ` John Fastabend
2014-01-10 9:38 ` [RFC PATCH 04/12] net: sched: cls_cgroup " John Fastabend
` (10 subsequent siblings)
13 siblings, 0 replies; 20+ messages in thread
From: John Fastabend @ 2014-01-10 9:38 UTC (permalink / raw)
To: xiyou.wangcong, jhs, eric.dumazet; +Cc: netdev, davem
Enable the basic classifier for RCU.

Dereferencing tp->root may look a bit strange here, but by my accounting
it is needed: the head is allocated at init time and must be kfree'd at
destroy time. However, because it may still be referenced on the
classify() path, we must wait an RCU grace period before freeing it.
We use kfree_rcu() and the rcu_* APIs to enforce this. This pattern
is used in all the classifiers.

Also, the hgenerator can be incremented without concern because it
is only ever incremented under RTNL.
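In the abstract, the lifetime rules end up as in the sketch below
(a paraphrase of the diff, not the exact code; filter_free_rcu() and
old_head are illustrative names, and the structures are assumed to embed
a struct rcu_head named 'rcu'):

    static void filter_free_rcu(struct rcu_head *head)
    {
        struct basic_filter *f = container_of(head, struct basic_filter, rcu);

        /* runs only after every rcu_dereference_bh() reader is done with f */
        kfree(f);
    }

    /* delete path, RTNL held */
    list_del_rcu(&f->link);              /* unlink; in-flight readers may still see f */
    call_rcu(&f->rcu, filter_free_rcu);  /* defer the actual free */

    /* destroy path, RTNL held */
    rcu_assign_pointer(tp->root, NULL);
    kfree_rcu(old_head, rcu);            /* the list head gets the same treatment */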
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
net/sched/cls_basic.c | 82 ++++++++++++++++++++++++++++---------------------
1 file changed, 47 insertions(+), 35 deletions(-)
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index b655203..47c1da2 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -24,6 +24,7 @@
struct basic_head {
u32 hgenerator;
struct list_head flist;
+ struct rcu_head rcu;
};
struct basic_filter {
@@ -31,17 +32,20 @@ struct basic_filter {
struct tcf_exts exts;
struct tcf_ematch_tree ematches;
struct tcf_result res;
+ struct tcf_proto *tp;
struct list_head link;
+ struct rcu_head rcu;
};
static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
int r;
- struct basic_head *head = (struct basic_head *) tp->root;
+ struct basic_head *head =
+ (struct basic_head *) rcu_dereference_bh(tp->root);
struct basic_filter *f;
- list_for_each_entry(f, &head->flist, link) {
+ list_for_each_entry_rcu(f, &head->flist, link) {
if (!tcf_em_tree_match(skb, &f->ematches, NULL))
continue;
*res = f->res;
@@ -56,7 +60,8 @@ static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp,
static unsigned long basic_get(struct tcf_proto *tp, u32 handle)
{
unsigned long l = 0UL;
- struct basic_head *head = (struct basic_head *) tp->root;
+ struct basic_head *head =
+ (struct basic_head *) rtnl_dereference(tp->root);
struct basic_filter *f;
if (head == NULL)
@@ -81,12 +86,15 @@ static int basic_init(struct tcf_proto *tp)
if (head == NULL)
return -ENOBUFS;
INIT_LIST_HEAD(&head->flist);
- tp->root = head;
+ rcu_assign_pointer(tp->root, head);
return 0;
}
-static void basic_delete_filter(struct tcf_proto *tp, struct basic_filter *f)
+static void basic_delete_filter(struct rcu_head *head)
{
+ struct basic_filter *f = container_of(head, struct basic_filter, rcu);
+ struct tcf_proto *tp = f->tp;
+
tcf_unbind_filter(tp, &f->res);
tcf_exts_destroy(tp, &f->exts);
tcf_em_tree_destroy(tp, &f->ematches);
@@ -95,27 +103,26 @@ static void basic_delete_filter(struct tcf_proto *tp, struct basic_filter *f)
static void basic_destroy(struct tcf_proto *tp)
{
- struct basic_head *head = tp->root;
+ struct basic_head *head = rtnl_dereference(tp->root);
struct basic_filter *f, *n;
list_for_each_entry_safe(f, n, &head->flist, link) {
- list_del(&f->link);
- basic_delete_filter(tp, f);
+ list_del_rcu(&f->link);
+ call_rcu(&f->rcu, basic_delete_filter);
}
- kfree(head);
+ rcu_assign_pointer(tp->root, NULL);
+ kfree_rcu(head, rcu);
}
static int basic_delete(struct tcf_proto *tp, unsigned long arg)
{
- struct basic_head *head = (struct basic_head *) tp->root;
+ struct basic_head *head = rtnl_dereference(tp->root);
struct basic_filter *t, *f = (struct basic_filter *) arg;
list_for_each_entry(t, &head->flist, link)
if (t == f) {
- tcf_tree_lock(tp);
- list_del(&t->link);
- tcf_tree_unlock(tp);
- basic_delete_filter(tp, t);
+ list_del_rcu(&t->link);
+ call_rcu(&t->rcu, basic_delete_filter);
return 0;
}
@@ -152,6 +159,7 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp,
tcf_exts_change(tp, &f->exts, &e);
tcf_em_tree_change(tp, &f->ematches, &t);
+ f->tp = tp;
return 0;
errout:
@@ -164,9 +172,10 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
struct nlattr **tca, unsigned long *arg)
{
int err;
- struct basic_head *head = (struct basic_head *) tp->root;
+ struct basic_head *head = rtnl_dereference(tp->root);
struct nlattr *tb[TCA_BASIC_MAX + 1];
- struct basic_filter *f = (struct basic_filter *) *arg;
+ struct basic_filter *fold = (struct basic_filter *) *arg;
+ struct basic_filter *fnew;
if (tca[TCA_OPTIONS] == NULL)
return -EINVAL;
@@ -176,22 +185,23 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
if (err < 0)
return err;
- if (f != NULL) {
- if (handle && f->handle != handle)
+ if (fold != NULL) {
+ if (handle && fold->handle != handle)
return -EINVAL;
- return basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE]);
}
err = -ENOBUFS;
- f = kzalloc(sizeof(*f), GFP_KERNEL);
- if (f == NULL)
+ fnew = kzalloc(sizeof(*fnew), GFP_KERNEL);
+ if (fnew == NULL)
goto errout;
- tcf_exts_init(&f->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE);
+ tcf_exts_init(&fnew->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE);
err = -EINVAL;
- if (handle)
- f->handle = handle;
- else {
+ if (handle) {
+ fnew->handle = handle;
+ } else if (fold) {
+ fnew->handle = fold->handle;
+ } else {
unsigned int i = 0x80000000;
do {
if (++head->hgenerator == 0x7FFFFFFF)
@@ -203,29 +213,31 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
goto errout;
}
- f->handle = head->hgenerator;
+ fnew->handle = head->hgenerator;
}
- err = basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE]);
+ err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE]);
if (err < 0)
goto errout;
- tcf_tree_lock(tp);
- list_add(&f->link, &head->flist);
- tcf_tree_unlock(tp);
- *arg = (unsigned long) f;
+ *arg = (unsigned long) fnew;
+
+ if (fold) {
+ list_replace_rcu(&fold->link, &fnew->link);
+ call_rcu(&fold->rcu, basic_delete_filter);
+ } else {
+ list_add_rcu(&fnew->link, &head->flist);
+ }
return 0;
errout:
- if (*arg == 0UL && f)
- kfree(f);
-
+ kfree(fnew);
return err;
}
static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
- struct basic_head *head = (struct basic_head *) tp->root;
+ struct basic_head *head = rtnl_dereference(tp->root);
struct basic_filter *f;
list_for_each_entry(f, &head->flist, link) {
* [RFC PATCH 04/12] net: sched: cls_cgroup use RCU
2014-01-10 9:36 [RFC PATCH 00/12] RCU'ify the net:sched classifier chains John Fastabend
` (2 preceding siblings ...)
2014-01-10 9:38 ` [RFC PATCH 03/12] net: sched: cls_basic use RCU John Fastabend
@ 2014-01-10 9:38 ` John Fastabend
2014-01-10 9:39 ` [RFC PATCH 05/12] net: sched: cls_flow " John Fastabend
` (9 subsequent siblings)
13 siblings, 0 replies; 20+ messages in thread
From: John Fastabend @ 2014-01-10 9:38 UTC (permalink / raw)
To: xiyou.wangcong, jhs, eric.dumazet; +Cc: netdev, davem
Make the cgroup classifier safe for RCU.

This also drops the rcu_read_lock()/rcu_read_unlock() pair in the
classify routine. If rcu_read_lock() is not already held when entering
this routine we have bigger problems with deleting the classifier chain,
so remove the unnecessary pair, noting that as far as I know all paths
already hold rcu_read_lock.

If there is a case where classify is called without the RCU read lock,
an RCU splat will occur and we can correct it then.
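The update side follows the usual replace-and-retire pattern, roughly
(a sketch paraphrasing the change below, with error handling trimmed;
the variable names are illustrative):

    /* change path, RTNL held: build a new head, publish it, retire the old */
    struct cls_cgroup_head *old = rtnl_dereference(tp->root);
    struct cls_cgroup_head *new = kzalloc(sizeof(*new), GFP_KERNEL);

    if (!new)
        return -ENOBUFS;
    /* ... fill in new->handle, new->exts, new->ematches ... */
    rcu_assign_pointer(tp->root, new);                   /* readers switch atomically */
    if (old)
        call_rcu(&old->rcu, cls_cgroup_destroy_rcu);     /* free after a grace period */

    /* classify path: already under rcu_read_lock_bh(), so just dereference */
    struct cls_cgroup_head *head = rcu_dereference_bh(tp->root);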
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
net/sched/cls_cgroup.c | 65 ++++++++++++++++++++++++++++++------------------
1 file changed, 41 insertions(+), 24 deletions(-)
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index f9d21258..7644912 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -118,17 +118,17 @@ struct cls_cgroup_head {
u32 handle;
struct tcf_exts exts;
struct tcf_ematch_tree ematches;
+ struct rcu_head rcu;
+ struct tcf_proto *tp;
};
static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
- struct cls_cgroup_head *head = tp->root;
+ struct cls_cgroup_head *head = rcu_dereference_bh(tp->root);
u32 classid;
- rcu_read_lock();
classid = task_cls_state(current)->classid;
- rcu_read_unlock();
/*
* Due to the nature of the classifier it is required to ignore all
@@ -176,13 +176,27 @@ static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = {
[TCA_CGROUP_EMATCHES] = { .type = NLA_NESTED },
};
+static void cls_cgroup_destroy_rcu(struct rcu_head *root)
+{
+ struct cls_cgroup_head *head = container_of(root,
+ struct cls_cgroup_head,
+ rcu);
+
+ if (head) {
+ tcf_exts_destroy(head->tp, &head->exts);
+ tcf_em_tree_destroy(head->tp, &head->ematches);
+ kfree(head);
+ }
+}
+
static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base,
u32 handle, struct nlattr **tca,
unsigned long *arg)
{
struct nlattr *tb[TCA_CGROUP_MAX + 1];
- struct cls_cgroup_head *head = tp->root;
+ struct cls_cgroup_head *head = rtnl_dereference(tp->root);
+ struct cls_cgroup_head *new;
struct tcf_ematch_tree t;
struct tcf_exts e;
int err;
@@ -190,25 +204,24 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
if (!tca[TCA_OPTIONS])
return -EINVAL;
- if (head == NULL) {
- if (!handle)
- return -EINVAL;
+ if (!head && !handle)
+ return -EINVAL;
- head = kzalloc(sizeof(*head), GFP_KERNEL);
- if (head == NULL)
- return -ENOBUFS;
+ if (head && handle != head->handle)
+ return -ENOENT;
- tcf_exts_init(&head->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
- head->handle = handle;
+ new = kzalloc(sizeof(*head), GFP_KERNEL);
+ if (!new)
+ return -ENOBUFS;
- tcf_tree_lock(tp);
- tp->root = head;
- tcf_tree_unlock(tp);
+ if (head) {
+ new->handle = head->handle;
+ } else {
+ tcf_exts_init(&new->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
+ new->handle = handle;
}
- if (handle != head->handle)
- return -ENOENT;
-
+ new->tp = tp;
err = nla_parse_nested(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS],
cgroup_policy);
if (err < 0)
@@ -223,20 +236,24 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
if (err < 0)
return err;
- tcf_exts_change(tp, &head->exts, &e);
- tcf_em_tree_change(tp, &head->ematches, &t);
+ tcf_exts_change(tp, &new->exts, &e);
+ tcf_em_tree_change(tp, &new->ematches, &t);
+ rcu_assign_pointer(tp->root, new);
+ if (head)
+ call_rcu(&head->rcu, cls_cgroup_destroy_rcu);
return 0;
}
static void cls_cgroup_destroy(struct tcf_proto *tp)
{
- struct cls_cgroup_head *head = tp->root;
+ struct cls_cgroup_head *head = rtnl_dereference(tp->root);
if (head) {
tcf_exts_destroy(tp, &head->exts);
tcf_em_tree_destroy(tp, &head->ematches);
- kfree(head);
+ rcu_assign_pointer(tp->root, NULL);
+ kfree_rcu(head, rcu);
}
}
@@ -247,7 +264,7 @@ static int cls_cgroup_delete(struct tcf_proto *tp, unsigned long arg)
static void cls_cgroup_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
- struct cls_cgroup_head *head = tp->root;
+ struct cls_cgroup_head *head = rtnl_dereference(tp->root);
if (arg->count < arg->skip)
goto skip;
@@ -263,7 +280,7 @@ skip:
static int cls_cgroup_dump(struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
- struct cls_cgroup_head *head = tp->root;
+ struct cls_cgroup_head *head = rtnl_dereference(tp->root);
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
* [RFC PATCH 05/12] net: sched: cls_flow use RCU
2014-01-10 9:36 [RFC PATCH 00/12] RCU'ify the net:sched classifier chains John Fastabend
` (3 preceding siblings ...)
2014-01-10 9:38 ` [RFC PATCH 04/12] net: sched: cls_cgroup " John Fastabend
@ 2014-01-10 9:39 ` John Fastabend
2014-01-10 9:39 ` [RFC PATCH 06/12] net: sched: fw " John Fastabend
` (8 subsequent siblings)
13 siblings, 0 replies; 20+ messages in thread
From: John Fastabend @ 2014-01-10 9:39 UTC (permalink / raw)
To: xiyou.wangcong, jhs, eric.dumazet; +Cc: netdev, davem
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
net/sched/cls_flow.c | 145 +++++++++++++++++++++++++++++---------------------
1 file changed, 84 insertions(+), 61 deletions(-)
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index dfd18a5..632bf48 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -34,12 +34,15 @@
struct flow_head {
struct list_head filters;
+ struct rcu_head rcu;
};
struct flow_filter {
struct list_head list;
+ struct rcu_head rcu;
struct tcf_exts exts;
struct tcf_ematch_tree ematches;
+ struct tcf_proto *tp;
struct timer_list perturb_timer;
u32 perturb_period;
u32 handle;
@@ -276,14 +279,14 @@ static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow)
static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
- struct flow_head *head = tp->root;
+ struct flow_head *head = rcu_dereference_bh(tp->root);
struct flow_filter *f;
u32 keymask;
u32 classid;
unsigned int n, key;
int r;
- list_for_each_entry(f, &head->filters, list) {
+ list_for_each_entry_rcu(f, &head->filters, list) {
u32 keys[FLOW_KEY_MAX + 1];
struct flow_keys flow_keys;
@@ -346,13 +349,23 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = {
[TCA_FLOW_PERTURB] = { .type = NLA_U32 },
};
+static void flow_destroy_filter(struct rcu_head *head)
+{
+ struct flow_filter *f = container_of(head, struct flow_filter, rcu);
+
+ del_timer_sync(&f->perturb_timer);
+ tcf_exts_destroy(f->tp, &f->exts);
+ tcf_em_tree_destroy(f->tp, &f->ematches);
+ kfree(f);
+}
+
static int flow_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base,
u32 handle, struct nlattr **tca,
unsigned long *arg)
{
- struct flow_head *head = tp->root;
- struct flow_filter *f;
+ struct flow_head *head = rtnl_dereference(tp->root);
+ struct flow_filter *fold, *fnew;
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_FLOW_MAX + 1];
struct tcf_exts e;
@@ -401,20 +414,42 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
if (err < 0)
goto err1;
- f = (struct flow_filter *)*arg;
- if (f != NULL) {
+ err = -ENOBUFS;
+ fnew = kzalloc(sizeof(*fnew), GFP_KERNEL);
+ if (!fnew)
+ goto err2;
+
+ fold = (struct flow_filter *)*arg;
+ if (fold) {
err = -EINVAL;
- if (f->handle != handle && handle)
+ if (fold->handle != handle && handle)
goto err2;
- mode = f->mode;
+ /* Copy fold into fnew */
+ fnew->handle = fold->handle;
+ fnew->keymask = fold->keymask;
+ fnew->tp = fold->tp;
+
+ fnew->handle = fold->handle;
+ fnew->nkeys = fold->nkeys;
+ fnew->keymask = fold->keymask;
+ fnew->mode = fold->mode;
+ fnew->mask = fold->mask;
+ fnew->xor = fold->xor;
+ fnew->rshift = fold->rshift;
+ fnew->addend = fold->addend;
+ fnew->divisor = fold->divisor;
+ fnew->baseclass = fold->baseclass;
+ fnew->hashrnd = fold->hashrnd;
+
+ mode = fold->mode;
if (tb[TCA_FLOW_MODE])
mode = nla_get_u32(tb[TCA_FLOW_MODE]);
if (mode != FLOW_MODE_HASH && nkeys > 1)
goto err2;
if (mode == FLOW_MODE_HASH)
- perturb_period = f->perturb_period;
+ perturb_period = fold->perturb_period;
if (tb[TCA_FLOW_PERTURB]) {
if (mode != FLOW_MODE_HASH)
goto err2;
@@ -444,83 +479,70 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
if (TC_H_MIN(baseclass) == 0)
baseclass = TC_H_MAKE(baseclass, 1);
- err = -ENOBUFS;
- f = kzalloc(sizeof(*f), GFP_KERNEL);
- if (f == NULL)
- goto err2;
-
- f->handle = handle;
- f->mask = ~0U;
- tcf_exts_init(&f->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE);
-
- get_random_bytes(&f->hashrnd, 4);
- f->perturb_timer.function = flow_perturbation;
- f->perturb_timer.data = (unsigned long)f;
- init_timer_deferrable(&f->perturb_timer);
+ fnew->handle = handle;
+ fnew->mask = ~0U;
+ fnew->tp = tp;
+ get_random_bytes(&fnew->hashrnd, 4);
+ tcf_exts_init(&fnew->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE);
}
- tcf_exts_change(tp, &f->exts, &e);
- tcf_em_tree_change(tp, &f->ematches, &t);
+ fnew->perturb_timer.function = flow_perturbation;
+ fnew->perturb_timer.data = (unsigned long)fnew;
+ init_timer_deferrable(&fnew->perturb_timer);
- tcf_tree_lock(tp);
+ tcf_exts_change(tp, &fnew->exts, &e);
+ tcf_em_tree_change(tp, &fnew->ematches, &t);
if (tb[TCA_FLOW_KEYS]) {
- f->keymask = keymask;
- f->nkeys = nkeys;
+ fnew->keymask = keymask;
+ fnew->nkeys = nkeys;
}
- f->mode = mode;
+ fnew->mode = mode;
if (tb[TCA_FLOW_MASK])
- f->mask = nla_get_u32(tb[TCA_FLOW_MASK]);
+ fnew->mask = nla_get_u32(tb[TCA_FLOW_MASK]);
if (tb[TCA_FLOW_XOR])
- f->xor = nla_get_u32(tb[TCA_FLOW_XOR]);
+ fnew->xor = nla_get_u32(tb[TCA_FLOW_XOR]);
if (tb[TCA_FLOW_RSHIFT])
- f->rshift = nla_get_u32(tb[TCA_FLOW_RSHIFT]);
+ fnew->rshift = nla_get_u32(tb[TCA_FLOW_RSHIFT]);
if (tb[TCA_FLOW_ADDEND])
- f->addend = nla_get_u32(tb[TCA_FLOW_ADDEND]);
+ fnew->addend = nla_get_u32(tb[TCA_FLOW_ADDEND]);
if (tb[TCA_FLOW_DIVISOR])
- f->divisor = nla_get_u32(tb[TCA_FLOW_DIVISOR]);
+ fnew->divisor = nla_get_u32(tb[TCA_FLOW_DIVISOR]);
if (baseclass)
- f->baseclass = baseclass;
+ fnew->baseclass = baseclass;
- f->perturb_period = perturb_period;
- del_timer(&f->perturb_timer);
+ fnew->perturb_period = perturb_period;
if (perturb_period)
- mod_timer(&f->perturb_timer, jiffies + perturb_period);
+ mod_timer(&fnew->perturb_timer, jiffies + perturb_period);
if (*arg == 0)
- list_add_tail(&f->list, &head->filters);
+ list_add_tail_rcu(&fnew->list, &head->filters);
+ else
+ list_replace_rcu(&fold->list, &fnew->list);
- tcf_tree_unlock(tp);
+ *arg = (unsigned long)fnew;
- *arg = (unsigned long)f;
+ if (fold)
+ call_rcu(&fold->rcu, flow_destroy_filter);
return 0;
err2:
tcf_em_tree_destroy(tp, &t);
+ kfree(fnew);
err1:
tcf_exts_destroy(tp, &e);
return err;
}
-static void flow_destroy_filter(struct tcf_proto *tp, struct flow_filter *f)
-{
- del_timer_sync(&f->perturb_timer);
- tcf_exts_destroy(tp, &f->exts);
- tcf_em_tree_destroy(tp, &f->ematches);
- kfree(f);
-}
-
static int flow_delete(struct tcf_proto *tp, unsigned long arg)
{
struct flow_filter *f = (struct flow_filter *)arg;
- tcf_tree_lock(tp);
- list_del(&f->list);
- tcf_tree_unlock(tp);
- flow_destroy_filter(tp, f);
+ list_del_rcu(&f->list);
+ call_rcu(&f->rcu, flow_destroy_filter);
return 0;
}
@@ -532,28 +554,29 @@ static int flow_init(struct tcf_proto *tp)
if (head == NULL)
return -ENOBUFS;
INIT_LIST_HEAD(&head->filters);
- tp->root = head;
+ rcu_assign_pointer(tp->root, head);
return 0;
}
static void flow_destroy(struct tcf_proto *tp)
{
- struct flow_head *head = tp->root;
+ struct flow_head *head = rtnl_dereference(tp->root);
struct flow_filter *f, *next;
list_for_each_entry_safe(f, next, &head->filters, list) {
- list_del(&f->list);
- flow_destroy_filter(tp, f);
+ list_del_rcu(&f->list);
+ call_rcu(&f->rcu, flow_destroy_filter);
}
- kfree(head);
+ rcu_assign_pointer(tp->root, NULL);
+ kfree_rcu(head, rcu);
}
static unsigned long flow_get(struct tcf_proto *tp, u32 handle)
{
- struct flow_head *head = tp->root;
+ struct flow_head *head = rtnl_dereference(tp->root);
struct flow_filter *f;
- list_for_each_entry(f, &head->filters, list)
+ list_for_each_entry_rcu(f, &head->filters, list)
if (f->handle == handle)
return (unsigned long)f;
return 0;
@@ -626,10 +649,10 @@ nla_put_failure:
static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
- struct flow_head *head = tp->root;
+ struct flow_head *head = rtnl_dereference(tp->root);
struct flow_filter *f;
- list_for_each_entry(f, &head->filters, list) {
+ list_for_each_entry_rcu(f, &head->filters, list) {
if (arg->count < arg->skip)
goto skip;
if (arg->fn(tp, (unsigned long)f, arg) < 0) {
* [RFC PATCH 06/12] net: sched: fw use RCU
2014-01-10 9:36 [RFC PATCH 00/12] RCU'ify the net:sched classifier chains John Fastabend
` (4 preceding siblings ...)
2014-01-10 9:39 ` [RFC PATCH 05/12] net: sched: cls_flow " John Fastabend
@ 2014-01-10 9:39 ` John Fastabend
2014-01-10 9:41 ` [RFC PATCH 07/12] net: sched: RCU cls_route John Fastabend
` (7 subsequent siblings)
13 siblings, 0 replies; 20+ messages in thread
From: John Fastabend @ 2014-01-10 9:39 UTC (permalink / raw)
To: xiyou.wangcong, jhs, eric.dumazet; +Cc: netdev, davem
RCU'ify fw classifier.
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
net/sched/cls_fw.c | 112 ++++++++++++++++++++++++++++++++++++----------------
1 file changed, 77 insertions(+), 35 deletions(-)
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 3f9cece..6f7680f 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -32,18 +32,21 @@
#define HTSIZE (PAGE_SIZE/sizeof(struct fw_filter *))
struct fw_head {
- struct fw_filter *ht[HTSIZE];
+ struct rcu_head rcu;
+ struct fw_filter __rcu *ht[HTSIZE];
u32 mask;
};
struct fw_filter {
- struct fw_filter *next;
+ struct fw_filter __rcu *next;
+ struct rcu_head rcu;
u32 id;
struct tcf_result res;
#ifdef CONFIG_NET_CLS_IND
char indev[IFNAMSIZ];
#endif /* CONFIG_NET_CLS_IND */
struct tcf_exts exts;
+ struct tcf_proto *tp;
};
static inline int fw_hash(u32 handle)
@@ -75,14 +78,16 @@ static inline int fw_hash(u32 handle)
static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
- struct fw_head *head = (struct fw_head *)tp->root;
+ struct fw_head *head = (struct fw_head *)rcu_dereference_bh(tp->root);
struct fw_filter *f;
int r;
u32 id = skb->mark;
if (head != NULL) {
id &= head->mask;
- for (f = head->ht[fw_hash(id)]; f; f = f->next) {
+
+ for (f = rcu_dereference_bh(head->ht[fw_hash(id)]); f;
+ f = rcu_dereference_bh(f->next)) {
if (f->id == id) {
*res = f->res;
#ifdef CONFIG_NET_CLS_IND
@@ -111,13 +116,14 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,
static unsigned long fw_get(struct tcf_proto *tp, u32 handle)
{
- struct fw_head *head = (struct fw_head *)tp->root;
+ struct fw_head *head = (struct fw_head *)rtnl_dereference(tp->root);
struct fw_filter *f;
if (head == NULL)
return 0;
- for (f = head->ht[fw_hash(handle)]; f; f = f->next) {
+ f = rtnl_dereference(head->ht[fw_hash(handle)]);
+ for (; f; f = rtnl_dereference(f->next)) {
if (f->id == handle)
return (unsigned long)f;
}
@@ -133,8 +139,11 @@ static int fw_init(struct tcf_proto *tp)
return 0;
}
-static void fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f)
+static void fw_delete_filter(struct rcu_head *head)
{
+ struct fw_filter *f = container_of(head, struct fw_filter, rcu);
+ struct tcf_proto *tp = f->tp;
+
tcf_unbind_filter(tp, &f->res);
tcf_exts_destroy(tp, &f->exts);
kfree(f);
@@ -142,7 +151,7 @@ static void fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f)
static void fw_destroy(struct tcf_proto *tp)
{
- struct fw_head *head = tp->root;
+ struct fw_head *head = rtnl_dereference(tp->root);
struct fw_filter *f;
int h;
@@ -150,29 +159,32 @@ static void fw_destroy(struct tcf_proto *tp)
return;
for (h = 0; h < HTSIZE; h++) {
- while ((f = head->ht[h]) != NULL) {
- head->ht[h] = f->next;
- fw_delete_filter(tp, f);
+ while ((f = rtnl_dereference(head->ht[h])) != NULL) {
+ rcu_assign_pointer(head->ht[h],
+ rtnl_dereference(f->next));
+ call_rcu(&f->rcu, fw_delete_filter);
}
}
- kfree(head);
+ rcu_assign_pointer(tp->root, NULL);
+ kfree_rcu(head, rcu);
}
static int fw_delete(struct tcf_proto *tp, unsigned long arg)
{
- struct fw_head *head = (struct fw_head *)tp->root;
- struct fw_filter *f = (struct fw_filter *)arg;
- struct fw_filter **fp;
+ struct fw_head *head = (struct fw_head *)rtnl_dereference(tp->root);
+ struct fw_filter *pfp, *f = (struct fw_filter *)arg;
+ struct fw_filter __rcu **fp;
if (head == NULL || f == NULL)
goto out;
- for (fp = &head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) {
- if (*fp == f) {
- tcf_tree_lock(tp);
- *fp = f->next;
- tcf_tree_unlock(tp);
- fw_delete_filter(tp, f);
+ fp = &head->ht[fw_hash(f->id)];
+
+ for (pfp = rtnl_dereference(*fp); pfp;
+ fp = &pfp->next, pfp = rtnl_dereference(*fp)) {
+ if (pfp == f) {
+ rcu_assign_pointer(*fp, rtnl_dereference(f->next));
+ call_rcu(&f->rcu, fw_delete_filter);
return 0;
}
}
@@ -190,7 +202,7 @@ static int
fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f,
struct nlattr **tb, struct nlattr **tca, unsigned long base)
{
- struct fw_head *head = (struct fw_head *)tp->root;
+ struct fw_head *head = (struct fw_head *)rtnl_dereference(tp->root);
struct tcf_exts e;
u32 mask;
int err;
@@ -235,7 +247,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
struct nlattr **tca,
unsigned long *arg)
{
- struct fw_head *head = (struct fw_head *)tp->root;
+ struct fw_head *head = (struct fw_head *)rtnl_dereference(tp->root);
struct fw_filter *f = (struct fw_filter *) *arg;
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_FW_MAX + 1];
@@ -248,10 +260,42 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
if (err < 0)
return err;
- if (f != NULL) {
+ if (f) {
+ struct fw_filter *pfp, *fnew;
+ struct fw_filter __rcu **fp;
+
if (f->id != handle && handle)
return -EINVAL;
- return fw_change_attrs(net, tp, f, tb, tca, base);
+
+ fnew = kzalloc(sizeof(struct fw_filter), GFP_KERNEL);
+ if (!fnew)
+ return -ENOBUFS;
+
+ fnew->id = f->id;
+ fnew->res = f->res;
+#ifdef CONFIG_NET_CLS_IND
+ strcpy(fnew->indev, f->indev);
+#endif /* CONFIG_NET_CLS_IND */
+ fnew->tp = f->tp;
+
+ err = fw_change_attrs(net, tp, fnew, tb, tca, base);
+ if (err < 0) {
+ kfree(fnew);
+ return err;
+ }
+
+ fp = &head->ht[fw_hash(fnew->id)];
+ for (pfp = rtnl_dereference(*fp); pfp;
+ fp = &pfp->next, pfp = rtnl_dereference(*fp))
+ if (pfp == f)
+ break;
+
+ rcu_assign_pointer(fnew->next, rtnl_dereference(pfp->next));
+ rcu_assign_pointer(*fp, fnew);
+ call_rcu(&f->rcu, fw_delete_filter);
+
+ *arg = (unsigned long)fnew;
+ return err;
}
if (!handle)
@@ -267,9 +311,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
return -ENOBUFS;
head->mask = mask;
- tcf_tree_lock(tp);
- tp->root = head;
- tcf_tree_unlock(tp);
+ rcu_assign_pointer(tp->root, head);
}
f = kzalloc(sizeof(struct fw_filter), GFP_KERNEL);
@@ -278,15 +320,14 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE);
f->id = handle;
+ f->tp = tp;
err = fw_change_attrs(net, tp, f, tb, tca, base);
if (err < 0)
goto errout;
- f->next = head->ht[fw_hash(handle)];
- tcf_tree_lock(tp);
- head->ht[fw_hash(handle)] = f;
- tcf_tree_unlock(tp);
+ rcu_assign_pointer(f->next, head->ht[fw_hash(handle)]);
+ rcu_assign_pointer(head->ht[fw_hash(handle)], f);
*arg = (unsigned long)f;
return 0;
@@ -298,7 +339,7 @@ errout:
static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
- struct fw_head *head = (struct fw_head *)tp->root;
+ struct fw_head *head = (struct fw_head *)rtnl_dereference(tp->root);
int h;
if (head == NULL)
@@ -310,7 +351,8 @@ static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
for (h = 0; h < HTSIZE; h++) {
struct fw_filter *f;
- for (f = head->ht[h]; f; f = f->next) {
+ for (f = rtnl_dereference(head->ht[h]); f;
+ f = rtnl_dereference(f->next)) {
if (arg->count < arg->skip) {
arg->count++;
continue;
@@ -327,7 +369,7 @@ static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
static int fw_dump(struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
- struct fw_head *head = (struct fw_head *)tp->root;
+ struct fw_head *head = (struct fw_head *)rtnl_dereference(tp->root);
struct fw_filter *f = (struct fw_filter *)fh;
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
* [RFC PATCH 07/12] net: sched: RCU cls_route
2014-01-10 9:36 [RFC PATCH 00/12] RCU'ify the net:sched classifier chains John Fastabend
` (5 preceding siblings ...)
2014-01-10 9:39 ` [RFC PATCH 06/12] net: sched: fw " John Fastabend
@ 2014-01-10 9:41 ` John Fastabend
2014-01-10 9:42 ` [RFC PATCH 08/12] net: sched: RCU cls_tcindex John Fastabend
` (6 subsequent siblings)
13 siblings, 0 replies; 20+ messages in thread
From: John Fastabend @ 2014-01-10 9:41 UTC (permalink / raw)
To: xiyou.wangcong, jhs, eric.dumazet; +Cc: netdev, davem
RCU'ify the route classifier. For now, however, a spinlock is used to
protect the fastmap cache.
The issue here is that the fastmap may be read by one CPU while the
cache is being updated by another. An array of pointers could be
one possible solution.
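As a rough sketch of the pairing described above (all names below are
illustrative, they are not taken from this patch), both the lookup and
the update take the same lock because the {id, iif, filter} triple
cannot be published with a single pointer store:

#include <linux/types.h>
#include <linux/spinlock.h>

struct route4_filter;			/* defined in cls_route.c */

static DEFINE_SPINLOCK(fmap_lock);

struct fmap_entry {
	u32			id;
	int			iif;
	struct route4_filter	*filter;
};

static struct fmap_entry fmap[16];

static void fmap_set(int h, u32 id, int iif, struct route4_filter *f)
{
	/* all three words must change together, hence the lock */
	spin_lock(&fmap_lock);
	fmap[h].id = id;
	fmap[h].iif = iif;
	fmap[h].filter = f;
	spin_unlock(&fmap_lock);
}

static struct route4_filter *fmap_get(int h, u32 id, int iif)
{
	struct route4_filter *f = NULL;

	spin_lock(&fmap_lock);
	if (fmap[h].id == id && fmap[h].iif == iif)
		f = fmap[h].filter;
	spin_unlock(&fmap_lock);

	return f;
}

The array-of-pointers alternative mentioned above would shrink each
entry to a single RCU-protected pointer to a kmalloc'ed {id, iif,
filter} triple, so fmap_get() could use rcu_dereference() and drop the
lock entirely.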
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
net/sched/cls_route.c | 218 ++++++++++++++++++++++++++++---------------------
1 file changed, 125 insertions(+), 93 deletions(-)
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 2473953..6ba8f43 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -29,25 +29,26 @@
* are mutually exclusive.
* 3. "to TAG from ANY" has higher priority, than "to ANY from XXX"
*/
-
struct route4_fastmap {
- struct route4_filter *filter;
- u32 id;
- int iif;
+ struct route4_filter *filter;
+ u32 id;
+ int iif;
};
struct route4_head {
- struct route4_fastmap fastmap[16];
- struct route4_bucket *table[256 + 1];
+ struct route4_fastmap fastmap[16];
+ struct route4_bucket __rcu *table[256 + 1];
+ struct rcu_head rcu;
};
struct route4_bucket {
/* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */
- struct route4_filter *ht[16 + 16 + 1];
+ struct route4_filter __rcu *ht[16 + 16 + 1];
+ struct rcu_head rcu;
};
struct route4_filter {
- struct route4_filter *next;
+ struct route4_filter __rcu *next;
u32 id;
int iif;
@@ -55,6 +56,8 @@ struct route4_filter {
struct tcf_exts exts;
u32 handle;
struct route4_bucket *bkt;
+ struct tcf_proto *tp;
+ struct rcu_head rcu;
};
#define ROUTE4_FAILURE ((struct route4_filter *)(-1L))
@@ -64,14 +67,13 @@ static inline int route4_fastmap_hash(u32 id, int iif)
return id & 0xF;
}
+static DEFINE_SPINLOCK(fastmap_lock);
static void
-route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id)
+route4_reset_fastmap(struct route4_head *head)
{
- spinlock_t *root_lock = qdisc_root_sleeping_lock(q);
-
- spin_lock_bh(root_lock);
+ spin_lock(&fastmap_lock);
memset(head->fastmap, 0, sizeof(head->fastmap));
- spin_unlock_bh(root_lock);
+ spin_unlock(&fastmap_lock);
}
static void
@@ -80,9 +82,12 @@ route4_set_fastmap(struct route4_head *head, u32 id, int iif,
{
int h = route4_fastmap_hash(id, iif);
+ /* fastmap updates must look atomic to align id, iif, filter */
+ spin_lock(&fastmap_lock);
head->fastmap[h].id = id;
head->fastmap[h].iif = iif;
head->fastmap[h].filter = f;
+ spin_unlock(&fastmap_lock);
}
static inline int route4_hash_to(u32 id)
@@ -123,7 +128,7 @@ static inline int route4_hash_wild(void)
static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
- struct route4_head *head = (struct route4_head *)tp->root;
+ struct route4_head *head = (struct route4_head *)rcu_dereference_bh(tp->root);
struct dst_entry *dst;
struct route4_bucket *b;
struct route4_filter *f;
@@ -141,32 +146,43 @@ static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp,
iif = inet_iif(skb);
h = route4_fastmap_hash(id, iif);
+
+ spin_lock(&fastmap_lock);
if (id == head->fastmap[h].id &&
iif == head->fastmap[h].iif &&
(f = head->fastmap[h].filter) != NULL) {
- if (f == ROUTE4_FAILURE)
+ if (f == ROUTE4_FAILURE) {
+ spin_unlock(&fastmap_lock);
goto failure;
+ }
*res = f->res;
+ spin_unlock(&fastmap_lock);
return 0;
}
+ spin_unlock(&fastmap_lock);
h = route4_hash_to(id);
restart:
- b = head->table[h];
+ b = rcu_dereference_bh(head->table[h]);
if (b) {
- for (f = b->ht[route4_hash_from(id)]; f; f = f->next)
+ for (f = rcu_dereference_bh(b->ht[route4_hash_from(id)]);
+ f;
+ f = rcu_dereference_bh(f->next))
if (f->id == id)
ROUTE4_APPLY_RESULT();
- for (f = b->ht[route4_hash_iif(iif)]; f; f = f->next)
+ for (f = rcu_dereference_bh(b->ht[route4_hash_iif(iif)]);
+ f;
+ f = rcu_dereference_bh(f->next))
if (f->iif == iif)
ROUTE4_APPLY_RESULT();
- for (f = b->ht[route4_hash_wild()]; f; f = f->next)
+ for (f = rcu_dereference_bh(b->ht[route4_hash_wild()]);
+ f;
+ f = rcu_dereference_bh(f->next))
ROUTE4_APPLY_RESULT();
-
}
if (h < 256) {
h = 256;
@@ -213,7 +229,7 @@ static inline u32 from_hash(u32 id)
static unsigned long route4_get(struct tcf_proto *tp, u32 handle)
{
- struct route4_head *head = (struct route4_head *)tp->root;
+ struct route4_head *head = (struct route4_head *)rtnl_dereference(tp->root);
struct route4_bucket *b;
struct route4_filter *f;
unsigned int h1, h2;
@@ -229,9 +245,11 @@ static unsigned long route4_get(struct tcf_proto *tp, u32 handle)
if (h2 > 32)
return 0;
- b = head->table[h1];
+ b = rtnl_dereference(head->table[h1]);
if (b) {
- for (f = b->ht[h2]; f; f = f->next)
+ for (f = rtnl_dereference(b->ht[h2]);
+ f;
+ f = rtnl_dereference(f->next))
if (f->handle == handle)
return (unsigned long)f;
}
@@ -248,8 +266,11 @@ static int route4_init(struct tcf_proto *tp)
}
static void
-route4_delete_filter(struct tcf_proto *tp, struct route4_filter *f)
+route4_delete_filter(struct rcu_head *head)
{
+ struct route4_filter *f = container_of(head, struct route4_filter, rcu);
+ struct tcf_proto *tp = f->tp;
+
tcf_unbind_filter(tp, &f->res);
tcf_exts_destroy(tp, &f->exts);
kfree(f);
@@ -257,7 +278,7 @@ route4_delete_filter(struct tcf_proto *tp, struct route4_filter *f)
static void route4_destroy(struct tcf_proto *tp)
{
- struct route4_head *head = tp->root;
+ struct route4_head *head = rtnl_dereference(tp->root);
int h1, h2;
if (head == NULL)
@@ -266,26 +287,29 @@ static void route4_destroy(struct tcf_proto *tp)
for (h1 = 0; h1 <= 256; h1++) {
struct route4_bucket *b;
- b = head->table[h1];
+ b = rtnl_dereference(head->table[h1]);
if (b) {
for (h2 = 0; h2 <= 32; h2++) {
struct route4_filter *f;
- while ((f = b->ht[h2]) != NULL) {
- b->ht[h2] = f->next;
- route4_delete_filter(tp, f);
+ while ((f = rtnl_dereference(b->ht[h2])) != NULL) {
+ rcu_assign_pointer(b->ht[h2], rtnl_dereference(f->next));
+ call_rcu(&f->rcu, route4_delete_filter);
}
}
- kfree(b);
+ rcu_assign_pointer(head->table[h1], NULL);
+ kfree_rcu(b, rcu);
}
}
- kfree(head);
+ rcu_assign_pointer(tp->root, NULL);
+ kfree_rcu(head, rcu);
}
static int route4_delete(struct tcf_proto *tp, unsigned long arg)
{
- struct route4_head *head = (struct route4_head *)tp->root;
- struct route4_filter **fp, *f = (struct route4_filter *)arg;
+ struct route4_head *head = (struct route4_head *)rtnl_dereference(tp->root);
+ struct route4_filter __rcu **fp;
+ struct route4_filter *nf, *f = (struct route4_filter *)arg;
unsigned int h = 0;
struct route4_bucket *b;
int i;
@@ -296,27 +320,34 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg)
h = f->handle;
b = f->bkt;
- for (fp = &b->ht[from_hash(h >> 16)]; *fp; fp = &(*fp)->next) {
- if (*fp == f) {
- tcf_tree_lock(tp);
+ fp = &b->ht[from_hash(h >> 16)];
+ for (nf = rtnl_dereference(*fp); nf;
+ fp = &nf->next, nf = rtnl_dereference(*fp)) {
+ if (nf == f) {
+ /* unlink it */
*fp = f->next;
- tcf_tree_unlock(tp);
- route4_reset_fastmap(tp->q, head, f->id);
- route4_delete_filter(tp, f);
+ /* Remove any fastmap lookups that might ref the filter;
+ * note that the filter was unlinked above so it cannot get
+ * back into the fastmap.
+ */
+ route4_reset_fastmap(head);
- /* Strip tree */
+ /* Delete it */
+ call_rcu(&f->rcu, route4_delete_filter);
- for (i = 0; i <= 32; i++)
- if (b->ht[i])
+ /* Strip RTNL protected tree */
+ for (i = 0; i <= 32; i++) {
+ struct route4_filter *rt = rtnl_dereference(b->ht[i]);
+
+ if (rt)
return 0;
+ }
/* OK, session has no flows */
- tcf_tree_lock(tp);
- head->table[to_hash(h)] = NULL;
- tcf_tree_unlock(tp);
+ rcu_assign_pointer(head->table[to_hash(h)], NULL);
+ kfree_rcu(b, rcu);
- kfree(b);
return 0;
}
}
@@ -379,26 +410,25 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp,
}
h1 = to_hash(nhandle);
- b = head->table[h1];
+ b = rtnl_dereference(head->table[h1]);
if (!b) {
err = -ENOBUFS;
b = kzalloc(sizeof(struct route4_bucket), GFP_KERNEL);
if (b == NULL)
goto errout;
- tcf_tree_lock(tp);
- head->table[h1] = b;
- tcf_tree_unlock(tp);
+ rcu_assign_pointer(head->table[h1], b);
} else {
unsigned int h2 = from_hash(nhandle >> 16);
err = -EEXIST;
- for (fp = b->ht[h2]; fp; fp = fp->next)
+ for (fp = rtnl_dereference(b->ht[h2]);
+ fp;
+ fp = rtnl_dereference(fp->next))
if (fp->handle == f->handle)
goto errout;
}
- tcf_tree_lock(tp);
if (tb[TCA_ROUTE4_TO])
f->id = to;
@@ -409,7 +439,7 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp,
f->handle = nhandle;
f->bkt = b;
- tcf_tree_unlock(tp);
+ f->tp = tp;
if (tb[TCA_ROUTE4_CLASSID]) {
f->res.classid = nla_get_u32(tb[TCA_ROUTE4_CLASSID]);
@@ -430,14 +460,15 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
struct nlattr **tca,
unsigned long *arg)
{
- struct route4_head *head = tp->root;
- struct route4_filter *f, *f1, **fp;
+ struct route4_head *head = rtnl_dereference(tp->root);
+ struct route4_filter __rcu **fp;
+ struct route4_filter *fold, *f1, *pfp, *f = NULL;
struct route4_bucket *b;
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_ROUTE4_MAX + 1];
unsigned int h, th;
- u32 old_handle = 0;
int err;
+ bool new = true;
if (opt == NULL)
return handle ? -EINVAL : 0;
@@ -446,70 +477,69 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
if (err < 0)
return err;
- f = (struct route4_filter *)*arg;
- if (f) {
- if (f->handle != handle && handle)
+ fold = (struct route4_filter *)*arg;
+ if (fold && handle && fold->handle != handle)
return -EINVAL;
-
- if (f->bkt)
- old_handle = f->handle;
-
- err = route4_set_parms(net, tp, base, f, handle, head, tb,
- tca[TCA_RATE], 0);
- if (err < 0)
- return err;
-
- goto reinsert;
- }
-
err = -ENOBUFS;
if (head == NULL) {
head = kzalloc(sizeof(struct route4_head), GFP_KERNEL);
if (head == NULL)
goto errout;
-
- tcf_tree_lock(tp);
- tp->root = head;
- tcf_tree_unlock(tp);
+ rcu_assign_pointer(tp->root, head);
}
f = kzalloc(sizeof(struct route4_filter), GFP_KERNEL);
- if (f == NULL)
+ if (!f)
goto errout;
tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
+ if (fold) {
+ f->id = fold->id;
+ f->iif = fold->iif;
+ f->res = fold->res;
+ f->handle = fold->handle;
+
+ f->tp = fold->tp;
+ f->bkt = fold->bkt;
+ new = false;
+ }
+
err = route4_set_parms(net, tp, base, f, handle, head, tb,
- tca[TCA_RATE], 1);
+ tca[TCA_RATE], new);
if (err < 0)
goto errout;
-reinsert:
h = from_hash(f->handle >> 16);
- for (fp = &f->bkt->ht[h]; (f1 = *fp) != NULL; fp = &f1->next)
+ fp = &f->bkt->ht[h];
+ for (pfp = rtnl_dereference(*fp);
+ (f1 = rtnl_dereference(*fp)) != NULL;
+ fp = &f1->next)
if (f->handle < f1->handle)
break;
- f->next = f1;
- tcf_tree_lock(tp);
- *fp = f;
+ rcu_assign_pointer(f->next, f1);
+ rcu_assign_pointer(*fp, f);
- if (old_handle && f->handle != old_handle) {
- th = to_hash(old_handle);
- h = from_hash(old_handle >> 16);
- b = head->table[th];
+ if (fold && fold->handle && f->handle != fold->handle) {
+ th = to_hash(fold->handle);
+ h = from_hash(fold->handle >> 16);
+ b = rtnl_dereference(head->table[th]);
if (b) {
- for (fp = &b->ht[h]; *fp; fp = &(*fp)->next) {
- if (*fp == f) {
+ fp = &b->ht[h];
+ for (pfp = rtnl_dereference(*fp); pfp;
+ fp = &pfp->next, pfp = rtnl_dereference(*fp)) {
+ if (pfp == f) {
*fp = f->next;
break;
}
}
}
}
- tcf_tree_unlock(tp);
- route4_reset_fastmap(tp->q, head, f->id);
+ route4_reset_fastmap(head);
*arg = (unsigned long)f;
+ if (fold)
+ call_rcu(&fold->rcu, route4_delete_filter);
return 0;
errout:
@@ -519,7 +549,7 @@ errout:
static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
- struct route4_head *head = tp->root;
+ struct route4_head *head = rtnl_dereference(tp->root);
unsigned int h, h1;
if (head == NULL)
@@ -529,13 +559,15 @@ static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
return;
for (h = 0; h <= 256; h++) {
- struct route4_bucket *b = head->table[h];
+ struct route4_bucket *b = rtnl_dereference(head->table[h]);
if (b) {
for (h1 = 0; h1 <= 32; h1++) {
struct route4_filter *f;
- for (f = b->ht[h1]; f; f = f->next) {
+ for (f = rtnl_dereference(b->ht[h1]);
+ f;
+ f = rtnl_dereference(f->next)) {
if (arg->count < arg->skip) {
arg->count++;
continue;
* [RFC PATCH 08/12] net: sched: RCU cls_tcindex
2014-01-10 9:36 [RFC PATCH 00/12] RCU'ify the net:sched classifier chains John Fastabend
` (6 preceding siblings ...)
2014-01-10 9:41 ` [RFC PATCH 07/12] net: sched: RCU cls_route John Fastabend
@ 2014-01-10 9:42 ` John Fastabend
2014-01-10 9:42 ` [RFC PATCH 09/12] net: sched: make cls_u32 lockless John Fastabend
` (5 subsequent siblings)
13 siblings, 0 replies; 20+ messages in thread
From: John Fastabend @ 2014-01-10 9:42 UTC (permalink / raw)
To: xiyou.wangcong, jhs, eric.dumazet; +Cc: netdev, davem
Make cls_tcindex RCU safe.
This patch adds a new RCU routine, rcu_dereference_bh_rtnl(), to check
that the caller holds either the RCU BH read lock or RTNL. This is
needed to handle the case where tcindex_lookup() is called from both
contexts.
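For illustration, a minimal sketch of a lookup helper shared by both
contexts (everything except rcu_dereference_bh_rtnl() is a made-up
name, not code from this patch):

#include <linux/types.h>
#include <linux/rtnetlink.h>

struct item {
	struct item __rcu	*next;
	u16			key;
};

/* Called both from the softirq classify path (under rcu_read_lock_bh())
 * and from the netlink control path (under RTNL), so neither
 * rcu_dereference_bh() nor rtnl_dereference() alone keeps lockdep happy.
 */
static struct item *item_lookup(struct item __rcu **head, u16 key)
{
	struct item *i;

	for (i = rcu_dereference_bh_rtnl(*head); i;
	     i = rcu_dereference_bh_rtnl(i->next))
		if (i->key == key)
			return i;

	return NULL;
}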
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
include/linux/rtnetlink.h | 10 +
net/sched/cls_tcindex.c | 327 ++++++++++++++++++++++++++-------------------
2 files changed, 202 insertions(+), 135 deletions(-)
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 939428a..6008b22 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -37,6 +37,16 @@ extern int lockdep_rtnl_is_held(void);
rcu_dereference_check(p, lockdep_rtnl_is_held())
/**
+ * rcu_dereference_bh_rtnl - rcu_dereference_bh with debug checking
+ * @p: The pointer to read, prior to dereference
+ *
+ * Do an rcu_dereference_bh(p), but check caller either holds rcu_read_lock_bh()
+ * or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference_bh()
+ */
+#define rcu_dereference_bh_rtnl(p) \
+ rcu_dereference_bh_check(p, lockdep_rtnl_is_held())
+
+/**
* rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL
* @p: The pointer to read, prior to dereferencing
*
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index ffad187..19eb24e 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -24,7 +24,7 @@
#define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */
-#define PRIV(tp) ((struct tcindex_data *) (tp)->root)
+#define PRIV(tp) ((struct tcindex_data *) rtnl_dereference((tp)->root))
struct tcindex_filter_result {
@@ -35,19 +35,22 @@ struct tcindex_filter_result {
struct tcindex_filter {
u16 key;
struct tcindex_filter_result result;
- struct tcindex_filter *next;
+ struct tcindex_filter __rcu *next;
+ struct rcu_head rcu;
};
struct tcindex_data {
struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */
- struct tcindex_filter **h; /* imperfect hash; only used if !perfect;
- NULL if unused */
+ struct tcindex_filter __rcu **h; /* imperfect hash; only used if
+ !perfect; NULL if unused */
+ struct rcu_head rcu;
+ struct tcf_proto *tp;
u16 mask; /* AND key with mask */
- int shift; /* shift ANDed key to the right */
- int hash; /* hash table size; 0 if undefined */
- int alloc_hash; /* allocated size */
- int fall_through; /* 0: only classify if explicit match */
+ u32 shift; /* shift ANDed key to the right */
+ u32 hash; /* hash table size; 0 if undefined */
+ u32 alloc_hash; /* allocated size */
+ u32 fall_through; /* 0: only classify if explicit match */
};
static inline int
@@ -59,13 +62,18 @@ tcindex_filter_is_set(struct tcindex_filter_result *r)
static struct tcindex_filter_result *
tcindex_lookup(struct tcindex_data *p, u16 key)
{
- struct tcindex_filter *f;
+ if (p->perfect) {
+ struct tcindex_filter_result *f = p->perfect + key;
+
+ return tcindex_filter_is_set(f) ? f : NULL;
+ } else if (p->h) {
+ struct tcindex_filter __rcu **fp;
+ struct tcindex_filter *f;
- if (p->perfect)
- return tcindex_filter_is_set(p->perfect + key) ?
- p->perfect + key : NULL;
- else if (p->h) {
- for (f = p->h[key % p->hash]; f; f = f->next)
+ fp = &p->h[key % p->hash];
+ for (f = rcu_dereference_bh_rtnl(*fp);
+ f;
+ fp = &f->next, f = rcu_dereference_bh_rtnl(*fp))
if (f->key == key)
return &f->result;
}
@@ -77,7 +85,7 @@ tcindex_lookup(struct tcindex_data *p, u16 key)
static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
- struct tcindex_data *p = PRIV(tp);
+ struct tcindex_data *p = rcu_dereference(tp->root);
struct tcindex_filter_result *f;
int key = (skb->tc_index & p->mask) >> p->shift;
@@ -132,49 +140,113 @@ static int tcindex_init(struct tcf_proto *tp)
p->hash = DEFAULT_HASH_SIZE;
p->fall_through = 1;
- tp->root = p;
+ rcu_assign_pointer(tp->root, p);
return 0;
}
+static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker)
+{
+ struct tcindex_data *p = PRIV(tp);
+ struct tcindex_filter *f, *next;
+ int i;
+
+ pr_debug("tcindex_walk(tp %p,walker %p),p %p\n", tp, walker, p);
+ if (p->perfect) {
+ for (i = 0; i < p->hash; i++) {
+ if (!p->perfect[i].res.class)
+ continue;
+ if (walker->count >= walker->skip) {
+ if (walker->fn(tp,
+ (unsigned long) (p->perfect+i), walker)
+ < 0) {
+ walker->stop = 1;
+ return;
+ }
+ }
+ walker->count++;
+ }
+ }
+ if (!p->h)
+ return;
+ for (i = 0; i < p->hash; i++) {
+ for (f = rtnl_dereference(p->h[i]); f; f = next) {
+ next = rtnl_dereference(f->next);
+ if (walker->count >= walker->skip) {
+ if (walker->fn(tp, (unsigned long) &f->result,
+ walker) < 0) {
+ walker->stop = 1;
+ return;
+ }
+ }
+ walker->count++;
+ }
+ }
+}
static int
-__tcindex_delete(struct tcf_proto *tp, unsigned long arg, int lock)
+tcindex_delete(struct tcf_proto *tp, unsigned long arg)
{
struct tcindex_data *p = PRIV(tp);
struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg;
+ struct tcindex_filter __rcu **walk;
struct tcindex_filter *f = NULL;
- pr_debug("tcindex_delete(tp %p,arg 0x%lx),p %p,f %p\n", tp, arg, p, f);
+ pr_debug("tcindex_delete(tp %p,arg 0x%lx),p %p\n", tp, arg, p);
if (p->perfect) {
if (!r->res.class)
return -ENOENT;
} else {
int i;
- struct tcindex_filter **walk = NULL;
- for (i = 0; i < p->hash; i++)
- for (walk = p->h+i; *walk; walk = &(*walk)->next)
- if (&(*walk)->result == r)
+ for (i = 0; i < p->hash; i++) {
+ walk = p->h + i;
+ for (f = rtnl_dereference(*walk); f;
+ walk = &f->next, f = rtnl_dereference(*walk)) {
+ if (&f->result == r)
goto found;
+ }
+ }
return -ENOENT;
found:
- f = *walk;
- if (lock)
- tcf_tree_lock(tp);
- *walk = f->next;
- if (lock)
- tcf_tree_unlock(tp);
+ rcu_assign_pointer(*walk, rtnl_dereference(f->next));
}
tcf_unbind_filter(tp, &r->res);
tcf_exts_destroy(tp, &r->exts);
- kfree(f);
+ if (f)
+ kfree_rcu(f, rcu);
return 0;
}
-static int tcindex_delete(struct tcf_proto *tp, unsigned long arg)
+static int tcindex_destroy_element(struct tcf_proto *tp,
+ unsigned long arg,
+ struct tcf_walker *walker)
+{
+ return tcindex_delete(tp, arg);
+}
+
+static void __tcindex_destroy(struct rcu_head *head)
{
- return __tcindex_delete(tp, arg, 1);
+ struct tcindex_data *p = container_of(head, struct tcindex_data, rcu);
+
+ kfree(p->perfect);
+ kfree(p->h);
+ kfree(p);
+}
+
+static void tcindex_destroy(struct tcf_proto *tp)
+{
+ struct tcindex_data *p = PRIV(tp);
+ struct tcf_walker walker;
+
+ pr_debug("%s(tp %p),p %p\n", __func__, tp, p);
+ walker.count = 0;
+ walker.skip = 0;
+ walker.fn = &tcindex_destroy_element;
+ tcindex_walk(tp, &walker);
+
+ rcu_assign_pointer(tp->root, NULL);
+ call_rcu(&p->rcu, __tcindex_destroy);
}
static inline int
@@ -191,6 +263,14 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = {
[TCA_TCINDEX_CLASSID] = { .type = NLA_U32 },
};
+static void __tcindex_partial_destroy(struct rcu_head *head)
+{
+ struct tcindex_data *p = container_of(head, struct tcindex_data, rcu);
+
+ kfree(p->perfect);
+ kfree(p);
+}
+
static int
tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
u32 handle, struct tcindex_data *p,
@@ -200,7 +280,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
int err, balloc = 0;
struct tcindex_filter_result new_filter_result, *old_r = r;
struct tcindex_filter_result cr;
- struct tcindex_data cp;
+ struct tcindex_data *cp, *oldp;
struct tcindex_filter *f = NULL; /* make gcc behave */
struct tcf_exts e;
@@ -209,7 +289,27 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
if (err < 0)
return err;
- memcpy(&cp, p, sizeof(cp));
+ /* tcindex_data attributes must look atomic to classifier/lookup so
+ * allocate new tcindex data and RCU assign it onto root. Keeping
+ * perfect hash and hash pointers from old data.
+ */
+ cp = kzalloc(sizeof(*cp), GFP_KERNEL);
+ if (!cp)
+ return -ENOMEM;
+
+ cp->mask = p->mask;
+ cp->shift = p->shift;
+ cp->hash = p->hash;
+ cp->alloc_hash = p->alloc_hash;
+ cp->fall_through = p->fall_through;
+ cp->tp = tp;
+
+ if (p->perfect) {
+ cp->perfect = kcalloc(cp->hash, sizeof(*r), GFP_KERNEL);
+ memcpy(cp->perfect, p->perfect, sizeof(*r) * cp->hash);
+ }
+ cp->h = p->h;
+
memset(&new_filter_result, 0, sizeof(new_filter_result));
tcf_exts_init(&new_filter_result.exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
@@ -221,71 +321,82 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
}
if (tb[TCA_TCINDEX_HASH])
- cp.hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
+ cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
if (tb[TCA_TCINDEX_MASK])
- cp.mask = nla_get_u16(tb[TCA_TCINDEX_MASK]);
+ cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]);
if (tb[TCA_TCINDEX_SHIFT])
- cp.shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]);
+ cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]);
err = -EBUSY;
+
/* Hash already allocated, make sure that we still meet the
* requirements for the allocated hash.
*/
- if (cp.perfect) {
- if (!valid_perfect_hash(&cp) ||
- cp.hash > cp.alloc_hash)
+ if (cp->perfect) {
+ if (!valid_perfect_hash(cp) ||
+ cp->hash > cp->alloc_hash)
goto errout;
- } else if (cp.h && cp.hash != cp.alloc_hash)
+ } else if (cp->h && cp->hash != cp->alloc_hash)
goto errout;
err = -EINVAL;
if (tb[TCA_TCINDEX_FALL_THROUGH])
- cp.fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]);
+ cp->fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]);
- if (!cp.hash) {
+ if (!cp->hash) {
/* Hash not specified, use perfect hash if the upper limit
* of the hashing index is below the threshold.
*/
- if ((cp.mask >> cp.shift) < PERFECT_HASH_THRESHOLD)
- cp.hash = (cp.mask >> cp.shift) + 1;
+ if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD)
+ cp->hash = (cp->mask >> cp->shift) + 1;
else
- cp.hash = DEFAULT_HASH_SIZE;
+ cp->hash = DEFAULT_HASH_SIZE;
}
- if (!cp.perfect && !cp.h)
- cp.alloc_hash = cp.hash;
+ if (!cp->perfect && !cp->h)
+ cp->alloc_hash = cp->hash;
/* Note: this could be as restrictive as if (handle & ~(mask >> shift))
* but then, we'd fail handles that may become valid after some future
* mask change. While this is extremely unlikely to ever matter,
* the check below is safer (and also more backwards-compatible).
*/
- if (cp.perfect || valid_perfect_hash(&cp))
- if (handle >= cp.alloc_hash)
+ if (cp->perfect || valid_perfect_hash(cp))
+ if (handle >= cp->alloc_hash)
goto errout;
err = -ENOMEM;
- if (!cp.perfect && !cp.h) {
- if (valid_perfect_hash(&cp)) {
- cp.perfect = kcalloc(cp.hash, sizeof(*r), GFP_KERNEL);
- if (!cp.perfect)
+ if (!cp->perfect && !cp->h) {
+ if (valid_perfect_hash(cp)) {
+ struct tcindex_filter_result *perfect;
+
+ perfect = kcalloc(cp->hash, sizeof(*r), GFP_KERNEL);
+ if (!perfect)
goto errout;
+ cp->perfect = perfect;
balloc = 1;
} else {
- cp.h = kcalloc(cp.hash, sizeof(f), GFP_KERNEL);
- if (!cp.h)
+ struct tcindex_filter __rcu **hash;
+
+ hash = kcalloc(cp->hash,
+ sizeof(struct tcindex_filter *),
+ GFP_KERNEL);
+
+ if (!hash)
goto errout;
+
+ cp->h = hash;
balloc = 2;
}
}
- if (cp.perfect)
- r = cp.perfect + handle;
+ if (cp->perfect)
+ r = cp->perfect + handle;
else
- r = tcindex_lookup(&cp, handle) ? : &new_filter_result;
+ r = tcindex_lookup(cp, handle) ? : &new_filter_result;
if (r == &new_filter_result) {
f = kzalloc(sizeof(*f), GFP_KERNEL);
@@ -300,33 +411,41 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
tcf_exts_change(tp, &cr.exts, &e);
- tcf_tree_lock(tp);
if (old_r && old_r != r)
memset(old_r, 0, sizeof(*old_r));
- memcpy(p, &cp, sizeof(cp));
+ oldp = p;
memcpy(r, &cr, sizeof(cr));
+ rcu_assign_pointer(tp->root, cp);
if (r == &new_filter_result) {
- struct tcindex_filter **fp;
+ struct tcindex_filter *nfp;
+ struct tcindex_filter __rcu **fp;
f->key = handle;
f->result = new_filter_result;
f->next = NULL;
- for (fp = p->h+(handle % p->hash); *fp; fp = &(*fp)->next)
- /* nothing */;
- *fp = f;
+
+ fp = p->h + (handle % p->hash);
+ for (nfp = rtnl_dereference(*fp);
+ nfp;
+ fp = &nfp->next, nfp = rtnl_dereference(*fp))
+ /* nothing */;
+
+ rcu_assign_pointer(*fp, f);
}
- tcf_tree_unlock(tp);
+ if (oldp)
+ call_rcu(&oldp->rcu, __tcindex_partial_destroy);
return 0;
errout_alloc:
if (balloc == 1)
- kfree(cp.perfect);
+ kfree(cp->perfect);
else if (balloc == 2)
- kfree(cp.h);
+ kfree(cp->h);
errout:
+ kfree(cp);
tcf_exts_destroy(tp, &e);
return err;
}
@@ -357,71 +476,6 @@ tcindex_change(struct net *net, struct sk_buff *in_skb,
tca[TCA_RATE]);
}
-
-static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker)
-{
- struct tcindex_data *p = PRIV(tp);
- struct tcindex_filter *f, *next;
- int i;
-
- pr_debug("tcindex_walk(tp %p,walker %p),p %p\n", tp, walker, p);
- if (p->perfect) {
- for (i = 0; i < p->hash; i++) {
- if (!p->perfect[i].res.class)
- continue;
- if (walker->count >= walker->skip) {
- if (walker->fn(tp,
- (unsigned long) (p->perfect+i), walker)
- < 0) {
- walker->stop = 1;
- return;
- }
- }
- walker->count++;
- }
- }
- if (!p->h)
- return;
- for (i = 0; i < p->hash; i++) {
- for (f = p->h[i]; f; f = next) {
- next = f->next;
- if (walker->count >= walker->skip) {
- if (walker->fn(tp, (unsigned long) &f->result,
- walker) < 0) {
- walker->stop = 1;
- return;
- }
- }
- walker->count++;
- }
- }
-}
-
-
-static int tcindex_destroy_element(struct tcf_proto *tp,
- unsigned long arg, struct tcf_walker *walker)
-{
- return __tcindex_delete(tp, arg, 0);
-}
-
-
-static void tcindex_destroy(struct tcf_proto *tp)
-{
- struct tcindex_data *p = PRIV(tp);
- struct tcf_walker walker;
-
- pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p);
- walker.count = 0;
- walker.skip = 0;
- walker.fn = &tcindex_destroy_element;
- tcindex_walk(tp, &walker);
- kfree(p->perfect);
- kfree(p->h);
- kfree(p);
- tp->root = NULL;
-}
-
-
static int tcindex_dump(struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
@@ -448,15 +502,18 @@ static int tcindex_dump(struct tcf_proto *tp, unsigned long fh,
nla_nest_end(skb, nest);
} else {
if (p->perfect) {
- t->tcm_handle = r-p->perfect;
+ t->tcm_handle = r - p->perfect;
} else {
struct tcindex_filter *f;
+ struct tcindex_filter __rcu **fp;
int i;
t->tcm_handle = 0;
for (i = 0; !t->tcm_handle && i < p->hash; i++) {
- for (f = p->h[i]; !t->tcm_handle && f;
- f = f->next) {
+ fp = &p->h[i];
+ for (f = rtnl_dereference(*fp);
+ !t->tcm_handle && f;
+ fp = &f->next, f = rtnl_dereference(*fp)) {
if (&f->result == r)
t->tcm_handle = f->key;
}
* [RFC PATCH 09/12] net: sched: make cls_u32 lockless
2014-01-10 9:36 [RFC PATCH 00/12] RCU'ify the net:sched classifier chains John Fastabend
` (7 preceding siblings ...)
2014-01-10 9:42 ` [RFC PATCH 08/12] net: sched: RCU cls_tcindex John Fastabend
@ 2014-01-10 9:42 ` John Fastabend
2014-01-10 9:43 ` [RFC PATCH 10/12] net: sched: rcu'ify cls_rsvp John Fastabend
` (4 subsequent siblings)
13 siblings, 0 replies; 20+ messages in thread
From: John Fastabend @ 2014-01-10 9:42 UTC (permalink / raw)
To: xiyou.wangcong, jhs, eric.dumazet; +Cc: netdev, davem
Make the cls_u32 classifier safe to run without holding the lock. This
patch converts the statistics kept in the read-side section of
u32_classify() into per-CPU counters.
This patch needs careful review to be sure the internal data
structures are handled correctly. With regard to testing, simply
creating a u32 filter is not enough; the actual structures need to be
created and destroyed under stress for a valid test case.
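A minimal sketch of the per-CPU counter pattern this refers to (names
are illustrative, not taken from this patch):

#include <linux/types.h>
#include <linux/percpu.h>

struct hit_stats {
	u64 rcnt;			/* rules walked */
	u64 rhit;			/* rules matched */
};

static struct hit_stats __percpu *stats;

static int stats_alloc(void)
{
	stats = alloc_percpu(struct hit_stats);
	return stats ? 0 : -ENOMEM;
}

/* Fast path (softirq): bump only this CPU's copy, no lock and no
 * cache line bouncing between CPUs.
 */
static void stats_account(bool matched)
{
	struct hit_stats *s = this_cpu_ptr(stats);

	s->rcnt++;
	if (matched)
		s->rhit++;
}

/* Dump path (RTNL): fold every CPU's copy into one snapshot. */
static void stats_fold(struct hit_stats *sum)
{
	int cpu;

	sum->rcnt = 0;
	sum->rhit = 0;
	for_each_possible_cpu(cpu) {
		struct hit_stats *s = per_cpu_ptr(stats, cpu);

		sum->rcnt += s->rcnt;
		sum->rhit += s->rhit;
	}
}

The counters are freed with free_percpu(stats) once no RCU reader can
still reach the filter that owns them.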
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
net/sched/cls_u32.c | 258 +++++++++++++++++++++++++++++++++------------------
1 file changed, 169 insertions(+), 89 deletions(-)
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 20f2fb7..1d68d08 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -36,6 +36,7 @@
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
+#include <linux/percpu.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
@@ -43,40 +44,46 @@
#include <net/pkt_cls.h>
struct tc_u_knode {
- struct tc_u_knode *next;
+ struct tc_u_knode __rcu *next;
u32 handle;
- struct tc_u_hnode *ht_up;
+ struct tc_u_hnode __rcu *ht_up;
struct tcf_exts exts;
#ifdef CONFIG_NET_CLS_IND
char indev[IFNAMSIZ];
#endif
u8 fshift;
struct tcf_result res;
- struct tc_u_hnode *ht_down;
+ struct tc_u_hnode __rcu *ht_down;
#ifdef CONFIG_CLS_U32_PERF
- struct tc_u32_pcnt *pf;
+ struct tc_u32_pcnt __percpu *pf;
#endif
#ifdef CONFIG_CLS_U32_MARK
- struct tc_u32_mark mark;
+ u32 val;
+ u32 mask;
+ u32 __percpu *pcpu_success;
#endif
+ struct tcf_proto *tp;
+ struct rcu_head rcu;
struct tc_u32_sel sel;
};
struct tc_u_hnode {
- struct tc_u_hnode *next;
+ struct tc_u_hnode __rcu *next;
u32 handle;
u32 prio;
struct tc_u_common *tp_c;
int refcnt;
unsigned int divisor;
- struct tc_u_knode *ht[1];
+ struct tc_u_knode __rcu *ht[1];
+ struct rcu_head rcu;
};
struct tc_u_common {
- struct tc_u_hnode *hlist;
+ struct tc_u_hnode __rcu *hlist;
struct Qdisc *q;
int refcnt;
u32 hgenerator;
+ struct rcu_head rcu;
};
static inline unsigned int u32_hash_fold(__be32 key,
@@ -95,7 +102,7 @@ static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct
unsigned int off;
} stack[TC_U32_MAXDEPTH];
- struct tc_u_hnode *ht = (struct tc_u_hnode *)tp->root;
+ struct tc_u_hnode *ht = (struct tc_u_hnode *) rcu_dereference_bh(tp->root);
unsigned int off = skb_network_offset(skb);
struct tc_u_knode *n;
int sdepth = 0;
@@ -103,27 +110,31 @@ static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct
int sel = 0;
#ifdef CONFIG_CLS_U32_PERF
int j;
+ struct tc_u32_pcnt *pf;
#endif
int i, r;
next_ht:
- n = ht->ht[sel];
+ n = rcu_dereference_bh(ht->ht[sel]);
next_knode:
if (n) {
struct tc_u32_key *key = n->sel.keys;
#ifdef CONFIG_CLS_U32_PERF
- n->pf->rcnt += 1;
+ pf = this_cpu_ptr(n->pf);
+ pf->rcnt += 1;
j = 0;
#endif
#ifdef CONFIG_CLS_U32_MARK
- if ((skb->mark & n->mark.mask) != n->mark.val) {
- n = n->next;
+ if ((skb->mark & n->mask) != n->val) {
+ n = rcu_dereference_bh(n->next);
goto next_knode;
} else {
- n->mark.success++;
+ u32 *success = this_cpu_ptr(n->pcpu_success);
+
+ *success = *success + 1;
}
#endif
@@ -138,37 +149,41 @@ next_knode:
if (!data)
goto out;
if ((*data ^ key->val) & key->mask) {
- n = n->next;
+ n = rcu_dereference_bh(n->next);
goto next_knode;
}
#ifdef CONFIG_CLS_U32_PERF
- n->pf->kcnts[j] += 1;
+ pf = this_cpu_ptr(n->pf);
+ pf->kcnts[j] += 1;
j++;
#endif
}
- if (n->ht_down == NULL) {
+
+ ht = rcu_dereference_bh(n->ht_down);
+ if (!ht) {
check_terminal:
if (n->sel.flags & TC_U32_TERMINAL) {
*res = n->res;
#ifdef CONFIG_NET_CLS_IND
if (!tcf_match_indev(skb, n->indev)) {
- n = n->next;
+ n = rcu_dereference_bh(n->next);
goto next_knode;
}
#endif
#ifdef CONFIG_CLS_U32_PERF
- n->pf->rhit += 1;
+ pf = this_cpu_ptr(n->pf);
+ pf->rhit += 1;
#endif
r = tcf_exts_exec(skb, &n->exts, res);
if (r < 0) {
- n = n->next;
+ n = rcu_dereference_bh(n->next);
goto next_knode;
}
return r;
}
- n = n->next;
+ n = rcu_dereference_bh(n->next);
goto next_knode;
}
@@ -179,7 +194,7 @@ check_terminal:
stack[sdepth].off = off;
sdepth++;
- ht = n->ht_down;
+ ht = rcu_dereference_bh(n->ht_down);
sel = 0;
if (ht->divisor) {
__be32 *data, hdata;
@@ -221,7 +236,7 @@ check_terminal:
/* POP */
if (sdepth--) {
n = stack[sdepth].knode;
- ht = n->ht_up;
+ ht = rcu_dereference_bh(n->ht_up);
off = stack[sdepth].off;
goto check_terminal;
}
@@ -238,7 +253,9 @@ u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
{
struct tc_u_hnode *ht;
- for (ht = tp_c->hlist; ht; ht = ht->next)
+ for (ht = rtnl_dereference(tp_c->hlist);
+ ht;
+ ht = rtnl_dereference(ht->next))
if (ht->handle == handle)
break;
@@ -255,7 +272,9 @@ u32_lookup_key(struct tc_u_hnode *ht, u32 handle)
if (sel > ht->divisor)
goto out;
- for (n = ht->ht[sel]; n; n = n->next)
+ for (n = rtnl_dereference(ht->ht[sel]);
+ n;
+ n = rtnl_dereference(n->next))
if (n->handle == handle)
break;
out:
@@ -269,7 +288,7 @@ static unsigned long u32_get(struct tcf_proto *tp, u32 handle)
struct tc_u_common *tp_c = tp->data;
if (TC_U32_HTID(handle) == TC_U32_ROOT)
- ht = tp->root;
+ ht = rtnl_dereference(tp->root);
else
ht = u32_lookup_ht(tp_c, TC_U32_HTID(handle));
@@ -290,6 +309,9 @@ static u32 gen_new_htid(struct tc_u_common *tp_c)
{
int i = 0x800;
+ /* hgenerator is only used under the RTNL lock, so it is safe to
+ * increment it without read-copy-update semantics.
+ */
do {
if (++tp_c->hgenerator == 0x7FF)
tp_c->hgenerator = 1;
@@ -325,11 +347,11 @@ static int u32_init(struct tcf_proto *tp)
}
tp_c->refcnt++;
- root_ht->next = tp_c->hlist;
- tp_c->hlist = root_ht;
+ rcu_assign_pointer(root_ht->next, tp_c->hlist);
+ rcu_assign_pointer(tp_c->hlist, root_ht);
root_ht->tp_c = tp_c;
- tp->root = root_ht;
+ rcu_assign_pointer(tp->root, root_ht);
tp->data = tp_c;
return 0;
}
@@ -341,25 +363,33 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n)
if (n->ht_down)
n->ht_down->refcnt--;
#ifdef CONFIG_CLS_U32_PERF
- kfree(n->pf);
+ free_percpu(n->pf);
#endif
kfree(n);
return 0;
}
+void __u32_delete_key(struct rcu_head *rcu)
+{
+ struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu);
+
+ u32_destroy_key(key->tp, key);
+}
+
static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
{
- struct tc_u_knode **kp;
+ struct tc_u_knode __rcu **kp;
+ struct tc_u_knode *pkp;
struct tc_u_hnode *ht = key->ht_up;
if (ht) {
- for (kp = &ht->ht[TC_U32_HASH(key->handle)]; *kp; kp = &(*kp)->next) {
- if (*kp == key) {
- tcf_tree_lock(tp);
- *kp = key->next;
- tcf_tree_unlock(tp);
+ kp = &ht->ht[TC_U32_HASH(key->handle)];
+ for (pkp = rtnl_dereference(*kp); pkp;
+ kp = &pkp->next, pkp = rtnl_dereference(*kp)) {
+ if (pkp == key) {
+ rcu_assign_pointer(*kp, key->next);
- u32_destroy_key(tp, key);
+ call_rcu(&key->rcu, __u32_delete_key);
return 0;
}
}
@@ -368,16 +398,15 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
return 0;
}
-static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
+static void u32_clear_hnode(struct tc_u_hnode *ht)
{
struct tc_u_knode *n;
unsigned int h;
for (h = 0; h <= ht->divisor; h++) {
- while ((n = ht->ht[h]) != NULL) {
- ht->ht[h] = n->next;
-
- u32_destroy_key(tp, n);
+ while ((n = rtnl_dereference(ht->ht[h])) != NULL) {
+ rcu_assign_pointer(ht->ht[h], rtnl_dereference(n->next));
+ call_rcu(&n->rcu, __u32_delete_key);
}
}
}
@@ -385,28 +414,31 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
{
struct tc_u_common *tp_c = tp->data;
- struct tc_u_hnode **hn;
+ struct tc_u_hnode __rcu **hn;
+ struct tc_u_hnode *phn;
WARN_ON(ht->refcnt);
- u32_clear_hnode(tp, ht);
+ u32_clear_hnode(ht);
- for (hn = &tp_c->hlist; *hn; hn = &(*hn)->next) {
- if (*hn == ht) {
- *hn = ht->next;
- kfree(ht);
+ hn = &tp_c->hlist;
+ for (phn = rtnl_dereference(*hn);
+ phn;
+ hn = &phn->next, phn = rtnl_dereference(*hn)) {
+ if (phn == ht) {
+ rcu_assign_pointer(*hn, ht->next);
+ kfree_rcu(ht, rcu);
return 0;
}
}
- WARN_ON(1);
return -ENOENT;
}
static void u32_destroy(struct tcf_proto *tp)
{
struct tc_u_common *tp_c = tp->data;
- struct tc_u_hnode *root_ht = tp->root;
+ struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
WARN_ON(root_ht == NULL);
@@ -418,17 +450,16 @@ static void u32_destroy(struct tcf_proto *tp)
tp->q->u32_node = NULL;
- for (ht = tp_c->hlist; ht; ht = ht->next) {
+ for (ht = rtnl_dereference(tp_c->hlist);
+ ht;
+ ht = rtnl_dereference(ht->next)) {
ht->refcnt--;
- u32_clear_hnode(tp, ht);
+ u32_clear_hnode(ht);
}
- while ((ht = tp_c->hlist) != NULL) {
- tp_c->hlist = ht->next;
-
- WARN_ON(ht->refcnt != 0);
-
- kfree(ht);
+ while ((ht = rtnl_dereference(tp_c->hlist)) != NULL) {
+ rcu_assign_pointer(tp_c->hlist, ht->next);
+ kfree_rcu(ht, rcu);
}
kfree(tp_c);
@@ -440,6 +471,7 @@ static void u32_destroy(struct tcf_proto *tp)
static int u32_delete(struct tcf_proto *tp, unsigned long arg)
{
struct tc_u_hnode *ht = (struct tc_u_hnode *)arg;
+ struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
if (ht == NULL)
return 0;
@@ -447,7 +479,7 @@ static int u32_delete(struct tcf_proto *tp, unsigned long arg)
if (TC_U32_KEY(ht->handle))
return u32_delete_key(tp, (struct tc_u_knode *)ht);
- if (tp->root == ht)
+ if (root_ht == ht)
return -EINVAL;
if (ht->refcnt == 1) {
@@ -465,7 +497,9 @@ static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle)
struct tc_u_knode *n;
unsigned int i = 0x7FF;
- for (n = ht->ht[TC_U32_HASH(handle)]; n; n = n->next)
+ for (n = rtnl_dereference(ht->ht[TC_U32_HASH(handle)]);
+ n;
+ n = rtnl_dereference(n->next))
if (i < TC_U32_NODE(n->handle))
i = TC_U32_NODE(n->handle);
i++;
@@ -512,10 +546,8 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp,
ht_down->refcnt++;
}
- tcf_tree_lock(tp);
- ht_old = n->ht_down;
- n->ht_down = ht_down;
- tcf_tree_unlock(tp);
+ ht_old = rtnl_dereference(n->ht_down);
+ rcu_assign_pointer(n->ht_down, ht_down);
if (ht_old)
ht_old->refcnt--;
@@ -527,6 +559,7 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp,
#ifdef CONFIG_NET_CLS_IND
if (tb[TCA_U32_INDEV]) {
+ /* TODO make this a pointer update */
err = tcf_change_indev(tp, n->indev, tb[TCA_U32_INDEV]);
if (err < 0)
goto errout;
@@ -590,8 +623,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
ht->divisor = divisor;
ht->handle = handle;
ht->prio = tp->prio;
- ht->next = tp_c->hlist;
- tp_c->hlist = ht;
+ rcu_assign_pointer(ht->next, tp_c->hlist);
+ rcu_assign_pointer(tp_c->hlist, ht);
*arg = (unsigned long)ht;
return 0;
}
@@ -599,7 +632,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
if (tb[TCA_U32_HASH]) {
htid = nla_get_u32(tb[TCA_U32_HASH]);
if (TC_U32_HTID(htid) == TC_U32_ROOT) {
- ht = tp->root;
+ ht = rtnl_dereference(tp->root);
htid = ht->handle;
} else {
ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid));
@@ -607,7 +640,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
return -EINVAL;
}
} else {
- ht = tp->root;
+ ht = rtnl_dereference(tp->root);
htid = ht->handle;
}
@@ -631,8 +664,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
return -ENOBUFS;
#ifdef CONFIG_CLS_U32_PERF
- n->pf = kzalloc(sizeof(struct tc_u32_pcnt) + s->nkeys*sizeof(u64), GFP_KERNEL);
- if (n->pf == NULL) {
+ n->pf = __alloc_percpu(sizeof(struct tc_u32_pcnt) + s->nkeys * sizeof(u64),
+ __alignof__(struct tc_u32_pcnt));
+ if (!n->pf) {
kfree(n);
return -ENOBUFS;
}
@@ -643,34 +677,39 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
n->handle = handle;
n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
+ n->tp = tp;
#ifdef CONFIG_CLS_U32_MARK
+ n->pcpu_success = alloc_percpu(u32);
+
if (tb[TCA_U32_MARK]) {
struct tc_u32_mark *mark;
mark = nla_data(tb[TCA_U32_MARK]);
- memcpy(&n->mark, mark, sizeof(struct tc_u32_mark));
- n->mark.success = 0;
+ n->val = mark->val;
+ n->mask = mark->mask;
}
#endif
err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE]);
if (err == 0) {
- struct tc_u_knode **ins;
- for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next)
- if (TC_U32_NODE(handle) < TC_U32_NODE((*ins)->handle))
+ struct tc_u_knode __rcu **ins;
+ struct tc_u_knode *pins;
+
+ ins = &ht->ht[TC_U32_HASH(handle)];
+ for (pins = rtnl_dereference(*ins); pins;
+ ins = &pins->next, pins = rtnl_dereference(*ins))
+ if (TC_U32_NODE(handle) < TC_U32_NODE(pins->handle))
break;
- n->next = *ins;
- tcf_tree_lock(tp);
- *ins = n;
- tcf_tree_unlock(tp);
+ rcu_assign_pointer(n->next, pins);
+ rcu_assign_pointer(*ins, n);
*arg = (unsigned long)n;
return 0;
}
#ifdef CONFIG_CLS_U32_PERF
- kfree(n->pf);
+ free_percpu(n->pf);
#endif
kfree(n);
return err;
@@ -686,7 +725,9 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
if (arg->stop)
return;
- for (ht = tp_c->hlist; ht; ht = ht->next) {
+ for (ht = rtnl_dereference(tp_c->hlist);
+ ht;
+ ht = rtnl_dereference(ht->next)) {
if (ht->prio != tp->prio)
continue;
if (arg->count >= arg->skip) {
@@ -697,7 +738,9 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
}
arg->count++;
for (h = 0; h <= ht->divisor; h++) {
- for (n = ht->ht[h]; n; n = n->next) {
+ for (n = rtnl_dereference(ht->ht[h]);
+ n;
+ n = rtnl_dereference(n->next)) {
if (arg->count < arg->skip) {
arg->count++;
continue;
@@ -716,6 +759,7 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
struct tc_u_knode *n = (struct tc_u_knode *)fh;
+ struct tc_u_hnode *ht_up, *ht_down;
struct nlattr *nest;
if (n == NULL)
@@ -734,11 +778,18 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh,
if (nla_put_u32(skb, TCA_U32_DIVISOR, divisor))
goto nla_put_failure;
} else {
+#ifdef CONFIG_CLS_U32_PERF
+ struct tc_u32_pcnt *gpf;
+#endif
+ int cpu;
+
if (nla_put(skb, TCA_U32_SEL,
sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key),
&n->sel))
goto nla_put_failure;
- if (n->ht_up) {
+
+ ht_up = rtnl_dereference(n->ht_up);
+ if (ht_up) {
u32 htid = n->handle & 0xFFFFF000;
if (nla_put_u32(skb, TCA_U32_HASH, htid))
goto nla_put_failure;
@@ -746,14 +797,24 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh,
if (n->res.classid &&
nla_put_u32(skb, TCA_U32_CLASSID, n->res.classid))
goto nla_put_failure;
- if (n->ht_down &&
- nla_put_u32(skb, TCA_U32_LINK, n->ht_down->handle))
+
+ ht_down = rtnl_dereference(n->ht_down);
+ if (ht_down &&
+ nla_put_u32(skb, TCA_U32_LINK, ht_down->handle))
goto nla_put_failure;
#ifdef CONFIG_CLS_U32_MARK
- if ((n->mark.val || n->mark.mask) &&
- nla_put(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark))
- goto nla_put_failure;
+ if ((n->val || n->mask)) {
+ struct tc_u32_mark mark = {.val = n->val,
+ .mask = n->mask,
+ .success = 0};
+
+ for_each_possible_cpu(cpu)
+ mark.success += *per_cpu_ptr(n->pcpu_success, cpu);
+
+ if (nla_put(skb, TCA_U32_MARK, sizeof(mark), &mark))
+ goto nla_put_failure;
+ }
#endif
if (tcf_exts_dump(skb, &n->exts) < 0)
@@ -765,10 +826,29 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh,
goto nla_put_failure;
#endif
#ifdef CONFIG_CLS_U32_PERF
+ gpf = kzalloc(sizeof(struct tc_u32_pcnt) +
+ n->sel.nkeys * sizeof(u64),
+ GFP_KERNEL);
+ if (!gpf)
+ goto nla_put_failure;
+
+ for_each_possible_cpu(cpu) {
+ int i;
+ struct tc_u32_pcnt *pf = per_cpu_ptr(n->pf, cpu);
+
+ gpf->rcnt += pf->rcnt;
+ gpf->rhit += pf->rhit;
+ for (i = 0; i < n->sel.nkeys; i++)
+ gpf->kcnts[i] += pf->kcnts[i];
+ }
+
if (nla_put(skb, TCA_U32_PCNT,
sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64),
- n->pf))
+ gpf)) {
+ kfree(gpf);
goto nla_put_failure;
+ }
+ kfree(gpf);
#endif
}
* [RFC PATCH 10/12] net: sched: rcu'ify cls_rsvp
2014-01-10 9:36 [RFC PATCH 00/12] RCU'ify the net:sched classifier chains John Fastabend
` (8 preceding siblings ...)
2014-01-10 9:42 ` [RFC PATCH 09/12] net: sched: make cls_u32 lockless John Fastabend
@ 2014-01-10 9:43 ` John Fastabend
2014-01-10 9:43 ` [RFC PATCH 11/12] net: make cls_bpf rcu safe John Fastabend
` (3 subsequent siblings)
13 siblings, 0 replies; 20+ messages in thread
From: John Fastabend @ 2014-01-10 9:43 UTC (permalink / raw)
To: xiyou.wangcong, jhs, eric.dumazet; +Cc: netdev, davem
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
net/sched/cls_rsvp.h | 152 ++++++++++++++++++++++++++++----------------------
1 file changed, 86 insertions(+), 66 deletions(-)
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 4f25c2a..ebdbd45 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -70,31 +70,34 @@ struct rsvp_head {
u32 tmap[256/32];
u32 hgenerator;
u8 tgenerator;
- struct rsvp_session *ht[256];
+ struct rsvp_session __rcu *ht[256];
+ struct rcu_head rcu;
};
struct rsvp_session {
- struct rsvp_session *next;
- __be32 dst[RSVP_DST_LEN];
- struct tc_rsvp_gpi dpi;
- u8 protocol;
- u8 tunnelid;
+ struct rsvp_session __rcu *next;
+ __be32 dst[RSVP_DST_LEN];
+ struct tc_rsvp_gpi dpi;
+ u8 protocol;
+ u8 tunnelid;
/* 16 (src,sport) hash slots, and one wildcard source slot */
- struct rsvp_filter *ht[16 + 1];
+ struct rsvp_filter __rcu *ht[16 + 1];
+ struct rcu_head rcu;
};
struct rsvp_filter {
- struct rsvp_filter *next;
- __be32 src[RSVP_DST_LEN];
- struct tc_rsvp_gpi spi;
- u8 tunnelhdr;
+ struct rsvp_filter __rcu *next;
+ __be32 src[RSVP_DST_LEN];
+ struct tc_rsvp_gpi spi;
+ u8 tunnelhdr;
- struct tcf_result res;
- struct tcf_exts exts;
+ struct tcf_result res;
+ struct tcf_exts exts;
- u32 handle;
- struct rsvp_session *sess;
+ u32 handle;
+ struct rsvp_session *sess;
+ struct rcu_head rcu;
};
static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid)
@@ -128,7 +131,7 @@ static inline unsigned int hash_src(__be32 *src)
static int rsvp_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
- struct rsvp_session **sht = ((struct rsvp_head *)tp->root)->ht;
+ struct rsvp_head *head = rcu_dereference_bh(tp->root);
struct rsvp_session *s;
struct rsvp_filter *f;
unsigned int h1, h2;
@@ -169,7 +172,8 @@ restart:
h1 = hash_dst(dst, protocol, tunnelid);
h2 = hash_src(src);
- for (s = sht[h1]; s; s = s->next) {
+ for (s = rcu_dereference_bh(head->ht[h1]); s;
+ s = rcu_dereference_bh(s->next)) {
if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN - 1] &&
protocol == s->protocol &&
!(s->dpi.mask &
@@ -181,7 +185,8 @@ restart:
#endif
tunnelid == s->tunnelid) {
- for (f = s->ht[h2]; f; f = f->next) {
+ for (f = rcu_dereference_bh(s->ht[h2]); f;
+ f = rcu_dereference_bh(f->next)) {
if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN - 1] &&
!(f->spi.mask & (*(u32 *)(xprt + f->spi.offset) ^ f->spi.key))
#if RSVP_DST_LEN == 4
@@ -205,7 +210,8 @@ matched:
}
/* And wildcard bucket... */
- for (f = s->ht[16]; f; f = f->next) {
+ for (f = rcu_dereference_bh(s->ht[16]); f;
+ f = rcu_dereference_bh(f->next)) {
*res = f->res;
RSVP_APPLY_RESULT();
goto matched;
@@ -218,7 +224,7 @@ matched:
static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle)
{
- struct rsvp_session **sht = ((struct rsvp_head *)tp->root)->ht;
+ struct rsvp_head *head = rtnl_dereference(tp->root);
struct rsvp_session *s;
struct rsvp_filter *f;
unsigned int h1 = handle & 0xFF;
@@ -227,8 +233,10 @@ static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle)
if (h2 > 16)
return 0;
- for (s = sht[h1]; s; s = s->next) {
- for (f = s->ht[h2]; f; f = f->next) {
+ for (s = rtnl_dereference(head->ht[h1]); s;
+ s = rtnl_dereference(s->next)) {
+ for (f = rtnl_dereference(s->ht[h2]); f;
+ f = rtnl_dereference(f->next)) {
if (f->handle == handle)
return (unsigned long)f;
}
@@ -246,7 +254,7 @@ static int rsvp_init(struct tcf_proto *tp)
data = kzalloc(sizeof(struct rsvp_head), GFP_KERNEL);
if (data) {
- tp->root = data;
+ rcu_assign_pointer(tp->root, data);
return 0;
}
return -ENOBUFS;
@@ -257,53 +265,55 @@ rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
{
tcf_unbind_filter(tp, &f->res);
tcf_exts_destroy(tp, &f->exts);
- kfree(f);
+ kfree_rcu(f, rcu);
}
static void rsvp_destroy(struct tcf_proto *tp)
{
- struct rsvp_head *data = xchg(&tp->root, NULL);
- struct rsvp_session **sht;
+ struct rsvp_head *data = rtnl_dereference(tp->root);
int h1, h2;
if (data == NULL)
return;
- sht = data->ht;
+ rcu_assign_pointer(tp->root, NULL);
for (h1 = 0; h1 < 256; h1++) {
struct rsvp_session *s;
- while ((s = sht[h1]) != NULL) {
- sht[h1] = s->next;
+ while ((s = rtnl_dereference(data->ht[h1])) != NULL) {
+ rcu_assign_pointer(data->ht[h1], s->next);
for (h2 = 0; h2 <= 16; h2++) {
struct rsvp_filter *f;
- while ((f = s->ht[h2]) != NULL) {
- s->ht[h2] = f->next;
+ while ((f = rtnl_dereference(s->ht[h2])) != NULL) {
+ rcu_assign_pointer(s->ht[h2], f->next);
rsvp_delete_filter(tp, f);
}
}
- kfree(s);
+ kfree_rcu(s, rcu);
}
}
- kfree(data);
+ rcu_assign_pointer(tp->root, NULL);
+ kfree_rcu(data, rcu);
}
static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
{
- struct rsvp_filter **fp, *f = (struct rsvp_filter *)arg;
+ struct rsvp_head *head = rtnl_dereference(tp->root);
+ struct rsvp_filter *nfp, *f = (struct rsvp_filter *)arg;
+ struct rsvp_filter __rcu **fp;
unsigned int h = f->handle;
- struct rsvp_session **sp;
- struct rsvp_session *s = f->sess;
+ struct rsvp_session __rcu **sp;
+ struct rsvp_session *nsp, *s = f->sess;
int i;
- for (fp = &s->ht[(h >> 8) & 0xFF]; *fp; fp = &(*fp)->next) {
- if (*fp == f) {
- tcf_tree_lock(tp);
+ fp = &s->ht[(h >> 8) & 0xFF];
+ for (nfp = rtnl_dereference(*fp); nfp;
+ fp = &nfp->next, nfp = rtnl_dereference(*fp)) {
+ if (nfp == f) {
*fp = f->next;
- tcf_tree_unlock(tp);
rsvp_delete_filter(tp, f);
/* Strip tree */
@@ -313,14 +323,12 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
return 0;
/* OK, session has no flows */
- for (sp = &((struct rsvp_head *)tp->root)->ht[h & 0xFF];
- *sp; sp = &(*sp)->next) {
- if (*sp == s) {
- tcf_tree_lock(tp);
+ sp = &head->ht[h & 0xFF];
+ for (nsp = rtnl_dereference(*sp); nsp;
+ sp = &nsp->next, nsp = rtnl_dereference(*sp)) {
+ if (nsp == s) {
*sp = s->next;
- tcf_tree_unlock(tp);
-
- kfree(s);
+ kfree_rcu(s, rcu);
return 0;
}
}
@@ -333,7 +341,7 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt)
{
- struct rsvp_head *data = tp->root;
+ struct rsvp_head *data = rtnl_dereference(tp->root);
int i = 0xFFFF;
while (i-- > 0) {
@@ -361,7 +369,7 @@ static int tunnel_bts(struct rsvp_head *data)
static void tunnel_recycle(struct rsvp_head *data)
{
- struct rsvp_session **sht = data->ht;
+ struct rsvp_session __rcu **sht = data->ht;
u32 tmap[256/32];
int h1, h2;
@@ -369,11 +377,13 @@ static void tunnel_recycle(struct rsvp_head *data)
for (h1 = 0; h1 < 256; h1++) {
struct rsvp_session *s;
- for (s = sht[h1]; s; s = s->next) {
+ for (s = rtnl_dereference(sht[h1]); s;
+ s = rtnl_dereference(s->next)) {
for (h2 = 0; h2 <= 16; h2++) {
struct rsvp_filter *f;
- for (f = s->ht[h2]; f; f = f->next) {
+ for (f = rtnl_dereference(s->ht[h2]); f;
+ f = rtnl_dereference(f->next)) {
if (f->tunnelhdr == 0)
continue;
data->tgenerator = f->res.classid;
@@ -417,9 +427,11 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb,
struct nlattr **tca,
unsigned long *arg)
{
- struct rsvp_head *data = tp->root;
- struct rsvp_filter *f, **fp;
- struct rsvp_session *s, **sp;
+ struct rsvp_head *data = rtnl_dereference(tp->root);
+ struct rsvp_filter *f, *nfp;
+ struct rsvp_filter __rcu **fp;
+ struct rsvp_session *nsp, *s;
+ struct rsvp_session __rcu **sp;
struct tc_rsvp_pinfo *pinfo = NULL;
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_RSVP_MAX + 1];
@@ -499,7 +511,9 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb,
goto errout;
}
- for (sp = &data->ht[h1]; (s = *sp) != NULL; sp = &s->next) {
+ for (sp = &data->ht[h1];
+ (s = rtnl_dereference(*sp)) != NULL;
+ sp = &s->next) {
if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] &&
pinfo && pinfo->protocol == s->protocol &&
memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 &&
@@ -521,12 +535,14 @@ insert:
tcf_exts_change(tp, &f->exts, &e);
- for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next)
- if (((*fp)->spi.mask & f->spi.mask) != f->spi.mask)
+ fp = &s->ht[h2];
+ for (nfp = rtnl_dereference(*fp); nfp;
+ fp = &nfp->next, nfp = rtnl_dereference(*fp))
+ if ((nfp->spi.mask & f->spi.mask) != f->spi.mask)
break;
- f->next = *fp;
+ rcu_assign_pointer(f->next, nfp);
wmb();
- *fp = f;
+ rcu_assign_pointer(*fp, f);
*arg = (unsigned long)f;
return 0;
@@ -546,13 +562,15 @@ insert:
s->protocol = pinfo->protocol;
s->tunnelid = pinfo->tunnelid;
}
- for (sp = &data->ht[h1]; *sp; sp = &(*sp)->next) {
- if (((*sp)->dpi.mask&s->dpi.mask) != s->dpi.mask)
+ sp = &data->ht[h1];
+ for (nsp = rtnl_dereference(*sp); nsp;
+ sp = &nsp->next, nsp = rtnl_dereference(*sp)) {
+ if ((nsp->dpi.mask & s->dpi.mask) != s->dpi.mask)
break;
}
- s->next = *sp;
+ rcu_assign_pointer(s->next, nsp);
wmb();
- *sp = s;
+ rcu_assign_pointer(*sp, s);
goto insert;
@@ -565,7 +583,7 @@ errout2:
static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
- struct rsvp_head *head = tp->root;
+ struct rsvp_head *head = rtnl_dereference(tp->root);
unsigned int h, h1;
if (arg->stop)
@@ -574,11 +592,13 @@ static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)
for (h = 0; h < 256; h++) {
struct rsvp_session *s;
- for (s = head->ht[h]; s; s = s->next) {
+ for (s = rtnl_dereference(head->ht[h]); s;
+ s = rtnl_dereference(s->next)) {
for (h1 = 0; h1 <= 16; h1++) {
struct rsvp_filter *f;
- for (f = s->ht[h1]; f; f = f->next) {
+ for (f = rtnl_dereference(s->ht[h1]); f;
+ f = rtnl_dereference(f->next)) {
if (arg->count < arg->skip) {
arg->count++;
continue;
* [RFC PATCH 11/12] net: make cls_bpf rcu safe
2014-01-10 9:36 [RFC PATCH 00/12] RCU'ify the net:sched classifier chains John Fastabend
` (9 preceding siblings ...)
2014-01-10 9:43 ` [RFC PATCH 10/12] net: sched: rcu'ify cls_rsvp John Fastabend
@ 2014-01-10 9:43 ` John Fastabend
2014-01-10 9:44 ` [RFC PATCH 12/12] net: sched: make tc_action safe to walk under RCU John Fastabend
` (2 subsequent siblings)
13 siblings, 0 replies; 20+ messages in thread
From: John Fastabend @ 2014-01-10 9:43 UTC (permalink / raw)
To: xiyou.wangcong, jhs, eric.dumazet; +Cc: netdev, davem
This patch makes the cls_bpf classifier RCU safe. The tcf_lock was
being used to protect the list of cls_bpf_prog entries; that list is
now RCU safe and updates are done by replacing entries with
list_replace_rcu().
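Roughly, the update and read sides reduce to the sketch below
(simplified; the real code is in the cls_bpf_change() and
cls_bpf_classify() hunks further down):

	/* writer, RTNL held: publish the new prog, free the old one
	 * only after a grace period
	 */
	if (oldprog) {
		list_replace_rcu(&oldprog->link, &prog->link);
		call_rcu(&oldprog->rcu, __cls_bpf_delete_prog);
	} else {
		list_add_rcu(&prog->link, &head->plist);
	}

	/* reader, classify path under rcu_read_lock(): */
	list_for_each_entry_rcu(prog, &head->plist, link) {
		int filter_res = SK_RUN_FILTER(prog->filter, skb);
		/* ... */
	}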
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
net/sched/cls_bpf.c | 79 ++++++++++++++++++++++++++-------------------------
1 file changed, 40 insertions(+), 39 deletions(-)
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 00a5a58..e69adc330 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -25,8 +25,9 @@ MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
MODULE_DESCRIPTION("TC BPF based classifier");
struct cls_bpf_head {
- struct list_head plist;
+ struct list_head __rcu plist;
u32 hgen;
+ struct rcu_head rcu;
};
struct cls_bpf_prog {
@@ -37,6 +38,8 @@ struct cls_bpf_prog {
struct list_head link;
u32 handle;
u16 bpf_len;
+ struct tcf_proto *tp;
+ struct rcu_head rcu;
};
static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
@@ -49,11 +52,11 @@ static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
- struct cls_bpf_head *head = tp->root;
+ struct cls_bpf_head *head = rcu_dereference(tp->root);
struct cls_bpf_prog *prog;
int ret;
- list_for_each_entry(prog, &head->plist, link) {
+ list_for_each_entry_rcu(prog, &head->plist, link) {
int filter_res = SK_RUN_FILTER(prog->filter, skb);
if (filter_res == 0)
@@ -81,8 +84,8 @@ static int cls_bpf_init(struct tcf_proto *tp)
if (head == NULL)
return -ENOBUFS;
- INIT_LIST_HEAD(&head->plist);
- tp->root = head;
+ INIT_LIST_HEAD_RCU(&head->plist);
+ rcu_assign_pointer(tp->root, head);
return 0;
}
@@ -98,6 +101,13 @@ static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog)
kfree(prog);
}
+void __cls_bpf_delete_prog(struct rcu_head *rcu)
+{
+ struct cls_bpf_prog *prog = container_of(rcu, struct cls_bpf_prog, rcu);
+
+ cls_bpf_delete_prog(prog->tp, prog);
+}
+
static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg)
{
struct cls_bpf_head *head = tp->root;
@@ -105,11 +115,8 @@ static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg)
list_for_each_entry(prog, &head->plist, link) {
if (prog == todel) {
- tcf_tree_lock(tp);
- list_del(&prog->link);
- tcf_tree_unlock(tp);
-
- cls_bpf_delete_prog(tp, prog);
+ list_del_rcu(&prog->link);
+ call_rcu(&prog->rcu, __cls_bpf_delete_prog);
return 0;
}
}
@@ -123,11 +130,12 @@ static void cls_bpf_destroy(struct tcf_proto *tp)
struct cls_bpf_prog *prog, *tmp;
list_for_each_entry_safe(prog, tmp, &head->plist, link) {
- list_del(&prog->link);
- cls_bpf_delete_prog(tp, prog);
+ list_del_rcu(&prog->link);
+ call_rcu(&prog->rcu, __cls_bpf_delete_prog);
}
- kfree(head);
+ rcu_assign_pointer(tp->root, NULL);
+ kfree_rcu(head, rcu);
}
static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle)
@@ -158,10 +166,10 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
unsigned long base, struct nlattr **tb,
struct nlattr *est)
{
- struct sock_filter *bpf_ops, *bpf_old;
+ struct sock_filter *bpf_ops;
struct tcf_exts exts;
struct sock_fprog tmp;
- struct sk_filter *fp, *fp_old;
+ struct sk_filter *fp;
u16 bpf_size, bpf_len;
u32 classid;
int ret;
@@ -197,24 +205,13 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
if (ret)
goto errout_free;
- tcf_tree_lock(tp);
- fp_old = prog->filter;
- bpf_old = prog->bpf_ops;
-
prog->bpf_len = bpf_len;
prog->bpf_ops = bpf_ops;
prog->filter = fp;
prog->res.classid = classid;
- tcf_tree_unlock(tp);
tcf_bind_filter(tp, &prog->res, base);
tcf_exts_change(tp, &prog->exts, &exts);
-
- if (fp_old)
- sk_unattached_filter_destroy(fp_old);
- if (bpf_old)
- kfree(bpf_old);
-
return 0;
errout_free:
@@ -245,8 +242,9 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
unsigned long *arg)
{
struct cls_bpf_head *head = tp->root;
- struct cls_bpf_prog *prog = (struct cls_bpf_prog *) *arg;
+ struct cls_bpf_prog *oldprog = (struct cls_bpf_prog *) *arg;
struct nlattr *tb[TCA_BPF_MAX + 1];
+ struct cls_bpf_prog *prog;
int ret;
if (tca[TCA_OPTIONS] == NULL)
@@ -256,18 +254,19 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
if (ret < 0)
return ret;
- if (prog != NULL) {
- if (handle && prog->handle != handle)
- return -EINVAL;
- return cls_bpf_modify_existing(net, tp, prog, base, tb,
- tca[TCA_RATE]);
- }
-
prog = kzalloc(sizeof(*prog), GFP_KERNEL);
if (prog == NULL)
return -ENOBUFS;
tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE);
+
+ if (oldprog) {
+ if (handle && oldprog->handle != handle) {
+ ret = -EINVAL;
+ goto errout;
+ }
+ }
+
if (handle == 0)
prog->handle = cls_bpf_grab_new_handle(tp, head);
else
@@ -281,15 +280,17 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
if (ret < 0)
goto errout;
- tcf_tree_lock(tp);
- list_add(&prog->link, &head->plist);
- tcf_tree_unlock(tp);
+ if (oldprog) {
+ list_replace_rcu(&oldprog->link, &prog->link);
+ call_rcu(&oldprog->rcu, __cls_bpf_delete_prog);
+ } else {
+ list_add_rcu(&prog->link, &head->plist);
+ }
*arg = (unsigned long) prog;
-
return 0;
errout:
- if (*arg == 0UL && prog)
+ if (prog)
kfree(prog);
return ret;
* [RFC PATCH 12/12] net: sched: make tc_action safe to walk under RCU
2014-01-10 9:36 [RFC PATCH 00/12] RCU'ify the net:sched classifier chains John Fastabend
` (10 preceding siblings ...)
2014-01-10 9:43 ` [RFC PATCH 11/12] net: make cls_bpf rcu safe John Fastabend
@ 2014-01-10 9:44 ` John Fastabend
2014-01-11 19:43 ` [RFC PATCH 00/12] RCU'ify the net:sched classifier chains Cong Wang
2014-01-12 13:28 ` Jamal Hadi Salim
13 siblings, 0 replies; 20+ messages in thread
From: John Fastabend @ 2014-01-10 9:44 UTC (permalink / raw)
To: xiyou.wangcong, jhs, eric.dumazet; +Cc: netdev, davem
This patch makes tc_actions safe to walk from classifiers under RCU.
Notice that each action act() callback defined in each act_*.c
already does its own locking. This is needed even in the current
infrastructure for the case where users add/remove actions via
'tc actions' and reference them via index from the classifiers.
There are a couple of interesting pieces here (i.e. they need careful review).
In tcf_exts_exec() the call to tcf_action_exec() follows a list_empty()
check. However, because this now runs under RCU there is no guarantee
that the list is still non-empty by the time tcf_action_exec() is
called. This patch fixes up the return values from tcf_action_exec()
so that packets won't be dropped on the floor when this occurs.
Hopefully the race is rare; using RCU here more or less builds in that
assumption.
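Sketched out, the two pieces involved look roughly like this
(simplified fragments matching the hunks below):

	/* pkt_cls.h: the emptiness check can race with a writer
	 * emptying the list right after it
	 */
	if (!list_empty(&exts->actions))
		return tcf_action_exec(skb, &exts->actions, res);

	/* act_api.c: so start ret at 0 instead of -1; if the RCU walk
	 * then finds the list empty the packet is passed along rather
	 * than dropped
	 */
	int ret = 0;

	list_for_each_entry_rcu(a, actions, list) {
		ret = a->ops->act(skb, a, res);
		/* ... repeat/stolen handling elided ... */
	}
	return ret;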
Second, there is a suspect usage of list_splice_init_rcu() in the
tcf_exts_change() routine. Notice how it is used twice in succession
and the second init works on the src tcf_exts. There is probably a
better way to accomplish that.
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
include/net/act_api.h | 1 +
include/net/pkt_cls.h | 12 ++++++++++--
net/sched/act_api.c | 18 +++++++++---------
net/sched/cls_api.c | 15 ++++++++-------
4 files changed, 28 insertions(+), 18 deletions(-)
diff --git a/include/net/act_api.h b/include/net/act_api.h
index 77d5d81..93f248e 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -81,6 +81,7 @@ struct tc_action {
__u32 type; /* for backward compat(TCA_OLD_COMPAT) */
__u32 order;
struct list_head list;
+ struct rcu_head rcu;
};
#define TCA_CAP_NONE 0
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 50ea079..53b6e50 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -63,7 +63,7 @@ tcf_unbind_filter(struct tcf_proto *tp, struct tcf_result *r)
struct tcf_exts {
#ifdef CONFIG_NET_CLS_ACT
__u32 type; /* for backward compat(TCA_OLD_COMPAT) */
- struct list_head actions;
+ struct list_head __rcu actions;
#endif
/* Map to export classifier specific extension TLV types to the
* generic extensions API. Unsupported extensions must be set to 0.
@@ -76,7 +76,7 @@ static inline void tcf_exts_init(struct tcf_exts *exts, int action, int police)
{
#ifdef CONFIG_NET_CLS_ACT
exts->type = 0;
- INIT_LIST_HEAD(&exts->actions);
+ INIT_LIST_HEAD_RCU(&exts->actions);
#endif
exts->action = action;
exts->police = police;
@@ -88,6 +88,8 @@ static inline void tcf_exts_init(struct tcf_exts *exts, int action, int police)
*
* Returns 1 if a predicative extension is present, i.e. an extension which
* might cause further actions and thus overrule the regular tcf_result.
+ *
+ * This check is only valid if done under RTNL.
*/
static inline int
tcf_exts_is_predicative(struct tcf_exts *exts)
@@ -128,6 +130,12 @@ tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts,
struct tcf_result *res)
{
#ifdef CONFIG_NET_CLS_ACT
+ /* This check is racy but this is OK, if the list is emptied
+ * before walking the chain of actions the return value has
+ * been updated to return zero. This way packets will not be
+ * dropped when action list deletion occurs after the empty
+ * check but before execution
+ */
if (!list_empty(&exts->actions))
return tcf_action_exec(skb, &exts->actions, res);
#endif
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index dce2b6e..410e69c 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -345,14 +345,14 @@ int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions,
struct tcf_result *res)
{
const struct tc_action *a;
- int ret = -1;
+ int ret = 0;
if (skb->tc_verd & TC_NCLS) {
skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
ret = TC_ACT_OK;
goto exec_done;
}
- list_for_each_entry(a, actions, list) {
+ list_for_each_entry_rcu(a, actions, list) {
repeat:
if (a->ops) {
ret = a->ops->act(skb, a, res);
@@ -380,13 +380,13 @@ void tcf_action_destroy(struct list_head *actions, int bind)
if (a->ops) {
if (a->ops->cleanup(a, bind) == ACT_P_DELETED)
module_put(a->ops->owner);
- list_del(&a->list);
- kfree(a);
+ list_del_rcu(&a->list);
+ kfree_rcu(a, rcu);
} else {
/*FIXME: Remove later - catch insertion bugs*/
WARN(1, "tcf_action_destroy: BUG? destroying NULL ops\n");
- list_del(&a->list);
- kfree(a);
+ list_del_rcu(&a->list);
+ kfree_rcu(a, rcu);
}
}
}
@@ -559,7 +559,7 @@ int tcf_action_init(struct net *net, struct nlattr *nla,
goto err;
}
act->order = i;
- list_add_tail(&act->list, actions);
+ list_add_tail_rcu(&act->list, actions);
}
return 0;
@@ -708,8 +708,8 @@ static void cleanup_a(struct list_head *actions)
struct tc_action *a, *tmp;
list_for_each_entry_safe(a, tmp, actions, list) {
- list_del(&a->list);
- kfree(a);
+ list_del_rcu(&a->list);
+ kfree_rcu(a, rcu);
}
}
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 86c3678..af87be5 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -496,7 +496,7 @@ void tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
tcf_action_destroy(&exts->actions, TCA_ACT_UNBIND);
- INIT_LIST_HEAD(&exts->actions);
+ INIT_LIST_HEAD_RCU(&exts->actions);
#endif
}
EXPORT_SYMBOL(tcf_exts_destroy);
@@ -508,7 +508,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
{
struct tc_action *act;
- INIT_LIST_HEAD(&exts->actions);
+ INIT_LIST_HEAD_RCU(&exts->actions);
if (exts->police && tb[exts->police]) {
act = tcf_action_init_1(net, tb[exts->police], rate_tlv,
"police", TCA_ACT_NOREPLACE,
@@ -517,7 +517,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
return PTR_ERR(act);
act->type = exts->type = TCA_OLD_COMPAT;
- list_add(&act->list, &exts->actions);
+ list_add_rcu(&act->list, &exts->actions);
} else if (exts->action && tb[exts->action]) {
int err;
err = tcf_action_init(net, tb[exts->action], rate_tlv,
@@ -537,16 +537,17 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
}
EXPORT_SYMBOL(tcf_exts_validate);
+/* It is not safe to use src->actions after this due to _init_rcu usage
+ * INIT_LIST_HEAD_RCU() is called on src->actions
+ */
void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,
struct tcf_exts *src)
{
#ifdef CONFIG_NET_CLS_ACT
if (!list_empty(&src->actions)) {
LIST_HEAD(tmp);
- tcf_tree_lock(tp);
- list_splice_init(&dst->actions, &tmp);
- list_splice(&src->actions, &dst->actions);
- tcf_tree_unlock(tp);
+ list_splice_init_rcu(&dst->actions, &tmp, synchronize_rcu);
+ list_splice_init_rcu(&src->actions, &dst->actions, synchronize_rcu);
tcf_action_destroy(&tmp, TCA_ACT_UNBIND);
}
#endif
* Re: [RFC PATCH 00/12] RCU'ify the net:sched classifier chains
2014-01-10 9:36 [RFC PATCH 00/12] RCU'ify the net:sched classifier chains John Fastabend
` (11 preceding siblings ...)
2014-01-10 9:44 ` [RFC PATCH 12/12] net: sched: make tc_action safe to walk under RCU John Fastabend
@ 2014-01-11 19:43 ` Cong Wang
2014-01-11 23:33 ` John Fastabend
2014-01-12 13:28 ` Jamal Hadi Salim
13 siblings, 1 reply; 20+ messages in thread
From: Cong Wang @ 2014-01-11 19:43 UTC (permalink / raw)
To: John Fastabend
Cc: Jamal Hadi Salim, Eric Dumazet, Linux Kernel Network Developers,
David Miller
On Fri, Jan 10, 2014 at 1:36 AM, John Fastabend
<john.fastabend@gmail.com> wrote:
> There appears to be some interest in a few topics around the qdisc
> layer which could benefit from having the ability to run the
> filters and actions without holding the qdisc lock.
>
> Recently Cong Wang proposed a patch series to drop the ingress
> qdisc and asked for comments. This series I think gets closer to
> that goal.
>
> The ingress qdisc is a simple qdisc which doesn't maintain any
> actual list of skb's and is primarily a hook to attach filters.
> Further the only qdisc that can be attached to the ingress qdisc
> is sch_ingress. The qdisc lock is currently serializing two
> operations (1) tc_classify which is addressed here and (2)
> statistics accounting. The second point is not solved here but
> it could be a matter of making the bstats and qstats per cpu
> stats.
Yeah, actually I tried to make bstats percpu, but I still doubt
if it is necessary, since increasing a 32bit counter doesn't
sound dangerous on SMP?
>
> This is an RFC for now and needs some more work. Some items
> I know about are (a) an audit of the ematch code paths, (b) resolving
> the checkpatch errors mostly due to moving code around that
> generates those errors, (c) run smatch, (d) audit u32 code
> for correctness, (e) do a lot more testing so far only very
> basic testing has been done. I tried to put some reasonable
> comments in the commit logs but yes they need more work.
>
> Cong, if its not too much to ask can we use this as a base
> set of patches for this work? I think its reasonably close to
> correct as is.
>
Sure, just that:
1) I myself don't like playing with RCU lists without using the
list_head API; it is still hard for me to read.
2) The first patch in your series seems completely irrelevant to
$subject. :)
Thanks.
* Re: [RFC PATCH 00/12] RCU'ify the net:sched classifier chains
2014-01-11 19:43 ` [RFC PATCH 00/12] RCU'ify the net:sched classifier chains Cong Wang
@ 2014-01-11 23:33 ` John Fastabend
2014-04-24 23:51 ` Cong Wang
0 siblings, 1 reply; 20+ messages in thread
From: John Fastabend @ 2014-01-11 23:33 UTC (permalink / raw)
To: Cong Wang
Cc: Jamal Hadi Salim, Eric Dumazet, Linux Kernel Network Developers,
David Miller
On 01/11/2014 11:43 AM, Cong Wang wrote:
> On Fri, Jan 10, 2014 at 1:36 AM, John Fastabend
> <john.fastabend@gmail.com> wrote:
>> There appears to be some interest in a few topics around the qdisc
>> layer which could benefit from having the ability to run the
>> filters and actions without holding the qdisc lock.
>>
>> Recently Cong Wang proposed a patch series to drop the ingress
>> qdisc and asked for comments. This series I think gets closer to
>> that goal.
>>
>> The ingress qdisc is a simple qdisc which doesn't maintain any
>> actual list of skb's and is primarily a hook to attach filters.
>> Further the only qdisc that can be attached to the ingress qdisc
>> is sch_ingress. The qdisc lock is currently serializing two
>> operations (1) tc_classify which is addressed here and (2)
>> statistics accounting. The second point is not solved here but
>> it could be a matter of making the bstats and qstats per cpu
>> stats.
>
>
> Yeah, actually I tried to make bstats percpu, but I still doubt
> if it is necessary, since increasing a 32bit counter doesn't
> sound dangerous on SMP?
>
Well, what happens when multiple CPUs are incrementing the counter?
You can't assume all archs have a fetch-and-add instruction (addl on
x86), and I'm fairly certain there is no guarantee the compiler, even
on x86, will do it that way. Minimally we need to use the atomic
operations, but then it's a cache-thrashing problem. And because in
the worst case every CPU is going to be touching those bstats, you
really need to make them per-CPU. Look around the kernel at other
counters; it's a common pattern.
Similarly the qstats need to be per-CPU; I might have a patch
around here for that piece somewhere. I'll look later.
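Something like the rough sketch below is what I have in mind (the
struct and helper names here are made up for illustration, not from
any posted patch):

	struct gnet_stats_basic_cpu {	/* hypothetical per-cpu copy */
		__u64 bytes;
		__u32 packets;
	};

	/* datapath: no lock, no shared cacheline, only this CPU's copy */
	static void bstats_cpu_update(struct gnet_stats_basic_cpu __percpu *cpu_bstats,
				      const struct sk_buff *skb)
	{
		this_cpu_add(cpu_bstats->bytes, qdisc_pkt_len(skb));
		this_cpu_inc(cpu_bstats->packets);
	}

	/* dump path (RTNL held): fold the per-cpu copies into one total */
	static void bstats_cpu_fold(struct gnet_stats_basic_packed *total,
				    struct gnet_stats_basic_cpu __percpu *cpu_bstats)
	{
		int i;

		for_each_possible_cpu(i) {
			struct gnet_stats_basic_cpu *b = per_cpu_ptr(cpu_bstats, i);

			total->bytes   += b->bytes;
			total->packets += b->packets;
		}
	}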
Send me your patch so I can integrate it with the rest.
>>
>> This is an RFC for now and needs some more work. Some items
>> I know about are (a) an audit of the ematch code paths, (b) resolving
>> the checkpatch errors mostly due to moving code around that
>> generates those errors, (c) run smatch, (d) audit u32 code
>> for correctness, (e) do a lot more testing so far only very
>> basic testing has been done. I tried to put some reasonable
>> comments in the commit logs but yes they need more work.
>>
>> Cong, if its not too much to ask can we use this as a base
>> set of patches for this work? I think its reasonably close to
>> correct as is.
>>
>
> Sure, just that:
>
> 1) I myself don't like playing RCU list without using list_head API
> it is still hard for me to read.
I think it's a reasonably common practice, and if we don't need the
prev pointer we can save a pointer per node.
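For what it's worth, the two styles side by side (just a sketch,
reusing structures already in this series):

	/* open coded: a single __rcu next pointer per node, as in
	 * cls_rsvp/cls_u32
	 */
	struct rsvp_filter __rcu *next;

	for (f = rcu_dereference_bh(s->ht[h2]); f;
	     f = rcu_dereference_bh(f->next))
		/* ... */;

	/* list_head API: prev and next per node, but the helpers hide
	 * the rcu_dereference()/rcu_assign_pointer() calls, as in
	 * cls_bpf
	 */
	struct list_head link;

	list_for_each_entry_rcu(prog, &head->plist, link)
		/* ... */;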
>
> 2) The first patch in your series seems completely irrelevant to
> $subject. :)
If the intent is to drop the qdisc lock around the ingress qdisc and
use the RCU APIs, I want to be sure to annotate everything so we can
use the analysis tools to catch any errors. Smatch and others really
are pretty good at catching dumb mistakes or missed call sites.
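For example, once a pointer is annotated __rcu, sparse will flag a
plain access (rough sketch, not from a specific patch):

	struct tcf_proto __rcu *filter_list;

	tp = q->filter_list;			/* sparse: different address spaces */

	tp = rcu_dereference_bh(q->filter_list);  /* datapath, under rcu_read_lock_bh() */
	tp = rtnl_dereference(q->filter_list);	  /* control path, RTNL held */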
>
> Thanks.
>
--
John Fastabend Intel Corporation
* Re: [RFC PATCH 00/12] RCU'ify the net:sched classifier chains
2014-01-11 23:33 ` John Fastabend
@ 2014-04-24 23:51 ` Cong Wang
2014-04-30 16:36 ` John Fastabend
0 siblings, 1 reply; 20+ messages in thread
From: Cong Wang @ 2014-04-24 23:51 UTC (permalink / raw)
To: John Fastabend
Cc: Jamal Hadi Salim, Eric Dumazet, Linux Kernel Network Developers,
David Miller
Hi, John
Do you have any plan to update this patchset and resend it?
Thanks.
* Re: [RFC PATCH 00/12] RCU'ify the net:sched classifier chains
2014-04-24 23:51 ` Cong Wang
@ 2014-04-30 16:36 ` John Fastabend
0 siblings, 0 replies; 20+ messages in thread
From: John Fastabend @ 2014-04-30 16:36 UTC (permalink / raw)
To: Cong Wang
Cc: Jamal Hadi Salim, Eric Dumazet, Linux Kernel Network Developers,
David Miller
On 04/24/2014 04:51 PM, Cong Wang wrote:
> Hi, John
>
> Do you have any plan to update this patchset and resend them?
>
> Thanks.
>
OK, I updated the patch set and sent it out. Still need to get
the latest perf numbers for an official submission. any testing
and/or review you can do would be great. Hoping to wrap this up
and get it out soon.
Thanks,
John
--
John Fastabend Intel Corporation
* Re: [RFC PATCH 00/12] RCU'ify the net:sched classifier chains
2014-01-10 9:36 [RFC PATCH 00/12] RCU'ify the net:sched classifier chains John Fastabend
` (12 preceding siblings ...)
2014-01-11 19:43 ` [RFC PATCH 00/12] RCU'ify the net:sched classifier chains Cong Wang
@ 2014-01-12 13:28 ` Jamal Hadi Salim
2014-01-12 13:57 ` Jamal Hadi Salim
13 siblings, 1 reply; 20+ messages in thread
From: Jamal Hadi Salim @ 2014-01-12 13:28 UTC (permalink / raw)
To: John Fastabend, xiyou.wangcong, eric.dumazet; +Cc: netdev, davem
On 01/10/14 04:36, John Fastabend wrote:
>
> The ingress qdisc is a simple qdisc which doesn't maintain any
> actual list of skb's and is primarily a hook to attach filters.
> Further the only qdisc that can be attached to the ingress qdisc
> is sch_ingress. The qdisc lock is currently serializing two
> operations (1) tc_classify which is addressed here and (2)
> statistics accounting. The second point is not solved here but
> it could be a matter of making the bstats and qstats per cpu
> stats.
I think, as you observed in your other email, there is a little more
than just the stats to deal with (on the qdiscs as well); some of it
is internal to specific filters, etc.
> This is an RFC for now and needs some more work. Some items
> I know about are (a) an audit of the ematch code paths, (b) resolving
> the checkpatch errors mostly due to moving code around that
> generates those errors, (c) run smatch, (d) audit u32 code
> for correctness,
OK, my feel-good dial went up reading (d) above ;->
>(e) do a lot more testing so far only very
> basic testing has been done. I tried to put some reasonable
> comments in the commit logs but yes they need more work.
>
Things like MSI-based devices, massive single tun device tests, ifb
with redirect, etc. would help clean things up.
> Cong, if its not too much to ask can we use this as a base
> set of patches for this work? I think its reasonably close to
> correct as is.
>
I believe these are based on your original patches, John, no?
So I would agree we use these as a base.
I will scan through the patches...
This is fun stuff - I will try to participate whenever I can
(unfortunately not much time at the moment).
cheers,
jamal
> Thanks! John.
>
> ---
>
> John Fastabend (12):
> net: qdisc: use rcu prefix and silence sparse warnings
> net: rcu-ify tcf_proto
> net: sched: cls_basic use RCU
> net: sched: cls_cgroup use RCU
> net: sched: cls_flow use RCU
> net: sched: fw use RCU
> net: sched: RCU cls_route
> net: sched: RCU cls_tcindex
> net: sched: make cls_u32 lockless
> net: sched: rcu'ify cls_rsvp
> net: make cls_bpf rcu safe
> net: sched: make tc_action safe to walk under RCU
>
>
> include/linux/netdevice.h | 41 +-----
> include/linux/rtnetlink.h | 10 +
> include/net/act_api.h | 1
> include/net/pkt_cls.h | 12 +-
> include/net/sch_generic.h | 34 ++++-
> net/core/dev.c | 54 +++++++
> net/sched/act_api.c | 18 +-
> net/sched/cls_api.c | 44 +++---
> net/sched/cls_basic.c | 82 ++++++-----
> net/sched/cls_bpf.c | 79 ++++++-----
> net/sched/cls_cgroup.c | 65 ++++++---
> net/sched/cls_flow.c | 145 ++++++++++++--------
> net/sched/cls_fw.c | 112 +++++++++++----
> net/sched/cls_route.c | 218 +++++++++++++++++-------------
> net/sched/cls_rsvp.h | 152 ++++++++++++---------
> net/sched/cls_tcindex.c | 327 ++++++++++++++++++++++++++-------------------
> net/sched/cls_u32.c | 258 +++++++++++++++++++++++-------------
> net/sched/sch_api.c | 6 -
> net/sched/sch_atm.c | 30 +++-
> net/sched/sch_cbq.c | 21 ++-
> net/sched/sch_choke.c | 18 ++
> net/sched/sch_drr.c | 10 +
> net/sched/sch_dsmark.c | 8 +
> net/sched/sch_fq_codel.c | 11 +-
> net/sched/sch_generic.c | 4 -
> net/sched/sch_hfsc.c | 17 ++
> net/sched/sch_htb.c | 23 ++-
> net/sched/sch_ingress.c | 8 +
> net/sched/sch_mqprio.c | 4 -
> net/sched/sch_multiq.c | 8 +
> net/sched/sch_prio.c | 11 +-
> net/sched/sch_qfq.c | 9 +
> net/sched/sch_sfb.c | 15 +-
> net/sched/sch_sfq.c | 11 +-
> net/sched/sch_teql.c | 9 +
> 35 files changed, 1139 insertions(+), 736 deletions(-)
>
* Re: [RFC PATCH 00/12] RCU'ify the net:sched classifier chains
2014-01-12 13:28 ` Jamal Hadi Salim
@ 2014-01-12 13:57 ` Jamal Hadi Salim
2014-01-12 14:18 ` Jamal Hadi Salim
0 siblings, 1 reply; 20+ messages in thread
From: Jamal Hadi Salim @ 2014-01-12 13:57 UTC (permalink / raw)
To: John Fastabend, xiyou.wangcong, eric.dumazet; +Cc: netdev, davem
On 01/12/14 08:28, Jamal Hadi Salim wrote:
> I will scan through the patches...
>
I looked and here's a general question:
Does even using RCU make any sense here? What we have
is a lot of updates and very few reads (reads essentially
are done from the control side; the data path is all about updates).
I am not sure if RCU is a win in such a case - it could make things
worse. At least that used to be the Truth(tm) many moons back.
Is that not the case anymore?
cheers,
jamal
> This is fun stuff - I will try to participate whenever i can
> (unfortunately not much time at the moment).
>
* Re: [RFC PATCH 00/12] RCU'ify the net:sched classifier chains
2014-01-12 13:57 ` Jamal Hadi Salim
@ 2014-01-12 14:18 ` Jamal Hadi Salim
0 siblings, 0 replies; 20+ messages in thread
From: Jamal Hadi Salim @ 2014-01-12 14:18 UTC (permalink / raw)
To: John Fastabend, xiyou.wangcong, eric.dumazet; +Cc: netdev, davem
On 01/12/14 08:57, Jamal Hadi Salim wrote:
> I looked and here's a general question:
> Does even using RCU make any sense here? What we have
> is a lot of updates and very few reads (reads essentially
> are done from the control side; the data path is all about updates).
>
> I am not sure if RCU is a win in such a case - it could make things
> worse. At least that used to be the Truth(tm) many moons back.
> Is that not the case anymore?
>
Never mind.
You are not trying to make the stats RCU - rather the list
of filters and actions (which is read mostly from the data path).
Looking at the u32 piece - I think this is in the right
direction. Good stuff, John!
cheers,
jamal