From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
To: Changli Gao <xiaosuo@gmail.com>
Cc: Patrick McHardy <kaber@trash.net>,
netfilter-devel@vger.kernel.org,
Eric Dumazet <eric.dumazet@gmail.com>
Subject: Re: [PATCH] nfnetlink_queue: use hash table to speed up entry lookup
Date: Wed, 21 Apr 2010 13:23:57 -0700
Message-ID: <20100421202357.GK2563@linux.vnet.ibm.com>
In-Reply-To: <1271773896-28246-1-git-send-email-xiaosuo@gmail.com>
On Tue, Apr 20, 2010 at 10:31:36PM +0800, Changli Gao wrote:
> use hash table to speed up entry lookup
>
> A hash table is used to speed up entry lookup when verdicts aren't received
> in order. The size of the hash table can be specified via
> NFQA_CFG_QUEUE_HTBLSIZ; its default value is 1. Reciprocal division is used
> to lower the cost of division, and entry IDs are generated carefully to get
> a fair distribution of entries across the buckets of the hash table.
A few questions interspersed below.
Thanx, Paul
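(Background for readers unfamiliar with the trick: reciprocal division
replaces a hardware divide with a multiply and a shift. A minimal userspace
sketch of the idiom, mirroring the arithmetic of lib/reciprocal_div.c in the
patch below -- the my_* names are made up for illustration:

	#include <stdint.h>

	/* R = ceil(2^32 / k), precomputed once per divisor k */
	static uint32_t my_reciprocal_value(uint32_t k)
	{
		return (uint32_t)(((1ULL << 32) + k - 1) / k);
	}

	/* a / k becomes one multiply and one shift */
	static uint32_t my_reciprocal_divide(uint32_t a, uint32_t R)
	{
		return (uint32_t)(((uint64_t)a * R) >> 32);
	}

The result is only an approximation of a / k, which is why the patch
generates its packet IDs carefully rather than hashing arbitrary values.)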
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> ---
> include/linux/netfilter/nfnetlink_queue.h |   1
> init/Kconfig                              |   1
> lib/Kconfig                               |   3
> lib/Makefile                              |   4
> lib/reciprocal_div.c                      |   2
> net/netfilter/Kconfig                     |   1
> net/netfilter/nfnetlink_queue.c           | 252 +++++++++++++++++++++++++-----
> 7 files changed, 227 insertions(+), 37 deletions(-)
> diff --git a/include/linux/netfilter/nfnetlink_queue.h b/include/linux/netfilter/nfnetlink_queue.h
> index 2455fe5..77b1566 100644
> --- a/include/linux/netfilter/nfnetlink_queue.h
> +++ b/include/linux/netfilter/nfnetlink_queue.h
> @@ -83,6 +83,7 @@ enum nfqnl_attr_config {
> NFQA_CFG_CMD, /* nfqnl_msg_config_cmd */
> NFQA_CFG_PARAMS, /* nfqnl_msg_config_params */
> NFQA_CFG_QUEUE_MAXLEN, /* __u32 */
> + NFQA_CFG_QUEUE_HTBLSIZ, /* __u32 */
> __NFQA_CFG_MAX
> };
> #define NFQA_CFG_MAX (__NFQA_CFG_MAX-1)
> diff --git a/init/Kconfig b/init/Kconfig
> index cb6069e..4b4266f 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1059,6 +1059,7 @@ choice
>
> config SLAB
> bool "SLAB"
> + select RECIPROCAL_DIV
> help
> The regular slab allocator that is established and known to work
> well in all environments. It organizes cache hot objects in
> diff --git a/lib/Kconfig b/lib/Kconfig
> index af12831..0c4b5ec 100644
> --- a/lib/Kconfig
> +++ b/lib/Kconfig
> @@ -231,4 +231,7 @@ config IOQ
>
> If unsure, say N
>
> +config RECIPROCAL_DIV
> + bool
> +
> endmenu
> diff --git a/lib/Makefile b/lib/Makefile
> index 0a6ab6f..c3555bd 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -10,7 +10,7 @@ endif
> lib-y := ctype.o string.o vsprintf.o cmdline.o \
> rbtree.o radix-tree.o dump_stack.o \
> idr.o int_sqrt.o extable.o prio_tree.o \
> - sha1.o irq_regs.o reciprocal_div.o argv_split.o \
> + sha1.o irq_regs.o argv_split.o \
> proportions.o prio_heap.o ratelimit.o show_mem.o \
> is_single_threaded.o plist.o decompress.o flex_array.o
>
> @@ -103,6 +103,8 @@ obj-$(CONFIG_GENERIC_CSUM) += checksum.o
>
> obj-$(CONFIG_GENERIC_ATOMIC64) += atomic64.o
>
> +obj-$(CONFIG_RECIPROCAL_DIV) += reciprocal_div.o
> +
> hostprogs-y := gen_crc32table
> clean-files := crc32table.h
>
> diff --git a/lib/reciprocal_div.c b/lib/reciprocal_div.c
> index 6a3bd48..39f2e5e 100644
> --- a/lib/reciprocal_div.c
> +++ b/lib/reciprocal_div.c
> @@ -1,5 +1,6 @@
> #include <asm/div64.h>
> #include <linux/reciprocal_div.h>
> +#include <linux/module.h>
>
> u32 reciprocal_value(u32 k)
> {
> @@ -7,3 +8,4 @@ u32 reciprocal_value(u32 k)
> do_div(val, k);
> return (u32)val;
> }
> +EXPORT_SYMBOL(reciprocal_value);
> diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
> index 18d77b5..40b34d5 100644
> --- a/net/netfilter/Kconfig
> +++ b/net/netfilter/Kconfig
> @@ -8,6 +8,7 @@ config NETFILTER_NETLINK_QUEUE
> tristate "Netfilter NFQUEUE over NFNETLINK interface"
> depends on NETFILTER_ADVANCED
> select NETFILTER_NETLINK
> + select RECIPROCAL_DIV
> help
> If this option is enabled, the kernel will include support
> for queueing packets via NFNETLINK.
> diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
> index e70a6ef..d3d02b7 100644
> --- a/net/netfilter/nfnetlink_queue.c
> +++ b/net/netfilter/nfnetlink_queue.c
> @@ -28,6 +28,8 @@
> #include <linux/netfilter/nfnetlink.h>
> #include <linux/netfilter/nfnetlink_queue.h>
> #include <linux/list.h>
> +#include <linux/vmalloc.h>
> +#include <linux/reciprocal_div.h>
> #include <net/sock.h>
> #include <net/netfilter/nf_queue.h>
>
> @@ -37,11 +39,13 @@
> #include "../bridge/br_private.h"
> #endif
>
> -#define NFQNL_QMAX_DEFAULT 1024
> +#define NFQNL_QMAX_DEFAULT 1024
> +#define NFQNL_QHTBLSIZ_DEFAULT 1
>
> struct nfqnl_instance {
> struct hlist_node hlist; /* global list of queues */
> struct rcu_head rcu;
> + struct work_struct work;
>
> int peer_pid;
> unsigned int queue_maxlen;
> @@ -49,15 +53,21 @@ struct nfqnl_instance {
> unsigned int queue_total;
> unsigned int queue_dropped;
> unsigned int queue_user_dropped;
> + unsigned int queue_htblsiz;
> + u32 reciprocal_value;
>
> unsigned int id_sequence; /* 'sequence' of pkt ids */
> + unsigned int id_increment;
> + unsigned int id_offset;
> + unsigned int id_limit;
>
> u_int16_t queue_num; /* number of this queue */
> u_int8_t copy_mode;
>
> spinlock_t lock;
>
> - struct list_head queue_list; /* packets in queue */
> + struct list_head *queue_htbl; /* packets in queue */
> + bool vmalloc;
> };
>
> typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long);
> @@ -87,49 +97,87 @@ instance_lookup(u_int16_t queue_num)
> return NULL;
> }
>
> +static void instance_destroy_work(struct work_struct *work)
> +{
> + struct nfqnl_instance *inst;
> +
> + inst = container_of(work, struct nfqnl_instance, work);
> + if (inst->vmalloc)
> + vfree(inst->queue_htbl);
> + else
> + kfree(inst->queue_htbl);
> + kfree(inst);
> + module_put(THIS_MODULE);
> +}
> +
> static struct nfqnl_instance *
> instance_create(u_int16_t queue_num, int pid)
> {
> struct nfqnl_instance *inst;
> - unsigned int h;
> + unsigned int h, i;
> int err;
>
> - spin_lock(&instances_lock);
> - if (instance_lookup(queue_num)) {
> - err = -EEXIST;
> - goto out_unlock;
> - }
> -
> + rcu_read_unlock();
This seems strange -- are all the callers aware that instance_create()
temporarily exits the RCU read-side critical section?
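For instance, a caller shaped like this (hypothetical, condensed for
illustration) could be left holding a stale pointer, since anything it
looked up under the outer rcu_read_lock() may have been freed while
instance_create() had the read lock dropped:

	rcu_read_lock();
	queue = instance_lookup(queue_num);
	if (!queue) {
		/* instance_create() drops and re-takes the RCU read lock */
		queue = instance_create(queue_num, pid);
	}
	/* is everything read before instance_create() still valid here? */
	rcu_read_unlock();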
> inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
> if (!inst) {
> err = -ENOMEM;
> - goto out_unlock;
> + goto out_lock;
> }
>
> + INIT_WORK(&inst->work, instance_destroy_work);
> inst->queue_num = queue_num;
> inst->peer_pid = pid;
> inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
> inst->copy_range = 0xfffff;
> inst->copy_mode = NFQNL_COPY_NONE;
> spin_lock_init(&inst->lock);
> - INIT_LIST_HEAD(&inst->queue_list);
> + inst->queue_htblsiz = NFQNL_QHTBLSIZ_DEFAULT;
> + inst->id_increment = INT_MAX / inst->queue_htblsiz;
> + inst->id_limit = inst->id_increment * inst->queue_htblsiz;
> + inst->reciprocal_value = reciprocal_value(inst->id_increment);
> + inst->queue_htbl = kmalloc(sizeof(struct list_head) *
> + inst->queue_htblsiz, GFP_KERNEL);
> + if (inst->queue_htbl == NULL) {
> + inst->queue_htbl = vmalloc(sizeof(struct list_head) *
> + inst->queue_htblsiz);
> + if (inst->queue_htbl == NULL) {
> + err = -ENOMEM;
> + goto out_free;
> + }
> + inst->vmalloc = true;
> + }
> + for (i = 0; i < inst->queue_htblsiz; i++)
> + INIT_LIST_HEAD(&inst->queue_htbl[i]);
>
> if (!try_module_get(THIS_MODULE)) {
> err = -EAGAIN;
> goto out_free;
> }
> + rcu_read_lock();
>
> + spin_lock(&instances_lock);
> + if (instance_lookup(queue_num)) {
> + err = -EEXIST;
> + spin_unlock(&instances_lock);
> + rcu_read_unlock();
> + goto out_free;
> + }
> h = instance_hashfn(queue_num);
> hlist_add_head_rcu(&inst->hlist, &instance_table[h]);
> -
> spin_unlock(&instances_lock);
>
> return inst;
>
> out_free:
> + if (inst->queue_htbl) {
> + if (inst->vmalloc)
> + vfree(inst->queue_htbl);
> + else
> + kfree(inst->queue_htbl);
> + }
> kfree(inst);
> -out_unlock:
> - spin_unlock(&instances_lock);
> +out_lock:
> + rcu_read_lock();
> return ERR_PTR(err);
> }
>
> @@ -143,8 +191,7 @@ instance_destroy_rcu(struct rcu_head *head)
> rcu);
>
> nfqnl_flush(inst, NULL, 0);
> - kfree(inst);
> - module_put(THIS_MODULE);
> + schedule_work(&inst->work);
> }
>
> static void
> @@ -162,32 +209,67 @@ instance_destroy(struct nfqnl_instance *inst)
> spin_unlock(&instances_lock);
> }
>
> +static inline struct list_head *nfqnl_head_get(struct nfqnl_instance *queue,
> + unsigned int id)
> +{
> + return &queue->queue_htbl[reciprocal_divide(id,
> + queue->reciprocal_value)];
> +}
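(Per include/linux/reciprocal_div.h at the time, the bucket computation
above expands to roughly:

	/* approximates id / queue->id_increment: one multiply, no divide */
	bucket = (u32)(((u64)id * queue->reciprocal_value) >> 32);

i.e. a multiply-and-shift approximation of id / id_increment; keeping the
result inside [0, queue_htblsiz) relies on how the IDs are generated.)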
> +
> +static struct nf_queue_entry *__find_entry(struct nfqnl_instance *queue,
> + u32 id)
> +{
> + struct nf_queue_entry *entry;
> + struct list_head *head;
> +
> + head = nfqnl_head_get(queue, id);
> + list_for_each_entry(entry, head, list) {
> + if (entry->id == id)
> + return entry;
> + }
> +
> + return NULL;
> +}
> +
> +static u32 __get_uniq_id(struct nfqnl_instance *queue)
> +{
> + u32 i;
> +
> + for (i = 0; i < INT_MAX; i++) {
> + queue->id_sequence += queue->id_increment;
> + if (queue->id_sequence >= queue->id_limit) {
> + if (++queue->id_offset >= queue->id_increment)
> + queue->id_offset = 0;
> + queue->id_sequence = queue->id_offset;
> + }
> + if (__find_entry(queue, queue->id_sequence) == NULL)
> + return queue->id_sequence;
> + }
> +
> + return INT_MAX;
> +}
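For reference, a worked example of the ID scheme above (illustrative
numbers, assuming queue_htblsiz == 4 and a freshly created instance):

	id_increment = INT_MAX / 4   = 536870911
	id_limit     = 536870911 * 4 = 2147483644

	1st id:  536870911   /* 536870911 / id_increment -> bucket 1 */
	2nd id: 1073741822   /* bucket 2 */
	3rd id: 1610612733   /* bucket 3 */
	4th id: 2147483644 >= id_limit, so wrap: id_offset becomes 1
	        and the id is 1              /* bucket 0 */
	5th id:  536870912   /* bucket 1 again */

Consecutive IDs land in consecutive buckets, and within a bucket the IDs
stay distinct until id_offset has cycled through all id_increment values.
INT_MAX doubles as the "no free ID" sentinel, which is why
nfqnl_build_packet_message() below checks entry->id == INT_MAX.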
> +
> static inline void
> __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
> {
> - list_add_tail(&entry->list, &queue->queue_list);
> - queue->queue_total++;
> + struct list_head *head;
> +
> + head = nfqnl_head_get(queue, entry->id);
> + list_add_tail(&entry->list, head);
> + queue->queue_total++;
> }
>
> static struct nf_queue_entry *
> find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
> {
> - struct nf_queue_entry *entry = NULL, *i;
> + struct nf_queue_entry *entry;
>
> spin_lock_bh(&queue->lock);
> -
> - list_for_each_entry(i, &queue->queue_list, list) {
> - if (i->id == id) {
> - entry = i;
> - break;
> - }
> - }
> -
> + entry = __find_entry(queue, id);
> if (entry) {
> list_del(&entry->list);
> queue->queue_total--;
> }
> -
> spin_unlock_bh(&queue->lock);
>
> return entry;
> @@ -197,13 +279,22 @@ static void
> nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
> {
> struct nf_queue_entry *entry, *next;
> + unsigned int i, total;
> + struct list_head *head;
>
> spin_lock_bh(&queue->lock);
> - list_for_each_entry_safe(entry, next, &queue->queue_list, list) {
> - if (!cmpfn || cmpfn(entry, data)) {
> - list_del(&entry->list);
> - queue->queue_total--;
> - nf_reinject(entry, NF_DROP);
> + total = queue->queue_total;
> + for (i = 0; i < queue->queue_htblsiz; i++) {
> + if (total < 1)
> + break;
> + head = &queue->queue_htbl[i];
> + list_for_each_entry_safe(entry, next, head, list) {
> + if (!cmpfn || cmpfn(entry, data)) {
> + list_del(&entry->list);
> + queue->queue_total--;
> + nf_reinject(entry, NF_DROP);
> + }
> + --total;
> }
> }
> spin_unlock_bh(&queue->lock);
> @@ -262,7 +353,12 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
> break;
> }
>
> - entry->id = queue->id_sequence++;
> + entry->id = __get_uniq_id(queue);
> + if (entry->id == INT_MAX) {
> + spin_unlock_bh(&queue->lock);
> + return NULL;
> + }
> + __enqueue_entry(queue, entry);
>
> spin_unlock_bh(&queue->lock);
>
> @@ -379,6 +475,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
>
> nlmsg_failure:
> nla_put_failure:
> + find_dequeue_entry(queue, entry->id);
> if (skb)
> kfree_skb(skb);
> if (net_ratelimit())
> @@ -426,14 +523,14 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
> goto err_out_unlock;
> }
>
> - __enqueue_entry(queue, entry);
> -
> spin_unlock_bh(&queue->lock);
> return 0;
>
> err_out_free_nskb:
> kfree_skb(nskb);
> err_out_unlock:
> + list_del(&entry->list);
> + queue->queue_total--;
> spin_unlock_bh(&queue->lock);
> err_out:
> return -1;
> @@ -686,6 +783,77 @@ static const struct nf_queue_handler nfqh = {
> .outfn = &nfqnl_enqueue_packet,
> };
>
> +static int nfqnl_htbl_resize(u16 queue_num, int pid, unsigned int size)
> +{
> + struct nfqnl_instance *queue;
> + unsigned int i, total;
> + struct list_head *h, *htbl;
> + bool is_vmalloc;
> + int err;
> +
> + if (size < 1 || size > INT_MAX)
> + return -EINVAL;
> +
> + rcu_read_unlock();
Again, this seems strange. As near as I can tell, the caller immediately
exits the RCU read-side critical section, so why not have the caller do
the exit immediately before calling nfqnl_htbl_resize()?
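That is, the caller could do the unlock itself (untested sketch, condensed
from nfqnl_recv_config() as modified below):

	if (nfqa[NFQA_CFG_QUEUE_HTBLSIZ]) {
		__be32 *htblsiz = nla_data(nfqa[NFQA_CFG_QUEUE_HTBLSIZ]);

		rcu_read_unlock();
		ret = nfqnl_htbl_resize(queue_num, NETLINK_CB(skb).pid,
					ntohl(*htblsiz));
		rcu_read_lock();
	}

which would let nfqnl_htbl_resize() acquire and release the read-side lock
symmetrically around its own instance_lookup().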
> + htbl = kmalloc(sizeof(*h) * size, GFP_KERNEL);
> + if (htbl == NULL) {
> + htbl = vmalloc(sizeof(*h) * size);
> + if (htbl == NULL) {
> + err = -ENOMEM;
> + goto out_lock;
> + }
> + is_vmalloc = true;
> + } else {
> + is_vmalloc = false;
> + }
> + rcu_read_lock();
> +
> + queue = instance_lookup(queue_num);
> + if (!queue || queue->peer_pid != pid) {
> + err = -ENODEV;
> + goto out_free;
> + }
> +
> + for (i = 0; i < size; i++)
> + INIT_LIST_HEAD(&htbl[i]);
> + spin_lock_bh(&queue->lock);
> + swap(size, queue->queue_htblsiz);
> + queue->id_increment = INT_MAX / queue->queue_htblsiz;
> + queue->id_limit = queue->id_increment * queue->queue_htblsiz;
> + if (queue->id_offset >= queue->id_increment)
> + queue->id_offset = 0;
> + if (queue->id_sequence >= queue->id_limit)
> + queue->id_sequence = queue->id_offset;
> + queue->reciprocal_value = reciprocal_value(queue->id_increment);
> + swap(htbl, queue->queue_htbl);
> + swap(is_vmalloc, queue->vmalloc);
> + total = queue->queue_total;
> + for (i = 0; i < size; i++) {
> + struct nf_queue_entry *entry, *next;
> +
> + if (total < 1)
> + break;
> + h = &queue->queue_htbl[i];
> + list_for_each_entry_safe(entry, next, h, list) {
> + list_move_tail(&entry->list,
> + nfqnl_head_get(queue, entry->id));
> + --total;
> + }
> + }
> + spin_unlock_bh(&queue->lock);
> + err = 0;
> +
> +out_free:
> + rcu_read_unlock();
> + if (is_vmalloc)
> + vfree(htbl);
> + else
> + kfree(htbl);
> +out_lock:
> + rcu_read_lock();
> + return err;
> +}
> +
> static int
> nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
> const struct nlmsghdr *nlh,
> @@ -772,6 +940,18 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
> spin_unlock_bh(&queue->lock);
> }
>
> + if (nfqa[NFQA_CFG_QUEUE_HTBLSIZ]) {
> + __be32 *htblsiz;
> +
> + if (!queue) {
> + ret = -ENODEV;
> + goto err_out_unlock;
> + }
> + htblsiz = nla_data(nfqa[NFQA_CFG_QUEUE_HTBLSIZ]);
> + ret = nfqnl_htbl_resize(queue_num, NETLINK_CB(skb).pid,
> + ntohl(*htblsiz));
> + }
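For completeness, the attribute payload is a big-endian 32-bit value (hence
the ntohl() above). A hypothetical userspace snippet appending it by hand to
an nfnetlink config message -- buf and msg_len are placeholders, and a real
client would use libnetfilter_queue or libmnl attribute helpers instead:

	#include <linux/netlink.h>
	#include <arpa/inet.h>
	#include <string.h>

	struct nlattr *nla = (struct nlattr *)(buf + NLMSG_ALIGN(msg_len));
	uint32_t siz = htonl(1024);		/* desired bucket count */

	nla->nla_type = NFQA_CFG_QUEUE_HTBLSIZ;
	nla->nla_len  = NLA_HDRLEN + sizeof(siz);
	memcpy((char *)nla + NLA_HDRLEN, &siz, sizeof(siz));
	msg_len = NLMSG_ALIGN(msg_len) + NLA_ALIGN(nla->nla_len);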
> +
> err_out_unlock:
> rcu_read_unlock();
> return ret;
> --