From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
To: Changli Gao <xiaosuo@gmail.com>
Cc: Patrick McHardy <kaber@trash.net>,
netfilter-devel@vger.kernel.org,
Eric Dumazet <eric.dumazet@gmail.com>
Subject: Re: [PATCH] nfnetlink_queue: use hash table to speed up entry lookup
Date: Wed, 21 Apr 2010 13:23:57 -0700
Message-ID: <20100421202357.GK2563@linux.vnet.ibm.com>
In-Reply-To: <1271773896-28246-1-git-send-email-xiaosuo@gmail.com>
On Tue, Apr 20, 2010 at 10:31:36PM +0800, Changli Gao wrote:
> use hash table to speed up entry lookup
>
> A hash table is used to speed up entry lookup when verdicts aren't received
> in order. The size of the hash table can be specified via
> NFQA_CFG_QUEUE_HTBLSIZ; its default value is 1. Reciprocal division is used
> to lower the cost of division, and entry IDs are generated carefully to get
> a fair distribution of entries across the buckets of the hash table.
A few questions interspersed below.
Thanx, Paul
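(Background for readers unfamiliar with the trick: reciprocal division
replaces a hardware divide with a multiply and a shift. A minimal userspace
sketch of the idiom, mirroring the arithmetic of lib/reciprocal_div.c in the
patch below -- the my_* names are made up for illustration:

	#include <stdint.h>

	/* R = ceil(2^32 / k), precomputed once per divisor k */
	static uint32_t my_reciprocal_value(uint32_t k)
	{
		return (uint32_t)(((1ULL << 32) + k - 1) / k);
	}

	/* a / k becomes one multiply and one shift */
	static uint32_t my_reciprocal_divide(uint32_t a, uint32_t R)
	{
		return (uint32_t)(((uint64_t)a * R) >> 32);
	}

The result is only an approximation of a / k, which is why the patch
generates its packet IDs carefully rather than hashing arbitrary values.)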
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> ---
> include/linux/netfilter/nfnetlink_queue.h |   1
> init/Kconfig                              |   1
> lib/Kconfig                               |   3
> lib/Makefile                              |   4
> lib/reciprocal_div.c                      |   2
> net/netfilter/Kconfig                     |   1
> net/netfilter/nfnetlink_queue.c           | 252 +++++++++++++++++++++++++-----
> 7 files changed, 227 insertions(+), 37 deletions(-)
> diff --git a/include/linux/netfilter/nfnetlink_queue.h b/include/linux/netfilter/nfnetlink_queue.h
> index 2455fe5..77b1566 100644
> --- a/include/linux/netfilter/nfnetlink_queue.h
> +++ b/include/linux/netfilter/nfnetlink_queue.h
> @@ -83,6 +83,7 @@ enum nfqnl_attr_config {
> NFQA_CFG_CMD, /* nfqnl_msg_config_cmd */
> NFQA_CFG_PARAMS, /* nfqnl_msg_config_params */
> NFQA_CFG_QUEUE_MAXLEN, /* __u32 */
> + NFQA_CFG_QUEUE_HTBLSIZ, /* __u32 */
> __NFQA_CFG_MAX
> };
> #define NFQA_CFG_MAX (__NFQA_CFG_MAX-1)
> diff --git a/init/Kconfig b/init/Kconfig
> index cb6069e..4b4266f 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1059,6 +1059,7 @@ choice
>
> config SLAB
> bool "SLAB"
> + select RECIPROCAL_DIV
> help
> The regular slab allocator that is established and known to work
> well in all environments. It organizes cache hot objects in
> diff --git a/lib/Kconfig b/lib/Kconfig
> index af12831..0c4b5ec 100644
> --- a/lib/Kconfig
> +++ b/lib/Kconfig
> @@ -231,4 +231,7 @@ config IOQ
>
> If unsure, say N
>
> +config RECIPROCAL_DIV
> + bool
> +
> endmenu
> diff --git a/lib/Makefile b/lib/Makefile
> index 0a6ab6f..c3555bd 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -10,7 +10,7 @@ endif
> lib-y := ctype.o string.o vsprintf.o cmdline.o \
> rbtree.o radix-tree.o dump_stack.o \
> idr.o int_sqrt.o extable.o prio_tree.o \
> - sha1.o irq_regs.o reciprocal_div.o argv_split.o \
> + sha1.o irq_regs.o argv_split.o \
> proportions.o prio_heap.o ratelimit.o show_mem.o \
> is_single_threaded.o plist.o decompress.o flex_array.o
>
> @@ -103,6 +103,8 @@ obj-$(CONFIG_GENERIC_CSUM) += checksum.o
>
> obj-$(CONFIG_GENERIC_ATOMIC64) += atomic64.o
>
> +obj-$(CONFIG_RECIPROCAL_DIV) += reciprocal_div.o
> +
> hostprogs-y := gen_crc32table
> clean-files := crc32table.h
>
> diff --git a/lib/reciprocal_div.c b/lib/reciprocal_div.c
> index 6a3bd48..39f2e5e 100644
> --- a/lib/reciprocal_div.c
> +++ b/lib/reciprocal_div.c
> @@ -1,5 +1,6 @@
> #include <asm/div64.h>
> #include <linux/reciprocal_div.h>
> +#include <linux/module.h>
>
> u32 reciprocal_value(u32 k)
> {
> @@ -7,3 +8,4 @@ u32 reciprocal_value(u32 k)
> do_div(val, k);
> return (u32)val;
> }
> +EXPORT_SYMBOL(reciprocal_value);
> diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
> index 18d77b5..40b34d5 100644
> --- a/net/netfilter/Kconfig
> +++ b/net/netfilter/Kconfig
> @@ -8,6 +8,7 @@ config NETFILTER_NETLINK_QUEUE
> tristate "Netfilter NFQUEUE over NFNETLINK interface"
> depends on NETFILTER_ADVANCED
> select NETFILTER_NETLINK
> + select RECIPROCAL_DIV
> help
> If this option is enabled, the kernel will include support
> for queueing packets via NFNETLINK.
> diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
> index e70a6ef..d3d02b7 100644
> --- a/net/netfilter/nfnetlink_queue.c
> +++ b/net/netfilter/nfnetlink_queue.c
> @@ -28,6 +28,8 @@
> #include <linux/netfilter/nfnetlink.h>
> #include <linux/netfilter/nfnetlink_queue.h>
> #include <linux/list.h>
> +#include <linux/vmalloc.h>
> +#include <linux/reciprocal_div.h>
> #include <net/sock.h>
> #include <net/netfilter/nf_queue.h>
>
> @@ -37,11 +39,13 @@
> #include "../bridge/br_private.h"
> #endif
>
> -#define NFQNL_QMAX_DEFAULT 1024
> +#define NFQNL_QMAX_DEFAULT 1024
> +#define NFQNL_QHTBLSIZ_DEFAULT 1
>
> struct nfqnl_instance {
> struct hlist_node hlist; /* global list of queues */
> struct rcu_head rcu;
> + struct work_struct work;
>
> int peer_pid;
> unsigned int queue_maxlen;
> @@ -49,15 +53,21 @@ struct nfqnl_instance {
> unsigned int queue_total;
> unsigned int queue_dropped;
> unsigned int queue_user_dropped;
> + unsigned int queue_htblsiz;
> + u32 reciprocal_value;
>
> unsigned int id_sequence; /* 'sequence' of pkt ids */
> + unsigned int id_increment;
> + unsigned int id_offset;
> + unsigned int id_limit;
>
> u_int16_t queue_num; /* number of this queue */
> u_int8_t copy_mode;
>
> spinlock_t lock;
>
> - struct list_head queue_list; /* packets in queue */
> + struct list_head *queue_htbl; /* packets in queue */
> + bool vmalloc;
> };
>
> typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long);
> @@ -87,49 +97,87 @@ instance_lookup(u_int16_t queue_num)
> return NULL;
> }
>
> +static void instance_destroy_work(struct work_struct *work)
> +{
> + struct nfqnl_instance *inst;
> +
> + inst = container_of(work, struct nfqnl_instance, work);
> + if (inst->vmalloc)
> + vfree(inst->queue_htbl);
> + else
> + kfree(inst->queue_htbl);
> + kfree(inst);
> + module_put(THIS_MODULE);
> +}
> +
> static struct nfqnl_instance *
> instance_create(u_int16_t queue_num, int pid)
> {
> struct nfqnl_instance *inst;
> - unsigned int h;
> + unsigned int h, i;
> int err;
>
> - spin_lock(&instances_lock);
> - if (instance_lookup(queue_num)) {
> - err = -EEXIST;
> - goto out_unlock;
> - }
> -
> + rcu_read_unlock();
This seems strange -- are all the callers aware that instance_create()
temporarily exits the RCU read-side critical section?
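For instance, a caller shaped like this (hypothetical, condensed for
illustration) could be left holding a stale pointer, since anything it
looked up under the outer rcu_read_lock() may have been freed while
instance_create() had the read lock dropped:

	rcu_read_lock();
	queue = instance_lookup(queue_num);
	if (!queue) {
		/* instance_create() drops and re-takes the RCU read lock */
		queue = instance_create(queue_num, pid);
	}
	/* is everything read before instance_create() still valid here? */
	rcu_read_unlock();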
> inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
> if (!inst) {
> err = -ENOMEM;
> - goto out_unlock;
> + goto out_lock;
> }
>
> + INIT_WORK(&inst->work, instance_destroy_work);
> inst->queue_num = queue_num;
> inst->peer_pid = pid;
> inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
> inst->copy_range = 0xfffff;
> inst->copy_mode = NFQNL_COPY_NONE;
> spin_lock_init(&inst->lock);
> - INIT_LIST_HEAD(&inst->queue_list);
> + inst->queue_htblsiz = NFQNL_QHTBLSIZ_DEFAULT;
> + inst->id_increment = INT_MAX / inst->queue_htblsiz;
> + inst->id_limit = inst->id_increment * inst->queue_htblsiz;
> + inst->reciprocal_value = reciprocal_value(inst->id_increment);
> + inst->queue_htbl = kmalloc(sizeof(struct list_head) *
> + inst->queue_htblsiz, GFP_KERNEL);
> + if (inst->queue_htbl == NULL) {
> + inst->queue_htbl = vmalloc(sizeof(struct list_head) *
> + inst->queue_htblsiz);
> + if (inst->queue_htbl == NULL) {
> + err = -ENOMEM;
> + goto out_free;
> + }
> + inst->vmalloc = true;
> + }
> + for (i = 0; i < inst->queue_htblsiz; i++)
> + INIT_LIST_HEAD(&inst->queue_htbl[i]);
>
> if (!try_module_get(THIS_MODULE)) {
> err = -EAGAIN;
> goto out_free;
> }
> + rcu_read_lock();
>
> + spin_lock(&instances_lock);
> + if (instance_lookup(queue_num)) {
> + err = -EEXIST;
> + spin_unlock(&instances_lock);
> + rcu_read_unlock();
> + goto out_free;
> + }
> h = instance_hashfn(queue_num);
> hlist_add_head_rcu(&inst->hlist, &instance_table[h]);
> -
> spin_unlock(&instances_lock);
>
> return inst;
>
> out_free:
> + if (inst->queue_htbl) {
> + if (inst->vmalloc)
> + vfree(inst->queue_htbl);
> + else
> + kfree(inst->queue_htbl);
> + }
> kfree(inst);
> -out_unlock:
> - spin_unlock(&instances_lock);
> +out_lock:
> + rcu_read_lock();
> return ERR_PTR(err);
> }
>
> @@ -143,8 +191,7 @@ instance_destroy_rcu(struct rcu_head *head)
> rcu);
>
> nfqnl_flush(inst, NULL, 0);
> - kfree(inst);
> - module_put(THIS_MODULE);
> + schedule_work(&inst->work);
> }
>
> static void
> @@ -162,32 +209,67 @@ instance_destroy(struct nfqnl_instance *inst)
> spin_unlock(&instances_lock);
> }
>
> +static inline struct list_head *nfqnl_head_get(struct nfqnl_instance *queue,
> + unsigned int id)
> +{
> + return &queue->queue_htbl[reciprocal_divide(id,
> + queue->reciprocal_value)];
> +}
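(Per include/linux/reciprocal_div.h at the time, the bucket computation
above expands to roughly:

	/* approximates id / queue->id_increment: one multiply, no divide */
	bucket = (u32)(((u64)id * queue->reciprocal_value) >> 32);

i.e. a multiply-and-shift approximation of id / id_increment; keeping the
result inside [0, queue_htblsiz) relies on how the IDs are generated.)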
> +
> +static struct nf_queue_entry *__find_entry(struct nfqnl_instance *queue,
> + u32 id)
> +{
> + struct nf_queue_entry *entry;
> + struct list_head *head;
> +
> + head = nfqnl_head_get(queue, id);
> + list_for_each_entry(entry, head, list) {
> + if (entry->id == id)
> + return entry;
> + }
> +
> + return NULL;
> +}
> +
> +static u32 __get_uniq_id(struct nfqnl_instance *queue)
> +{
> + u32 i;
> +
> + for (i = 0; i < INT_MAX; i++) {
> + queue->id_sequence += queue->id_increment;
> + if (queue->id_sequence >= queue->id_limit) {
> + if (++queue->id_offset >= queue->id_increment)
> + queue->id_offset = 0;
> + queue->id_sequence = queue->id_offset;
> + }
> + if (__find_entry(queue, queue->id_sequence) == NULL)
> + return queue->id_sequence;
> + }
> +
> + return INT_MAX;
> +}
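For reference, a worked example of the ID scheme above (illustrative
numbers, assuming queue_htblsiz == 4 and a freshly created instance):

	id_increment = INT_MAX / 4   = 536870911
	id_limit     = 536870911 * 4 = 2147483644

	1st id:  536870911   /* 536870911 / id_increment -> bucket 1 */
	2nd id: 1073741822   /* bucket 2 */
	3rd id: 1610612733   /* bucket 3 */
	4th id: 2147483644 >= id_limit, so wrap: id_offset becomes 1
	        and the id is 1              /* bucket 0 */
	5th id:  536870912   /* bucket 1 again */

Consecutive IDs land in consecutive buckets, and within a bucket the IDs
stay distinct until id_offset has cycled through all id_increment values.
INT_MAX doubles as the "no free ID" sentinel, which is why
nfqnl_build_packet_message() below checks entry->id == INT_MAX.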
> +
> static inline void
> __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
> {
> - list_add_tail(&entry->list, &queue->queue_list);
> - queue->queue_total++;
> + struct list_head *head;
> +
> + head = nfqnl_head_get(queue, entry->id);
> + list_add_tail(&entry->list, head);
> + queue->queue_total++;
> }
>
> static struct nf_queue_entry *
> find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
> {
> - struct nf_queue_entry *entry = NULL, *i;
> + struct nf_queue_entry *entry;
>
> spin_lock_bh(&queue->lock);
> -
> - list_for_each_entry(i, &queue->queue_list, list) {
> - if (i->id == id) {
> - entry = i;
> - break;
> - }
> - }
> -
> + entry = __find_entry(queue, id);
> if (entry) {
> list_del(&entry->list);
> queue->queue_total--;
> }
> -
> spin_unlock_bh(&queue->lock);
>
> return entry;
> @@ -197,13 +279,22 @@ static void
> nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
> {
> struct nf_queue_entry *entry, *next;
> + unsigned int i, total;
> + struct list_head *head;
>
> spin_lock_bh(&queue->lock);
> - list_for_each_entry_safe(entry, next, &queue->queue_list, list) {
> - if (!cmpfn || cmpfn(entry, data)) {
> - list_del(&entry->list);
> - queue->queue_total--;
> - nf_reinject(entry, NF_DROP);
> + total = queue->queue_total;
> + for (i = 0; i < queue->queue_htblsiz; i++) {
> + if (total < 1)
> + break;
> + head = &queue->queue_htbl[i];
> + list_for_each_entry_safe(entry, next, head, list) {
> + if (!cmpfn || cmpfn(entry, data)) {
> + list_del(&entry->list);
> + queue->queue_total--;
> + nf_reinject(entry, NF_DROP);
> + }
> + --total;
> }
> }
> spin_unlock_bh(&queue->lock);
> @@ -262,7 +353,12 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
> break;
> }
>
> - entry->id = queue->id_sequence++;
> + entry->id = __get_uniq_id(queue);
> + if (entry->id == INT_MAX) {
> + spin_unlock_bh(&queue->lock);
> + return NULL;
> + }
> + __enqueue_entry(queue, entry);
>
> spin_unlock_bh(&queue->lock);
>
> @@ -379,6 +475,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
>
> nlmsg_failure:
> nla_put_failure:
> + find_dequeue_entry(queue, entry->id);
> if (skb)
> kfree_skb(skb);
> if (net_ratelimit())
> @@ -426,14 +523,14 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
> goto err_out_unlock;
> }
>
> - __enqueue_entry(queue, entry);
> -
> spin_unlock_bh(&queue->lock);
> return 0;
>
> err_out_free_nskb:
> kfree_skb(nskb);
> err_out_unlock:
> + list_del(&entry->list);
> + queue->queue_total--;
> spin_unlock_bh(&queue->lock);
> err_out:
> return -1;
> @@ -686,6 +783,77 @@ static const struct nf_queue_handler nfqh = {
> .outfn = &nfqnl_enqueue_packet,
> };
>
> +static int nfqnl_htbl_resize(u16 queue_num, int pid, unsigned int size)
> +{
> + struct nfqnl_instance *queue;
> + unsigned int i, total;
> + struct list_head *h, *htbl;
> + bool is_vmalloc;
> + int err;
> +
> + if (size < 1 || size > INT_MAX)
> + return -EINVAL;
> +
> + rcu_read_unlock();
Again, this seems strange. As near as I can tell, the caller immediately
exits the RCU read-side critical section, so why not have the caller do
the exit immediately before calling nfqnl_htbl_resize()?
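That is, the caller could do the unlock itself (untested sketch, condensed
from nfqnl_recv_config() as modified below):

	if (nfqa[NFQA_CFG_QUEUE_HTBLSIZ]) {
		__be32 *htblsiz = nla_data(nfqa[NFQA_CFG_QUEUE_HTBLSIZ]);

		rcu_read_unlock();
		ret = nfqnl_htbl_resize(queue_num, NETLINK_CB(skb).pid,
					ntohl(*htblsiz));
		rcu_read_lock();
	}

which would let nfqnl_htbl_resize() acquire and release the read-side lock
symmetrically around its own instance_lookup().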
> + htbl = kmalloc(sizeof(*h) * size, GFP_KERNEL);
> + if (htbl == NULL) {
> + htbl = vmalloc(sizeof(*h) * size);
> + if (htbl == NULL) {
> + err = -ENOMEM;
> + goto out_lock;
> + }
> + is_vmalloc = true;
> + } else {
> + is_vmalloc = false;
> + }
> + rcu_read_lock();
> +
> + queue = instance_lookup(queue_num);
> + if (!queue || queue->peer_pid != pid) {
> + err = -ENODEV;
> + goto out_free;
> + }
> +
> + for (i = 0; i < size; i++)
> + INIT_LIST_HEAD(&htbl[i]);
> + spin_lock_bh(&queue->lock);
> + swap(size, queue->queue_htblsiz);
> + queue->id_increment = INT_MAX / queue->queue_htblsiz;
> + queue->id_limit = queue->id_increment * queue->queue_htblsiz;
> + if (queue->id_offset >= queue->id_increment)
> + queue->id_offset = 0;
> + if (queue->id_sequence >= queue->id_limit)
> + queue->id_sequence = queue->id_offset;
> + queue->reciprocal_value = reciprocal_value(queue->id_increment);
> + swap(htbl, queue->queue_htbl);
> + swap(is_vmalloc, queue->vmalloc);
> + total = queue->queue_total;
> + for (i = 0; i < size; i++) {
> + struct nf_queue_entry *entry, *next;
> +
> + if (total < 1)
> + break;
> + h = &queue->queue_htbl[i];
> + list_for_each_entry_safe(entry, next, h, list) {
> + list_move_tail(&entry->list,
> + nfqnl_head_get(queue, entry->id));
> + --total;
> + }
> + }
> + spin_unlock_bh(&queue->lock);
> + err = 0;
> +
> +out_free:
> + rcu_read_unlock();
> + if (is_vmalloc)
> + vfree(htbl);
> + else
> + kfree(htbl);
> +out_lock:
> + rcu_read_lock();
> + return err;
> +}
> +
> static int
> nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
> const struct nlmsghdr *nlh,
> @@ -772,6 +940,18 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
> spin_unlock_bh(&queue->lock);
> }
>
> + if (nfqa[NFQA_CFG_QUEUE_HTBLSIZ]) {
> + __be32 *htblsiz;
> +
> + if (!queue) {
> + ret = -ENODEV;
> + goto err_out_unlock;
> + }
> + htblsiz = nla_data(nfqa[NFQA_CFG_QUEUE_HTBLSIZ]);
> + ret = nfqnl_htbl_resize(queue_num, NETLINK_CB(skb).pid,
> + ntohl(*htblsiz));
> + }
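For completeness, the attribute payload is a big-endian 32-bit value (hence
the ntohl() above). A hypothetical userspace snippet appending it by hand to
an nfnetlink config message -- buf and msg_len are placeholders, and a real
client would use libnetfilter_queue or libmnl attribute helpers instead:

	#include <linux/netlink.h>
	#include <arpa/inet.h>
	#include <string.h>

	struct nlattr *nla = (struct nlattr *)(buf + NLMSG_ALIGN(msg_len));
	uint32_t siz = htonl(1024);		/* desired bucket count */

	nla->nla_type = NFQA_CFG_QUEUE_HTBLSIZ;
	nla->nla_len  = NLA_HDRLEN + sizeof(siz);
	memcpy((char *)nla + NLA_HDRLEN, &siz, sizeof(siz));
	msg_len = NLMSG_ALIGN(msg_len) + NLA_ALIGN(nla->nla_len);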
> +
> err_out_unlock:
> rcu_read_unlock();
> return ret;
> --