From: Changli Gao <xiaosuo@gmail.com>
To: paulmck@linux.vnet.ibm.com
Cc: Patrick McHardy <kaber@trash.net>,
netfilter-devel@vger.kernel.org,
Eric Dumazet <eric.dumazet@gmail.com>
Subject: Re: [PATCH] nfnetlink_queue: use hash table to speed up entry lookup
Date: Sat, 1 May 2010 08:14:43 +0800
Message-ID: <v2w412e6f7f1004301714wfc3a4e4aoacebb8e609de2d5d@mail.gmail.com>
In-Reply-To: <20100421202357.GK2563@linux.vnet.ibm.com>

On Thu, Apr 22, 2010 at 4:23 AM, Paul E. McKenney
<paulmck@linux.vnet.ibm.com> wrote:
> On Tue, Apr 20, 2010 at 10:31:36PM +0800, Changli Gao wrote:
>> use hash table to speed up entry lookup
>>
>> A hash table is used to speed up entry lookup when the verdicts aren't received
>> in order. The size of hash table can be specified by NFQA_CFG_QUEUE_HTBLSIZ.
>> Its default value is 1. Reciprocal division is used to lower the cost of
>> division, and the entry IDs are generated carefully to get fair entry
>> distribution in the buckets of the hash table.
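
(For context: reciprocal division replaces a runtime divide a / k with a
multiply and a shift, (a * R) >> 32, where R is precomputed from k once.
The toy user-space program below is only an illustration of how an entry
ID is mapped to a bucket -- the helper bodies mirror lib/reciprocal_div.c,
while htblsiz = 8 and the loop bound are made-up example values, not the
patch's defaults.)

#include <stdint.h>
#include <stdio.h>

/* same math as lib/reciprocal_div.c: R = ceil(2^32 / k) */
static uint32_t reciprocal_value(uint32_t k)
{
        uint64_t val = (1ULL << 32) + (k - 1);

        return (uint32_t)(val / k);
}

/* a / k computed as (a * R) >> 32 -- a multiply instead of a divide */
static uint32_t reciprocal_divide(uint32_t a, uint32_t R)
{
        return (uint32_t)(((uint64_t)a * R) >> 32);
}

int main(void)
{
        uint32_t htblsiz = 8;           /* example NFQA_CFG_QUEUE_HTBLSIZ */
        uint32_t id_increment = INT32_MAX / htblsiz;
        uint32_t R = reciprocal_value(id_increment);
        uint32_t id;

        /* IDs handed out in steps of id_increment rotate through
         * buckets 0, 1, 2, ... */
        for (id = 0; id < 4 * id_increment; id += id_increment)
                printf("id %u -> bucket %u\n", (unsigned)id,
                       (unsigned)reciprocal_divide(id, R));
        return 0;
}

Since the IDs are generated in steps of id_increment, consecutive entries
land in consecutive buckets, which is what "fair entry distribution" in
the changelog refers to.
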
>
> A few questions interspersed below.
>
> Thanx, Paul
>
>> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
>> ----
>> include/linux/netfilter/nfnetlink_queue.h | 1
>> init/Kconfig | 1
>> lib/Kconfig | 3
>> lib/Makefile | 4
>> lib/reciprocal_div.c | 2
>> net/netfilter/Kconfig | 1
>> net/netfilter/nfnetlink_queue.c | 252 +++++++++++++++++++++++++-----
>> 7 files changed, 227 insertions(+), 37 deletions(-)
>> diff --git a/include/linux/netfilter/nfnetlink_queue.h b/include/linux/netfilter/nfnetlink_queue.h
>> index 2455fe5..77b1566 100644
>> --- a/include/linux/netfilter/nfnetlink_queue.h
>> +++ b/include/linux/netfilter/nfnetlink_queue.h
>> @@ -83,6 +83,7 @@ enum nfqnl_attr_config {
>> NFQA_CFG_CMD, /* nfqnl_msg_config_cmd */
>> NFQA_CFG_PARAMS, /* nfqnl_msg_config_params */
>> NFQA_CFG_QUEUE_MAXLEN, /* __u32 */
>> + NFQA_CFG_QUEUE_HTBLSIZ, /* __u32 */
>> __NFQA_CFG_MAX
>> };
>> #define NFQA_CFG_MAX (__NFQA_CFG_MAX-1)
>> diff --git a/init/Kconfig b/init/Kconfig
>> index cb6069e..4b4266f 100644
>> --- a/init/Kconfig
>> +++ b/init/Kconfig
>> @@ -1059,6 +1059,7 @@ choice
>>
>> config SLAB
>> bool "SLAB"
>> + select RECIPROCAL_DIV
>> help
>> The regular slab allocator that is established and known to work
>> well in all environments. It organizes cache hot objects in
>> diff --git a/lib/Kconfig b/lib/Kconfig
>> index af12831..0c4b5ec 100644
>> --- a/lib/Kconfig
>> +++ b/lib/Kconfig
>> @@ -231,4 +231,7 @@ config IOQ
>>
>> If unsure, say N
>>
>> +config RECIPROCAL_DIV
>> + bool
>> +
>> endmenu
>> diff --git a/lib/Makefile b/lib/Makefile
>> index 0a6ab6f..c3555bd 100644
>> --- a/lib/Makefile
>> +++ b/lib/Makefile
>> @@ -10,7 +10,7 @@ endif
>> lib-y := ctype.o string.o vsprintf.o cmdline.o \
>> rbtree.o radix-tree.o dump_stack.o \
>> idr.o int_sqrt.o extable.o prio_tree.o \
>> - sha1.o irq_regs.o reciprocal_div.o argv_split.o \
>> + sha1.o irq_regs.o argv_split.o \
>> proportions.o prio_heap.o ratelimit.o show_mem.o \
>> is_single_threaded.o plist.o decompress.o flex_array.o
>>
>> @@ -103,6 +103,8 @@ obj-$(CONFIG_GENERIC_CSUM) += checksum.o
>>
>> obj-$(CONFIG_GENERIC_ATOMIC64) += atomic64.o
>>
>> +obj-$(CONFIG_RECIPROCAL_DIV) += reciprocal_div.o
>> +
>> hostprogs-y := gen_crc32table
>> clean-files := crc32table.h
>>
>> diff --git a/lib/reciprocal_div.c b/lib/reciprocal_div.c
>> index 6a3bd48..39f2e5e 100644
>> --- a/lib/reciprocal_div.c
>> +++ b/lib/reciprocal_div.c
>> @@ -1,5 +1,6 @@
>> #include <asm/div64.h>
>> #include <linux/reciprocal_div.h>
>> +#include <linux/module.h>
>>
>> u32 reciprocal_value(u32 k)
>> {
>> @@ -7,3 +8,4 @@ u32 reciprocal_value(u32 k)
>> do_div(val, k);
>> return (u32)val;
>> }
>> +EXPORT_SYMBOL(reciprocal_value);
>> diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
>> index 18d77b5..40b34d5 100644
>> --- a/net/netfilter/Kconfig
>> +++ b/net/netfilter/Kconfig
>> @@ -8,6 +8,7 @@ config NETFILTER_NETLINK_QUEUE
>> tristate "Netfilter NFQUEUE over NFNETLINK interface"
>> depends on NETFILTER_ADVANCED
>> select NETFILTER_NETLINK
>> + select RECIPROCAL_DIV
>> help
>> If this option is enabled, the kernel will include support
>> for queueing packets via NFNETLINK.
>> diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
>> index e70a6ef..d3d02b7 100644
>> --- a/net/netfilter/nfnetlink_queue.c
>> +++ b/net/netfilter/nfnetlink_queue.c
>> @@ -28,6 +28,8 @@
>> #include <linux/netfilter/nfnetlink.h>
>> #include <linux/netfilter/nfnetlink_queue.h>
>> #include <linux/list.h>
>> +#include <linux/vmalloc.h>
>> +#include <linux/reciprocal_div.h>
>> #include <net/sock.h>
>> #include <net/netfilter/nf_queue.h>
>>
>> @@ -37,11 +39,13 @@
>> #include "../bridge/br_private.h"
>> #endif
>>
>> -#define NFQNL_QMAX_DEFAULT 1024
>> +#define NFQNL_QMAX_DEFAULT 1024
>> +#define NFQNL_QHTBLSIZ_DEFAULT 1
>>
>> struct nfqnl_instance {
>> struct hlist_node hlist; /* global list of queues */
>> struct rcu_head rcu;
>> + struct work_struct work;
>>
>> int peer_pid;
>> unsigned int queue_maxlen;
>> @@ -49,15 +53,21 @@ struct nfqnl_instance {
>> unsigned int queue_total;
>> unsigned int queue_dropped;
>> unsigned int queue_user_dropped;
>> + unsigned int queue_htblsiz;
>> + u32 reciprocal_value;
>>
>> unsigned int id_sequence; /* 'sequence' of pkt ids */
>> + unsigned int id_increment;
>> + unsigned int id_offset;
>> + unsigned int id_limit;
>>
>> u_int16_t queue_num; /* number of this queue */
>> u_int8_t copy_mode;
>>
>> spinlock_t lock;
>>
>> - struct list_head queue_list; /* packets in queue */
>> + struct list_head *queue_htbl; /* packets in queue */
>> + bool vmalloc;
>> };
>>
>> typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long);
>> @@ -87,49 +97,87 @@ instance_lookup(u_int16_t queue_num)
>> return NULL;
>> }
>>
>> +static void instance_destroy_work(struct work_struct *work)
>> +{
>> + struct nfqnl_instance *inst;
>> +
>> + inst = container_of(work, struct nfqnl_instance, work);
>> + if (inst->vmalloc)
>> + vfree(inst->queue_htbl);
>> + else
>> + kfree(inst->queue_htbl);
>> + kfree(inst);
>> + module_put(THIS_MODULE);
>> +}
>> +
>> static struct nfqnl_instance *
>> instance_create(u_int16_t queue_num, int pid)
>> {
>> struct nfqnl_instance *inst;
>> - unsigned int h;
>> + unsigned int h, i;
>> int err;
>>
>> - spin_lock(&instances_lock);
>> - if (instance_lookup(queue_num)) {
>> - err = -EEXIST;
>> - goto out_unlock;
>> - }
>> -
>> + rcu_read_unlock();
>
> This seems strange -- are all the callers aware that instance_create()
> temporarily exits the RCU read-side critical section?
There is only one caller, so it is aware of that. But it seems that I
should add the suffix "_rcu_read_locked" to instance_create().
>
>> inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
>> if (!inst) {
>> err = -ENOMEM;
>> - goto out_unlock;
>> + goto out_lock;
>> }
>>
>> + INIT_WORK(&inst->work, instance_destroy_work);
>> inst->queue_num = queue_num;
>> inst->peer_pid = pid;
>> inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
>> inst->copy_range = 0xfffff;
>> inst->copy_mode = NFQNL_COPY_NONE;
>> spin_lock_init(&inst->lock);
>> - INIT_LIST_HEAD(&inst->queue_list);
>> + inst->queue_htblsiz = NFQNL_QHTBLSIZ_DEFAULT;
>> + inst->id_increment = INT_MAX / inst->queue_htblsiz;
>> + inst->id_limit = inst->id_increment * inst->queue_htblsiz;
>> + inst->reciprocal_value = reciprocal_value(inst->id_increment);
>> + inst->queue_htbl = kmalloc(sizeof(struct list_head) *
>> + inst->queue_htblsiz, GFP_KERNEL);
>> + if (inst->queue_htbl == NULL) {
>> + inst->queue_htbl = vmalloc(sizeof(struct list_head) *
>> + inst->queue_htblsiz);
>> + if (inst->queue_htbl == NULL) {
>> + err = -ENOMEM;
>> + goto out_free;
>> + }
>> + inst->vmalloc = true;
>> + }
>> + for (i = 0; i < inst->queue_htblsiz; i++)
>> + INIT_LIST_HEAD(&inst->queue_htbl[i]);
>>
>> if (!try_module_get(THIS_MODULE)) {
>> err = -EAGAIN;
>> goto out_free;
>> }
>> + rcu_read_lock();
>>
>> + spin_lock(&instances_lock);
>> + if (instance_lookup(queue_num)) {
>> + err = -EEXIST;
>> + spin_unlock(&instances_lock);
>> + rcu_read_unlock();
>> + goto out_free;
>> + }
>> h = instance_hashfn(queue_num);
>> hlist_add_head_rcu(&inst->hlist, &instance_table[h]);
>> -
>> spin_unlock(&instances_lock);
>>
>> return inst;
>>
>> out_free:
>> + if (inst->queue_htbl) {
>> + if (inst->vmalloc)
>> + vfree(inst->queue_htbl);
>> + else
>> + kfree(inst->queue_htbl);
>> + }
>> kfree(inst);
>> -out_unlock:
>> - spin_unlock(&instances_lock);
>> +out_lock:
>> + rcu_read_lock();
>> return ERR_PTR(err);
>> }
>>
>> @@ -143,8 +191,7 @@ instance_destroy_rcu(struct rcu_head *head)
>> rcu);
>>
>> nfqnl_flush(inst, NULL, 0);
>> - kfree(inst);
>> - module_put(THIS_MODULE);
>> + schedule_work(&inst->work);
>> }
>>
>> static void
>> @@ -162,32 +209,67 @@ instance_destroy(struct nfqnl_instance *inst)
>> spin_unlock(&instances_lock);
>> }
>>
>> +static inline struct list_head *nfqnl_head_get(struct nfqnl_instance *queue,
>> + unsigned int id)
>> +{
>> + return &queue->queue_htbl[reciprocal_divide(id,
>> + queue->reciprocal_value)];
>> +}
>> +
>> +static struct nf_queue_entry *__find_entry(struct nfqnl_instance *queue,
>> + u32 id)
>> +{
>> + struct nf_queue_entry *entry;
>> + struct list_head *head;
>> +
>> + head = nfqnl_head_get(queue, id);
>> + list_for_each_entry(entry, head, list) {
>> + if (entry->id == id)
>> + return entry;
>> + }
>> +
>> + return NULL;
>> +}
>> +
>> +static u32 __get_uniq_id(struct nfqnl_instance *queue)
>> +{
>> + u32 i;
>> +
>> + for (i = 0; i < INT_MAX; i++) {
>> + queue->id_sequence += queue->id_increment;
>> + if (queue->id_sequence >= queue->id_limit) {
>> + if (++queue->id_offset >= queue->id_increment)
>> + queue->id_offset = 0;
>> + queue->id_sequence = queue->id_offset;
>> + }
>> + if (__find_entry(queue, queue->id_sequence) == NULL)
>> + return queue->id_sequence;
>> + }
>> +
>> + return INT_MAX;
>> +}
>> +
>> static inline void
>> __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
>> {
>> - list_add_tail(&entry->list, &queue->queue_list);
>> - queue->queue_total++;
>> + struct list_head *head;
>> +
>> + head = nfqnl_head_get(queue, entry->id);
>> + list_add_tail(&entry->list, head);
>> + queue->queue_total++;
>> }
>>
>> static struct nf_queue_entry *
>> find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
>> {
>> - struct nf_queue_entry *entry = NULL, *i;
>> + struct nf_queue_entry *entry;
>>
>> spin_lock_bh(&queue->lock);
>> -
>> - list_for_each_entry(i, &queue->queue_list, list) {
>> - if (i->id == id) {
>> - entry = i;
>> - break;
>> - }
>> - }
>> -
>> + entry = __find_entry(queue, id);
>> if (entry) {
>> list_del(&entry->list);
>> queue->queue_total--;
>> }
>> -
>> spin_unlock_bh(&queue->lock);
>>
>> return entry;
>> @@ -197,13 +279,22 @@ static void
>> nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
>> {
>> struct nf_queue_entry *entry, *next;
>> + unsigned int i, total;
>> + struct list_head *head;
>>
>> spin_lock_bh(&queue->lock);
>> - list_for_each_entry_safe(entry, next, &queue->queue_list, list) {
>> - if (!cmpfn || cmpfn(entry, data)) {
>> - list_del(&entry->list);
>> - queue->queue_total--;
>> - nf_reinject(entry, NF_DROP);
>> + total = queue->queue_total;
>> + for (i = 0; i < queue->queue_htblsiz; i++) {
>> + if (total < 1)
>> + break;
>> + head = &queue->queue_htbl[i];
>> + list_for_each_entry_safe(entry, next, head, list) {
>> + if (!cmpfn || cmpfn(entry, data)) {
>> + list_del(&entry->list);
>> + queue->queue_total--;
>> + nf_reinject(entry, NF_DROP);
>> + }
>> + --total;
>> }
>> }
>> spin_unlock_bh(&queue->lock);
>> @@ -262,7 +353,12 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
>> break;
>> }
>>
>> - entry->id = queue->id_sequence++;
>> + entry->id = __get_uniq_id(queue);
>> + if (entry->id == INT_MAX) {
>> + spin_unlock_bh(&queue->lock);
>> + return NULL;
>> + }
>> + __enqueue_entry(queue, entry);
>>
>> spin_unlock_bh(&queue->lock);
>>
>> @@ -379,6 +475,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
>>
>> nlmsg_failure:
>> nla_put_failure:
>> + find_dequeue_entry(queue, entry->id);
>> if (skb)
>> kfree_skb(skb);
>> if (net_ratelimit())
>> @@ -426,14 +523,14 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
>> goto err_out_unlock;
>> }
>>
>> - __enqueue_entry(queue, entry);
>> -
>> spin_unlock_bh(&queue->lock);
>> return 0;
>>
>> err_out_free_nskb:
>> kfree_skb(nskb);
>> err_out_unlock:
>> + list_del(&entry->list);
>> + queue->queue_total--;
>> spin_unlock_bh(&queue->lock);
>> err_out:
>> return -1;
>> @@ -686,6 +783,77 @@ static const struct nf_queue_handler nfqh = {
>> .outfn = &nfqnl_enqueue_packet,
>> };
>>
>> +static int nfqnl_htbl_resize(u16 queue_num, int pid, unsigned int size)
>> +{
>> + struct nfqnl_instance *queue;
>> + unsigned int i, total;
>> + struct list_head *h, *htbl;
>> + bool is_vmalloc;
>> + int err;
>> +
>> + if (size < 1 || size > INT_MAX)
>> + return -EINVAL;
>> +
>> + rcu_read_unlock();
>
> Again, this seems strange. As near as I can tell, the caller immediately
> exits the RCU read-side critical section, so why not have the caller do
> the exit immediately before calling nfqnl_htbl_resize()?
>
It sounds reasonable. Thanks.
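
Something like the following, I suppose -- nfqnl_do_htbl_resize() is just
a made-up wrapper name to show the shape of the change; in practice the
rcu_read_unlock()/rcu_read_lock() pair would probably sit directly at the
call site in nfqnl_recv_config(), and nfqnl_htbl_resize() itself would no
longer touch the RCU read lock at all:

static int nfqnl_do_htbl_resize(u_int16_t queue_num, int pid,
                                unsigned int size)
{
        int ret;

        rcu_read_unlock();      /* leave the read-side critical section
                                 * before resizing, as suggested */
        ret = nfqnl_htbl_resize(queue_num, pid, size);
        rcu_read_lock();

        return ret;
}
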
--
Regards,
Changli Gao(xiaosuo@gmail.com)