Netchannles: first stage has been completed. Further ideas.

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* Netchannles: first stage has been completed. Further ideas.
@ 2006-07-18  8:16 Evgeniy Polyakov
  2006-07-18  8:34 ` David Miller
                   ` (3 more replies)
  0 siblings, 4 replies; 60+ messages in thread
From: Evgeniy Polyakov @ 2006-07-18  8:16 UTC (permalink / raw)
  To: netdev; +Cc: David Miller

Hello.

Current tests with the latest netchannel patch show that netchannels 
outperforms sockets in any type of bulk transfer (big-sized, small-sized, 
sending, receiving) over 1gb wire. I omit graphs and numbers here, 
since I posted it already several times. I also plan to proceed
some negotiations which would allow to test netchannel support in 10gbit
environment, but it can also happen after second development stage
completed.

All protocol processing in netchannels happens in process' context at 
syscall time and is completely lockless (there is one irq lock when skb 
is queued/dequeued into netchannels queue in hard/soft irq, one mutex for 
netchannel's bucket and some locks on qdisk/NIC driver layer, but all of 
them are not directly related to netchannles and protocol processing).
I also completed listen state support (not as Unix accept() call,
netchannel must be created between two peers in listen or connect state).

ATCP stack is completely IP protocol agnostic as netchannels itself, but
there are some places which dereference IP header to obtain full size of
the data. It can be easily eliminated if there is strong desire for
that.

Further netchannel development is moved into full sending and receiving
zero-copy support implementation, which is being designed to work
without any hardware assist and VM hacks (alhtough this means that it is 
impossible to directly store data into for example VFS cache due to the
fact that network headers are placed in the same page as data). 
This stage can also be used for various high performance sniffer devices
and probably other subsystems.

I would ask to push netchannel support into -mm tree, but I expect in 
advance that having two separate TCP stacks (one of which can contain 
some bugs (I mean atcp.c)) is not that good idea, so I understand 
possible negative feedback on that issue, but it is much better than
silence.

All kernel patches, userspace utilities and more detailed description
can be found on project's homepage at:
http://tservice.net.ru/~s0mbre/old/?section=projects&item=netchannel

Thank you.

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index f48bef1..7a4a758 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -315,3 +315,5 @@ ENTRY(sys_call_table)
 	.long sys_splice
 	.long sys_sync_file_range
 	.long sys_tee			/* 315 */
+	.long sys_vmsplice
+	.long sys_netchannel_control
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 5a92fed..fdfb997 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -696,4 +696,5 @@ #endif
 	.quad sys_sync_file_range
 	.quad sys_tee
 	.quad compat_sys_vmsplice
+	.quad sys_netchannel_control
 ia32_syscall_end:		
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index eb4b152..777cd85 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -322,8 +322,9 @@ #define __NR_splice		313
 #define __NR_sync_file_range	314
 #define __NR_tee		315
 #define __NR_vmsplice		316
+#define __NR_netchannel_control	317
 
-#define NR_syscalls 317
+#define NR_syscalls 318
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index feb77cb..4459bad 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -617,8 +617,10 @@ #define __NR_sync_file_range	277
 __SYSCALL(__NR_sync_file_range, sys_sync_file_range)
 #define __NR_vmsplice		278
 __SYSCALL(__NR_vmsplice, sys_vmsplice)
+#define __NR_netchannel_control	279
+__SYSCALL(__NR_netchannel_control, sys_netchannel_control)
 
-#define __NR_syscall_max __NR_vmsplice
+#define __NR_syscall_max __NR_netchannel_control
 
 #ifndef __NO_STUBS
 
diff --git a/include/linux/netchannel.h b/include/linux/netchannel.h
new file mode 100644
index 0000000..f32332c
--- /dev/null
+++ b/include/linux/netchannel.h
@@ -0,0 +1,140 @@
+/*
+ * 	netchannel.h
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __NETCHANNEL_H
+#define __NETCHANNEL_H
+
+#include <linux/types.h>
+
+enum netchannel_commands {
+	NETCHANNEL_CREATE = 0,
+	NETCHANNEL_REMOVE,
+	NETCHANNEL_BIND,
+	NETCHANNEL_RECV,
+	NETCHANNEL_SEND,
+	NETCHANNEL_DUMP,
+};
+
+enum netchannel_type {
+	NETCHANNEL_COPY_USER = 0,
+	NETCHANNEL_MMAP,
+	NETCHANEL_VM_HACK,
+};
+
+struct unetchannel
+{
+	__u32			faddr, laddr;		/* foreign/local hashes */
+	__u16			fport, lport;		/* foreign/local ports */
+	__u8			proto;			/* IP protocol number */
+	__u8			copy:3,			/* Netchannel type: copy_to_user, mmap or something */
+				state:5;		/* Some initial state */
+	__u8			memory_limit_order;	/* Memor limit order */
+	__u8			init_stat_work;		/* Start statistic dumping */
+};
+
+struct unetchannel_control
+{
+	struct unetchannel	unc;
+	__u32			cmd;
+	__u32			len;
+	__u32			flags;
+	__u32			timeout;
+	unsigned int		fd;
+};
+
+#ifdef __KERNEL__
+
+struct netchannel_stat
+{
+	u64			enter;
+	u64			ready;
+	u64			recv;
+	u64			empty;
+	u64			null;
+	u64			backlog;
+	u64			backlog_err;
+	u64			eat;
+};
+
+struct netchannel;
+
+struct common_protocol
+{
+	unsigned int		size;
+
+	int 			(*create)(struct netchannel *);
+	int 			(*destroy)(struct netchannel *);
+
+	int 			(*process_in)(struct netchannel *, void *, unsigned int);
+	int 			(*process_out)(struct netchannel *, void *, unsigned int);
+};
+
+struct netchannel
+{
+	struct hlist_node	node;
+	atomic_t		refcnt;
+	struct rcu_head		rcu_head;
+	struct unetchannel	unc;
+	unsigned long		hit;
+
+	struct page *		(*nc_alloc_page)(unsigned int size);
+	void			(*nc_free_page)(struct page *page);
+	int			(*nc_recv_data)(struct netchannel *, unsigned int *timeout, unsigned int *len, void *arg);
+	int			(*nc_send_data)(struct netchannel *, unsigned int *timeout, unsigned int *len, void *arg);
+
+	struct sk_buff_head 	recv_queue;
+	wait_queue_head_t	wait;
+
+	unsigned int		qlen;
+
+	void			*priv;
+
+	struct work_struct	work;
+
+	struct netchannel_stat	stat;
+
+	struct common_protocol	*proto;
+	struct dst_entry	*dst;
+};
+
+struct netchannel_cache_head
+{
+	struct hlist_head	head;
+	struct mutex		mutex;
+};
+
+#define NETCHANNEL_MAX_ORDER	31
+#define NETCHANNEL_MIN_ORDER	PAGE_SHIFT
+
+struct netchannel_mmap
+{
+	struct page		**page;
+	unsigned int		pnum;
+	unsigned int		poff;
+};
+
+extern struct common_protocol atcp_common_protocol;
+
+extern struct sk_buff *netchannel_get_skb(struct netchannel *nc, unsigned int *timeout, int *error);
+struct dst_entry *netchannel_route_get_raw(struct netchannel *nc);
+
+#endif /* __KERNEL__ */
+#endif /* __NETCHANNEL_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a461b51..9924911 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -684,6 +684,15 @@ extern void		dev_queue_xmit_nit(struct s
 
 extern void		dev_init(void);
 
+#ifdef CONFIG_NETCHANNEL
+extern int netchannel_recv(struct sk_buff *skb);
+#else
+static int netchannel_recv(struct sk_buff *skb) 
+{ 
+	return -1;
+}
+#endif
+
 extern int		netdev_nit;
 extern int		netdev_budget;
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f8f2347..ba82aa2 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -314,6 +314,18 @@ static inline struct sk_buff *alloc_skb(
 	return __alloc_skb(size, priority, 0);
 }
 
+#ifdef CONFIG_NETCHANNEL
+struct unetchannel;
+extern struct sk_buff *netchannel_alloc(struct unetchannel *unc, unsigned int header_size, 
+		unsigned int total_size, gfp_t gfp_mask);
+#else
+static struct sk_buff *netchannel_alloc(void *unc, unsigned int header_size, 
+		unsigned int total_size, gfp_t gfp_mask)
+{
+	return NULL;
+}
+#endif
+
 static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
 					       gfp_t priority)
 {
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 3996960..8c22875 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -582,4 +582,6 @@ asmlinkage long sys_tee(int fdin, int fd
 asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
 					unsigned int flags);
 
+asmlinkage long sys_netchannel_control(void __user *arg);
+
 #endif
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 5433195..1747fc3 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -132,3 +132,5 @@ cond_syscall(sys_mincore);
 cond_syscall(sys_madvise);
 cond_syscall(sys_mremap);
 cond_syscall(sys_remap_file_pages);
+
+cond_syscall(sys_netchannel_control);
diff --git a/net/Kconfig b/net/Kconfig
index 4193cdc..465e37b 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -66,6 +66,14 @@ source "net/ipv6/Kconfig"
 
 endif # if INET
 
+config NETCHANNEL
+	bool "Network channels"
+	---help---
+	  Network channels are peer-to-peer abstraction, which allows to create
+	  high performance communications. 
+	  Main advantages are unified address cache, protocol processing moved
+	  to userspace, receiving zero-copy support and other interesting features.
+
 menuconfig NETFILTER
 	bool "Network packet filtering (replaces ipchains)"
 	---help---
diff --git a/net/core/Makefile b/net/core/Makefile
index 79fe12c..7119812 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -16,3 +16,4 @@ obj-$(CONFIG_NET_DIVERT) += dv.o
 obj-$(CONFIG_NET_PKTGEN) += pktgen.o
 obj-$(CONFIG_WIRELESS_EXT) += wireless.o
 obj-$(CONFIG_NETPOLL) += netpoll.o
+obj-$(CONFIG_NETCHANNEL) += netchannel.o
diff --git a/net/core/dev.c b/net/core/dev.c
index 9ab3cfa..2721111 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1712,6 +1712,10 @@ #endif
 		}
 	}
 
+	ret = netchannel_recv(skb);
+	if (!ret)
+		goto out;
+
 #ifdef CONFIG_NET_CLS_ACT
 	if (pt_prev) {
 		ret = deliver_skb(skb, pt_prev, orig_dev);
diff --git a/net/core/netchannel.c b/net/core/netchannel.c
new file mode 100644
index 0000000..e1db3bb
--- /dev/null
+++ b/net/core/netchannel.c
@@ -0,0 +1,1224 @@
+/*
+ * 	netchannel.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/types.h>
+#include <linux/unistd.h>
+#include <linux/linkage.h>
+#include <linux/notifier.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/highmem.h>
+#include <linux/workqueue.h>
+#include <linux/netchannel.h>
+
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <net/tcp.h>
+#include <linux/udp.h>
+
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <net/addrconf.h>
+
+#include <asm/uaccess.h>
+
+static unsigned int netchannel_hash_order = 8;
+static struct netchannel_cache_head ***netchannel_hash_table;
+static kmem_cache_t *netchannel_cache;
+
+static struct super_block *netchannel_get_sb(struct file_system_type *fs_type, 
+		int flags, const char *dev_name, void *data)
+{
+	/* So original magic... */
+	return get_sb_pseudo(fs_type, "netchannel", NULL, 0xabcdef);	
+}
+
+static struct file_system_type netchannel_fs = {
+	.name		= "netchannel",
+	.get_sb		= netchannel_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+
+static struct vfsmount *netchannel_mnt;
+static struct file_operations netchannel_fops = {
+	.owner		= THIS_MODULE,
+};
+
+static int netchannel_inetaddr_notifier_call(struct notifier_block *, unsigned long, void *);
+static struct notifier_block netchannel_inetaddr_notifier = {
+	.notifier_call = &netchannel_inetaddr_notifier_call
+};
+
+#ifdef CONFIG_IPV6
+static int netchannel_inet6addr_notifier_call(struct notifier_block *, unsigned long, void *);
+static struct notifier_block netchannel_inet6addr_notifier = {
+	.notifier_call = &netchannel_inet6addr_notifier_call
+};
+#endif
+
+static inline unsigned int netchannel_hash(struct unetchannel *unc)
+{
+	unsigned int h = (unc->faddr ^ unc->fport) ^ (unc->laddr ^ unc->lport);
+	h ^= h >> 16;
+	h ^= h >> 8;
+	h ^= unc->proto;
+	return h & ((1 << 2*netchannel_hash_order) - 1);
+}
+
+static inline void netchannel_convert_hash(unsigned int hash, unsigned int *col, unsigned int *row)
+{
+	*row = hash & ((1 << netchannel_hash_order) - 1);
+	*col = (hash >> netchannel_hash_order) & ((1 << netchannel_hash_order) - 1);
+}
+
+static struct netchannel_cache_head *netchannel_bucket(struct unetchannel *unc)
+{
+	unsigned int hash = netchannel_hash(unc);
+	unsigned int col, row;
+
+	netchannel_convert_hash(hash, &col, &row);
+	return netchannel_hash_table[col][row];
+}
+
+static inline int netchannel_hash_equal_full(struct unetchannel *unc1, struct unetchannel *unc2)
+{
+	return (unc1->fport == unc2->fport) && (unc1->faddr == unc2->faddr) &&
+				(unc1->lport == unc2->lport) && (unc1->laddr == unc2->laddr) && 
+				(unc1->proto == unc2->proto);
+}
+
+static inline int netchannel_hash_equal_dest(struct unetchannel *unc1, struct unetchannel *unc2)
+{
+	return ((unc1->fport == unc2->fport) && (unc1->faddr == unc2->faddr) && (unc1->proto == unc2->proto));
+}
+
+static struct netchannel *netchannel_check_dest(struct unetchannel *unc, struct netchannel_cache_head *bucket)
+{
+	struct netchannel *nc;
+	struct hlist_node *node;
+	int found = 0;
+	
+	hlist_for_each_entry_rcu(nc, node, &bucket->head, node) {
+		if (netchannel_hash_equal_dest(&nc->unc, unc)) {
+			found = 1;
+			break;
+		}
+	}
+
+	return (found)?nc:NULL;
+}
+
+static struct netchannel *netchannel_check_full(struct unetchannel *unc, struct netchannel_cache_head *bucket)
+{
+	struct netchannel *nc;
+	struct hlist_node *node;
+	int found = 0;
+
+	hlist_for_each_entry_rcu(nc, node, &bucket->head, node) {
+		if (netchannel_hash_equal_full(&nc->unc, unc)) {
+			found = 1;
+			break;
+		}
+	}
+
+	return (found)?nc:NULL;
+}
+
+static void netchannel_mmap_cleanup(struct netchannel *nc)
+{
+	unsigned int i;
+	struct netchannel_mmap *m = nc->priv;
+
+	for (i=0; i<m->pnum; ++i)
+		__free_page(m->page[i]);
+
+	kfree(m);
+}
+
+static void netchannel_cleanup(struct netchannel *nc)
+{
+	kfree(nc->proto);
+	switch (nc->unc.copy) {
+		case NETCHANNEL_COPY_USER:
+			break;
+		case NETCHANNEL_MMAP:
+			netchannel_mmap_cleanup(nc);
+			break;
+		default:
+			break;
+	}
+}
+
+static void netchannel_free_rcu(struct rcu_head *rcu)
+{
+	struct netchannel *nc = container_of(rcu, struct netchannel, rcu_head);
+
+	netchannel_cleanup(nc);
+	kmem_cache_free(netchannel_cache, nc);
+}
+
+static inline void netchannel_get(struct netchannel *nc)
+{
+	atomic_inc(&nc->refcnt);
+}
+
+static inline void netchannel_put(struct netchannel *nc)
+{
+	if (atomic_dec_and_test(&nc->refcnt))
+		call_rcu(&nc->rcu_head, &netchannel_free_rcu);
+}
+
+static inline void netchannel_dump_info_unc(struct unetchannel *unc, char *prefix, unsigned long hit, int err)
+{
+	printk(KERN_NOTICE "netchannel: %s %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u, "
+			"proto: %u, copy: %u, state: %u, order: %u, hit: %lu, err: %d.\n",
+			prefix, NIPQUAD(unc->laddr), ntohs(unc->lport), NIPQUAD(unc->faddr), ntohs(unc->fport), 
+			unc->proto, unc->copy, unc->state, unc->memory_limit_order, hit, err);
+}
+
+static int netchannel_convert_skb_ipv6(struct sk_buff *skb, struct unetchannel *unc)
+{
+	/*
+	 * Hash IP addresses into src/dst. Setup TCP/UDP ports.
+	 * Not supported yet.
+	 */
+	return -1;
+}
+
+static int netchannel_convert_skb_ipv4(struct sk_buff *skb, struct unetchannel *unc)
+{
+	struct iphdr *iph;
+	u32 len;
+
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+		goto inhdr_error;
+
+	iph = skb->nh.iph;
+
+	if (iph->ihl < 5 || iph->version != 4)
+		goto inhdr_error;
+
+	if (!pskb_may_pull(skb, iph->ihl*4))
+		goto inhdr_error;
+
+	iph = skb->nh.iph;
+
+	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+		goto inhdr_error;
+
+	len = ntohs(iph->tot_len);
+	if (skb->len < len || len < (iph->ihl*4))
+		goto inhdr_error;
+
+	if (pskb_trim_rcsum(skb, len))
+		goto inhdr_error;
+
+	unc->faddr = iph->saddr;
+	unc->laddr = iph->daddr;
+	unc->proto = iph->protocol;
+
+	len = skb->len;
+
+	skb->h.raw = skb->nh.raw + iph->ihl*4;
+
+	switch (unc->proto) {
+		case IPPROTO_TCP:
+		case IPPROTO_UDP:
+			unc->fport = ((u16 *)skb->h.raw)[0];
+			unc->lport = ((u16 *)skb->h.raw)[1];
+			break;
+		default:
+			goto inhdr_error;
+	}
+
+	return 0;
+
+inhdr_error:
+	return -1;
+}
+
+static int netchannel_convert_skb(struct sk_buff *skb, struct unetchannel *unc)
+{
+	if (skb->pkt_type == PACKET_OTHERHOST)
+		return -1;
+
+	switch (ntohs(skb->protocol)) {
+		case ETH_P_IP:
+			return netchannel_convert_skb_ipv4(skb, unc);
+		case ETH_P_IPV6:
+			return netchannel_convert_skb_ipv6(skb, unc);
+		default:
+			return -1;
+	}
+}
+
+/*
+ * By design netchannels allow to "allocate" data
+ * not only from SLAB cache, but get it from mapped area
+ * or from VFS cache (requires process' context or preallocation).
+ */
+struct sk_buff *netchannel_alloc(struct unetchannel *unc, unsigned int header_size, 
+		unsigned int total_size, gfp_t gfp_mask)
+{
+	struct netchannel *nc;
+	struct netchannel_cache_head *bucket;
+	int err;
+	struct sk_buff *skb = NULL;
+	unsigned int size, pnum, i;
+
+	skb = alloc_skb(header_size, gfp_mask);
+	if (!skb)
+		return NULL;
+
+	rcu_read_lock();
+	bucket = netchannel_bucket(unc);
+	nc = netchannel_check_full(unc, bucket);
+	if (!nc) {
+		err = -ENODEV;
+		goto err_out_free_skb;
+	}
+
+	if (!nc->nc_alloc_page || !nc->nc_free_page) {
+		err = -EINVAL;
+		goto err_out_free_skb;
+	}
+
+	netchannel_get(nc);
+
+	size = total_size - header_size;
+	pnum = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+	for (i=0; i<pnum; ++i) {
+		unsigned int cs = min_t(unsigned int, PAGE_SIZE, size);
+		struct page *page;
+
+		page = nc->nc_alloc_page(cs);
+		if (!page)
+			break;
+		
+		skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags, page, 0, cs);
+		
+		skb->len	+= cs;
+		skb->data_len	+= cs;
+		skb->truesize	+= cs;
+
+		size -= cs;
+	}
+
+	if (i < pnum) {
+		pnum = i;
+		err = -ENOMEM;
+		goto err_out_free_frags;
+	}
+
+	rcu_read_unlock();
+
+	return skb;
+
+err_out_free_frags:
+	for (i=0; i<pnum; ++i) {
+		unsigned int cs = skb_shinfo(skb)->frags[i].size;
+		struct page *page = skb_shinfo(skb)->frags[i].page;
+		
+		nc->nc_free_page(page);
+
+		skb->len	-= cs;
+		skb->data_len	-= cs;
+		skb->truesize	-= cs;
+	}
+
+err_out_free_skb:
+	kfree_skb(skb);
+	return NULL;
+}
+
+int netchannel_recv(struct sk_buff *skb)
+{
+	struct netchannel *nc;
+	struct unetchannel unc;
+	struct netchannel_cache_head *bucket;
+	int err;
+
+	if (!netchannel_hash_table)
+		return -ENODEV;
+
+	rcu_read_lock();
+
+	err = netchannel_convert_skb(skb, &unc);
+	if (err)
+		goto unlock;
+
+	bucket = netchannel_bucket(&unc);
+	nc = netchannel_check_full(&unc, bucket);
+	if (!nc) {
+		err = -ENODEV;
+		goto unlock;
+	}
+
+	nc->hit++;
+#if 0
+	if (nc->qlen + skb->len > (1 << nc->unc.memory_limit_order)) {
+		kfree_skb(skb);
+		err = 0;
+		goto unlock;
+	}
+#endif
+	nc->qlen += skb->len;
+	skb_queue_tail(&nc->recv_queue, skb);
+	//printk("\n%s: skb: %p, size: %u.\n", __func__, skb, skb->len);
+	wake_up(&nc->wait);
+
+unlock:
+	rcu_read_unlock();
+	
+	return err;
+}
+
+static int netchannel_wait_for_packet(struct netchannel *nc, long *timeo_p)
+{
+	int error = 0;
+	DEFINE_WAIT(wait);
+
+	prepare_to_wait_exclusive(&nc->wait, &wait, TASK_INTERRUPTIBLE);
+
+	if (skb_queue_empty(&nc->recv_queue)) {
+		if (signal_pending(current))
+			goto interrupted;
+
+		*timeo_p = schedule_timeout(*timeo_p);
+	}
+out:
+	finish_wait(&nc->wait, &wait);
+	return error;
+interrupted:
+	error = (*timeo_p == MAX_SCHEDULE_TIMEOUT) ? -ERESTARTSYS : -EINTR;
+	goto out;
+}
+
+struct sk_buff *netchannel_get_skb(struct netchannel *nc, unsigned int *timeout, int *error)
+{
+	struct sk_buff *skb = NULL;
+	long tm = *timeout;
+
+	*error = 0;
+
+	while (1) {
+		skb = skb_dequeue(&nc->recv_queue);
+		if (skb) {
+			nc->qlen -= skb->len;
+			break;
+		}
+
+		if (*timeout) {
+			*error = netchannel_wait_for_packet(nc, &tm);
+			if (*error) {
+				*timeout = tm;
+				break;
+			}
+			tm = *timeout;
+		} else {
+			*error = -EAGAIN;
+			break;
+		}
+	}
+	
+	if (!skb)
+		skb = skb_dequeue(&nc->recv_queue);
+
+	return skb;
+}
+
+static int netchannel_copy_to_user_tcp(struct netchannel *nc, unsigned int *timeout, unsigned int *len, void *buf)
+{
+	int ret = nc->proto->process_in(nc, buf, *len);
+	if (ret < 0)
+		return ret;
+	*len = ret;
+	return 0;
+}
+
+static int netchannel_copy_from_user_tcp(struct netchannel *nc, unsigned int *timeout, unsigned int *len, void *buf)
+{
+	int ret = nc->proto->process_out(nc, buf, *len);
+	if (ret < 0)
+		return ret;
+	*len = ret;
+	return 0;
+}
+
+static int netchannel_copy_to_user(struct netchannel *nc, unsigned int *timeout, unsigned int *len, void *arg)
+{
+	unsigned int copied;
+	struct sk_buff *skb;
+	struct iovec to;
+	int err;
+
+	skb = netchannel_get_skb(nc, timeout, &err);
+	if (!skb)
+		return err;
+
+	to.iov_base = arg;
+	to.iov_len = *len;
+
+	copied = skb->len;
+	if (copied > *len)
+		copied = *len;
+
+	if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
+		err = skb_copy_datagram_iovec(skb, 0, &to, copied);
+	} else {
+		err = skb_copy_and_csum_datagram_iovec(skb,0, &to);
+	}
+
+	*len = (err == 0)?copied:0;
+
+	kfree_skb(skb);
+
+	return err;
+}
+
+int netchannel_skb_copy_datagram(const struct sk_buff *skb, int offset,
+			    void *to, int len)
+{
+	int start = skb_headlen(skb);
+	int i, copy = start - offset;
+
+	/* Copy header. */
+	if (copy > 0) {
+		if (copy > len)
+			copy = len;
+		memcpy(to, skb->data + offset, copy);
+
+		if ((len -= copy) == 0)
+			return 0;
+		offset += copy;
+		to += copy;
+	}
+
+	/* Copy paged appendix. Hmm... why does this look so complicated? */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		int end;
+
+		BUG_TRAP(start <= offset + len);
+
+		end = start + skb_shinfo(skb)->frags[i].size;
+		if ((copy = end - offset) > 0) {
+			u8  *vaddr;
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			struct page *page = frag->page;
+
+			if (copy > len)
+				copy = len;
+			vaddr = kmap(page);
+			memcpy(to, vaddr + frag->page_offset +
+					     offset - start, copy);
+			kunmap(page);
+			if (!(len -= copy))
+				return 0;
+			offset += copy;
+			to += copy;
+		}
+		start = end;
+	}
+
+	if (skb_shinfo(skb)->frag_list) {
+		struct sk_buff *list = skb_shinfo(skb)->frag_list;
+
+		for (; list; list = list->next) {
+			int end;
+
+			BUG_TRAP(start <= offset + len);
+
+			end = start + list->len;
+			if ((copy = end - offset) > 0) {
+				if (copy > len)
+					copy = len;
+				if (netchannel_skb_copy_datagram(list,
+							    offset - start,
+							    to, copy))
+					goto fault;
+				if ((len -= copy) == 0)
+					return 0;
+				offset += copy;
+				to += copy;
+			}
+			start = end;
+		}
+	}
+	if (!len)
+		return 0;
+
+fault:
+	return -EFAULT;
+}
+
+static int netchannel_copy_to_mem(struct netchannel *nc, unsigned int *timeout, unsigned int *len, void *arg)
+{
+	struct netchannel_mmap *m = nc->priv;
+	unsigned int copied, skb_offset = 0;
+	struct sk_buff *skb;
+	int err;
+
+	skb = netchannel_get_skb(nc, timeout, &err);
+	if (!skb)
+		return err;
+
+	copied = skb->len;
+
+	while (copied) {
+		int pnum = ((m->poff % PAGE_SIZE) % m->pnum);
+		struct page *page = m->page[pnum];
+		void *page_map, *ptr;
+		unsigned int sz, left;
+
+		left = PAGE_SIZE - (m->poff % (PAGE_SIZE - 1));
+		sz = min_t(unsigned int, left, copied);
+
+		if (!sz) {
+			err = -ENOSPC;
+			goto err_out;
+		}
+
+		page_map = kmap_atomic(page, KM_USER0);
+		if (!page_map) {
+			err = -ENOMEM;
+			goto err_out;
+		}
+		ptr = page_map + (m->poff % (PAGE_SIZE - 1));
+
+		err = netchannel_skb_copy_datagram(skb, skb_offset, ptr, sz);
+		if (err) {
+			kunmap_atomic(page_map, KM_USER0);
+			goto err_out;
+		}
+		kunmap_atomic(page_map, KM_USER0);
+
+		copied -= sz;
+		m->poff += sz;
+		skb_offset += sz;
+#if 1
+		if (m->poff >= PAGE_SIZE * m->pnum) {
+			//netchannel_dump_info_unc(&nc->unc, "rewind", nc->hit, 0);
+			m->poff = 0;
+		}
+#endif
+	}
+	*len = skb->len;
+
+	err = 0;
+
+err_out:
+	kfree_skb(skb);
+
+	return err;
+}
+
+static int netchannel_mmap_setup(struct netchannel *nc)
+{
+	struct netchannel_mmap *m;
+	unsigned int i, pnum;
+
+	pnum = nc->unc.memory_limit_order - NETCHANNEL_MIN_ORDER;
+
+	m = kzalloc(sizeof(struct netchannel_mmap) + sizeof(struct page *) * pnum, GFP_KERNEL);
+	if (!m)
+		return -ENOMEM;
+
+	m->page = (struct page **)(m + 1);
+	m->pnum = pnum;
+
+	for (i=0; i<pnum; ++i) {
+		m->page[i] = alloc_page(GFP_KERNEL);
+		if (!m->page[i])
+			break;
+	}
+
+	if (i < pnum) {
+		pnum = i;
+		goto err_out_free;
+	}
+
+	nc->priv = m;
+
+	switch (nc->unc.proto) {
+		case IPPROTO_TCP:
+			nc->proto = kzalloc(atcp_common_protocol.size, GFP_KERNEL);
+			if (!nc->proto)
+				goto err_out_free;
+			memcpy(nc->proto, &atcp_common_protocol, sizeof(struct common_protocol));
+			nc->nc_recv_data = &netchannel_copy_to_user_tcp;
+			nc->nc_send_data = &netchannel_copy_from_user_tcp;
+			break;
+		case IPPROTO_UDP:
+		default:
+			nc->nc_recv_data = &netchannel_copy_to_mem;
+			break;
+	}
+
+	return 0;
+
+err_out_free:
+	for (i=0; i<pnum; ++i)
+		__free_page(m->page[i]);
+
+	kfree(m);
+
+	return -ENOMEM;
+	
+}
+
+static int netchannel_copy_user_setup(struct netchannel *nc)
+{
+	int ret = 0;
+	
+	switch (nc->unc.proto) {
+		case IPPROTO_UDP:
+			nc->nc_recv_data = &netchannel_copy_to_user;
+			break;
+		case IPPROTO_TCP:
+			nc->proto = kzalloc(atcp_common_protocol.size, GFP_KERNEL);
+			if (!nc->proto) {
+				ret = -ENOMEM;
+				break;
+			}
+			memcpy(nc->proto, &atcp_common_protocol, sizeof(struct common_protocol));
+			nc->nc_recv_data = &netchannel_copy_to_user_tcp;
+			nc->nc_send_data = &netchannel_copy_from_user_tcp;
+			break;
+		default:
+			ret = -EINVAL;
+			break;
+	}
+
+	return ret;
+}
+
+static int netchannel_setup(struct netchannel *nc)
+{
+	int ret = 0;
+
+	if (nc->unc.memory_limit_order > NETCHANNEL_MAX_ORDER)
+		nc->unc.memory_limit_order = NETCHANNEL_MAX_ORDER;
+
+	if (nc->unc.memory_limit_order < NETCHANNEL_MIN_ORDER)
+		nc->unc.memory_limit_order = NETCHANNEL_MIN_ORDER;
+	
+	switch (nc->unc.copy) {
+		case NETCHANNEL_COPY_USER:
+			ret = netchannel_copy_user_setup(nc);
+			break;
+		case NETCHANNEL_MMAP:
+			ret = netchannel_mmap_setup(nc);
+			break;
+		default:
+			ret = -EINVAL;
+			break;
+	}
+
+	return ret;
+}
+
+static int netchannel_bind_fd(struct netchannel *nc)
+{
+	struct file *file;
+	int fd, ret;
+
+	fd = get_unused_fd();
+	if (fd < 0)
+		return fd;
+
+	file = get_empty_filp();
+	if (!file) {
+		ret = -ENFILE;
+		goto out_put_fd;
+	}
+	
+	netchannel_get(nc);
+
+	file->f_op = &netchannel_fops;
+	file->f_vfsmnt = mntget(netchannel_mnt);
+	file->f_dentry = dget(netchannel_mnt->mnt_root);
+	file->f_mapping = file->f_dentry->d_inode->i_mapping;
+	file->f_mode = FMODE_READ;
+	file->f_flags = O_RDONLY;
+	file->private_data = nc;
+	
+	fd_install(fd, file);
+
+	return fd;
+
+out_put_fd:
+	put_unused_fd(fd);
+	return ret;
+}
+
+static int netchannel_bind(struct unetchannel_control *ctl)
+{
+	struct netchannel *nc;
+	int err;
+	struct netchannel_cache_head *bucket;
+
+	bucket = netchannel_bucket(&ctl->unc);
+	
+	mutex_lock(&bucket->mutex);
+	
+	nc = netchannel_check_full(&ctl->unc, bucket);
+	if (!nc) {
+		err = -ENODEV;
+		goto err_out_unlock;
+	}
+
+	ctl->fd = netchannel_bind_fd(nc);
+	if (ctl->fd < 0) {
+		err = ctl->fd;
+		goto err_out_unlock;
+	}
+
+	mutex_unlock(&bucket->mutex);
+
+	return 0;
+
+err_out_unlock:
+	mutex_unlock(&bucket->mutex);
+	return err;
+}
+
+static void netchannel_dump_stat(struct netchannel *nc)
+{
+	printk(KERN_NOTICE "netchannel: enter: %llu, ready: %llu, recv: %llu, empty: %llu, null: %llu, backlog: %llu, backlog_err: %llu, eat: %llu.\n",
+			nc->stat.enter, nc->stat.ready, nc->stat.recv, nc->stat.empty, nc->stat.null, nc->stat.backlog,
+			nc->stat.backlog_err, nc->stat.eat);
+}
+
+static void netchannel_work(void *data)
+{
+	struct netchannel *nc = data;
+	
+	netchannel_dump_info_unc(&nc->unc, "work", nc->hit, 0);
+	netchannel_dump_stat(nc);
+	schedule_delayed_work(&nc->work, msecs_to_jiffies(1000*nc->unc.init_stat_work));
+}
+
+static int netchannel_create(struct unetchannel *unc)
+{
+	struct netchannel *nc;
+	int err = -ENOMEM;
+	struct netchannel_cache_head *bucket;
+	
+	nc = kmem_cache_alloc(netchannel_cache, GFP_KERNEL);
+	if (!nc)
+		return -ENOMEM;
+
+	memset(nc, 0, sizeof(struct netchannel));
+	
+	nc->hit = 0;
+	skb_queue_head_init(&nc->recv_queue);
+	init_waitqueue_head(&nc->wait);
+	atomic_set(&nc->refcnt, 1);
+	memcpy(&nc->unc, unc, sizeof(struct unetchannel));
+
+	err = netchannel_setup(nc);
+	if (err)
+		goto err_out_free;
+
+	nc->dst = netchannel_route_get_raw(nc);
+	if (!nc->dst) {
+		err = -ENODEV;
+		goto err_out_cleanup;
+	}
+
+	bucket = netchannel_bucket(unc);
+
+	mutex_lock(&bucket->mutex);
+
+	if (netchannel_check_full(unc, bucket)) {
+		err = -EEXIST;
+		goto err_out_unlock;
+	}
+
+	hlist_add_head_rcu(&nc->node, &bucket->head);
+	err = 0;
+
+	if (nc->proto->create)
+		err = nc->proto->create(nc);
+
+	mutex_unlock(&bucket->mutex);
+
+	netchannel_dump_info_unc(unc, "create", 0, err);
+
+	INIT_WORK(&nc->work, netchannel_work, nc);
+	if (nc->unc.init_stat_work)
+		schedule_delayed_work(&nc->work, msecs_to_jiffies(1000*nc->unc.init_stat_work));
+
+	return err;
+
+err_out_unlock:
+	mutex_unlock(&bucket->mutex);
+	dst_release(nc->dst);
+err_out_cleanup:
+	netchannel_cleanup(nc);
+err_out_free:
+	kmem_cache_free(netchannel_cache, nc);
+
+	return err;
+}
+
+static int netchannel_remove(struct unetchannel *unc)
+{
+	struct netchannel *nc;
+	int err = -ENODEV;
+	struct netchannel_cache_head *bucket;
+	unsigned long hit = 0;
+	
+	if (!netchannel_hash_table)
+		return -ENODEV;
+	
+	bucket = netchannel_bucket(unc);
+
+	mutex_lock(&bucket->mutex);
+
+	nc = netchannel_check_full(unc, bucket);
+	if (!nc)
+		nc = netchannel_check_dest(unc, bucket);
+
+	if (!nc)
+		goto out_unlock;
+	
+	hlist_del_rcu(&nc->node);
+	hit = nc->hit;
+
+	if (nc->unc.init_stat_work) {
+		cancel_rearming_delayed_work(&nc->work);
+		flush_scheduled_work();
+	}
+
+	if (nc->proto->destroy)
+		nc->proto->destroy(nc);
+	
+	dst_release(nc->dst);
+	
+	netchannel_put(nc);
+	err = 0;
+
+out_unlock:
+	mutex_unlock(&bucket->mutex);
+	netchannel_dump_info_unc(unc, "remove", hit, err);
+	return err;
+}
+
+static int netchannel_send_data(struct unetchannel_control *ctl, void __user *data)
+{
+	int ret = -ENODEV;
+	struct netchannel_cache_head *bucket;
+	struct netchannel *nc;
+
+	if (ctl->fd) {
+		struct file *file;
+		int fput_needed;
+
+		file = fget_light(ctl->fd, &fput_needed);
+		if (!file)
+			return ret;
+
+		nc = file->private_data;
+
+		fput_light(file, fput_needed);
+
+		if (!nc)
+			return -EINVAL;
+	} else {
+		bucket = netchannel_bucket(&ctl->unc);
+
+		mutex_lock(&bucket->mutex);
+
+		nc = netchannel_check_full(&ctl->unc, bucket);
+		if (!nc)
+			nc = netchannel_check_dest(&ctl->unc, bucket);
+
+		if (!nc)
+			goto err_out_unlock;
+
+		netchannel_get(nc);
+		mutex_unlock(&bucket->mutex);
+	}
+
+	ret = nc->nc_send_data(nc, &ctl->timeout, &ctl->len, data);
+	
+	if (!ctl->fd)
+		netchannel_put(nc);
+	return ret;
+
+err_out_unlock:
+	mutex_unlock(&bucket->mutex);
+	return ret;
+}
+
+static int netchannel_recv_data(struct unetchannel_control *ctl, void __user *data)
+{
+	int ret = -ENODEV;
+	struct netchannel_cache_head *bucket;
+	struct netchannel *nc;
+	
+	bucket = netchannel_bucket(&ctl->unc);
+
+	mutex_lock(&bucket->mutex);
+
+	nc = netchannel_check_full(&ctl->unc, bucket);
+	if (!nc)
+		nc = netchannel_check_dest(&ctl->unc, bucket);
+
+	if (!nc)
+		goto err_out_unlock;
+
+	netchannel_get(nc);
+	mutex_unlock(&bucket->mutex);
+
+	ret = nc->nc_recv_data(nc, &ctl->timeout, &ctl->len, data);
+	
+	netchannel_put(nc);
+	return ret;
+
+err_out_unlock:
+	mutex_unlock(&bucket->mutex);
+	return ret;
+}
+
+static int netchannel_dump_info(struct unetchannel *unc)
+{
+	struct netchannel_cache_head *bucket;
+	struct netchannel *nc;
+	char *ncs = "none";
+	unsigned long hit = 0;
+	int err;
+	
+	bucket = netchannel_bucket(unc);
+
+	mutex_lock(&bucket->mutex);
+	nc = netchannel_check_full(unc, bucket);
+	if (!nc) {
+		nc = netchannel_check_dest(unc, bucket);
+		if (nc)
+			ncs = "dest";
+	} else 
+		ncs = "full";
+	if (nc)
+		hit = nc->hit;
+	mutex_unlock(&bucket->mutex);
+	err = (nc)?0:-ENODEV;
+
+	netchannel_dump_info_unc(unc, ncs, hit, err);
+
+	return err;
+}
+
+asmlinkage long sys_netchannel_control(void __user *arg)
+{
+	struct unetchannel_control ctl;
+	int ret;
+
+	if (!netchannel_hash_table)
+		return -ENODEV;
+
+	if (copy_from_user(&ctl, arg, sizeof(struct unetchannel_control)))
+		return -ERESTARTSYS;
+
+	switch (ctl.cmd) {
+		case NETCHANNEL_CREATE:
+			ret = netchannel_create(&ctl.unc);
+			break;
+		case NETCHANNEL_BIND:
+			ret = netchannel_bind(&ctl);
+			break;
+		case NETCHANNEL_REMOVE:
+			ret = netchannel_remove(&ctl.unc);
+			break;
+		case NETCHANNEL_RECV:
+			ret = netchannel_recv_data(&ctl, arg + sizeof(struct unetchannel_control));
+			break;
+		case NETCHANNEL_SEND:
+			ret = netchannel_send_data(&ctl, arg + sizeof(struct unetchannel_control));
+			break;
+		case NETCHANNEL_DUMP:
+			ret = netchannel_dump_info(&ctl.unc);
+			break;
+		default:
+			ret = -EINVAL;
+			break;
+	}
+	
+	if (copy_to_user(arg, &ctl, sizeof(struct unetchannel_control)))
+		return -ERESTARTSYS;
+
+	return ret;
+}
+
+static inline void netchannel_dump_addr(struct in_ifaddr *ifa, char *str)
+{
+	printk(KERN_NOTICE "netchannel: %s %u.%u.%u.%u/%u.%u.%u.%u\n", str, NIPQUAD(ifa->ifa_local), NIPQUAD(ifa->ifa_mask));
+}
+
+static int netchannel_inetaddr_notifier_call(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct in_ifaddr *ifa = ptr;
+
+	switch (event) {
+		case NETDEV_UP:
+			netchannel_dump_addr(ifa, "add");
+			break;
+		case NETDEV_DOWN:
+			netchannel_dump_addr(ifa, "del");
+			break;
+		default:
+			netchannel_dump_addr(ifa, "unk");
+			break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+#ifdef CONFIG_IPV6
+static int netchannel_inet6addr_notifier_call(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct inet6_ifaddr *ifa = ptr;
+
+	printk(KERN_NOTICE "netchannel: inet6 event=%lx, ifa=%p.\n", event, ifa);
+	return NOTIFY_DONE;
+}
+#endif
+
+static int __init netchannel_init(void)
+{
+	unsigned int i, j, size;
+	int err;
+	
+	err = register_filesystem(&netchannel_fs);
+	if (err) {
+		printk(KERN_ERR "Failed to register netchannel fs, err: %d.\n", err);
+		return err;
+	}
+
+	netchannel_mnt = kern_mount(&netchannel_fs);
+	if (IS_ERR(netchannel_mnt)) {
+		printk(KERN_ERR "Failed to mount netchannel fs, err: %ld.\n", PTR_ERR(netchannel_mnt));
+		err = PTR_ERR(netchannel_mnt);
+		goto err_out_unregister;
+	}
+
+	size = (1 << netchannel_hash_order);
+
+	netchannel_hash_table = kzalloc(size * sizeof(void *), GFP_KERNEL);
+	if (!netchannel_hash_table)
+		goto err_out_umount;
+
+	for (i=0; i<size; ++i) {
+		struct netchannel_cache_head **col;
+
+		col = kzalloc(size * sizeof(void *), GFP_KERNEL);
+		if (!col)
+			break;
+		
+		for (j=0; j<size; ++j) {
+			struct netchannel_cache_head *head;
+
+			head = kzalloc(sizeof(struct netchannel_cache_head), GFP_KERNEL);
+			if (!head)
+				break;
+
+			INIT_HLIST_HEAD(&head->head);
+			mutex_init(&head->mutex);
+
+			col[j] = head;
+		}
+		
+		if (j<size && j>0) {
+			while (j >= 0)
+				kfree(col[j--]);
+			kfree(col);
+			break;
+		}
+
+		netchannel_hash_table[i] = col;
+	}
+
+	if (i<size) {
+		size = i;
+		goto err_out_free;
+	}
+
+	netchannel_cache = kmem_cache_create("netchannel", sizeof(struct netchannel), 0, 0,
+			NULL, NULL);
+	if (!netchannel_cache)
+		goto err_out_free;
+
+	register_inetaddr_notifier(&netchannel_inetaddr_notifier);
+#ifdef CONFIG_IPV6
+	register_inet6addr_notifier(&netchannel_inet6addr_notifier);
+#endif
+
+	printk(KERN_NOTICE "netchannel: Created %u order two-dimensional hash table.\n", 
+			netchannel_hash_order);
+
+	return 0;
+
+err_out_free:
+	for (i=0; i<size; ++i) {
+		for (j=0; j<(1 << netchannel_hash_order); ++j)
+			kfree(netchannel_hash_table[i][j]);
+		kfree(netchannel_hash_table[i]);
+	}
+	kfree(netchannel_hash_table);
+err_out_umount:
+	mntput(netchannel_mnt);
+err_out_unregister:
+	unregister_filesystem(&netchannel_fs);
+	printk(KERN_NOTICE "netchannel: Failed to create %u order two-dimensional hash table.\n", 
+			netchannel_hash_order);
+	return err;
+}
+
+static void __exit netchannel_exit(void)
+{
+	unsigned int i, j;
+
+	unregister_inetaddr_notifier(&netchannel_inetaddr_notifier);
+#ifdef CONFIG_IPV6
+	unregister_inet6addr_notifier(&netchannel_inet6addr_notifier);
+#endif
+	kmem_cache_destroy(netchannel_cache);
+
+	for (i=0; i<(1 << netchannel_hash_order); ++i) {
+		for (j=0; j<(1 << netchannel_hash_order); ++j)
+			kfree(netchannel_hash_table[i][j]);
+		kfree(netchannel_hash_table[i]);
+	}
+	kfree(netchannel_hash_table);
+	
+	mntput(netchannel_mnt);
+	unregister_filesystem(&netchannel_fs);
+}
+
+late_initcall(netchannel_init);
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index e40f753..6ea6379 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -428,6 +428,11 @@ config INET_TCP_DIAG
 	depends on INET_DIAG
 	def_tristate INET_DIAG
 
+config ATCP
+	bool "TCP: altenative TCP stack used for netchannels"
+	---help---
+	  Extremely lightweight RFC compliant TCP stack used for netchannels.
+
 config TCP_CONG_ADVANCED
 	bool "TCP: advanced congestion control"
 	---help---
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 9ef50a0..25c122f 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybl
 obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
 obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
 obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
+obj-$(CONFIG_ATCP) += atcp.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 		      xfrm4_output.o
diff --git a/net/ipv4/atcp.c b/net/ipv4/atcp.c
new file mode 100644
index 0000000..219b774
--- /dev/null
+++ b/net/ipv4/atcp.c
@@ -0,0 +1,1726 @@
+/*
+ * 	atcp.c
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/random.h>
+#include <linux/netchannel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/tcp.h>
+#include <net/route.h>
+
+#include <asm/timex.h>
+
+//#define ATCP_DEBUG
+
+#ifdef ATCP_DEBUG
+#define ulog(f, a...) printk(f, ##a)
+#else
+#define ulog(f, a...)
+#endif
+
+#if 0
+enum {
+	TCP_ESTABLISHED = 1,
+	TCP_SYN_SENT,
+	TCP_SYN_RECV,
+	TCP_FIN_WAIT1,
+	TCP_FIN_WAIT2,
+	TCP_TIME_WAIT,
+	TCP_CLOSE,
+	TCP_CLOSE_WAIT,
+	TCP_LAST_ACK,
+	TCP_LISTEN,
+	TCP_CLOSING
+};
+#endif
+
+enum atcp_init_state {
+	NETCHANNEL_ATCP_CONNECT = 0,
+	NETCHANNEL_ATCP_LISTEN,
+};
+
+#define TCP_MAX_WSCALE	14
+static __u8 atcp_offer_wscale = 8;
+
+static __u32 atcp_max_qlen = 1024*1024;
+
+struct atcp_protocol
+{
+	struct common_protocol	cproto;
+
+	struct netchannel	*nc;
+
+	__u32			state;
+
+	__u32			snd_una;
+	__u32			snd_nxt;
+	__u16			snd_wnd;
+	__u32			snd_wl1;
+	__u32			snd_wl2;
+	__u32			iss;
+
+	__u32			rcv_nxt;
+	__u16			rcv_wnd;
+	__u16			rcv_wup;
+	__u32			irs;
+
+	__u8			rwscale, swscale;
+	__u16			mss;
+	__u32			tsval, tsecr;
+	__u32			ack_sent, ack_missed, sent_without_reading, ack_missed_bytes;
+
+	struct sk_buff_head	ofo_queue;
+
+	struct sk_buff		*send_head;
+	struct sk_buff_head	retransmit_queue;
+	struct skb_timeval	first_packet_ts;
+	__u32			retransmit_timeout;
+	__u32			dupack_seq, dupack_num, dupack_sync;
+
+	__u32			seq_read;
+
+	__u32			snd_cwnd, snd_cwnd_bytes, snd_ssthresh, in_flight, in_flight_bytes;
+	__u32			prev_update_ack, prev_update_ratio;
+	__u32			max_rwin;
+
+	__u32			qlen;
+
+	struct work_struct	work;
+};
+
+struct state_machine
+{
+	__u32		state;
+	int		(*run)(struct atcp_protocol *, struct sk_buff *);
+};
+
+static inline struct atcp_protocol *atcp_convert(struct common_protocol *cproto)
+{
+	return (struct atcp_protocol *)cproto;
+}
+
+static inline __u32 skb_rwin(struct atcp_protocol *tp, struct sk_buff *skb)
+{
+	__u32 rwin = ntohs(skb->h.th->window);
+	return (rwin << tp->rwscale);
+}
+
+static inline __u32 tp_rwin(struct atcp_protocol *tp)
+{
+	__u32 rwin = tp->rcv_wnd;
+	return rwin << tp->rwscale;
+}
+
+static inline __u32 tp_swin(struct atcp_protocol *tp)
+{
+	__u32 swin = tp->snd_wnd;
+	return swin << tp->swscale;
+}
+
+static inline int beforeeq(__u32 seq1, __u32 seq2)
+{
+        return (__s32)(seq1-seq2) <= 0;
+}
+
+static inline int aftereq(__u32 seq1, __u32 seq2)
+{
+	return (__s32)(seq2-seq1) <= 0;
+}
+
+static inline __u32 atcp_packet_timestamp(void)
+{
+	//return (__u32)get_cycles();
+	return (__u32)jiffies;
+}
+
+struct atcp_option
+{
+	__u8		kind, length;
+	int		(*callback)(struct atcp_protocol *tp, struct sk_buff *skb, __u8 *data);
+};
+
+struct atcp_option_timestamp
+{
+	__u8			kind, length;
+	__u32			tsval, tsecr;
+} __attribute__ ((packed));
+
+struct atcp_option_nop
+{
+	__u8			kind;
+} __attribute__ ((packed));
+
+struct atcp_option_mss
+{
+	__u8			kind, length;
+	__u16			mss;
+} __attribute__ ((packed));
+
+struct atcp_option_wscale
+{
+	__u8			kind, length;
+	__u8			wscale;
+} __attribute__ ((packed));
+
+#define TCP_OPT_NOP	1
+#define TCP_OPT_MSS	2
+#define TCP_OPT_WSCALE	3
+#define TCP_OPT_TS	8
+
+static int atcp_opt_mss(struct atcp_protocol *tp, struct sk_buff *skb __attribute__ ((unused)), __u8 *data)
+{
+	tp->mss = ntohs(((__u16 *)data)[0]);
+	ulog("%s: mss: %u.\n", __func__, tp->mss);
+	return 0;
+}
+
+static int atcp_opt_wscale(struct atcp_protocol *tp, struct sk_buff *skb __attribute__ ((unused)), __u8 *data)
+{
+	if ((skb->h.th->syn) && ((tp->state == TCP_SYN_SENT) || (tp->state == TCP_SYN_SENT))) {
+		tp->rwscale = data[0];
+		if (tp->rwscale > TCP_MAX_WSCALE)
+			tp->rwscale = TCP_MAX_WSCALE;
+		tp->swscale = atcp_offer_wscale;
+		ulog("%s: rwscale: %u, swscale: %u.\n", __func__, tp->rwscale, tp->swscale);
+	}
+	return 0;
+}
+
+static int atcp_opt_ts(struct atcp_protocol *tp, struct sk_buff *skb, __u8 *data)
+{
+	__u32 seq = TCP_SKB_CB(skb)->seq;
+	__u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+	__u32 packet_tsval = ntohl(((__u32 *)data)[0]);
+
+	if (!skb->h.th->ack)
+		return 0;
+
+	/* PAWS check */
+	if ((tp->state == TCP_ESTABLISHED) && before(packet_tsval, tp->tsecr)) {
+		ulog("%s: PAWS failed: packet: seq: %u, end_seq: %u, tsval: %u, tsecr: %u, host tsval: %u, tsecr: %u.\n",
+				__func__, seq, end_seq, packet_tsval, ntohl(((__u32 *)data)[1]), tp->tsval, tp->tsecr);
+		return 1;
+	}
+	
+	if (between(tp->ack_sent, seq, end_seq))
+		tp->tsecr = packet_tsval;
+	return 0;
+}
+
+static struct atcp_option atcp_supported_options[] = {
+	[TCP_OPT_NOP] = {.kind = TCP_OPT_NOP, .length = 1},
+	[TCP_OPT_MSS] = {.kind = TCP_OPT_MSS, .length = 4, .callback = &atcp_opt_mss},
+	[TCP_OPT_WSCALE] = {.kind = TCP_OPT_WSCALE, .length = 3, .callback = &atcp_opt_wscale},
+	[TCP_OPT_TS] = {.kind = TCP_OPT_TS, .length = 10, .callback = &atcp_opt_ts},
+};
+
+#define TCP_FLAG_SYN	0x1
+#define TCP_FLAG_ACK	0x2
+#define TCP_FLAG_RST	0x4
+#define TCP_FLAG_PSH	0x8
+#define TCP_FLAG_FIN	0x10
+
+static inline void atcp_set_state(struct atcp_protocol *tp, __u32 state)
+{
+	ulog("state change: %u -> %u.\n", tp->state, state);
+	tp->state = state;
+}
+
+static inline int atcp_skb_data_size(struct sk_buff *skb)
+{
+	return (int)(__u32)(TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq);
+}
+
+static inline int atcp_skb_has_header(struct sk_buff *skb)
+{
+	if (skb->h.th == NULL)
+		return 0;
+	return atcp_skb_data_size(skb) != skb->len;
+}
+
+static int netchannel_ip_route_output_flow(struct rtable **rp, struct flowi *flp, int flags)
+{
+	int err;
+
+	err = __ip_route_output_key(rp, flp);
+	if (err)
+		return err;
+
+	if (flp->proto) {
+		if (!flp->fl4_src)
+			flp->fl4_src = (*rp)->rt_src;
+		if (!flp->fl4_dst)
+			flp->fl4_dst = (*rp)->rt_dst;
+	}
+
+	return 0;
+}
+
+struct dst_entry *netchannel_route_get_raw(struct netchannel *nc)
+{
+	struct rtable *rt;
+	struct flowi fl = { .oif = 0,
+			    .nl_u = { .ip4_u =
+				      { .daddr = nc->unc.faddr,
+					.saddr = nc->unc.laddr,
+					.tos = 0 } },
+			    .proto = nc->unc.proto,
+			    .uli_u = { .ports =
+				       { .sport = nc->unc.lport,
+					 .dport = nc->unc.fport } } };
+
+	if (netchannel_ip_route_output_flow(&rt, &fl, 0))
+		goto no_route;
+	return dst_clone(&rt->u.dst);
+
+no_route:
+	return NULL;
+}
+
+static inline struct dst_entry *netchannel_route_get(struct netchannel *nc)
+{
+	if (nc->dst && nc->dst->obsolete && nc->dst->ops->check(nc->dst, 0) == NULL) {
+		dst_release(nc->dst);
+		nc->dst = netchannel_route_get_raw(nc);
+		if (!nc->dst)
+			return NULL;
+	}
+	return dst_clone(nc->dst);
+}
+
+void netchannel_route_put(struct dst_entry *dst)
+{
+	/* dst_entry is being freed when skb is released in NIC */
+}
+
+static int transmit_data(struct sk_buff *skb, struct atcp_protocol *tp)
+{
+#if defined(ATCP_DEBUG)
+	{
+		struct tcphdr *th = skb->h.th;
+
+		ulog("S %u.%u.%u.%u:%u <-> %u.%u.%u.%u:%u : seq: %u, ack: %u, win: %u [%u], doff: %u, "
+			"s: %u, a: %u, p: %u, r: %u, f: %u, len: %u, state: %u, skb: %p, csum: %04x.\n",
+			NIPQUAD(tp->nc->unc.laddr), ntohs(tp->nc->unc.lport),
+			NIPQUAD(tp->nc->unc.faddr), ntohs(tp->nc->unc.fport),
+			ntohl(th->seq), ntohl(th->ack_seq), ntohs(th->window), tp_rwin(tp), th->doff,
+			th->syn, th->ack, th->psh, th->rst, th->fin,
+			skb->len, tp->state, skb, th->check);
+	}
+#endif
+	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
+}
+
+static int ip_build_header(struct netchannel *nc, struct sk_buff *skb)
+{
+	struct iphdr *iph;
+
+	skb->nh.iph = iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr));
+	if (!iph)
+		return -ENOMEM;
+
+	iph->saddr = nc->unc.laddr;
+	iph->daddr = nc->unc.faddr;
+	iph->tos = 0;
+	iph->tot_len = htons(skb->len);
+	iph->ttl = 64;
+	iph->id = 0;
+	iph->frag_off = htons(0x4000);
+	iph->version = 4;
+	iph->ihl = 5;
+	iph->protocol = nc->unc.proto;
+
+	ip_send_check(iph);
+
+	return 0;
+}
+
+static int atcp_build_header(struct atcp_protocol *tp, struct sk_buff *skb, __u32 flags, __u8 doff)
+{
+	struct tcphdr *th;
+	struct atcp_option_nop *nop;
+	struct atcp_option_timestamp *ts;
+
+	nop = (struct atcp_option_nop *)skb_push(skb, sizeof(struct atcp_option_nop));
+	nop->kind = 1;
+	nop = (struct atcp_option_nop *)skb_push(skb, sizeof(struct atcp_option_nop));
+	nop->kind = 1;
+
+	ts = (struct atcp_option_timestamp *)skb_push(skb, sizeof(struct atcp_option_timestamp));
+	ts->kind = atcp_supported_options[TCP_OPT_TS].kind;
+	ts->length = atcp_supported_options[TCP_OPT_TS].length;
+	ts->tsval = htonl(atcp_packet_timestamp());
+	ts->tsecr = htonl(tp->tsecr);
+
+	skb->h.th = th = (struct tcphdr *)skb_push(skb, sizeof(struct tcphdr));
+	memset(th, 0, sizeof(struct tcphdr));
+
+#if 0
+	ulog("%s: len:%d head:%p data:%p tail:%p end:%p dev:%s\n",
+	       __func__, skb->len, skb->head, skb->data, skb->tail, skb->end,
+	       skb->dev ? skb->dev->name : "<NULL>");
+#endif
+	th->source = tp->nc->unc.lport;
+	th->dest = tp->nc->unc.fport;
+	th->seq = htonl(tp->snd_nxt);
+	th->ack_seq = htonl(tp->rcv_nxt);
+
+	if (flags & TCP_FLAG_SYN)
+		th->syn = 1;
+	if (flags & TCP_FLAG_ACK)
+		th->ack = 1;
+	if (flags & TCP_FLAG_PSH)
+		th->psh = 1;
+	if (flags & TCP_FLAG_RST)
+		th->rst = 1;
+	if (flags & TCP_FLAG_FIN)
+		th->fin = 1;
+	th->urg = 0;
+	th->urg_ptr = 0;
+	th->window = htons(tp->snd_wnd);
+
+	th->doff = 5 + 3 + doff;
+
+	if (skb->ip_summed == CHECKSUM_HW) {
+		th->check = ~tcp_v4_check(th, skb->len, tp->nc->unc.laddr, tp->nc->unc.faddr, 0);
+		skb->csum = offsetof(struct tcphdr, check);
+	} else {
+		th->check = tcp_v4_check(th, skb->len, tp->nc->unc.laddr, tp->nc->unc.faddr,
+					 csum_partial((char *)th, th->doff << 2, skb->csum));
+	}
+
+	TCP_SKB_CB(skb)->seq = tp->snd_nxt;
+	TCP_SKB_CB(skb)->end_seq = tp->snd_nxt + skb->len - (th->doff<<2);
+	TCP_SKB_CB(skb)->ack_seq = tp->rcv_nxt;
+
+	tp->snd_nxt += th->syn + th->fin + skb->len - (th->doff<<2);
+	tp->ack_sent = tp->rcv_nxt;
+
+	return ip_build_header(tp->nc, skb);
+}
+
+static int atcp_send_data(struct atcp_protocol *tp, struct sk_buff *skb, __u32 flags, __u8 doff)
+{
+	int err;
+
+	err = atcp_build_header(tp, skb, flags, doff);
+	if (err) {
+		kfree_skb(skb);
+		return err;
+	}
+	return transmit_data(skb, tp);
+}
+
+static int atcp_send_bit(struct atcp_protocol *tp, __u32 flags)
+{
+	struct sk_buff *skb;
+	int err;
+
+	skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
+	if (!skb) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	skb->dst = netchannel_route_get(tp->nc);
+	if (!skb->dst) {
+		err = -ENODEV;
+		goto err_out_free;
+	}
+
+	skb_reserve(skb, MAX_TCP_HEADER);
+
+	return atcp_send_data(tp, skb, flags, 0);
+
+err_out_free:
+	kfree_skb(skb);
+err_out_exit:
+	return err;
+}
+
+static int atcp_listen(struct atcp_protocol *tp, struct sk_buff *skb)
+{
+	int err;
+	struct tcphdr *th = skb->h.th;
+
+	if (th->rst)
+		return 0;
+	if (th->ack)
+		return -1;
+
+	if (th->syn) {
+		tp->irs = ntohl(th->seq);
+		tp->rcv_nxt = ntohl(th->seq)+1;
+		get_random_bytes(&tp->iss, sizeof(tp->iss));
+
+		err = atcp_send_bit(tp, TCP_FLAG_SYN|TCP_FLAG_ACK);
+		if (err < 0)
+			return err;
+		atcp_set_state(tp, TCP_SYN_RECV);
+	}
+
+	return 0;
+}
+
+static void atcp_cleanup_queue(struct sk_buff_head *head, __u32 *qlen)
+{
+	struct sk_buff *skb, *n = skb_peek(head);
+
+	if (!n)
+		return;
+
+	do {
+		skb = n->next;
+		__skb_unlink(n, head);
+		if (qlen)
+			*qlen -= n->len;
+		ulog("%s: skb: %p, head: %p, qlen: %u.\n", __func__, skb, head, *qlen);
+		kfree_skb(n);
+		n = skb;
+	} while (n != (struct sk_buff *)head);
+}
+
+static int atcp_in_slow_start(struct atcp_protocol *tp)
+{
+	return tp->snd_cwnd * tp->mss <= tp->snd_ssthresh;
+}
+
+static int atcp_can_send(struct atcp_protocol *tp)
+{
+	int can_send = tp->snd_cwnd > tp->in_flight;
+
+	if (can_send)
+		can_send = tp->in_flight_bytes < tp_rwin(tp);
+
+	ulog("%s: swin: %u, rwin: %u, cwnd: %u, in_flight: %u [%u], ssthresh: %u, qlen: %u, ss: %d, can_send: %d.\n", 
+			__func__, tp_swin(tp), tp_rwin(tp), tp->snd_cwnd, tp->in_flight, tp->in_flight_bytes, 
+			tp->snd_ssthresh, tp->qlen, atcp_in_slow_start(tp), can_send);
+
+	return can_send;
+}
+
+static int __atcp_try_to_transmit(struct atcp_protocol *tp, struct sk_buff *skb)
+{
+	struct sk_buff *nskb;
+	__u32 sdiff = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
+
+	nskb = skb_clone(skb, GFP_KERNEL);
+	if (!nskb)
+		return -ENOMEM;
+
+	if (sdiff) {
+		tp->in_flight++;
+		tp->in_flight_bytes += sdiff;
+	}
+
+	return transmit_data(nskb, tp);
+}
+
+static int atcp_try_to_transmit(struct atcp_protocol *tp, struct sk_buff *skb)
+{
+	int err = -EAGAIN;
+
+	if (atcp_can_send(tp))
+		err = __atcp_try_to_transmit(tp, skb);
+
+	if ((err < 0) && (tp->send_head == (struct sk_buff *)&tp->retransmit_queue)) {
+		ulog("%s: setting head to %p.\n", __func__, skb);
+		tp->send_head = skb;
+	}
+	return err;
+}
+
+static int atcp_transmit_queue(struct atcp_protocol *tp)
+{
+	struct sk_buff *skb = tp->send_head;
+	int err = 0;
+
+	while (skb && (skb != (struct sk_buff *)&tp->retransmit_queue)) {
+		ulog("%s: skb: %p, retransmit_queue: %p.\n", __func__, skb, &tp->retransmit_queue);
+		if (!atcp_can_send(tp)) {
+			err = -EAGAIN;
+			break;
+		}
+
+		err = __atcp_try_to_transmit(tp, skb);
+		if (err)
+			break;
+
+		skb = skb->next;
+		ulog("%s: setting head to %p.\n", __func__, skb);
+		tp->send_head = skb;
+	}
+
+	return err;
+}
+
+static void atcp_check_retransmit_queue(struct atcp_protocol *tp, __u32 ack)
+{
+	struct sk_buff *skb, *n = skb_peek(&tp->retransmit_queue);
+	int removed = 0;
+
+	if (!n)
+		goto out;
+
+	do {
+		__u32 seq, end_seq;
+
+		seq = TCP_SKB_CB(n)->seq;
+		end_seq = TCP_SKB_CB(n)->end_seq;
+
+		if (!seq && !end_seq && n->len)
+			break;
+
+		if (after(end_seq, ack))
+			break;
+		else {
+			struct tcphdr *th = n->h.th;
+			struct iphdr *iph = n->nh.iph;
+			u32 size = ntohs(iph->tot_len) - (iph->ihl<<2) - (th->doff << 2);
+
+			skb = n->next;
+
+			tp->in_flight--;
+			tp->in_flight_bytes -= size;
+			tp->qlen -= size;
+			__skb_unlink(n, &tp->retransmit_queue);
+			
+			if (n == tp->send_head)
+				tp->send_head = skb;
+
+			ulog("%s: ack: %u, snd_una: %u, removing: seq: %u, end_seq: %u, ts: %u.%u, in_flight: %u [%u], dec: %u.\n", 
+					__func__, ack, tp->snd_una, seq, end_seq, n->tstamp.off_sec, n->tstamp.off_usec, 
+					tp->in_flight, tp->in_flight_bytes, size);
+			tp->dupack_seq = TCP_SKB_CB(skb)->seq;
+
+			kfree_skb(n);
+			n = skb;
+			removed++;
+
+			if (n != (struct sk_buff *)&tp->retransmit_queue)
+				tp->first_packet_ts = n->tstamp;
+		}
+	} while (n != (struct sk_buff *)&tp->retransmit_queue);
+
+out:
+	ulog("%s: removed: %d, in_flight: %u [%u], cwnd: %u.\n", __func__, removed, tp->in_flight, tp->in_flight_bytes, tp->snd_cwnd);
+
+	if (removed)
+		atcp_transmit_queue(tp);
+}
+
+static inline int atcp_retransmit_time(struct atcp_protocol *tp)
+{
+	return 0;
+	return (after(atcp_packet_timestamp(), tp->first_packet_ts.off_sec + tp->retransmit_timeout));
+}
+
+static void atcp_retransmit(struct atcp_protocol *tp)
+{
+	struct sk_buff *skb = skb_peek(&tp->retransmit_queue), *nskb;
+	int retransmitted = 0;
+
+	if (tp->state == TCP_CLOSE) {
+		atcp_cleanup_queue(&tp->retransmit_queue, &tp->qlen);
+		return;
+	}
+
+	if (!skb)
+		goto out;
+
+	do {
+		if (after(atcp_packet_timestamp(), skb->tstamp.off_sec + tp->retransmit_timeout)) {
+			__u32 seq = TCP_SKB_CB(skb)->seq;
+			__u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+
+			printk("%s: skb: %p, seq: %u, end_seq: %u, ts: %u.%u, time: %u.\n", 
+				__func__, skb, seq, end_seq, skb->tstamp.off_sec, skb->tstamp.off_usec, atcp_packet_timestamp());
+
+			if (!seq && !end_seq && skb->len)
+				break;
+
+			nskb = skb_clone(skb, GFP_KERNEL);
+			if (nskb) {
+				transmit_data(nskb, tp);
+				retransmitted++;
+			}
+		} else
+			break;
+	} while ((skb = skb->next) != (struct sk_buff *)&tp->retransmit_queue);
+out:
+	return;
+	//ulog("%s: retransmitted: %d.\n", __func__, retransmitted);
+}
+
+static void skb_queue_order(struct sk_buff *skb, struct sk_buff_head *head)
+{
+	struct sk_buff *next = skb_peek(head);
+	unsigned int nseq = TCP_SKB_CB(skb)->seq;
+	unsigned int nend_seq = TCP_SKB_CB(skb)->end_seq;
+
+	ulog("ofo queue: seq: %u, end_seq: %u.\n", nseq, nend_seq);
+
+	if (!next) {
+		skb_get(skb);
+		__skb_queue_tail(head, skb);
+		goto out;
+	}
+
+	do {
+		unsigned int seq = TCP_SKB_CB(next)->seq;
+		unsigned int end_seq = TCP_SKB_CB(next)->end_seq;
+
+		if (beforeeq(seq, nseq) && aftereq(end_seq, nend_seq)) {
+			ulog("Collapse 1: seq: %u, end_seq: %u removed by seq: %u, end_seq: %u.\n",
+					nseq, nend_seq, seq, end_seq);
+			kfree_skb(skb);
+			skb = NULL;
+			break;
+		}
+
+		if (beforeeq(nseq, seq) && aftereq(nend_seq, end_seq)) {
+			struct sk_buff *prev = next->prev;
+
+			__skb_unlink(next, head);
+
+			ulog("Collapse 2: seq: %u, end_seq: %u removed by seq: %u, end_seq: %u.\n",
+					seq, end_seq, nseq, nend_seq);
+
+			kfree_skb(next);
+			if (prev == (struct sk_buff *)head)
+				break;
+			next = prev;
+			seq = TCP_SKB_CB(next)->seq;
+			end_seq = TCP_SKB_CB(next)->end_seq;
+		}
+		if (after(seq, nseq))
+			break;
+	} while ((next = next->next) != (struct sk_buff *)head);
+
+	if (skb) {
+		ulog("Inserting seq: %u, end_seq: %u.\n", nseq, nend_seq);
+		skb_get(skb);
+		skb_insert(next, skb, head);
+	}
+out:
+	ulog("ofo dump: ");
+	next = (struct sk_buff *)head;
+	while ((next = next->next) != (struct sk_buff *)head) {
+		ulog("%u - %u, ", TCP_SKB_CB(next)->seq, TCP_SKB_CB(next)->end_seq);
+	}
+	ulog("\n");
+}
+
+static void skb_queue_check(struct atcp_protocol *tp, struct sk_buff_head *head)
+{
+	struct sk_buff *next = skb_peek(head);
+
+	if (!next)
+		return;
+
+	do {
+		unsigned int seq = TCP_SKB_CB(next)->seq;
+		unsigned int end_seq = TCP_SKB_CB(next)->end_seq;
+
+		if (before(tp->rcv_nxt, seq))
+			break;
+
+		tp->rcv_nxt = max_t(unsigned int, end_seq, tp->rcv_nxt);
+	} while ((next = next->next) != (struct sk_buff *)head);
+
+	ulog("ACKed: rcv_nxt: %u.\n", tp->rcv_nxt);
+}
+
+static int atcp_syn_sent(struct atcp_protocol *tp, struct sk_buff *skb)
+{
+	struct tcphdr *th = skb->h.th;
+	__u32 seq = htonl(th->seq);
+	__u32 ack = htonl(th->ack_seq);
+#if 0
+	ulog("%s: a: %d, s: %d, ack: %u, seq: %u, iss: %u, snd_nxt: %u, snd_una: %u.\n",
+			__func__, th->ack, th->syn, ack, seq, tp->iss, tp->snd_nxt, tp->snd_una);
+#endif
+	if (th->ack) {
+		if (beforeeq(ack, tp->iss) || after(ack, tp->snd_nxt))
+			return (th->rst)?0:-1;
+		if (between(ack, tp->snd_una, tp->snd_nxt)) {
+			if (th->rst) {
+				atcp_set_state(tp, TCP_CLOSE);
+				return 0;
+			}
+		}
+	}
+
+	if (th->rst)
+		return 0;
+
+	if (th->syn) {
+		tp->rcv_nxt = seq+1;
+		tp->irs = seq;
+		if (th->ack) {
+			tp->snd_una = ack;
+			atcp_check_retransmit_queue(tp, ack);
+		}
+
+		if (after(tp->snd_una, tp->iss)) {
+			atcp_set_state(tp, TCP_ESTABLISHED);
+			tp->seq_read = seq + 1;
+			return atcp_send_bit(tp, TCP_FLAG_ACK);
+		}
+
+		atcp_set_state(tp, TCP_SYN_RECV);
+		tp->snd_nxt = tp->iss;
+		return atcp_send_bit(tp, TCP_FLAG_ACK|TCP_FLAG_SYN);
+	}
+
+	return 0;
+}
+
+static int atcp_syn_recv(struct atcp_protocol *tp, struct sk_buff *skb)
+{
+	struct tcphdr *th = skb->h.th;
+	__u32 ack = ntohl(th->ack_seq);
+
+	if (th->rst) {
+		atcp_set_state(tp, TCP_CLOSE);
+		return 0;
+	}
+
+	if (th->ack) {
+		if (between(ack, tp->snd_una, tp->snd_nxt)) {
+			tp->seq_read = ntohl(th->seq) + 1;
+			atcp_set_state(tp, TCP_ESTABLISHED);
+			return 0;
+		}
+	}
+
+	if (th->fin) {
+		atcp_set_state(tp, TCP_CLOSE_WAIT);
+		return 0;
+	}
+
+	return -1;
+}
+
+static int atcp_fast_retransmit(struct atcp_protocol *tp)
+{
+	__u32 seq, end_seq, ack;
+	struct sk_buff *nskb, *skb = skb_peek(&tp->retransmit_queue);
+
+	if (!skb)
+		return -EINVAL;
+
+	seq = TCP_SKB_CB(skb)->seq;
+	end_seq = TCP_SKB_CB(skb)->end_seq;
+	ack = TCP_SKB_CB(skb)->ack_seq;
+
+	ulog("%s: seq: %u, end_seq: %u, ack: %u, dupack_seq: %u.\n", __func__, seq, end_seq, ack, tp->dupack_seq);
+
+	if (seq != tp->dupack_seq) {
+		printk("%s: a bug: seq: %u, end_seq: %u, ack: %u, dupack_seq: %u.\n", __func__, seq, end_seq, ack, tp->dupack_seq);
+		return -EINVAL;
+	}
+
+	nskb = skb_clone(skb, GFP_KERNEL);
+	if (!nskb)
+		return -ENOMEM;
+
+	return transmit_data(nskb, tp);
+}
+
+static void atcp_congestion(struct atcp_protocol *tp)
+{
+	__u32 min_wind = min_t(unsigned int, tp->snd_cwnd*tp->mss, tp_rwin(tp));
+
+	if (tp_rwin(tp) > tp->max_rwin) {
+		tp->max_rwin = tp_rwin(tp);
+		return;
+	}
+
+	tp->dupack_num++;
+	tp->dupack_sync++;
+		
+	if (tp->snd_cwnd) {
+		tp->snd_cwnd--;
+		tp->snd_cwnd_bytes = tp->mss * tp->snd_cwnd;
+		tp->prev_update_ratio = 1;
+	}
+
+	if (tp->dupack_num >= 3) {
+		tp->snd_ssthresh = max_t(unsigned int, tp->mss * 2, min_wind/2);
+		if (tp->snd_cwnd) {
+			//tp->snd_cwnd>>=1;
+			tp->snd_cwnd_bytes = tp->mss * tp->snd_cwnd;
+			tp->prev_update_ratio = 1;
+			tp->prev_update_ack = 0;
+		}
+
+		ulog("%s: dupack_seq: %u, dupack_num: %u, cwnd: %u [%u], ssthresh: %u, in_flight: %u [%u], ss: %d, rwin: %u, swin: %u.\n", 
+			__func__, tp->dupack_seq, tp->dupack_num, tp->snd_cwnd, tp->snd_cwnd*tp->mss, tp->snd_ssthresh,
+			tp->in_flight, tp->in_flight_bytes, atcp_in_slow_start(tp),
+			tp_rwin(tp), tp_swin(tp));
+		atcp_fast_retransmit(tp);
+		tp->dupack_num = 0;
+		tp->snd_cwnd++;
+		if (tp->in_flight > tp->snd_cwnd)
+			tp->snd_cwnd = tp->in_flight;
+	}
+
+	if (tp->dupack_sync >= tp->snd_cwnd) {
+		struct sk_buff *nskb, *skb = skb_peek(&tp->retransmit_queue);
+
+		while (skb && skb != (struct sk_buff *)&tp->retransmit_queue) {
+			nskb = skb_clone(skb, GFP_KERNEL);
+			if (!nskb)
+				break;
+
+			transmit_data(nskb, tp);
+
+			skb = skb->next;
+		}
+
+		tp->snd_cwnd >>= 1;
+		tp->snd_cwnd_bytes = tp->mss * tp->snd_cwnd;
+		tp->dupack_sync = 0;
+		tp->prev_update_ratio = 1;
+		tp->prev_update_ack = 0;
+	}
+}
+
+static int atcp_established(struct atcp_protocol *tp, struct sk_buff *skb)
+{
+	struct tcphdr *th = skb->h.th;
+	int err = -EINVAL;
+	__u32 seq = TCP_SKB_CB(skb)->seq;
+	__u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+	__u32 ack = TCP_SKB_CB(skb)->ack_seq;
+	__u32 rwin = tp_rwin(tp);
+
+	if (before(seq, tp->rcv_nxt)) {
+		err = 0;
+		goto out;
+	}
+
+	if (after(end_seq, tp->rcv_nxt + rwin)) {
+		ulog("%s: 1: seq: %u, size: %u, rcv_nxt: %u, rcv_wnd: %u.\n", 
+				__func__, seq, skb->len, tp->rcv_nxt, rwin);
+		goto out;
+	}
+
+	if (th->rst)
+		goto out;
+
+	ulog("%s: seq: %u, end_seq: %u, ack: %u, snd_una: %u, snd_nxt: %u, snd_wnd: %u, rcv_nxt: %u, rcv_wnd: %u, cwnd: %u in_flight: %u [%u].\n",
+			__func__, seq, end_seq, ack, 
+			tp->snd_una, tp->snd_nxt, tp_swin(tp), 
+			tp->rcv_nxt, rwin, tp->snd_cwnd, tp->in_flight, tp->in_flight_bytes);
+
+	if (!skb->len && beforeeq(ack, tp->snd_una)) {
+		ulog("%s: duplicate ack: %u, snd_una: %u, snd_nxt: %u, snd_wnd: %u, snd_wl1: %u, snd_wl2: %u.\n",
+				__func__, ack, tp->snd_una, tp->snd_nxt, tp_swin(tp), tp->snd_wl1, tp->snd_wl2);
+		atcp_congestion(tp);
+		return 0;
+	} else if (after(ack, tp->snd_nxt)) {
+		printk("%s: out of order packet: seq: %u, ack: %u, len: %u, rwin: %u.\n", __func__, seq, ack, skb->len, rwin);
+		err = atcp_send_bit(tp, TCP_FLAG_ACK);
+		if (err < 0)
+			goto out;
+	} else if (between(ack, tp->snd_una, tp->snd_nxt)) {
+		__u32 ack_bytes = ack - tp->snd_una;
+
+		tp->dupack_num = 0;
+		tp->dupack_sync = 0;
+
+		if (atcp_in_slow_start(tp)) {
+			tp->snd_cwnd++;
+			tp->snd_cwnd_bytes += ack_bytes;
+			tp->prev_update_ack = 0;
+		} else {
+			__u32 update = ack_bytes*ack_bytes/(tp->snd_cwnd_bytes);
+
+			tp->snd_cwnd_bytes += update;
+			tp->prev_update_ack += update;
+			tp->max_rwin = max(tp->max_rwin, tp_rwin(tp));
+
+			if (tp->snd_cwnd_bytes >= tp->max_rwin*tp->prev_update_ratio) {
+				tp->snd_cwnd++;
+				tp->snd_cwnd_bytes = tp->snd_cwnd * tp->mss;
+				tp->prev_update_ratio++;
+			}
+		}
+		tp->snd_una = ack;
+		atcp_check_retransmit_queue(tp, ack);
+	}
+
+	if (beforeeq(seq, tp->rcv_nxt) && aftereq(end_seq, tp->rcv_nxt)) {
+		tp->rcv_nxt = end_seq;
+		skb_queue_check(tp, &tp->ofo_queue);
+	} else {
+		/*
+		 * Out of order packet.
+		 */
+		err = 0;
+		goto out;
+	}
+
+	if (skb->len) {
+		skb_queue_order(skb, &tp->ofo_queue);
+
+		tp->ack_missed_bytes += skb->len;
+		if (atcp_in_slow_start(tp) || tp->ack_missed_bytes >= 3*tp->mss || ++tp->ack_missed >= 3) {
+			tp->ack_missed_bytes = 0;
+			tp->ack_missed = 0;
+			err = atcp_send_bit(tp, TCP_FLAG_ACK);
+			if (err < 0)
+				goto out;
+		}
+	}
+#if 1
+	if (before(tp->snd_wl1, seq) || ((tp->snd_wl1 == seq) && beforeeq(tp->snd_wl2, ack))) {
+		tp->snd_wnd = ntohs(th->window);
+		tp->snd_wl1 = seq;
+		tp->snd_wl2 = ack;
+	}
+#endif
+	if (th->fin) {
+		atcp_set_state(tp, TCP_CLOSE_WAIT);
+		err = 0;
+	}
+
+	err = skb->len;
+out:
+	ulog("%s: return: %d.\n", __func__, err);
+	return err;
+}
+
+static int atcp_fin_wait1(struct atcp_protocol *tp, struct sk_buff *skb)
+{
+	int err;
+	struct tcphdr *th = skb->h.th;
+
+	if (th->fin) {
+		if (th->ack) {
+			/* Start time-wait timer... */
+			atcp_set_state(tp, TCP_TIME_WAIT);
+		} else
+			atcp_set_state(tp, TCP_CLOSING);
+		return 0;
+	}
+
+	err = atcp_established(tp, skb);
+	if (err < 0)
+		return err;
+	atcp_set_state(tp, TCP_FIN_WAIT2);
+	return 0;
+}
+
+static int atcp_fin_wait2(struct atcp_protocol *tp, struct sk_buff *skb)
+{
+	struct tcphdr *th = skb->h.th;
+
+	if (th->fin) {
+		/* Start time-wait timer... */
+		return 0;
+	}
+
+	return atcp_established(tp, skb);
+}
+
+static int atcp_close_wait(struct atcp_protocol *tp, struct sk_buff *skb)
+{
+	struct tcphdr *th = skb->h.th;
+
+	if (th->fin)
+		return 0;
+
+	return atcp_established(tp, skb);
+}
+
+static int atcp_closing(struct atcp_protocol *tp, struct sk_buff *skb)
+{
+	int err;
+	struct tcphdr *th = skb->h.th;
+
+	if (th->fin)
+		return 0;
+
+	err = atcp_established(tp, skb);
+	if (err < 0)
+		return err;
+	atcp_set_state(tp, TCP_TIME_WAIT);
+	return 0;
+}
+
+static int atcp_last_ack(struct atcp_protocol *tp, struct sk_buff *skb)
+{
+	struct tcphdr *th = skb->h.th;
+
+	if (th->fin)
+		return 0;
+
+	atcp_set_state(tp, TCP_CLOSE);
+	return 0;
+}
+
+static int atcp_time_wait(struct atcp_protocol *tp, struct sk_buff *skb)
+{
+	return atcp_send_bit(tp, TCP_FLAG_ACK);
+}
+
+static int atcp_close(struct atcp_protocol *tp, struct sk_buff *skb)
+{
+	struct tcphdr *th = skb->h.th;
+
+	atcp_cleanup_queue(&tp->retransmit_queue, &tp->qlen);
+	atcp_cleanup_queue(&tp->ofo_queue, NULL);
+
+	if (!th->rst)
+		return -1;
+	return 0;
+}
+
+static struct state_machine atcp_state_machine[] = {
+	{ .state = 0, .run = NULL},
+	{ .state = TCP_ESTABLISHED, .run = atcp_established, },
+	{ .state = TCP_SYN_SENT, .run = atcp_syn_sent, },
+	{ .state = TCP_SYN_RECV, .run = atcp_syn_recv, },
+	{ .state = TCP_FIN_WAIT1, .run = atcp_fin_wait1, },
+	{ .state = TCP_FIN_WAIT2, .run = atcp_fin_wait2, },
+	{ .state = TCP_TIME_WAIT, .run = atcp_time_wait, },
+	{ .state = TCP_CLOSE, .run = atcp_close, },
+	{ .state = TCP_CLOSE_WAIT, .run = atcp_close_wait, },
+	{ .state = TCP_LAST_ACK, .run = atcp_last_ack, },
+	{ .state = TCP_LISTEN, .run = atcp_listen, },
+	{ .state = TCP_CLOSING, .run = atcp_closing, },
+};
+
+static void atcp_work(void *data)
+{
+	struct atcp_protocol *tp = data;
+
+	printk("%s: cwnd: %u [%u], ssthresh: %u, ss: %d, in_flight: %u [%u], dupack [%u, %u, %u], rwin: %u, swin: %u, can_send: %u, max_rwin: %u, prev: %u %u.\n",
+			__func__, tp->snd_cwnd, tp->snd_cwnd_bytes, 
+			tp->snd_ssthresh, atcp_in_slow_start(tp), 
+			tp->in_flight, tp->in_flight_bytes, 
+			tp->dupack_num, tp->dupack_seq, tp->dupack_sync,
+			tp_rwin(tp), tp_swin(tp), atcp_can_send(tp), tp->max_rwin,
+			tp->prev_update_ack, tp->prev_update_ratio);
+	schedule_delayed_work(&tp->work, HZ);
+}
+
+static int atcp_init_listen(struct netchannel *nc)
+{
+	struct atcp_protocol *tp = atcp_convert(nc->proto);
+	atcp_set_state(tp, TCP_LISTEN);
+	return 0;
+}
+
+static int atcp_connect(struct netchannel *nc)
+{
+	struct atcp_protocol *tp = atcp_convert(nc->proto);
+	int err;
+	struct sk_buff *skb;
+	struct atcp_option_mss *mss;
+	struct atcp_option_wscale *wscale;
+	struct atcp_option_nop *nop;
+
+	skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	skb->dst = netchannel_route_get(nc);
+	if (!skb->dst) {
+		kfree_skb(skb);
+		return -ENODEV;
+	}
+
+	skb_reserve(skb, MAX_TCP_HEADER);
+
+	mss = (struct atcp_option_mss *)skb_push(skb, sizeof(struct atcp_option_mss));
+	mss->kind = TCP_OPT_MSS;
+	mss->length = atcp_supported_options[TCP_OPT_MSS].length;
+	mss->mss = htons(tp->mss);
+
+	nop = (struct atcp_option_nop *)skb_push(skb, sizeof(struct atcp_option_nop));
+	nop->kind = 1;
+	
+	wscale = (struct atcp_option_wscale *)skb_push(skb, sizeof(struct atcp_option_wscale));
+	wscale->kind = TCP_OPT_WSCALE;
+	wscale->length = atcp_supported_options[TCP_OPT_WSCALE].length;
+	wscale->wscale = atcp_offer_wscale;
+
+	err = atcp_send_data(tp, skb, TCP_FLAG_SYN, skb->len/4);
+	if (err < 0)
+		return err;
+	atcp_set_state(tp, TCP_SYN_SENT);
+	return 0;
+}
+
+static int atcp_create(struct netchannel *nc)
+{
+	struct atcp_protocol *tp = atcp_convert(nc->proto);
+
+	get_random_bytes(&tp->iss, sizeof(tp->iss));
+	tp->snd_wnd = 4096;
+	tp->snd_nxt = tp->iss;
+	tp->rcv_wnd = 0xffff;
+	tp->rwscale = 0;
+	tp->swscale = 0;
+	tp->mss = 1460;
+	tp->snd_cwnd = 1;
+	tp->snd_cwnd_bytes = tp->mss;
+	tp->snd_ssthresh = 0xffff;
+	tp->retransmit_timeout = 10;
+	tp->prev_update_ack = 0;
+	tp->prev_update_ratio = 1;
+	tp->tsval = atcp_packet_timestamp();
+	tp->tsecr = 0;
+	tp->nc = nc;
+	skb_queue_head_init(&tp->retransmit_queue);
+	skb_queue_head_init(&tp->ofo_queue);
+	tp->send_head = (struct sk_buff *)&tp->retransmit_queue;
+
+	INIT_WORK(&tp->work, atcp_work, tp);
+	schedule_delayed_work(&tp->work, HZ);
+
+	if (nc->unc.state == NETCHANNEL_ATCP_LISTEN)
+		return atcp_init_listen(nc);
+	else if (nc->unc.state == NETCHANNEL_ATCP_CONNECT)
+		return atcp_connect(nc);
+
+	return -EINVAL;
+}
+
+static int atcp_parse_options(struct atcp_protocol *tp, struct sk_buff *skb)
+{
+	struct tcphdr *th = skb->h.th;
+	int optsize = (th->doff<<2) - sizeof(struct tcphdr);
+	__u8 *opt = (__u8 *)skb->h.raw + sizeof(struct tcphdr);
+	int err = 0;
+
+	if (optsize < 0)
+		return -EINVAL;
+
+	while (optsize) {
+		__u8 kind = *opt++;
+		__u8 len; 
+
+		if (kind == 1) {
+			optsize--;
+			continue;
+		} else if (kind == 0)
+			break;
+		else
+			len = *opt++;
+
+		//ulog("%s: kind: %u, len: %u, optsize: %d.\n", __func__, kind, len, optsize);
+
+		if (kind < sizeof(atcp_supported_options)/sizeof(atcp_supported_options[0])) {
+			if (optsize < len) {
+				err = -EINVAL;
+				break;
+			}
+			if (atcp_supported_options[kind].callback) {
+				err = atcp_supported_options[kind].callback(tp, skb, opt);
+				if (err)
+					break;
+			}
+		}
+		opt += len - 2;
+		optsize -= len;
+	}
+	return err;
+}
+
+static int atcp_state_machine_run(struct atcp_protocol *tp, struct sk_buff *skb)
+{
+	int err = -EINVAL, broken = 1;
+	struct tcphdr *th = skb->h.th;
+	__u16 rwin = skb_rwin(tp, skb);
+	__u32 seq = TCP_SKB_CB(skb)->seq;
+	__u32 ack = TCP_SKB_CB(skb)->ack_seq;
+
+	ulog("R %u.%u.%u.%u:%u <-> %u.%u.%u.%u:%u : seq: %u, ack: %u, win: %u [r: %u, s: %u], doff: %u, "
+			"s: %u, a: %u, p: %u, r: %u, f: %u, len: %u, state: %u, skb: %p, snd_una: %u, snd_nxt: %u.\n",
+		NIPQUAD(tp->nc->unc.laddr), ntohs(tp->nc->unc.lport),
+		NIPQUAD(tp->nc->unc.faddr), ntohs(tp->nc->unc.fport),
+		seq, ack, ntohs(th->window), rwin, tp_swin(tp), th->doff,
+		th->syn, th->ack, th->psh, th->rst, th->fin,
+		skb->len, tp->state, skb, tp->snd_una, tp->snd_nxt);
+
+	tp->rcv_wnd = ntohs(th->window);
+
+	/* Some kind of header prediction. */
+	if ((tp->state == TCP_ESTABLISHED) && (seq == tp->rcv_nxt)) {
+		int sz;
+
+		err = atcp_established(tp, skb);
+		if (err < 0)
+			goto out;
+		sz = err;
+		err = atcp_parse_options(tp, skb);
+		if (err >= 0)
+			err = sz;
+		goto out;
+	}
+
+	err = atcp_parse_options(tp, skb);
+	if (err < 0)
+		goto out;
+	if (err > 0)
+		return atcp_send_bit(tp, TCP_FLAG_ACK);
+
+	if (tp->state == TCP_SYN_SENT || tp->state == TCP_LISTEN) {
+		err = atcp_state_machine[tp->state].run(tp, skb);
+	} else {
+		if (!skb->len && ((!rwin && seq == tp->rcv_nxt) || 
+					(rwin && (aftereq(seq, tp->rcv_nxt) && before(seq, tp->rcv_nxt + rwin)))))
+				broken = 0;
+		else if ((aftereq(seq, tp->rcv_nxt) && before(seq, tp->rcv_nxt + rwin)) &&
+					(aftereq(seq, tp->rcv_nxt) && before(seq+skb->len-1, tp->rcv_nxt + rwin)))
+				broken = 0;
+
+		if (broken && !th->rst) {
+			ulog("R broken: rwin: %u, seq: %u, rcv_nxt: %u, size: %u.\n", 
+					rwin, seq, tp->rcv_nxt, skb->len);
+			return atcp_send_bit(tp, TCP_FLAG_ACK);
+		}
+
+		if (th->rst) {
+			ulog("R broken rst: rwin: %u, seq: %u, rcv_nxt: %u, size: %u.\n", 
+					rwin, seq, tp->rcv_nxt, skb->len);
+			atcp_set_state(tp, TCP_CLOSE);
+			err = 0;
+			goto out;
+		}
+
+		if (th->syn) {
+			ulog("R broken syn: rwin: %u, seq: %u, rcv_nxt: %u, size: %u.\n", 
+					rwin, seq, tp->rcv_nxt, skb->len);
+			goto out;
+		}
+
+		if (!th->ack)
+			goto out;
+
+		err = atcp_state_machine[tp->state].run(tp, skb);
+
+		if (between(ack, tp->snd_una, tp->snd_nxt)) {
+			tp->snd_una = ack;
+			atcp_check_retransmit_queue(tp, ack);
+		}
+
+		if (th->fin && seq == tp->rcv_nxt) {
+			if (tp->state == TCP_LISTEN || tp->state == TCP_CLOSE)
+				return 0;
+			tp->rcv_nxt++;
+			atcp_send_bit(tp, TCP_FLAG_ACK);
+		}
+	}
+
+out:
+#if 0
+	ulog("E %u.%u.%u.%u:%u <-> %u.%u.%u.%u:%u : seq: %u, ack: %u, state: %u, err: %d.\n",
+		NIPQUAD(tp->nc->unc.laddr), ntohs(tp->nc->unc.lport),
+		NIPQUAD(tp->nc->unc.faddr), ntohs(tp->nc->unc.fport),
+		ntohl(th->seq), ntohl(th->ack_seq), tp->state, err);
+#endif
+	if (err < 0) {
+		__u32 flags = TCP_FLAG_RST;
+		if (th->ack) {
+			tp->snd_nxt = ntohl(th->ack_seq);
+		} else {
+			flags |= TCP_FLAG_ACK;
+			tp->snd_nxt = 0;
+			tp->rcv_nxt = ntohl(th->seq) + skb->len;
+		}
+		atcp_set_state(tp, TCP_CLOSE);
+		atcp_send_bit(tp, flags);
+		atcp_cleanup_queue(&tp->retransmit_queue, &tp->qlen);
+	}
+
+	if (atcp_retransmit_time(tp))
+		atcp_retransmit(tp);
+
+	return err;
+}
+
+static int atcp_read_data(struct atcp_protocol *tp, __u8 *buf, unsigned int size)
+{
+	struct sk_buff *skb = skb_peek(&tp->ofo_queue);
+	int read = 0;
+
+	if (!skb)
+		return -EAGAIN;
+
+	ulog("%s: size: %u, seq_read: %u.\n", __func__, size, tp->seq_read);
+
+	while (size && (skb != (struct sk_buff *)&tp->ofo_queue)) {
+		__u32 seq = TCP_SKB_CB(skb)->seq;
+		__u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+		unsigned int sz, data_size, off, len;
+		struct sk_buff *next = skb->next;
+
+		if (after(tp->seq_read, end_seq)) {
+			ulog("Impossible: skb: seq: %u, end_seq: %u, seq_read: %u.\n",
+					seq, end_seq, tp->seq_read);
+
+			__skb_unlink(skb, &tp->ofo_queue);
+			kfree_skb(skb);
+
+			skb = next;
+			continue;
+		}
+
+		if (before(tp->seq_read, seq))
+			break;
+
+		off = tp->seq_read - seq;
+		data_size = skb->len - off;
+		sz = min_t(unsigned int, size, data_size);
+
+		ulog("Copy: seq_read: %u, seq: %u, end_seq: %u, size: %u, off: %u, data_size: %u, sz: %u, read: %d.\n",
+				tp->seq_read, seq, end_seq, size, off, data_size, sz, read);
+
+		len = sz;
+		while (len) {
+			unsigned int copied = sz - len;
+			
+			len = copy_to_user(&buf[copied], skb->data + off + copied, len);
+		}
+
+		buf += sz;
+		read += sz;
+		size -= sz;
+
+		tp->seq_read += sz;
+
+		if (aftereq(tp->seq_read, end_seq)) {
+			ulog("Unlinking: skb: seq: %u, end_seq: %u, seq_read: %u.\n",
+					seq, end_seq, tp->seq_read);
+
+			__skb_unlink(skb, &tp->ofo_queue);
+			kfree_skb(skb);
+		}
+
+		skb = next;
+	}
+
+	return read;
+}
+
+static int atcp_process_in(struct netchannel *nc, void *buf, unsigned int size)
+{
+	struct atcp_protocol *tp = atcp_convert(nc->proto);
+	struct tcphdr *th;
+	struct iphdr *iph;
+	struct sk_buff *skb;
+	int err = 0;
+	unsigned int read = 0, timeout = HZ;
+
+	if (tp->state == TCP_CLOSE)
+		return -ECONNRESET;
+
+	while (size) {
+		unsigned int tm = timeout, len;
+#if 0
+		if (skb_queue_empty(&nc->recv_queue) && read)
+			break;
+#endif
+		if (!skb_queue_empty(&tp->ofo_queue)) {
+			err = atcp_read_data(tp, buf, size);
+
+			if (err > 0) {
+				size -= err;
+				buf += err;
+				read += err;
+			}
+
+			if (!size)
+				break;
+		}
+
+		skb = netchannel_get_skb(nc, &tm, &err);
+		if (!skb) 
+			break;
+
+		iph = skb->nh.iph;
+		th = skb->h.th;
+
+		skb_pull(skb, (th->doff<<2) + (iph->ihl<<2));
+		len = skb->len;
+
+		ulog("\n%s: skb: %p, data_size: %u.\n", __func__, skb, skb->len);
+
+		TCP_SKB_CB(skb)->seq = ntohl(th->seq);
+		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + skb->len;
+		TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
+
+		err = atcp_state_machine_run(tp, skb);
+		if (err <= 0) {
+			kfree_skb(skb);
+			break;
+		}
+
+		if (len) {
+			err = atcp_read_data(tp, buf, size);
+
+			if (err > 0) {
+				size -= err;
+				buf += err;
+				read += err;
+			}
+		}
+
+		kfree_skb(skb);
+	}
+
+	if (atcp_retransmit_time(tp))
+		atcp_retransmit(tp);
+
+	return read;
+}
+
+static int atcp_out_read(struct netchannel *nc, unsigned int tm)
+{
+	struct sk_buff *skb;
+	int err;
+
+	skb = netchannel_get_skb(nc, &tm, &err);
+	if (skb) {
+		struct atcp_protocol *tp = atcp_convert(nc->proto);
+		struct tcphdr *th;
+		struct iphdr *iph;
+		
+		iph = skb->nh.iph;
+		th = skb->h.th;
+
+		skb_pull(skb, (th->doff<<2) + (iph->ihl<<2));
+
+		ulog("\n%s: skb: %p, data_size: %u.\n", __func__, skb, skb->len);
+
+		TCP_SKB_CB(skb)->seq = ntohl(th->seq);
+		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + skb->len;
+		TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
+
+		atcp_state_machine_run(tp, skb);
+		kfree_skb(skb);
+		return 1;
+	}
+
+	return 0;
+}
+
+static int atcp_transmit_combined(struct netchannel *nc, void *buf, unsigned int data_size)
+{
+	struct atcp_protocol *tp = atcp_convert(nc->proto);
+	struct sk_buff *skb;
+	int err = 0;
+	unsigned int copy, total = 0;
+	
+	while (data_size) {
+		skb = skb_peek_tail(&tp->retransmit_queue);
+		if (!skb || !skb_tailroom(skb) || atcp_skb_has_header(skb)) {
+			skb = alloc_skb_fclone(tp->mss, GFP_KERNEL);
+			if (!skb) {
+				err = -ENOMEM;
+				goto out;
+			}
+			skb->csum = 0;
+
+			skb->dst = netchannel_route_get(nc);
+			if (!skb->dst) {
+				err = -ENODEV;
+				kfree_skb(skb);
+				goto out;
+			}
+			skb_reserve(skb, MAX_TCP_HEADER);
+
+			__skb_queue_tail(&tp->retransmit_queue, skb);
+
+			tp->qlen += skb_tailroom(skb);
+			ulog("%s: queued skb: %p, size: %u, tail_len: %u.\n", 
+					__func__, skb, skb->len, skb_tailroom(skb));
+		}
+
+		copy = min_t(unsigned int, skb_tailroom(skb), data_size);
+		err = skb_add_data(skb, buf, copy);
+		if (err) {
+			__skb_unlink(skb, &tp->retransmit_queue);
+			kfree_skb(skb);
+			goto out;
+		}
+		buf += copy;
+		data_size -= copy;
+		total += copy;
+		
+		ulog("%s: skb: %p, copy: %u, total: %u, data_size: %u, skb_size: %u, tail_len: %u.\n", 
+				__func__, skb, copy, total, data_size, skb->len, skb_tailroom(skb));
+
+		if (!skb_tailroom(skb)) {
+			err = atcp_build_header(tp, skb, TCP_FLAG_PSH|TCP_FLAG_ACK, 0);
+			if (err) {
+				__skb_unlink(skb, &tp->retransmit_queue);
+				kfree_skb(skb);
+				goto out;
+			}
+			err = atcp_try_to_transmit(tp, skb);
+			if (err && err != -EAGAIN)
+				goto out;
+		}
+	}
+	err = total;
+
+out:
+	return err;
+}
+
+static int atcp_transmit_data(struct netchannel *nc, void *buf, unsigned int data_size)
+{
+	struct atcp_protocol *tp = atcp_convert(nc->proto);
+	struct sk_buff *skb;
+	unsigned int size;
+	int err, sent = 0;
+
+	while (data_size) {
+		size = min_t(unsigned int, tp->mss, data_size + MAX_TCP_HEADER);
+
+		skb = alloc_skb_fclone(size, GFP_KERNEL);
+		if (!skb) {
+			sent = -ENOMEM;
+			break;
+		}
+		skb->csum = 0;
+
+		skb->dst = netchannel_route_get(nc);
+		if (!skb->dst) {
+			kfree_skb(skb);
+			sent = -ENODEV;
+			break;
+		}
+		skb_reserve(skb, MAX_TCP_HEADER);
+		size -= MAX_TCP_HEADER;
+
+		err = skb_add_data(skb, buf, size);
+		if (err) {
+			kfree_skb(skb);
+			sent = err;
+			break;
+		}
+
+		err = atcp_build_header(tp, skb, TCP_FLAG_PSH|TCP_FLAG_ACK, 0);
+		if (err) {
+			kfree_skb(skb);
+			sent = err;
+			break;
+		}
+
+		__skb_queue_tail(&tp->retransmit_queue, skb);
+		tp->qlen += size;
+		ulog("%s: queued: skb: %p, size: %u, qlen: %u, data_size: %u, send_size: %u, tail_size: %u [%u, %p, %p, %p, %p].\n", 
+				__func__, skb, skb->len, tp->qlen, data_size, size, skb_tailroom(skb),
+				atcp_skb_has_header(skb), skb->head, skb->data, skb->tail, skb->end);
+
+		err = atcp_try_to_transmit(tp, skb);
+		if (err && err != -EAGAIN) {
+			sent = err;
+			break;
+		} else
+			atcp_out_read(nc, 0);
+
+		buf += size;
+		data_size -= size;
+		sent += size;
+	}
+
+	return sent;
+}
+
+static int atcp_process_out(struct netchannel *nc, void *buf, unsigned int data_size)
+{
+	struct atcp_protocol *tp = atcp_convert(nc->proto);
+	int ret = 0;
+
+	if (tp->state == TCP_CLOSE)
+		return -ECONNRESET;
+
+	if (tp->state == TCP_ESTABLISHED) {
+		ret = atcp_transmit_queue(tp);
+		if (ret)
+			goto out_read;
+#if 0
+		if (tp->qlen + data_size > atcp_max_qlen) {
+			ret = -EAGAIN;
+			goto out_read;
+		}
+#endif
+		if (atcp_in_slow_start(tp) || data_size + MAX_TCP_HEADER >= tp->mss)
+			ret = atcp_transmit_data(nc, buf, data_size);
+		else
+			ret = atcp_transmit_combined(nc, buf, data_size);
+	}
+
+out_read:
+	if (++tp->sent_without_reading >= 3) {
+		unsigned int tm = HZ;
+		
+		do {
+			if ((tp->state == TCP_ESTABLISHED) && atcp_can_send(tp))
+				tm = 0;
+			ulog("%s: sent_without_reading: %u, state: %u.\n", __func__, tp->sent_without_reading, tp->state);
+		} while (tp->sent_without_reading-- > 0 && atcp_out_read(nc, tm));
+
+		tp->sent_without_reading = 0;
+	}
+	return ret;
+}
+
+static int atcp_destroy(struct netchannel *nc)
+{
+	struct atcp_protocol *tp = atcp_convert(nc->proto);
+
+	cancel_rearming_delayed_work(&tp->work);
+	flush_scheduled_work();
+
+	if (tp->state == TCP_SYN_RECV ||
+			tp->state == TCP_ESTABLISHED || 
+			tp->state == TCP_FIN_WAIT1 ||
+			tp->state == TCP_FIN_WAIT2 ||
+			tp->state == TCP_CLOSE_WAIT)
+		atcp_send_bit(tp, TCP_FLAG_RST);
+
+	atcp_set_state(tp, TCP_CLOSE);
+	atcp_cleanup_queue(&tp->retransmit_queue, &tp->qlen);
+	atcp_cleanup_queue(&tp->ofo_queue, NULL);
+	return 0;
+}
+
+struct common_protocol atcp_common_protocol = {
+	.size		= sizeof(struct atcp_protocol),
+	.create		= &atcp_create,
+	.process_in	= &atcp_process_in,
+	.process_out	= &atcp_process_out,
+	.destroy	= &atcp_destroy,
+};

-- 
	Evgeniy Polyakov

^ permalink raw reply related	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-18  8:16 Netchannles: first stage has been completed. Further ideas Evgeniy Polyakov
@ 2006-07-18  8:34 ` David Miller
  2006-07-18  8:50   ` Evgeniy Polyakov
  2006-07-18 11:16 ` Christian Borntraeger
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 60+ messages in thread
From: David Miller @ 2006-07-18  8:34 UTC (permalink / raw)
  To: johnpol; +Cc: netdev

From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Date: Tue, 18 Jul 2006 12:16:26 +0400

> I would ask to push netchannel support into -mm tree, but I expect
> in advance that having two separate TCP stacks (one of which can
> contain some bugs (I mean atcp.c)) is not that good idea, so I
> understand possible negative feedback on that issue, but it is much
> better than silence.

Evgeniy, you are present in my queue of work to review.

Perhaps I am mistaken with my priorities, but I tend to hit all the
easy patches and bug fixes first, before significant new work.

And even in the realm of new work, your things require the most
serious thinking and consideration.  I apologize for the time it takes
me, therefore, to get to reviewing deep work such as your's.

I will make a real effort to properly review your excellent work this
week, and I encourage any other netdev hackers with some spare
cycles to do the same. :)

Thanks!

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-18  8:34 ` David Miller
@ 2006-07-18  8:50   ` Evgeniy Polyakov
  0 siblings, 0 replies; 60+ messages in thread
From: Evgeniy Polyakov @ 2006-07-18  8:50 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

On Tue, Jul 18, 2006 at 01:34:37AM -0700, David Miller (davem@davemloft.net) wrote:
> Perhaps I am mistaken with my priorities, but I tend to hit all the
> easy patches and bug fixes first, before significant new work.
> 
> And even in the realm of new work, your things require the most
> serious thinking and consideration.  I apologize for the time it takes
> me, therefore, to get to reviewing deep work such as your's.
> 
> I will make a real effort to properly review your excellent work this
> week, and I encourage any other netdev hackers with some spare
> cycles to do the same. :)

That would be great!

Please don't think that I wash people's mind with weekly "get it, get it" 
zombying stuff, I completely understand that there are things with much 
higher priority that netchannels, so it can wait (for a while :).

Thank you.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-18  8:16 Netchannles: first stage has been completed. Further ideas Evgeniy Polyakov
  2006-07-18  8:34 ` David Miller
@ 2006-07-18 11:16 ` Christian Borntraeger
  2006-07-18 11:51   ` Evgeniy Polyakov
  2006-07-18 12:15 ` Jörn Engel
  2006-07-18 23:01 ` Alexey Kuznetsov
  3 siblings, 1 reply; 60+ messages in thread
From: Christian Borntraeger @ 2006-07-18 11:16 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: netdev, David Miller

Hello Evgeniy,

> +asmlinkage long sys_netchannel_control(void __user *arg)
[...]
> +	if (copy_from_user(&ctl, arg, sizeof(struct unetchannel_control)))
> +		return -ERESTARTSYS;
                        ^^^^^^^^^^^
[...]
> +	if (copy_to_user(arg, &ctl, sizeof(struct unetchannel_control)))
> +		return -ERESTARTSYS;
                        ^^^^^^^^^^^

I think this should be -EFAULT instead of -ERESTARTSYS, right?

-- 
Mit freundlichen Grüßen / Best Regards

Christian Borntraeger
Linux Software Engineer zSeries Linux & Virtualization




^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-18 11:16 ` Christian Borntraeger
@ 2006-07-18 11:51   ` Evgeniy Polyakov
  2006-07-18 12:36     ` Christian Borntraeger
  0 siblings, 1 reply; 60+ messages in thread
From: Evgeniy Polyakov @ 2006-07-18 11:51 UTC (permalink / raw)
  To: Christian Borntraeger; +Cc: netdev, David Miller

On Tue, Jul 18, 2006 at 01:16:18PM +0200, Christian Borntraeger (borntrae@de.ibm.com) wrote:
> Hello Evgeniy,
> 
> > +asmlinkage long sys_netchannel_control(void __user *arg)
> [...]
> > +	if (copy_from_user(&ctl, arg, sizeof(struct unetchannel_control)))
> > +		return -ERESTARTSYS;
>                         ^^^^^^^^^^^
> [...]
> > +	if (copy_to_user(arg, &ctl, sizeof(struct unetchannel_control)))
> > +		return -ERESTARTSYS;
>                         ^^^^^^^^^^^
> 
> I think this should be -EFAULT instead of -ERESTARTSYS, right?

I have no strong feeling on what must be returned in that case.
As far as I see, copy*user can fail due to absence of the next
destination page, so -ERESTARTSYS makes sence, but if failure happens due to
process size limitation, -EFAULT is correct.

Let's change it to -EFAULT.

> -- 
> Mit freundlichen Grüßen / Best Regards
> 
> Christian Borntraeger
> Linux Software Engineer zSeries Linux & Virtualization

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-18  8:16 Netchannles: first stage has been completed. Further ideas Evgeniy Polyakov
  2006-07-18  8:34 ` David Miller
  2006-07-18 11:16 ` Christian Borntraeger
@ 2006-07-18 12:15 ` Jörn Engel
  2006-07-18 19:08   ` Evgeniy Polyakov
  2006-07-18 23:01 ` Alexey Kuznetsov
  3 siblings, 1 reply; 60+ messages in thread
From: Jörn Engel @ 2006-07-18 12:15 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: netdev, David Miller

On Tue, 18 July 2006 12:16:26 +0400, Evgeniy Polyakov wrote:
> 
> Current tests with the latest netchannel patch show that netchannels 
> outperforms sockets in any type of bulk transfer (big-sized, small-sized, 
> sending, receiving) over 1gb wire. I omit graphs and numbers here, 
> since I posted it already several times. I also plan to proceed
> some negotiations which would allow to test netchannel support in 10gbit
> environment, but it can also happen after second development stage
> completed.

[ I don't have enough time for a deeper look.  So if my questions are
stupid, please just tell me so and don't take it personal. ]

After having seen Van Jacobson's presentation at LCA twice, it
appeared to me that Van could get astonishing speedups with small
incremental steps, only changing kernel code and leaving the
kernel-userspace interface as is.

Changing (or rather adding a new) the userspace interface was just the
last step, which also gave some performance benefits but is also a
change to the userspace interface and therefore easy to get wrong and
hard to fix later.

Your description makes it sound as if you would take a huge leap,
changing all in-kernel code _and_ the userspace interface in a single
patch.  Am I wrong?  Or am I right and would it make sense to extract
small incremental steps from your patch similar to those Van did in
his non-published work?

Jörn

-- 
When people work hard for you for a pat on the back, you've got
to give them that pat.
-- Robert Heinlein

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-18 11:51   ` Evgeniy Polyakov
@ 2006-07-18 12:36     ` Christian Borntraeger
  2006-07-18 19:11       ` Evgeniy Polyakov
  0 siblings, 1 reply; 60+ messages in thread
From: Christian Borntraeger @ 2006-07-18 12:36 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: netdev, David Miller

On Tuesday 18 July 2006 13:51, Evgeniy Polyakov wrote:
> > I think this should be -EFAULT instead of -ERESTARTSYS, right?
>
> I have no strong feeling on what must be returned in that case.
> As far as I see, copy*user can fail due to absence of the next
> destination page, so -ERESTARTSYS makes sence, but if failure happens due
> to process size limitation, -EFAULT is correct.

If I am not completely mistaken ERESTARTSYS is wrong. 
include/linux/errno.h says userspace should never see ERESTARTSYS, therefore 
we should only return it if we were interrupted by a signal as do_signal 
takes care of ERESTARTSYS. Furthermore, copy*user transparently faults in 
necessary pages as long as the address is valid in the user context. 

> Let's change it to -EFAULT.

Thanks :-)


-- 
Mit freundlichen Grüßen / Best Regards

Christian Borntraeger
Linux Software Engineer zSeries Linux & Virtualization




^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-18 12:15 ` Jörn Engel
@ 2006-07-18 19:08   ` Evgeniy Polyakov
  2006-07-19 11:00     ` Jörn Engel
  0 siblings, 1 reply; 60+ messages in thread
From: Evgeniy Polyakov @ 2006-07-18 19:08 UTC (permalink / raw)
  To: J?rn Engel; +Cc: netdev, David Miller

On Tue, Jul 18, 2006 at 02:15:17PM +0200, J?rn Engel (joern@wohnheim.fh-wedel.de) wrote:
> 
> Your description makes it sound as if you would take a huge leap,
> changing all in-kernel code _and_ the userspace interface in a single
> patch.  Am I wrong?  Or am I right and would it make sense to extract
> small incremental steps from your patch similar to those Van did in
> his non-published work?

My first implementation used existing kernel code and showed small
performance win - there was binding of the socket to netchannel and all
protocol processing was moved into process context. It actually is the
same what IBM folks do, but my investigation showed that linux sending
side has some issues which would not allow to grow speed very noticebly
(after creating yet another congestion control algo I now think that the
problem is there, but I'm not 100% sure).
And after looking into Van's presentation (and his words about
_userspace_ protocol processing) I think they used own stack too.
So I reinvented the wheel and created my own too.

> J?rn
> 
> -- 
> When people work hard for you for a pat on the back, you've got
> to give them that pat.
> -- Robert Heinlein

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-18 12:36     ` Christian Borntraeger
@ 2006-07-18 19:11       ` Evgeniy Polyakov
  2006-07-18 21:20         ` David Miller
  0 siblings, 1 reply; 60+ messages in thread
From: Evgeniy Polyakov @ 2006-07-18 19:11 UTC (permalink / raw)
  To: Christian Borntraeger; +Cc: netdev, David Miller

On Tue, Jul 18, 2006 at 02:36:57PM +0200, Christian Borntraeger (borntrae@de.ibm.com) wrote:
> On Tuesday 18 July 2006 13:51, Evgeniy Polyakov wrote:
> > > I think this should be -EFAULT instead of -ERESTARTSYS, right?
> >
> > I have no strong feeling on what must be returned in that case.
> > As far as I see, copy*user can fail due to absence of the next
> > destination page, so -ERESTARTSYS makes sence, but if failure happens due
> > to process size limitation, -EFAULT is correct.
> 
> If I am not completely mistaken ERESTARTSYS is wrong. 
> include/linux/errno.h says userspace should never see ERESTARTSYS, therefore 
> we should only return it if we were interrupted by a signal as do_signal 
> takes care of ERESTARTSYS. Furthermore, copy*user transparently faults in 
> necessary pages as long as the address is valid in the user context. 

Actually userspace will not see ERESTARTSYS, when it is returned from
syscall.

> > Let's change it to -EFAULT.
> 
> Thanks :-)

No problem. I've commited this change already.
 
> -- 
> Mit freundlichen Gr????en / Best Regards
> 
> Christian Borntraeger
> Linux Software Engineer zSeries Linux & Virtualization

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-18 19:11       ` Evgeniy Polyakov
@ 2006-07-18 21:20         ` David Miller
  0 siblings, 0 replies; 60+ messages in thread
From: David Miller @ 2006-07-18 21:20 UTC (permalink / raw)
  To: johnpol; +Cc: borntrae, netdev

From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Date: Tue, 18 Jul 2006 23:11:37 +0400

> Actually userspace will not see ERESTARTSYS, when it is returned from
> syscall.

This is true only when a signal is pending.

It is the signal dispatch code that fixes up the return value
either by changing it to -EINTR or by resetting the register
state such that the signal handler returns to re-execute the
system call with the original set of argument register values.

If a signal is not pending, you risk leaking ERESTARTSYS to
userspace.

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-18  8:16 Netchannles: first stage has been completed. Further ideas Evgeniy Polyakov
                   ` (2 preceding siblings ...)
  2006-07-18 12:15 ` Jörn Engel
@ 2006-07-18 23:01 ` Alexey Kuznetsov
  2006-07-19  0:39   ` David Miller
                     ` (3 more replies)
  3 siblings, 4 replies; 60+ messages in thread
From: Alexey Kuznetsov @ 2006-07-18 23:01 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: netdev, David Miller

Hello!

Can I ask couple of questions? Just as a person who looked at VJ's
slides once and was confused. And startled, when found that it is not
considered as another joke of genuis. :-)

About locks:

> 	  is completely lockless (there is one irq lock when skb 
> is queued/dequeued into netchannels queue in hard/soft irq, 

Equivalent of socket spinlock.

> one mutex for netchannel's bucket 

Equivalent of socket user lock.

> and some locks on qdisk/NIC driver layer,

The same as in traditional code, right?

>From all that I see, this "completely lockless code" has not less locks
than traditional approach, even when doing no protocol processing.
Where am I wrong? Frankly speaking, when talking about locks,
I do not see anything, which could be saved, only TCP hash table
lookup can be RCUized, but this optimization obviously has nothing to do
with netchannels.

The only improvement in this area suggested in VJ's slides 
is a lock-free producer-consumer ring. It is missing in your patch
and I could guess it is not big loss, it is unlikely
to improve something significantly until the lock is heavily contended,
which never happens without massive network-level parallelism
for a single bucket.

The next question is about locality:

To find netchannel bucket in netif_receive_skb() you have to access
all the headers of packet. Right? Then you wait for processing in user
context, and this information is washed out of cache or even scheduled
on another CPU.

In traditional approach you also fetch all the headers on softirq,
but you do all the required work with them immediately and do not access them
when the rest of processing is done in process context. I do not see
how netchannels (without hardware classification) can improve something
here. At the first sight it makes locality worse.

Honestly, I do not see how this approach could improve performance
even a little. And it looks like your benchmarks confirm that all
the win is not due to architectural changes, but just because
some required bits of code are castrated.

VJ slides describe a totally different scheme, where softirq part is omitted
completely, protocol processing is moved to user space as whole.
It is an amazing toy. But I see nothing, which could promote its status
to practical. Exokernels used to do this thing for ages, and all the
performance gains are compensated by overcomplicated classification
engine, which has to remain in kernel and essentially to do the same
work which routing/firewalling/socket hash tables do.

> advance that having two separate TCP stacks (one of which can contain 
> some bugs (I mean atcp.c)) is not that good idea, so I understand 
> possible negative feedback on that issue, but it is much better than
> silence.

You are absolutely right here. Moreover, I can guess that absense
of feedback is a direct consequence of this thing. I would advise to
get rid of it and never mention it again. :-) If you took VJ suggestion
seriously and moved TCP engine to user space, it could remain unnoticed.
But if TCP stays in kernel (and it obviously has to), you want to work
with normal stack, you can improve, optimize and rewrite it infinitely,
but do not start with a toy. It proves nothing and compromises
the whole approach.

Alexey

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-18 23:01 ` Alexey Kuznetsov
@ 2006-07-19  0:39   ` David Miller
  2006-07-19  5:38   ` Evgeniy Polyakov
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 60+ messages in thread
From: David Miller @ 2006-07-19  0:39 UTC (permalink / raw)
  To: kuznet; +Cc: johnpol, netdev

From: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Date: Wed, 19 Jul 2006 03:01:21 +0400

> The only improvement in this area suggested in VJ's slides is a
> lock-free producer-consumer ring. It is missing in your patch and I
> could guess it is not big loss, it is unlikely to improve something
> significantly until the lock is heavily contended, which never
> happens without massive network-level parallelism for a single
> bucket.

And the gains from this ring can be obtained by stateless hardware
classification pointing to unique MSI-X PCI interrupt vectors that get
targetted to specific unique cpus.  It is true zero cost in that case.

I guess my excitement about VJ channels, from a practical viewpoint,
begin to wane even further.  How depressing :)

Devices can move flow work to individual cpus via intellegent
interrupt targeting, and OS should just get out of the way and
continue doing what it does today.  This idea is actually very old,
and PCI MSI-X interrupts just make it practical for commodity devices.

At least, there is less code to write. :-)))

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-18 23:01 ` Alexey Kuznetsov
  2006-07-19  0:39   ` David Miller
@ 2006-07-19  5:38   ` Evgeniy Polyakov
  2006-07-19  6:30     ` Evgeniy Polyakov
  2006-07-19 13:19     ` Alexey Kuznetsov
  2006-07-19 19:52   ` Stephen Hemminger
  2006-07-27  2:17   ` Rusty Russell
  3 siblings, 2 replies; 60+ messages in thread
From: Evgeniy Polyakov @ 2006-07-19  5:38 UTC (permalink / raw)
  To: Alexey Kuznetsov; +Cc: netdev, David Miller

On Wed, Jul 19, 2006 at 03:01:21AM +0400, Alexey Kuznetsov (kuznet@ms2.inr.ac.ru) wrote:
> Hello!

Hello, Alexey.

> Can I ask couple of questions? Just as a person who looked at VJ's
> slides once and was confused. And startled, when found that it is not
> considered as another joke of genuis. :-)
> 
> 
> About locks:
> 
> > 	  is completely lockless (there is one irq lock when skb 
> > is queued/dequeued into netchannels queue in hard/soft irq, 
> 
> Equivalent of socket spinlock.

There is no socket spinlock anymore.
Above lock is skb_queue lock which is held inside
skb_dequeue/skb_queue_tail calls.

> > one mutex for netchannel's bucket 
> 
> Equivalent of socket user lock.

No, it is an equivalent for hash lock in socket table.

> > and some locks on qdisk/NIC driver layer,
> 
> The same as in traditional code, right?

I use dst_output(), so it is possible to have as many locks inside
low-level NIC driver as you want.

> From all that I see, this "completely lockless code" has not less locks
> than traditional approach, even when doing no protocol processing.
> Where am I wrong? Frankly speaking, when talking about locks,
> I do not see anything, which could be saved, only TCP hash table
> lookup can be RCUized, but this optimization obviously has nothing to do
> with netchannels.

It looks like you should looks at it again :)
Just an example - tcp_established() can be called with bh disabled under
the socket lock. In netchannels there is no need for that.

> The only improvement in this area suggested in VJ's slides 
> is a lock-free producer-consumer ring. It is missing in your patch
> and I could guess it is not big loss, it is unlikely
> to improve something significantly until the lock is heavily contended,
> which never happens without massive network-level parallelism
> for a single bucket.

That's because I decided to use skbs, but not special structures and
thus I use the same queue as socket code (and have the only one lock
inside skb_queue_tail()/skb_dequeue()). I will describe below why I do
not changed it to more hardware-friendly stuff.

> The next question is about locality:
> 
> To find netchannel bucket in netif_receive_skb() you have to access
> all the headers of packet. Right? Then you wait for processing in user
> context, and this information is washed out of cache or even scheduled
> on another CPU.
> 
> In traditional approach you also fetch all the headers on softirq,
> but you do all the required work with them immediately and do not access them
> when the rest of processing is done in process context. I do not see
> how netchannels (without hardware classification) can improve something
> here. At the first sight it makes locality worse.

In that case one copies the whole data into userspace, so access for 20
bytes of headers completely does not matter.

> Honestly, I do not see how this approach could improve performance
> even a little. And it looks like your benchmarks confirm that all
> the win is not due to architectural changes, but just because
> some required bits of code are castrated.

Hmm, for 80 bytes sized packets win was about 2.5 times. Could you
please show me lines inside existing code, which should be commented, so
I got 50Mbyte/sec for that?

> VJ slides describe a totally different scheme, where softirq part is omitted
> completely, protocol processing is moved to user space as whole.
> It is an amazing toy. But I see nothing, which could promote its status
> to practical. Exokernels used to do this thing for ages, and all the
> performance gains are compensated by overcomplicated classification
> engine, which has to remain in kernel and essentially to do the same
> work which routing/firewalling/socket hash tables do.

There are several ideas presented in his slides.
For my personal opinion most of performance win is obtained from
userspace processing and memcpy instead of copy_to_user() (but my
previous work showed that it is not the case for a lot of situations),
so I created first approach, tested second and now move into fully
zero-copy design. How skbs or other structures are delivered into the
queue/array does not matter in my design - I can replace it in a moment,
but I do not want to mess with drivers, since it is huge break, which
must be done after high-level stuff proven to work good.

> > advance that having two separate TCP stacks (one of which can contain 
> > some bugs (I mean atcp.c)) is not that good idea, so I understand 
> > possible negative feedback on that issue, but it is much better than
> > silence.
> 
> You are absolutely right here. Moreover, I can guess that absense
> of feedback is a direct consequence of this thing. I would advise to
> get rid of it and never mention it again. :-) If you took VJ suggestion
> seriously and moved TCP engine to user space, it could remain unnoticed.
> But if TCP stays in kernel (and it obviously has to), you want to work
> with normal stack, you can improve, optimize and rewrite it infinitely,
> but do not start with a toy. It proves nothing and compromises
> the whole approach.

Well, you probably did not read my previous e-mails about netchannels.
I showed there, that using existing stack it is imposible to get big
performance win (although I tested my patches with 1gb only), since
there are some nitpicks on sending side (now I think it is congestion
control, but I'm not 100% sure).

The only thing my TCP stack implementation proves, that it is possible
to have higher transfer rate with existing NICs over TCP that with
existing socket code, no more, no less.

> Alexey

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-19  5:38   ` Evgeniy Polyakov
@ 2006-07-19  6:30     ` Evgeniy Polyakov
  2006-07-19 13:19     ` Alexey Kuznetsov
  1 sibling, 0 replies; 60+ messages in thread
From: Evgeniy Polyakov @ 2006-07-19  6:30 UTC (permalink / raw)
  To: Alexey Kuznetsov; +Cc: netdev, David Miller

On Wed, Jul 19, 2006 at 09:38:41AM +0400, Evgeniy Polyakov (johnpol@2ka.mipt.ru) wrote:
> There is no socket spinlock anymore.
> Above lock is skb_queue lock which is held inside
> skb_dequeue/skb_queue_tail calls.
>  
> > > one mutex for netchannel's bucket 
> > 
> > Equivalent of socket user lock.
> 
> No, it is an equivalent for hash lock in socket table.

Futher lock description:
low-level netchannels input uses RCU, where RCU and hash lock are
used in socket code. Userspace can bind file descriptor to netchannel,
so there will be no locks at all (netchannel bucket mutex is held, when 
netchannel lookup happens). That basically means that if two readers call 
netchannel_recv() simultaneously, there will be som troubles, so in that 
case netchannel bucket mutex is equivalent to socket lock (if netchannel
mutex is used).

But main thing here is that there is no need to protect against bh
context, and nothing is locked and processed in bh context at all.

This leads to complete removal of atomic allocations in netchannel code.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-18 19:08   ` Evgeniy Polyakov
@ 2006-07-19 11:00     ` Jörn Engel
  2006-07-20  7:42       ` Evgeniy Polyakov
  0 siblings, 1 reply; 60+ messages in thread
From: Jörn Engel @ 2006-07-19 11:00 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: netdev, David Miller

On Tue, 18 July 2006 23:08:01 +0400, Evgeniy Polyakov wrote:
> On Tue, Jul 18, 2006 at 02:15:17PM +0200, J?rn Engel (joern@wohnheim.fh-wedel.de) wrote:
> > 
> > Your description makes it sound as if you would take a huge leap,
> > changing all in-kernel code _and_ the userspace interface in a single
> > patch.  Am I wrong?  Or am I right and would it make sense to extract
> > small incremental steps from your patch similar to those Van did in
> > his non-published work?
> 
> My first implementation used existing kernel code and showed small
> performance win - there was binding of the socket to netchannel and all
> protocol processing was moved into process context.

Iirc, Van didn't show performance numbers but rather cpu utilization
numbers.  And those went down significantly without changing the
userspace interface.

Did you look at cpu utilization as well?  If you did and your numbers
are worse than Vans, he either did something smarter than you or
forged his numbers (quite unlikely).

Jörn

-- 
Sometimes, asking the right question is already the answer.
-- Unknown

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-19  5:38   ` Evgeniy Polyakov
  2006-07-19  6:30     ` Evgeniy Polyakov
@ 2006-07-19 13:19     ` Alexey Kuznetsov
  2006-07-20  7:32       ` Evgeniy Polyakov
  1 sibling, 1 reply; 60+ messages in thread
From: Alexey Kuznetsov @ 2006-07-19 13:19 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: netdev, David Miller

Hello!

> There is no socket spinlock anymore.
> Above lock is skb_queue lock which is held inside
> skb_dequeue/skb_queue_tail calls.

Lock is named differently, but it is still here.
BTW for UDP even the name is the same.

> > Equivalent of socket user lock.
> 
> No, it is an equivalent for hash lock in socket table.

OK. But you have to introduce socket mutex somewhere in any case.
Even in ATCP.

> Just an example - tcp_established() can be called with bh disabled under
> the socket lock.

When we have a process context in hands, it is not.

Did you ask youself, why do not we put all the packets to backlog/prequeue
and just wait when user will read the data? It would be 100% equivalent
to "netchannels".

The answer is simple: because we cannot wait. If user delays for 200msec,
wait for connection collapse due to retransmissions. If the segment is
out of order, immediate attention is required. Any scheme, which tries
to wait for user unconditionally, at least has to run a watchdog timer,
which fires before sender senses the gap.

And this is what we do for ages. Grep for "VJ" in sources. :-)

netchannels have nothing to do with it, it is much elder idea.

> In that case one copies the whole data into userspace, so access for 20
> bytes of headers completely does not matter.

For short packets it matters.

But I said not this. I said it looks _worse_. A bit, but worse.

> Hmm, for 80 bytes sized packets win was about 2.5 times. Could you
> please show me lines inside existing code, which should be commented, so
> I got 50Mbyte/sec for that?

If I knew it would be done. :-)

Actually, it is the action, which I would expect. This, but
not dropping all the TCP stack.

> I showed there, that using existing stack it is imposible

Please, understand, it is such statements that compromise your work.
If it is impossible then it is not interesting.

Alexey

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-18 23:01 ` Alexey Kuznetsov
  2006-07-19  0:39   ` David Miller
  2006-07-19  5:38   ` Evgeniy Polyakov
@ 2006-07-19 19:52   ` Stephen Hemminger
  2006-07-19 20:01     ` David Miller
  2006-07-27  2:17   ` Rusty Russell
  3 siblings, 1 reply; 60+ messages in thread
From: Stephen Hemminger @ 2006-07-19 19:52 UTC (permalink / raw)
  To: Alexey Kuznetsov; +Cc: Evgeniy Polyakov, netdev, David Miller

As a related note, I am looking into fixing inet hash tables to use RCU.

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-19 19:52   ` Stephen Hemminger
@ 2006-07-19 20:01     ` David Miller
  2006-07-19 20:16       ` Stephen Hemminger
  2006-07-24 18:54       ` Stephen Hemminger
  0 siblings, 2 replies; 60+ messages in thread
From: David Miller @ 2006-07-19 20:01 UTC (permalink / raw)
  To: shemminger; +Cc: kuznet, johnpol, netdev

From: Stephen Hemminger <shemminger@osdl.org>
Date: Wed, 19 Jul 2006 15:52:04 -0400

> As a related note, I am looking into fixing inet hash tables to use RCU.

IBM had posted a patch a long time ago, which would be not
so hard to munge into the current tree.  See if you can
spot it in the archives :)

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-19 20:01     ` David Miller
@ 2006-07-19 20:16       ` Stephen Hemminger
  2006-07-24 18:54       ` Stephen Hemminger
  1 sibling, 0 replies; 60+ messages in thread
From: Stephen Hemminger @ 2006-07-19 20:16 UTC (permalink / raw)
  To: David Miller; +Cc: kuznet, johnpol, netdev

On Wed, 19 Jul 2006 13:01:50 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> From: Stephen Hemminger <shemminger@osdl.org>
> Date: Wed, 19 Jul 2006 15:52:04 -0400
> 
> > As a related note, I am looking into fixing inet hash tables to use RCU.
> 
> IBM had posted a patch a long time ago, which would be not
> so hard to munge into the current tree.  See if you can
> spot it in the archives :)

Ben posted a patch in March, and IBM did one a while ago.
I am looking at both.

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-19 13:19     ` Alexey Kuznetsov
@ 2006-07-20  7:32       ` Evgeniy Polyakov
  2006-07-20 16:41         ` Alexey Kuznetsov
  0 siblings, 1 reply; 60+ messages in thread
From: Evgeniy Polyakov @ 2006-07-20  7:32 UTC (permalink / raw)
  To: Alexey Kuznetsov; +Cc: davem, netdev

> Hello!

Hello, Alexey.

[ Sorry for long delay, there are some problems with mail servers, so I
can not access them remotely, so I create mail by hads, hopefully thread
will not be broken. ]

>> There is no socket spinlock anymore.
>> Above lock is skb_queue lock which is held inside
>> skb_dequeue/skb_queue_tail calls.

> Lock is named differently, but it is still here.
> BTW for UDP even the name is the same.

There is no bh processing, that lock is needed for 4 operations when skb
is enqueued/dequeued.

And if I would changed skbs to different structures there were no locks
at all - it is extremely lightweight, it can not be compared with socket
lock at all.

No bh/irq processing at all, natural speed management - that is main idea
behind netchannels.

>> > Equivalent of socket user lock.
>> 
>> No, it is an equivalent for hash lock in socket table.

>OK. But you have to introduce socket mutex somewhere in any case.
>Even in ATCP.

Actually not - VJ's idea is to have only one consumer and one provider,
so no locks needed, but I agree, in general case it is needed, but _only_
to protect against several netchannel userspace consumers.
There is no BH protocol processing at all, so there is no need to
pprotect against someone who will add data while you are processing own
chunk.

>> Just an example - tcp_established() can be called with bh disabled
>> under the socket lock.

> When we have a process context in hands, it is not.

>Did you ask youself, why do not we put all the packets to
>backlog/prequeue
>and just wait when user will read the data? It would be 100% equivalent
>to "netchannels".

How many hacks just to be a bit closer to userspace processing,
implemented in netchannels!

>The answer is simple: because we cannot wait. If user delays for
>200msec,
>wait for connection collapse due to retransmissions. If the segment is
>out of order, immediate attention is required. Any scheme, which tries
>to wait for user unconditionally, at least has to run a watchdog timer,
>which fires before sender senses the gap.

If userspace is scheduled away for too much time, it is bloody wrong to
ack the data, that is impossible to read due to the fact that system is
being busy. It is just postponing the work from one end to another - ack
now and stop when queue is full, or postpone the ack generation when
segment is realy being read.

>And this is what we do for ages. Grep for "VJ" in sources. :-)
>netchannels have nothing to do with it, it is much elder idea.

And it was Van, who decided to move away from BH/irq processing.
It was slow and a bit pain way (how many hacks with prequeue, with
direct processing, it is enough just to look how TCP socket lock is locked
in different contexts :)

>> In that case one copies the whole data into userspace, so access for
>> 20 bytes of headers completely does not matter.

>For short packets it matters.

>But I said not this. I said it looks _worse_. A bit, but worse.

At least for 80 bytes it does not matter at all.
And it is very likely that data is misaligned, so half of the
header will be in a cache line. And socket code has the same problem -
skb->cb can be flushed away, and tcp_recvmsg() needs to get it again.
And actually I never understood nanooptimisation behind more serious
problems (i.e. one cache line vs. 50MB/sec speed).

>> Hmm, for 80 bytes sized packets win was about 2.5 times. Could you
>> please show me lines inside existing code, which should be commented,
>> so I got 50Mbyte/sec for that?

>If I knew it would be done. :-)
>
>Actually, it is the action, which I would expect. This, but
>not dropping all the TCP stack.

I tried to use existing one, and I had speed and CPU usage win, but it's
magnitude was not what I expected, so I started userspace network stack
implementation. It was succeded, and there are _very_ major
optimisations over existing code, when processing is fully moved into
userspace, but also there are big problems, like one syscall per ack, 
so I decided to use that stack as a base for in-kernel process protocol 
processing, and I succeded. Probably I will return to the userspace 
network stack idea when I complete zero-copy networking support.

>> I showed there, that using existing stack it is imposible

>Please, understand, it is such statements that compromise your work.
>If it is impossible then it is not interesting.

Do not mix soft and warm - I just post the facts, that netchannel TCP
implementation works (sumetimes much) faster.
It is socket code that probably has some misoptimisations, and if it is
impossible to fix them (well, it least it is very hard), then it is not
interesting.

I definitely do not say, that it must be removed/replaced/anything - it
works perfectly ok, but it is possible to have better performance by
changing architecture, and it was done.

>Alexey

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-19 11:00     ` Jörn Engel
@ 2006-07-20  7:42       ` Evgeniy Polyakov
  0 siblings, 0 replies; 60+ messages in thread
From: Evgeniy Polyakov @ 2006-07-20  7:42 UTC (permalink / raw)
  To: Jrn Engel; +Cc: davem, netdev

Hello.

[ Sorry for long delay, there are some problems with mail servers, so I
can not access them remotely, so I create mail by hads, hopefully thread
will not be broken. ]

>> > Your description makes it sound as if you would take a huge leap,
>> > changing all in-kernel code _and_ the userspace interface in a
>> > single
>> > patch.  Am I wrong?  Or am I right and would it make sense to
>> > extract
>> > small incremental steps from your patch similar to those Van did in
>> > his non-published work?
>> 
>> My first implementation used existing kernel code and showed small
>> performance win - there was binding of the socket to netchannel and
>> all
>> protocol processing was moved into process context.

>Iirc, Van didn't show performance numbers but rather cpu utilization
>numbers.  And those went down significantly without changing the
>userspace interface.

At least lca presentation graphs shows exactly different numbers - 
performance without CPU utilization (but not as his tables).

>Did you look at cpu utilization as well?  If you did and your numbers
>are worse than Vans, he either did something smarter than you or
>forged his numbers (quite unlikely).

Interesting sentence from political correcteness point of view :)

I did both CPU and speed measurements when used socket code [1], 
and both of them showed small gain, but I only tested 1gbit setup, so
they can not be compared with Van's.
But even with 1gb I was not satisfied with them, so I started different
implementation, which I described in my e-mail to Alexey.

1. speed/cpu measurements of one of the netchannels implementation which
used socket code.
http://thread.gmane.org/gmane.linux.network/36609/focus=36614

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-20  7:32       ` Evgeniy Polyakov
@ 2006-07-20 16:41         ` Alexey Kuznetsov
  2006-07-20 21:08           ` Evgeniy Polyakov
  0 siblings, 1 reply; 60+ messages in thread
From: Alexey Kuznetsov @ 2006-07-20 16:41 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: davem, netdev

Hello!

Small question first:

> userspace, but also there are big problems, like one syscall per ack,

I do not see redundant syscalls. Is not it expected to send ACKs only
after receiving data as you said? What is the problem?

Now boring things:

> There is no BH protocol processing at all, so there is no need to
> pprotect against someone who will add data while you are processing own
> chunk.

Essential part of socket user lock is the same mutex.

Backlog is actually not a protection, but a thing equivalent to netchannel.
The difference is only that it tries to process something immediately,
when it is safe. You can omit this and push everything to backlog(=netchannel),
which is processed only by syscalls, if you do not care about latency.

> How many hacks just to be a bit closer to userspace processing,
> implemented in netchannels!

Moving processing closer to userspace is not a goal, it is a tool.
Which sometimes useful, but generally quite useless.

F.e. in your tests it should not affect performance at all,
end user is just a sink.

What's about prequeueing, it is a bright example. Guess why is it useful?
What does it save? Nothing, like netchannel. Answer is: it is just a tool
to generate coarsed ACKs in a controlled manner without essential violation
of protocol. (Well, and to combine checksumming and copy if you do not like how
your card does this)

> If userspace is scheduled away for too much time, it is bloody wrong to
> ack the data, that is impossible to read due to the fact that system is
> being busy. It is just postponing the work from one end to another - ack
> now and stop when queue is full, or postpone the ack generation when
> segment is realy being read.

... when you get all the segments nicely aligned, blah-blah-blah.

If you do not care about losses-congestion-delays-delacks-whatever,
you have a totally different protocol. Sending window feedback
is only a minor part of tcp. But even these boring tcp intrinsics
are not so important, look at ideal lossless network:

Think what happens f.e. while plain file transfer to your notebook.
You get 110MB/sec for a few seconds, then writeback is fired and
disk io subsystems discovers that the disk holds only 50MB/sec.
If you are unlucky and some another application starts, disk is so congested
that it will take lots of seconds to make a progress with io.
For this time another side will retransmit, because poor thing thought
rtt is 100 usecs and you will never return to 50MB/sec.

You have to _CLOSE_ window in the case of long delay, rather than to forget
to ack. See the difference?

It is just because actual "end" user is still far far away.
And this happens all the time, when you relay the results to another
application via pipe, when... Well, the only case where real "end user"
is user of "netchannel" is when you receive to a sink.

> >But I said not this. I said it looks _worse_. A bit, but worse.
> 
> At least for 80 bytes it does not matter at all.

Hello-o, do you hear me? :-)

I am asking: it looks not much better, but a bit worse,
then what is real reason for better performance, unless it is
due to castration of protocol?

Simplify protocol, move all the processing (even memory copies) to softirq,
leave to user space only feeding pages to copy and you will have unbeatable
performance. Been there, done that, not with TCP of course, but if you do not
care about losses and ACK clocking and send an ACK once per window,
I do not see how it can spoil the situation.

> And actually I never understood nanooptimisation behind more serious
> problems (i.e. one cache line vs. 50MB/sec speed).

You deal with 80 byte packets, to all that I understand.
If you lose one cacheline per packet, it is a big problem.

All that we can change is protocol overhead. Handling data part
is invariant anyway. You are scared of complexity of tcp, but
you obviously forget one thing: cpu is fast.
The code can look very complicated: some crazy hash functions,
damn hairy protocol processing, but if you take care about caches etc.,
all this is dominated by the first look into packet in eth_type_trans()
or ip_rcv().

BTW, when you deal with normal data flow, cache can be not dirtied
by data at all, it can be bypassed.

> works perfectly ok, but it is possible to have better performance by
> changing architecture, and it was done.

It is exactly the point of trouble. From all that I see and you said,
better performance is got not due to change of architecture,
but despite of this.

A proof that we can perform better by changing protocol is not required,
it is kinda obvious. The question is how to make existing protocol
to perform better.

I have no idea, why your tcp performs better. It can be everything:
absence of slow start, more coarse ACKs, whatever. I believe you were careful
to check those reasons and to do a fair comparison, but then the only guess
remains that you saved lots of i-cache getting rid of long code path.

And none of those guesses can be attributed to "netchannels". :-)

Alexey

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-20 16:41         ` Alexey Kuznetsov
@ 2006-07-20 21:08           ` Evgeniy Polyakov
  2006-07-20 21:21             ` Ben Greear
                               ` (2 more replies)
  0 siblings, 3 replies; 60+ messages in thread
From: Evgeniy Polyakov @ 2006-07-20 21:08 UTC (permalink / raw)
  To: Alexey Kuznetsov; +Cc: davem, netdev

On Thu, Jul 20, 2006 at 08:41:00PM +0400, Alexey Kuznetsov (kuznet@ms2.inr.ac.ru) wrote:
> Hello!

Hello, Alexey.

> Small question first:
> 
> > userspace, but also there are big problems, like one syscall per ack,
> 
> I do not see redundant syscalls. Is not it expected to send ACKs only
> after receiving data as you said? What is the problem?

I mean that each ack is a pure syscall without any data, so overhead is
quite huge compared to the situatin when acks are created in
kernelspace.
At least slow start will eat a lot of CPU with them.

> Now boring things:
> 
> > There is no BH protocol processing at all, so there is no need to
> > pprotect against someone who will add data while you are processing own
> > chunk.
> 
> Essential part of socket user lock is the same mutex.
> 
> Backlog is actually not a protection, but a thing equivalent to netchannel.
> The difference is only that it tries to process something immediately,
> when it is safe. You can omit this and push everything to backlog(=netchannel),
> which is processed only by syscalls, if you do not care about latency.

If we consider netchannels as how Van Jackobson discribed them, then
mutext is not needed, since it is impossible to have several readers or
writers. But in socket case even if there is only one userspace
consumer, that lock must be held to protect against bh (or introduce
several queues and complicate a lot their's management (ucopy for
example)).

> > How many hacks just to be a bit closer to userspace processing,
> > implemented in netchannels!
> 
> Moving processing closer to userspace is not a goal, it is a tool.
> Which sometimes useful, but generally quite useless.
> 
> F.e. in your tests it should not affect performance at all,
> end user is just a sink.
> 
> What's about prequeueing, it is a bright example. Guess why is it useful?
> What does it save? Nothing, like netchannel. Answer is: it is just a tool
> to generate coarsed ACKs in a controlled manner without essential violation
> of protocol. (Well, and to combine checksumming and copy if you do not like how
> your card does this)

I can not agree here. 
The main goal of the protocol is data delivery to the user, but not
it's blind accepting and data transmit from user, but not some other
ring.
As you see, sending is already implemented in process' context,
but receiving is not directly connected to the user.
THe more elemnts between user and it's data we have, the more
probability of some problems there. And we already have two queues just
to eliminate one of them.
Moving protocol (no matter if it is TCP or not) closer to user allows
naturally control the dataflow - when user can read that data(and _this_
is the main goal), user acks, when it can not - it does not generate
ack. In theory that can lead to the full absence of the congestions,
especially if receiving window can be controlled in both directions.
At least with current state of routers it does not lead to the broken
connections.

> > If userspace is scheduled away for too much time, it is bloody wrong to
> > ack the data, that is impossible to read due to the fact that system is
> > being busy. It is just postponing the work from one end to another - ack
> > now and stop when queue is full, or postpone the ack generation when
> > segment is realy being read.
> 
> ... when you get all the segments nicely aligned, blah-blah-blah.
> 
> If you do not care about losses-congestion-delays-delacks-whatever,
> you have a totally different protocol. Sending window feedback
> is only a minor part of tcp. But even these boring tcp intrinsics
> are not so important, look at ideal lossless network:
> 
> Think what happens f.e. while plain file transfer to your notebook.
> You get 110MB/sec for a few seconds, then writeback is fired and
> disk io subsystems discovers that the disk holds only 50MB/sec.
> If you are unlucky and some another application starts, disk is so congested
> that it will take lots of seconds to make a progress with io.
> For this time another side will retransmit, because poor thing thought
> rtt is 100 usecs and you will never return to 50MB/sec.
> 
> You have to _CLOSE_ window in the case of long delay, rather than to forget
> to ack. See the difference?
> 
> It is just because actual "end" user is still far far away.
> And this happens all the time, when you relay the results to another
> application via pipe, when... Well, the only case where real "end user"
> is user of "netchannel" is when you receive to a sink.

There is one problem in your logic.
RTT will not be so small, since acks are not sent when user does not
read data.

> > >But I said not this. I said it looks _worse_. A bit, but worse.
> > 
> > At least for 80 bytes it does not matter at all.
> 
> Hello-o, do you hear me? :-)
> 
> I am asking: it looks not much better, but a bit worse,
> then what is real reason for better performance, unless it is
> due to castration of protocol?

Well, if speed would be measured in lines of code, that atcp gets far less than
existing tcp, but performance win is only 2.5 times.

> Simplify protocol, move all the processing (even memory copies) to softirq,
> leave to user space only feeding pages to copy and you will have unbeatable
> performance. Been there, done that, not with TCP of course, but if you do not
> care about losses and ACK clocking and send an ACK once per window,
> I do not see how it can spoil the situation.

Do you live in a perfect world, where user does not want what was
requested? I thought we both live in Russia or at least on the same Earth. 
I'm not 100% sure now...

Userspace needs that data, and it gets
it with netchannels (and sends it, and copies using copy_to_user()).

> > And actually I never understood nanooptimisation behind more serious
> > problems (i.e. one cache line vs. 50MB/sec speed).
> 
> You deal with 80 byte packets, to all that I understand.
> If you lose one cacheline per packet, it is a big problem.

So actual netchannels speed is even better? :)

> All that we can change is protocol overhead. Handling data part
> is invariant anyway. You are scared of complexity of tcp, but
> you obviously forget one thing: cpu is fast.
> The code can look very complicated: some crazy hash functions,
> damn hairy protocol processing, but if you take care about caches etc.,
> all this is dominated by the first look into packet in eth_type_trans()
> or ip_rcv().

I think I start to repeat myself: cache issues are the same.
You get headers into the cache in bh/interrupt time, you ron protocol
processing. softirq is completed, block layer flushes everything away,
you run recv() -> tcp_recvmsg() which loads into the cache skb->cb.
Point.

> BTW, when you deal with normal data flow, cache can be not dirtied
> by data at all, it can be bypassed.

You cut the lines about misaligned data, which is very common case.
So part of the header is in a cache line. You also cut lines about
exactly the same problem with existing code, since it stores a lot of
variables in skb->cb which is flushed away too.

You forget to say that with disabled bh you must done a lot of things -
ack (with atomic allocation), queueing, out-of-ordder handling and much
more. And then your process is scheduled away, skb->cb and other
variables are flushed away, and in tcp_recvmsg() time you get them
again. And you never measured that impact on performance, as long as I
never did that too, since it is quite hard to determine how much is the
cache line flushing price and how many of them were removed.
In theory it is perfect, but in practice netchannels perform much
better, although they have "all those problems"...
If protocol is "castrated", but it still allows to work faster, then
tell me, why we should keep (enabled) that redundant functionality?
Because it can work better in some other places, and that is correct,
but why it should be enabled then in majority of the cases?

> > works perfectly ok, but it is possible to have better performance by
> > changing architecture, and it was done.
> 
> It is exactly the point of trouble. From all that I see and you said,
> better performance is got not due to change of architecture,
> but despite of this.
> 
> A proof that we can perform better by changing protocol is not required,
> it is kinda obvious. The question is how to make existing protocol
> to perform better.
> 
> I have no idea, why your tcp performs better. It can be everything:
> absence of slow start, more coarse ACKs, whatever. I believe you were careful
> to check those reasons and to do a fair comparison, but then the only guess
> remains that you saved lots of i-cache getting rid of long code path.
> 
> And none of those guesses can be attributed to "netchannels". :-)

Well, atcp does have slow start, I implemented several ack generation algos,
and there was noticeble difference, but in any case netchannels were
faster, there were used several different MSS combining methods, and a
lot of testing to achieve current state of the atcp, so I think protocol
itself can produce some gain in performance. Cache issues are the same.

Let's draw the line.

You do not know, why netchannels work faster, but you are sure it
is not because of protocol processing happens in process context,
since you do not know why it can help in that case.

I understand your position.

My point of view, as one can expect, differs  from yours - netchannels
perform faster not only because of different TCP implementation, but
because of architectural changes (no BH/irq processing, no bh/irq locks, 
no complex queue management, no atomic allocations, no false (I do
understand that it is wrong word in this context, but from above one can
see, what I mean) acks, thus no possible queue overfull, natural flow
control and other things).

According to your logic, it is impossible to have faster processing
(with existing socket code), when protocol management is moved totally
into process context, but I showed with my initial netchannel implementation,
that it can be done - and there was small, but 100% reproducible steady
performance win (about 2-3 MB/sec and several % of CPU usage) with
big-sized chunks. Unfortunately I did not test small-sized ones, which
show big perfomance win with netchannels and atcp. Those results were
not enough for me, so I implemented different stack, which does not
have anything related to the two step processing, and it can be one of the reasons
for faster processing. It can have bugs, but the whole idea was proven
to be absolutely correct (when using either socket code, or atcp).

That was my opinion on the topic. It looks like neither you, nor me will
not change our point of view about that right now :)
But anyway it is a good discussion, let's see what others think about
it.

> Alexey

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-20 21:08           ` Evgeniy Polyakov
@ 2006-07-20 21:21             ` Ben Greear
  2006-07-21  7:19               ` Evgeniy Polyakov
  2006-07-20 21:40             ` Ian McDonald
  2006-07-20 22:59             ` Alexey Kuznetsov
  2 siblings, 1 reply; 60+ messages in thread
From: Ben Greear @ 2006-07-20 21:21 UTC (permalink / raw)
  To: johnpol; +Cc: Alexey Kuznetsov, davem, netdev

Evgeniy Polyakov wrote:

>>Backlog is actually not a protection, but a thing equivalent to netchannel.
>>The difference is only that it tries to process something immediately,
>>when it is safe. You can omit this and push everything to backlog(=netchannel),
>>which is processed only by syscalls, if you do not care about latency.
> 
> 
> If we consider netchannels as how Van Jackobson discribed them, then
> mutext is not needed, since it is impossible to have several readers or
> writers. But in socket case even if there is only one userspace
> consumer, that lock must be held to protect against bh (or introduce
> several queues and complicate a lot their's management (ucopy for
> example)).

Out of curiosity, is it possible to have the single producer logic
if you have two+ ethernet interfaces handling frames for a single
TCP connection?  (I am assuming some sort of multi-path routing
logic...)

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-20 21:08           ` Evgeniy Polyakov
  2006-07-20 21:21             ` Ben Greear
@ 2006-07-20 21:40             ` Ian McDonald
  2006-07-21  7:26               ` Evgeniy Polyakov
  2006-07-20 22:59             ` Alexey Kuznetsov
  2 siblings, 1 reply; 60+ messages in thread
From: Ian McDonald @ 2006-07-20 21:40 UTC (permalink / raw)
  To: johnpol; +Cc: Alexey Kuznetsov, davem, netdev

>
> If we consider netchannels as how Van Jackobson discribed them, then
> mutext is not needed, since it is impossible to have several readers or
> writers. But in socket case even if there is only one userspace
> consumer, that lock must be held to protect against bh (or introduce
> several queues and complicate a lot their's management (ucopy for
> example)).
>
As I recall Van's talk you don't need a lock with a ring buffer if you
have a start and end variable pointing to location within ring buffer.

He didn't explain this in great depth as it is computer science 101
but here is how I would explain it:

Once socket is initialiased consumer is the only one that sets start
variable and network driver reads this only. It is the other way
around for the end variable. As long as the writes are atomic then you
are fine. You only need one ring buffer in this scenario and two
atomic variables.

Having atomic writes does have overhead but far less than locking semantic.
-- 
Ian McDonald
Web: http://wand.net.nz/~iam4
Blog: http://imcdnzl.blogspot.com
WAND Network Research Group
Department of Computer Science
University of Waikato
New Zealand

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-20 21:08           ` Evgeniy Polyakov
  2006-07-20 21:21             ` Ben Greear
  2006-07-20 21:40             ` Ian McDonald
@ 2006-07-20 22:59             ` Alexey Kuznetsov
  2006-07-21  4:55               ` David Miller
  2 siblings, 1 reply; 60+ messages in thread
From: Alexey Kuznetsov @ 2006-07-20 22:59 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: davem, netdev

Hello!

> Moving protocol (no matter if it is TCP or not) closer to user allows
> naturally control the dataflow - when user can read that data(and _this_
> is the main goal), user acks, when it can not - it does not generate
> ack. In theory

To all that I rememeber, in theory absence of feedback leads
to loss of control yet. The same is in practice, unfortunately.
You must say that window is closed, otherwise sender is totally
confused.


> There is one problem in your logic.
> RTT will not be so small, since acks are not sent when user does not
> read data.

It is arithmetics: rtt = window/rate.

And rto stays rounded up to 200 msec, unless you messed the connection
so hard that it is not alive. Check.


> > Simplify protocol, move all the processing (even memory copies) to softirq,
> > leave to user space only feeding pages to copy and you will have unbeatable
> > performance. Been there, done that, not with TCP of course, but if you do not
> > care about losses and ACK clocking and send an ACK once per window,
> > I do not see how it can spoil the situation.
> 
> Do you live in a perfect world, where user does not want what was
> requested?

All the time I am trying to bring you attention that you read to sink. :-)
At least, read to disk to move it a little closer to reality.
Or at least do it from terminal and press ^Z sometimes.


> > You deal with 80 byte packets, to all that I understand.
> > If you lose one cacheline per packet, it is a big problem.
> 
> So actual netchannels speed is even better? :)

atcp. If you get rid of netchannels, leave only atcp, the speed will
be at least not worse. No doubts.


> tell me, why we should keep (enabled) that redundant functionality?
> Because it can work better in some other places, and that is correct,
> but why it should be enabled then in majority of the cases?

Did not I tell you something like that? :-) Optimize real thing,
even trying to detect the situations when retransmissions are redundant
and eliminate the code.


> Let's draw the line.
...
> That was my opinion on the topic. It looks like neither you, nor me will
> not change our point of view about that right now :)

I agree. :)

Alexey

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-20 22:59             ` Alexey Kuznetsov
@ 2006-07-21  4:55               ` David Miller
  2006-07-21  7:10                 ` Evgeniy Polyakov
  2006-07-21 16:26                 ` Rick Jones
  0 siblings, 2 replies; 60+ messages in thread
From: David Miller @ 2006-07-21  4:55 UTC (permalink / raw)
  To: kuznet; +Cc: johnpol, netdev

From: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Date: Fri, 21 Jul 2006 02:59:08 +0400

> > Moving protocol (no matter if it is TCP or not) closer to user allows
> > naturally control the dataflow - when user can read that data(and _this_
> > is the main goal), user acks, when it can not - it does not generate
> > ack. In theory
> 
> To all that I rememeber, in theory absence of feedback leads
> to loss of control yet. The same is in practice, unfortunately.
> You must say that window is closed, otherwise sender is totally
> confused.

Correct, and too large delay even results in retransmits.  You can say
that RTT will be adjusted by delay of ACK, but if user context
switches cleanly at the beginning, resulting in near immediate ACKs,
and then blocks later you will get spurious retransmits.  Alexey's
example of blocking on a disk write is a good example.  I really don't
like when pure NULL data sinks are used for "benchmarking" these kinds
of things because real applications 1) touch the data, 2) do something
with that data, and 3) have some life outside of TCP!

If you optimize an application that does nothing with the data it
receives, you have likewise optimized nothing :-)

All this talk reminds me of one thing, how expensive tcp_ack() is.
And this expense has nothing to do with TCP really.  The main cost is
purging and freeing up the skbs which have been ACK'd in the
retransmit queue.

So tcp_ack() sort of inherits the cost of freeing a bunch of SKBs
which haven't been touched by the cpu in some time and are thus nearly
guarenteed to be cold in the cache.

This is the kind of work we could think about batching to user
sleeping on some socket call.

Also notice that retransmit queue is potentially a good use of an
array similar VJ netchannel lockless queue data structure. :)

BTW, notice that TSO makes this work touch less skb state.  TSO also
decreases cpu utilization.  I think these two things are no
coincidence. :-)

I have even toyed with the idea of eventually abstracting the
retransmit queue into a pure data representation.  The skb_shinfo()
page vector is very nearly this already.  Or, a less extreme idea
where we have fully retained huge TSO skbs, but we do not chop them up
to create smaller TSO frames.  Instead, we add "offset" GSO attribute
which is used in the clones.

Calls to tso_fragment() would be replaced with pure clones and
adjustment of skb->len and the new "skb->gso_offset" in the clone.
Rest of the logic would remain identical except that non-linear data
would start "skb->gso_offset" bytes into the skb_shinfo() described
area.

In this way we could also set tp->xmit_size_goal to it's maximum
possible value, always.  Actually, I was looking at this the other day
and this clamping of xmit_size_goal to 1/2 max_window is extremely
dubious.  In fact it's downright wrong, only MSS needs this limiting
for sender side SWS avoidance.

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-21  4:55               ` David Miller
@ 2006-07-21  7:10                 ` Evgeniy Polyakov
  2006-07-21  7:47                   ` David Miller
  2006-07-21 16:26                 ` Rick Jones
  1 sibling, 1 reply; 60+ messages in thread
From: Evgeniy Polyakov @ 2006-07-21  7:10 UTC (permalink / raw)
  To: David Miller; +Cc: kuznet, netdev

On Thu, Jul 20, 2006 at 09:55:04PM -0700, David Miller (davem@davemloft.net) wrote:
> From: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
> Date: Fri, 21 Jul 2006 02:59:08 +0400
> 
> > > Moving protocol (no matter if it is TCP or not) closer to user allows
> > > naturally control the dataflow - when user can read that data(and _this_
> > > is the main goal), user acks, when it can not - it does not generate
> > > ack. In theory
> > 
> > To all that I rememeber, in theory absence of feedback leads
> > to loss of control yet. The same is in practice, unfortunately.
> > You must say that window is closed, otherwise sender is totally
> > confused.
> 
> Correct, and too large delay even results in retransmits.  You can say
> that RTT will be adjusted by delay of ACK, but if user context
> switches cleanly at the beginning, resulting in near immediate ACKs,
> and then blocks later you will get spurious retransmits.  Alexey's
> example of blocking on a disk write is a good example.  I really don't
> like when pure NULL data sinks are used for "benchmarking" these kinds
> of things because real applications 1) touch the data, 2) do something
> with that data, and 3) have some life outside of TCP!

And what will happen with sockets?
Data will arive and ack will be generated, until queue is filled and
duplicate ack started to be sent thus reducing window even more.

Results _are_ the same, both will have duplicate acks and so on, but
with netchannels there is no complex queue management, no two or more
rings, where data is procesed (bh, process context and so on), no locks
and ... hugh, I reacll I wrote it already several times :)

My userspace applications do memset, and actually writing data into
/dev/null through the stdout pipe does not change the overall picture.

I read a lot of your critics about benchmarking, so I'm ready :)

> If you optimize an application that does nothing with the data it
> receives, you have likewise optimized nothing :-)

I've run that test - dump all data into file through pipe.

84byte packet bulk receiving: 

netchannels: 8 Mb/sec (down 6 when VFS cache is filled)
socket: 7 Mb/sec (down to 6 when VFS cache is filled)

So you asked to create narrow pipe, and speed becomes equal to the speed
of that pipe. No more, no less.

> All this talk reminds me of one thing, how expensive tcp_ack() is.
> And this expense has nothing to do with TCP really.  The main cost is
> purging and freeing up the skbs which have been ACK'd in the
> retransmit queue.

Yes, allocation always takes first places in all profiles.
I'm working to eliminate that - it is a "side effect" of zero-copy
networking design I'm working on right now.

> So tcp_ack() sort of inherits the cost of freeing a bunch of SKBs
> which haven't been touched by the cpu in some time and are thus nearly
> guarenteed to be cold in the cache.
> 
> This is the kind of work we could think about batching to user
> sleeping on some socket call.
> 
> Also notice that retransmit queue is potentially a good use of an
> array similar VJ netchannel lockless queue data structure. :)

Array has a lot of disadvantages with it's resizing, there will be a lot
of troubles with recv/send queue len changes.
But it allows to remove several pointer from skb, which is always a good
start.

> BTW, notice that TSO makes this work touch less skb state.  TSO also
> decreases cpu utilization.  I think these two things are no
> coincidence. :-)

TSO/GSO is a good idea definitely, but it is completely unrelated to
other problems. If it will be implemented with netchannels we will have
even better perfomance.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-20 21:21             ` Ben Greear
@ 2006-07-21  7:19               ` Evgeniy Polyakov
  2006-07-21  7:20                 ` Evgeniy Polyakov
  2006-07-21 16:14                 ` Ben Greear
  0 siblings, 2 replies; 60+ messages in thread
From: Evgeniy Polyakov @ 2006-07-21  7:19 UTC (permalink / raw)
  To: Ben Greear; +Cc: Alexey Kuznetsov, davem, netdev

On Thu, Jul 20, 2006 at 02:21:57PM -0700, Ben Greear (greearb@candelatech.com) wrote:
> Out of curiosity, is it possible to have the single producer logic
> if you have two+ ethernet interfaces handling frames for a single
> TCP connection?  (I am assuming some sort of multi-path routing
> logic...)

I do not think it is possible with additional logic like what is
implemented in softirqs, i.e. per cpu queues of data, which in turn will
be converted into skbs one-by-one.

> Thanks,
> Ben

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-21  7:19               ` Evgeniy Polyakov
@ 2006-07-21  7:20                 ` Evgeniy Polyakov
  2006-07-21 16:14                 ` Ben Greear
  1 sibling, 0 replies; 60+ messages in thread
From: Evgeniy Polyakov @ 2006-07-21  7:20 UTC (permalink / raw)
  To: Ben Greear; +Cc: Alexey Kuznetsov, davem, netdev

On Fri, Jul 21, 2006 at 11:19:00AM +0400, Evgeniy Polyakov (johnpol@2ka.mipt.ru) wrote:
> On Thu, Jul 20, 2006 at 02:21:57PM -0700, Ben Greear (greearb@candelatech.com) wrote:
> > Out of curiosity, is it possible to have the single producer logic
> > if you have two+ ethernet interfaces handling frames for a single
> > TCP connection?  (I am assuming some sort of multi-path routing
> > logic...)
> 
> I do not think it is possible with additional logic like what is

I think it is posssible ...


-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-20 21:40             ` Ian McDonald
@ 2006-07-21  7:26               ` Evgeniy Polyakov
  0 siblings, 0 replies; 60+ messages in thread
From: Evgeniy Polyakov @ 2006-07-21  7:26 UTC (permalink / raw)
  To: Ian McDonald; +Cc: Alexey Kuznetsov, davem, netdev

On Fri, Jul 21, 2006 at 09:40:32AM +1200, Ian McDonald (ian.mcdonald@jandi.co.nz) wrote:
> >If we consider netchannels as how Van Jackobson discribed them, then
> >mutext is not needed, since it is impossible to have several readers or
> >writers. But in socket case even if there is only one userspace
> >consumer, that lock must be held to protect against bh (or introduce
> >several queues and complicate a lot their's management (ucopy for
> >example)).
> >
> As I recall Van's talk you don't need a lock with a ring buffer if you
> have a start and end variable pointing to location within ring buffer.
> 
> He didn't explain this in great depth as it is computer science 101
> but here is how I would explain it:
> 
> Once socket is initialiased consumer is the only one that sets start
> variable and network driver reads this only. It is the other way
> around for the end variable. As long as the writes are atomic then you
> are fine. You only need one ring buffer in this scenario and two
> atomic variables.
> 
> Having atomic writes does have overhead but far less than locking semantic.

With netchannels and one data producer it should not be even atomic.
Problems start to appear when there are several producers or consumers -
there must be implemented either atomic or locking logic indeed.

> -- 
> Ian McDonald
> Web: http://wand.net.nz/~iam4
> Blog: http://imcdnzl.blogspot.com
> WAND Network Research Group
> Department of Computer Science
> University of Waikato
> New Zealand

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-21  7:10                 ` Evgeniy Polyakov
@ 2006-07-21  7:47                   ` David Miller
  2006-07-21  9:06                     ` Evgeniy Polyakov
  0 siblings, 1 reply; 60+ messages in thread
From: David Miller @ 2006-07-21  7:47 UTC (permalink / raw)
  To: johnpol; +Cc: kuznet, netdev

From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Date: Fri, 21 Jul 2006 11:10:10 +0400

> On Thu, Jul 20, 2006 at 09:55:04PM -0700, David Miller (davem@davemloft.net) wrote:
> > Correct, and too large delay even results in retransmits.  You can say
> > that RTT will be adjusted by delay of ACK, but if user context
> > switches cleanly at the beginning, resulting in near immediate ACKs,
> > and then blocks later you will get spurious retransmits.  Alexey's
> > example of blocking on a disk write is a good example.  I really don't
> > like when pure NULL data sinks are used for "benchmarking" these kinds
> > of things because real applications 1) touch the data, 2) do something
> > with that data, and 3) have some life outside of TCP!
> 
> And what will happen with sockets?
> Data will arive and ack will be generated, until queue is filled and
> duplicate ack started to be sent thus reducing window even more.
> 
> Results _are_ the same, both will have duplicate acks and so on, but
> with netchannels there is no complex queue management, no two or more
> rings, where data is procesed (bh, process context and so on), no locks
> and ... hugh, I reacll I wrote it already several times :)

Packets will be retransmitted spuriously and unnecessarily, and we
cannot over-stress how bad this is.

Sure, your local 1gbit network can absorb this extra cost when
the application is blocked for a long time, but in the real internet
it is a real concern.

Please address the fact that your design makes for retransmits that
are totally unnecessary.  Your TCP stack is flawed if it allows this
to happen.  Proper closing of window and timely ACKs are not some
optional feature of TCP, they are in fact mandatory.

If you want to bypass these things, this is fine, but do not name it
TCP :-)))

As a related example, deeply stretched ACKs can help and are perfect
when there is no packet loss.  But in the event of packet loss a
stretch ACK will kill performance, because it makes packet loss
recovery take at least one extra round trip to occur.

Therefore I disabled stretch ACKs in the input path of TCP last year.

> > If you optimize an application that does nothing with the data it
> > receives, you have likewise optimized nothing :-)
> 
> I've run that test - dump all data into file through pipe.
> 
> 84byte packet bulk receiving: 
> 
> netchannels: 8 Mb/sec (down 6 when VFS cache is filled)
> socket: 7 Mb/sec (down to 6 when VFS cache is filled)
> 
> So you asked to create narrow pipe, and speed becomes equal to the speed
> of that pipe. No more, no less.

If you cause unnecessary retransmits, you add unnecessary congestion
to the network for other flows.

> > All this talk reminds me of one thing, how expensive tcp_ack() is.
> > And this expense has nothing to do with TCP really.  The main cost is
> > purging and freeing up the skbs which have been ACK'd in the
> > retransmit queue.
> 
> Yes, allocation always takes first places in all profiles.
> I'm working to eliminate that - it is a "side effect" of zero-copy
> networking design I'm working on right now.

When you say these things over and over again, people like Alexey
and myself perceive it as "La la la la, I'm not listening to you
guys"

Our point is not that your work cannot lead you to fixing these
problems.  Our point is that existing TCP stack can have these
problems fixed too!  With advantage that we don't need all the
negative aspects of moving TCP into userspace.

You can eliminate allocation overhead in our existing stack, with
the simple design I outlined.  In fact, I outlined two approaches,
there is such an abundance of ways to do it that you have a choice
of which one you like the best :)

> Array has a lot of disadvantages with it's resizing, there will be a lot
> of troubles with recv/send queue len changes.
> But it allows to remove several pointer from skb, which is always a good
> start.

Yes it is something to consider.  Large pipes with 4000+ packet
windows present considerable problems in this area.

> TSO/GSO is a good idea definitely, but it is completely unrelated to
> other problems. If it will be implemented with netchannels we will have
> even better perfomance.

I like TSO-like ideas because it points to solutions within existing
stack.

Radical changes are great, when they buy us something that is
"impossible" with current design.  A lot of things being shown and
discussed here are indeed possible with current design.

You have a nice toy and you should be proud of it, but do not make
it into panacea.

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-21  7:47                   ` David Miller
@ 2006-07-21  9:06                     ` Evgeniy Polyakov
  2006-07-21  9:19                       ` David Miller
  0 siblings, 1 reply; 60+ messages in thread
From: Evgeniy Polyakov @ 2006-07-21  9:06 UTC (permalink / raw)
  To: David Miller; +Cc: kuznet, netdev

On Fri, Jul 21, 2006 at 12:47:13AM -0700, David Miller (davem@davemloft.net) wrote:
> > > Correct, and too large delay even results in retransmits.  You can say
> > > that RTT will be adjusted by delay of ACK, but if user context
> > > switches cleanly at the beginning, resulting in near immediate ACKs,
> > > and then blocks later you will get spurious retransmits.  Alexey's
> > > example of blocking on a disk write is a good example.  I really don't
> > > like when pure NULL data sinks are used for "benchmarking" these kinds
> > > of things because real applications 1) touch the data, 2) do something
> > > with that data, and 3) have some life outside of TCP!
> > 
> > And what will happen with sockets?
> > Data will arive and ack will be generated, until queue is filled and
> > duplicate ack started to be sent thus reducing window even more.
> > 
> > Results _are_ the same, both will have duplicate acks and so on, but
> > with netchannels there is no complex queue management, no two or more
> > rings, where data is procesed (bh, process context and so on), no locks
> > and ... hugh, I reacll I wrote it already several times :)
> 
> Packets will be retransmitted spuriously and unnecessarily, and we
> cannot over-stress how bad this is.

In theory practice and theory are the same, but in practice they are
different (c) Larry McVoy as far as I recall :)
And even in theory Linux behaves the same.

I see the only point about process context tcp processing is following
issue:
we started tcp connection, and acks are generated very fast, then
suddenly receiving userspace is blocked.
In that case BH processing apologists state that sending side starts to
retransmit.
Let's see how it works.
If receiving side works for a long with maximum speed, then window is
opened enough, so it can even exceed socket buffer size (max 200k, I saw
several megs socket windows in my tests), so sending side will continue
to send until window is filled. 
Receiving side, nor matter if it is socket or netchannel, will drop
packets (socket due to queue overfull, netchannels will not drop, but
will not ack (it's maximum queue len is 1mb)).

So both approaches behave _exactly_ the same.
Did I miss something?

Btw, here are tests which were ran with netchannels:
 * surfing the web (index pages of different remote sites only)
 * 1gb transfers
 * 1gb <-> 100mb transfers

> Sure, your local 1gbit network can absorb this extra cost when
> the application is blocked for a long time, but in the real internet
> it is a real concern.

Writing into the pipe (or into 100mb NIC) and file is a real internet 
example - data is blocked, acks and retransmits happen.

> Please address the fact that your design makes for retransmits that
> are totally unnecessary.  Your TCP stack is flawed if it allows this
> to happen.  Proper closing of window and timely ACKs are not some
> optional feature of TCP, they are in fact mandatory.
> 
> If you want to bypass these things, this is fine, but do not name it
> TCP :-)))

Hey, you did not look into atcp.c in my patches :)

> As a related example, deeply stretched ACKs can help and are perfect
> when there is no packet loss.  But in the event of packet loss a
> stretch ACK will kill performance, because it makes packet loss
> recovery take at least one extra round trip to occur.
> 
> Therefore I disabled stretch ACKs in the input path of TCP last year.

For slow start it is definitely a must.
If stretching alog is based on timers and round trip time, then I do not
have that in atcp, but proper delaying based on sequence is used instead.

> > > If you optimize an application that does nothing with the data it
> > > receives, you have likewise optimized nothing :-)
> > 
> > I've run that test - dump all data into file through pipe.
> > 
> > 84byte packet bulk receiving: 
> > 
> > netchannels: 8 Mb/sec (down 6 when VFS cache is filled)
> > socket: 7 Mb/sec (down to 6 when VFS cache is filled)
> > 
> > So you asked to create narrow pipe, and speed becomes equal to the speed
> > of that pipe. No more, no less.
> 
> If you cause unnecessary retransmits, you add unnecessary congestion
> to the network for other flows.

Please refer to my description above.
Situation is perfectly the same as with socket code or with netchannels.

> > > All this talk reminds me of one thing, how expensive tcp_ack() is.
> > > And this expense has nothing to do with TCP really.  The main cost is
> > > purging and freeing up the skbs which have been ACK'd in the
> > > retransmit queue.
> > 
> > Yes, allocation always takes first places in all profiles.
> > I'm working to eliminate that - it is a "side effect" of zero-copy
> > networking design I'm working on right now.
> 
> When you say these things over and over again, people like Alexey
> and myself perceive it as "La la la la, I'm not listening to you
> guys"

Hmm, I've confirmed that allocation is a problem no matter which stack
is used. My problem fix has nothing special to netchannels at all.

> Our point is not that your work cannot lead you to fixing these
> problems.  Our point is that existing TCP stack can have these
> problems fixed too!  With advantage that we don't need all the
> negative aspects of moving TCP into userspace.
> 
> You can eliminate allocation overhead in our existing stack, with
> the simple design I outlined.  In fact, I outlined two approaches,
> there is such an abundance of ways to do it that you have a choice
> of which one you like the best :)

TCP stack has nothing to the allocation problem, and I work on
eliminating that problem regardless high-level interface.
Stack should net be fixed, if allocation takes too long.

> > Array has a lot of disadvantages with it's resizing, there will be a lot
> > of troubles with recv/send queue len changes.
> > But it allows to remove several pointer from skb, which is always a good
> > start.
> 
> Yes it is something to consider.  Large pipes with 4000+ packet
> windows present considerable problems in this area.
> 
> > TSO/GSO is a good idea definitely, but it is completely unrelated to
> > other problems. If it will be implemented with netchannels we will have
> > even better perfomance.
> 
> I like TSO-like ideas because it points to solutions within existing
> stack.
> 
> Radical changes are great, when they buy us something that is
> "impossible" with current design.  A lot of things being shown and
> discussed here are indeed possible with current design.
>
> You have a nice toy and you should be proud of it, but do not make
> it into panacea.

I do not force anyone to use netchannels - yes, one can consider it as a
toy. That toy has a lot inside and that toy proved that it is correct 
(with existing stack too).

No need to say a panacea, since there is no sickness.
Socket code has it's own design and it fits it's needs perfectly.
If we want to move further, something must be changed, since all of
_addons_ to the exisintg design do not and can not _change_ it's nature
(I do not say, that it is a problem, but nature of the existing network
stack design), and that addons (like TSO, GSO and any other) helps to
any stack, but stack itself is not a problem. 

I do not want to say, that existing tcp has bugs and must be replaced
with my implementation, or that socket code has bugs, and must be
replaced with netchannels. 

When moving outside existing design it is possible to have all those
advantages _and_ additional gains from removing several levels of
processing, simplification of the low-level data (queues and locks),
allocation changes (no more atomic allocations) and so on.

That's it.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-21  9:06                     ` Evgeniy Polyakov
@ 2006-07-21  9:19                       ` David Miller
  2006-07-21  9:39                         ` Evgeniy Polyakov
  0 siblings, 1 reply; 60+ messages in thread
From: David Miller @ 2006-07-21  9:19 UTC (permalink / raw)
  To: johnpol; +Cc: kuznet, netdev

From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Date: Fri, 21 Jul 2006 13:06:11 +0400

> Receiving side, nor matter if it is socket or netchannel, will drop
> packets (socket due to queue overfull, netchannels will not drop, but
> will not ack (it's maximum queue len is 1mb)).
> 
> So both approaches behave _exactly_ the same.
> Did I miss something?

Socket will not drop the packets on receive because sender will not
violate the window which receiver advertises, therefore there is no
reason to drop the packets.

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-21  9:19                       ` David Miller
@ 2006-07-21  9:39                         ` Evgeniy Polyakov
  2006-07-21  9:46                           ` David Miller
  0 siblings, 1 reply; 60+ messages in thread
From: Evgeniy Polyakov @ 2006-07-21  9:39 UTC (permalink / raw)
  To: David Miller; +Cc: kuznet, netdev

On Fri, Jul 21, 2006 at 02:19:55AM -0700, David Miller (davem@davemloft.net) wrote:
> From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
> Date: Fri, 21 Jul 2006 13:06:11 +0400
> 
> > Receiving side, nor matter if it is socket or netchannel, will drop
> > packets (socket due to queue overfull, netchannels will not drop, but
> > will not ack (it's maximum queue len is 1mb)).
> > 
> > So both approaches behave _exactly_ the same.
> > Did I miss something?
> 
> Socket will not drop the packets on receive because sender will not
> violate the window which receiver advertises, therefore there is no
> reason to drop the packets.

How come?
sk_stream_rmem_schedule(), sk_rmem_alloc and friends...

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-21  9:39                         ` Evgeniy Polyakov
@ 2006-07-21  9:46                           ` David Miller
  2006-07-21  9:55                             ` Evgeniy Polyakov
  0 siblings, 1 reply; 60+ messages in thread
From: David Miller @ 2006-07-21  9:46 UTC (permalink / raw)
  To: johnpol; +Cc: kuznet, netdev

From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Date: Fri, 21 Jul 2006 13:39:09 +0400

> On Fri, Jul 21, 2006 at 02:19:55AM -0700, David Miller (davem@davemloft.net) wrote:
> > From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
> > Date: Fri, 21 Jul 2006 13:06:11 +0400
> > 
> > > Receiving side, nor matter if it is socket or netchannel, will drop
> > > packets (socket due to queue overfull, netchannels will not drop, but
> > > will not ack (it's maximum queue len is 1mb)).
> > > 
> > > So both approaches behave _exactly_ the same.
> > > Did I miss something?
> > 
> > Socket will not drop the packets on receive because sender will not
> > violate the window which receiver advertises, therefore there is no
> > reason to drop the packets.
> 
> How come?
> sk_stream_rmem_schedule(), sk_rmem_alloc and friends...

sk_stream_rmem_schedule() allocates bytes from the global memory pool
quota for TCP sockets.  It is not something will trigger when, for
example, application blocks on a disk write.

In fact it will rarely trigger once size of window is known, since
sk_forward_alloc will grow to fill that size, then statically stay
at the value being able to service all allocation requests in the
future.

Only when there is severe global TCP memory pressure will it be
decreased.

And again this isn't something which happens when a user simply
blocks on some non-TCP operation.

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-21  9:46                           ` David Miller
@ 2006-07-21  9:55                             ` Evgeniy Polyakov
  0 siblings, 0 replies; 60+ messages in thread
From: Evgeniy Polyakov @ 2006-07-21  9:55 UTC (permalink / raw)
  To: David Miller; +Cc: kuznet, netdev

On Fri, Jul 21, 2006 at 02:46:23AM -0700, David Miller (davem@davemloft.net) wrote:
> > sk_stream_rmem_schedule(), sk_rmem_alloc and friends...
> 
> sk_stream_rmem_schedule() allocates bytes from the global memory pool
> quota for TCP sockets.  It is not something will trigger when, for
> example, application blocks on a disk write.
> 
> In fact it will rarely trigger once size of window is known, since
> sk_forward_alloc will grow to fill that size, then statically stay
> at the value being able to service all allocation requests in the
> future.
> 
> Only when there is severe global TCP memory pressure will it be
> decreased.
> 
> And again this isn't something which happens when a user simply
> blocks on some non-TCP operation.

Of course it is not, but something that breaks header prediction will
fall into memory check and so on.
Blocking on write will not trigger memory limits overcommit from the
first packet, as long as it will not trigger timeout retransmit if no
acks are sent. If there will be a lot of them, then troubles start.
We saw already, that speed decreased to the write speed in both 
implementations without connection collapsing.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-21  7:19               ` Evgeniy Polyakov
  2006-07-21  7:20                 ` Evgeniy Polyakov
@ 2006-07-21 16:14                 ` Ben Greear
  2006-07-21 16:27                   ` Evgeniy Polyakov
  2006-07-22 13:23                   ` Caitlin Bestler
  1 sibling, 2 replies; 60+ messages in thread
From: Ben Greear @ 2006-07-21 16:14 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: Alexey Kuznetsov, davem, netdev

Evgeniy Polyakov wrote:
> On Thu, Jul 20, 2006 at 02:21:57PM -0700, Ben Greear (greearb@candelatech.com) wrote:
> 
>>Out of curiosity, is it possible to have the single producer logic
>>if you have two+ ethernet interfaces handling frames for a single
>>TCP connection?  (I am assuming some sort of multi-path routing
>>logic...)
> 
> 
> I do not think it is possible with additional logic like what is
> implemented in softirqs, i.e. per cpu queues of data, which in turn will
> be converted into skbs one-by-one.

Couldn't you have two NICs being handled by two separate CPUs, with both
CPUs trying to write to the same socket queue?

The receive path works with RCU locking from what I understand, so
a protocol's receive function must be re-entrant.

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-21  4:55               ` David Miller
  2006-07-21  7:10                 ` Evgeniy Polyakov
@ 2006-07-21 16:26                 ` Rick Jones
  2006-07-21 20:57                   ` David Miller
  1 sibling, 1 reply; 60+ messages in thread
From: Rick Jones @ 2006-07-21 16:26 UTC (permalink / raw)
  To: David Miller; +Cc: kuznet, johnpol, netdev

> All this talk reminds me of one thing, how expensive tcp_ack() is.
> And this expense has nothing to do with TCP really.  The main cost is
> purging and freeing up the skbs which have been ACK'd in the
> retransmit queue.
> 
> So tcp_ack() sort of inherits the cost of freeing a bunch of SKBs
> which haven't been touched by the cpu in some time and are thus nearly
> guarenteed to be cold in the cache.
> 
> This is the kind of work we could think about batching to user
> sleeping on some socket call.

Ultimately isn't that just trying to squeeze the balloon?

rick jones

nice to see people seeing ACKs as expensive though :)

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-21 16:14                 ` Ben Greear
@ 2006-07-21 16:27                   ` Evgeniy Polyakov
  2006-07-22 13:23                   ` Caitlin Bestler
  1 sibling, 0 replies; 60+ messages in thread
From: Evgeniy Polyakov @ 2006-07-21 16:27 UTC (permalink / raw)
  To: Ben Greear; +Cc: Alexey Kuznetsov, davem, netdev

On Fri, Jul 21, 2006 at 09:14:39AM -0700, Ben Greear (greearb@candelatech.com) wrote:
> >>Out of curiosity, is it possible to have the single producer logic
> >>if you have two+ ethernet interfaces handling frames for a single
> >>TCP connection?  (I am assuming some sort of multi-path routing
> >>logic...)
> >
> >I do not think it is possible with additional logic like what is
> >implemented in softirqs, i.e. per cpu queues of data, which in turn will
> >be converted into skbs one-by-one.
> 
> Couldn't you have two NICs being handled by two separate CPUs, with both
> CPUs trying to write to the same socket queue?
> 
> The receive path works with RCU locking from what I understand, so
> a protocol's receive function must be re-entrant.

There will not be socket queue on that stage - only per-cpu queues,
which then will be processed one-by-one by _exactly_ single user.
That user can get skb in round-robin manner and put them into socket
queue and call protocol receiving function.

> -- 
> Ben Greear <greearb@candelatech.com>
> Candela Technologies Inc  http://www.candelatech.com

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-21 16:26                 ` Rick Jones
@ 2006-07-21 20:57                   ` David Miller
  0 siblings, 0 replies; 60+ messages in thread
From: David Miller @ 2006-07-21 20:57 UTC (permalink / raw)
  To: rick.jones2; +Cc: kuznet, johnpol, netdev

From: Rick Jones <rick.jones2@hp.com>
Date: Fri, 21 Jul 2006 09:26:42 -0700

> > All this talk reminds me of one thing, how expensive tcp_ack() is.
> > And this expense has nothing to do with TCP really.  The main cost is
> > purging and freeing up the skbs which have been ACK'd in the
> > retransmit queue.
> > 
> > So tcp_ack() sort of inherits the cost of freeing a bunch of SKBs
> > which haven't been touched by the cpu in some time and are thus nearly
> > guarenteed to be cold in the cache.
> > 
> > This is the kind of work we could think about batching to user
> > sleeping on some socket call.
> 
> Ultimately isn't that just trying to squeeze the balloon?

In this case, the goal is not to eliminate the cost, but to
move it to user context so that it:

1) gets charged to the user instead of being lost in the ether of
   anonymous software interrupt execution, and more importantly...

2) it gets moved to the cpu where the user socket
   code is executing instead of the cpu where the ACK packet arrives
   which is basically arbitrary

#2 is in-line with the system level end-to-end principle goals of
netchannels.


^ permalink raw reply	[flat|nested] 60+ messages in thread

* RE: Netchannles: first stage has been completed. Further ideas.
  2006-07-21 16:14                 ` Ben Greear
  2006-07-21 16:27                   ` Evgeniy Polyakov
@ 2006-07-22 13:23                   ` Caitlin Bestler
  1 sibling, 0 replies; 60+ messages in thread
From: Caitlin Bestler @ 2006-07-22 13:23 UTC (permalink / raw)
  To: Ben Greear, Evgeniy Polyakov; +Cc: Alexey Kuznetsov, davem, netdev

netdev-owner@vger.kernel.org wrote:
> Evgeniy Polyakov wrote:
>> On Thu, Jul 20, 2006 at 02:21:57PM -0700, Ben Greear
> (greearb@candelatech.com) wrote:
>> 
>>> Out of curiosity, is it possible to have the single producer logic
>>> if you have two+ ethernet interfaces handling frames for a single
>>> TCP connection?  (I am assuming some sort of multi-path routing
>>> logic...)
>> 
>> 
>> I do not think it is possible with additional logic like what is
>> implemented in softirqs, i.e. per cpu queues of data, which in turn
>> will be converted into skbs one-by-one.
> 
> Couldn't you have two NICs being handled by two separate
> CPUs, with both CPUs trying to write to the same socket queue?
> 
> The receive path works with RCU locking from what I
> understand, so a protocol's receive function must be re-entrant.

Wouldn't it be easier simply not have two NICs feed the
same ring? What packets end up in which ring is fully
controllable. On the rare occasion that a single connection
must be fed by two NICs a software merge of the two rings
would be far cheaper than having to co-ordinate between
producers all the time.


^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-19 20:01     ` David Miller
  2006-07-19 20:16       ` Stephen Hemminger
@ 2006-07-24 18:54       ` Stephen Hemminger
  2006-07-24 20:52         ` Alexey Kuznetsov
  1 sibling, 1 reply; 60+ messages in thread
From: Stephen Hemminger @ 2006-07-24 18:54 UTC (permalink / raw)
  To: David Miller; +Cc: kuznet, johnpol, netdev

On Wed, 19 Jul 2006 13:01:50 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> From: Stephen Hemminger <shemminger@osdl.org>
> Date: Wed, 19 Jul 2006 15:52:04 -0400
> 
> > As a related note, I am looking into fixing inet hash tables to use RCU.
> 
> IBM had posted a patch a long time ago, which would be not
> so hard to munge into the current tree.  See if you can
> spot it in the archives :)

Srivatsa Vaddagiri from IBM did  patch: http://lkml.org/lkml/2004/8/31/129

And Ben had a patch: http://lwn.net/Articles/174596/

Srivata's was more complete but pre-dates Acme's rearrangement.
Also, there is some code for refcnt's in it that looks wrong.
Or at minimum is masking underlying design flaws.

/* Ungrab socket and destroy it, if it was the last reference. */
 static inline void sock_put(struct sock *sk)
 {
-	if (atomic_dec_and_test(&sk->sk_refcnt))
-		sk_free(sk);
+sp_loop:
+	if (atomic_dec_and_test(&sk->sk_refcnt)) {
+		/* Restore ref count and schedule callback.
+		 * If we don't restore ref count, then the callback can be
+		 * scheduled by more than one CPU.
+		 */
+		atomic_inc(&sk->sk_refcnt);
+
+		if (atomic_read(&sk->sk_refcnt) == 1)
+			call_rcu(&sk->sk_rcu, sk_free_rcu);
+		else
+			goto sp_loop;
+	}
 }

Ben's still left reader writer locks, and needed IPV6 work. He said he
plans to get back to it.


-- 
Stephen Hemminger <shemminger@osdl.org>
"And in the Packet there writ down that doome"

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-24 18:54       ` Stephen Hemminger
@ 2006-07-24 20:52         ` Alexey Kuznetsov
  0 siblings, 0 replies; 60+ messages in thread
From: Alexey Kuznetsov @ 2006-07-24 20:52 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, johnpol, netdev

Hello!

> Also, there is some code for refcnt's in it that looks wrong.

Yes, it is disgusting. rcu does not allow to increase socket refcnt
in lookup routine.

Ben's version looks cleaner here, it does not touch refcnt
in rcu lookups. But it is dubious too:

 do_time_wait:
+       sock_hold(sk);

is obviously in violation of the rule. Probably, rcu lookup should do something
like:

if (!atomic_inc_not_zero(&sk->sk_refcnt))
	pretend_it_is_not_found; 

It is clear Ben did not look into IBM patch, because one known place
of trouble is missed: when socket moves from established to timewait,
timewait bucket must be inserted before established socket is removed.

Alexey

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-18 23:01 ` Alexey Kuznetsov
                     ` (2 preceding siblings ...)
  2006-07-19 19:52   ` Stephen Hemminger
@ 2006-07-27  2:17   ` Rusty Russell
  2006-07-27  5:17     ` David Miller
  3 siblings, 1 reply; 60+ messages in thread
From: Rusty Russell @ 2006-07-27  2:17 UTC (permalink / raw)
  To: Alexey Kuznetsov; +Cc: Evgeniy Polyakov, netdev, David Miller

On Wed, 2006-07-19 at 03:01 +0400, Alexey Kuznetsov wrote:
> Hello!
> 
> Can I ask couple of questions? Just as a person who looked at VJ's
> slides once and was confused. And startled, when found that it is not
> considered as another joke of genuis. :-)

Hi Alexey!

> About locks:
> 
> > 	  is completely lockless (there is one irq lock when skb 
> > is queued/dequeued into netchannels queue in hard/soft irq, 
> 
> Equivalent of socket spinlock.

I don't think they are equivalent.  In channels, this can be split into
two locks, queue lock and an dequeue lock, which operate independently.
The socket spinlock cannot.  Moreover, in the case where there is a
guarantee about IRQs being bound to a single CPU (as Dave's ideas on
MSI), the queue lock is no longer required.  In the case where there is
a single reader of the socket (or, as VJ did, the other end is in
userspace), no dequeue lock is required.

> VJ slides describe a totally different scheme, where softirq part is omitted
> completely, protocol processing is moved to user space as whole.
> It is an amazing toy. But I see nothing, which could promote its status
> to practical. Exokernels used to do this thing for ages, and all the
> performance gains are compensated by overcomplicated classification
> engine, which has to remain in kernel and essentially to do the same
> work which routing/firewalling/socket hash tables do.

My feeling is that modern cards will do partial demux for us; whether we
use netchannels or not, we should use that to accelerate lookup.  Making
card aim MSI at same CPU for same flow is a start (and as Dave said,
much less code).  As the next step, having the card give us a cookie
too, would allow us to explicitly skip first level of lookup.  This
should allow us to identify which flows are simple enough to be directly
accelerated (whether by channels or something else): no bonding, raw
sockets, non-trivial netfilter rules, connection tracking changes, etc.

Thoughts?
Rusty.
-- 
Help! Save Australia from the worst of the DMCA: http://linux.org.au/law

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-27  2:17   ` Rusty Russell
@ 2006-07-27  5:17     ` David Miller
  2006-07-27  5:46       ` Rusty Russell
  0 siblings, 1 reply; 60+ messages in thread
From: David Miller @ 2006-07-27  5:17 UTC (permalink / raw)
  To: rusty; +Cc: kuznet, johnpol, netdev

From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 27 Jul 2006 12:17:51 +1000

> On Wed, 2006-07-19 at 03:01 +0400, Alexey Kuznetsov wrote:
> > About locks:
> > 
> > > 	  is completely lockless (there is one irq lock when skb 
> > > is queued/dequeued into netchannels queue in hard/soft irq, 
> > 
> > Equivalent of socket spinlock.
> 
> I don't think they are equivalent.  In channels, this can be split into
> two locks, queue lock and an dequeue lock, which operate independently.
> The socket spinlock cannot.  Moreover, in the case where there is a
> guarantee about IRQs being bound to a single CPU (as Dave's ideas on
> MSI), the queue lock is no longer required.  In the case where there is
> a single reader of the socket (or, as VJ did, the other end is in
> userspace), no dequeue lock is required.

Cost is a very interesting question here.  I guess your main point
is that eventually this lock can be made to go away, whereas
Alexey speaks about the state of Evgivny's specific implementation.

> My feeling is that modern cards will do partial demux for us; whether we
> use netchannels or not, we should use that to accelerate lookup.  Making
> card aim MSI at same CPU for same flow is a start (and as Dave said,
> much less code).  As the next step, having the card give us a cookie
> too, would allow us to explicitly skip first level of lookup.  This
> should allow us to identify which flows are simple enough to be directly
> accelerated (whether by channels or something else): no bonding, raw
> sockets, non-trivial netfilter rules, connection tracking changes, etc.

I read this as "we will be able to get around the problems" but
no specific answer as to "how".  I am an optimist too but I want
to start seeing concrete discussion about the way in which the
problems will be dealt with.

Alexey has some ideas, such as running the netfilter path from the
netchannel consumer socket context.  That is the kind of thing
we need to be talking about.

Robert Olsson is also doing some work involving full flow
classifications using special trie structures in the routing cache
that might be extendable to netchannels.  His trick is to watch for
the FIN shutdown sequence and GC route cache entries for a flow when
this is seen.  This is in order to keep the trie shallow and thus have
a better bound on memory accesses for routing lookups.

We are not a group of mathematicians discussing the tractability of
some problem.  Our interest is practice not theory. :)

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-27  5:17     ` David Miller
@ 2006-07-27  5:46       ` Rusty Russell
  2006-07-27  6:00         ` David Miller
  2006-07-27 16:33         ` Alexey Kuznetsov
  0 siblings, 2 replies; 60+ messages in thread
From: Rusty Russell @ 2006-07-27  5:46 UTC (permalink / raw)
  To: David Miller; +Cc: kuznet, johnpol, netdev

On Wed, 2006-07-26 at 22:17 -0700, David Miller wrote:
> I read this as "we will be able to get around the problems" but
> no specific answer as to "how".  I am an optimist too but I want
> to start seeing concrete discussion about the way in which the
> problems will be dealt with.
> 
> Alexey has some ideas, such as running the netfilter path from the
> netchannel consumer socket context.  That is the kind of thing
> we need to be talking about.

Yes, my first thought back in January was how netfilter would interact
with this in a sane way.  One answer is "don't": once someone registers
on any hook we go into slow path.  Another is to run the hooks in socket
context, which is better, but precludes having the consumer in
userspace, which still appeals to me 8)

So I don't like either.  The mistake (?) with netfilter was that we are
completely general: you will see all packets, do what you want.  If,
instead, we had forced all rules to be of form "show me all packets
matching this tuple" we would be in a combine it in a single lookup with
routing etc.

What would the tuple look like?  Off the top of my head:
SRCIP/DSTIP/PROTO/SPT/DPT/IN/OUT (where IN and OUT are boolean values
indicating whether the src/dest is local).

Of course, it means rewriting all the userspace tools, documentation,
and creating a complete new infrastructure for connection tracking and
NAT, but if that's what's required, then so be it.

Rusty.
-- 
Help! Save Australia from the worst of the DMCA: http://linux.org.au/law

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-27  5:46       ` Rusty Russell
@ 2006-07-27  6:00         ` David Miller
  2006-07-27 18:54           ` Stephen Hemminger
  2006-07-28  5:54           ` Rusty Russell
  2006-07-27 16:33         ` Alexey Kuznetsov
  1 sibling, 2 replies; 60+ messages in thread
From: David Miller @ 2006-07-27  6:00 UTC (permalink / raw)
  To: rusty; +Cc: kuznet, johnpol, netdev

From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 27 Jul 2006 15:46:12 +1000

> Yes, my first thought back in January was how netfilter would interact
> with this in a sane way.  One answer is "don't": once someone registers
> on any hook we go into slow path.  Another is to run the hooks in socket
> context, which is better, but precludes having the consumer in
> userspace, which still appeals to me 8)

Small steps, small steps.  I have not ruled out userspace TCP just
yet, but we are not prepared to go there right now anyways.  It is
just the same kind of jump to go to kernel level netchannels as it is
to go from kernel level netchannels to userspace netchannel based TCP.

> What would the tuple look like?  Off the top of my head:
> SRCIP/DSTIP/PROTO/SPT/DPT/IN/OUT (where IN and OUT are boolean values
> indicating whether the src/dest is local).
> 
> Of course, it means rewriting all the userspace tools, documentation,
> and creating a complete new infrastructure for connection tracking and
> NAT, but if that's what's required, then so be it.

I think we are able to finally talk seriously about revamping
netfilter on this level because we finally have a good incentive to do
so and some kind of model exists to work against.  Robert's trie might
be able to handle your tuple very well, fwiw, perhaps even with
prefixing.

But something occurs to me.  Socket has ID when it is created and
goes to established state.  This means we have this tuple, and thus
we can prelookup the netfilter rule and attach this cached lookup
state on the socket.  Your tuple in this case is defined to be:

	SRCIP/DSTIP/"TCP"/SPT/DPT/0/1

I do not know how practical this is, it is just some suggestion.

Would there be prefixing in these tuples?  That's where the trouble
starts.  If you add prefixing, troubles and limitations of lookup of
today reappear.  If you disallow prefixing, tables get very large
but lookup becomes simpler and practical.

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-27  5:46       ` Rusty Russell
  2006-07-27  6:00         ` David Miller
@ 2006-07-27 16:33         ` Alexey Kuznetsov
  2006-07-27 16:51           ` Evgeniy Polyakov
  2006-07-28  4:49           ` Rusty Russell
  1 sibling, 2 replies; 60+ messages in thread
From: Alexey Kuznetsov @ 2006-07-27 16:33 UTC (permalink / raw)
  To: Rusty Russell; +Cc: David Miller, johnpol, netdev

Hello!

On Thu, Jul 27, 2006 at 03:46:12PM +1000, Rusty Russell wrote:
> Of course, it means rewriting all the userspace tools, documentation,
> and creating a complete new infrastructure for connection tracking and
> NAT, but if that's what's required, then so be it.

That's what I love to hear. Not a joke. :-)

Could I only suggest not to relate this to netchannels? :-)
In the past we used to call this thing (grand-unified) "flow cache".

> I don't think they are equivalent.  In channels,

I understand this. Actually, it was what I said in the next paragraph,
which you even cited.

I really do not like to repeat myself, it is nothing but idle talk,
but if the questions are questioned...

First, it was stated that suggested implementation performs better and even
much better. I am asking why do we see such improvement?
I am absolutely not satisifed with statement "It is better. Period."
>From all that I see, this particular implementation does not implement
optimizations suggested by VJ, it implements only the things,
which are not supposed to affect performance or to affect it negatively.

Idle talk? I am sure that if that improvement happened not due
to a severe protocol violation we can easily fix existing stack.

> userspace), no dequeue lock is required.

And that was a part of the second question.

I do not see, how single threaded TCP is possible. In receiver path
it has to ack with quite strict time bounds, to delack etc., in sender path
it has to slow start, I am even not saying about "slow path" things:
retransmit, probing window, lingering without process context etc.
It looks like, VJ implies the protocol must be changed. We can't, we mustn't.

After we deidealize this idealization and recognize that some "slow path"
should exist and some part of this "slow path" has to be executed
with higher priority than the "fast" one, where do we arrive?
Is not it exactly what we have right now? Clean fast path, separate slow path.
Not good enough? Where? Let's find and fix this.

Alexey

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-27 16:33         ` Alexey Kuznetsov
@ 2006-07-27 16:51           ` Evgeniy Polyakov
  2006-07-27 20:56             ` Alexey Kuznetsov
  2006-07-28  4:49           ` Rusty Russell
  1 sibling, 1 reply; 60+ messages in thread
From: Evgeniy Polyakov @ 2006-07-27 16:51 UTC (permalink / raw)
  To: Alexey Kuznetsov; +Cc: Rusty Russell, David Miller, netdev

Hello, Alexey.

On Thu, Jul 27, 2006 at 08:33:35PM +0400, Alexey Kuznetsov (kuznet@ms2.inr.ac.ru) wrote:
> First, it was stated that suggested implementation performs better and even
> much better. I am asking why do we see such improvement?
> I am absolutely not satisifed with statement "It is better. Period."
> From all that I see, this particular implementation does not implement
> optimizations suggested by VJ, it implements only the things,
> which are not supposed to affect performance or to affect it negatively.

Just for clarifications: I showed that even using _existing_ stack
(using sk_backlog_rcv) performance in process context can exceed two
level processing. And after creating own TCP implemetation 
(which does not include two-level related overhead among other things)
performance different was even higher. I can agree that it is possible
that in second case part of the gain is obtained from the new TCP
implementation, but not 100% from process' context, but in first place 
existing socket code was used.

> > userspace), no dequeue lock is required.
> 
> And that was a part of the second question.
> 
> I do not see, how single threaded TCP is possible. In receiver path
> it has to ack with quite strict time bounds, to delack etc., in sender path
> it has to slow start, I am even not saying about "slow path" things:
> retransmit, probing window, lingering without process context etc.
> It looks like, VJ implies the protocol must be changed. We can't, we mustn't.
> 
> After we deidealize this idealization and recognize that some "slow path"
> should exist and some part of this "slow path" has to be executed
> with higher priority than the "fast" one, where do we arrive?
> Is not it exactly what we have right now? Clean fast path, separate slow path.
> Not good enough? Where? Let's find and fix this.

Slow path does exist, retransmits and friends are there too in new stack.
And my initial netchannel implementation used  _existing_ socket code
from process context. Again, there is no need to crate two levels
between fast and slow or softirq and process, and it was proven and
shown that it can perform faster.

Why don't you want to see, that existing model is just path enlargement:
there might also exist delayes between hard and soft irqs, so acks will
be delayed and so on... But stack works without problems even if some
kernel thread takes 100% cpu (with preemption), and there are very big
delays for ack generations, but userspace is not possible to get that
data. With netchannels it is essentially the same (heh, I said that
already a lot of times).

> Alexey

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-27  6:00         ` David Miller
@ 2006-07-27 18:54           ` Stephen Hemminger
  2006-07-28  8:21             ` David Miller
  2006-07-28  5:54           ` Rusty Russell
  1 sibling, 1 reply; 60+ messages in thread
From: Stephen Hemminger @ 2006-07-27 18:54 UTC (permalink / raw)
  To: David Miller; +Cc: rusty, kuznet, johnpol, netdev

On Wed, 26 Jul 2006 23:00:28 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> From: Rusty Russell <rusty@rustcorp.com.au>
> Date: Thu, 27 Jul 2006 15:46:12 +1000
> 
> > Yes, my first thought back in January was how netfilter would interact
> > with this in a sane way.  One answer is "don't": once someone registers
> > on any hook we go into slow path.  Another is to run the hooks in socket
> > context, which is better, but precludes having the consumer in
> > userspace, which still appeals to me 8)
> 
> Small steps, small steps.  I have not ruled out userspace TCP just
> yet, but we are not prepared to go there right now anyways.  It is
> just the same kind of jump to go to kernel level netchannels as it is
> to go from kernel level netchannels to userspace netchannel based TCP.

I think we sell our existing stack short. There are lots of opportunities left
to look more closely at actual real performance bottlenecks and improve
incrementally. But it requires, tools, time, faster net hardware, and some
creative insight. I guess it just isn't as cool.

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-27 16:51           ` Evgeniy Polyakov
@ 2006-07-27 20:56             ` Alexey Kuznetsov
  2006-07-28  5:17               ` Evgeniy Polyakov
  0 siblings, 1 reply; 60+ messages in thread
From: Alexey Kuznetsov @ 2006-07-27 20:56 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: Rusty Russell, David Miller, netdev

Hello!

> kernel thread takes 100% cpu (with preemption

Preemption, you tell... :-)

I begged you to spend 1 minute of your time to press ^Z. Did you?

Alexey

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-27 16:33         ` Alexey Kuznetsov
  2006-07-27 16:51           ` Evgeniy Polyakov
@ 2006-07-28  4:49           ` Rusty Russell
  1 sibling, 0 replies; 60+ messages in thread
From: Rusty Russell @ 2006-07-28  4:49 UTC (permalink / raw)
  To: Alexey Kuznetsov; +Cc: David Miller, johnpol, netdev

On Thu, 2006-07-27 at 20:33 +0400, Alexey Kuznetsov wrote:
> Hello!
> 
> On Thu, Jul 27, 2006 at 03:46:12PM +1000, Rusty Russell wrote:
> > Of course, it means rewriting all the userspace tools, documentation,
> > and creating a complete new infrastructure for connection tracking and
> > NAT, but if that's what's required, then so be it.
> 
> That's what I love to hear. Not a joke. :-)
> 
> Could I only suggest not to relate this to netchannels? :-)
> In the past we used to call this thing (grand-unified) "flow cache".

Yes.  Thankyou for all your explanation, it was very helpful.  I agree,
grand unified lookup idea returns 8).  Netchannels proposal vs.
netfilter forced me back into thinking about it again, but it is
unrelated.  Any "netfilter bypass" acceleration will want similar ideas.

I apologize for misreading your discussion of Evgeniy's implementation
with general channel problem.  My mistake.  

> > userspace), no dequeue lock is required.
> 
> And that was a part of the second question.
> 
> I do not see, how single threaded TCP is possible. In receiver path
> it has to ack with quite strict time bounds, to delack etc., in sender path
> it has to slow start, I am even not saying about "slow path" things:
> retransmit, probing window, lingering without process context etc.
> It looks like, VJ implies the protocol must be changed. We can't, we mustn't.

All good points.  I can see two kinds of problems here: performance
problems due to wakeup (eg. ack processing for 5MB write), and
correctness problems due to no kernel enforcement.  We need measurements
for the performance issues, so I'll ignore them for the moment.

For correctness, in true end-to-end, kernel is just a router for
userspace, then we do not worry about such problems 8)  In real life
kernel must enforce linger and sending tuple correctness, but I don't
know how much else we must regulate.  Too much, and you are right: we
have slow and fast path split just like now.

> After we deidealize this idealization and recognize that some "slow path"
> should exist and some part of this "slow path" has to be executed
> with higher priority than the "fast" one, where do we arrive?
> Is not it exactly what we have right now? Clean fast path, separate slow path.
> Not good enough? Where? Let's find and fix this.

I am still not sure how significant slow path is: if 99% can be in
userspace, it could work very well for RDMA.  I would like to have seen
VJ's implementation so we could compare and steal bits.

Thanks,
Rusty.
-- 
Help! Save Australia from the worst of the DMCA: http://linux.org.au/law

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-27 20:56             ` Alexey Kuznetsov
@ 2006-07-28  5:17               ` Evgeniy Polyakov
  2006-07-28  5:34                 ` David Miller
  0 siblings, 1 reply; 60+ messages in thread
From: Evgeniy Polyakov @ 2006-07-28  5:17 UTC (permalink / raw)
  To: Alexey Kuznetsov; +Cc: Rusty Russell, David Miller, netdev

On Fri, Jul 28, 2006 at 12:56:51AM +0400, Alexey Kuznetsov (kuznet@ms2.inr.ac.ru) wrote:
> Hello!
> 
> > kernel thread takes 100% cpu (with preemption
> 
> Preemption, you tell... :-)
> 
> I begged you to spend 1 minute of your time to press ^Z. Did you?

What would you expect from non-preemptible kernel? Hard lockup, no acks,
no soft irqs. So this case still does not differ from process' context
processing.

And after several minutes I pressed hardware reset button...

> Alexey

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-28  5:17               ` Evgeniy Polyakov
@ 2006-07-28  5:34                 ` David Miller
  2006-07-28  5:47                   ` Evgeniy Polyakov
  0 siblings, 1 reply; 60+ messages in thread
From: David Miller @ 2006-07-28  5:34 UTC (permalink / raw)
  To: johnpol; +Cc: kuznet, rusty, netdev

From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Date: Fri, 28 Jul 2006 09:17:25 +0400

> What would you expect from non-preemptible kernel? Hard lockup, no acks,
> no soft irqs.

Why does pressing Ctrl-Z on the user process stop kernel soft irq
processing?



^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-28  5:34                 ` David Miller
@ 2006-07-28  5:47                   ` Evgeniy Polyakov
  0 siblings, 0 replies; 60+ messages in thread
From: Evgeniy Polyakov @ 2006-07-28  5:47 UTC (permalink / raw)
  To: David Miller; +Cc: kuznet, rusty, netdev

On Thu, Jul 27, 2006 at 10:34:00PM -0700, David Miller (davem@davemloft.net) wrote:
> From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
> Date: Fri, 28 Jul 2006 09:17:25 +0400
> 
> > What would you expect from non-preemptible kernel? Hard lockup, no acks,
> > no soft irqs.
> 
> Why does pressing Ctrl-Z on the user process stop kernel soft irq
> processing?

I do not know, why Alexey decided that Ctrl-Z was ever pressed.
I'm saying about the case when keventd ate 100% of CPU, but stack worked
with (very) long delays. Obviously userspace was unresponsible and no
data arrived there.
It is an analogy that posponed softirq work does not destroy connections
as long as process context protocol processing with delays.
User does not get it's data, so no need to send an ack. And if it is
impossible to get that data at all, user should not care that sending
side does not see acks. When user is capable to get that data, it starts
to acknowledge.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-27  6:00         ` David Miller
  2006-07-27 18:54           ` Stephen Hemminger
@ 2006-07-28  5:54           ` Rusty Russell
  2006-08-01  4:47             ` David Miller
  1 sibling, 1 reply; 60+ messages in thread
From: Rusty Russell @ 2006-07-28  5:54 UTC (permalink / raw)
  To: David Miller; +Cc: kuznet, johnpol, netdev

On Wed, 2006-07-26 at 23:00 -0700, David Miller wrote:
> From: Rusty Russell <rusty@rustcorp.com.au>
> Date: Thu, 27 Jul 2006 15:46:12 +1000
> 
> > Yes, my first thought back in January was how netfilter would interact
> > with this in a sane way.  One answer is "don't": once someone registers
> > on any hook we go into slow path.  Another is to run the hooks in socket
> > context, which is better, but precludes having the consumer in
> > userspace, which still appeals to me 8)
> 
> Small steps, small steps.  I have not ruled out userspace TCP just
> yet, but we are not prepared to go there right now anyways.  It is
> just the same kind of jump to go to kernel level netchannels as it is
> to go from kernel level netchannels to userspace netchannel based TCP.

I think I was unclear; the possibility of userspace netchannels adds
weight to the idea that we should rework netfilter hooks sooner rather
than later.

> > What would the tuple look like?  Off the top of my head:
> > SRCIP/DSTIP/PROTO/SPT/DPT/IN/OUT (where IN and OUT are boolean values
> > indicating whether the src/dest is local).
> > 
> > Of course, it means rewriting all the userspace tools, documentation,
> > and creating a complete new infrastructure for connection tracking and
> > NAT, but if that's what's required, then so be it.
> 
> I think we are able to finally talk seriously about revamping
> netfilter on this level because we finally have a good incentive to do
> so and some kind of model exists to work against.  Robert's trie might
> be able to handle your tuple very well, fwiw, perhaps even with
> prefixing.
> 
> But something occurs to me.  Socket has ID when it is created and
> goes to established state.  This means we have this tuple, and thus
> we can prelookup the netfilter rule and attach this cached lookup
> state on the socket.  Your tuple in this case is defined to be:
> 
> 	SRCIP/DSTIP/"TCP"/SPT/DPT/0/1
> 
> I do not know how practical this is, it is just some suggestion.
> 
> Would there be prefixing in these tuples?  That's where the trouble
> starts.  If you add prefixing, troubles and limitations of lookup of
> today reappear.  If you disallow prefixing, tables get very large
> but lookup becomes simpler and practical.

OK.  AFAICT, there are three ideas in play here (ignoring netchannels).
First, there should be a unified lookup for efficiency (Grand Unified
Cache).  Secondly, that netfilter hook users need to publish information
about what they are actually looking at if they are to use this lookup.
Thirdly, that smart cards can accelerate lookup.

(1) I am imagining some Grand Unified Flow Cache (Olsson trie?) that
holds (some subset of?) flows.  A successful lookup immediately after
packet comes off NIC gives destiny for packet: what route, (optionally)
what socket, what filtering, what connection tracking (& what NAT), etc?
I don't know if this should be a general array of fn & data ptrs, or
specialized fields for each one, or a mix.  Maybe there's a "too hard,
do slow path" bit, or maybe hard cases just never get put in the cache.
Perhaps we need a separate one for locally-generated packets, a-la
ip_route_output().  Anyway, we trade slightly more expensive flow setup
for faster packet processing within flows.

(2) To make this work sanely in the presence of netfilter hooks, we need
them to register the tuples they are interested in.  Not at the hook
level, but *in addition*.  For example, we need to know what flows each
packet filtering rule cares about.  Connection tracking wants to see the
first packet (and first reply packet), but then probably only want to
see packets with RST/SYN/FIN set.  (Erk, window tracking wants to see
every packet, but maybe we could do something).  NAT definitely needs to
see every packet on a connection which is natted.

One way to do this is to add a "have_interest" callback into the
hook_ops, which takes each about-to-be-inserted GUFC entry and adds any
destinies this hook cares about.  In the case of packet filtering this
would do a traversal and append a fn/data ptr to the entry for each rule
which could effect it.  

The other way is to have the hooks register what they are interested in
into a general data structure which GUFC entry creation then looks up
itself.  This general data structure will need to support wildcards
though.

We also need efficient ways of reflecting rule changes into the GUFC.
We can be pretty slack with conntrack timeouts, but we either need to
flush or handle callbacks from GUFC on timed-out entries.  Packet
filtering changes need to be synchronous, definitely.

(3) Smart NICs that do some flowid work themselves can accelerate lookup
implicitly (same flow goes to same CPU/thread) or explicitly (each
CPU/thread maintains only part of GUFC which it needs, or even NIC
returns flow cookie which is pointer to GUFC entry or subtree?).  AFAICT
this will magnify the payoff from the GUFC.

Sorry for the length,
Rusty.
-- 
Help! Save Australia from the worst of the DMCA: http://linux.org.au/law

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-27 18:54           ` Stephen Hemminger
@ 2006-07-28  8:21             ` David Miller
  0 siblings, 0 replies; 60+ messages in thread
From: David Miller @ 2006-07-28  8:21 UTC (permalink / raw)
  To: shemminger; +Cc: rusty, kuznet, johnpol, netdev

From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 27 Jul 2006 11:54:19 -0700

> I think we sell our existing stack short.

I agree.

> There are lots of opportunities left to look more closely at actual
> real performance bottlenecks and improve incrementally. But it
> requires, tools, time, faster net hardware, and some creative
> insight. I guess it just isn't as cool.

We are in fact suggesting some ideas that address the current
stack issues along the way.  Witness the discussion we had about
the tcp_ack() costs wrt. pruning the retransmit queue and tagging
packets for SACK, I'm working on a new data structure and layout
to cure all that stuff.

But I think we can do better.  Jamal said to me one email, "If even
only half of Van's numbers are real, this is really exciting."

Rusty and Alexey are looking at the problem from another direction.
Go back to the unified flow cache, implement all the hair to do
that, and then we can look at netchannels because they will be so
much more straight forward at that point.

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-07-28  5:54           ` Rusty Russell
@ 2006-08-01  4:47             ` David Miller
  2006-08-01  6:36               ` Rusty Russell
  0 siblings, 1 reply; 60+ messages in thread
From: David Miller @ 2006-08-01  4:47 UTC (permalink / raw)
  To: rusty; +Cc: kuznet, johnpol, netdev

From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 28 Jul 2006 15:54:04 +1000

> (1) I am imagining some Grand Unified Flow Cache (Olsson trie?) that
> holds (some subset of?) flows.  A successful lookup immediately after
> packet comes off NIC gives destiny for packet: what route, (optionally)
> what socket, what filtering, what connection tracking (& what NAT), etc?
> I don't know if this should be a general array of fn & data ptrs, or
> specialized fields for each one, or a mix.  Maybe there's a "too hard,
> do slow path" bit, or maybe hard cases just never get put in the cache.
> Perhaps we need a separate one for locally-generated packets, a-la
> ip_route_output().  Anyway, we trade slightly more expensive flow setup
> for faster packet processing within flows.

So, specifically, one of the methods you are thinking about might
be implemented by adding:

	void (*input)(struct sk_buff *, void *);
	void *input_data;

to "struct flow_cache_entry" or whatever replaces it?

This way we don't need some kind of "type" information in
the flow cache entry, since the input handler knows the type.

> One way to do this is to add a "have_interest" callback into the
> hook_ops, which takes each about-to-be-inserted GUFC entry and adds any
> destinies this hook cares about.  In the case of packet filtering this
> would do a traversal and append a fn/data ptr to the entry for each rule
> which could effect it.  

Can you give a concrete example of how the GUFC might make use
of this?  Just some small abstract code snippets will do.

> The other way is to have the hooks register what they are interested in
> into a general data structure which GUFC entry creation then looks up
> itself.  This general data structure will need to support wildcards
> though.

My gut reaction is that imposing a global data structure on all object
classes is not prudent.  When we take a GUFC miss, it seems better we
call into the subsystems to resolve things.  It can implement whatever
slow path lookup algorithm is most appropriate for it's data.

> We also need efficient ways of reflecting rule changes into the GUFC.
> We can be pretty slack with conntrack timeouts, but we either need to
> flush or handle callbacks from GUFC on timed-out entries.  Packet
> filtering changes need to be synchronous, definitely.

This, I will remind, is similar to the problem of doing RCU locking
of the TCP hash tables.

> (3) Smart NICs that do some flowid work themselves can accelerate lookup
> implicitly (same flow goes to same CPU/thread) or explicitly (each
> CPU/thread maintains only part of GUFC which it needs, or even NIC
> returns flow cookie which is pointer to GUFC entry or subtree?).  AFAICT
> this will magnify the payoff from the GUFC.

I want to warn you about HW issues that I mentioned to Alexey the
other week.  If we are not careful, we can run into the same issues
TOE cards run into, performance wise.

Namely, it is important to be careful about how the GUFC table entries
get updated in the card.  If you add them synchronously, your
connection rates will deteriorate dramatically.

I had the idea of a lazy scheme.  When we create a GUFC entry, we
tack it onto a DMA'able linked list the card uses.  We do not
notify the card, we just entail the update onto the list.

Then, if the card misses it's on-chip GUFC table on an incoming
packet, it checks the DMA update list by reading it in from memory.
It updates it's GUFC table with whatever entries are found on this
list, then it retries to classify the packet.

This seems like a possible good solution until we try to address GUFC
entry deletion, which unfortunately cannot be evaluated in a lazy
fashion.  It must be synchronous.  This is because if, for example, we
just killed off a TCP socket we must make sure we don't hit the GUFC
entry for the TCP identity of that socket any longer.

Just something to think about, when considering how to translate these
ideas into hardware.

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: Netchannles: first stage has been completed. Further ideas.
  2006-08-01  4:47             ` David Miller
@ 2006-08-01  6:36               ` Rusty Russell
  0 siblings, 0 replies; 60+ messages in thread
From: Rusty Russell @ 2006-08-01  6:36 UTC (permalink / raw)
  To: David Miller; +Cc: kuznet, johnpol, netdev

On Mon, 2006-07-31 at 21:47 -0700, David Miller wrote:
> From: Rusty Russell <rusty@rustcorp.com.au>
> Date: Fri, 28 Jul 2006 15:54:04 +1000
> 
> > (1) I am imagining some Grand Unified Flow Cache (Olsson trie?) that
> > holds (some subset of?) flows.  A successful lookup immediately after
> > packet comes off NIC gives destiny for packet: what route, (optionally)
> > what socket, what filtering, what connection tracking (& what NAT), etc?
> > I don't know if this should be a general array of fn & data ptrs, or
> > specialized fields for each one, or a mix.  Maybe there's a "too hard,
> > do slow path" bit, or maybe hard cases just never get put in the cache.
> > Perhaps we need a separate one for locally-generated packets, a-la
> > ip_route_output().  Anyway, we trade slightly more expensive flow setup
> > for faster packet processing within flows.
> 
> So, specifically, one of the methods you are thinking about might
> be implemented by adding:
> 
> 	void (*input)(struct sk_buff *, void *);
> 	void *input_data;
> 
> to "struct flow_cache_entry" or whatever replaces it?

Probably needs a return value to indicate stop packet processing, and to
be completely general I think we'd want more than one, eg:

	#define MAX_GUFC_INPUTS 5
	unsigned int num_inputs;
	int (*input[MAX_GUFC_INPUTS])(struct sk_buff *, void *);
	void *input_data[MAX_GUFC_INPUTS];

> This way we don't need some kind of "type" information in
> the flow cache entry, since the input handler knows the type.

Some things may want to jam more than a pointer into the cache entry, so
we might do something clever later, but as a first cut this would seem
to work.

> > One way to do this is to add a "have_interest" callback into the
> > hook_ops, which takes each about-to-be-inserted GUFC entry and adds any
> > destinies this hook cares about.  In the case of packet filtering this
> > would do a traversal and append a fn/data ptr to the entry for each rule
> > which could effect it.  
> 
> Can you give a concrete example of how the GUFC might make use
> of this?  Just some small abstract code snippets will do.

OK, I take it back.  I was thinking that on a miss, the GUFC called into
each subsystem to populate the new GUFC entry.  That would be a radical
departure from the current code, so forget it.

So, on a GUFC miss, we could create a new GUFC entry (on stack?), hang
it off the skb, then as each subsystem adds to it as we go through.  At
some point (handwave?) we collect the skb->gufc and insert it into the
trie.

For iptables, as a first step we'd simply do (open-coded for now):

	/* FIXME: Do acceleration properly */
	struct gufc *gufc = skb->gufc;
	if (!gufc || gufc->num_inputs == MAX_INPUTS) {
		skb->gufc = NULL;
	} else {
		gufc->input[gufc->num_inputs] = traverse_entire_table;
		gufc->input_data[gufc->num_inputs++] = this_table;
	}

Later we'd get funky:

	/* Filtering code here */
	...

	if (num_rules_applied > 1 || !only_needed_flow_info) {
		gufc->input[gufc->num_inputs] = traverse_entire_table;
		gufc->input_data[gufc->num_inputs++] = this_table;
	} else if (num_rules_applied == 1) {
		gufc->input[gufc->num_inputs] = traverse_one_rule;
		gufc->input_data[gufc->num_inputs++] = last_rule;
	}

Note that this could be cleverer, too:

	if (result == NF_DROP && only_needed_flow_info) {
		// Who cares about other inputs, we're going to drop
		gufc->input[0] = drop_skb;
		gufc->num_inputs = 1;
	}

Two potential performance issues: 

1) When we change rules, iptables replaces entire table from userspace.
We need pkttables (which uses incremental rule updates) to flush
intelligently.

2) Every iptables rule currently keeps pkt/byte counters, meaning we
can't bypass rules even though they might have no effect on the packet
(eg. iptables -A INPUT -i eth0 -j ETH0_RULES).  We can address this by
having pkt/byte counters in the gufc entry and a method of pushing them
back to iptables when the gufc entry is pruned, and manually traversing
the trie to flush them when the user asks for counters.

> I had the idea of a lazy scheme.  When we create a GUFC entry, we
> tack it onto a DMA'able linked list the card uses.  We do not
> notify the card, we just entail the update onto the list.
> 
> Then, if the card misses it's on-chip GUFC table on an incoming
> packet, it checks the DMA update list by reading it in from memory.
> It updates it's GUFC table with whatever entries are found on this
> list, then it retries to classify the packet.

I had assumed we would simply do full lookup on non-hw-classified
packets, so async insertion is a non-issue.  Can we assume hardware will
cover entire GUFC trie?

> This seems like a possible good solution until we try to address GUFC
> entry deletion, which unfortunately cannot be evaluated in a lazy
> fashion.  It must be synchronous.  This is because if, for example, we
> just killed off a TCP socket we must make sure we don't hit the GUFC
> entry for the TCP identity of that socket any longer.

With RCU, we'll probably be marking the GUFC entry deleted and freeing
it in a callback sometime later.  This gives us a window in which we can
delete it from the card's cache.  If we hit the callback and the card
still hasn't been updated, we need to go synchronous, but maybe that
will be rare?

> Just something to think about, when considering how to translate these
> ideas into hardware.

Yes, it's easy to imagine a DoS pattern where we spend all our cycles
updating trie and hw table, and even less time processing packets.

Cheers,
Rusty.
-- 
Help! Save Australia from the worst of the DMCA: http://linux.org.au/law

^ permalink raw reply	[flat|nested] 60+ messages in thread

end of thread, other threads:[~2006-08-01  6:36 UTC | newest]

Thread overview: 60+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-07-18  8:16 Netchannles: first stage has been completed. Further ideas Evgeniy Polyakov
2006-07-18  8:34 ` David Miller
2006-07-18  8:50   ` Evgeniy Polyakov
2006-07-18 11:16 ` Christian Borntraeger
2006-07-18 11:51   ` Evgeniy Polyakov
2006-07-18 12:36     ` Christian Borntraeger
2006-07-18 19:11       ` Evgeniy Polyakov
2006-07-18 21:20         ` David Miller
2006-07-18 12:15 ` Jörn Engel
2006-07-18 19:08   ` Evgeniy Polyakov
2006-07-19 11:00     ` Jörn Engel
2006-07-20  7:42       ` Evgeniy Polyakov
2006-07-18 23:01 ` Alexey Kuznetsov
2006-07-19  0:39   ` David Miller
2006-07-19  5:38   ` Evgeniy Polyakov
2006-07-19  6:30     ` Evgeniy Polyakov
2006-07-19 13:19     ` Alexey Kuznetsov
2006-07-20  7:32       ` Evgeniy Polyakov
2006-07-20 16:41         ` Alexey Kuznetsov
2006-07-20 21:08           ` Evgeniy Polyakov
2006-07-20 21:21             ` Ben Greear
2006-07-21  7:19               ` Evgeniy Polyakov
2006-07-21  7:20                 ` Evgeniy Polyakov
2006-07-21 16:14                 ` Ben Greear
2006-07-21 16:27                   ` Evgeniy Polyakov
2006-07-22 13:23                   ` Caitlin Bestler
2006-07-20 21:40             ` Ian McDonald
2006-07-21  7:26               ` Evgeniy Polyakov
2006-07-20 22:59             ` Alexey Kuznetsov
2006-07-21  4:55               ` David Miller
2006-07-21  7:10                 ` Evgeniy Polyakov
2006-07-21  7:47                   ` David Miller
2006-07-21  9:06                     ` Evgeniy Polyakov
2006-07-21  9:19                       ` David Miller
2006-07-21  9:39                         ` Evgeniy Polyakov
2006-07-21  9:46                           ` David Miller
2006-07-21  9:55                             ` Evgeniy Polyakov
2006-07-21 16:26                 ` Rick Jones
2006-07-21 20:57                   ` David Miller
2006-07-19 19:52   ` Stephen Hemminger
2006-07-19 20:01     ` David Miller
2006-07-19 20:16       ` Stephen Hemminger
2006-07-24 18:54       ` Stephen Hemminger
2006-07-24 20:52         ` Alexey Kuznetsov
2006-07-27  2:17   ` Rusty Russell
2006-07-27  5:17     ` David Miller
2006-07-27  5:46       ` Rusty Russell
2006-07-27  6:00         ` David Miller
2006-07-27 18:54           ` Stephen Hemminger
2006-07-28  8:21             ` David Miller
2006-07-28  5:54           ` Rusty Russell
2006-08-01  4:47             ` David Miller
2006-08-01  6:36               ` Rusty Russell
2006-07-27 16:33         ` Alexey Kuznetsov
2006-07-27 16:51           ` Evgeniy Polyakov
2006-07-27 20:56             ` Alexey Kuznetsov
2006-07-28  5:17               ` Evgeniy Polyakov
2006-07-28  5:34                 ` David Miller
2006-07-28  5:47                   ` Evgeniy Polyakov
2006-07-28  4:49           ` Rusty Russell

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).