public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Shailabh Nagar <nagar@watson.ibm.com>
To: linux-kernel <linux-kernel@vger.kernel.org>,
	ckrm-tech <ckrm-tech@lists.sourceforge.net>
Subject: [PATCH 6/6] CKRM socket accept queue resource controller
Date: Thu, 29 Apr 2004 04:24:57 -0400	[thread overview]
Message-ID: <4090BBD9.7050807@watson.ibm.com> (raw)

[-- Attachment #1: Type: text/plain, Size: 0 bytes --]



[-- Attachment #2: 05-socketaq.ckrm-E12.patch --]
[-- Type: text/plain, Size: 29574 bytes --]

diff -Nru a/include/linux/tcp.h b/include/linux/tcp.h
--- a/include/linux/tcp.h	Wed Apr 28 22:41:05 2004
+++ b/include/linux/tcp.h	Wed Apr 28 22:41:05 2004
@@ -128,6 +128,10 @@
 #define TCP_INFO		11	/* Information about this connection. */
 #define TCP_QUICKACK		12	/* Block/reenable quick acks */
 
+#ifdef CONFIG_ACCEPT_QUEUES
+#define TCP_ACCEPTQ_SHARE	13	/* Set accept queue share */
+#endif
+
 #define TCPI_OPT_TIMESTAMPS	1
 #define TCPI_OPT_SACK		2
 #define TCPI_OPT_WSCALE		4
@@ -185,6 +189,18 @@
 	__u32	tcpi_reordering;
 };
 
+#ifdef CONFIG_ACCEPT_QUEUES
+
+#define NUM_ACCEPT_QUEUES	8 	/* Must be power of 2 */
+
+struct tcp_acceptq_info {
+	unsigned char acceptq_shares;
+	unsigned long acceptq_wait_time;
+	unsigned int acceptq_qcount;
+	unsigned int acceptq_count;
+};
+#endif
+
 #ifdef __KERNEL__
 
 #include <linux/config.h>
@@ -362,8 +378,9 @@
 
 	/* FIFO of established children */
 	struct open_request	*accept_queue;
-	struct open_request	*accept_queue_tail;
-
+#ifndef CONFIG_ACCEPT_QUEUES
+	struct open_request     *accept_queue_tail;
+#endif
 	int			write_pending;	/* A write to socket waits to start. */
 
 	unsigned int		keepalive_time;	  /* time before keep alive takes place */
@@ -387,6 +404,22 @@
                 __u32    rtt;
                 __u32    rtt_min;          /* minimum observed RTT */
         } westwood;
+
+#ifdef CONFIG_ACCEPT_QUEUES
+	/* move to listen opt... */
+	char		class_index;
+	struct {
+		struct open_request     *aq_head;
+		struct open_request     *aq_tail;
+		unsigned int		 aq_cnt;
+		unsigned int		 aq_ratio;
+		unsigned int             aq_count;
+		unsigned int             aq_qcount;
+		unsigned int             aq_backlog;
+		unsigned int             aq_wait_time;
+		int			 aq_valid;
+	} acceptq[NUM_ACCEPT_QUEUES];
+#endif
 };
 
 /* WARNING: don't change the layout of the members in tcp_sock! */
diff -Nru a/include/net/sock.h b/include/net/sock.h
--- a/include/net/sock.h	Wed Apr 28 22:41:05 2004
+++ b/include/net/sock.h	Wed Apr 28 22:41:05 2004
@@ -245,6 +245,7 @@
 	struct timeval		sk_stamp;
 	struct socket		*sk_socket;
 	void			*sk_user_data;
+	void                    *sk_ns;        // For use by CKRM
 	struct module		*sk_owner;
 	void			*sk_security;
 	void			(*sk_state_change)(struct sock *sk);
diff -Nru a/include/net/tcp.h b/include/net/tcp.h
--- a/include/net/tcp.h	Wed Apr 28 22:41:05 2004
+++ b/include/net/tcp.h	Wed Apr 28 22:41:05 2004
@@ -642,6 +642,10 @@
 		struct tcp_v6_open_req v6_req;
 #endif
 	} af;
+#ifdef CONFIG_ACCEPT_QUEUES
+	unsigned long acceptq_time_stamp;
+	int	      acceptq_class;
+#endif
 };
 
 /* SLAB cache for open requests. */
@@ -1691,6 +1695,69 @@
 	return tcp_win_from_space(sk->sk_rcvbuf); 
 }
 
+#ifdef CONFIG_ACCEPT_QUEUES
+static inline void tcp_acceptq_removed(struct sock *sk, int class)
+{
+	tcp_sk(sk)->acceptq[class].aq_backlog--;
+}
+
+static inline void tcp_acceptq_added(struct sock *sk, int class)
+{
+	tcp_sk(sk)->acceptq[class].aq_backlog++;
+}
+
+static inline int tcp_acceptq_is_full(struct sock *sk, int class)
+{
+	return tcp_sk(sk)->acceptq[class].aq_backlog >
+		sk->sk_max_ack_backlog;
+}
+
+static inline void tcp_set_acceptq(struct tcp_opt *tp, struct open_request *req)
+{
+	int class = req->acceptq_class;
+	int prev_class;
+
+	if (!tp->acceptq[class].aq_ratio) {
+		req->acceptq_class = 0;
+		class = 0;
+	}
+
+	tp->acceptq[class].aq_qcount++;
+	req->acceptq_time_stamp = jiffies;
+
+	if (tp->acceptq[class].aq_tail) {
+		req->dl_next = tp->acceptq[class].aq_tail->dl_next;
+		tp->acceptq[class].aq_tail->dl_next = req;
+		tp->acceptq[class].aq_tail = req;
+	} else { /* if first request in the class */
+		tp->acceptq[class].aq_head = req;
+		tp->acceptq[class].aq_tail = req;
+
+		prev_class = class - 1;
+		while (prev_class >= 0) {
+			if (tp->acceptq[prev_class].aq_tail)
+				break;
+			prev_class--;
+		}
+		if (prev_class < 0) {
+			req->dl_next = tp->accept_queue;
+			tp->accept_queue = req;
+		}
+		else {
+			req->dl_next = tp->acceptq[prev_class].aq_tail->dl_next;
+			tp->acceptq[prev_class].aq_tail->dl_next = req;
+		}
+	}
+}
+static inline void tcp_acceptq_queue(struct sock *sk, struct open_request *req,
+					 struct sock *child)
+{
+	tcp_set_acceptq(tcp_sk(sk),req);
+	req->sk = child;
+	tcp_acceptq_added(sk,req->acceptq_class);
+}
+
+#else
 static inline void tcp_acceptq_removed(struct sock *sk)
 {
 	sk->sk_ack_backlog--;
@@ -1723,16 +1790,55 @@
 	req->dl_next = NULL;
 }
 
+#endif
+
 struct tcp_listen_opt
 {
 	u8			max_qlen_log;	/* log_2 of maximal queued SYNs */
 	int			qlen;
+#ifdef CONFIG_ACCEPT_QUEUES
+	int			qlen_young[NUM_ACCEPT_QUEUES];
+#else
 	int			qlen_young;
+#endif
 	int			clock_hand;
 	u32			hash_rnd;
 	struct open_request	*syn_table[TCP_SYNQ_HSIZE];
 };
 
+#ifdef CONFIG_ACCEPT_QUEUES
+static inline void
+tcp_synq_removed(struct sock *sk, struct open_request *req)
+{
+	struct tcp_listen_opt *lopt = tcp_sk(sk)->listen_opt;
+
+	if (--lopt->qlen == 0)
+		tcp_delete_keepalive_timer(sk);
+	if (req->retrans == 0)
+		lopt->qlen_young[req->acceptq_class]--;
+}
+
+static inline void tcp_synq_added(struct sock *sk, struct open_request *req)
+{
+	struct tcp_listen_opt *lopt = tcp_sk(sk)->listen_opt;
+
+	if (lopt->qlen++ == 0)
+		tcp_reset_keepalive_timer(sk, TCP_TIMEOUT_INIT);
+	lopt->qlen_young[req->acceptq_class]++;
+}
+
+static inline int tcp_synq_len(struct sock *sk)
+{
+	return tcp_sk(sk)->listen_opt->qlen;
+}
+
+static inline int tcp_synq_young(struct sock *sk, int class)
+{
+	return tcp_sk(sk)->listen_opt->qlen_young[class];
+}
+
+#else
+
 static inline void
 tcp_synq_removed(struct sock *sk, struct open_request *req)
 {
@@ -1762,6 +1868,7 @@
 {
 	return tcp_sk(sk)->listen_opt->qlen_young;
 }
+#endif
 
 static inline int tcp_synq_is_full(struct sock *sk)
 {
diff -Nru a/kernel/ckrm/ckrm_listenaq.c b/kernel/ckrm/ckrm_listenaq.c
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/kernel/ckrm/ckrm_listenaq.c	Wed Apr 28 22:41:05 2004
@@ -0,0 +1,503 @@
+/* ckrm_socketaq.c - accept queue resource controller
+ *
+ * Copyright (C) Vivek Kashyap,      IBM Corp. 2004
+ * 
+ * Latest version, more details at http://ckrm.sf.net
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/* Changes
+ * Initial version
+ */
+
+/* Code Description: TBD
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <asm/errno.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/ckrm.h>
+#include <linux/ckrm_rc.h>
+#include <net/tcp.h>
+
+#include <linux/ckrm_net.h>
+
+#define hnode_2_core(ptr) \
+                ((ptr) ? container_of(ptr, struct ckrm_core_class, hnode) : NULL)
+
+
+#define CKRM_SAQ_MAX_DEPTH	3 // 0 => /rcfs
+				  // 1 => socket_aq
+				  // 2 => socket_aq/listen_class
+				  // 3 => socket_aq/listen_class/accept_queues
+				  // 4 => Not allowed
+
+typedef struct ckrm_laq_res {
+	spinlock_t		reslock;
+	atomic_t		refcnt;
+	struct ckrm_shares 	shares;
+	struct ckrm_core_class *core;
+	struct ckrm_core_class *pcore;
+	int 			my_depth;
+	int 			my_id;
+} ckrm_laq_res_t;
+
+static int my_resid = -1;
+
+extern	struct ckrm_core_class *rcfs_create_under_netroot(char *, int, int);
+extern struct ckrm_core_class *rcfs_make_core(struct dentry *, 
+						struct ckrm_core_class * ) ;
+
+void
+laq_res_hold(struct ckrm_laq_res *res)
+{
+        atomic_inc(&res->refcnt);
+	return;
+}
+
+void
+laq_res_put(struct ckrm_laq_res *res)
+{
+	if (atomic_dec_and_test(&res->refcnt))
+		kfree(res);
+	return;
+}
+
+/* Initialize rescls values
+ */
+static void
+laq_res_initcls(void *my_res)
+{
+	ckrm_laq_res_t *res = my_res;
+
+	res->shares.my_guarantee     = CKRM_SHARE_DONTCARE;
+	res->shares.my_limit         = CKRM_SHARE_DONTCARE;
+	res->shares.total_guarantee  = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
+	res->shares.max_limit        = CKRM_SHARE_DFLT_MAX_LIMIT;
+	res->shares.unused_guarantee = CKRM_SHARE_DFLT_TOTAL_GUARANTEE;
+	res->shares.cur_max_limit    = 0;
+}
+
+static int 
+atoi(char *s)
+{
+	int k = 0;
+	while(*s) 
+		k = *s++ - '0' + (k * 10);
+	return k;
+}
+
+static char *
+laq_get_name(struct ckrm_core_class *c)
+{
+        char *p = (char *)c->name;
+
+        while(*p)
+                p++;
+        while( *p != '/' && p != c->name)
+                p--;
+
+        return ++p;
+}
+
+static void *
+laq_res_alloc(struct ckrm_core_class *core, struct ckrm_core_class *parent)
+{
+	ckrm_laq_res_t *res, *pres;
+	int pdepth;
+
+	if (parent)
+		pres = ckrm_get_res_class(parent, my_resid, ckrm_laq_res_t);
+	else
+		pres = NULL;
+
+	if (core == core->classtype->default_class)    
+		pdepth = 1;
+	else {
+		if (!parent)
+			return NULL;
+		pdepth = 1 + pres->my_depth;
+	}
+
+	res = kmalloc(sizeof(ckrm_laq_res_t), GFP_ATOMIC);
+	if (res) {
+		memset(res, 0, sizeof(res));
+		spin_lock_init(&res->reslock);
+		laq_res_hold(res);
+		res->my_depth  = pdepth;
+		if (pdepth == 2)	// listen class
+			res->my_id = 0;
+		else if (pdepth == 3)
+			res->my_id = atoi(laq_get_name(core));
+		res->core = core;
+		res->pcore = parent;
+
+		// rescls in place, now initialize contents other than 
+		// hierarchy pointers
+		laq_res_initcls(res); // acts as initialising value
+	}
+
+	return res;
+}
+
+static void
+laq_res_free(void *my_res)
+{
+	ckrm_laq_res_t *res = (ckrm_laq_res_t *)my_res;
+	ckrm_laq_res_t *parent;
+
+	if (!res) 
+		return;
+
+	if (res->my_depth != 3) {
+		kfree(res);
+		return;
+	}
+
+	parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t);
+	if (!parent)	// Should never happen
+		return;
+
+	spin_lock(&parent->reslock);
+	spin_lock(&res->reslock);
+
+	// return child's guarantee to parent node
+	// Limits have no meaning for accept queue control
+	child_guarantee_changed(&parent->shares, res->shares.my_guarantee, 0);
+
+	spin_unlock(&res->reslock);
+	laq_res_put(res);	
+	spin_unlock(&parent->reslock);
+	return;
+}
+
+/**************************************************************************
+ * 			SHARES					        ***
+ **************************************************************************/
+
+void
+laq_set_aq_values(ckrm_laq_res_t *my_res, ckrm_laq_res_t *parent, int updatep)
+{
+
+	struct ckrm_net_struct *ns;
+	struct ckrm_core_class *core = parent->core;
+	struct tcp_opt *tp;
+	
+	if (my_res->my_depth < 2) 
+		return;
+	
+	// XXX Instead of holding a  class_lock introduce a rw
+	// lock to be write locked by listen callbacks and read locked here.
+	// - VK
+	class_lock(core);
+	list_for_each_entry(ns, &core->objlist,ckrm_link) { 
+		tp = tcp_sk(ns->ns_sk);
+		if (updatep)
+			tp->acceptq[0].aq_ratio =
+			       parent->shares.total_guarantee/
+				parent->shares.unused_guarantee;	       
+
+		tp->acceptq[my_res->my_id].aq_ratio =
+		       my_res->shares.total_guarantee/
+			parent->shares.my_guarantee;	       
+	}
+	class_unlock(core);
+	return;
+}
+
+static int
+laq_set_share_values(void *my_res, struct ckrm_shares *shares)
+{
+	ckrm_laq_res_t *res = my_res;
+	ckrm_laq_res_t *parent, *child;
+	struct ckrm_hnode *chnode; 
+	int rc = 0;
+
+	if (!res) 
+		return -EINVAL;
+
+	if (!res->pcore) { 
+		// something is badly wrong
+		printk(KERN_ERR "socketaq internal inconsistency\n");
+		return -EBADF;
+	}
+
+	parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t);
+	if (!parent)	// socket_class does not have a share interface
+		return -EINVAL;
+
+	// Ensure that we ignore limit values
+	shares->my_limit = shares->max_limit = CKRM_SHARE_UNCHANGED;
+
+	switch (res->my_depth) {
+
+	case 0: printk(KERN_ERR "socketaq bad entry\n");
+		rc = -EBADF;
+		break;
+
+	case 1: // can't be written to. this is internal default.
+		// return -EINVAL
+		rc = -EINVAL;
+		break;
+
+	case 2: // nothing to inherit
+		if (!shares->total_guarantee) {
+			rc = -EINVAL;
+			break;
+		}
+
+		ckrm_lock_hier(res->pcore);
+		spin_lock(&res->reslock);
+		rc = set_shares(shares, &res->shares, NULL);
+		if (!rc) {
+			list_for_each_entry(chnode,
+					&res->core->hnode.children,siblings){
+				child=hnode_2_core(chnode)->res_class[my_resid];
+				laq_set_aq_values(child,res,(child->my_id==1));
+			}
+		}
+		spin_unlock(&res->reslock);
+		ckrm_unlock_hier(res->pcore);
+		break;
+
+	case 3: // accept queue itself. Check against parent.
+		ckrm_lock_hier(parent->pcore);
+		spin_lock(&parent->reslock);
+		rc = set_shares(shares, &res->shares, &parent->shares);
+		if (!rc) {
+			laq_set_aq_values(res,parent,1);
+		}
+		spin_unlock(&parent->reslock);
+		ckrm_unlock_hier(parent->pcore);
+		break;
+	}
+
+	return rc;
+}
+
+static int
+laq_get_share_values(void *my_res, struct ckrm_shares *shares)
+{
+	ckrm_laq_res_t *res = my_res;
+
+	if (!res) 
+		return -EINVAL;
+	*shares = res->shares;
+	return 0;
+}
+
+/**************************************************************************
+ * 			STATS						***
+ **************************************************************************/
+
+void
+laq_print_aq_stats(struct seq_file *sfile, struct tcp_acceptq_info *taq, int i)
+{
+	seq_printf(sfile, "Class %d connections:\n\taccepted: %u\n\t"
+			  "queued: %u\n\twait_time: %lu\n\t",
+			  i, taq->acceptq_count, taq->acceptq_qcount,
+			  taq->acceptq_wait_time);
+
+	if (i)
+		return;
+
+	for (i = 1; i < NUM_ACCEPT_QUEUES; i++) {
+		taq[0].acceptq_wait_time += taq[i].acceptq_wait_time;
+		taq[0].acceptq_qcount += taq[i].acceptq_qcount;
+		taq[0].acceptq_count += taq[i].acceptq_count;
+	}
+
+	seq_printf(sfile, "Totals :\n\taccepted: %u\n\t"
+			  "queued: %u\n\twait_time: %lu\n",
+			   taq->acceptq_count, taq->acceptq_qcount,
+			  taq->acceptq_wait_time);
+
+	return;
+}
+
+void
+laq_get_aq_stats(ckrm_laq_res_t *pres, ckrm_laq_res_t *mres, 
+					struct tcp_acceptq_info *taq)
+{
+	struct ckrm_net_struct *ns;
+	struct ckrm_core_class *core = pres->core;
+	struct tcp_opt *tp;
+	int a = mres->my_id;
+	int z;
+
+	if (a == 0)
+		z = NUM_ACCEPT_QUEUES;
+	else
+		z = a+1;
+
+	// XXX Instead of holding a  class_lock introduce a rw
+	// lock to be write locked by listen callbacks and read locked here.
+	// - VK
+	class_lock(pres->core);
+	list_for_each_entry(ns, &core->objlist,ckrm_link) { 
+		tp = tcp_sk(ns->ns_sk);
+		for (; a< z; a++) {
+			taq->acceptq_wait_time += tp->acceptq[a].aq_wait_time;
+			taq->acceptq_qcount += tp->acceptq[a].aq_qcount;
+			taq->acceptq_count += tp->acceptq[a].aq_count;
+			taq++;
+		}
+	}
+	class_unlock(pres->core);
+}
+
+
+static int  
+laq_get_stats(void *my_res, struct seq_file *sfile)
+{
+	ckrm_laq_res_t *res = my_res;
+	ckrm_laq_res_t *parent;
+	struct tcp_acceptq_info taq[NUM_ACCEPT_QUEUES];
+	int rc = 0;
+
+	if (!res) 
+		return -EINVAL;
+	
+	if (!res->pcore) { 
+		// something is badly wrong
+		printk(KERN_ERR "socketaq internal inconsistency\n");
+		return -EBADF;
+	}
+
+	parent = ckrm_get_res_class(res->pcore, my_resid, ckrm_laq_res_t);
+	if (!parent) {	// socket_class does not have a stat interface
+		printk(KERN_ERR "socketaq internal fs inconsistency\n");
+		return -EINVAL;
+	}
+
+	memset(taq, 0, sizeof(struct tcp_acceptq_info) * NUM_ACCEPT_QUEUES);
+
+	switch (res->my_depth) {
+
+	default:
+	case 0: printk(KERN_ERR "socket class bad entry\n");
+		rc = -EBADF;
+		break;
+
+	case 1: // can't be read from. this is internal default.
+		// return -EINVAL
+		rc = -EINVAL;
+		break;
+
+	case 2: // return the default and total
+		ckrm_lock_hier(res->core);	// block any deletes
+		laq_get_aq_stats(res, res, &taq[0]);
+		laq_print_aq_stats(sfile, &taq[0], 0);
+		ckrm_unlock_hier(res->core);	// block any deletes
+		break;
+
+	case 3: 
+		ckrm_lock_hier(parent->core);	// block any deletes
+		laq_get_aq_stats(parent, res, &taq[res->my_id]);
+		laq_print_aq_stats(sfile, &taq[res->my_id], res->my_id);
+		ckrm_unlock_hier(parent->core);	// block any deletes
+		break;
+	}
+
+	return rc;
+}
+
+/*
+ * The network connection is reclassified to this class. Update its shares.
+ * The socket lock is held. 
+ */
+static void
+laq_change_resclass(void *n, void *old, void *r)
+{
+	struct ckrm_net_struct *ns = (struct ckrm_net_struct *)n;
+	struct ckrm_laq_res *res = (struct ckrm_laq_res *)r;
+	struct ckrm_hnode  *chnode = NULL;
+
+
+	if (res->my_depth != 2) 
+		return;	
+
+	// a change to my_depth == 3 ie. the accept classes cannot happen.
+	// there is no target file
+	if (res->my_depth == 2) { // it is one of the socket classes
+		struct ckrm_laq_res *reschild;
+		struct sock *sk = ns->ns_sk; 
+		struct tcp_opt *tp = tcp_sk(sk);
+
+		// share rule: hold parent resource lock. then self.
+		// However, since my_depth == 1 is a generic class it is not
+		// needed here. Self lock is enough.
+		spin_lock(&res->reslock);
+		tp->acceptq[0].aq_ratio = res->shares.total_guarantee/
+				res->shares.unused_guarantee;
+		list_for_each_entry(chnode,&res->core->hnode.children,siblings){
+			reschild = hnode_2_core(chnode)->res_class[my_resid];
+
+			spin_lock(&reschild->reslock);
+			tp->acceptq[reschild->my_id].aq_ratio=
+				reschild->shares.total_guarantee/
+					res->shares.my_guarantee;
+			spin_unlock(&reschild->reslock);
+		}
+		spin_unlock(&res->reslock);
+	}
+	
+	return;
+}
+
+struct ckrm_res_ctlr laq_rcbs = {
+	.res_name          = "laq",
+	.resid		   = -1 , // dynamically assigned
+	.res_alloc         = laq_res_alloc,
+	.res_free          = laq_res_free,
+	.set_share_values  = laq_set_share_values,
+	.get_share_values  = laq_get_share_values,
+	.get_stats         = laq_get_stats,
+	.change_resclass   = laq_change_resclass,
+	//	.res_initcls       = laq_res_initcls,         // LAQ_HUBERTUS: no need for this !!
+};
+
+int __init
+init_ckrm_laq_res(void)
+{
+	struct ckrm_classtype *clstype;
+	int resid;
+
+	clstype = ckrm_find_classtype_by_name("socket_class");
+	if (clstype == NULL) {
+		printk(KERN_INFO " Unknown ckrm classtype<socket_class>");
+		return -ENOENT;
+	}
+
+	if (my_resid == -1) {
+		resid = ckrm_register_res_ctlr(clstype,&laq_rcbs);
+		if (resid >= 0)
+			my_resid = resid;
+		printk("........init_ckrm_listen_aq_res -> %d\n",my_resid);
+	}
+	return 0;
+
+}	
+
+void __exit
+exit_ckrm_laq_res(void)
+{
+	ckrm_unregister_res_ctlr(&laq_rcbs);
+	my_resid = -1;
+}
+
+
+module_init(init_ckrm_laq_res)
+module_exit(exit_ckrm_laq_res)
+
+MODULE_LICENSE("GPL");
+
diff -Nru a/net/ipv4/Kconfig b/net/ipv4/Kconfig
--- a/net/ipv4/Kconfig	Wed Apr 28 22:41:05 2004
+++ b/net/ipv4/Kconfig	Wed Apr 28 22:41:05 2004
@@ -360,5 +360,28 @@
 	  
 	  If unsure, say Y.
 
+config ACCEPT_QUEUES 
+	bool "IP: TCP Multiple accept queues support"
+	depends on INET && NETFILTER
+	---help---
+	  Support multiple accept queues per listening socket. If you say Y
+	  here, multiple accept queues will be configured per listening
+	  socket.
+	  
+	  Each queue is mapped to a priority class. Incoming connection 
+	  requests can be classified (see iptables(8), MARK target), depending
+	  on the packet's src/dest address or other parameters, into one of 
+	  the priority classes. The requests are then queued to the relevant
+	  accept queue. 
+
+	  Each of the queues can be assigned a weight. The accept()ance 
+	  of packets is then scheduled in accordance with the weight 
+	  assigned to the priority class. 
+	  
+	  Be sure to enable "Network packet filtering" if you wish
+	  to use this feature.
+
+	  If unsure, say N.
+
 source "net/ipv4/ipvs/Kconfig"
 
diff -Nru a/net/ipv4/tcp.c b/net/ipv4/tcp.c
--- a/net/ipv4/tcp.c	Wed Apr 28 22:41:05 2004
+++ b/net/ipv4/tcp.c	Wed Apr 28 22:41:05 2004
@@ -256,6 +256,7 @@
 #include <linux/smp_lock.h>
 #include <linux/fs.h>
 #include <linux/random.h>
+#include <linux/ckrm.h>
 
 #include <net/icmp.h>
 #include <net/tcp.h>
@@ -534,13 +535,34 @@
 
 int tcp_listen_start(struct sock *sk)
 {
+#ifdef CONFIG_ACCEPT_QUEUES
+	int i = 0;
+#endif
 	struct inet_opt *inet = inet_sk(sk);
 	struct tcp_opt *tp = tcp_sk(sk);
 	struct tcp_listen_opt *lopt;
 
 	sk->sk_max_ack_backlog = 0;
 	sk->sk_ack_backlog = 0;
-	tp->accept_queue = tp->accept_queue_tail = NULL;
+	tp->accept_queue = NULL;
+#ifdef CONFIG_ACCEPT_QUEUES
+	tp->class_index = 0;
+	for (i=0; i < NUM_ACCEPT_QUEUES; i++) {
+		tp->acceptq[i].aq_tail = NULL;
+		tp->acceptq[i].aq_head = NULL;
+		tp->acceptq[i].aq_wait_time = 0; 
+		tp->acceptq[i].aq_qcount = 0; 
+		tp->acceptq[i].aq_count = 0; 
+		if (i == 0) {
+			tp->acceptq[i].aq_valid = 1; 
+			tp->acceptq[i].aq_ratio = 1; 
+		}
+		else {
+			tp->acceptq[i].aq_valid = 0; 
+			tp->acceptq[i].aq_ratio = 0; 
+		}
+	}
+#endif
 	tp->syn_wait_lock = RW_LOCK_UNLOCKED;
 	tcp_delack_init(tp);
 
@@ -570,6 +592,10 @@
 		sk_dst_reset(sk);
 		sk->sk_prot->hash(sk);
 
+#ifdef CONFIG_CKRM
+		ckrm_cb_listen_start(sk);
+#endif
+
 		return 0;
 	}
 
@@ -600,7 +626,18 @@
 	write_lock_bh(&tp->syn_wait_lock);
 	tp->listen_opt = NULL;
 	write_unlock_bh(&tp->syn_wait_lock);
-	tp->accept_queue = tp->accept_queue_tail = NULL;
+
+#ifdef CONFIG_CKRM
+		ckrm_cb_listen_stop(sk);
+#endif
+
+#ifdef CONFIG_ACCEPT_QUEUES
+	for (i = 0; i < NUM_ACCEPT_QUEUES; i++)
+		tp->acceptq[i].aq_head = tp->acceptq[i].aq_tail = NULL;
+#else
+	tp->accept_queue_tail = NULL;
+#endif
+	tp->accept_queue = NULL;
 
 	if (lopt->qlen) {
 		for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
@@ -646,7 +683,11 @@
 		local_bh_enable();
 		sock_put(child);
 
+#ifdef CONFIG_ACCEPT_QUEUES
+		tcp_acceptq_removed(sk, req->acceptq_class);
+#else
 		tcp_acceptq_removed(sk);
+#endif
 		tcp_openreq_fastfree(req);
 	}
 	BUG_TRAP(!sk->sk_ack_backlog);
@@ -2230,6 +2271,10 @@
 	struct open_request *req;
 	struct sock *newsk;
 	int error;
+#ifdef CONFIG_ACCEPT_QUEUES	
+	int prev_class = 0;
+	int first;
+#endif
 
 	lock_sock(sk);
 
@@ -2243,7 +2288,6 @@
 	/* Find already established connection */
 	if (!tp->accept_queue) {
 		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
-
 		/* If this is a non blocking socket don't sleep */
 		error = -EAGAIN;
 		if (!timeo)
@@ -2254,12 +2298,46 @@
 			goto out;
 	}
 
+#ifndef CONFIG_ACCEPT_QUEUES
 	req = tp->accept_queue;
 	if ((tp->accept_queue = req->dl_next) == NULL)
 		tp->accept_queue_tail = NULL;
 
- 	newsk = req->sk;
 	tcp_acceptq_removed(sk);
+#else
+	first = tp->class_index;
+	/* We should always have  request queued here. The accept_queue
+	 * is already checked for NULL above.
+	 */
+	while(!tp->acceptq[first].aq_head) {
+		tp->acceptq[first].aq_cnt = 0;
+		first = (first+1) & ~NUM_ACCEPT_QUEUES; 
+	}
+        req = tp->acceptq[first].aq_head;
+	tp->acceptq[first].aq_qcount--;
+	tp->acceptq[first].aq_count++;
+	tp->acceptq[first].aq_wait_time+=(jiffies - req->acceptq_time_stamp);
+
+	for (prev_class= first-1 ; prev_class >=0; prev_class--)
+		if (tp->acceptq[prev_class].aq_tail)
+			break;
+	if (prev_class>=0)
+		tp->acceptq[prev_class].aq_tail->dl_next = req->dl_next; 
+	else 
+		tp->accept_queue = req->dl_next;
+
+	if (req == tp->acceptq[first].aq_tail) 
+		tp->acceptq[first].aq_head = tp->acceptq[first].aq_tail = NULL;
+	else
+		tp->acceptq[first].aq_head = req->dl_next;
+
+	if((++(tp->acceptq[first].aq_cnt)) >= tp->acceptq[first].aq_ratio){
+		tp->acceptq[first].aq_cnt = 0;
+		tp->class_index = ++first & ~NUM_ACCEPT_QUEUES;
+	}	
+	tcp_acceptq_removed(sk, req->acceptq_class);
+#endif
+ 	newsk = req->sk;
 	tcp_openreq_fastfree(req);
 	BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
 	release_sock(sk);
@@ -2429,6 +2507,49 @@
 			}
 		}
 		break;
+		
+#ifdef CONFIG_ACCEPT_QUEUES
+	case TCP_ACCEPTQ_SHARE:
+		{
+			char share_wt[NUM_ACCEPT_QUEUES];
+			int i,j;
+
+			if (sk->sk_state != TCP_LISTEN)
+				return -EOPNOTSUPP;
+
+			if (copy_from_user(share_wt,optval, optlen)) {
+				err = -EFAULT;
+				break;
+			}
+			j = 0;
+			for (i = 0; i < NUM_ACCEPT_QUEUES; i++) {
+				if (share_wt[i]) {
+					if (!j)
+						j = share_wt[i];
+					else if (share_wt[i] < j) {
+						j = share_wt[i];
+					}
+					tp->acceptq[i].aq_valid = 1;
+				}
+				else
+					tp->acceptq[i].aq_valid = 0;
+					
+			}
+			if (j == 0) {
+				/* Class 0 is always valid. If nothing is 
+				 * specified set class 0 as 1.
+				 */
+				share_wt[0] = 1;
+				tp->acceptq[0].aq_valid = 1;
+				j = 1;
+			}
+			for (i=0; i < NUM_ACCEPT_QUEUES; i++)  {
+				tp->acceptq[i].aq_ratio = share_wt[i]/j;
+				tp->acceptq[i].aq_cnt = 0;
+			}
+		}
+		break;
+#endif
 
 	default:
 		err = -ENOPROTOOPT;
@@ -2555,6 +2676,41 @@
 	case TCP_QUICKACK:
 		val = !tp->ack.pingpong;
 		break;
+
+#ifdef CONFIG_ACCEPT_QUEUES
+	case TCP_ACCEPTQ_SHARE: {
+		struct tcp_acceptq_info tinfo[NUM_ACCEPT_QUEUES];
+		int i;
+
+		if (sk->sk_state != TCP_LISTEN)
+			return -EOPNOTSUPP;
+
+		if (get_user(len, optlen))
+			return -EFAULT;
+
+		memset(tinfo, 0, sizeof(tinfo));
+
+		for(i=0; i < NUM_ACCEPT_QUEUES; i++) {
+			tinfo[i].acceptq_wait_time = 
+				tp->acceptq[i].aq_wait_time/(HZ/USER_HZ);
+			tinfo[i].acceptq_qcount = tp->acceptq[i].aq_qcount;
+			tinfo[i].acceptq_count = tp->acceptq[i].aq_count;
+			if (tp->acceptq[i].aq_valid) 
+				tinfo[i].acceptq_shares=tp->acceptq[i].aq_ratio;
+			else
+				tinfo[i].acceptq_shares = 0;
+		}
+
+		len = min_t(unsigned int, len, sizeof(tinfo));
+		if (put_user(len, optlen)) 
+			return -EFAULT;
+			
+		if (copy_to_user(optval, (char *)tinfo, len))
+			return -EFAULT;
+		
+		return 0;
+	}
+#endif
 	default:
 		return -ENOPROTOOPT;
 	};
diff -Nru a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
--- a/net/ipv4/tcp_ipv4.c	Wed Apr 28 22:41:05 2004
+++ b/net/ipv4/tcp_ipv4.c	Wed Apr 28 22:41:05 2004
@@ -458,7 +458,6 @@
 	head = &tcp_listening_hash[tcp_lhashfn(hnum)];
 	if (!hlist_empty(head)) {
 		struct inet_opt *inet = inet_sk((sk = __sk_head(head)));
-
 		if (inet->num == hnum && !sk->sk_node.next &&
 		    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
 		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
@@ -916,7 +915,11 @@
 	lopt->syn_table[h] = req;
 	write_unlock(&tp->syn_wait_lock);
 
+#ifdef CONFIG_ACCEPT_QUEUES
+	tcp_synq_added(sk, req);
+#else
 	tcp_synq_added(sk);
+#endif
 }
 
 
@@ -1413,6 +1416,9 @@
 	__u32 daddr = skb->nh.iph->daddr;
 	__u32 isn = TCP_SKB_CB(skb)->when;
 	struct dst_entry *dst = NULL;
+#ifdef CONFIG_ACCEPT_QUEUES
+	int class = 0;
+#endif
 #ifdef CONFIG_SYN_COOKIES
 	int want_cookie = 0;
 #else
@@ -1437,12 +1443,32 @@
 		goto drop;
 	}
 
+#ifdef CONFIG_ACCEPT_QUEUES
+	class = (skb->nfmark <= 0) ? 0 :
+		((skb->nfmark > NUM_ACCEPT_QUEUES) ? NUM_ACCEPT_QUEUES:
+		 skb->nfmark);
+	/*
+	 * Accept only if the class has shares set or if the default class
+	 * i.e. class 0 has shares
+	 */
+	if (!(tcp_sk(sk)->acceptq[class].aq_valid)) {
+		if (tcp_sk(sk)->acceptq[0].aq_valid) 
+			class = 0;
+		else
+			goto drop;
+	}
+#endif
+
 	/* Accept backlog is full. If we have already queued enough
 	 * of warm entries in syn queue, drop request. It is better than
 	 * clogging syn queue with openreqs with exponentially increasing
 	 * timeout.
 	 */
+#ifdef CONFIG_ACCEPT_QUEUES
+	if (tcp_acceptq_is_full(sk, class) && tcp_synq_young(sk, class) > 1)
+#else
 	if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
+#endif
 		goto drop;
 
 	req = tcp_openreq_alloc();
@@ -1472,7 +1498,10 @@
 	tp.tstamp_ok = tp.saw_tstamp;
 
 	tcp_openreq_init(req, &tp, skb);
-
+#ifdef CONFIG_ACCEPT_QUEUES
+	req->acceptq_class = class;
+	req->acceptq_time_stamp = jiffies;
+#endif
 	req->af.v4_req.loc_addr = daddr;
 	req->af.v4_req.rmt_addr = saddr;
 	req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
@@ -1567,7 +1596,11 @@
 	struct tcp_opt *newtp;
 	struct sock *newsk;
 
+#ifdef CONFIG_ACCEPT_QUEUES
+	if (tcp_acceptq_is_full(sk, req->acceptq_class))
+#else
 	if (tcp_acceptq_is_full(sk))
+#endif
 		goto exit_overflow;
 
 	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
diff -Nru a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
--- a/net/ipv4/tcp_minisocks.c	Wed Apr 28 22:41:05 2004
+++ b/net/ipv4/tcp_minisocks.c	Wed Apr 28 22:41:05 2004
@@ -787,7 +787,14 @@
 		newtp->num_sacks = 0;
 		newtp->urg_data = 0;
 		newtp->listen_opt = NULL;
+#ifdef CONFIG_ACCEPT_QUEUES
+		newtp->accept_queue = NULL;
+		memset(newtp->acceptq, 0,sizeof(newtp->acceptq));
+		newtp->class_index = 0;
+
+#else
 		newtp->accept_queue = newtp->accept_queue_tail = NULL;
+#endif
 		/* Deinitialize syn_wait_lock to trap illegal accesses. */
 		memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock));
 
diff -Nru a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
--- a/net/ipv4/tcp_timer.c	Wed Apr 28 22:41:05 2004
+++ b/net/ipv4/tcp_timer.c	Wed Apr 28 22:41:05 2004
@@ -498,7 +498,16 @@
 	 * ones are about to clog our table.
 	 */
 	if (lopt->qlen>>(lopt->max_qlen_log-1)) {
+#ifdef CONFIG_ACCEPT_QUEUES
+		int young = 0;
+	       
+		for(i=0; i < NUM_ACCEPT_QUEUES; i++) 
+			young += lopt->qlen_young[i];
+		
+		young <<= 1;
+#else
 		int young = (lopt->qlen_young<<1);
+#endif
 
 		while (thresh > 2) {
 			if (lopt->qlen < young)
@@ -524,9 +533,12 @@
 					unsigned long timeo;
 
 					if (req->retrans++ == 0)
-						lopt->qlen_young--;
-					timeo = min((TCP_TIMEOUT_INIT << req->retrans),
-						    TCP_RTO_MAX);
+#ifdef CONFIG_ACCEPT_QUEUES
+			         		lopt->qlen_young[req->acceptq_class]--;
+#else
+			         		lopt->qlen_young--;
+#endif
+					timeo = min((TCP_TIMEOUT_INIT << req->retrans), TCP_RTO_MAX);
 					req->expires = now + timeo;
 					reqp = &req->dl_next;
 					continue;
@@ -538,7 +550,11 @@
 				write_unlock(&tp->syn_wait_lock);
 				lopt->qlen--;
 				if (req->retrans == 0)
-					lopt->qlen_young--;
+#ifdef CONFIG_ACCEPT_QUEUES
+			         		lopt->qlen_young[req->acceptq_class]--;
+#else
+			         		lopt->qlen_young--;
+#endif
 				tcp_openreq_free(req);
 				continue;
 			}

                 reply	other threads:[~2004-04-29  8:27 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4090BBD9.7050807@watson.ibm.com \
    --to=nagar@watson.ibm.com \
    --cc=ckrm-tech@lists.sourceforge.net \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox