Netdev List
 help / color / mirror / Atom feed
* [*v3 PATCH 14/22] IPVS: netns awareness to ip_vs_sync
From: hans @ 2010-12-30 10:50 UTC (permalink / raw)
  To: horms, ja, daniel.lezcano, wensong, lvs-devel, netdev,
	netfilter-devel
  Cc: Hans Schillstrom
In-Reply-To: <1293706266-27152-1-git-send-email-hans@schillstrom.com>

From: Hans Schillstrom <hans.schillstrom@ericsson.com>

All global variables in ip_vs_sync are moved to struct netns_ipvs, and
most external users are fixed up (i.e. init_net is removed).
In ip_vs_sync_buff_create_v0() the hard-coded "+ 4" is replaced by
sizeof(struct ip_vs_sync_mesg_v0).

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
---
 include/net/ip_vs.h             |   14 +-
 include/net/netns/ip_vs.h       |   16 ++
 net/netfilter/ipvs/ip_vs_core.c |   14 +-
 net/netfilter/ipvs/ip_vs_ctl.c  |   55 ++++---
 net/netfilter/ipvs/ip_vs_sync.c |  325 +++++++++++++++++++++------------------
 5 files changed, 235 insertions(+), 189 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 489c6ea..d7b1dcd 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -956,7 +956,7 @@ extern struct ip_vs_stats ip_vs_stats;
 extern const struct ctl_path net_vs_ctl_path[];
 extern int sysctl_ip_vs_sync_ver;
 
-extern void ip_vs_sync_switch_mode(int mode);
+extern void ip_vs_sync_switch_mode(struct net *net, int mode);
 extern struct ip_vs_service *
 ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
 		  const union nf_inet_addr *vaddr, __be16 vport);
@@ -985,14 +985,10 @@ extern struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp);
  *      IPVS sync daemon data and function prototypes
  *      (from ip_vs_sync.c)
  */
-extern volatile int ip_vs_sync_state;
-extern volatile int ip_vs_master_syncid;
-extern volatile int ip_vs_backup_syncid;
-extern char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
-extern char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
-extern int start_sync_thread(int state, char *mcast_ifn, __u8 syncid);
-extern int stop_sync_thread(int state);
-extern void ip_vs_sync_conn(struct ip_vs_conn *cp);
+extern int start_sync_thread(struct net *net, int state, char *mcast_ifn,
+			     __u8 syncid);
+extern int stop_sync_thread(struct net *net, int state);
+extern void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp);
 extern int ip_vs_sync_init(void);
 extern void ip_vs_sync_cleanup(void);
 
diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h
index 3da0eca..f6a6114 100644
--- a/include/net/netns/ip_vs.h
+++ b/include/net/netns/ip_vs.h
@@ -75,6 +75,22 @@ struct netns_ipvs {
 	spinlock_t		est_lock;
 	struct timer_list	est_timer;	/* Estimation timer */
 
+	/* ip_vs_sync */
+	struct list_head	sync_queue;
+	spinlock_t		sync_lock;
+	struct ip_vs_sync_buff  *sync_buff;
+	spinlock_t		sync_buff_lock;
+	struct sockaddr_in 	sync_mcast_addr;
+	struct task_struct 	*master_thread;
+	struct task_struct 	*backup_thread;
+	int 			send_mesg_maxlen;
+	int 			recv_mesg_maxlen;
+	volatile int 		sync_state;
+	volatile int 		master_syncid;
+	volatile int 		backup_syncid;
+	/* multicast interface name */
+	char 			master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
+	char 			backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
 };
 
 #endif /* IP_VS_H_ */
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 0454a11..5d6e250 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1477,6 +1477,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	struct ip_vs_proto_data *pd;
 	struct ip_vs_conn *cp;
 	int ret, restart, pkts;
+	struct net *net;
+	struct netns_ipvs *ipvs;
 
 	/* Already marked as IPVS request or reply? */
 	if (skb->ipvs_property)
@@ -1556,7 +1558,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	}
 
 	IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
-
+	net = skb_net(skb);
+	ipvs = net_ipvs(net);
 	/* Check the server status */
 	if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
 		/* the destination server is not available */
@@ -1589,12 +1592,13 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	 *
 	 * For ONE_PKT let ip_vs_sync_conn() do the filter work.
 	 */
+
 	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
 		pkts = sysctl_ip_vs_sync_threshold[0];
 	else
 		pkts = atomic_add_return(1, &cp->in_pkts);
 
-	if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
+	if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
 	    cp->protocol == IPPROTO_SCTP) {
 		if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
 			(pkts % sysctl_ip_vs_sync_threshold[1]
@@ -1603,13 +1607,13 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 				 ((cp->state == IP_VS_SCTP_S_CLOSED) ||
 				  (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) ||
 				  (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) {
-			ip_vs_sync_conn(cp);
+			ip_vs_sync_conn(net, cp);
 			goto out;
 		}
 	}
 
 	/* Keep this block last: TCP and others with pp->num_states <= 1 */
-	else if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
+	else if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
 	    (((cp->protocol != IPPROTO_TCP ||
 	       cp->state == IP_VS_TCP_S_ESTABLISHED) &&
 	      (pkts % sysctl_ip_vs_sync_threshold[1]
@@ -1619,7 +1623,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	       (cp->state == IP_VS_TCP_S_CLOSE) ||
 	       (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
 	       (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
-		ip_vs_sync_conn(cp);
+		ip_vs_sync_conn(net, cp);
 out:
 	cp->old_state = cp->state;
 
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 5b4da80..105c05f 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1559,7 +1559,8 @@ proc_do_sync_mode(ctl_table *table, int write,
 			/* Restore the correct value */
 			*valp = val;
 		} else {
-			ip_vs_sync_switch_mode(val);
+			struct net *net = current->nsproxy->net_ns;
+			ip_vs_sync_switch_mode(net, val);
 		}
 	}
 	return rc;
@@ -2174,11 +2175,12 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 		goto out_unlock;
 	} else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
 		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
-		ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
+		ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
+					dm->syncid);
 		goto out_unlock;
 	} else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
 		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
-		ret = stop_sync_thread(dm->state);
+		ret = stop_sync_thread(net, dm->state);
 		goto out_unlock;
 	}
 
@@ -2286,7 +2288,8 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
 }
 
 static inline int
-__ip_vs_get_service_entries(struct net *net, const struct ip_vs_get_services *get,
+__ip_vs_get_service_entries(struct net *net,
+			    const struct ip_vs_get_services *get,
 			    struct ip_vs_get_services __user *uptr)
 {
 	int idx, count=0;
@@ -2423,6 +2426,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 	int ret = 0;
 	unsigned int copylen;
 	struct net *net = sock_net(sk);
+	struct netns_ipvs *ipvs = net_ipvs(net);
 
 	BUG_ON(!net);
 	if (!capable(CAP_NET_ADMIN))
@@ -2544,15 +2548,17 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 		struct ip_vs_daemon_user d[2];
 
 		memset(&d, 0, sizeof(d));
-		if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
+		if (ipvs->sync_state & IP_VS_STATE_MASTER) {
 			d[0].state = IP_VS_STATE_MASTER;
-			strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
-			d[0].syncid = ip_vs_master_syncid;
+			strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
+				sizeof(d[0].mcast_ifn));
+			d[0].syncid = ipvs->master_syncid;
 		}
-		if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
+		if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
 			d[1].state = IP_VS_STATE_BACKUP;
-			strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
-			d[1].syncid = ip_vs_backup_syncid;
+			strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
+				sizeof(d[1].mcast_ifn));
+			d[1].syncid = ipvs->backup_syncid;
 		}
 		if (copy_to_user(user, &d, sizeof(d)) != 0)
 			ret = -EFAULT;
@@ -3059,20 +3065,23 @@ nla_put_failure:
 static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
 				   struct netlink_callback *cb)
 {
+	struct net *net = skb_net(skb);
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
 	mutex_lock(&__ip_vs_mutex);
-	if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
+	if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
 		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
-					   ip_vs_master_mcast_ifn,
-					   ip_vs_master_syncid, cb) < 0)
+					   ipvs->master_mcast_ifn,
+					   ipvs->master_syncid, cb) < 0)
 			goto nla_put_failure;
 
 		cb->args[0] = 1;
 	}
 
-	if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
+	if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
 		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
-					   ip_vs_backup_mcast_ifn,
-					   ip_vs_backup_syncid, cb) < 0)
+					   ipvs->backup_mcast_ifn,
+					   ipvs->backup_syncid, cb) < 0)
 			goto nla_put_failure;
 
 		cb->args[1] = 1;
@@ -3084,24 +3093,26 @@ nla_put_failure:
 	return skb->len;
 }
 
-static int ip_vs_genl_new_daemon(struct nlattr **attrs)
+static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
 {
 	if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
 	      attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
 	      attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
 		return -EINVAL;
 
-	return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
+	return start_sync_thread(net,
+				 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
 				 nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
 				 nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
 }
 
-static int ip_vs_genl_del_daemon(struct nlattr **attrs)
+static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
 {
 	if (!attrs[IPVS_DAEMON_ATTR_STATE])
 		return -EINVAL;
 
-	return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
+	return stop_sync_thread(net,
+				nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
 }
 
 static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
@@ -3157,9 +3168,9 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
 		}
 
 		if (cmd == IPVS_CMD_NEW_DAEMON)
-			ret = ip_vs_genl_new_daemon(daemon_attrs);
+			ret = ip_vs_genl_new_daemon(net, daemon_attrs);
 		else
-			ret = ip_vs_genl_del_daemon(daemon_attrs);
+			ret = ip_vs_genl_del_daemon(net, daemon_attrs);
 		goto out;
 	} else if (cmd == IPVS_CMD_ZERO &&
 		   !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 8c8bb4c..29c6bbb 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -192,6 +192,7 @@ union ip_vs_sync_conn {
 #define IPVS_OPT_F_PARAM	(1 << (IPVS_OPT_PARAM-1))
 
 struct ip_vs_sync_thread_data {
+	struct net *net;
 	struct socket *sock;
 	char *buf;
 };
@@ -259,10 +260,6 @@ struct ip_vs_sync_mesg {
 	/* ip_vs_sync_conn entries start here */
 };
 
-/* the maximum length of sync (sending/receiving) message */
-static int sync_send_mesg_maxlen;
-static int sync_recv_mesg_maxlen;
-
 struct ip_vs_sync_buff {
 	struct list_head        list;
 	unsigned long           firstuse;
@@ -273,28 +270,6 @@ struct ip_vs_sync_buff {
 	unsigned char           *end;
 };
 
-
-/* the sync_buff list head and the lock */
-static LIST_HEAD(ip_vs_sync_queue);
-static DEFINE_SPINLOCK(ip_vs_sync_lock);
-
-/* current sync_buff for accepting new conn entries */
-static struct ip_vs_sync_buff   *curr_sb = NULL;
-static DEFINE_SPINLOCK(curr_sb_lock);
-
-/* ipvs sync daemon state */
-volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
-volatile int ip_vs_master_syncid = 0;
-volatile int ip_vs_backup_syncid = 0;
-
-/* multicast interface name */
-char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
-char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
-
-/* sync daemon tasks */
-static struct task_struct *sync_master_thread;
-static struct task_struct *sync_backup_thread;
-
 /* multicast addr */
 static struct sockaddr_in mcast_addr = {
 	.sin_family		= AF_INET,
@@ -324,20 +299,20 @@ static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no)
 	put_unaligned_be32(ho->previous_delta, &no->previous_delta);
 }
 
-static inline struct ip_vs_sync_buff *sb_dequeue(void)
+static inline struct ip_vs_sync_buff *sb_dequeue(struct netns_ipvs *ipvs)
 {
 	struct ip_vs_sync_buff *sb;
 
-	spin_lock_bh(&ip_vs_sync_lock);
-	if (list_empty(&ip_vs_sync_queue)) {
+	spin_lock_bh(&ipvs->sync_lock);
+	if (list_empty(&ipvs->sync_queue)) {
 		sb = NULL;
 	} else {
-		sb = list_entry(ip_vs_sync_queue.next,
+		sb = list_entry(ipvs->sync_queue.next,
 				struct ip_vs_sync_buff,
 				list);
 		list_del(&sb->list);
 	}
-	spin_unlock_bh(&ip_vs_sync_lock);
+	spin_unlock_bh(&ipvs->sync_lock);
 
 	return sb;
 }
@@ -345,25 +320,26 @@ static inline struct ip_vs_sync_buff *sb_dequeue(void)
 /*
  * Create a new sync buffer for Version 1 proto.
  */
-static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
+static inline struct ip_vs_sync_buff *
+ip_vs_sync_buff_create(struct netns_ipvs *ipvs)
 {
 	struct ip_vs_sync_buff *sb;
 
 	if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
 		return NULL;
 
-	if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
+	if (!(sb->mesg=kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC))) {
 		kfree(sb);
 		return NULL;
 	}
 	sb->mesg->reserved = 0;  /* old nr_conns i.e. must be zeo now */
 	sb->mesg->version = SYNC_PROTO_VER;
-	sb->mesg->syncid = ip_vs_master_syncid;
+	sb->mesg->syncid = ipvs->master_syncid;
 	sb->mesg->size = sizeof(struct ip_vs_sync_mesg);
 	sb->mesg->nr_conns = 0;
 	sb->mesg->spare = 0;
 	sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg);
-	sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
+	sb->end = (unsigned char *)sb->mesg + ipvs->send_mesg_maxlen;
 
 	sb->firstuse = jiffies;
 	return sb;
@@ -375,14 +351,16 @@ static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
 	kfree(sb);
 }
 
-static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
+static inline void sb_queue_tail(struct netns_ipvs *ipvs)
 {
-	spin_lock(&ip_vs_sync_lock);
-	if (ip_vs_sync_state & IP_VS_STATE_MASTER)
-		list_add_tail(&sb->list, &ip_vs_sync_queue);
+	struct ip_vs_sync_buff *sb = ipvs->sync_buff;
+
+	spin_lock(&ipvs->sync_lock);
+	if (ipvs->sync_state & IP_VS_STATE_MASTER)
+		list_add_tail(&sb->list, &ipvs->sync_queue);
 	else
 		ip_vs_sync_buff_release(sb);
-	spin_unlock(&ip_vs_sync_lock);
+	spin_unlock(&ipvs->sync_lock);
 }
 
 /*
@@ -390,18 +368,18 @@ static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
  *	than the specified time or the specified time is zero.
  */
 static inline struct ip_vs_sync_buff *
-get_curr_sync_buff(unsigned long time)
+get_curr_sync_buff(struct netns_ipvs *ipvs, unsigned long time)
 {
 	struct ip_vs_sync_buff *sb;
 
-	spin_lock_bh(&curr_sb_lock);
-	if (curr_sb && (time == 0 ||
-			time_before(jiffies - curr_sb->firstuse, time))) {
-		sb = curr_sb;
-		curr_sb = NULL;
+	spin_lock_bh(&ipvs->sync_buff_lock);
+	if (ipvs->sync_buff && (time == 0 ||
+	    time_before(jiffies - ipvs->sync_buff->firstuse, time))) {
+		sb = ipvs->sync_buff;
+		ipvs->sync_buff = NULL;
 	} else
 		sb = NULL;
-	spin_unlock_bh(&curr_sb_lock);
+	spin_unlock_bh(&ipvs->sync_buff_lock);
 	return sb;
 }
 
@@ -409,33 +387,37 @@ get_curr_sync_buff(unsigned long time)
  * Switch mode from sending version 0 or 1
  *  - must handle sync_buf
  */
-void ip_vs_sync_switch_mode(int mode) {
+void ip_vs_sync_switch_mode(struct net *net, int mode)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
 
-	if (!ip_vs_sync_state & IP_VS_STATE_MASTER)
+	if (!ipvs->sync_state & IP_VS_STATE_MASTER)
 		return;
-	if (mode == sysctl_ip_vs_sync_ver || !curr_sb)
+	if (mode == sysctl_ip_vs_sync_ver || !ipvs->sync_buff)
 		return;
 
-	spin_lock_bh(&curr_sb_lock);
+	spin_lock_bh(&ipvs->sync_buff_lock);
 	/* Buffer empty ? then let buf_create do the job  */
-	if ( curr_sb->mesg->size <=  sizeof(struct ip_vs_sync_mesg)) {
-		kfree(curr_sb);
-		curr_sb = NULL;
+	if ( ipvs->sync_buff->mesg->size <=  sizeof(struct ip_vs_sync_mesg)) {
+		kfree(ipvs->sync_buff);
+		ipvs->sync_buff = NULL;
 	} else {
-		spin_lock_bh(&ip_vs_sync_lock);
-		if (ip_vs_sync_state & IP_VS_STATE_MASTER)
-			list_add_tail(&curr_sb->list, &ip_vs_sync_queue);
+		spin_lock_bh(&ipvs->sync_lock);
+		if (ipvs->sync_state & IP_VS_STATE_MASTER)
+			list_add_tail(&ipvs->sync_buff->list,
+				      &ipvs->sync_queue);
 		else
-			ip_vs_sync_buff_release(curr_sb);
-		spin_unlock_bh(&ip_vs_sync_lock);
+			ip_vs_sync_buff_release(ipvs->sync_buff);
+		spin_unlock_bh(&ipvs->sync_lock);
 	}
-	spin_unlock_bh(&curr_sb_lock);
+	spin_unlock_bh(&ipvs->sync_buff_lock);
 }
 
 /*
  * Create a new sync buffer for Version 0 proto.
  */
-static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create_v0(void)
+static inline struct ip_vs_sync_buff *
+ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs)
 {
 	struct ip_vs_sync_buff *sb;
 	struct ip_vs_sync_mesg_v0 *mesg;
@@ -443,16 +425,16 @@ static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create_v0(void)
 	if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
 		return NULL;
 
-	if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
+	if (!(sb->mesg=kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC))) {
 		kfree(sb);
 		return NULL;
 	}
 	mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg;
 	mesg->nr_conns = 0;
-	mesg->syncid = ip_vs_master_syncid;
-	mesg->size = 4;
-	sb->head = (unsigned char *)mesg + 4;
-	sb->end = (unsigned char *)mesg + sync_send_mesg_maxlen;
+	mesg->syncid = ipvs->master_syncid;
+	mesg->size = sizeof(struct ip_vs_sync_mesg_v0);
+	sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0);
+	sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen;
 	sb->firstuse = jiffies;
 	return sb;
 }
@@ -461,8 +443,9 @@ static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create_v0(void)
  *      Version 0 , could be switched in by sys_ctl.
  *      Add an ip_vs_conn information into the current sync_buff.
  */
-void ip_vs_sync_conn_v0(struct ip_vs_conn *cp)
+void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp)
 {
+	struct netns_ipvs *ipvs = net_ipvs(net);
 	struct ip_vs_sync_mesg_v0 *m;
 	struct ip_vs_sync_conn_v0 *s;
 	int len;
@@ -473,10 +456,12 @@ void ip_vs_sync_conn_v0(struct ip_vs_conn *cp)
 	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
 		return;
 
-	spin_lock(&curr_sb_lock);
-	if (!curr_sb) {
-		if (!(curr_sb=ip_vs_sync_buff_create_v0())) {
-			spin_unlock(&curr_sb_lock);
+	spin_lock(&ipvs->sync_buff_lock);
+	if (!ipvs->sync_buff) {
+		ipvs->sync_buff =
+			ip_vs_sync_buff_create_v0(ipvs);
+		if (!ipvs->sync_buff) {
+			spin_unlock(&ipvs->sync_buff_lock);
 			pr_err("ip_vs_sync_buff_create failed.\n");
 			return;
 		}
@@ -484,8 +469,8 @@ void ip_vs_sync_conn_v0(struct ip_vs_conn *cp)
 
 	len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
 		SIMPLE_CONN_SIZE;
-	m = (struct ip_vs_sync_mesg_v0 *)curr_sb->mesg;
-	s = (struct ip_vs_sync_conn_v0 *)curr_sb->head;
+	m = (struct ip_vs_sync_mesg_v0 *)ipvs->sync_buff->mesg;
+	s = (struct ip_vs_sync_conn_v0 *)ipvs->sync_buff->head;
 
 	/* copy members */
 	s->reserved = 0;
@@ -506,18 +491,18 @@ void ip_vs_sync_conn_v0(struct ip_vs_conn *cp)
 
 	m->nr_conns++;
 	m->size += len;
-	curr_sb->head += len;
+	ipvs->sync_buff->head += len;
 
 	/* check if there is a space for next one */
-	if (curr_sb->head + FULL_CONN_SIZE > curr_sb->end) {
-		sb_queue_tail(curr_sb);
-		curr_sb = NULL;
+	if (ipvs->sync_buff->head + FULL_CONN_SIZE > ipvs->sync_buff->end) {
+		sb_queue_tail(ipvs);
+		ipvs->sync_buff = NULL;
 	}
-	spin_unlock(&curr_sb_lock);
+	spin_unlock(&ipvs->sync_buff_lock);
 
 	/* synchronize its controller if it has */
 	if (cp->control)
-		ip_vs_sync_conn(cp->control);
+		ip_vs_sync_conn(net, cp->control);
 }
 
 /*
@@ -525,8 +510,9 @@ void ip_vs_sync_conn_v0(struct ip_vs_conn *cp)
  *      Called by ip_vs_in.
  *      Sending Version 1 messages
  */
-void ip_vs_sync_conn(struct ip_vs_conn *cp)
+void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp)
 {
+	struct netns_ipvs *ipvs = net_ipvs(net);
 	struct ip_vs_sync_mesg *m;
 	union ip_vs_sync_conn *s;
 	__u8 *p;
@@ -534,7 +520,7 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp)
 
 	/* Handle old version of the protocol */
 	if (sysctl_ip_vs_sync_ver == 0) {
-		ip_vs_sync_conn_v0(cp);
+		ip_vs_sync_conn_v0(net, cp);
 		return;
 	}
 	/* Do not sync ONE PACKET */
@@ -551,7 +537,7 @@ sloop:
 		pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN);
 	}
 
-	spin_lock(&curr_sb_lock);
+	spin_lock(&ipvs->sync_buff_lock);
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (cp->af == AF_INET6)
@@ -570,26 +556,26 @@ sloop:
 
 	/* check if there is a space for this one  */
 	pad = 0;
-	if (curr_sb) {
-		pad = (4 - (size_t)curr_sb->head) & 3;
-		if (curr_sb->head + len + pad > curr_sb->end) {
-			sb_queue_tail(curr_sb);
-			curr_sb = NULL;
+	if (ipvs->sync_buff) {
+		pad = (4 - (size_t)ipvs->sync_buff->head) & 3;
+		if (ipvs->sync_buff->head + len + pad > ipvs->sync_buff->end) {
+			sb_queue_tail(ipvs);
+			ipvs->sync_buff = NULL;
 			pad = 0;
 		}
 	}
 
-	if (!curr_sb) {
-		if (!(curr_sb=ip_vs_sync_buff_create())) {
-			spin_unlock(&curr_sb_lock);
+	if (!ipvs->sync_buff) {
+		if (!(ipvs->sync_buff=ip_vs_sync_buff_create(ipvs))) {
+			spin_unlock(&ipvs->sync_buff_lock);
 			pr_err("ip_vs_sync_buff_create failed.\n");
 			return;
 		}
 	}
 
-	m = curr_sb->mesg;
-	p = curr_sb->head;
-	curr_sb->head += pad + len;
+	m = ipvs->sync_buff->mesg;
+	p = ipvs->sync_buff->head;
+	ipvs->sync_buff->head += pad + len;
 	m->size += pad + len;
 	/* Add ev. padding from prev. sync_conn */
 	while (pad--)
@@ -647,7 +633,7 @@ sloop:
 		}
 	}
 
-	spin_unlock(&curr_sb_lock);
+	spin_unlock(&ipvs->sync_buff_lock);
 
 control:
 	/* synchronize its controller if it has */
@@ -699,7 +685,8 @@ ip_vs_conn_fill_param_sync(int af, union ip_vs_sync_conn *sc,
 			buff[pe_name_len]=0;
 			p->pe = __ip_vs_pe_getbyname(buff);
 			if (!p->pe) {
-				IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n", buff);
+				IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n",
+					     buff);
 				return 1;
 			}
 		} else {
@@ -748,7 +735,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
 		 * If it is not found the connection will remain unbound
 		 * but still handled.
 		 */
-		dest = ip_vs_find_dest(&init_net, type, daddr, dport, param->vaddr,
+		dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr,
 				       param->vport, protocol, fwmark);
 
 		/*  Set the approprite ativity flag */
@@ -1089,6 +1076,7 @@ out:
 static void ip_vs_process_message(struct net *net, __u8 *buffer,
 				  const size_t buflen)
 {
+	struct netns_ipvs *ipvs = net_ipvs(net);
 	struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer;
 	__u8 *p, *msg_end;
 	int i, nr_conns;
@@ -1105,7 +1093,7 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer,
 		return;
 	}
 	/* SyncID sanity check */
-	if (ip_vs_backup_syncid != 0 && m2->syncid != ip_vs_backup_syncid) {
+	if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) {
 		IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid);
 		return;
 	}
@@ -1189,8 +1177,9 @@ static int set_mcast_if(struct sock *sk, char *ifname)
 {
 	struct net_device *dev;
 	struct inet_sock *inet = inet_sk(sk);
+	struct net *net = sock_net(sk);
 
-	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+	if ((dev = __dev_get_by_name(net, ifname)) == NULL)
 		return -ENODEV;
 
 	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
@@ -1209,30 +1198,33 @@ static int set_mcast_if(struct sock *sk, char *ifname)
  *	Set the maximum length of sync message according to the
  *	specified interface's MTU.
  */
-static int set_sync_mesg_maxlen(int sync_state)
+static int set_sync_mesg_maxlen(struct net *net, int sync_state)
 {
+	struct netns_ipvs *ipvs = net_ipvs(net);
 	struct net_device *dev;
 	int num;
 
 	if (sync_state == IP_VS_STATE_MASTER) {
-		if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL)
+		dev = __dev_get_by_name(net, ipvs->master_mcast_ifn);
+		if (!dev)
 			return -ENODEV;
 
 		num = (dev->mtu - sizeof(struct iphdr) -
 		       sizeof(struct udphdr) -
 		       SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
-		sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
+		ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
 			SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
 		IP_VS_DBG(7, "setting the maximum length of sync sending "
-			  "message %d.\n", sync_send_mesg_maxlen);
+			  "message %d.\n", ipvs->send_mesg_maxlen);
 	} else if (sync_state == IP_VS_STATE_BACKUP) {
-		if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL)
+		dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn);
+		if (!dev)
 			return -ENODEV;
 
-		sync_recv_mesg_maxlen = dev->mtu -
+		ipvs->recv_mesg_maxlen = dev->mtu -
 			sizeof(struct iphdr) - sizeof(struct udphdr);
 		IP_VS_DBG(7, "setting the maximum length of sync receiving "
-			  "message %d.\n", sync_recv_mesg_maxlen);
+			  "message %d.\n", ipvs->recv_mesg_maxlen);
 	}
 
 	return 0;
@@ -1247,6 +1239,7 @@ static int set_sync_mesg_maxlen(int sync_state)
 static int
 join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
 {
+	struct net *net = sock_net(sk);
 	struct ip_mreqn mreq;
 	struct net_device *dev;
 	int ret;
@@ -1254,7 +1247,7 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
 	memset(&mreq, 0, sizeof(mreq));
 	memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
 
-	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+	if ((dev = __dev_get_by_name(net, ifname)) == NULL)
 		return -ENODEV;
 	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
 		return -EINVAL;
@@ -1271,11 +1264,12 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
 
 static int bind_mcastif_addr(struct socket *sock, char *ifname)
 {
+	struct net *net = sock_net(sock->sk);
 	struct net_device *dev;
 	__be32 addr;
 	struct sockaddr_in sin;
 
-	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+	if ((dev = __dev_get_by_name(net, ifname)) == NULL)
 		return -ENODEV;
 
 	addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
@@ -1297,8 +1291,9 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname)
 /*
  *      Set up sending multicast socket over UDP
  */
-static struct socket * make_send_sock(void)
+static struct socket * make_send_sock(struct net *net)
 {
+	struct netns_ipvs *ipvs = net_ipvs(net);
 	struct socket *sock;
 	int result;
 
@@ -1309,7 +1304,7 @@ static struct socket * make_send_sock(void)
 		return ERR_PTR(result);
 	}
 
-	result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn);
+	result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn);
 	if (result < 0) {
 		pr_err("Error setting outbound mcast interface\n");
 		goto error;
@@ -1318,7 +1313,7 @@ static struct socket * make_send_sock(void)
 	set_mcast_loop(sock->sk, 0);
 	set_mcast_ttl(sock->sk, 1);
 
-	result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn);
+	result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn);
 	if (result < 0) {
 		pr_err("Error binding address of the mcast interface\n");
 		goto error;
@@ -1342,8 +1337,9 @@ static struct socket * make_send_sock(void)
 /*
  *      Set up receiving multicast socket over UDP
  */
-static struct socket * make_receive_sock(void)
+static struct socket * make_receive_sock(struct net *net)
 {
+	struct netns_ipvs *ipvs = net_ipvs(net);
 	struct socket *sock;
 	int result;
 
@@ -1367,7 +1363,7 @@ static struct socket * make_receive_sock(void)
 	/* join the multicast group */
 	result = join_mcast_group(sock->sk,
 			(struct in_addr *) &mcast_addr.sin_addr,
-			ip_vs_backup_mcast_ifn);
+			ipvs->backup_mcast_ifn);
 	if (result < 0) {
 		pr_err("Error joining to the multicast group\n");
 		goto error;
@@ -1438,20 +1434,21 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
 static int sync_thread_master(void *data)
 {
 	struct ip_vs_sync_thread_data *tinfo = data;
+	struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
 	struct ip_vs_sync_buff *sb;
 
 	pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
 		"syncid = %d\n",
-		ip_vs_master_mcast_ifn, ip_vs_master_syncid);
+		ipvs->master_mcast_ifn, ipvs->master_syncid);
 
 	while (!kthread_should_stop()) {
-		while ((sb = sb_dequeue())) {
+		while ((sb = sb_dequeue(ipvs))) {
 			ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
 			ip_vs_sync_buff_release(sb);
 		}
 
-		/* check if entries stay in curr_sb for 2 seconds */
-		sb = get_curr_sync_buff(2 * HZ);
+		/* check if entries stay in ipvs->sync_buff for 2 seconds */
+		sb = get_curr_sync_buff(ipvs, 2 * HZ);
 		if (sb) {
 			ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
 			ip_vs_sync_buff_release(sb);
@@ -1461,12 +1458,12 @@ static int sync_thread_master(void *data)
 	}
 
 	/* clean up the sync_buff queue */
-	while ((sb=sb_dequeue())) {
+	while ((sb=sb_dequeue(ipvs))) {
 		ip_vs_sync_buff_release(sb);
 	}
 
 	/* clean up the current sync_buff */
-	if ((sb = get_curr_sync_buff(0))) {
+	if ((sb = get_curr_sync_buff(ipvs, 0))) {
 		ip_vs_sync_buff_release(sb);
 	}
 
@@ -1481,11 +1478,12 @@ static int sync_thread_master(void *data)
 static int sync_thread_backup(void *data)
 {
 	struct ip_vs_sync_thread_data *tinfo = data;
+	struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
 	int len;
 
 	pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
 		"syncid = %d\n",
-		ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
+		ipvs->backup_mcast_ifn, ipvs->backup_syncid);
 
 	while (!kthread_should_stop()) {
 		wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
@@ -1495,7 +1493,7 @@ static int sync_thread_backup(void *data)
 		/* do we have data now? */
 		while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
 			len = ip_vs_receive(tinfo->sock, tinfo->buf,
-					sync_recv_mesg_maxlen);
+					ipvs->recv_mesg_maxlen);
 			if (len <= 0) {
 				pr_err("receiving message error\n");
 				break;
@@ -1504,7 +1502,7 @@ static int sync_thread_backup(void *data)
 			/* disable bottom half, because it accesses the data
 			   shared by softirq while getting/creating conns */
 			local_bh_disable();
-			ip_vs_process_message(&init_net, tinfo->buf, len);
+			ip_vs_process_message(tinfo->net, tinfo->buf, len);
 			local_bh_enable();
 		}
 	}
@@ -1518,11 +1516,12 @@ static int sync_thread_backup(void *data)
 }
 
 
-int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
+int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
 {
 	struct ip_vs_sync_thread_data *tinfo;
 	struct task_struct **realtask, *task;
 	struct socket *sock;
+	struct netns_ipvs *ipvs = net_ipvs(net);
 	char *name, *buf = NULL;
 	int (*threadfn)(void *data);
 	int result = -ENOMEM;
@@ -1532,27 +1531,27 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
 		  sizeof(struct ip_vs_sync_conn_v0));
 
 	if (state == IP_VS_STATE_MASTER) {
-		if (sync_master_thread)
+		if (ipvs->master_thread)
 			return -EEXIST;
 
-		strlcpy(ip_vs_master_mcast_ifn, mcast_ifn,
-			sizeof(ip_vs_master_mcast_ifn));
-		ip_vs_master_syncid = syncid;
-		realtask = &sync_master_thread;
-		name = "ipvs_syncmaster";
+		strlcpy(ipvs->master_mcast_ifn, mcast_ifn,
+			sizeof(ipvs->master_mcast_ifn));
+		ipvs->master_syncid = syncid;
+		realtask = &ipvs->master_thread;
+		name = "ipvs_master:%d";
 		threadfn = sync_thread_master;
-		sock = make_send_sock();
+		sock = make_send_sock(net);
 	} else if (state == IP_VS_STATE_BACKUP) {
-		if (sync_backup_thread)
+		if (ipvs->backup_thread)
 			return -EEXIST;
 
-		strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn,
-			sizeof(ip_vs_backup_mcast_ifn));
-		ip_vs_backup_syncid = syncid;
-		realtask = &sync_backup_thread;
-		name = "ipvs_syncbackup";
+		strlcpy(ipvs->backup_mcast_ifn, mcast_ifn,
+			sizeof(ipvs->backup_mcast_ifn));
+		ipvs->backup_syncid = syncid;
+		realtask = &ipvs->backup_thread;
+		name = "ipvs_backup:%d";
 		threadfn = sync_thread_backup;
-		sock = make_receive_sock();
+		sock = make_receive_sock(net);
 	} else {
 		return -EINVAL;
 	}
@@ -1562,9 +1561,9 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
 		goto out;
 	}
 
-	set_sync_mesg_maxlen(state);
+	set_sync_mesg_maxlen(net, state);
 	if (state == IP_VS_STATE_BACKUP) {
-		buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL);
+		buf = kmalloc(ipvs->recv_mesg_maxlen, GFP_KERNEL);
 		if (!buf)
 			goto outsocket;
 	}
@@ -1573,10 +1572,11 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
 	if (!tinfo)
 		goto outbuf;
 
+	tinfo->net = net;
 	tinfo->sock = sock;
 	tinfo->buf = buf;
 
-	task = kthread_run(threadfn, tinfo, name);
+	task = kthread_run(threadfn, tinfo, name, ipvs->inc);
 	if (IS_ERR(task)) {
 		result = PTR_ERR(task);
 		goto outtinfo;
@@ -1584,7 +1584,7 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
 
 	/* mark as active */
 	*realtask = task;
-	ip_vs_sync_state |= state;
+	ipvs->sync_state |= state;
 
 	/* increase the module use count */
 	ip_vs_use_count_inc();
@@ -1602,16 +1602,18 @@ out:
 }
 
 
-int stop_sync_thread(int state)
+int stop_sync_thread(struct net *net, int state)
 {
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
 	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
 
 	if (state == IP_VS_STATE_MASTER) {
-		if (!sync_master_thread)
+		if (!ipvs->master_thread)
 			return -ESRCH;
 
 		pr_info("stopping master sync thread %d ...\n",
-			task_pid_nr(sync_master_thread));
+			task_pid_nr(ipvs->master_thread));
 
 		/*
 		 * The lock synchronizes with sb_queue_tail(), so that we don't
@@ -1619,21 +1621,21 @@ int stop_sync_thread(int state)
 		 * progress of stopping the master sync daemon.
 		 */
 
-		spin_lock_bh(&ip_vs_sync_lock);
-		ip_vs_sync_state &= ~IP_VS_STATE_MASTER;
-		spin_unlock_bh(&ip_vs_sync_lock);
-		kthread_stop(sync_master_thread);
-		sync_master_thread = NULL;
+		spin_lock_bh(&ipvs->sync_lock);
+		ipvs->sync_state &= ~IP_VS_STATE_MASTER;
+		spin_unlock_bh(&ipvs->sync_lock);
+		kthread_stop(ipvs->master_thread);
+		ipvs->master_thread = NULL;
 	} else if (state == IP_VS_STATE_BACKUP) {
-		if (!sync_backup_thread)
+		if (!ipvs->backup_thread)
 			return -ESRCH;
 
 		pr_info("stopping backup sync thread %d ...\n",
-			task_pid_nr(sync_backup_thread));
+			task_pid_nr(ipvs->backup_thread));
 
-		ip_vs_sync_state &= ~IP_VS_STATE_BACKUP;
-		kthread_stop(sync_backup_thread);
-		sync_backup_thread = NULL;
+		ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
+		kthread_stop(ipvs->backup_thread);
+		ipvs->backup_thread = NULL;
 	} else {
 		return -EINVAL;
 	}
@@ -1649,13 +1651,30 @@ int stop_sync_thread(int state)
  */
 static int __net_init __ip_vs_sync_init(struct net *net)
 {
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	if (!net_eq(net, &init_net))	/* netns not enabled yet */
+		return -EPERM;
+
+	INIT_LIST_HEAD(&ipvs->sync_queue);
+	spin_lock_init(&ipvs->sync_lock);
+	spin_lock_init(&ipvs->sync_buff_lock);
+
+	ipvs->sync_mcast_addr.sin_family = AF_INET;
+	ipvs->sync_mcast_addr.sin_port = cpu_to_be16(IP_VS_SYNC_PORT);
+	ipvs->sync_mcast_addr.sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP);
 	return 0;
 }
 
 static void __ip_vs_sync_cleanup(struct net *net)
 {
+	if (!net_eq(net, &init_net))	/* netns not enabled yet */
+		return;
+	stop_sync_thread(net, IP_VS_STATE_MASTER);
+	stop_sync_thread(net, IP_VS_STATE_BACKUP);
 	return;
 }
+
 static struct pernet_operations ipvs_sync_ops = {
 	.init = __ip_vs_sync_init,
 	.exit = __ip_vs_sync_cleanup,
-- 
1.7.2.3


^ permalink raw reply related

* [*v3 PATCH 11/22] IPVS: netns, common protocol changes and use of appcnt.
From: hans @ 2010-12-30 10:50 UTC (permalink / raw)
  To: horms, ja, daniel.lezcano, wensong, lvs-devel, netdev,
	netfilter-devel
  Cc: Hans Schillstrom
In-Reply-To: <1293706266-27152-1-git-send-email-hans@schillstrom.com>

From: Hans Schillstrom <hans.schillstrom@ericsson.com>

appcnt and timeout_table are moved from struct ip_vs_protocol to
struct ip_vs_proto_data.

struct net *net added as first param to
 - register_app()
 - unregister_app()
 - app_conn_bind()
 - ip_vs_conn_new()

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
---
 include/net/ip_vs.h                   |    2 -
 net/netfilter/ipvs/ip_vs_conn.c       |    6 ++--
 net/netfilter/ipvs/ip_vs_proto_sctp.c |    4 +-
 net/netfilter/ipvs/ip_vs_proto_tcp.c  |    5 +--
 net/netfilter/ipvs/ip_vs_proto_udp.c  |    4 +-
 net/netfilter/ipvs/ip_vs_sync.c       |   56 +++++++++++++++++---------------
 6 files changed, 39 insertions(+), 38 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 135a163..fde0bca 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -358,8 +358,6 @@ struct ip_vs_protocol {
 	u16			protocol;
 	u16			num_states;
 	int			dont_defrag;
-	atomic_t		appcnt;		/* counter of proto app incs */
-	int			*timeout_table;	/* protocol timeout table */
 
 	void (*init)(struct ip_vs_protocol *pp);
 
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index a7aba6a..b2024c9 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -804,7 +804,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
 	       struct ip_vs_dest *dest, __u32 fwmark)
 {
 	struct ip_vs_conn *cp;
-	struct ip_vs_protocol *pp = ip_vs_proto_get(p->protocol);
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, p->protocol);
 
 	cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
 	if (cp == NULL) {
@@ -863,8 +863,8 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
 #endif
 		ip_vs_bind_xmit(cp);
 
-	if (unlikely(pp && atomic_read(&pp->appcnt)))
-		ip_vs_bind_app(cp, pp);
+	if (unlikely(pd && atomic_read(&pd->appcnt)))
+		ip_vs_bind_app(cp, pd->pp);
 
 	/*
 	 * Allow conntrack to be preserved. By default, conntrack
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index f363f66..ad44205 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -1035,7 +1035,7 @@ static int sctp_register_app(struct ip_vs_app *inc)
 		}
 	}
 	list_add(&inc->p_list, &ipvs->sctp_apps[hash]);
-	atomic_inc(&pd->pp->appcnt);
+	atomic_inc(&pd->appcnt);
 out:
 	spin_unlock_bh(&ipvs->sctp_app_lock);
 
@@ -1048,7 +1048,7 @@ static void sctp_unregister_app(struct ip_vs_app *inc)
 	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, IPPROTO_SCTP);
 
 	spin_lock_bh(&ipvs->sctp_app_lock);
-	atomic_dec(&pd->pp->appcnt);
+	atomic_dec(&pd->appcnt);
 	list_del(&inc->p_list);
 	spin_unlock_bh(&ipvs->sctp_app_lock);
 }
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 830dc3e..83bcda3 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -595,7 +595,7 @@ static int tcp_register_app(struct ip_vs_app *inc)
 		}
 	}
 	list_add(&inc->p_list, &ipvs->tcp_apps[hash]);
-	atomic_inc(&pd->pp->appcnt);
+	atomic_inc(&pd->appcnt);
 
   out:
 	spin_unlock_bh(&ipvs->tcp_app_lock);
@@ -610,7 +610,7 @@ tcp_unregister_app(struct ip_vs_app *inc)
 	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, IPPROTO_TCP);
 
 	spin_lock_bh(&ipvs->tcp_app_lock);
-	atomic_dec(&pd->pp->appcnt);
+	atomic_dec(&pd->appcnt);
 	list_del(&inc->p_list);
 	spin_unlock_bh(&ipvs->tcp_app_lock);
 }
@@ -700,7 +700,6 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
 	.protocol =		IPPROTO_TCP,
 	.num_states =		IP_VS_TCP_S_LAST,
 	.dont_defrag =		0,
-	.appcnt =		ATOMIC_INIT(0),
 	.init =			NULL,
 	.exit =			NULL,
 	.init_netns =		__ip_vs_tcp_init,
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index aa85df2..3719837 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -373,7 +373,7 @@ static int udp_register_app(struct ip_vs_app *inc)
 		}
 	}
 	list_add(&inc->p_list, &ipvs->udp_apps[hash]);
-	atomic_inc(&pd->pp->appcnt);
+	atomic_inc(&pd->appcnt);
 
   out:
 	spin_unlock_bh(&ipvs->udp_app_lock);
@@ -388,7 +388,7 @@ udp_unregister_app(struct ip_vs_app *inc)
 	struct netns_ipvs *ipvs = net_ipvs(&init_net);
 
 	spin_lock_bh(&ipvs->udp_app_lock);
-	atomic_dec(&pd->pp->appcnt);
+	atomic_dec(&pd->appcnt);
 	list_del(&inc->p_list);
 	spin_unlock_bh(&ipvs->udp_app_lock);
 }
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index df74b2c..8c8bb4c 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -725,17 +725,16 @@ ip_vs_conn_fill_param_sync(int af, union ip_vs_sync_conn *sc,
  *  Param: ...
  *         timeout is in sec.
  */
-static void ip_vs_proc_conn(struct ip_vs_conn_param *param,  unsigned flags,
-			    unsigned state, unsigned protocol, unsigned type,
+static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
+			    unsigned int flags, unsigned int state,
+			    unsigned int protocol, unsigned int type,
 			    const union nf_inet_addr *daddr, __be16 dport,
 			    unsigned long timeout, __u32 fwmark,
-			    struct ip_vs_sync_conn_options *opt,
-			    struct ip_vs_protocol *pp)
+			    struct ip_vs_sync_conn_options *opt)
 {
 	struct ip_vs_dest *dest;
 	struct ip_vs_conn *cp;
 
-
 	if (!(flags & IP_VS_CONN_F_TEMPLATE))
 		cp = ip_vs_conn_in_get(param);
 	else
@@ -821,17 +820,23 @@ static void ip_vs_proc_conn(struct ip_vs_conn_param *param,  unsigned flags,
 		if (timeout > MAX_SCHEDULE_TIMEOUT / HZ)
 			timeout = MAX_SCHEDULE_TIMEOUT / HZ;
 		cp->timeout = timeout*HZ;
-	} else if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table)
-		cp->timeout = pp->timeout_table[state];
-	else
-		cp->timeout = (3*60*HZ);
+	} else {
+		struct ip_vs_proto_data *pd;
+
+		pd = ip_vs_proto_data_get(net, protocol);
+		if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table)
+			cp->timeout = pd->timeout_table[state];
+		else
+			cp->timeout = (3*60*HZ);
+	}
 	ip_vs_conn_put(cp);
 }
 
 /*
  *  Process received multicast message for Version 0
  */
-static void ip_vs_process_message_v0(const char *buffer, const size_t buflen)
+static void ip_vs_process_message_v0(struct net *net, const char *buffer,
+				     const size_t buflen)
 {
 	struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer;
 	struct ip_vs_sync_conn_v0 *s;
@@ -843,7 +848,7 @@ static void ip_vs_process_message_v0(const char *buffer, const size_t buflen)
 
 	p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0);
 	for (i=0; i<m->nr_conns; i++) {
-		unsigned flags, state;
+		unsigned int flags, state;
 
 		if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
 			IP_VS_ERR_RL("BACKUP v0, bogus conn\n");
@@ -879,7 +884,6 @@ static void ip_vs_process_message_v0(const char *buffer, const size_t buflen)
 			}
 		} else {
 			/* protocol in templates is not used for state/timeout */
-			pp = NULL;
 			if (state > 0) {
 				IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n",
 					state);
@@ -894,9 +898,9 @@ static void ip_vs_process_message_v0(const char *buffer, const size_t buflen)
 				      s->vport, &param);
 
 		/* Send timeout as Zero */
-		ip_vs_proc_conn(&param, flags, state, s->protocol, AF_INET,
+		ip_vs_proc_conn(net, &param, flags, state, s->protocol, AF_INET,
 				(union nf_inet_addr *)&s->daddr, s->dport,
-				0, 0, opt, pp);
+				0, 0, opt);
 	}
 }
 
@@ -945,7 +949,7 @@ static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len,
 /*
  *   Process a Version 1 sync. connection
  */
-static inline int ip_vs_proc_sync_conn(__u8 *p, __u8 *msg_end)
+static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end)
 {
 	struct ip_vs_sync_conn_options opt;
 	union  ip_vs_sync_conn *s;
@@ -1043,7 +1047,6 @@ static inline int ip_vs_proc_sync_conn(__u8 *p, __u8 *msg_end)
 		}
 	} else {
 		/* protocol in templates is not used for state/timeout */
-		pp = NULL;
 		if (state > 0) {
 			IP_VS_DBG(3, "BACKUP, Invalid template state %u\n",
 				state);
@@ -1058,18 +1061,18 @@ static inline int ip_vs_proc_sync_conn(__u8 *p, __u8 *msg_end)
 	}
 	/* If only IPv4, just silent skip IPv6 */
 	if (af == AF_INET)
-		ip_vs_proc_conn(&param, flags, state, s->v4.protocol, af,
+		ip_vs_proc_conn(net, &param, flags, state, s->v4.protocol, af,
 				(union nf_inet_addr *)&s->v4.daddr, s->v4.dport,
 				ntohl(s->v4.timeout), ntohl(s->v4.fwmark),
-				(opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL),
-				pp);
+				(opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
+				);
 #ifdef CONFIG_IP_VS_IPV6
 	else
-		ip_vs_proc_conn(&param, flags, state, s->v6.protocol, af,
+		ip_vs_proc_conn(net, &param, flags, state, s->v6.protocol, af,
 				(union nf_inet_addr *)&s->v6.daddr, s->v6.dport,
 				ntohl(s->v6.timeout), ntohl(s->v6.fwmark),
-				(opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL),
-				pp);
+				(opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
+				);
 #endif
 	return 0;
 	/* Error exit */
@@ -1083,7 +1086,8 @@ out:
  *      ip_vs_conn entries.
  *      Handles Version 0 & 1
  */
-static void ip_vs_process_message(__u8 *buffer, const size_t buflen)
+static void ip_vs_process_message(struct net *net, __u8 *buffer,
+				  const size_t buflen)
 {
 	struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer;
 	__u8 *p, *msg_end;
@@ -1136,7 +1140,7 @@ static void ip_vs_process_message(__u8 *buffer, const size_t buflen)
 				return;
 			}
 			/* Process a single sync_conn */
-			if ((retc=ip_vs_proc_sync_conn(p, msg_end)) < 0) {
+			if ((retc=ip_vs_proc_sync_conn(net, p, msg_end)) < 0) {
 				IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
 					     retc);
 				return;
@@ -1146,7 +1150,7 @@ static void ip_vs_process_message(__u8 *buffer, const size_t buflen)
 		}
 	} else {
 		/* Old type of message */
-		ip_vs_process_message_v0(buffer, buflen);
+		ip_vs_process_message_v0(net, buffer, buflen);
 		return;
 	}
 }
@@ -1500,7 +1504,7 @@ static int sync_thread_backup(void *data)
 			/* disable bottom half, because it accesses the data
 			   shared by softirq while getting/creating conns */
 			local_bh_disable();
-			ip_vs_process_message(tinfo->buf, len);
+			ip_vs_process_message(&init_net, tinfo->buf, len);
 			local_bh_enable();
 		}
 	}
-- 
1.7.2.3


^ permalink raw reply related

* [*v3 PATCH 10/22] IPVS: netns, use ip_vs_proto_data as param.
From: hans @ 2010-12-30 10:50 UTC (permalink / raw)
  To: horms, ja, daniel.lezcano, wensong, lvs-devel, netdev,
	netfilter-devel
  Cc: Hans Schillstrom
In-Reply-To: <1293706266-27152-1-git-send-email-hans@schillstrom.com>

From: Hans Schillstrom <hans.schillstrom@ericsson.com>

struct ip_vs_protocol *pp is replaced by struct ip_vs_proto_data *pd in
the function pointers of struct ip_vs_protocol, i.e.:
 - timeout_change()
 - state_transition()

ip_vs_protocol_timeout_change() got ipvs as a param, due to the above
and an upcoming patch (defence work).

Most of these changes are triggered by Julian's comment:
"tcp_timeout_change should work with the new struct ip_vs_proto_data
        so that tcp_state_table will go to pd->state_table
        and set_tcp_state will get pd instead of pp"

*v3
Mostly comments from Julian
The pp -> pd conversion should start from functions like
ip_vs_out() that use pp = ip_vs_proto_get(iph.protocol),
now they should use ip_vs_proto_data_get(net, iph.protocol).
The unused param *pp of conn_in_get() and conn_out_get() is removed.

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
---
 include/net/ip_vs.h                     |   18 ++-----
 net/netfilter/ipvs/ip_vs_conn.c         |    2 -
 net/netfilter/ipvs/ip_vs_core.c         |   77 +++++++++++++++++++------------
 net/netfilter/ipvs/ip_vs_ctl.c          |   54 +++++++++++++--------
 net/netfilter/ipvs/ip_vs_proto.c        |   17 +++++--
 net/netfilter/ipvs/ip_vs_proto_ah_esp.c |   10 ++--
 net/netfilter/ipvs/ip_vs_proto_sctp.c   |   16 +++----
 net/netfilter/ipvs/ip_vs_proto_tcp.c    |   26 ++++------
 net/netfilter/ipvs/ip_vs_proto_udp.c    |   11 ++---
 net/netfilter/xt_ipvs.c                 |    2 +-
 10 files changed, 126 insertions(+), 107 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 8e544be..135a163 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -370,13 +370,12 @@ struct ip_vs_protocol {
 	void (*exit_netns)(struct net *net, struct ip_vs_proto_data *pd);
 
 	int (*conn_schedule)(int af, struct sk_buff *skb,
-			     struct ip_vs_protocol *pp,
+			     struct ip_vs_proto_data *pd,
 			     int *verdict, struct ip_vs_conn **cpp);
 
 	struct ip_vs_conn *
 	(*conn_in_get)(int af,
 		       const struct sk_buff *skb,
-		       struct ip_vs_protocol *pp,
 		       const struct ip_vs_iphdr *iph,
 		       unsigned int proto_off,
 		       int inverse);
@@ -384,7 +383,6 @@ struct ip_vs_protocol {
 	struct ip_vs_conn *
 	(*conn_out_get)(int af,
 			const struct sk_buff *skb,
-			struct ip_vs_protocol *pp,
 			const struct ip_vs_iphdr *iph,
 			unsigned int proto_off,
 			int inverse);
@@ -402,7 +400,7 @@ struct ip_vs_protocol {
 
 	int (*state_transition)(struct ip_vs_conn *cp, int direction,
 				const struct sk_buff *skb,
-				struct ip_vs_protocol *pp);
+				struct ip_vs_proto_data *pd);
 
 	int (*register_app)(struct ip_vs_app *inc);
 
@@ -415,9 +413,7 @@ struct ip_vs_protocol {
 			     int offset,
 			     const char *msg);
 
-	void (*timeout_change)(struct ip_vs_protocol *pp, int flags);
-
-	int (*set_state_timeout)(struct ip_vs_protocol *pp, char *sname, int to);
+	void (*timeout_change)(struct ip_vs_proto_data *pd, int flags);
 };
 
 /*
@@ -776,7 +772,6 @@ struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p);
 struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p);
 
 struct ip_vs_conn * ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
-					    struct ip_vs_protocol *pp,
 					    const struct ip_vs_iphdr *iph,
 					    unsigned int proto_off,
 					    int inverse);
@@ -784,7 +779,6 @@ struct ip_vs_conn * ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
 struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p);
 
 struct ip_vs_conn * ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
-					     struct ip_vs_protocol *pp,
 					     const struct ip_vs_iphdr *iph,
 					     unsigned int proto_off,
 					     int inverse);
@@ -915,7 +909,7 @@ static inline void ip_vs_pe_put(const struct ip_vs_pe *pe)
  */
 extern int ip_vs_protocol_init(void);
 extern void ip_vs_protocol_cleanup(void);
-extern void ip_vs_protocol_timeout_change(int flags);
+extern void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags);
 extern int *ip_vs_create_timeout_table(int *table, int size);
 extern int
 ip_vs_set_state_timeout(int *table, int num, const char *const *names,
@@ -945,9 +939,9 @@ extern struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name);
 extern void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler);
 extern struct ip_vs_conn *
 ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
-	       struct ip_vs_protocol *pp, int *ignored);
+	       struct ip_vs_proto_data *pd, int *ignored);
 extern int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
-			struct ip_vs_protocol *pp);
+			struct ip_vs_proto_data *pd);
 
 
 /*
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 7a0e79e..a7aba6a 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -329,7 +329,6 @@ ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
 
 struct ip_vs_conn *
 ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
-			struct ip_vs_protocol *pp,
 			const struct ip_vs_iphdr *iph,
 			unsigned int proto_off, int inverse)
 {
@@ -428,7 +427,6 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
 
 struct ip_vs_conn *
 ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
-			 struct ip_vs_protocol *pp,
 			 const struct ip_vs_iphdr *iph,
 			 unsigned int proto_off, int inverse)
 {
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 68ecc7f..0454a11 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -177,11 +177,11 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
 static inline int
 ip_vs_set_state(struct ip_vs_conn *cp, int direction,
 		const struct sk_buff *skb,
-		struct ip_vs_protocol *pp)
+		struct ip_vs_proto_data *pd)
 {
-	if (unlikely(!pp->state_transition))
+	if (unlikely(!pd->pp->state_transition))
 		return 0;
-	return pp->state_transition(cp, direction, skb, pp);
+	return pd->pp->state_transition(cp, direction, skb, pd);
 }
 
 static inline int
@@ -378,8 +378,9 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
  */
 struct ip_vs_conn *
 ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
-	       struct ip_vs_protocol *pp, int *ignored)
+	       struct ip_vs_proto_data *pd, int *ignored)
 {
+	struct ip_vs_protocol *pp = pd->pp;
 	struct ip_vs_conn *cp = NULL;
 	struct ip_vs_iphdr iph;
 	struct ip_vs_dest *dest;
@@ -408,7 +409,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 	 *    Do not schedule replies from local real server.
 	 */
 	if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
-	    (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) {
+	    (cp = pp->conn_in_get(svc->af, skb, &iph, iph.len, 1))) {
 		IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
 			      "Not scheduling reply for existing connection");
 		__ip_vs_conn_put(cp);
@@ -479,11 +480,12 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
  *  no destination is available for a new connection.
  */
 int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
-		struct ip_vs_protocol *pp)
+		struct ip_vs_proto_data *pd)
 {
 	__be16 _ports[2], *pptr;
 	struct ip_vs_iphdr iph;
 	int unicast;
+
 	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
 
 	pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
@@ -530,10 +532,10 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 		ip_vs_in_stats(cp, skb);
 
 		/* set state */
-		cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
+		cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
 
 		/* transmit the first SYN packet */
-		ret = cp->packet_xmit(skb, cp, pp);
+		ret = cp->packet_xmit(skb, cp, pd->pp);
 		/* do not touch skb anymore */
 
 		atomic_inc(&cp->in_pkts);
@@ -840,7 +842,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
 
 	ip_vs_fill_iphdr(AF_INET, cih, &ciph);
 	/* The embedded headers contain source and dest in reverse order */
-	cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
+	cp = pp->conn_out_get(AF_INET, skb, &ciph, offset, 1);
 	if (!cp)
 		return NF_ACCEPT;
 
@@ -917,7 +919,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
 
 	ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
 	/* The embedded headers contain source and dest in reverse order */
-	cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
+	cp = pp->conn_out_get(AF_INET6, skb, &ciph, offset, 1);
 	if (!cp)
 		return NF_ACCEPT;
 
@@ -956,9 +958,11 @@ static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
  * Used for NAT and local client.
  */
 static unsigned int
-handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 		struct ip_vs_conn *cp, int ihl)
 {
+	struct ip_vs_protocol *pp = pd->pp;
+
 	IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
 
 	if (!skb_make_writable(skb, ihl))
@@ -1007,7 +1011,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 	IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
 
 	ip_vs_out_stats(cp, skb);
-	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
+	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
 	skb->ipvs_property = 1;
 	if (!(cp->flags & IP_VS_CONN_F_NFCT))
 		ip_vs_notrack(skb);
@@ -1034,6 +1038,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
 	struct net *net = NULL;
 	struct ip_vs_iphdr iph;
 	struct ip_vs_protocol *pp;
+	struct ip_vs_proto_data *pd;
 	struct ip_vs_conn *cp;
 
 	EnterFunction(11);
@@ -1079,9 +1084,10 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
 			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 		}
 
-	pp = ip_vs_proto_get(iph.protocol);
-	if (unlikely(!pp))
+	pd = ip_vs_proto_data_get(net, iph.protocol);
+	if (unlikely(!pd))
 		return NF_ACCEPT;
+	pp = pd->pp;
 
 	/* reassemble IP fragments */
 #ifdef CONFIG_IP_VS_IPV6
@@ -1107,10 +1113,10 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
 	/*
 	 * Check if the packet belongs to an existing entry
 	 */
-	cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
+	cp = pp->conn_out_get(af, skb, &iph, iph.len, 0);
 
 	if (likely(cp))
-		return handle_response(af, skb, pp, cp, iph.len);
+		return handle_response(af, skb, pd, cp, iph.len);
 	if (sysctl_ip_vs_nat_icmp_send &&
 	    (pp->protocol == IPPROTO_TCP ||
 	     pp->protocol == IPPROTO_UDP ||
@@ -1236,12 +1242,14 @@ ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
 static int
 ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
 {
+	struct net *net = NULL;
 	struct iphdr *iph;
 	struct icmphdr	_icmph, *ic;
 	struct iphdr	_ciph, *cih;	/* The ip header contained within the ICMP */
 	struct ip_vs_iphdr ciph;
 	struct ip_vs_conn *cp;
 	struct ip_vs_protocol *pp;
+	struct ip_vs_proto_data *pd;
 	unsigned int offset, ihl, verdict;
 	union nf_inet_addr snet;
 
@@ -1283,9 +1291,11 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
 	if (cih == NULL)
 		return NF_ACCEPT; /* The packet looks wrong, ignore */
 
-	pp = ip_vs_proto_get(cih->protocol);
-	if (!pp)
+	net = skb_net(skb);
+	pd = ip_vs_proto_data_get(net, cih->protocol);
+	if (!pd)
 		return NF_ACCEPT;
+	pp = pd->pp;
 
 	/* Is the embedded protocol header present? */
 	if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
@@ -1299,10 +1309,10 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
 
 	ip_vs_fill_iphdr(AF_INET, cih, &ciph);
 	/* The embedded headers contain source and dest in reverse order */
-	cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
+	cp = pp->conn_in_get(AF_INET, skb, &ciph, offset, 1);
 	if (!cp) {
 		/* The packet could also belong to a local client */
-		cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
+		cp = pp->conn_out_get(AF_INET, skb, &ciph, offset, 1);
 		if (cp) {
 			snet.ip = iph->saddr;
 			return handle_response_icmp(AF_INET, skb, &snet,
@@ -1346,6 +1356,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
 static int
 ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
 {
+	struct net *net = NULL;
 	struct ipv6hdr *iph;
 	struct icmp6hdr	_icmph, *ic;
 	struct ipv6hdr	_ciph, *cih;	/* The ip header contained
@@ -1353,6 +1364,7 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
 	struct ip_vs_iphdr ciph;
 	struct ip_vs_conn *cp;
 	struct ip_vs_protocol *pp;
+	struct ip_vs_proto_data *pd;
 	unsigned int offset, verdict;
 	union nf_inet_addr snet;
 	struct rt6_info *rt;
@@ -1395,9 +1407,11 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
 	if (cih == NULL)
 		return NF_ACCEPT; /* The packet looks wrong, ignore */
 
-	pp = ip_vs_proto_get(cih->nexthdr);
-	if (!pp)
+	net = skb_net(skb);
+	pd = ip_vs_proto_data_get(net, cih->nexthdr);
+	if (!pd)
 		return NF_ACCEPT;
+	pp = pd->pp;
 
 	/* Is the embedded protocol header present? */
 	/* TODO: we don't support fragmentation at the moment anyways */
@@ -1411,10 +1425,10 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
 
 	ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
 	/* The embedded headers contain source and dest in reverse order */
-	cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
+	cp = pp->conn_in_get(AF_INET6, skb, &ciph, offset, 1);
 	if (!cp) {
 		/* The packet could also belong to a local client */
-		cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
+		cp = pp->conn_out_get(AF_INET6, skb, &ciph, offset, 1);
 		if (cp) {
 			ipv6_addr_copy(&snet.in6, &iph->saddr);
 			return handle_response_icmp(AF_INET6, skb, &snet,
@@ -1457,8 +1471,10 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
 static unsigned int
 ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 {
+	struct net *net = NULL;
 	struct ip_vs_iphdr iph;
 	struct ip_vs_protocol *pp;
+	struct ip_vs_proto_data *pd;
 	struct ip_vs_conn *cp;
 	int ret, restart, pkts;
 
@@ -1514,20 +1530,21 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 		}
 
+	net = skb_net(skb);
 	/* Protocol supported? */
-	pp = ip_vs_proto_get(iph.protocol);
-	if (unlikely(!pp))
+	pd = ip_vs_proto_data_get(net, iph.protocol);
+	if (unlikely(!pd))
 		return NF_ACCEPT;
-
+	pp = pd->pp;
 	/*
 	 * Check if the packet belongs to an existing connection entry
 	 */
-	cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
+	cp = pp->conn_in_get(af, skb, &iph, iph.len, 0);
 
 	if (unlikely(!cp)) {
 		int v;
 
-		if (!pp->conn_schedule(af, skb, pp, &v, &cp))
+		if (!pp->conn_schedule(af, skb, pd, &v, &cp))
 			return v;
 	}
 
@@ -1555,7 +1572,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	}
 
 	ip_vs_in_stats(cp, skb);
-	restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
+	restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
 	if (cp->packet_xmit)
 		ret = cp->packet_xmit(skb, cp, pp);
 		/* do not touch skb anymore */
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 45e4b8e..faaee81 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -38,6 +38,7 @@
 #include <linux/mutex.h>
 
 #include <net/net_namespace.h>
+#include <linux/nsproxy.h>
 #include <net/ip.h>
 #ifdef CONFIG_IP_VS_IPV6
 #include <net/ipv6.h>
@@ -127,7 +128,7 @@ static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr)
  *	update_defense_level is called from keventd and from sysctl,
  *	so it needs to protect itself from softirqs
  */
-static void update_defense_level(void)
+static void update_defense_level(struct netns_ipvs *ipvs)
 {
 	struct sysinfo i;
 	static int old_secure_tcp = 0;
@@ -241,7 +242,7 @@ static void update_defense_level(void)
 	}
 	old_secure_tcp = sysctl_ip_vs_secure_tcp;
 	if (to_change >= 0)
-		ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
+		ip_vs_protocol_timeout_change(ipvs, sysctl_ip_vs_secure_tcp>1);
 	spin_unlock(&ip_vs_securetcp_lock);
 
 	local_bh_enable();
@@ -257,7 +258,10 @@ static DECLARE_DELAYED_WORK(defense_work, defense_work_handler);
 
 static void defense_work_handler(struct work_struct *work)
 {
-	update_defense_level();
+	struct net *net = &init_net;
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	update_defense_level(ipvs);
 	if (atomic_read(&ip_vs_dropentry))
 		ip_vs_random_dropentry();
 
@@ -1503,6 +1507,7 @@ static int
 proc_do_defense_mode(ctl_table *table, int write,
 		     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
+	struct net *net = current->nsproxy->net_ns;
 	int *valp = table->data;
 	int val = *valp;
 	int rc;
@@ -1513,7 +1518,7 @@ proc_do_defense_mode(ctl_table *table, int write,
 			/* Restore the correct value */
 			*valp = val;
 		} else {
-			update_defense_level();
+			update_defense_level(net_ipvs(net));
 		}
 	}
 	return rc;
@@ -2034,8 +2039,10 @@ static const struct file_operations ip_vs_stats_fops = {
 /*
  *	Set timeout values for tcp tcpfin udp in the timeout_table.
  */
-static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
+static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
 {
+	struct ip_vs_proto_data *pd;
+
 	IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
 		  u->tcp_timeout,
 		  u->tcp_fin_timeout,
@@ -2043,19 +2050,22 @@ static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
 
 #ifdef CONFIG_IP_VS_PROTO_TCP
 	if (u->tcp_timeout) {
-		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
+		pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+		pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
 			= u->tcp_timeout * HZ;
 	}
 
 	if (u->tcp_fin_timeout) {
-		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
+		pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+		pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
 			= u->tcp_fin_timeout * HZ;
 	}
 #endif
 
 #ifdef CONFIG_IP_VS_PROTO_UDP
 	if (u->udp_timeout) {
-		ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
+		pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+		pd->timeout_table[IP_VS_UDP_S_NORMAL]
 			= u->udp_timeout * HZ;
 	}
 #endif
@@ -2159,7 +2169,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 		goto out_unlock;
 	} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
 		/* Set timeout values for (tcp tcpfin udp) */
-		ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
+		ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
 		goto out_unlock;
 	} else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
 		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
@@ -2370,17 +2380,19 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
 }
 
 static inline void
-__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
+__ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
 {
+	struct ip_vs_proto_data *pd;
+
 #ifdef CONFIG_IP_VS_PROTO_TCP
-	u->tcp_timeout =
-		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
-	u->tcp_fin_timeout =
-		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
+	pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+	u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
+	u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
 #endif
 #ifdef CONFIG_IP_VS_PROTO_UDP
+	pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
 	u->udp_timeout =
-		ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
+			pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
 #endif
 }
 
@@ -2520,7 +2532,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 	{
 		struct ip_vs_timeout_user t;
 
-		__ip_vs_get_timeouts(&t);
+		__ip_vs_get_timeouts(net, &t);
 		if (copy_to_user(user, &t, sizeof(t)) != 0)
 			ret = -EFAULT;
 	}
@@ -3091,11 +3103,11 @@ static int ip_vs_genl_del_daemon(struct nlattr **attrs)
 	return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
 }
 
-static int ip_vs_genl_set_config(struct nlattr **attrs)
+static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
 {
 	struct ip_vs_timeout_user t;
 
-	__ip_vs_get_timeouts(&t);
+	__ip_vs_get_timeouts(net, &t);
 
 	if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
 		t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
@@ -3107,7 +3119,7 @@ static int ip_vs_genl_set_config(struct nlattr **attrs)
 	if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
 		t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
 
-	return ip_vs_set_timeout(&t);
+	return ip_vs_set_timeout(net, &t);
 }
 
 static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
@@ -3128,7 +3140,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
 		ret = ip_vs_flush(net);
 		goto out;
 	} else if (cmd == IPVS_CMD_SET_CONFIG) {
-		ret = ip_vs_genl_set_config(info->attrs);
+		ret = ip_vs_genl_set_config(net, info->attrs);
 		goto out;
 	} else if (cmd == IPVS_CMD_NEW_DAEMON ||
 		   cmd == IPVS_CMD_DEL_DAEMON) {
@@ -3280,7 +3292,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
 	{
 		struct ip_vs_timeout_user t;
 
-		__ip_vs_get_timeouts(&t);
+		__ip_vs_get_timeouts(net, &t);
 #ifdef CONFIG_IP_VS_PROTO_TCP
 		NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout);
 		NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index 1089248..803c6ec 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -152,9 +152,8 @@ EXPORT_SYMBOL(ip_vs_proto_get);
  *	get ip_vs_protocol object data by netns and proto
  */
 struct ip_vs_proto_data *
-ip_vs_proto_data_get(struct net *net, unsigned short proto)
+__ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto)
 {
-	struct netns_ipvs *ipvs = net_ipvs(net);
 	struct ip_vs_proto_data *pd;
 	unsigned hash = IP_VS_PROTO_HASH(proto);
 
@@ -165,12 +164,20 @@ ip_vs_proto_data_get(struct net *net, unsigned short proto)
 
 	return NULL;
 }
+
+struct ip_vs_proto_data *
+ip_vs_proto_data_get(struct net *net, unsigned short proto)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	return __ipvs_proto_data_get(ipvs, proto);
+}
 EXPORT_SYMBOL(ip_vs_proto_data_get);
 
 /*
  *	Propagate event for state change to all protocols
  */
-void ip_vs_protocol_timeout_change(int flags)
+void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags)
 {
 	struct ip_vs_protocol *pp;
 	int i;
@@ -178,7 +185,9 @@ void ip_vs_protocol_timeout_change(int flags)
 	for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
 		for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) {
 			if (pp->timeout_change)
-				pp->timeout_change(pp, flags);
+				pp->timeout_change(__ipvs_proto_data_get(ipvs,
+									 pp->protocol),
+						   flags);
 		}
 	}
 }
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index b8b37fa..28039cb 100644
--- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -55,7 +55,7 @@ ah_esp_conn_fill_param_proto(int af, const struct ip_vs_iphdr *iph,
 }
 
 static struct ip_vs_conn *
-ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
+ah_esp_conn_in_get(int af, const struct sk_buff *skb,
 		   const struct ip_vs_iphdr *iph, unsigned int proto_off,
 		   int inverse)
 {
@@ -72,7 +72,7 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
 		IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet "
 			      "%s%s %s->%s\n",
 			      inverse ? "ICMP+" : "",
-			      pp->name,
+			      ip_vs_proto_get(iph->protocol)->name,
 			      IP_VS_DBG_ADDR(af, &iph->saddr),
 			      IP_VS_DBG_ADDR(af, &iph->daddr));
 	}
@@ -83,7 +83,6 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
 
 static struct ip_vs_conn *
 ah_esp_conn_out_get(int af, const struct sk_buff *skb,
-		    struct ip_vs_protocol *pp,
 		    const struct ip_vs_iphdr *iph,
 		    unsigned int proto_off,
 		    int inverse)
@@ -97,7 +96,7 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb,
 		IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
 			      "%s%s %s->%s\n",
 			      inverse ? "ICMP+" : "",
-			      pp->name,
+			      ip_vs_proto_get(iph->protocol)->name,
 			      IP_VS_DBG_ADDR(af, &iph->saddr),
 			      IP_VS_DBG_ADDR(af, &iph->daddr));
 	}
@@ -107,7 +106,7 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb,
 
 
 static int
-ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 		     int *verdict, struct ip_vs_conn **cpp)
 {
 	/*
@@ -137,7 +136,6 @@ struct ip_vs_protocol ip_vs_protocol_ah = {
 	.app_conn_bind =	NULL,
 	.debug_packet =		ip_vs_tcpudp_debug_packet,
 	.timeout_change =	NULL,		/* ISAKMP */
-	.set_state_timeout =	NULL,
 };
 #endif
 
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 49abf2b..f363f66 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -9,7 +9,7 @@
 #include <net/ip_vs.h>
 
 static int
-sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 		   int *verdict, struct ip_vs_conn **cpp)
 {
 	struct net *net;
@@ -47,10 +47,10 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		 * Let the virtual server select a real server for the
 		 * incoming connection, and create a connection entry.
 		 */
-		*cpp = ip_vs_schedule(svc, skb, pp, &ignored);
+		*cpp = ip_vs_schedule(svc, skb, pd, &ignored);
 		if (!*cpp && ignored <= 0) {
 			if (!ignored)
-				*verdict = ip_vs_leave(svc, skb, pp);
+				*verdict = ip_vs_leave(svc, skb, pd);
 			else {
 				ip_vs_service_put(svc);
 				*verdict = NF_DROP;
@@ -907,14 +907,13 @@ static const char *sctp_state_name(int state)
 }
 
 static inline int
-set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
+set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
 		int direction, const struct sk_buff *skb)
 {
 	sctp_chunkhdr_t _sctpch, *sch;
 	unsigned char chunk_type;
 	int event, next_state;
 	int ihl;
-	struct ip_vs_proto_data *pd;
 
 #ifdef CONFIG_IP_VS_IPV6
 	ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
@@ -966,7 +965,7 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
 
 		IP_VS_DBG_BUF(8, "%s %s  %s:%d->"
 				"%s:%d state: %s->%s conn->refcnt:%d\n",
-				pp->name,
+				pd->pp->name,
 				((direction == IP_VS_DIR_OUTPUT) ?
 				 "output " : "input "),
 				IP_VS_DBG_ADDR(cp->af, &cp->daddr),
@@ -990,7 +989,6 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
 			}
 		}
 	}
-	pd = ip_vs_proto_data_get(&init_net, pp->protocol); /* tmp fix */
 	if (likely(pd))
 		cp->timeout = pd->timeout_table[cp->state = next_state];
 	else	/* What to do ? */
@@ -1001,12 +999,12 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
 
 static int
 sctp_state_transition(struct ip_vs_conn *cp, int direction,
-		const struct sk_buff *skb, struct ip_vs_protocol *pp)
+		const struct sk_buff *skb, struct ip_vs_proto_data *pd)
 {
 	int ret = 0;
 
 	spin_lock(&cp->lock);
-	ret = set_sctp_state(pp, cp, direction, skb);
+	ret = set_sctp_state(pd, cp, direction, skb);
 	spin_unlock(&cp->lock);
 
 	return ret;
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 88f3a22..830dc3e 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -32,7 +32,7 @@
 #include <net/ip_vs.h>
 
 static int
-tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 		  int *verdict, struct ip_vs_conn **cpp)
 {
 	struct net *net;
@@ -68,10 +68,10 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		 * Let the virtual server select a real server for the
 		 * incoming connection, and create a connection entry.
 		 */
-		*cpp = ip_vs_schedule(svc, skb, pp, &ignored);
+		*cpp = ip_vs_schedule(svc, skb, pd, &ignored);
 		if (!*cpp && ignored <= 0) {
 			if (!ignored)
-				*verdict = ip_vs_leave(svc, skb, pp);
+				*verdict = ip_vs_leave(svc, skb, pd);
 			else {
 				ip_vs_service_put(svc);
 				*verdict = NF_DROP;
@@ -448,10 +448,7 @@ static struct tcp_states_t tcp_states_dos [] = {
 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
 };
 
-static struct tcp_states_t *tcp_state_table = tcp_states;
-
-
-static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
+static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)
 {
 	int on = (flags & 1);		/* secure_tcp */
 
@@ -461,7 +458,7 @@ static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
 	** for most if not for all of the applications. Something
 	** like "capabilities" (flags) for each object.
 	*/
-	tcp_state_table = (on? tcp_states_dos : tcp_states);
+	pd->tcp_state_table = (on? tcp_states_dos : tcp_states);
 }
 
 static inline int tcp_state_idx(struct tcphdr *th)
@@ -478,13 +475,12 @@ static inline int tcp_state_idx(struct tcphdr *th)
 }
 
 static inline void
-set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
+set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
 	      int direction, struct tcphdr *th)
 {
 	int state_idx;
 	int new_state = IP_VS_TCP_S_CLOSE;
 	int state_off = tcp_state_off[direction];
-	struct ip_vs_proto_data *pd;  /* Temp fix */
 
 	/*
 	 *    Update state offset to INPUT_ONLY if necessary
@@ -502,7 +498,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
 		goto tcp_state_out;
 	}
 
-	new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
+	new_state = pd->tcp_state_table[state_off+state_idx].next_state[cp->state];
 
   tcp_state_out:
 	if (new_state != cp->state) {
@@ -510,7 +506,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
 
 		IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
 			      "%s:%d state: %s->%s conn->refcnt:%d\n",
-			      pp->name,
+			      pd->pp->name,
 			      ((state_off == TCP_DIR_OUTPUT) ?
 			       "output " : "input "),
 			      th->syn ? 'S' : '.',
@@ -540,7 +536,6 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
 		}
 	}
 
-	pd = ip_vs_proto_data_get(&init_net, pp->protocol);
 	if (likely(pd))
 		cp->timeout = pd->timeout_table[cp->state = new_state];
 	else	/* What to do ? */
@@ -553,7 +548,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
 static int
 tcp_state_transition(struct ip_vs_conn *cp, int direction,
 		     const struct sk_buff *skb,
-		     struct ip_vs_protocol *pp)
+		     struct ip_vs_proto_data *pd)
 {
 	struct tcphdr _tcph, *th;
 
@@ -568,7 +563,7 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction,
 		return 0;
 
 	spin_lock(&cp->lock);
-	set_tcp_state(pp, cp, direction, th);
+	set_tcp_state(pd, cp, direction, th);
 	spin_unlock(&cp->lock);
 
 	return 1;
@@ -691,6 +686,7 @@ static void __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
 	spin_lock_init(&ipvs->tcp_app_lock);
 	pd->timeout_table = ip_vs_create_timeout_table((int*)tcp_timeouts,
 							sizeof(tcp_timeouts));
+	pd->tcp_state_table =  tcp_states;
 }
 
 static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd)
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 71a4721..aa85df2 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -29,7 +29,7 @@
 #include <net/ip6_checksum.h>
 
 static int
-udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
+udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
 		  int *verdict, struct ip_vs_conn **cpp)
 {
 	struct net *net;
@@ -64,10 +64,10 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		 * Let the virtual server select a real server for the
 		 * incoming connection, and create a connection entry.
 		 */
-		*cpp = ip_vs_schedule(svc, skb, pp, &ignored);
+		*cpp = ip_vs_schedule(svc, skb, pd, &ignored);
 		if (!*cpp && ignored <= 0) {
 			if (!ignored)
-				*verdict = ip_vs_leave(svc, skb, pp);
+				*verdict = ip_vs_leave(svc, skb, pd);
 			else {
 				ip_vs_service_put(svc);
 				*verdict = NF_DROP;
@@ -457,11 +457,8 @@ static const char * udp_state_name(int state)
 static int
 udp_state_transition(struct ip_vs_conn *cp, int direction,
 		     const struct sk_buff *skb,
-		     struct ip_vs_protocol *pp)
+		     struct ip_vs_proto_data *pd)
 {
-	struct ip_vs_proto_data *pd;   /* Temp fix, pp will be replaced by pd */

^ permalink raw reply related

* [*v3 PATCH 09/22] IPVS: netns preparation for proto_ah_esp
From: hans @ 2010-12-30 10:50 UTC (permalink / raw)
  To: horms, ja, daniel.lezcano, wensong, lvs-devel, netdev,
	netfilter-devel
  Cc: Hans Schillstrom
In-Reply-To: <1293706266-27152-1-git-send-email-hans@schillstrom.com>

From: Hans Schillstrom <hans.schillstrom@ericsson.com>

In this phase (one), all local vars will be moved to ipvs struct.

Remaining work: add a struct net *net parameter to a couple of
functions that are common to all protos.

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
---
 net/netfilter/ipvs/ip_vs_proto.c        |    6 ++++++
 net/netfilter/ipvs/ip_vs_proto_ah_esp.c |   20 ++++----------------
 2 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index fd3a7a8..1089248 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -316,6 +316,12 @@ static int  __net_init  __ip_vs_protocol_init(struct net *net)
 #ifdef CONFIG_IP_VS_PROTO_SCTP
 	register_ip_vs_proto_netns(net, &ip_vs_protocol_sctp);
 #endif
+#ifdef CONFIG_IP_VS_PROTO_AH
+	register_ip_vs_proto_netns(net, &ip_vs_protocol_ah);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_ESP
+	register_ip_vs_proto_netns(net, &ip_vs_protocol_esp);
+#endif
 	return 0;
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index 3a04611..b8b37fa 100644
--- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -117,26 +117,14 @@ ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 	return 0;
 }
 
-static void ah_esp_init(struct ip_vs_protocol *pp)
-{
-	/* nothing to do now */
-}
-
-
-static void ah_esp_exit(struct ip_vs_protocol *pp)
-{
-	/* nothing to do now */
-}
-
-
 #ifdef CONFIG_IP_VS_PROTO_AH
 struct ip_vs_protocol ip_vs_protocol_ah = {
 	.name =			"AH",
 	.protocol =		IPPROTO_AH,
 	.num_states =		1,
 	.dont_defrag =		1,
-	.init =			ah_esp_init,
-	.exit =			ah_esp_exit,
+	.init =			NULL,
+	.exit =			NULL,
 	.conn_schedule =	ah_esp_conn_schedule,
 	.conn_in_get =		ah_esp_conn_in_get,
 	.conn_out_get =		ah_esp_conn_out_get,
@@ -159,8 +147,8 @@ struct ip_vs_protocol ip_vs_protocol_esp = {
 	.protocol =		IPPROTO_ESP,
 	.num_states =		1,
 	.dont_defrag =		1,
-	.init =			ah_esp_init,
-	.exit =			ah_esp_exit,
+	.init =			NULL,
+	.exit =			NULL,
 	.conn_schedule =	ah_esp_conn_schedule,
 	.conn_in_get =		ah_esp_conn_in_get,
 	.conn_out_get =		ah_esp_conn_out_get,
-- 
1.7.2.3


^ permalink raw reply related

* [*v3 PATCH 08/22] IPVS: netns preparation for proto_sctp
From: hans @ 2010-12-30 10:50 UTC (permalink / raw)
  To: horms, ja, daniel.lezcano, wensong, lvs-devel, netdev,
	netfilter-devel
  Cc: Hans Schillstrom
In-Reply-To: <1293706266-27152-1-git-send-email-hans@schillstrom.com>

From: Hans Schillstrom <hans.schillstrom@ericsson.com>

In this phase (one), all local vars will be moved to ipvs struct.

Remaining work: add a struct net *net parameter to a couple of
functions that are common to all protos, and use ip_vs_proto_data.

*v3
 Removed unused function set_state_timeout()

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
---
 include/net/netns/ip_vs.h             |   10 +++-
 net/netfilter/ipvs/ip_vs_proto.c      |    3 +
 net/netfilter/ipvs/ip_vs_proto_sctp.c |  121 ++++++++++++++++-----------------
 3 files changed, 70 insertions(+), 64 deletions(-)

diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h
index 4975026..fcb3c7c 100644
--- a/include/net/netns/ip_vs.h
+++ b/include/net/netns/ip_vs.h
@@ -48,7 +48,15 @@ struct netns_ipvs {
 	struct list_head 	udp_apps[UDP_APP_TAB_SIZE];
 	spinlock_t		udp_app_lock;
 #endif
-
+	/* ip_vs_proto_sctp */
+#ifdef CONFIG_IP_VS_PROTO_SCTP
+	#define SCTP_APP_TAB_BITS        4
+	#define SCTP_APP_TAB_SIZE        (1 << SCTP_APP_TAB_BITS)
+	#define SCTP_APP_TAB_MASK        (SCTP_APP_TAB_SIZE - 1)
+	/* Hash table for SCTP application incarnations	 */
+	struct list_head 	sctp_apps[SCTP_APP_TAB_SIZE];
+	spinlock_t		sctp_app_lock;
+#endif
 	/* ip_vs_lblc */
 	int 			sysctl_lblc_expiration;
 	struct ctl_table_header	*lblc_ctl_header;
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index ec71d47..fd3a7a8 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -313,6 +313,9 @@ static int  __net_init  __ip_vs_protocol_init(struct net *net)
 #ifdef CONFIG_IP_VS_PROTO_UDP
 	register_ip_vs_proto_netns(net, &ip_vs_protocol_udp);
 #endif
+#ifdef CONFIG_IP_VS_PROTO_SCTP
+	register_ip_vs_proto_netns(net, &ip_vs_protocol_sctp);
+#endif
 	return 0;
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 521b827..49abf2b 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -862,7 +862,7 @@ static struct ipvs_sctp_nextstate
 /*
  *      Timeout table[state]
  */
-static int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = {
+static const int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = {
 	[IP_VS_SCTP_S_NONE]         =     2 * HZ,
 	[IP_VS_SCTP_S_INIT_CLI]     =     1 * 60 * HZ,
 	[IP_VS_SCTP_S_INIT_SER]     =     1 * 60 * HZ,
@@ -906,18 +906,6 @@ static const char *sctp_state_name(int state)
 	return "?";
 }
 
-static void sctp_timeout_change(struct ip_vs_protocol *pp, int flags)
-{
-}
-
-static int
-sctp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
-{
-
-return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_SCTP_S_LAST,
-				sctp_state_name_table, sname, to);
-}
-
 static inline int
 set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
 		int direction, const struct sk_buff *skb)
@@ -926,6 +914,7 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
 	unsigned char chunk_type;
 	int event, next_state;
 	int ihl;
+	struct ip_vs_proto_data *pd;
 
 #ifdef CONFIG_IP_VS_IPV6
 	ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
@@ -1001,10 +990,13 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
 			}
 		}
 	}
+	pd = ip_vs_proto_data_get(&init_net, pp->protocol); /* tmp fix */
+	if (likely(pd))
+		cp->timeout = pd->timeout_table[cp->state = next_state];
+	else	/* What to do ? */
+		cp->timeout = sctp_timeouts[cp->state = next_state];
 
-	 cp->timeout = pp->timeout_table[cp->state = next_state];
-
-	 return 1;
+	return 1;
 }
 
 static int
@@ -1020,16 +1012,6 @@ sctp_state_transition(struct ip_vs_conn *cp, int direction,
 	return ret;
 }
 
-/*
- *      Hash table for SCTP application incarnations
- */
-#define SCTP_APP_TAB_BITS        4
-#define SCTP_APP_TAB_SIZE        (1 << SCTP_APP_TAB_BITS)
-#define SCTP_APP_TAB_MASK        (SCTP_APP_TAB_SIZE - 1)
-
-static struct list_head sctp_apps[SCTP_APP_TAB_SIZE];
-static DEFINE_SPINLOCK(sctp_app_lock);
-
 static inline __u16 sctp_app_hashkey(__be16 port)
 {
 	return (((__force u16)port >> SCTP_APP_TAB_BITS) ^ (__force u16)port)
@@ -1042,34 +1024,40 @@ static int sctp_register_app(struct ip_vs_app *inc)
 	__u16 hash;
 	__be16 port = inc->port;
 	int ret = 0;
+	struct netns_ipvs *ipvs = net_ipvs(&init_net);
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, IPPROTO_SCTP);
 
 	hash = sctp_app_hashkey(port);
 
-	spin_lock_bh(&sctp_app_lock);
-	list_for_each_entry(i, &sctp_apps[hash], p_list) {
+	spin_lock_bh(&ipvs->sctp_app_lock);
+	list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) {
 		if (i->port == port) {
 			ret = -EEXIST;
 			goto out;
 		}
 	}
-	list_add(&inc->p_list, &sctp_apps[hash]);
-	atomic_inc(&ip_vs_protocol_sctp.appcnt);
+	list_add(&inc->p_list, &ipvs->sctp_apps[hash]);
+	atomic_inc(&pd->pp->appcnt);
 out:
-	spin_unlock_bh(&sctp_app_lock);
+	spin_unlock_bh(&ipvs->sctp_app_lock);
 
 	return ret;
 }
 
 static void sctp_unregister_app(struct ip_vs_app *inc)
 {
-	spin_lock_bh(&sctp_app_lock);
-	atomic_dec(&ip_vs_protocol_sctp.appcnt);
+	struct netns_ipvs *ipvs = net_ipvs(&init_net);
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, IPPROTO_SCTP);
+
+	spin_lock_bh(&ipvs->sctp_app_lock);
+	atomic_dec(&pd->pp->appcnt);
 	list_del(&inc->p_list);
-	spin_unlock_bh(&sctp_app_lock);
+	spin_unlock_bh(&ipvs->sctp_app_lock);
 }
 
 static int sctp_app_conn_bind(struct ip_vs_conn *cp)
 {
+	struct netns_ipvs *ipvs = net_ipvs(&init_net);
 	int hash;
 	struct ip_vs_app *inc;
 	int result = 0;
@@ -1080,12 +1068,12 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)
 	/* Lookup application incarnations and bind the right one */
 	hash = sctp_app_hashkey(cp->vport);
 
-	spin_lock(&sctp_app_lock);
-	list_for_each_entry(inc, &sctp_apps[hash], p_list) {
+	spin_lock(&ipvs->sctp_app_lock);
+	list_for_each_entry(inc, &ipvs->sctp_apps[hash], p_list) {
 		if (inc->port == cp->vport) {
 			if (unlikely(!ip_vs_app_inc_get(inc)))
 				break;
-			spin_unlock(&sctp_app_lock);
+			spin_unlock(&ipvs->sctp_app_lock);
 
 			IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
 					"%s:%u to app %s on port %u\n",
@@ -1101,43 +1089,50 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)
 			goto out;
 		}
 	}
-	spin_unlock(&sctp_app_lock);
+	spin_unlock(&ipvs->sctp_app_lock);
 out:
 	return result;
 }
 
-static void ip_vs_sctp_init(struct ip_vs_protocol *pp)
+/*
+ * Per-netns SCTP init: set up ipvs->sctp_apps and its sctp_app_lock, and
+ * build pd->timeout_table from the sctp_timeouts defaults.
+ */
+static void __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
 {
-	IP_VS_INIT_HASH_TABLE(sctp_apps);
-	pp->timeout_table = sctp_timeouts;
-}
+	struct netns_ipvs *ipvs = net_ipvs(net);
 
+	ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE);
+	spin_lock_init(&ipvs->sctp_app_lock);
+	pd->timeout_table = ip_vs_create_timeout_table((int*)sctp_timeouts,
+							sizeof(sctp_timeouts));
+}
 
-static void ip_vs_sctp_exit(struct ip_vs_protocol *pp)
+static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd)
 {
-
+	kfree(pd->timeout_table);
 }
 
 struct ip_vs_protocol ip_vs_protocol_sctp = {
-	.name = "SCTP",
-	.protocol = IPPROTO_SCTP,
-	.num_states = IP_VS_SCTP_S_LAST,
-	.dont_defrag = 0,
-	.appcnt = ATOMIC_INIT(0),
-	.init = ip_vs_sctp_init,
-	.exit = ip_vs_sctp_exit,
-	.register_app = sctp_register_app,
+	.name		= "SCTP",
+	.protocol	= IPPROTO_SCTP,
+	.num_states	= IP_VS_SCTP_S_LAST,
+	.dont_defrag	= 0,
+	.init 		= NULL,
+	.exit 		= NULL,
+	.init_netns 	= __ip_vs_sctp_init,
+	.exit_netns 	= __ip_vs_sctp_exit,
+	.register_app	= sctp_register_app,
 	.unregister_app = sctp_unregister_app,
-	.conn_schedule = sctp_conn_schedule,
-	.conn_in_get = ip_vs_conn_in_get_proto,
-	.conn_out_get = ip_vs_conn_out_get_proto,
-	.snat_handler = sctp_snat_handler,
-	.dnat_handler = sctp_dnat_handler,
-	.csum_check = sctp_csum_check,
-	.state_name = sctp_state_name,
+	.conn_schedule	= sctp_conn_schedule,
+	.conn_in_get	= ip_vs_conn_in_get_proto,
+	.conn_out_get	= ip_vs_conn_out_get_proto,
+	.snat_handler	= sctp_snat_handler,
+	.dnat_handler	= sctp_dnat_handler,
+	.csum_check	= sctp_csum_check,
+	.state_name	= sctp_state_name,
 	.state_transition = sctp_state_transition,
-	.app_conn_bind = sctp_app_conn_bind,
-	.debug_packet = ip_vs_tcpudp_debug_packet,
-	.timeout_change = sctp_timeout_change,
-	.set_state_timeout = sctp_set_state_timeout,
+	.app_conn_bind	= sctp_app_conn_bind,
+	.debug_packet	= ip_vs_tcpudp_debug_packet,
+	.timeout_change	= NULL,
 };
-- 
1.7.2.3


^ permalink raw reply related

* [*v3 PATCH 06/22]  IPVS: netns preparation for proto_tcp
From: hans @ 2010-12-30 10:50 UTC (permalink / raw)
  To: horms, ja, daniel.lezcano, wensong, lvs-devel, netdev,
	netfilter-devel
  Cc: Hans Schillstrom
In-Reply-To: <1293706266-27152-1-git-send-email-hans@schillstrom.com>

From: Hans Schillstrom <hans.schillstrom@ericsson.com>

In this phase (one), all local vars will be moved to ipvs struct.

Remaining work: add a struct net *net parameter to a couple of
functions that are common to all protos, and use ip_vs_proto_data
everywhere.

*v3
Removed unused function as suggested by Simon

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
---
 include/net/ip_vs.h                  |    6 +-
 include/net/netns/ip_vs.h            |    8 +++
 net/netfilter/ipvs/ip_vs_ftp.c       |    8 ++-
 net/netfilter/ipvs/ip_vs_proto.c     |   13 ++++-
 net/netfilter/ipvs/ip_vs_proto_tcp.c |  101 ++++++++++++++++++----------------
 5 files changed, 83 insertions(+), 53 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 4fc61bc..8e544be 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -41,7 +41,7 @@ static inline struct netns_ipvs * net_ipvs(struct net* net)
  * Get net ptr from skb in traffic cases
  * use skb_sknet when call is from userland (ioctl or netlink)
  */
-static inline struct net *skb_net(struct sk_buff *skb) {
+static inline struct net *skb_net(const struct sk_buff *skb) {
 #ifdef CONFIG_NET_NS
 #ifdef CONFIG_IP_VS_DEBUG
 	/*
@@ -68,7 +68,7 @@ static inline struct net *skb_net(struct sk_buff *skb) {
 #endif
 }
 
-static inline struct net *skb_sknet(struct sk_buff *skb) {
+static inline struct net *skb_sknet(const struct sk_buff *skb) {
 #ifdef CONFIG_NET_NS
 #ifdef CONFIG_IP_VS_DEBUG
 	/* Start with the most likely hit */
@@ -805,7 +805,7 @@ extern void ip_vs_conn_expire_now(struct ip_vs_conn *cp);
 
 extern const char * ip_vs_state_name(__u16 proto, int state);
 
-extern void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp);
+extern void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp);
 extern int ip_vs_check_template(struct ip_vs_conn *ct);
 extern void ip_vs_random_dropentry(void);
 extern int ip_vs_conn_init(void);
diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h
index b7d7815..512cdd0 100644
--- a/include/net/netns/ip_vs.h
+++ b/include/net/netns/ip_vs.h
@@ -32,6 +32,14 @@ struct netns_ipvs {
 	/* ip_vs_proto */
 	#define IP_VS_PROTO_TAB_SIZE	32	/* must be power of 2 */
 	struct ip_vs_proto_data *proto_data_table[IP_VS_PROTO_TAB_SIZE];
+	/* ip_vs_proto_tcp */
+#ifdef CONFIG_IP_VS_PROTO_TCP
+	#define	TCP_APP_TAB_BITS	4
+	#define	TCP_APP_TAB_SIZE	(1 << TCP_APP_TAB_BITS)
+	#define	TCP_APP_TAB_MASK	(TCP_APP_TAB_SIZE - 1)
+	struct list_head 	tcp_apps[TCP_APP_TAB_SIZE];
+	spinlock_t		tcp_app_lock;
+#endif
 
 	/* ip_vs_lblc */
 	int 			sysctl_lblc_expiration;
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 0e762f3..b38ae94 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -157,6 +157,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	int ret = 0;
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct;
+	struct net *net;
 
 #ifdef CONFIG_IP_VS_IPV6
 	/* This application helper doesn't work with IPv6 yet,
@@ -257,8 +258,9 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 		 * would be adjusted twice.
 		 */
 
+		net = skb_net(skb);
 		cp->app_data = NULL;
-		ip_vs_tcp_conn_listen(n_cp);
+		ip_vs_tcp_conn_listen(net, n_cp);
 		ip_vs_conn_put(n_cp);
 		return ret;
 	}
@@ -287,6 +289,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	union nf_inet_addr to;
 	__be16 port;
 	struct ip_vs_conn *n_cp;
+	struct net *net;
 
 #ifdef CONFIG_IP_VS_IPV6
 	/* This application helper doesn't work with IPv6 yet,
@@ -378,7 +381,8 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	/*
 	 *	Move tunnel to listen state
 	 */
-	ip_vs_tcp_conn_listen(n_cp);
+	net = skb_net(skb);
+	ip_vs_tcp_conn_listen(net, n_cp);
 	ip_vs_conn_put(n_cp);
 
 	return 1;
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index 8caaf3e..90d69c5 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -307,12 +307,23 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
  */
 static int  __net_init  __ip_vs_protocol_init(struct net *net)
 {
+#ifdef CONFIG_IP_VS_PROTO_TCP
+	register_ip_vs_proto_netns(net, &ip_vs_protocol_tcp);
+#endif
 	return 0;
 }
 
 static void __net_exit __ip_vs_protocol_cleanup(struct net *net)
 {
-	/* empty */
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_proto_data *pd;
+	int i;
+
+	/* unregister all the ipvs proto data for this netns */
+	for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
+		while ((pd = ipvs->proto_data_table[i]) != NULL)
+			unregister_ip_vs_proto_netns(net, pd);
+	}
 }
 
 static struct pernet_operations ipvs_proto_ops = {
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 5e4da60..88f3a22 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -9,8 +9,12 @@
  *              as published by the Free Software Foundation; either version
  *              2 of the License, or (at your option) any later version.
  *
- * Changes:
+ * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
  *
+ *              Network name space (netns) aware.
+ *              Global data moved to netns i.e struct netns_ipvs
+ *              tcp_timeouts table has copy per netns in a hash table per
+ *              protocol ip_vs_proto_data and is handled by netns
  */
 
 #define KMSG_COMPONENT "IPVS"
@@ -46,8 +50,8 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 	net = skb_net(skb);
 	/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
 	if (th->syn &&
-	    (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol, &iph.daddr,
-				     th->dest))) {
+	    (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
+				     &iph.daddr, th->dest))) {
 		int ignored;
 
 		if (ip_vs_todrop()) {
@@ -345,7 +349,7 @@ static const int tcp_state_off[IP_VS_DIR_LAST] = {
 /*
  *	Timeout table[state]
  */
-static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
+static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
 	[IP_VS_TCP_S_NONE]		=	2*HZ,
 	[IP_VS_TCP_S_ESTABLISHED]	=	15*60*HZ,
 	[IP_VS_TCP_S_SYN_SENT]		=	2*60*HZ,
@@ -460,13 +464,6 @@ static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
 	tcp_state_table = (on? tcp_states_dos : tcp_states);
 }
 
-static int
-tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
-{
-	return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
-				       tcp_state_name_table, sname, to);
-}
-
 static inline int tcp_state_idx(struct tcphdr *th)
 {
 	if (th->rst)
@@ -487,6 +484,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
 	int state_idx;
 	int new_state = IP_VS_TCP_S_CLOSE;
 	int state_off = tcp_state_off[direction];
+	struct ip_vs_proto_data *pd;  /* Temp fix */
 
 	/*
 	 *    Update state offset to INPUT_ONLY if necessary
@@ -542,10 +540,13 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
 		}
 	}
 
-	cp->timeout = pp->timeout_table[cp->state = new_state];
+	pd = ip_vs_proto_data_get(&init_net, pp->protocol);
+	if (likely(pd))
+		cp->timeout = pd->timeout_table[cp->state = new_state];
+	else	/* What to do ? */
+		cp->timeout = tcp_timeouts[cp->state = new_state];
 }
 
-
 /*
  *	Handle state transitions
  */
@@ -573,17 +574,6 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction,
 	return 1;
 }
 
-
-/*
- *	Hash table for TCP application incarnations
- */
-#define	TCP_APP_TAB_BITS	4
-#define	TCP_APP_TAB_SIZE	(1 << TCP_APP_TAB_BITS)
-#define	TCP_APP_TAB_MASK	(TCP_APP_TAB_SIZE - 1)
-
-static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
-static DEFINE_SPINLOCK(tcp_app_lock);
-
 static inline __u16 tcp_app_hashkey(__be16 port)
 {
 	return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
@@ -597,21 +587,23 @@ static int tcp_register_app(struct ip_vs_app *inc)
 	__u16 hash;
 	__be16 port = inc->port;
 	int ret = 0;
+	struct netns_ipvs *ipvs = net_ipvs(&init_net);
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, IPPROTO_TCP);
 
 	hash = tcp_app_hashkey(port);
 
-	spin_lock_bh(&tcp_app_lock);
-	list_for_each_entry(i, &tcp_apps[hash], p_list) {
+	spin_lock_bh(&ipvs->tcp_app_lock);
+	list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
 		if (i->port == port) {
 			ret = -EEXIST;
 			goto out;
 		}
 	}
-	list_add(&inc->p_list, &tcp_apps[hash]);
-	atomic_inc(&ip_vs_protocol_tcp.appcnt);
+	list_add(&inc->p_list, &ipvs->tcp_apps[hash]);
+	atomic_inc(&pd->pp->appcnt);
 
   out:
-	spin_unlock_bh(&tcp_app_lock);
+	spin_unlock_bh(&ipvs->tcp_app_lock);
 	return ret;
 }
 
@@ -619,16 +611,20 @@ static int tcp_register_app(struct ip_vs_app *inc)
 static void
 tcp_unregister_app(struct ip_vs_app *inc)
 {
-	spin_lock_bh(&tcp_app_lock);
-	atomic_dec(&ip_vs_protocol_tcp.appcnt);
+	struct netns_ipvs *ipvs = net_ipvs(&init_net);
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(&init_net, IPPROTO_TCP);
+
+	spin_lock_bh(&ipvs->tcp_app_lock);
+	atomic_dec(&pd->pp->appcnt);
 	list_del(&inc->p_list);
-	spin_unlock_bh(&tcp_app_lock);
+	spin_unlock_bh(&ipvs->tcp_app_lock);
 }
 
 
 static int
 tcp_app_conn_bind(struct ip_vs_conn *cp)
 {
+	struct netns_ipvs *ipvs = net_ipvs(&init_net);
 	int hash;
 	struct ip_vs_app *inc;
 	int result = 0;
@@ -640,12 +636,12 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
 	/* Lookup application incarnations and bind the right one */
 	hash = tcp_app_hashkey(cp->vport);
 
-	spin_lock(&tcp_app_lock);
-	list_for_each_entry(inc, &tcp_apps[hash], p_list) {
+	spin_lock(&ipvs->tcp_app_lock);
+	list_for_each_entry(inc, &ipvs->tcp_apps[hash], p_list) {
 		if (inc->port == cp->vport) {
 			if (unlikely(!ip_vs_app_inc_get(inc)))
 				break;
-			spin_unlock(&tcp_app_lock);
+			spin_unlock(&ipvs->tcp_app_lock);
 
 			IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
 				      "%s:%u to app %s on port %u\n",
@@ -662,7 +658,7 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
 			goto out;
 		}
 	}
-	spin_unlock(&tcp_app_lock);
+	spin_unlock(&ipvs->tcp_app_lock);
 
   out:
 	return result;
@@ -672,24 +668,34 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
 /*
  *	Set LISTEN timeout. (ip_vs_conn_put will setup timer)
  */
-void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
+void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
 {
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+
 	spin_lock(&cp->lock);
 	cp->state = IP_VS_TCP_S_LISTEN;
-	cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
+	cp->timeout = ( pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
+			   : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
 	spin_unlock(&cp->lock);
 }
 
-
-static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
+/* ---------------------------------------------
+ *   timeouts is netns related now.
+ * ---------------------------------------------
+ */
+static void __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
 {
-	IP_VS_INIT_HASH_TABLE(tcp_apps);
-	pp->timeout_table = tcp_timeouts;
-}
+	struct netns_ipvs *ipvs = net_ipvs(net);
 
+	ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
+	spin_lock_init(&ipvs->tcp_app_lock);
+	pd->timeout_table = ip_vs_create_timeout_table((int*)tcp_timeouts,
+							sizeof(tcp_timeouts));
+}
 
-static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
+static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd)
 {
+	kfree(pd->timeout_table);
 }
 
 
@@ -699,8 +705,10 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
 	.num_states =		IP_VS_TCP_S_LAST,
 	.dont_defrag =		0,
 	.appcnt =		ATOMIC_INIT(0),
-	.init =			ip_vs_tcp_init,
-	.exit =			ip_vs_tcp_exit,
+	.init =			NULL,
+	.exit =			NULL,
+	.init_netns =		__ip_vs_tcp_init,
+	.exit_netns =		__ip_vs_tcp_exit,
 	.register_app =		tcp_register_app,
 	.unregister_app =	tcp_unregister_app,
 	.conn_schedule =	tcp_conn_schedule,
@@ -714,5 +722,4 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
 	.app_conn_bind =	tcp_app_conn_bind,
 	.debug_packet =		ip_vs_tcpudp_debug_packet,
 	.timeout_change =	tcp_timeout_change,
-	.set_state_timeout =	tcp_set_state_timeout,
 };
-- 
1.7.2.3


^ permalink raw reply related

* [*v3 PATCH 05/22] IPVS: netns, prepare protocol
From: hans @ 2010-12-30 10:50 UTC (permalink / raw)
  To: horms, ja, daniel.lezcano, wensong, lvs-devel, netdev,
	netfilter-devel
  Cc: Hans Schillstrom
In-Reply-To: <1293706266-27152-1-git-send-email-hans@schillstrom.com>

From: Hans Schillstrom <hans.schillstrom@ericsson.com>

Add support for per-namespace protocol data.
The appcnt field in struct ip_vs_protocol will be removed once all
protocols have been converted to network namespaces.

This patch causes warnings about unused functions; they will be used
once the next patch is applied.

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
---
 include/net/ip_vs.h              |   20 +++++++++++-
 include/net/netns/ip_vs.h        |    3 ++
 net/netfilter/ipvs/ip_vs_proto.c |   66 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 88 insertions(+), 1 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 70c5462..4fc61bc 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -350,6 +350,7 @@ struct iphdr;
 struct ip_vs_conn;
 struct ip_vs_app;
 struct sk_buff;
+struct ip_vs_proto_data;
 
 struct ip_vs_protocol {
 	struct ip_vs_protocol	*next;
@@ -364,6 +365,10 @@ struct ip_vs_protocol {
 
 	void (*exit)(struct ip_vs_protocol *pp);
 
+	void (*init_netns)(struct net *net, struct ip_vs_proto_data *pd);
+
+	void (*exit_netns)(struct net *net, struct ip_vs_proto_data *pd);
+
 	int (*conn_schedule)(int af, struct sk_buff *skb,
 			     struct ip_vs_protocol *pp,
 			     int *verdict, struct ip_vs_conn **cpp);
@@ -415,7 +420,20 @@ struct ip_vs_protocol {
 	int (*set_state_timeout)(struct ip_vs_protocol *pp, char *sname, int to);
 };
 
-extern struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto);
+/*
+ * protocol data per netns
+ */
+struct ip_vs_proto_data {
+	struct ip_vs_proto_data	*next;
+	struct ip_vs_protocol	*pp;
+	int			*timeout_table;	/* protocol timeout table */
+	atomic_t		appcnt;		/* counter of proto app incs. */
+	struct tcp_states_t 	*tcp_state_table;
+};
+
+extern struct ip_vs_protocol   * ip_vs_proto_get(unsigned short proto);
+extern struct ip_vs_proto_data * ip_vs_proto_data_get(struct net *net,
+						      unsigned short proto);
 
 struct ip_vs_conn_param {
 	const union nf_inet_addr	*caddr;
diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h
index 4c8d751..b7d7815 100644
--- a/include/net/netns/ip_vs.h
+++ b/include/net/netns/ip_vs.h
@@ -29,6 +29,9 @@ struct netns_ipvs {
 	#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
 
 	struct list_head 	rs_table[IP_VS_RTAB_SIZE];
+	/* ip_vs_proto */
+	#define IP_VS_PROTO_TAB_SIZE	32	/* must be power of 2 */
+	struct ip_vs_proto_data *proto_data_table[IP_VS_PROTO_TAB_SIZE];
 
 	/* ip_vs_lblc */
 	int 			sysctl_lblc_expiration;
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index 27bf034..8caaf3e 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -60,6 +60,31 @@ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
 	return 0;
 }
 
+/*
+ *	register an ipvs protocols netns related data
+ */
+static int
+register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
+	struct ip_vs_proto_data *pd =
+			kzalloc(sizeof(struct ip_vs_proto_data), GFP_ATOMIC);
+
+	if (!pd) {
+		pr_err("%s(): no memory.\n", __func__);
+		return -ENOMEM;
+	}
+	pd->pp = pp;	/* For speed issues */
+	pd->next = ipvs->proto_data_table[hash];
+	ipvs->proto_data_table[hash] = pd;
+	atomic_set(&pd->appcnt, 0);	/* Init app counter */
+
+	if (pp->init_netns != NULL)
+		pp->init_netns(net, pd);
+
+	return 0;
+}
 
 /*
  *	unregister an ipvs protocol
@@ -82,6 +107,29 @@ static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
 	return -ESRCH;
 }
 
+/*
+ *	unregister an ipvs protocols netns data
+ */
+static int
+unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_proto_data **pd_p;
+	unsigned hash = IP_VS_PROTO_HASH(pd->pp->protocol);
+
+	pd_p = &ipvs->proto_data_table[hash];
+	for (; *pd_p; pd_p = &(*pd_p)->next) {
+		if (*pd_p == pd) {
+			*pd_p = pd->next;
+			if (pd->pp->exit_netns != NULL)
+				pd->pp->exit_netns(net, pd);
+			kfree(pd);
+			return 0;
+		}
+	}
+
+	return -ESRCH;
+}
 
 /*
  *	get ip_vs_protocol object by its proto.
@@ -100,6 +148,24 @@ struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
 }
 EXPORT_SYMBOL(ip_vs_proto_get);
 
+/*
+ *	get ip_vs_protocol object data by netns and proto
+ */
+struct ip_vs_proto_data *
+ip_vs_proto_data_get(struct net *net, unsigned short proto)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_proto_data *pd;
+	unsigned hash = IP_VS_PROTO_HASH(proto);
+
+	for (pd = ipvs->proto_data_table[hash]; pd; pd = pd->next) {
+		if (pd->pp->protocol == proto)
+			return pd;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(ip_vs_proto_data_get);
 
 /*
  *	Propagate event for state change to all protocols
-- 
1.7.2.3


^ permalink raw reply related

* [*v3 PATCH 03/22] IPVS: netns awareness to lblcr scheduler
From: hans @ 2010-12-30 10:50 UTC (permalink / raw)
  To: horms, ja, daniel.lezcano, wensong, lvs-devel, netdev,
	netfilter-devel
  Cc: Hans Schillstrom
In-Reply-To: <1293706266-27152-1-git-send-email-hans@schillstrom.com>

From: Hans Schillstrom <hans.schillstrom@ericsson.com>

The variable sysctl_ip_vs_lblcr_expiration is moved to the ipvs struct as
    sysctl_lblcr_expiration.

procfs is updated to handle this.

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
---
 include/net/netns/ip_vs.h        |    4 +++
 net/netfilter/ipvs/ip_vs_lblcr.c |   52 +++++++++++++++++++++++++------------
 2 files changed, 39 insertions(+), 17 deletions(-)

diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h
index c22fec2..17e4e3a 100644
--- a/include/net/netns/ip_vs.h
+++ b/include/net/netns/ip_vs.h
@@ -30,6 +30,10 @@ struct netns_ipvs {
 
 	struct list_head 	rs_table[IP_VS_RTAB_SIZE];
 
+	/* ip_vs_lblcr */
+	int 			sysctl_lblcr_expiration;
+	struct ctl_table_header	*lblcr_ctl_header;
+	struct ctl_table	*lblcr_ctl_table;
 };
 
 #endif /* IP_VS_H_ */
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 7c7396a..91568c3 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -70,8 +70,6 @@
  *    entries that haven't been touched for a day.
  */
 #define COUNT_FOR_FULL_EXPIRATION   30
-static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
-
 
 /*
  *     for IPVS lblcr entry hash table
@@ -296,7 +294,7 @@ struct ip_vs_lblcr_table {
 static ctl_table vs_vars_table[] = {
 	{
 		.procname	= "lblcr_expiration",
-		.data		= &sysctl_ip_vs_lblcr_expiration,
+		.data		= NULL,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
@@ -304,8 +302,6 @@ static ctl_table vs_vars_table[] = {
 	{ }
 };
 
-static struct ctl_table_header * sysctl_header;
-
 static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
 {
 	list_del(&en->list);
@@ -425,13 +421,14 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
 	unsigned long now = jiffies;
 	int i, j;
 	struct ip_vs_lblcr_entry *en, *nxt;
+	struct netns_ipvs *ipvs = net_ipvs(svc->net);
 
 	for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
 		j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
 
 		write_lock(&svc->sched_lock);
 		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
-			if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
+			if (time_after(en->lastuse+ipvs->sysctl_lblcr_expiration,
 				       now))
 				continue;
 
@@ -664,6 +661,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 	read_lock(&svc->sched_lock);
 	en = ip_vs_lblcr_get(svc->af, tbl, &iph.daddr);
 	if (en) {
+		struct netns_ipvs *ipvs = net_ipvs(svc->net);
 		/* We only hold a read lock, but this is atomic */
 		en->lastuse = jiffies;
 
@@ -675,7 +673,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 		/* More than one destination + enough time passed by, cleanup */
 		if (atomic_read(&en->set.size) > 1 &&
 				time_after(jiffies, en->set.lastmod +
-				sysctl_ip_vs_lblcr_expiration)) {
+				ipvs->sysctl_lblcr_expiration)) {
 			struct ip_vs_dest *m;
 
 			write_lock(&en->set.lock);
@@ -749,23 +747,43 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
  */
 static int __net_init __ip_vs_lblcr_init(struct net *net)
 {
-	if (!net_eq(net, &init_net))	/* netns not enabled yet */
-		return -EPERM;
-
-	sysctl_header = register_net_sysctl_table(net, net_vs_ctl_path,
-						  vs_vars_table);
-	if (!sysctl_header)
-		return -ENOMEM;
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	if (!net_eq(net, &init_net)) {
+		ipvs->lblcr_ctl_table = kmemdup(vs_vars_table,
+						sizeof(vs_vars_table),
+						GFP_KERNEL);
+		if (ipvs->lblcr_ctl_table == NULL)
+			goto err_dup;
+	} else
+		ipvs->lblcr_ctl_table = vs_vars_table;
+	ipvs->sysctl_lblcr_expiration = 24*60*60*HZ;
+	ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration;
+
+	ipvs->lblcr_ctl_header =
+		register_net_sysctl_table(net, net_vs_ctl_path,
+					  ipvs->lblcr_ctl_table);
+	if (!ipvs->lblcr_ctl_header)
+		goto err_reg;
 
 	return 0;
+
+err_reg:
+	if (!net_eq(net, &init_net))
+		kfree(ipvs->lblcr_ctl_table);
+
+err_dup:
+	return -ENOMEM;
 }
 
 static void __net_exit __ip_vs_lblcr_exit(struct net *net)
 {
-	if (!net_eq(net, &init_net))	/* netns not enabled yet */
-		return;
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	unregister_net_sysctl_table(ipvs->lblcr_ctl_header);
 
-	unregister_net_sysctl_table(sysctl_header);
+	if (!net_eq(net, &init_net))
+		kfree(ipvs->lblcr_ctl_table);
 }
 
 static struct pernet_operations ip_vs_lblcr_ops = {
-- 
1.7.2.3


^ permalink raw reply related

* [*v3 PATCH 02/22] IPVS: netns to services part 1
From: hans @ 2010-12-30 10:50 UTC (permalink / raw)
  To: horms, ja, daniel.lezcano, wensong, lvs-devel, netdev,
	netfilter-devel
  Cc: Hans Schillstrom
In-Reply-To: <1293706266-27152-1-git-send-email-hans@schillstrom.com>

From: Hans Schillstrom <hans.schillstrom@ericsson.com>

The service hash tables now take the netns pointer as a hash argument,
while the Real Servers (rs) table has been moved to the ipvs struct.
Two new inline functions are added to get the net pointer from an skb.

Since ip_vs is called from different contexts, there are two
places to dig for the net pointer: skb->dev or skb->sk.
This is handled in skb_net() and skb_sknet().

Global functions such as ip_vs_service_get() and ip_vs_lookup_real_service()
now take struct net *net as their first parameter.
Where possible, the net pointer is obtained from the skb etc.;
 - if not, &init_net is used at this early stage of patching.

The procfs code in ip_vs_ctl.c is not ready for netns yet.

*v3
 Comments by Julian
- __ip_vs_service_find and __ip_vs_svc_fwm_find are fast path,
  so the net_eq(svc->net, net) check is at the end now.
- net = skb_net(skb) in ip_vs_out moved after check for skb_dst.

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
---
 include/net/ip_vs.h                   |   62 +++++++++-
 include/net/netns/ip_vs.h             |    9 ++
 net/netfilter/ipvs/ip_vs_conn.c       |    2 +-
 net/netfilter/ipvs/ip_vs_core.c       |    4 +-
 net/netfilter/ipvs/ip_vs_ctl.c        |  224 +++++++++++++++++++--------------
 net/netfilter/ipvs/ip_vs_proto_sctp.c |    5 +-
 net/netfilter/ipvs/ip_vs_proto_tcp.c  |    5 +-
 net/netfilter/ipvs/ip_vs_proto_udp.c  |    5 +-
 net/netfilter/ipvs/ip_vs_sync.c       |    2 +-
 9 files changed, 207 insertions(+), 111 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 98f9ebf..70c5462 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -37,6 +37,57 @@ static inline struct netns_ipvs * net_ipvs(struct net* net)
 {
 	return net->ipvs;
 }
+/*
+ * Get net ptr from skb in traffic cases
+ * use skb_sknet when call is from userland (ioctl or netlink)
+ */
+static inline struct net *skb_net(struct sk_buff *skb) {
+#ifdef CONFIG_NET_NS
+#ifdef CONFIG_IP_VS_DEBUG
+	/*
+	 * This is used for debug only.
+	 * Start with the most likely hit
+	 * End with BUG
+	 */
+	if (likely(skb->dev && skb->dev->nd_net))
+		return dev_net(skb->dev);
+	if (skb_dst(skb)->dev)
+		return dev_net(skb_dst(skb)->dev);
+	WARN(skb->sk,"Maybe skb_sknet should be used instead in %s() line:%d\n",
+		      __func__, __LINE__);
+	if (likely(skb->sk && skb->sk->sk_net))
+		return sock_net(skb->sk);
+	pr_err("There is no net ptr to find in the skb in %s() line:%d\n",
+		__func__, __LINE__);
+	BUG();
+#else
+	return dev_net(skb->dev ? : skb_dst(skb)->dev);
+#endif
+#else
+	return &init_net;
+#endif
+}
+
+static inline struct net *skb_sknet(struct sk_buff *skb) {
+#ifdef CONFIG_NET_NS
+#ifdef CONFIG_IP_VS_DEBUG
+	/* Start with the most likely hit */
+	if (likely(skb->sk && skb->sk->sk_net))
+		return sock_net(skb->sk);
+	WARN(skb->dev,"Maybe skb_net should be used instead in %s() line:%d\n",
+		       __func__, __LINE__);
+	if (likely(skb->dev && skb->dev->nd_net))
+		return dev_net(skb->dev);
+	pr_err("There is no net ptr to find in the skb in %s() line:%d\n",
+		__func__, __LINE__);
+	BUG();
+#else
+	return sock_net(skb->sk);
+#endif
+#else
+	return &init_net;
+#endif
+}
 
 /* Connections' size value needed by ip_vs_ctl.c */
 extern int ip_vs_conn_tab_size;
@@ -496,6 +547,7 @@ struct ip_vs_service {
 	unsigned		flags;	  /* service status flags */
 	unsigned		timeout;  /* persistent timeout in ticks */
 	__be32			netmask;  /* grouping granularity */
+	struct net		*net;
 
 	struct list_head	destinations;  /* real server d-linked list */
 	__u32			num_dests;     /* number of servers */
@@ -896,7 +948,7 @@ extern int sysctl_ip_vs_sync_ver;
 
 extern void ip_vs_sync_switch_mode(int mode);
 extern struct ip_vs_service *
-ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
+ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
 		  const union nf_inet_addr *vaddr, __be16 vport);
 
 static inline void ip_vs_service_put(struct ip_vs_service *svc)
@@ -905,7 +957,7 @@ static inline void ip_vs_service_put(struct ip_vs_service *svc)
 }
 
 extern struct ip_vs_dest *
-ip_vs_lookup_real_service(int af, __u16 protocol,
+ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
 			  const union nf_inet_addr *daddr, __be16 dport);
 
 extern int ip_vs_use_count_inc(void);
@@ -913,9 +965,9 @@ extern void ip_vs_use_count_dec(void);
 extern int ip_vs_control_init(void);
 extern void ip_vs_control_cleanup(void);
 extern struct ip_vs_dest *
-ip_vs_find_dest(int af, const union nf_inet_addr *daddr, __be16 dport,
-		const union nf_inet_addr *vaddr, __be16 vport, __u16 protocol,
-		__u32 fwmark);
+ip_vs_find_dest(struct net *net, int af, const union nf_inet_addr *daddr,
+		__be16 dport, const union nf_inet_addr *vaddr, __be16 vport,
+		__u16 protocol, __u32 fwmark);
 extern struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp);
 
 
diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h
index 9068d95..c22fec2 100644
--- a/include/net/netns/ip_vs.h
+++ b/include/net/netns/ip_vs.h
@@ -21,6 +21,15 @@ struct ctl_table_header;
 
 struct netns_ipvs {
 	int			inc;		/* Incarnation */
+	/*
+	 *	Hash table: for real service lookups
+	 */
+	#define IP_VS_RTAB_BITS 4
+	#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
+	#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
+
+	struct list_head 	rs_table[IP_VS_RTAB_SIZE];
+
 };
 
 #endif /* IP_VS_H_ */
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 7c1b502..7a0e79e 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -611,7 +611,7 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
 	struct ip_vs_dest *dest;
 
 	if ((cp) && (!cp->dest)) {
-		dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport,
+		dest = ip_vs_find_dest(&init_net, cp->af, &cp->daddr, cp->dport,
 				       &cp->vaddr, cp->vport,
 				       cp->protocol, cp->fwmark);
 		ip_vs_bind_dest(cp, dest);
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index cd8616e..68ecc7f 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1031,6 +1031,7 @@ drop:
 static unsigned int
 ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
 {
+	struct net *net = NULL;
 	struct ip_vs_iphdr iph;
 	struct ip_vs_protocol *pp;
 	struct ip_vs_conn *cp;
@@ -1054,6 +1055,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
 	if (unlikely(!skb_dst(skb)))
 		return NF_ACCEPT;
 
+	net = skb_net(skb);
 	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6) {
@@ -1119,7 +1121,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
 					  sizeof(_ports), _ports);
 		if (pptr == NULL)
 			return NF_ACCEPT;	/* Not for me */
-		if (ip_vs_lookup_real_service(af, iph.protocol,
+		if (ip_vs_lookup_real_service(net, af, iph.protocol,
 					      &iph.saddr,
 					      pptr[0])) {
 			/*
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 33511f4..45e4b8e 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -290,15 +290,6 @@ static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
 
 /*
- *	Hash table: for real service lookups
- */
-#define IP_VS_RTAB_BITS 4
-#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
-#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
-
-static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
-
-/*
  *	Trash for destinations
  */
 static LIST_HEAD(ip_vs_dest_trash);
@@ -314,7 +305,7 @@ static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
  *	Returns hash value for virtual service
  */
 static __inline__ unsigned
-ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr,
+ip_vs_svc_hashkey(struct net *net, int af, unsigned proto, const union nf_inet_addr *addr,
 		  __be16 port)
 {
 	register unsigned porth = ntohs(port);
@@ -325,6 +316,7 @@ ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr,
 		addr_fold = addr->ip6[0]^addr->ip6[1]^
 			    addr->ip6[2]^addr->ip6[3];
 #endif
+	addr_fold ^= ((size_t)net>>8);
 
 	return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
 		& IP_VS_SVC_TAB_MASK;
@@ -333,13 +325,13 @@ ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr,
 /*
  *	Returns hash value of fwmark for virtual service lookup
  */
-static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
+static __inline__ unsigned ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
 {
-	return fwmark & IP_VS_SVC_TAB_MASK;
+	return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
 }
 
 /*
- *	Hashes a service in the ip_vs_svc_table by <proto,addr,port>
+ *	Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>
  *	or in the ip_vs_svc_fwm_table by fwmark.
  *	Should be called with locked tables.
  */
@@ -355,16 +347,16 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)
 
 	if (svc->fwmark == 0) {
 		/*
-		 *  Hash it by <protocol,addr,port> in ip_vs_svc_table
+		 *  Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
 		 */
-		hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr,
-					 svc->port);
+		hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
+					 &svc->addr, svc->port);
 		list_add(&svc->s_list, &ip_vs_svc_table[hash]);
 	} else {
 		/*
-		 *  Hash it by fwmark in ip_vs_svc_fwm_table
+		 *  Hash it by fwmark in svc_fwm_table
 		 */
-		hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
+		hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
 		list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
 	}
 
@@ -376,7 +368,7 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)
 
 
 /*
- *	Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
+ *	Unhashes a service from svc_table / svc_fwm_table.
  *	Should be called with locked tables.
  */
 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
@@ -388,10 +380,10 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
 	}
 
 	if (svc->fwmark == 0) {
-		/* Remove it from the ip_vs_svc_table table */
+		/* Remove it from the svc_table table */
 		list_del(&svc->s_list);
 	} else {
-		/* Remove it from the ip_vs_svc_fwm_table table */
+		/* Remove it from the svc_fwm_table table */
 		list_del(&svc->f_list);
 	}
 
@@ -402,23 +394,24 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
 
 
 /*
- *	Get service by {proto,addr,port} in the service table.
+ *	Get service by {netns, proto,addr,port} in the service table.
  */
 static inline struct ip_vs_service *
-__ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr,
-		    __be16 vport)
+__ip_vs_service_find(struct net *net, int af, __u16 protocol,
+		     const union nf_inet_addr *vaddr, __be16 vport)
 {
 	unsigned hash;
 	struct ip_vs_service *svc;
 
 	/* Check for "full" addressed entries */
-	hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport);
+	hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
 
 	list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
 		if ((svc->af == af)
 		    && ip_vs_addr_equal(af, &svc->addr, vaddr)
 		    && (svc->port == vport)
-		    && (svc->protocol == protocol)) {
+		    && (svc->protocol == protocol)
+		    && net_eq(svc->net, net)) {
 			/* HIT */
 			return svc;
 		}
@@ -432,16 +425,17 @@ __ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr,
  *	Get service by {fwmark} in the service table.
  */
 static inline struct ip_vs_service *
-__ip_vs_svc_fwm_find(int af, __u32 fwmark)
+__ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
 {
 	unsigned hash;
 	struct ip_vs_service *svc;
 
 	/* Check for fwmark addressed entries */
-	hash = ip_vs_svc_fwm_hashkey(fwmark);
+	hash = ip_vs_svc_fwm_hashkey(net, fwmark);
 
 	list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
-		if (svc->fwmark == fwmark && svc->af == af) {
+		if (svc->fwmark == fwmark && svc->af == af
+		    && net_eq(svc->net, net)) {
 			/* HIT */
 			return svc;
 		}
@@ -451,7 +445,7 @@ __ip_vs_svc_fwm_find(int af, __u32 fwmark)
 }
 
 struct ip_vs_service *
-ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
+ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
 		  const union nf_inet_addr *vaddr, __be16 vport)
 {
 	struct ip_vs_service *svc;
@@ -461,14 +455,14 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
 	/*
 	 *	Check the table hashed by fwmark first
 	 */
-	if (fwmark && (svc = __ip_vs_svc_fwm_find(af, fwmark)))
+	if (fwmark && (svc = __ip_vs_svc_fwm_find(net, af, fwmark)))
 		goto out;
 
 	/*
 	 *	Check the table hashed by <protocol,addr,port>
 	 *	for "full" addressed entries
 	 */
-	svc = __ip_vs_service_find(af, protocol, vaddr, vport);
+	svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
 
 	if (svc == NULL
 	    && protocol == IPPROTO_TCP
@@ -478,7 +472,7 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
 		 * Check if ftp service entry exists, the packet
 		 * might belong to FTP data connections.
 		 */
-		svc = __ip_vs_service_find(af, protocol, vaddr, FTPPORT);
+		svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT);
 	}
 
 	if (svc == NULL
@@ -486,7 +480,7 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
 		/*
 		 * Check if the catch-all port (port zero) exists
 		 */
-		svc = __ip_vs_service_find(af, protocol, vaddr, 0);
+		svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);
 	}
 
   out:
@@ -547,10 +541,10 @@ static inline unsigned ip_vs_rs_hashkey(int af,
 }
 
 /*
- *	Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
+ *	Hashes ip_vs_dest in rs_table by <proto,addr,port>.
  *	should be called with locked tables.
  */
-static int ip_vs_rs_hash(struct ip_vs_dest *dest)
+static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
 {
 	unsigned hash;
 
@@ -564,19 +558,19 @@ static int ip_vs_rs_hash(struct ip_vs_dest *dest)
 	 */
 	hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
 
-	list_add(&dest->d_list, &ip_vs_rtable[hash]);
+	list_add(&dest->d_list, &ipvs->rs_table[hash]);
 
 	return 1;
 }
 
 /*
- *	UNhashes ip_vs_dest from ip_vs_rtable.
+ *	UNhashes ip_vs_dest from rs_table.
  *	should be called with locked tables.
  */
 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
 {
 	/*
-	 * Remove it from the ip_vs_rtable table.
+	 * Remove it from the rs_table table.
 	 */
 	if (!list_empty(&dest->d_list)) {
 		list_del(&dest->d_list);
@@ -590,10 +584,11 @@ static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
  *	Lookup real service by <proto,addr,port> in the real service table.
  */
 struct ip_vs_dest *
-ip_vs_lookup_real_service(int af, __u16 protocol,
+ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
 			  const union nf_inet_addr *daddr,
 			  __be16 dport)
 {
+	struct netns_ipvs *ipvs = net_ipvs(net);
 	unsigned hash;
 	struct ip_vs_dest *dest;
 
@@ -604,7 +599,7 @@ ip_vs_lookup_real_service(int af, __u16 protocol,
 	hash = ip_vs_rs_hashkey(af, daddr, dport);
 
 	read_lock(&__ip_vs_rs_lock);
-	list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
+	list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) {
 		if ((dest->af == af)
 		    && ip_vs_addr_equal(af, &dest->addr, daddr)
 		    && (dest->port == dport)
@@ -654,7 +649,8 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
  * ip_vs_lookup_real_service() looked promissing, but
  * seems not working as expected.
  */
-struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr,
+struct ip_vs_dest *ip_vs_find_dest(struct net  *net, int af,
+				   const union nf_inet_addr *daddr,
 				   __be16 dport,
 				   const union nf_inet_addr *vaddr,
 				   __be16 vport, __u16 protocol, __u32 fwmark)
@@ -662,7 +658,7 @@ struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr,
 	struct ip_vs_dest *dest;
 	struct ip_vs_service *svc;
 
-	svc = ip_vs_service_get(af, fwmark, protocol, vaddr, vport);
+	svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport);
 	if (!svc)
 		return NULL;
 	dest = ip_vs_lookup_dest(svc, daddr, dport);
@@ -770,6 +766,7 @@ static void
 __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
 		    struct ip_vs_dest_user_kern *udest, int add)
 {
+	struct netns_ipvs *ipvs = net_ipvs(svc->net);
 	int conn_flags;
 
 	/* set the weight and the flags */
@@ -782,11 +779,11 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
 		conn_flags |= IP_VS_CONN_F_NOOUTPUT;
 	} else {
 		/*
-		 *    Put the real service in ip_vs_rtable if not present.
+		 *    Put the real service in rs_table if not present.
 		 *    For now only for NAT!
 		 */
 		write_lock_bh(&__ip_vs_rs_lock);
-		ip_vs_rs_hash(dest);
+		ip_vs_rs_hash(ipvs, dest);
 		write_unlock_bh(&__ip_vs_rs_lock);
 	}
 	atomic_set(&dest->conn_flags, conn_flags);
@@ -1119,7 +1116,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
  *	Add a service into the service hash table
  */
 static int
-ip_vs_add_service(struct ip_vs_service_user_kern *u,
+ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
 		  struct ip_vs_service **svc_p)
 {
 	int ret = 0;
@@ -1174,6 +1171,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 	svc->flags = u->flags;
 	svc->timeout = u->timeout * HZ;
 	svc->netmask = u->netmask;
+	svc->net = net;
 
 	INIT_LIST_HEAD(&svc->destinations);
 	rwlock_init(&svc->sched_lock);
@@ -1430,17 +1428,19 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
 /*
  *	Flush all the virtual services
  */
-static int ip_vs_flush(void)
+static int ip_vs_flush(struct net *net)
 {
 	int idx;
 	struct ip_vs_service *svc, *nxt;
 
 	/*
-	 * Flush the service table hashed by <protocol,addr,port>
+	 * Flush the service table hashed by <netns,protocol,addr,port>
 	 */
 	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
-		list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
-			ip_vs_unlink_service(svc);
+		list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx],
+					 s_list) {
+			if (net_eq(svc->net, net))
+				ip_vs_unlink_service(svc);
 		}
 	}
 
@@ -1450,7 +1450,8 @@ static int ip_vs_flush(void)
 	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
 		list_for_each_entry_safe(svc, nxt,
 					 &ip_vs_svc_fwm_table[idx], f_list) {
-			ip_vs_unlink_service(svc);
+			if (net_eq(svc->net, net))
+				ip_vs_unlink_service(svc);
 		}
 	}
 
@@ -1474,20 +1475,22 @@ static int ip_vs_zero_service(struct ip_vs_service *svc)
 	return 0;
 }
 
-static int ip_vs_zero_all(void)
+static int ip_vs_zero_all(struct net *net)
 {
 	int idx;
 	struct ip_vs_service *svc;
 
 	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
 		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
-			ip_vs_zero_service(svc);
+			if (net_eq(svc->net, net))
+				ip_vs_zero_service(svc);
 		}
 	}
 
 	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
 		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
-			ip_vs_zero_service(svc);
+			if (net_eq(svc->net, net))
+				ip_vs_zero_service(svc);
 		}
 	}
 
@@ -1765,6 +1768,7 @@ static struct ctl_table_header * sysctl_header;
 #ifdef CONFIG_PROC_FS
 
 struct ip_vs_iter {
+	struct seq_net_private p;  /* Do not move this, netns depends upon it*/
 	struct list_head *table;
 	int bucket;
 };
@@ -1791,6 +1795,7 @@ static inline const char *ip_vs_fwd_name(unsigned flags)
 /* Get the Nth entry in the two lists */
 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
 {
+	struct net *net = seq_file_net(seq);
 	struct ip_vs_iter *iter = seq->private;
 	int idx;
 	struct ip_vs_service *svc;
@@ -1798,7 +1803,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
 	/* look in hash by protocol */
 	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
 		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
-			if (pos-- == 0){
+			if (net_eq(svc->net, net) && pos-- == 0){
 				iter->table = ip_vs_svc_table;
 				iter->bucket = idx;
 				return svc;
@@ -1809,7 +1814,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
 	/* keep looking in fwmark */
 	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
 		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
-			if (pos-- == 0) {
+			if (net_eq(svc->net, net) && pos-- == 0) {
 				iter->table = ip_vs_svc_fwm_table;
 				iter->bucket = idx;
 				return svc;
@@ -1963,7 +1968,7 @@ static const struct seq_operations ip_vs_info_seq_ops = {
 
 static int ip_vs_info_open(struct inode *inode, struct file *file)
 {
-	return seq_open_private(file, &ip_vs_info_seq_ops,
+	return seq_open_net(inode, file, &ip_vs_info_seq_ops,
 			sizeof(struct ip_vs_iter));
 }
 
@@ -2013,7 +2018,7 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v)
 
 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, ip_vs_stats_show, NULL);
+	return single_open_net(inode, file, ip_vs_stats_show);
 }
 
 static const struct file_operations ip_vs_stats_fops = {
@@ -2115,6 +2120,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
 static int
 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 {
+	struct net *net = sock_net(sk);
 	int ret;
 	unsigned char arg[MAX_ARG_LEN];
 	struct ip_vs_service_user *usvc_compat;
@@ -2149,7 +2155,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 
 	if (cmd == IP_VS_SO_SET_FLUSH) {
 		/* Flush the virtual service */
-		ret = ip_vs_flush();
+		ret = ip_vs_flush(net);
 		goto out_unlock;
 	} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
 		/* Set timeout values for (tcp tcpfin udp) */
@@ -2176,7 +2182,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 	if (cmd == IP_VS_SO_SET_ZERO) {
 		/* if no service address is set, zero counters in all */
 		if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
-			ret = ip_vs_zero_all();
+			ret = ip_vs_zero_all(net);
 			goto out_unlock;
 		}
 	}
@@ -2193,10 +2199,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 
 	/* Lookup the exact service by <protocol, addr, port> or fwmark */
 	if (usvc.fwmark == 0)
-		svc = __ip_vs_service_find(usvc.af, usvc.protocol,
+		svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
 					   &usvc.addr, usvc.port);
 	else
-		svc = __ip_vs_svc_fwm_find(usvc.af, usvc.fwmark);
+		svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
 
 	if (cmd != IP_VS_SO_SET_ADD
 	    && (svc == NULL || svc->protocol != usvc.protocol)) {
@@ -2209,7 +2215,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 		if (svc != NULL)
 			ret = -EEXIST;
 		else
-			ret = ip_vs_add_service(&usvc, &svc);
+			ret = ip_vs_add_service(net, &usvc, &svc);
 		break;
 	case IP_VS_SO_SET_EDIT:
 		ret = ip_vs_edit_service(svc, &usvc);
@@ -2269,7 +2275,7 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
 }
 
 static inline int
-__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
+__ip_vs_get_service_entries(struct net *net, const struct ip_vs_get_services *get,
 			    struct ip_vs_get_services __user *uptr)
 {
 	int idx, count=0;
@@ -2280,7 +2286,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
 	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
 		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
 			/* Only expose IPv4 entries to old interface */
-			if (svc->af != AF_INET)
+			if (svc->af != AF_INET || !net_eq(svc->net, net))
 				continue;
 
 			if (count >= get->num_services)
@@ -2299,7 +2305,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
 	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
 		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
 			/* Only expose IPv4 entries to old interface */
-			if (svc->af != AF_INET)
+			if (svc->af != AF_INET || !net_eq(svc->net, net))
 				continue;
 
 			if (count >= get->num_services)
@@ -2319,7 +2325,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
 }
 
 static inline int
-__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
+__ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
 			 struct ip_vs_get_dests __user *uptr)
 {
 	struct ip_vs_service *svc;
@@ -2327,9 +2333,9 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
 	int ret = 0;
 
 	if (get->fwmark)
-		svc = __ip_vs_svc_fwm_find(AF_INET, get->fwmark);
+		svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
 	else
-		svc = __ip_vs_service_find(AF_INET, get->protocol, &addr,
+		svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
 					   get->port);
 
 	if (svc) {
@@ -2403,7 +2409,9 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 	unsigned char arg[128];
 	int ret = 0;
 	unsigned int copylen;
+	struct net *net = sock_net(sk);
 
+	BUG_ON(!net);
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
 
@@ -2465,7 +2473,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 			ret = -EINVAL;
 			goto out;
 		}
-		ret = __ip_vs_get_service_entries(get, user);
+		ret = __ip_vs_get_service_entries(net, get, user);
 	}
 	break;
 
@@ -2478,9 +2486,9 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 		entry = (struct ip_vs_service_entry *)arg;
 		addr.ip = entry->addr;
 		if (entry->fwmark)
-			svc = __ip_vs_svc_fwm_find(AF_INET, entry->fwmark);
+			svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
 		else
-			svc = __ip_vs_service_find(AF_INET, entry->protocol,
+			svc = __ip_vs_service_find(net, AF_INET, entry->protocol,
 						   &addr, entry->port);
 		if (svc) {
 			ip_vs_copy_service(entry, svc);
@@ -2504,7 +2512,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 			ret = -EINVAL;
 			goto out;
 		}
-		ret = __ip_vs_get_dest_entries(get, user);
+		ret = __ip_vs_get_dest_entries(net, get, user);
 	}
 	break;
 
@@ -2724,11 +2732,12 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
 	int idx = 0, i;
 	int start = cb->args[0];
 	struct ip_vs_service *svc;
+	struct net *net = skb_sknet(skb);
 
 	mutex_lock(&__ip_vs_mutex);
 	for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
 		list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
-			if (++idx <= start)
+			if (++idx <= start || !net_eq(svc->net, net))
 				continue;
 			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
 				idx--;
@@ -2739,7 +2748,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
 
 	for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
 		list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
-			if (++idx <= start)
+			if (++idx <= start || !net_eq(svc->net, net))
 				continue;
 			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
 				idx--;
@@ -2755,7 +2764,8 @@ nla_put_failure:
 	return skb->len;
 }
 
-static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
+static int ip_vs_genl_parse_service(struct net *net,
+				    struct ip_vs_service_user_kern *usvc,
 				    struct nlattr *nla, int full_entry,
 				    struct ip_vs_service **ret_svc)
 {
@@ -2798,9 +2808,9 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
 	}
 
 	if (usvc->fwmark)
-		svc = __ip_vs_svc_fwm_find(usvc->af, usvc->fwmark);
+		svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
 	else
-		svc = __ip_vs_service_find(usvc->af, usvc->protocol,
+		svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
 					   &usvc->addr, usvc->port);
 	*ret_svc = svc;
 
@@ -2837,13 +2847,14 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
 	return 0;
 }
 
-static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla)
+static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
+						     struct nlattr *nla)
 {
 	struct ip_vs_service_user_kern usvc;
 	struct ip_vs_service *svc;
 	int ret;
 
-	ret = ip_vs_genl_parse_service(&usvc, nla, 0, &svc);
+	ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc);
 	return ret ? ERR_PTR(ret) : svc;
 }
 
@@ -2911,6 +2922,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
 	struct ip_vs_service *svc;
 	struct ip_vs_dest *dest;
 	struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
+	struct net *net;
 
 	mutex_lock(&__ip_vs_mutex);
 
@@ -2919,7 +2931,8 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
 			IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
 		goto out_err;
 
-	svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]);
+	net = skb_sknet(skb);
+	svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
 	if (IS_ERR(svc) || svc == NULL)
 		goto out_err;
 
@@ -3104,13 +3117,15 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
 	struct ip_vs_dest_user_kern udest;
 	int ret = 0, cmd;
 	int need_full_svc = 0, need_full_dest = 0;
+	struct net *net;
 
+	net = skb_sknet(skb);
 	cmd = info->genlhdr->cmd;
 
 	mutex_lock(&__ip_vs_mutex);
 
 	if (cmd == IPVS_CMD_FLUSH) {
-		ret = ip_vs_flush();
+		ret = ip_vs_flush(net);
 		goto out;
 	} else if (cmd == IPVS_CMD_SET_CONFIG) {
 		ret = ip_vs_genl_set_config(info->attrs);
@@ -3135,7 +3150,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
 		goto out;
 	} else if (cmd == IPVS_CMD_ZERO &&
 		   !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
-		ret = ip_vs_zero_all();
+		ret = ip_vs_zero_all(net);
 		goto out;
 	}
 
@@ -3145,7 +3160,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
 	if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
 		need_full_svc = 1;
 
-	ret = ip_vs_genl_parse_service(&usvc,
+	ret = ip_vs_genl_parse_service(net, &usvc,
 				       info->attrs[IPVS_CMD_ATTR_SERVICE],
 				       need_full_svc, &svc);
 	if (ret)
@@ -3175,7 +3190,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
 	switch (cmd) {
 	case IPVS_CMD_NEW_SERVICE:
 		if (svc == NULL)
-			ret = ip_vs_add_service(&usvc, &svc);
+			ret = ip_vs_add_service(net, &usvc, &svc);
 		else
 			ret = -EEXIST;
 		break;
@@ -3213,7 +3228,9 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
 	struct sk_buff *msg;
 	void *reply;
 	int ret, cmd, reply_cmd;
+	struct net *net;
 
+	net = skb_sknet(skb);
 	cmd = info->genlhdr->cmd;
 
 	if (cmd == IPVS_CMD_GET_SERVICE)
@@ -3242,7 +3259,8 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
 	{
 		struct ip_vs_service *svc;
 
-		svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]);
+		svc = ip_vs_genl_find_service(net,
+					      info->attrs[IPVS_CMD_ATTR_SERVICE]);
 		if (IS_ERR(svc)) {
 			ret = PTR_ERR(svc);
 			goto out_err;
@@ -3413,9 +3431,16 @@ static void ip_vs_genl_unregister(void)
  */
 int __net_init __ip_vs_control_init(struct net *net)
 {
+	int idx;
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
 	if (!net_eq(net, &init_net))	/* netns not enabled yet */
 		return -EPERM;
 
+	for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
+		INIT_LIST_HEAD(&ipvs->rs_table[idx]);
+	}
+
 	proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops);
 	proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops);
 	sysctl_header = register_net_sysctl_table(net, net_vs_ctl_path, vs_vars);
@@ -3446,43 +3471,48 @@ static struct pernet_operations ipvs_control_ops = {
 
 int __init ip_vs_control_init(void)
 {
-	int ret;
 	int idx;
+	int ret;
 
 	EnterFunction(2);
 
-	/* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
+	/* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */
 	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
 		INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
 		INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
 	}
-	for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
-		INIT_LIST_HEAD(&ip_vs_rtable[idx]);
+
+	ret = register_pernet_subsys(&ipvs_control_ops);
+	if (ret) {
+		pr_err("cannot register namespace.\n");
+		goto err;
 	}
-	smp_wmb();
+
+	smp_wmb();	/* Do wee really need it now ? */
 
 	ret = nf_register_sockopt(&ip_vs_sockopts);
 	if (ret) {
 		pr_err("cannot register sockopt.\n");
-		return ret;
+		goto err_net;
 	}
 
 	ret = ip_vs_genl_register();
 	if (ret) {
 		pr_err("cannot register Generic Netlink interface.\n");
 		nf_unregister_sockopt(&ip_vs_sockopts);
-		return ret;
+		goto err_net;
 	}
 
-	ret = register_pernet_subsys(&ipvs_control_ops);
-	if (ret)
-		return ret;
-
 	/* Hook the defense timer */
 	schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
 
 	LeaveFunction(2);
 	return 0;
+
+err_net:
+	unregister_pernet_subsys(&ipvs_control_ops);
+err:
+	return ret;
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index a315159..521b827 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -12,6 +12,7 @@ static int
 sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		   int *verdict, struct ip_vs_conn **cpp)
 {
+	struct net *net;
 	struct ip_vs_service *svc;
 	sctp_chunkhdr_t _schunkh, *sch;
 	sctp_sctphdr_t *sh, _sctph;
@@ -27,9 +28,9 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 				 sizeof(_schunkh), &_schunkh);
 	if (sch == NULL)
 		return 0;
-
+	net = skb_net(skb);
 	if ((sch->type == SCTP_CID_INIT) &&
-	    (svc = ip_vs_service_get(af, skb->mark, iph.protocol,
+	    (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
 				     &iph.daddr, sh->dest))) {
 		int ignored;
 
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 1cdab12..5e4da60 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -31,6 +31,7 @@ static int
 tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		  int *verdict, struct ip_vs_conn **cpp)
 {
+	struct net *net;
 	struct ip_vs_service *svc;
 	struct tcphdr _tcph, *th;
 	struct ip_vs_iphdr iph;
@@ -42,10 +43,10 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		*verdict = NF_DROP;
 		return 0;
 	}
-
+	net = skb_net(skb);
 	/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
 	if (th->syn &&
-	    (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
+	    (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol, &iph.daddr,
 				     th->dest))) {
 		int ignored;
 
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index cd398de..5ab54f6 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -31,6 +31,7 @@ static int
 udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		  int *verdict, struct ip_vs_conn **cpp)
 {
+	struct net *net;
 	struct ip_vs_service *svc;
 	struct udphdr _udph, *uh;
 	struct ip_vs_iphdr iph;
@@ -42,8 +43,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		*verdict = NF_DROP;
 		return 0;
 	}
-
-	svc = ip_vs_service_get(af, skb->mark, iph.protocol,
+	net = skb_net(skb);
+	svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
 				&iph.daddr, uh->dest);
 	if (svc) {
 		int ignored;
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index ea390f8..df74b2c 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -749,7 +749,7 @@ static void ip_vs_proc_conn(struct ip_vs_conn_param *param,  unsigned flags,
 		 * If it is not found the connection will remain unbound
 		 * but still handled.
 		 */
-		dest = ip_vs_find_dest(type, daddr, dport, param->vaddr,
+		dest = ip_vs_find_dest(&init_net, type, daddr, dport, param->vaddr,
 				       param->vport, protocol, fwmark);
 
 		/*  Set the approprite ativity flag */
-- 
1.7.2.3


^ permalink raw reply related

* [*v3 PATCH 01/22] IPVS: netns, add basic init per netns.
From: hans @ 2010-12-30 10:50 UTC (permalink / raw)
  To: horms, ja, daniel.lezcano, wensong, lvs-devel, netdev,
	netfilter-devel
  Cc: Hans Schillstrom
In-Reply-To: <1293706266-27152-1-git-send-email-hans@schillstrom.com>

From: Hans Schillstrom <hans.schillstrom@ericsson.com>

Preparation for network name-space init; at this stage
some empty functions exist.

In most files there is a check for the root ns, i.e. init_net:
if (!net_eq(net, &init_net))
        return ...
This check will be removed by the last patch, when enabling name-space.

*v3
 ip_vs_conn.c merge error corrected.
 net_ipvs #ifdef removed as suggested by Jan Engelhardt

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
---
 include/net/ip_vs.h              |   11 ++++++
 include/net/net_namespace.h      |    2 +
 include/net/netns/ip_vs.h        |   26 +++++++++++++++
 net/netfilter/ipvs/ip_vs_app.c   |   32 +++++++++++++++---
 net/netfilter/ipvs/ip_vs_conn.c  |   34 +++++++++++++++++---
 net/netfilter/ipvs/ip_vs_core.c  |   63 ++++++++++++++++++++++++++++++++++++-
 net/netfilter/ipvs/ip_vs_ctl.c   |   48 +++++++++++++++++++++++-----
 net/netfilter/ipvs/ip_vs_est.c   |   20 +++++++++++-
 net/netfilter/ipvs/ip_vs_ftp.c   |   34 ++++++++++++++++++--
 net/netfilter/ipvs/ip_vs_lblc.c  |   37 ++++++++++++++++++++--
 net/netfilter/ipvs/ip_vs_lblcr.c |   38 ++++++++++++++++++++--
 net/netfilter/ipvs/ip_vs_proto.c |   19 +++++++++++
 net/netfilter/ipvs/ip_vs_sync.c  |   28 +++++++++++++++++
 13 files changed, 357 insertions(+), 35 deletions(-)
 create mode 100644 include/net/netns/ip_vs.h

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index d858264..98f9ebf 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -28,6 +28,15 @@
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 #include <net/netfilter/nf_conntrack.h>
 #endif
+#include <net/net_namespace.h>		/* Netw namespace */
+
+/*
+ * Generic access of ipvs struct
+ */
+static inline struct netns_ipvs * net_ipvs(struct net* net)
+{
+	return net->ipvs;
+}

 /* Connections' size value needed by ip_vs_ctl.c */
 extern int ip_vs_conn_tab_size;
@@ -922,6 +931,8 @@ extern char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
 extern int start_sync_thread(int state, char *mcast_ifn, __u8 syncid);
 extern int stop_sync_thread(int state);
 extern void ip_vs_sync_conn(struct ip_vs_conn *cp);
+extern int ip_vs_sync_init(void);
+extern void ip_vs_sync_cleanup(void);


 /*
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 1bf812b..b3b4a34 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -20,6 +20,7 @@
 #include <net/netns/conntrack.h>
 #endif
 #include <net/netns/xfrm.h>
+#include <net/netns/ip_vs.h>

 struct proc_dir_entry;
 struct net_device;
@@ -94,6 +95,7 @@ struct net {
 #ifdef CONFIG_XFRM
 	struct netns_xfrm	xfrm;
 #endif
+	struct netns_ipvs	*ipvs;
 };


diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h
new file mode 100644
index 0000000..9068d95
--- /dev/null
+++ b/include/net/netns/ip_vs.h
@@ -0,0 +1,26 @@
+/*
+ * ip_vs.h
+ *
+ *  Created on: Nov 23, 2010
+ *  Author: hans
+ */
+
+#ifndef IP_VS_H_
+#define IP_VS_H_
+
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/list_nulls.h>
+#include <linux/ip_vs.h>
+#include <asm/atomic.h>
+#include <linux/in.h>
+
+struct ip_vs_stats;
+struct ip_vs_sync_buff;
+struct ctl_table_header;
+
+struct netns_ipvs {
+	int			inc;		/* Incarnation */
+};
+
+#endif /* IP_VS_H_ */
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index a475ede..6d10352 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -264,7 +264,7 @@ static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
 	 *	for all packets	before most recent resized pkt seq.
 	 */
 	if (vseq->delta || vseq->previous_delta) {
-		if(after(seq, vseq->init_seq)) {
+		if (after(seq, vseq->init_seq)) {
 			th->seq = htonl(seq + vseq->delta);
 			IP_VS_DBG(9, "%s(): added delta (%d) to seq\n",
 				  __func__, vseq->delta);
@@ -293,7 +293,7 @@ vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
 	if (vseq->delta || vseq->previous_delta) {
 		/* since ack_seq is the number of octet that is expected
 		   to receive next, so compare it with init_seq+delta */
-		if(after(ack_seq, vseq->init_seq+vseq->delta)) {
+		if (after(ack_seq, vseq->init_seq+vseq->delta)) {
 			th->ack_seq = htonl(ack_seq - vseq->delta);
 			IP_VS_DBG(9, "%s(): subtracted delta "
 				  "(%d) from ack_seq\n", __func__, vseq->delta);
@@ -569,15 +569,35 @@ static const struct file_operations ip_vs_app_fops = {
 };
 #endif

-int __init ip_vs_app_init(void)
+static int __net_init __ip_vs_app_init(struct net *net)
 {
-	/* we will replace it with proc_net_ipvs_create() soon */
-	proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops);
+	if (!net_eq(net, &init_net))	/* netns not enabled yet */
+		return -EPERM;
+
+	proc_net_fops_create(net, "ip_vs_app", 0, &ip_vs_app_fops);
 	return 0;
 }

+static void __net_exit __ip_vs_app_cleanup(struct net *net)
+{
+	proc_net_remove(net, "ip_vs_app");
+}
+
+static struct pernet_operations ip_vs_app_ops = {
+	.init = __ip_vs_app_init,
+	.exit = __ip_vs_app_cleanup,
+};
+
+int __init ip_vs_app_init(void)
+{
+	int rv;
+
+	rv = register_pernet_subsys(&ip_vs_app_ops);
+	return rv;
+}
+

 void ip_vs_app_cleanup(void)
 {
-	proc_net_remove(&init_net, "ip_vs_app");
+	unregister_pernet_subsys(&ip_vs_app_ops);
 }
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 66e4662..7c1b502 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -1201,11 +1201,36 @@ static void ip_vs_conn_flush(void)
 		goto flush_again;
 	}
 }
+/*
+ * per netns init and exit
+ */
+int __net_init __ip_vs_conn_init(struct net *net)
+{
+	if (!net_eq(net, &init_net))	/* netns not enabled yet */
+		return -EPERM;

+	proc_net_fops_create(net, "ip_vs_conn", 0, &ip_vs_conn_fops);
+	proc_net_fops_create(net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
+	return 0;
+}
+
+static void __net_exit __ip_vs_conn_cleanup(struct net *net)
+{
+	if (!net_eq(net, &init_net))	/* netns not enabled yet */
+		return;
+
+	proc_net_remove(net, "ip_vs_conn");
+	proc_net_remove(net, "ip_vs_conn_sync");
+}
+static struct pernet_operations ipvs_conn_ops = {
+	.init = __ip_vs_conn_init,
+	.exit = __ip_vs_conn_cleanup,
+};

 int __init ip_vs_conn_init(void)
 {
 	int idx;
+	int retc;

 	/* Compute size and mask */
 	ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
@@ -1243,24 +1268,21 @@ int __init ip_vs_conn_init(void)
 		rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
 	}

-	proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops);
-	proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
+	retc = register_pernet_subsys(&ipvs_conn_ops);

 	/* calculate the random value for connection hash */
 	get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));

-	return 0;
+	return retc;
 }

-
 void ip_vs_conn_cleanup(void)
 {
+	unregister_pernet_subsys(&ipvs_conn_ops);
 	/* flush all the connection entries first */
 	ip_vs_conn_flush();

 	/* Release the empty cache */
 	kmem_cache_destroy(ip_vs_conn_cachep);
-	proc_net_remove(&init_net, "ip_vs_conn");
-	proc_net_remove(&init_net, "ip_vs_conn_sync");
 	vfree(ip_vs_conn_tab);
 }
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 5287771..cd8616e 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -41,6 +41,7 @@
 #include <net/icmp.h>                   /* for icmp_send */
 #include <net/route.h>
 #include <net/ip6_checksum.h>
+#include <net/netns/generic.h>		/* net_generic() */

 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
@@ -68,6 +69,12 @@ EXPORT_SYMBOL(ip_vs_conn_put);
 EXPORT_SYMBOL(ip_vs_get_debug_level);
 #endif

+int ip_vs_net_id __read_mostly;
+#ifdef IP_VS_GENERIC_NETNS
+EXPORT_SYMBOL(ip_vs_net_id);
+#endif
+/* netns cnt used for uniqueness */
+static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);

 /* ID used in ICMP lookups */
 #define icmp_id(icmph)          (((icmph)->un).echo.id)
@@ -1813,6 +1820,44 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
 #endif
 };

+/*
+ *	Initialize IP Virtual Server netns mem.
+ */
+static int __net_init  __ip_vs_init(struct net *net)
+{
+	struct netns_ipvs *ipvs;
+
+	if (!net_eq(net, &init_net)) {
+		pr_err("The final patch for enabling netns is missing\n");
+		return -EPERM;
+	}
+	ipvs = (struct netns_ipvs *)net_generic(net, ip_vs_net_id);
+	if (ipvs == NULL) {
+		pr_err("%s(): no memory.\n", __func__);
+		return -ENOMEM;
+	}
+	/* Incarnation counters used for creating unique names */
+	ipvs->inc = atomic_read(&ipvs_netns_cnt);
+	atomic_inc(&ipvs_netns_cnt);
+	net->ipvs = ipvs;
+	printk(KERN_INFO "IPVS: Creating netns size=%lu id=%d\n",
+			 sizeof(struct netns_ipvs), ipvs->inc);
+	return 0;
+}
+
+static void __net_exit __ip_vs_cleanup(struct net *net)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	IP_VS_DBG(10, "ipvs netns %d released\n", ipvs->inc);
+}
+
+static struct pernet_operations ipvs_core_ops = {
+	.init = __ip_vs_init,
+	.exit = __ip_vs_cleanup,
+	.id   = &ip_vs_net_id,
+	.size = sizeof(struct netns_ipvs),
+};

 /*
  *	Initialize IP Virtual Server
@@ -1821,8 +1866,11 @@ static int __init ip_vs_init(void)
 {
 	int ret;

-	ip_vs_estimator_init();
+	ret = register_pernet_subsys(&ipvs_core_ops);	/* Alloc ip_vs struct */
+	if (ret < 0)
+		return ret;

+	ip_vs_estimator_init();
 	ret = ip_vs_control_init();
 	if (ret < 0) {
 		pr_err("can't setup control.\n");
@@ -1843,15 +1891,23 @@ static int __init ip_vs_init(void)
 		goto cleanup_app;
 	}

+	ret = ip_vs_sync_init();
+	if (ret < 0) {
+		pr_err("can't setup sync data.\n");
+		goto cleanup_conn;
+	}
+
 	ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
 	if (ret < 0) {
 		pr_err("can't register hooks.\n");
-		goto cleanup_conn;
+		goto cleanup_sync;
 	}

 	pr_info("ipvs loaded.\n");
 	return ret;

+  cleanup_sync:
+	ip_vs_sync_cleanup();
   cleanup_conn:
 	ip_vs_conn_cleanup();
   cleanup_app:
@@ -1861,17 +1917,20 @@ static int __init ip_vs_init(void)
 	ip_vs_control_cleanup();
   cleanup_estimator:
 	ip_vs_estimator_cleanup();
+	unregister_pernet_subsys(&ipvs_core_ops);	/* free ip_vs struct */
 	return ret;
 }

 static void __exit ip_vs_cleanup(void)
 {
 	nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+	ip_vs_sync_cleanup();
 	ip_vs_conn_cleanup();
 	ip_vs_app_cleanup();
 	ip_vs_protocol_cleanup();
 	ip_vs_control_cleanup();
 	ip_vs_estimator_cleanup();
+	unregister_pernet_subsys(&ipvs_core_ops);	/* free ip_vs struct */
 	pr_info("ipvs unloaded.\n");
 }

diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index d12a13c..33511f4 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -3408,6 +3408,41 @@ static void ip_vs_genl_unregister(void)

 /* End of Generic Netlink interface definitions */

+/*
+ * per netns init/exit func.
+ */
+int __net_init __ip_vs_control_init(struct net *net)
+{
+	if (!net_eq(net, &init_net))	/* netns not enabled yet */
+		return -EPERM;
+
+	proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops);
+	proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops);
+	sysctl_header = register_net_sysctl_table(net, net_vs_ctl_path, vs_vars);
+	if (sysctl_header == NULL)
+		goto err_reg;
+	ip_vs_new_estimator(&ip_vs_stats);
+	return 0;
+
+err_reg:
+	return -ENOMEM;
+}
+
+static void __net_exit __ip_vs_control_cleanup(struct net *net)
+{
+	if (!net_eq(net, &init_net))	/* netns not enabled yet */
+		return;
+
+	ip_vs_kill_estimator(&ip_vs_stats);
+	unregister_net_sysctl_table(sysctl_header);
+	proc_net_remove(net, "ip_vs_stats");
+	proc_net_remove(net, "ip_vs");
+}
+
+static struct pernet_operations ipvs_control_ops = {
+	.init = __ip_vs_control_init,
+	.exit = __ip_vs_control_cleanup,
+};

 int __init ip_vs_control_init(void)
 {
@@ -3439,12 +3474,9 @@ int __init ip_vs_control_init(void)
 		return ret;
 	}

-	proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops);
-	proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops);
-
-	sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars);
-
-	ip_vs_new_estimator(&ip_vs_stats);
+	ret = register_pernet_subsys(&ipvs_control_ops);
+	if (ret)
+		return ret;

 	/* Hook the defense timer */
 	schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
@@ -3461,9 +3493,7 @@ void ip_vs_control_cleanup(void)
 	cancel_rearming_delayed_work(&defense_work);
 	cancel_work_sync(&defense_work.work);
 	ip_vs_kill_estimator(&ip_vs_stats);
-	unregister_sysctl_table(sysctl_header);
-	proc_net_remove(&init_net, "ip_vs_stats");
-	proc_net_remove(&init_net, "ip_vs");
+	unregister_pernet_subsys(&ipvs_control_ops);
 	ip_vs_genl_unregister();
 	nf_unregister_sockopt(&ip_vs_sockopts);
 	LeaveFunction(2);
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index ff28801..7417a0c 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -157,13 +157,31 @@ void ip_vs_zero_estimator(struct ip_vs_stats *stats)
 	est->outbps = 0;
 }

+static int __net_init __ip_vs_estimator_init(struct net *net)
+{
+	if (!net_eq(net, &init_net))	/* netns not enabled yet */
+		return -EPERM;
+
+	return 0;
+}
+
+static struct pernet_operations ip_vs_app_ops = {
+	.init = __ip_vs_estimator_init,
+};
+
 int __init ip_vs_estimator_init(void)
 {
+	int rv;
+
+	rv = register_pernet_subsys(&ip_vs_app_ops);
+	if (rv < 0)
+		return rv;
 	mod_timer(&est_timer, jiffies + 2 * HZ);
-	return 0;
+	return rv;
 }

 void ip_vs_estimator_cleanup(void)
 {
 	del_timer_sync(&est_timer);
+	unregister_pernet_subsys(&ip_vs_app_ops);
 }
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 84aef65..0e762f3 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -399,15 +399,17 @@ static struct ip_vs_app ip_vs_ftp = {
 	.pkt_in =	ip_vs_ftp_in,
 };

-
 /*
- *	ip_vs_ftp initialization
+ *	per netns ip_vs_ftp initialization
  */
-static int __init ip_vs_ftp_init(void)
+static int __net_init __ip_vs_ftp_init(struct net *net)
 {
 	int i, ret;
 	struct ip_vs_app *app = &ip_vs_ftp;

+	if (!net_eq(net, &init_net))	/* netns not enabled yet */
+		return -EPERM;
+
 	ret = register_ip_vs_app(app);
 	if (ret)
 		return ret;
@@ -427,14 +429,38 @@ static int __init ip_vs_ftp_init(void)

 	return ret;
 }
+/*
+ *	netns exit
+ */
+static void __ip_vs_ftp_exit(struct net *net)
+{
+	struct ip_vs_app *app = &ip_vs_ftp;
+
+	if (!net_eq(net, &init_net))	/* netns not enabled yet */
+		return;
+
+	unregister_ip_vs_app(app);
+}
+
+static struct pernet_operations ip_vs_ftp_ops = {
+	.init = __ip_vs_ftp_init,
+	.exit = __ip_vs_ftp_exit,
+};

+int __init ip_vs_ftp_init(void)
+{
+	int rv;
+
+	rv = register_pernet_subsys(&ip_vs_ftp_ops);
+	return rv;
+}

 /*
  *	ip_vs_ftp finish.
  */
 static void __exit ip_vs_ftp_exit(void)
 {
-	unregister_ip_vs_app(&ip_vs_ftp);
+	unregister_pernet_subsys(&ip_vs_ftp_ops);
 }


diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 9323f89..84278fb 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -543,23 +543,54 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler =
 	.schedule =		ip_vs_lblc_schedule,
 };

+/*
+ *  per netns init.
+ */
+static int __net_init __ip_vs_lblc_init(struct net *net)
+{
+	if (!net_eq(net, &init_net))	/* netns not enabled yet */
+		return -EPERM;
+
+	sysctl_header = register_net_sysctl_table(net, net_vs_ctl_path,
+						  vs_vars_table);
+	if (!sysctl_header)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void __net_exit __ip_vs_lblc_exit(struct net *net)
+{
+	if (!net_eq(net, &init_net))	/* netns not enabled yet */
+		return;
+
+	unregister_net_sysctl_table(sysctl_header);
+}
+
+static struct pernet_operations ip_vs_lblc_ops = {
+	.init = __ip_vs_lblc_init,
+	.exit = __ip_vs_lblc_exit,
+};

 static int __init ip_vs_lblc_init(void)
 {
 	int ret;

-	sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
+	ret = register_pernet_subsys(&ip_vs_lblc_ops);
+	if (ret)
+		return ret;
+
 	ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
 	if (ret)
-		unregister_sysctl_table(sysctl_header);
+		unregister_pernet_subsys(&ip_vs_lblc_ops);
 	return ret;
 }


 static void __exit ip_vs_lblc_cleanup(void)
 {
-	unregister_sysctl_table(sysctl_header);
 	unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
+	unregister_pernet_subsys(&ip_vs_lblc_ops);
 }


diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index dbeed8e..7c7396a 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -744,23 +744,53 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
 	.schedule =		ip_vs_lblcr_schedule,
 };

+/*
+ *  per netns init.
+ */
+static int __net_init __ip_vs_lblcr_init(struct net *net)
+{
+	if (!net_eq(net, &init_net))	/* netns not enabled yet */
+		return -EPERM;
+
+	sysctl_header = register_net_sysctl_table(net, net_vs_ctl_path,
+						  vs_vars_table);
+	if (!sysctl_header)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void __net_exit __ip_vs_lblcr_exit(struct net *net)
+{
+	if (!net_eq(net, &init_net))	/* netns not enabled yet */
+		return;
+
+	unregister_net_sysctl_table(sysctl_header);
+}
+
+static struct pernet_operations ip_vs_lblcr_ops = {
+	.init = __ip_vs_lblcr_init,
+	.exit = __ip_vs_lblcr_exit,
+};

 static int __init ip_vs_lblcr_init(void)
 {
 	int ret;

-	sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
+	ret = register_pernet_subsys(&ip_vs_lblcr_ops);
+	if (ret)
+		return ret;
+
 	ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
 	if (ret)
-		unregister_sysctl_table(sysctl_header);
+		unregister_pernet_subsys(&ip_vs_lblcr_ops);
 	return ret;
 }

-
 static void __exit ip_vs_lblcr_cleanup(void)
 {
-	unregister_sysctl_table(sysctl_header);
 	unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
+	unregister_pernet_subsys(&ip_vs_lblcr_ops);
 }


diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index c539983..27bf034 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -236,6 +236,23 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
 		ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg);
 }

+/*
+ * per network name-space init
+ */
+static int  __net_init  __ip_vs_protocol_init(struct net *net)
+{
+	return 0;
+}
+
+static void __net_exit __ip_vs_protocol_cleanup(struct net *net)
+{
+	/* empty */
+}
+
+static struct pernet_operations ipvs_proto_ops = {
+	.init = __ip_vs_protocol_init,
+	.exit = __ip_vs_protocol_cleanup,
+};

 int __init ip_vs_protocol_init(void)
 {
@@ -265,6 +282,7 @@ int __init ip_vs_protocol_init(void)
 	REGISTER_PROTOCOL(&ip_vs_protocol_esp);
 #endif
 	pr_info("Registered protocols (%s)\n", &protocols[2]);
+	return register_pernet_subsys(&ipvs_proto_ops);

 	return 0;
 }
@@ -275,6 +293,7 @@ void ip_vs_protocol_cleanup(void)
 	struct ip_vs_protocol *pp;
 	int i;

+	unregister_pernet_subsys(&ipvs_proto_ops);
 	/* unregister all the ipvs protocols */
 	for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
 		while ((pp = ip_vs_proto_table[i]) != NULL)
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index c1c167a..ea390f8 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1639,3 +1639,31 @@ int stop_sync_thread(int state)

 	return 0;
 }
+
+/*
+ * Initialize data struct for each netns
+ */
+static int __net_init __ip_vs_sync_init(struct net *net)
+{
+	return 0;
+}
+
+static void __ip_vs_sync_cleanup(struct net *net)
+{
+	return;
+}
+static struct pernet_operations ipvs_sync_ops = {
+	.init = __ip_vs_sync_init,
+	.exit = __ip_vs_sync_cleanup,
+};
+
+
+int __init ip_vs_sync_init(void)
+{
+	return register_pernet_subsys(&ipvs_sync_ops);
+}
+
+void __exit ip_vs_sync_cleanup(void)
+{
+	unregister_pernet_subsys(&ipvs_sync_ops);
+}
--
1.7.2.3


^ permalink raw reply related

* [*v3 PATCH 00/22] IPVS Network Name Space aware
From: hans @ 2010-12-30 10:50 UTC (permalink / raw)
  To: horms, ja, daniel.lezcano, wensong, lvs-devel, netdev,
	netfilter-devel
  Cc: Hans Schillstrom

From: Hans Schillstrom <hans.schillstrom@ericsson.com>

This patch series adds network name space support to the LVS.

REVISION

This is version 3

OVERVIEW

The patch doesn't remove or add any functionality except for netns.
For users that don't use network name space (netns) this patch is
completely transparent.

Now it's possible to run LVS in a Linux container (see lxc-tools),
i.e. a lightweight virtualization. For example, it's possible to run
one or several LVS instances on a real server, each in its own network
name space. From the LVS point of view it looks like it runs on its own
machine.

IMPLEMENTATION
Basic requirements for netns awareness
 - Global variables have to be moved to dynamically allocated memory.
 - No or very little performance loss

The large hash tables (the connection hash and the service hashes) still
reside in global memory, with the net pointer added to the hash key.
Most global variables now reside in a struct ipvs { } in netns/ip_vs.h.
The per name space size is 2096 bytes (for x86_64) and a little bit less
for 32-bit architectures.

Statistics counters are now lock-free, i.e. incremented per CPU;
the estimator sums them when it uses them.

Procfs: ip_vs_stats_percpu is added to show the per-CPU counters,
e.g.
# cat /proc/net/ip_vs_stats_percpu
       Total Incoming Outgoing         Incoming         Outgoing
CPU    Conns  Packets  Packets            Bytes            Bytes
  0        0        3        1               9D               34
  1        0        1        2               49               70
  2        0        1        2               34               76
  3        1        2        2               70               74
  ~        1        7        7              18A              18E

     Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s
           0        0        0                0                0

Algorithm files are untouched except for lblc and lblcr.

STEP BY STEP
The first patch creates network name space init functions for all files
that need them. However, if a new name space is created, an error is
returned. This restriction will be removed in the last patch.

When the net pointer isn't available, init_net is used temporarily.

CHANGES

*v2
The patches are totally reworked so that each patch compiles ...
Depends on the IPv6 and Persistence Backup patch.
Common hash-table per name-space for connections and services
Stats per CPU
smaller changes in lblc and lblcr
Triggered by Julian's comment:
"tcp_timeout_change should work with the new struct ip_vs_proto_data
        so that tcp_state_table will go to pd->state_table
        and set_tcp_state will get pd instead of pp"

*v3
  Changes are mostly triggered by review comments; see individual patches for details.

  procfs ip_vs_stats remains the same, ip_vs_stats_percpu added instead.
  Unused functions removed.
  The pp -> pd conversion should start from functions like ip_vs_out()....
  Timer per ns instead of a common timer in estimator.
  Moved net compare to the end in "fast path"
  __ip_vs_mutex remains global



PATCH SET
This patch set is based upon lvs-test-2.6 / v2.6.37-rc1
and depends upon IPVS sync patches

STATUS
untested protos
 - sctp
 - esp_ah
and SIP for IPv6

SUMMARY
 include/net/ip_vs.h                     |  248 +++++++---
 include/net/net_namespace.h             |    2 +
 include/net/netns/ip_vs.h               |  146 ++++++
 net/netfilter/ipvs/ip_vs_app.c          |  101 +++--
 net/netfilter/ipvs/ip_vs_conn.c         |  159 ++++---
 net/netfilter/ipvs/ip_vs_core.c         |  228 ++++++---
 net/netfilter/ipvs/ip_vs_ctl.c          |  826 ++++++++++++++++++-------------
 net/netfilter/ipvs/ip_vs_est.c          |  131 ++++--
 net/netfilter/ipvs/ip_vs_ftp.c          |   56 ++-
 net/netfilter/ipvs/ip_vs_lblc.c         |   66 +++-
 net/netfilter/ipvs/ip_vs_lblcr.c        |   70 +++-
 net/netfilter/ipvs/ip_vs_nfct.c         |    6 +-
 net/netfilter/ipvs/ip_vs_proto.c        |  121 +++++-
 net/netfilter/ipvs/ip_vs_proto_ah_esp.c |   45 +-
 net/netfilter/ipvs/ip_vs_proto_sctp.c   |  144 +++---
 net/netfilter/ipvs/ip_vs_proto_tcp.c    |  133 +++---
 net/netfilter/ipvs/ip_vs_proto_udp.c    |  102 ++--
 net/netfilter/ipvs/ip_vs_sync.c         |  422 +++++++++-------
 net/netfilter/xt_ipvs.c                 |    2 +-
 19 files changed, 1960 insertions(+), 1048 deletions(-)

^ permalink raw reply

* Re: [PATCH v2 00/12] make rpc_pipefs be mountable multiple time
From: Kirill A. Shutemov @ 2010-12-30 10:44 UTC (permalink / raw)
  To: Rob Landley
  Cc: Kirill A. Shutemov, Rob Landley, Trond Myklebust, J. Bruce Fields,
	Neil Brown, Pavel Emelyanov, linux-nfs-u79uwXL29TY76Z2rM5mHXA,
	David S. Miller, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <4D1C5953.6020200-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>

On Thu, Dec 30, 2010 at 04:05:07AM -0600, Rob Landley wrote:
> On 12/30/2010 03:44 AM, Kirill A. Shutemov wrote:
> >>> If no rpcmount mountoption, no rpc_pipefs was found at
> >>> '/var/lib/nfs/rpc_pipefs' and we are in init's mount namespace, we use
> >>> init_rpc_pipefs.
> >>
> >> It's the "we are in init's mount namespace" that I was wondering about.
> >>
> >> So if I naievely chroot, nfs mount stops working the way it did before I
> >> chrooted unless I do an extra setup step?
> >
> > No. It will work as before since you are still in init's mount namespace.
> > Creating new mount namespace changes rules.
> 
> Ah, CLONE_NEWNS and then you need /var/lib/nfs/rpc_pipefs.  Got it.
> 
> I'm kind of surprised that the kernel cares about a specific path under 
> /var/lib.  (Seems like policy in the kernel somehow.)

Yep. It's bad, but there is a way to override the default.

Another way is to leave the 'rpcmount' mount option without a default.
get_rpc_pipefs(NULL) in init's mount namespace will always return
init_rpc_pipefs, without filesystem lookup.
get_rpc_pipefs(NULL) in non-init's mount namespace will always return
error.

So you will have to specify the 'rpcmount' mount option for every NFS mount
in a container. Hmm, I guess it may confuse users.

Or we can try to move the default to userspace. /sbin/mount.nfs?

> Can't it just 
> check the current process's mount list to see if an instance of 
> rpc_pipefs is mounted in the current namespace the way lxc looks for 
> cgroups?  Or are there potential performance/scalability issues with that?

What should we do if we have several rpc_pipefs mounts in the namespace?

-- 
 Kirill A. Shutemov
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Compilation of pktgen fails for ARCH=um
From: Christoph Paasch @ 2010-12-30 10:33 UTC (permalink / raw)
  To: netdev; +Cc: linux-arch

Hi,

compiling the packet generator (NET_PKTGEN) for ARCH=um does not work.

Since commit 43d28b6515a6ea580a198df3e253e88236f08978 (pktgen: increasing 
transmission granularity), function spin(...) uses ndelay(...), which is not 
implemented for uml.

Should pktgen be disabled for uml?
Or should ndelay be an empty macro? (in arch/um/include/asm/delay.h)

Regards,
Christoph

--
Christoph Paasch

Research Assistant
IP Networking Lab --- http://inl.info.ucl.ac.be
MultiPath TCP in the Linux Kernel --- http://inl.info.ucl.ac.be/mptcp
Université Catholique de Louvain

www.rollerbulls.be
--

^ permalink raw reply

* Re: [PATCH v2 00/12] make rpc_pipefs be mountable multiple time
From: Rob Landley @ 2010-12-30 10:05 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Rob Landley, Trond Myklebust, J. Bruce Fields, Neil Brown,
	Pavel Emelyanov, linux-nfs, David S. Miller, netdev, linux-kernel
In-Reply-To: <20101230094433.GB29697@shutemov.name>

On 12/30/2010 03:44 AM, Kirill A. Shutemov wrote:
>>> If no rpcmount mountoption, no rpc_pipefs was found at
>>> '/var/lib/nfs/rpc_pipefs' and we are in init's mount namespace, we use
>>> init_rpc_pipefs.
>>
>> It's the "we are in init's mount namespace" that I was wondering about.
>>
>> So if I naievely chroot, nfs mount stops working the way it did before I
>> chrooted unless I do an extra setup step?
>
> No. It will work as before since you are still in init's mount namespace.
> Creating new mount namespace changes rules.

Ah, CLONE_NEWNS and then you need /var/lib/nfs/rpc_pipefs.  Got it.

I'm kind of surprised that the kernel cares about a specific path under 
/var/lib.  (Seems like policy in the kernel somehow.)  Can't it just 
check the current process's mount list to see if an instance of 
rpc_pipefs is mounted in the current namespace the way lxc looks for 
cgroups?  Or are there potential performance/scalability issues with that?

Rob

^ permalink raw reply

* Re: Re: dm9000 patch
From: Angelo Dureghello @ 2010-12-30  9:59 UTC (permalink / raw)
  To: Greg Ungerer
  Cc: Baruch Siach, uClinux development list, netdev, linux-m68k,
	linux-kernel
In-Reply-To: <4D1BD461.4070104@snapgear.com>

Hi all,

Joe,
about the debug line inside dm9000_interrupt,

     //dm9000_dbg(db, 3, "entering %s\n", __func__);

nothing changed; the first browsing attempt crashed the board with the same
call stack trace:

[    4.660000] eth0: link up, 100Mbps, full-duplex, lpa 0x45E1
[   54.340000] BUG: spinlock recursion on CPU#0, swapper/0
[   54.340000]  lock: 00191244, .magic: dead4ead, .owner: swapper/0, 
.owner_cpu: 0
[   54.340000] Stack from 001a1b44:
[   54.340000]         001a1b70 000ad968 0018409b 00191244 dead4ead 
0018d7a8 00000000 00000000
[   54.340000]         00191244 0002b4ea 0014d190 001a1ba4 000adb3a 
00191244 00184101 00191244
[   54.340000]         001a1c98 0000001f 0014d15e ffffffe3 00191208 
0002b4ea 0014d190 0002caf6
[   54.340000]         001a1bb0 0014d1ac 00191244 001a1bdc 0002b602 
00191244 0000001f 001a1c98
[   54.340000]         0000001f 0014d15e 00191244 0002b4ea 0014d190 
0002caf6 001a1bf0 00000bb6
[   54.340000]         0000001f 00191244 00cfc6c0 001a1c38 000033c6 
0000001f 001a1c00 00000001
[   54.340000] Call Trace:
[   54.340000]  [000ad968] spin_bug+0x86/0x11a
[   54.340000]  [000adb3a] do_raw_spin_lock+0x58/0x120
[   54.340000]  [0014d1ac] _raw_spin_lock+0x1c/0x22
[   54.340000]  [0002b602] __do_IRQ+0x2c/0x108
[   54.340000]  [00000bb6] do_IRQ+0x48/0x62
[   54.340000]  [000033c6] inthandler+0x6a/0x74
[   54.340000]  [0014d16c] _raw_spin_unlock+0xe/0x32
[   54.340000]  [0002b6d2] __do_IRQ+0xfc/0x108
[   54.340000]  [00000bb6] do_IRQ+0x48/0x62
[   54.340000]  [000033c6] inthandler+0x6a/0x74
[   54.340000]  [00130f66] tcp_v4_conn_request+0x3fa/0x57c
[   54.340000]  [0012a1a6] tcp_rcv_state_process+0x25e/0xa66
[   54.340000]  [00130984] tcp_v4_do_rcv+0x7c/0x1c8
[   54.340000]  [00132834] tcp_v4_rcv+0x546/0x6d2

Greg,

I physically connected the HW interrupt pin of the dm9000 to the MCF5307
IRQ7 pin (pin 68). The dm9000 is configured (through a resistor to 3.3V on
pin 57) not as default, but to signal interrupts with a HIGH to LOW edge,
as the MCF5307 expects, and the interrupt line is pulled up to 3.3V to
avoid flickering.


             PULL UP RES to 3.3V

dm9000  |         |
    IRQ  |---------+-------------------| MCF5307 PIN 68 (IRQ7)


IRQ 7 is the "level 7" autovectored interrupt (vect 31 dec).

Checking the MCF5307 datasheet carefully, I have seen that the "level 7"
interrupt I happened to choose seems to be a special level:

/18.7.1 Level 7 Interrupts
Level 7 interrupts are nonmaskable and are handled differently than 
other interrupts.
Level 7 interrupts are edge triggered by a transition from a lower 
priority request to the
level 7 request. Interrupts at all other levels are level sensitive. 
Therefore, if IRQ7 remains
asserted, the MCF5307 recognizes only one level 7 interrupt because only 
one transition
from a lower level request to a level 7 request occurred. For the 
processor to
recognize two consecutive level 7 interrupts, one of the following must 
occur:

1) The interrupt request on the interrupt control pins is raised to 
level 7 and stays there
until an interrupt-acknowledge cycle begins. The level later drops but 
then returns to
level 7, causing a second transition on the interrupt control lines.

2) The interrupt request on the interrupt control pins is raised to 
level 7 and stays there.
If the level 7 interrupt routine lowers the mask level, a second level 7 
interrupt is
recognized without a transition of the interrupt control pins. After the 
level 7 routine
completes, the MCF5307 compares the mask level to the request level on 
the IRQx
signals. Because the mask level is lower than the requested level, the 
interrupt mask
is set back to level 7. To ensure it is recognized, the level 7 request 
on IRQ7 must be
held until the second interrupt-acknowledge bus cycle begins./

I guess I can try to use another IRQ line, for example IRQ1, and see what
happens. Let me know your thoughts and I can try right away to wire up
the fix in hardware.

still many thanks,

regards,
angelo


On 30/12/2010 01:37, Greg Ungerer wrote:
> Hi Angelo,
>
> On 30/12/10 06:57, Angelo Dureghello wrote:
>> Hi all,
>> thanks for the help,
>> the kernel is a main line kernel. Then yes, i am still using uclinux
>> tree for libc/tools.
>
> How is the DM9000 hardware connected to the 5307?
> I am wondering how you connected the interrupt (and to
> which interrupt) and the addressing (direct of a chip select)?
>
> (For example NETtel based 5307 platform support of the SMC91x code is
> in mainline as arch/m68knommu/platform/5307/nettel.c). Can you show
> the code you used to setup your dm9000 hardware?
> (Specifically I guess I want to know if you use the "auto-vectored"
> interrupt mode?)
>
> Thanks
> Greg
>
>
>> I collected another spinlock recursion with a slightly different call
>> stack trace, as always, the spinlock recursion issue happen on a high
>> tx/rx traffic of the dm9000e, in this case just asking an index.html
>> with some images and texts:
>>
>> [ 1108.930000] BUG: spinlock recursion on CPU#0, httpd/29
>> [ 1108.930000] lock: 00c42c06, .magic: dead4ead, .owner: httpd/29,
>> .owner_cpu: 0
>> [ 1108.930000] Stack from 00d7a688:
>> [ 1108.930000] 00d7a6b4 000ad988 001840ca 00c42c06 dead4ead 00d641d4
>> 0000001d 00000000
>> [ 1108.930000] 00c42c06 000064f0 00c42800 00d7a6e8 000adb5a 00c42c06
>> 00184130 00002704
>> [ 1108.930000] 00000000 0000001f 0014d17e 00159912 00c42b60 000064f0
>> 00c42800 0002cb16
>> [ 1108.930000] 00d7a6f8 0014d24e 00c42c06 00000000 00d7a738 000e485c
>> 00c42c06 00000000
>> [ 1108.930000] 00000000 0000001f 0014d17e 00159912 0000004a 00cfc600
>> 000064f0 00009a74
>> [ 1108.930000] 0002cb16 00191204 00d7a760 0002b6f2 00d7a760 0002b514
>> 0000001f 00c42800
>> [ 1108.930000] Call Trace:
>> [ 1108.930000] [000ad988] spin_bug+0x86/0x11a
>> [ 1108.930000] [000adb5a] do_raw_spin_lock+0x58/0x120
>> [ 1108.930000] [0014d24e] _raw_spin_lock_irqsave+0x28/0x32
>> [ 1108.930000] [000e485c] dm9000_interrupt+0x1a/0x2e0
>> [ 1108.930000] [0002b514] handle_IRQ_event+0x2a/0xec
>> [ 1108.930000] [0002b680] __do_IRQ+0xaa/0x128
>> [ 1108.930000] [00000bb6] do_IRQ+0x48/0x62
>> [ 1108.930000] [000033c6] inthandler+0x6a/0x74
>> [ 1108.930000] [000fb626] dev_hard_start_xmit+0x170/0x4c4
>> [ 1108.930000] [0010b80e] sch_direct_xmit+0xc0/0x1bc
>> [ 1108.930000] [000fe9de] dev_queue_xmit+0x160/0x3e6
>> [ 1108.930000] [001195c4] ip_finish_output+0xec/0x320
>> [ 1108.930000] [0011a768] ip_output+0x9e/0xa8
>> [ 1108.930000] [00119856] ip_local_out+0x26/0x30
>> [ 1108.930000] [0011a56e] ip_build_and_send_pkt+0x16e/0x178
>> [ 1108.930000] [0012fc96] tcp_v4_send_synack+0x52/0x90
>> [ 1108.930000] [00130f86] tcp_v4_conn_request+0x3fa/0x57c
>> [ 1108.930000] [0012a1c6] tcp_rcv_state_process+0x25e/0xa66
>> [ 1108.930000] [001309a4] tcp_v4_do_rcv+0x7c/0x1c8
>> [ 1108.930000] [00132854] tcp_v4_rcv+0x546/0x6d2
>> [ 1108.930000] [001153a8] ip_local_deliver+0x9c/0x1b0
>> [ 1108.930000] [001158e8] ip_rcv+0x42c/0x5f0
>> [ 1108.930000] [000fa74e] __netif_receive_skb+0x196/0x2ec
>> [ 1108.930000] [000fe142] process_backlog+0x72/0x11e
>> [ 1108.930000] [000fe290] net_rx_action+0xa2/0x150
>> [ 1108.930000] [0000e13c] __do_softirq+0x74/0xe4
>> [ 1108.930000] [0000e1e2] do_softirq+0x36/0x40
>> [ 1108.930000] [0000e6c6] local_bh_enable+0x7a/0xa4
>> [ 1108.930000] [000fe972] dev_queue_xmit+0xf4/0x3e6
>> [ 1108.930000] [001195c4] ip_finish_output+0xec/0x320
>> [ 1108.930000] [0011a768] ip_output+0x9e/0xa8
>> [ 1108.930000] [00119856] ip_local_out+0x26/0x30
>> [ 1108.930000] [0011a90a] ip_queue_xmit+0x198/0x426
>> [ 1108.930000] [0012bcc8] tcp_transmit_skb+0x3f0/0x76c
>> [ 1108.930000] [0012cfda] tcp_write_xmit+0x178/0x868
>> [ 1108.930000] [0012d6f8] __tcp_push_pending_frames+0x2e/0x9a
>> [ 1108.930000] [001222be] tcp_sendmsg+0x82e/0x98c
>> [ 1108.930000] [0013d9c0] inet_sendmsg+0x32/0x54
>> [ 1108.930000] [000ec25e] sock_aio_write+0xc8/0x138
>> [ 1108.930000] [00043e7e] do_sync_write+0x9e/0xfe
>> [ 1108.930000] [00043f56] vfs_write+0x78/0x84
>> [ 1108.930000] [0004446c] sys_write+0x40/0x7a
>> [ 1108.930000] [00003244] system_call+0x84/0xc2
>> [ 1108.930000]
>>
>> seems like while i transmit a packet, dm9000_interrupt try to acquire
>> the spinlock owned from the same task.
>>
>> Compiling the kernel i am getting:
>> CC kernel/irq/handle.o
>> kernel/irq/handle.c:432:3: warning: #warning __do_IRQ is deprecated.
>> Please convert to proper flow handlers
>>
>> Could the usage of __do_IRQ super-handler be a cause of the issue ?
>>
>>
>> many thanks,
>> angelo
>>
>> On 29/12/2010 19:45, Geert Uytterhoeven wrote:
>>> On Wed, Dec 29, 2010 at 19:06, Baruch Siach<baruch@tkos.co.il> wrote:
>>>> Hi Angelo,
>>>>
>>>> On Wed, Dec 29, 2010 at 02:13:22PM +0100, Angelo Dureghello wrote:
>>>>> just FYI, i tested kernel 2.6.36.2, unfortunately the issue is still
>>>>> there, below the call stack trace.
>>>> Help from the m68k experts seems to be needed. Adding the relevant
>>>> list to Cc.
>>> This is uClinux? Added Cc...
>>>
>>>>> [ 4.620000] eth0: link up, 100Mbps, full-duplex, lpa 0x45E1
>>>>> [ 39.390000] BUG: spinlock recursion on CPU#0, httpd/29
>>>>> [ 39.390000] lock: 00189c44, .magic: dead4ead, .owner: httpd/29,
>>>>> .owner_cpu: 0
>>>>> [ 39.390000] Stack from 00d6a990:
>>>>> [ 39.390000] 00d6a9bc 000a9710 0017cac7 00189c44 dead4ead
>>>>> 00de48f4 0000001d 00000000
>>>>> [ 39.390000] 00189c44 0002a646 00145f70 00d6a9f0 000a98e2
>>>>> 00189c44 0017cb2d 00189c44
>>>>> [ 39.390000] 00d6aad8 0000001f 00145f5c 001523f6 00189c08
>>>>> 0002a646 00145f70 0002bc52
>>>>> [ 39.390000] 00d6a9fc 00145f7e 00189c44 00d6aa28 0002a75e
>>>>> 00189c44 0000001f 00d6aad8
>>>>> [ 39.390000] 0000001f 00145f5c 00189c08 0002a646 00145f70
>>>>> 0002bc52 00d6aa3c 00000bb6
>>>>> [ 39.390000] 0000001f 00189c44 00cfc780 00d6aa84 0000337a
>>>>> 0000001f 00d6aa4c 00000001
>>>>> [ 39.390000] Call Trace:
>>>>> [ 39.390000] [000a9710] spin_bug+0x86/0x11a
>>>>> [ 39.390000] [000a98e2] do_raw_spin_lock+0x58/0x120
>>>>> [ 39.390000] [00145f7e] _raw_spin_lock+0xe/0x14
>>>>> [ 39.390000] [0002a75e] __do_IRQ+0x2c/0x108
>>>>> [ 39.390000] [00000bb6] do_IRQ+0x48/0x62
>>>>> [ 39.390000] [0000337a] inthandler+0x6a/0x74
>>>>> [ 39.390000] [0002a82e] __do_IRQ+0xfc/0x108
>>>>> [ 39.390000] [00000bb6] do_IRQ+0x48/0x62
>>>>> [ 39.390000] [0000337a] inthandler+0x6a/0x74
>>>>> [ 39.390000] [000ef0ce] skb_release_all+0x10/0x20
>>>>> [ 39.390000] [000ee6bc] __kfree_skb+0x10/0x92
>>>>> [ 39.390000] [000ee75e] consume_skb+0x20/0x34
>>>>> [ 39.390000] [000e004e] dm9000_start_xmit+0xdc/0xec
>>>>> [ 39.390000] [000f67a2] dev_hard_start_xmit+0x146/0x472
>>>>> [ 39.390000] [00106506] sch_direct_xmit+0xc0/0x1bc
>>>>> [ 39.390000] [000f9914] dev_queue_xmit+0x160/0x3e4
>>>>> [ 39.390000] [00113b3e] ip_finish_output+0xee/0x318
>>>>> [ 39.390000] [001142b4] ip_output+0x7c/0x88
>>>>> [ 39.390000] [00113dc6] ip_local_out+0x26/0x30
>>>>> [ 39.390000] [00114d9a] ip_queue_xmit+0x152/0x374
>>>>> [ 39.390000] [00125c8c] tcp_transmit_skb+0x3f0/0x732
>>>>> [ 39.390000] [00126f26] tcp_write_xmit+0x178/0x868
>>>>> [ 39.390000] [00127644] __tcp_push_pending_frames+0x2e/0x9a
>>>>> [ 39.390000] [0011c3d6] tcp_sendmsg+0x82e/0x98c
>>>>> [ 39.390000] [00137544] inet_sendmsg+0x32/0x54
>>>>> [ 39.390000] [000e79a6] sock_aio_write+0xc8/0x138
>>>>> [ 39.390000] [00042590] do_sync_write+0x9e/0xfe
>>>>> [ 39.390000] [00042668] vfs_write+0x78/0x84
>>>>> [ 39.390000] [00042a92] sys_write+0x40/0x7a
>>>>> [ 39.390000] [00003224] system_call+0x84/0xc2
>>>>> [ 39.390000]
>>>>>
>>>>> dm9000e is as default not visible/selectable in menuconfig for
>>>>> Coldfire architectures, so this probably cannot be considered as a
>>>>> kernel bug.
>>>>>
>>>>> I going forward in investigations, every help is appreciated,
>>>>>
>>>>> regards,
>>>>> angelo
>>>>>
>>>>>
>>>>>
>>>>> On 29/12/2010 07:06, Baruch Siach wrote:
>>>>>> Hi Angelo,
>>>>>>
>>>>>> On Tue, Dec 28, 2010 at 10:52:42PM +0100, Angelo Dureghello wrote:
>>>>>>> sorry to contact you directly but i couldn't get any help from the
>>>>>>> kernel.org mailing list, since i am not a developer my mails are
>>>>>>> generally skipped.
>>>>>> The best way to get the contact info for a piece of kernel code, is
>>>>>> using the
>>>>>> get_maintainer.pl script. Running 'scripts/get_maintainer.pl -f
>>>>>> drivers/net/dm9000.c' gives the following output:
>>>>>>
>>>>>> netdev@vger.kernel.org
>>>>>> linux-kernel@vger.kernel.org
>>>>>>
>>>>>> I added both to Cc.
>>>>>>
>>>>>>> I am very near to have a custom board working with MCF5307 cpu and
>>>>>>> dm9000.
>>>>>>> I am using kernel 2.6.36-rc3 with your last patch about
>>>>>>> spinlock-recursion already included.
>>>>>> You should try to update to the latest .36 kernel, which is 
>>>>>> currently
>>>>>> 2.6.36.2. The problem that you experience might be unrelated to the
>>>>>> dm9000
>>>>>> driver (or to networking at all), so it might have been fixed in
>>>>>> this version.
>>>>>>
>>>>>>> I have "ping" and "telnet" to the embedded board fully working.
>>>>>>> If i try to get a sample web page with some images from the board
>>>>>>> httpd with a browser, in 80% of cases i get a trap/oops:
>>>>>> Try to enable KALLSYMS in your kernel .config to make your stack
>>>>>> trace more
>>>>>> meaningful. This is under 'General setup -> Configure standard
>>>>>> kernel features
>>>>>> (for small systems) -> Load all symbols for debugging/ksymoops'.
>>>>>>
>>>>>> I hope this helps.
>>>>>>
>>>>>> baruch
>>>>>>
>>>>>>> [ 4.590000] eth0: link up, 100Mbps, full-duplex, lpa 0x45E1
>>>>>>> [ 67.630000] BUG: spinlock recursion on CPU#0, httpd/29
>>>>>>> [ 67.630000] lock: 00c42c06, .magic: dead4ead, .owner: httpd/29,
>>>>>>> .owner_cpu: 0
>>>>>>> [ 67.630000] Stack from 00d7b914:
>>>>>>> [ 67.630000] 00d7b940 000a8cf0 0015f693 00c42c06 dead4ead
>>>>>>> 00dec1d4 0000001d 00000000
>>>>>>> [ 67.630000] 00c42c06 00006188 00c42800 00d7b974 000a8ec2
>>>>>>> 00c42c06 0015f6f9 00002704
>>>>>>> [ 67.630000] 00000000 0000001f 00146fa4 00152f0c 00c42b60
>>>>>>> 00006188 00c42800 0002b312
>>>>>>> [ 67.630000] 00d7b984 0014701e 00c42c06 00000000 00d7b9c4
>>>>>>> 000df21c 00c42c06 00000000
>>>>>>> [ 67.630000] 00000000 0000001f 00146fa4 00152f0c 000005ea
>>>>>>> 00cfc640 00006188 000096e8
>>>>>>> [ 67.630000] 0002b312 00146fa4 00c42b60 00002704 00d7b9ec
>>>>>>> 00029d3a 0000001f 00c42800
>>>>>>> [ 67.630000] Call Trace:
>>>>>>> [ 67.630000] [000a8cf0] [000a8ec2] [0014701e] [000df21c] [00029d3a]
>>>>>>> [ 67.630000] [00029e84] [00000bb6] [0000336e] [000df162] [000effd6]
>>>>>>> [ 67.630000] [00100482] [000f312e] [000f9ebc] [0010dd2a] [0010e4a0]
>>>>>>> [ 67.630000] [0010dfb2] [0010ef80] [0011fed6] [00121170] [0012188e]
>>>>>>> [ 67.630000] [0011ecc6] [001249fe] [000e4084] [0011621c] [00131a44]
>>>>>>> [ 67.630000] [000e11ee] [00041944] [00041a1c] [00041e46] [00003218]
>>>>>>> [ 67.630000] BUG: spinlock lockup on CPU#0, httpd/29, 00c42c06
>>>>>>> [ 67.630000] Stack from 00d7b934:
>>>>>>> [ 67.630000] 00d7b974 000a8f66 0015f703 00000000 00dec1d4
>>>>>>> 0000001d 00c42c06 00002704
>>>>>>> [ 67.630000] 00000000 0000001f 00146fa4 00152f0c 00c42b60
>>>>>>> 00006188 00c42800 0002b312
>>>>>>> [ 67.630000] 00d7b984 0014701e 00c42c06 00000000 00d7b9c4
>>>>>>> 000df21c 00c42c06 00000000
>>>>>>> [ 67.630000] 00000000 0000001f 00146fa4 00152f0c 000005ea
>>>>>>> 00cfc640 00006188 000096e8
>>>>>>> [ 67.630000] 0002b312 00146fa4 00c42b60 00002704 00d7b9ec
>>>>>>> 00029d3a 0000001f 00c42800
>>>>>>> [ 67.630000] 0016c1b4 00cfc640 0000001f 0016c178 00029d10
>>>>>>> 00146fb8 00d7ba20 00029e84
>>>>>>> [ 67.630000] Call Trace:
>>>>>>> [ 67.630000] [000a8f66] [0014701e] [000df21c] [00029d3a] [00029e84]
>>>>>>> [ 67.630000] [00000bb6] [0000336e] [000df162] [000effd6] [00100482]
>>>>>>> [ 67.630000] [000f312e] [000f9ebc] [0010dd2a] [0010e4a0] [0010dfb2]
>>>>>>> [ 67.630000] [0010ef80] [0011fed6] [00121170] [0012188e] [0011ecc6]
>>>>>>> [ 67.630000] [001249fe] [000e4084] [0011621c] [00131a44] [000e11ee]
>>>>>>> [ 67.630000] [00041944] [00041a1c] [00041e46] [00003218]
>>>>>>>
>>>>>>> As i said, i was hoping in your patch but i sadly discovered it is
>>>>>>> already included in this kernel version.
>>>>>>> Hope you can give me some help or can forward me to an appropriate
>>>>>>> mailing list.
>>>> -- 
>>>> ~. .~ Tk Open Systems
>>>> =}------------------------------------------------ooO--U--Ooo------------{= 
>>>>
>>>>
>>>> - baruch@tkos.co.il - tel: +972.2.679.5364, http://www.tkos.co.il -
>>>> -- 
>>>> To unsubscribe from this list: send the line "unsubscribe
>>>> linux-kernel" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>>> Please read the FAQ at http://www.tux.org/lkml/
>>>>
>>>
>>>
>>
>> _______________________________________________
>> uClinux-dev mailing list
>> uClinux-dev@uclinux.org
>> http://mailman.uclinux.org/mailman/listinfo/uclinux-dev
>> This message was resent by uclinux-dev@uclinux.org
>> To unsubscribe see:
>> http://mailman.uclinux.org/mailman/options/uclinux-dev
>>
>
>


_______________________________________________
uClinux-dev mailing list
uClinux-dev@uclinux.org
http://mailman.uclinux.org/mailman/listinfo/uclinux-dev
This message was resent by uclinux-dev@uclinux.org
To unsubscribe see:
http://mailman.uclinux.org/mailman/options/uclinux-dev

^ permalink raw reply

* Re: Re: dm9000 patch
From: Angelo Dureghello @ 2010-12-30  9:53 UTC (permalink / raw)
  To: Greg Ungerer
  Cc: Baruch Siach, uClinux development list, netdev, linux-m68k,
	linux-kernel
In-Reply-To: <4D1BD461.4070104@snapgear.com>


[-- Attachment #1.1: Type: text/plain, Size: 16981 bytes --]

Hi all,

Joe,
about the debug line inside dm9000_interrupt,

     //dm9000_dbg(db, 3, "entering %s\n", __func__);

nothing changed: the first browsing attempt crashed the board with the same
call stack trace:

[    4.660000] eth0: link up, 100Mbps, full-duplex, lpa 0x45E1
[   54.340000] BUG: spinlock recursion on CPU#0, swapper/0
[   54.340000]  lock: 00191244, .magic: dead4ead, .owner: swapper/0, 
.owner_cpu: 0
[   54.340000] Stack from 001a1b44:
[   54.340000]         001a1b70 000ad968 0018409b 00191244 dead4ead 
0018d7a8 00000000 00000000
[   54.340000]         00191244 0002b4ea 0014d190 001a1ba4 000adb3a 
00191244 00184101 00191244
[   54.340000]         001a1c98 0000001f 0014d15e ffffffe3 00191208 
0002b4ea 0014d190 0002caf6
[   54.340000]         001a1bb0 0014d1ac 00191244 001a1bdc 0002b602 
00191244 0000001f 001a1c98
[   54.340000]         0000001f 0014d15e 00191244 0002b4ea 0014d190 
0002caf6 001a1bf0 00000bb6
[   54.340000]         0000001f 00191244 00cfc6c0 001a1c38 000033c6 
0000001f 001a1c00 00000001
[   54.340000] Call Trace:
[   54.340000]  [000ad968] spin_bug+0x86/0x11a
[   54.340000]  [000adb3a] do_raw_spin_lock+0x58/0x120
[   54.340000]  [0014d1ac] _raw_spin_lock+0x1c/0x22
[   54.340000]  [0002b602] __do_IRQ+0x2c/0x108
[   54.340000]  [00000bb6] do_IRQ+0x48/0x62
[   54.340000]  [000033c6] inthandler+0x6a/0x74
[   54.340000]  [0014d16c] _raw_spin_unlock+0xe/0x32
[   54.340000]  [0002b6d2] __do_IRQ+0xfc/0x108
[   54.340000]  [00000bb6] do_IRQ+0x48/0x62
[   54.340000]  [000033c6] inthandler+0x6a/0x74
[   54.340000]  [00130f66] tcp_v4_conn_request+0x3fa/0x57c
[   54.340000]  [0012a1a6] tcp_rcv_state_process+0x25e/0xa66
[   54.340000]  [00130984] tcp_v4_do_rcv+0x7c/0x1c8
[   54.340000]  [00132834] tcp_v4_rcv+0x546/0x6d2

Greg,

I physically connected the HW interrupt pin of the dm9000 to the MCF5307 IRQ7
pin (pin 68). The dm9000 is configured (through a resistor to 3.3V on pin 57)
not as default, but to act with a HIGH-to-LOW interrupt edge, as the MCF5307
expects, and the interrupt line is pulled up to 3.3V to avoid flickering.


             PULL UP RES to 3.3V

dm9000  |         |
    IRQ  |---------+-------------------| MCF5307 PIN 68 (IRQ7)


IRQ 7 is the "level 7" autovectored interrupt (vect 31 dec).

Checking the MCF5307 datasheet carefully, I have seen that the "level 7" interrupt
I happened to choose seems to be a special level:

/18.7.1 Level 7 Interrupts
Level 7 interrupts are nonmaskable and are handled differently than 
other interrupts.
Level 7 interrupts are edge triggered by a transition from a lower 
priority request to the
level 7 request. Interrupts at all other levels are level sensitive. 
Therefore, if IRQ7 remains
asserted, the MCF5307 recognizes only one level 7 interrupt because only 
one transition
from a lower level request to a level 7 request occurred. For the 
processor to
recognize two consecutive level 7 interrupts, one of the following must 
occur:

1) The interrupt request on the interrupt control pins is raised to 
level 7 and stays there
until an interrupt-acknowledge cycle begins. The level later drops but 
then returns to
level 7, causing a second transition on the interrupt control lines.

2) The interrupt request on the interrupt control pins is raised to 
level 7 and stays there.
If the level 7 interrupt routine lowers the mask level, a second level 7 
interrupt is
recognized without a transition of the interrupt control pins. After the 
level 7 routine
completes, the MCF5307 compares the mask level to the request level on 
the IRQx
signals. Because the mask level is lower than the requested level, the 
interrupt mask
is set back to level 7. To ensure it is recognized, the level 7 request 
on IRQ7 must be
held until the second interrupt-acknowledge bus cycle begins./

I guess I can try to use another IRQ line, for example IRQ1, and see what
happens. Let me know your thoughts and I can try right away to wire up
the fix in hardware.

still many thanks,

regards,
angelo


On 30/12/2010 01:37, Greg Ungerer wrote:
> Hi Angelo,
>
> On 30/12/10 06:57, Angelo Dureghello wrote:
>> Hi all,
>> thanks for the help,
>> the kernel is a main line kernel. Then yes, i am still using uclinux
>> tree for libc/tools.
>
> How is the DM9000 hardware connected to the 5307?
> I am wondering how you connected the interrupt (and to
> which interrupt) and the addressing (direct of a chip select)?
>
> (For example NETtel based 5307 platform support of the SMC91x code is
> in mainline as arch/m68knommu/platform/5307/nettel.c). Can you show
> the code you used to setup your dm9000 hardware?
> (Specifically I guess I want to know if you use the "auto-vectored"
> interrupt mode?)
>
> Thanks
> Greg
>
>
>> I collected another spinlock recursion with a slightly different call
>> stack trace, as always, the spinlock recursion issue happen on a high
>> tx/rx traffic of the dm9000e, in this case just asking an index.html
>> with some images and texts:
>>
>> [ 1108.930000] BUG: spinlock recursion on CPU#0, httpd/29
>> [ 1108.930000] lock: 00c42c06, .magic: dead4ead, .owner: httpd/29,
>> .owner_cpu: 0
>> [ 1108.930000] Stack from 00d7a688:
>> [ 1108.930000] 00d7a6b4 000ad988 001840ca 00c42c06 dead4ead 00d641d4
>> 0000001d 00000000
>> [ 1108.930000] 00c42c06 000064f0 00c42800 00d7a6e8 000adb5a 00c42c06
>> 00184130 00002704
>> [ 1108.930000] 00000000 0000001f 0014d17e 00159912 00c42b60 000064f0
>> 00c42800 0002cb16
>> [ 1108.930000] 00d7a6f8 0014d24e 00c42c06 00000000 00d7a738 000e485c
>> 00c42c06 00000000
>> [ 1108.930000] 00000000 0000001f 0014d17e 00159912 0000004a 00cfc600
>> 000064f0 00009a74
>> [ 1108.930000] 0002cb16 00191204 00d7a760 0002b6f2 00d7a760 0002b514
>> 0000001f 00c42800
>> [ 1108.930000] Call Trace:
>> [ 1108.930000] [000ad988] spin_bug+0x86/0x11a
>> [ 1108.930000] [000adb5a] do_raw_spin_lock+0x58/0x120
>> [ 1108.930000] [0014d24e] _raw_spin_lock_irqsave+0x28/0x32
>> [ 1108.930000] [000e485c] dm9000_interrupt+0x1a/0x2e0
>> [ 1108.930000] [0002b514] handle_IRQ_event+0x2a/0xec
>> [ 1108.930000] [0002b680] __do_IRQ+0xaa/0x128
>> [ 1108.930000] [00000bb6] do_IRQ+0x48/0x62
>> [ 1108.930000] [000033c6] inthandler+0x6a/0x74
>> [ 1108.930000] [000fb626] dev_hard_start_xmit+0x170/0x4c4
>> [ 1108.930000] [0010b80e] sch_direct_xmit+0xc0/0x1bc
>> [ 1108.930000] [000fe9de] dev_queue_xmit+0x160/0x3e6
>> [ 1108.930000] [001195c4] ip_finish_output+0xec/0x320
>> [ 1108.930000] [0011a768] ip_output+0x9e/0xa8
>> [ 1108.930000] [00119856] ip_local_out+0x26/0x30
>> [ 1108.930000] [0011a56e] ip_build_and_send_pkt+0x16e/0x178
>> [ 1108.930000] [0012fc96] tcp_v4_send_synack+0x52/0x90
>> [ 1108.930000] [00130f86] tcp_v4_conn_request+0x3fa/0x57c
>> [ 1108.930000] [0012a1c6] tcp_rcv_state_process+0x25e/0xa66
>> [ 1108.930000] [001309a4] tcp_v4_do_rcv+0x7c/0x1c8
>> [ 1108.930000] [00132854] tcp_v4_rcv+0x546/0x6d2
>> [ 1108.930000] [001153a8] ip_local_deliver+0x9c/0x1b0
>> [ 1108.930000] [001158e8] ip_rcv+0x42c/0x5f0
>> [ 1108.930000] [000fa74e] __netif_receive_skb+0x196/0x2ec
>> [ 1108.930000] [000fe142] process_backlog+0x72/0x11e
>> [ 1108.930000] [000fe290] net_rx_action+0xa2/0x150
>> [ 1108.930000] [0000e13c] __do_softirq+0x74/0xe4
>> [ 1108.930000] [0000e1e2] do_softirq+0x36/0x40
>> [ 1108.930000] [0000e6c6] local_bh_enable+0x7a/0xa4
>> [ 1108.930000] [000fe972] dev_queue_xmit+0xf4/0x3e6
>> [ 1108.930000] [001195c4] ip_finish_output+0xec/0x320
>> [ 1108.930000] [0011a768] ip_output+0x9e/0xa8
>> [ 1108.930000] [00119856] ip_local_out+0x26/0x30
>> [ 1108.930000] [0011a90a] ip_queue_xmit+0x198/0x426
>> [ 1108.930000] [0012bcc8] tcp_transmit_skb+0x3f0/0x76c
>> [ 1108.930000] [0012cfda] tcp_write_xmit+0x178/0x868
>> [ 1108.930000] [0012d6f8] __tcp_push_pending_frames+0x2e/0x9a
>> [ 1108.930000] [001222be] tcp_sendmsg+0x82e/0x98c
>> [ 1108.930000] [0013d9c0] inet_sendmsg+0x32/0x54
>> [ 1108.930000] [000ec25e] sock_aio_write+0xc8/0x138
>> [ 1108.930000] [00043e7e] do_sync_write+0x9e/0xfe
>> [ 1108.930000] [00043f56] vfs_write+0x78/0x84
>> [ 1108.930000] [0004446c] sys_write+0x40/0x7a
>> [ 1108.930000] [00003244] system_call+0x84/0xc2
>> [ 1108.930000]
>>
>> seems like while i transmit a packet, dm9000_interrupt try to acquire
>> the spinlock owned from the same task.
>>
>> Compiling the kernel i am getting:
>> CC kernel/irq/handle.o
>> kernel/irq/handle.c:432:3: warning: #warning __do_IRQ is deprecated.
>> Please convert to proper flow handlers
>>
>> Could the usage of __do_IRQ super-handler be a cause of the issue ?
>>
>>
>> many thanks,
>> angelo
>>
>> On 29/12/2010 19:45, Geert Uytterhoeven wrote:
>>> On Wed, Dec 29, 2010 at 19:06, Baruch Siach<baruch@tkos.co.il> wrote:
>>>> Hi Angelo,
>>>>
>>>> On Wed, Dec 29, 2010 at 02:13:22PM +0100, Angelo Dureghello wrote:
>>>>> just FYI, i tested kernel 2.6.36.2, unfortunately the issue is still
>>>>> there, below the call stack trace.
>>>> Help from the m68k experts seems to be needed. Adding the relevant
>>>> list to Cc.
>>> This is uClinux? Added Cc...
>>>
>>>>> [ 4.620000] eth0: link up, 100Mbps, full-duplex, lpa 0x45E1
>>>>> [ 39.390000] BUG: spinlock recursion on CPU#0, httpd/29
>>>>> [ 39.390000] lock: 00189c44, .magic: dead4ead, .owner: httpd/29,
>>>>> .owner_cpu: 0
>>>>> [ 39.390000] Stack from 00d6a990:
>>>>> [ 39.390000] 00d6a9bc 000a9710 0017cac7 00189c44 dead4ead
>>>>> 00de48f4 0000001d 00000000
>>>>> [ 39.390000] 00189c44 0002a646 00145f70 00d6a9f0 000a98e2
>>>>> 00189c44 0017cb2d 00189c44
>>>>> [ 39.390000] 00d6aad8 0000001f 00145f5c 001523f6 00189c08
>>>>> 0002a646 00145f70 0002bc52
>>>>> [ 39.390000] 00d6a9fc 00145f7e 00189c44 00d6aa28 0002a75e
>>>>> 00189c44 0000001f 00d6aad8
>>>>> [ 39.390000] 0000001f 00145f5c 00189c08 0002a646 00145f70
>>>>> 0002bc52 00d6aa3c 00000bb6
>>>>> [ 39.390000] 0000001f 00189c44 00cfc780 00d6aa84 0000337a
>>>>> 0000001f 00d6aa4c 00000001
>>>>> [ 39.390000] Call Trace:
>>>>> [ 39.390000] [000a9710] spin_bug+0x86/0x11a
>>>>> [ 39.390000] [000a98e2] do_raw_spin_lock+0x58/0x120
>>>>> [ 39.390000] [00145f7e] _raw_spin_lock+0xe/0x14
>>>>> [ 39.390000] [0002a75e] __do_IRQ+0x2c/0x108
>>>>> [ 39.390000] [00000bb6] do_IRQ+0x48/0x62
>>>>> [ 39.390000] [0000337a] inthandler+0x6a/0x74
>>>>> [ 39.390000] [0002a82e] __do_IRQ+0xfc/0x108
>>>>> [ 39.390000] [00000bb6] do_IRQ+0x48/0x62
>>>>> [ 39.390000] [0000337a] inthandler+0x6a/0x74
>>>>> [ 39.390000] [000ef0ce] skb_release_all+0x10/0x20
>>>>> [ 39.390000] [000ee6bc] __kfree_skb+0x10/0x92
>>>>> [ 39.390000] [000ee75e] consume_skb+0x20/0x34
>>>>> [ 39.390000] [000e004e] dm9000_start_xmit+0xdc/0xec
>>>>> [ 39.390000] [000f67a2] dev_hard_start_xmit+0x146/0x472
>>>>> [ 39.390000] [00106506] sch_direct_xmit+0xc0/0x1bc
>>>>> [ 39.390000] [000f9914] dev_queue_xmit+0x160/0x3e4
>>>>> [ 39.390000] [00113b3e] ip_finish_output+0xee/0x318
>>>>> [ 39.390000] [001142b4] ip_output+0x7c/0x88
>>>>> [ 39.390000] [00113dc6] ip_local_out+0x26/0x30
>>>>> [ 39.390000] [00114d9a] ip_queue_xmit+0x152/0x374
>>>>> [ 39.390000] [00125c8c] tcp_transmit_skb+0x3f0/0x732
>>>>> [ 39.390000] [00126f26] tcp_write_xmit+0x178/0x868
>>>>> [ 39.390000] [00127644] __tcp_push_pending_frames+0x2e/0x9a
>>>>> [ 39.390000] [0011c3d6] tcp_sendmsg+0x82e/0x98c
>>>>> [ 39.390000] [00137544] inet_sendmsg+0x32/0x54
>>>>> [ 39.390000] [000e79a6] sock_aio_write+0xc8/0x138
>>>>> [ 39.390000] [00042590] do_sync_write+0x9e/0xfe
>>>>> [ 39.390000] [00042668] vfs_write+0x78/0x84
>>>>> [ 39.390000] [00042a92] sys_write+0x40/0x7a
>>>>> [ 39.390000] [00003224] system_call+0x84/0xc2
>>>>> [ 39.390000]
>>>>>
>>>>> dm9000e is as default not visible/selectable in menuconfig for
>>>>> Coldfire architectures, so this probably cannot be considered as a
>>>>> kernel bug.
>>>>>
>>>>> I going forward in investigations, every help is appreciated,
>>>>>
>>>>> regards,
>>>>> angelo
>>>>>
>>>>>
>>>>>
>>>>> On 29/12/2010 07:06, Baruch Siach wrote:
>>>>>> Hi Angelo,
>>>>>>
>>>>>> On Tue, Dec 28, 2010 at 10:52:42PM +0100, Angelo Dureghello wrote:
>>>>>>> sorry to contact you directly but i couldn't get any help from the
>>>>>>> kernel.org mailing list, since i am not a developer my mails are
>>>>>>> generally skipped.
>>>>>> The best way to get the contact info for a piece of kernel code, is
>>>>>> using the
>>>>>> get_maintainer.pl script. Running 'scripts/get_maintainer.pl -f
>>>>>> drivers/net/dm9000.c' gives the following output:
>>>>>>
>>>>>> netdev@vger.kernel.org
>>>>>> linux-kernel@vger.kernel.org
>>>>>>
>>>>>> I added both to Cc.
>>>>>>
>>>>>>> I am very near to have a custom board working with MCF5307 cpu and
>>>>>>> dm9000.
>>>>>>> I am using kernel 2.6.36-rc3 with your last patch about
>>>>>>> spinlock-recursion already included.
>>>>>> You should try to update to the latest .36 kernel, which is 
>>>>>> currently
>>>>>> 2.6.36.2. The problem that you experience might be unrelated to the
>>>>>> dm9000
>>>>>> driver (or to networking at all), so it might have been fixed in
>>>>>> this version.
>>>>>>
>>>>>>> I have "ping" and "telnet" to the embedded board fully working.
>>>>>>> If i try to get a sample web page with some images from the board
>>>>>>> httpd with a browser, in 80% of cases i get a trap/oops:
>>>>>> Try to enable KALLSYMS in your kernel .config to make your stack
>>>>>> trace more
>>>>>> meaningful. This is under 'General setup -> Configure standard
>>>>>> kernel features
>>>>>> (for small systems) -> Load all symbols for debugging/ksymoops'.
>>>>>>
>>>>>> I hope this helps.
>>>>>>
>>>>>> baruch
>>>>>>
>>>>>>> [ 4.590000] eth0: link up, 100Mbps, full-duplex, lpa 0x45E1
>>>>>>> [ 67.630000] BUG: spinlock recursion on CPU#0, httpd/29
>>>>>>> [ 67.630000] lock: 00c42c06, .magic: dead4ead, .owner: httpd/29,
>>>>>>> .owner_cpu: 0
>>>>>>> [ 67.630000] Stack from 00d7b914:
>>>>>>> [ 67.630000] 00d7b940 000a8cf0 0015f693 00c42c06 dead4ead
>>>>>>> 00dec1d4 0000001d 00000000
>>>>>>> [ 67.630000] 00c42c06 00006188 00c42800 00d7b974 000a8ec2
>>>>>>> 00c42c06 0015f6f9 00002704
>>>>>>> [ 67.630000] 00000000 0000001f 00146fa4 00152f0c 00c42b60
>>>>>>> 00006188 00c42800 0002b312
>>>>>>> [ 67.630000] 00d7b984 0014701e 00c42c06 00000000 00d7b9c4
>>>>>>> 000df21c 00c42c06 00000000
>>>>>>> [ 67.630000] 00000000 0000001f 00146fa4 00152f0c 000005ea
>>>>>>> 00cfc640 00006188 000096e8
>>>>>>> [ 67.630000] 0002b312 00146fa4 00c42b60 00002704 00d7b9ec
>>>>>>> 00029d3a 0000001f 00c42800
>>>>>>> [ 67.630000] Call Trace:
>>>>>>> [ 67.630000] [000a8cf0] [000a8ec2] [0014701e] [000df21c] [00029d3a]
>>>>>>> [ 67.630000] [00029e84] [00000bb6] [0000336e] [000df162] [000effd6]
>>>>>>> [ 67.630000] [00100482] [000f312e] [000f9ebc] [0010dd2a] [0010e4a0]
>>>>>>> [ 67.630000] [0010dfb2] [0010ef80] [0011fed6] [00121170] [0012188e]
>>>>>>> [ 67.630000] [0011ecc6] [001249fe] [000e4084] [0011621c] [00131a44]
>>>>>>> [ 67.630000] [000e11ee] [00041944] [00041a1c] [00041e46] [00003218]
>>>>>>> [ 67.630000] BUG: spinlock lockup on CPU#0, httpd/29, 00c42c06
>>>>>>> [ 67.630000] Stack from 00d7b934:
>>>>>>> [ 67.630000] 00d7b974 000a8f66 0015f703 00000000 00dec1d4
>>>>>>> 0000001d 00c42c06 00002704
>>>>>>> [ 67.630000] 00000000 0000001f 00146fa4 00152f0c 00c42b60
>>>>>>> 00006188 00c42800 0002b312
>>>>>>> [ 67.630000] 00d7b984 0014701e 00c42c06 00000000 00d7b9c4
>>>>>>> 000df21c 00c42c06 00000000
>>>>>>> [ 67.630000] 00000000 0000001f 00146fa4 00152f0c 000005ea
>>>>>>> 00cfc640 00006188 000096e8
>>>>>>> [ 67.630000] 0002b312 00146fa4 00c42b60 00002704 00d7b9ec
>>>>>>> 00029d3a 0000001f 00c42800
>>>>>>> [ 67.630000] 0016c1b4 00cfc640 0000001f 0016c178 00029d10
>>>>>>> 00146fb8 00d7ba20 00029e84
>>>>>>> [ 67.630000] Call Trace:
>>>>>>> [ 67.630000] [000a8f66] [0014701e] [000df21c] [00029d3a] [00029e84]
>>>>>>> [ 67.630000] [00000bb6] [0000336e] [000df162] [000effd6] [00100482]
>>>>>>> [ 67.630000] [000f312e] [000f9ebc] [0010dd2a] [0010e4a0] [0010dfb2]
>>>>>>> [ 67.630000] [0010ef80] [0011fed6] [00121170] [0012188e] [0011ecc6]
>>>>>>> [ 67.630000] [001249fe] [000e4084] [0011621c] [00131a44] [000e11ee]
>>>>>>> [ 67.630000] [00041944] [00041a1c] [00041e46] [00003218]
>>>>>>>
>>>>>>> As i said, i was hoping in your patch but i sadly discovered it is
>>>>>>> already included in this kernel version.
>>>>>>> Hope you can give me some help or can forward me to an appropriate
>>>>>>> mailing list.
>>>> -- 
>>>> ~. .~ Tk Open Systems
>>>> =}------------------------------------------------ooO--U--Ooo------------{= 
>>>>
>>>>
>>>> - baruch@tkos.co.il - tel: +972.2.679.5364, http://www.tkos.co.il -
>>>> -- 
>>>> To unsubscribe from this list: send the line "unsubscribe
>>>> linux-kernel" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>>> Please read the FAQ at http://www.tux.org/lkml/
>>>>
>>>
>>>
>>
>> _______________________________________________
>> uClinux-dev mailing list
>> uClinux-dev@uclinux.org
>> http://mailman.uclinux.org/mailman/listinfo/uclinux-dev
>> This message was resent by uclinux-dev@uclinux.org
>> To unsubscribe see:
>> http://mailman.uclinux.org/mailman/options/uclinux-dev
>>
>
>


[-- Attachment #1.2: Type: text/html, Size: 22489 bytes --]

[-- Attachment #2: Type: text/plain, Size: 278 bytes --]

_______________________________________________
uClinux-dev mailing list
uClinux-dev@uclinux.org
http://mailman.uclinux.org/mailman/listinfo/uclinux-dev
This message was resent by uclinux-dev@uclinux.org
To unsubscribe see:
http://mailman.uclinux.org/mailman/options/uclinux-dev

^ permalink raw reply

* Re: [PATCH v4] Gemini: Gigabit ethernet driver
From: Michał Mirosław @ 2010-12-30  9:48 UTC (permalink / raw)
  To: Joe Perches; +Cc: Hans Ulli Kroll, gemini-board-dev, netdev, Christoph Biedl
In-Reply-To: <1293700797.2400.13.camel@Joe-Laptop>

On Thu, Dec 30, 2010 at 01:19:57AM -0800, Joe Perches wrote:
> On Thu, 2010-12-30 at 09:39 +0100, Michał Mirosław wrote:
> > Driver for SL351x (Gemini) SoC ethernet peripheral. Based in part
> > on work by Paulius Zaleckas and GPLd code from Raidsonic and other
> > NAS vendors.
> > Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
> > diff --git a/drivers/net/sl351x.c b/drivers/net/sl351x.c
> 
> Output trivia:
> 
> > +static void gmac_tx_interrupt(struct net_device *dev, unsigned txq_num)
> []
> > +	netif_info(gmac, tx_done, dev, "txirq%u: %u,%u,%u\n",
> > +		txq_num, txq->cptr, GET_RPTR(ptr_reg), GET_WPTR(ptr_reg));
> 
> pointers as decimal?
> There seems to be a lot of output using %u where I expected %08x.

These are indexes into TX/RX/RXbuf(FreeQ) rings. It's usually easier to read
them in decimal by humans. ;) I kept the notation of 'PTR' from the original
hardware register names.

Best Regards,
Michał Mirosław

^ permalink raw reply

* Re: [PATCH v2 00/12] make rpc_pipefs be mountable multiple time
From: Kirill A. Shutemov @ 2010-12-30  9:44 UTC (permalink / raw)
  To: Rob Landley
  Cc: Rob Landley, Trond Myklebust, J. Bruce Fields, Neil Brown,
	Pavel Emelyanov, linux-nfs-u79uwXL29TY76Z2rM5mHXA,
	David S. Miller, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <4D1C4C7C.6050606-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>

On Thu, Dec 30, 2010 at 03:10:20AM -0600, Rob Landley wrote:
> On 12/30/2010 02:51 AM, Kirill A. Shutemov wrote:
> > On Wed, Dec 29, 2010 at 08:13:50PM -0600, Rob Landley wrote:
> >> On Wed, Dec 29, 2010 at 7:14 AM, Kirill A. Shutemov<kas-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>  wrote:
> >>>
> >>> Prepare nfs/sunrpc stack to use multiple instances of rpc_pipefs.
> >>> Only for client for now.
> >>
> >> What would a test case for this look like?  (Is there some way to tell
> >> an nfs mount to use a specific instance of rpc_pipefs or something?)
> >
> > You can create a new instance of rpc_pipefs using 'newinstance'
> > mountoption.
> >
> > Then you can specify which rpc_pipefs to use with 'rpcmount' mountoption
> > of nfs mount. If none specifed, '/var/lib/nfs/rpc_pipefs' uses by default.
> 
> That path is as the process performing the mount sees it?

Yep.

> > If no rpcmount mountoption, no rpc_pipefs was found at
> > '/var/lib/nfs/rpc_pipefs' and we are in init's mount namespace, we use
> > init_rpc_pipefs.
> 
> It's the "we are in init's mount namespace" that I was wondering about.
> 
> So if I naievely chroot, nfs mount stops working the way it did before I 
> chrooted unless I do an extra setup step?

No. It will work as before since you are still in init's mount namespace.
Creating new mount namespace changes rules.

> I'm actually poking at getting nfs mount working in LXC containers with 
> different network routing (mostly study so far, it took me a couple 
> weeks just to get lxc to work for me and now I'm trying to wrap my head 
> around Linux's NFS implementation), so I'm very interested in this...

-- 
 Kirill A. Shutemov
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v4] Gemini: Gigabit ethernet driver
From: Joe Perches @ 2010-12-30  9:19 UTC (permalink / raw)
  To: Michał Mirosław
  Cc: Hans Ulli Kroll, gemini-board-dev, netdev, Christoph Biedl
In-Reply-To: <20101230083905.5A8EB13909@rere.qmqm.pl>

On Thu, 2010-12-30 at 09:39 +0100, Michał Mirosław wrote:
> Driver for SL351x (Gemini) SoC ethernet peripheral. Based in part
> on work by Paulius Zaleckas and GPLd code from Raidsonic and other
> NAS vendors.
> Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
> diff --git a/drivers/net/sl351x.c b/drivers/net/sl351x.c

Output trivia:

> +static void gmac_tx_interrupt(struct net_device *dev, unsigned txq_num)
[]
> +	netif_info(gmac, tx_done, dev, "txirq%u: %u,%u,%u\n",
> +		txq_num, txq->cptr, GET_RPTR(ptr_reg), GET_WPTR(ptr_reg));

pointers as decimal?
There seems to be a lot of output using %u where I expected %08x.

> +static int gmac_start_xmit(struct sk_buff *skb, struct net_device *dev)
[]
> +	netif_info(gmac, tx_queued, dev, "txq%u: %u,%u,%u ? %p (%u @ %p) /%u\n",
> +		txq_num, txq->cptr, GET_RPTR(ptr_reg), GET_WPTR(ptr_reg),
> +		skb, skb->len, skb->data, skb_shinfo(skb)->gso_size);
[]
> +	netif_info(gmac, tx_queued, dev, "txq%u: %u,%u,%u + %p\n",
> +		txq_num, txq->cptr, GET_RPTR(ptr_reg), w, skb);
[]
> +static unsigned gmac_rx(struct net_device *dev, unsigned budget)
[]
> +	netif_info(gmac, rx_status, dev, "rxq: %u,%u\n",
> +		 GET_RPTR(ptr_reg), GET_WPTR(ptr_reg));

etc...


^ permalink raw reply

* Re: [PATCH v2 00/12] make rpc_pipefs be mountable multiple time
From: Rob Landley @ 2010-12-30  9:10 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Rob Landley, Trond Myklebust, J. Bruce Fields, Neil Brown,
	Pavel Emelyanov, linux-nfs-u79uwXL29TY76Z2rM5mHXA,
	David S. Miller, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20101230085139.GA29697-oKw7cIdHH8eLwutG50LtGA@public.gmane.org>

On 12/30/2010 02:51 AM, Kirill A. Shutemov wrote:
> On Wed, Dec 29, 2010 at 08:13:50PM -0600, Rob Landley wrote:
>> On Wed, Dec 29, 2010 at 7:14 AM, Kirill A. Shutemov<kas-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>  wrote:
>>>
>>> Prepare nfs/sunrpc stack to use multiple instances of rpc_pipefs.
>>> Only for client for now.
>>
>> What would a test case for this look like?  (Is there some way to tell
>> an nfs mount to use a specific instance of rpc_pipefs or something?)
>
> You can create a new instance of rpc_pipefs using 'newinstance'
> mountoption.
>
> Then you can specify which rpc_pipefs to use with 'rpcmount' mountoption
> of nfs mount. If none specifed, '/var/lib/nfs/rpc_pipefs' uses by default.

That path is as the process performing the mount sees it?

> If no rpcmount mountoption, no rpc_pipefs was found at
> '/var/lib/nfs/rpc_pipefs' and we are in init's mount namespace, we use
> init_rpc_pipefs.

It's the "we are in init's mount namespace" that I was wondering about.

So if I naively chroot, nfs mount stops working the way it did before I
chrooted unless I do an extra setup step?

I'm actually poking at getting nfs mount working in LXC containers with 
different network routing (mostly study so far, it took me a couple 
weeks just to get lxc to work for me and now I'm trying to wrap my head 
around Linux's NFS implementation), so I'm very interested in this...

Rob
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] mac80211: fix mesh forwarding when ratelimited too
From: Johannes Berg @ 2010-12-30  8:53 UTC (permalink / raw)
  To: Milton Miller
  Cc: John W. Linville, Javier Cardona, David S. Miller, linux-wireless,
	netdev, linux-kernel
In-Reply-To: <mac80211-rx-ratelimit-goto@mdm.bga.com>

On Thu, 2010-12-30 at 02:01 -0600, Milton Miller wrote:
> Commit b51aff057c9d0ef6c529dc25fd9f775faf7b6c63 said:
> 
>     Under memory pressure, the mac80211 mesh code
>     may helpfully print a message that it failed
>     to clone a mesh frame and then will proceed
>     to crash trying to use it anyway. Fix that.
>     
> Avoid the reference whenever the frame copy is unsuccessful
> regardless of the debug message being suppressed or printed.
> 
> Cc: stable@kernel.org [2.6.27+] 
> Signed-off-by: Milton Miller <miltonm@bga.com>
> ---
> I chose a seperate if vs nesting the ratelimit check to avoid shifting
> the printk further to the right.
> 
> diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
> index b01e467..e98668f 100644
> --- a/net/mac80211/rx.c
> +++ b/net/mac80211/rx.c
> @@ -1788,11 +1788,11 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
>  
>  			fwd_skb = skb_copy(skb, GFP_ATOMIC);
>  
> -			if (!fwd_skb && net_ratelimit()) {
> +			if (!fwd_skb && net_ratelimit())
>  				printk(KERN_DEBUG "%s: failed to clone mesh frame\n",
>  						   sdata->name);
> +			if (!fwd_skb)
>  				goto out;
> -			}

Oops, good catch! Thanks.

johannes

^ permalink raw reply

* Re: [PATCH v2 00/12] make rpc_pipefs be mountable multiple time
From: Kirill A. Shutemov @ 2010-12-30  8:51 UTC (permalink / raw)
  To: Rob Landley
  Cc: Kirill A. Shutemov, Trond Myklebust, J. Bruce Fields, Neil Brown,
	Pavel Emelyanov, linux-nfs, David S. Miller, netdev, linux-kernel
In-Reply-To: <AANLkTi=AXz21wDeE1vPVD-6cEtj6faXN9x09eHDCsTkB@mail.gmail.com>

On Wed, Dec 29, 2010 at 08:13:50PM -0600, Rob Landley wrote:
> On Wed, Dec 29, 2010 at 7:14 AM, Kirill A. Shutemov <kas@openvz.org> wrote:
> >
> > Prepare nfs/sunrpc stack to use multiple instances of rpc_pipefs.
> > Only for client for now.
> 
> What would a test case for this look like?  (Is there some way to tell
> an nfs mount to use a specific instance of rpc_pipefs or something?)

You can create a new instance of rpc_pipefs using 'newinstance'
mountoption.

Then you can specify which rpc_pipefs to use with the 'rpcmount' mount option
of the nfs mount. If none is specified, '/var/lib/nfs/rpc_pipefs' is used by
default. If there is no rpcmount mount option, no rpc_pipefs is found at
'/var/lib/nfs/rpc_pipefs', and we are in init's mount namespace, we use
init_rpc_pipefs.

-- 
 Kirill A. Shutemov

^ permalink raw reply

* [PATCH v4] Gemini: Gigabit ethernet driver
From: Michał Mirosław @ 2010-12-30  8:39 UTC (permalink / raw)
  To: Hans Ulli Kroll, gemini-board-dev; +Cc: netdev, Christoph Biedl
In-Reply-To: <20101229130207.4F66413909@rere.qmqm.pl>

Driver for SL351x (Gemini) SoC ethernet peripheral. Based in part
on work by Paulius Zaleckas and GPLd code from Raidsonic and other
NAS vendors.

Tested on Raidsonic IcyBox 4220-B (dual SATA NAS).

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
---

Against: v2.6.37-rc8

This depends on commit 83097f22d1f91a748bb2c066d049d0d1d2843ef2

        Gemini: create platform device for ethernet in Raidsonic IB-4220B

from  git://git.berlios.de/gemini-board

Note for testers: you may tweak DEFAULT_RX_BUF_ORDER (=log2(buffer size),
at most a page - so in range [6..12]) and RX_MAX_ALLOC_ORDER (max allocated
rx buffer page order) to find best memory usage/performance settings apart
from what is available via ethtool. For TX, there's not much you can do when
offloads are disabled.

MAC address needs to be set by userspace from flash VCTL partition (mtd4
on my box).

Changes from v3:
 - fixed remaining tx_queue_len misuse bugs
 - bulk RX DMA page map/unmap
 - whitespace changes to make checkpatch happier (please ignore remaining
   complaints - long lines in .c and typedefs/whitespace/long lines in .h)

Changes from v2:
 - converted to page buffers and napi_gro_frags()
 - later IRQ acking and NAPI exits
 - larger rings by default
 - tx-interrupt coalescing
 - MTU changing
 - jumbo frames support
 - ringparam and coalesce settings via ethtool
 - more fixes/cleanups

Changes from v1:
 - fixed stats (now using u64_stats_sync; no-op on UP anyway)
 - pre-load mdio-gpio if built as module
 - disable TX checksum offload by default (unreliable HW)
 - convert to NAPI+GRO (netperf TCP STREAM RX test:
        before: 156mbit/s, now: 185mbit/s)

Later TODO:
 - netpoll (netconsole)
 - parse MAC address from flash settings and pass it through platform data

---
 MAINTAINERS             |    9 +
 drivers/net/Kconfig     |   10 +
 drivers/net/Makefile    |    1 +
 drivers/net/sl351x.c    | 2323 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/net/sl351x_hw.h | 1436 +++++++++++++++++++++++++++++
 5 files changed, 3779 insertions(+), 0 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 71e40f9..694de04 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5439,6 +5439,15 @@ S:	Maintained
 F:	drivers/net/skge.*
 F:	drivers/net/sky2.*
 
+SL351X (STORLINK GEMINI SOC) GIGABIT ETHERNET DRIVER
+M:	Michał Mirosław <mirq-linux@rere.qmqm.pl>
+L:	netdev@vger.kernel.org
+L:	gemini-board-dev@lists.berlios.de
+T:	git git://git.berlios.de/gemini-board
+S:	Maintained
+F:	drivers/net/sl351x.c
+F:	drivers/net/sl351x_hw.h
+
 SLAB ALLOCATOR
 M:	Christoph Lameter <cl@linux-foundation.org>
 M:	Pekka Enberg <penberg@cs.helsinki.fi>
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 4f1755b..0336c4f 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -2068,6 +2068,16 @@ config ACENIC_OMIT_TIGON_I
 
 	  The safe and default value for this is N.
 
+config GEMINI_SL351X
+	tristate "StorLink SL351x Gigabit Ethernet support"
+	depends on ARCH_GEMINI
+	select PHYLIB
+	select MDIO_BITBANG
+	select MDIO_GPIO
+	select CRC32
+	help
+	  This driver supports StorLink SL351x (Gemini) dual Gigabit Ethernet.
+
 config DL2K
 	tristate "DL2000/TC902x-based Gigabit Ethernet support"
 	depends on PCI
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index b90738d..2d3491b 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -257,6 +257,7 @@ obj-$(CONFIG_MLX4_CORE) += mlx4/
 obj-$(CONFIG_ENC28J60) += enc28j60.o
 obj-$(CONFIG_ETHOC) += ethoc.o
 obj-$(CONFIG_GRETH) += greth.o
+obj-$(CONFIG_GEMINI_SL351X) += sl351x.o
 
 obj-$(CONFIG_XTENSA_XT2000_SONIC) += xtsonic.o
 
diff --git a/drivers/net/sl351x.c b/drivers/net/sl351x.c
new file mode 100644
index 0000000..de21f07
--- /dev/null
+++ b/drivers/net/sl351x.c
@@ -0,0 +1,2323 @@
+/*
+ *  Ethernet device driver for Gemini SoC (SL351x GMAC).
+ *
+ *  Copyright (C) 2010, Michał Mirosław <mirq-linux@rere.qmqm.pl>
+ *
+ *  Based on work by Paulius Zaleckas <paulius.zaleckas@gmail.com> and
+ *  GPLd spaghetti code from Raidsonic and other Gemini-based NAS vendors.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/dma-mapping.h>
+#include <linux/cache.h>
+#include <linux/interrupt.h>
+
+#include <linux/platform_device.h>
+#include <linux/etherdevice.h>
+#include <linux/if_vlan.h>
+#include <linux/skbuff.h>
+#include <linux/phy.h>
+#include <linux/crc32.h>
+#include <linux/ethtool.h>
+#include <linux/tcp.h>
+#include <linux/u64_stats_sync.h>
+
+#include <mach/hardware.h>
+#include <mach/global_reg.h>
+#include <mach/irqs.h>
+#include <mach/gmac.h>
+#include "sl351x_hw.h"
+
+#define DEFAULT_TX_COALESCE		16
+#define DEFAULT_GMAC_RXQ_ORDER		10
+#define DEFAULT_GMAC_TXQ_ORDER		10
+#define DEFAULT_RX_BUF_ORDER		11
+#define DEFAULT_NAPI_WEIGHT		64
+#define RX_INSERT_BYTES			2
+#define TX_MAX_FRAGS			8
+#define TX_QUEUE_NUM			1	/* max: 6 */
+#define RX_MAX_ALLOC_ORDER		4
+#define NETIF_TSO_FEATURES	\
+	(NETIF_F_TSO|NETIF_F_TSO_ECN|NETIF_F_TSO6)
+
+static int debug_level;
+module_param(debug_level, int, 0600);
+MODULE_PARM_DESC(debug_level, "netif debug level mask");
+
+struct toe_private {	/* device-wide state; references up to two netdevs */
+	void __iomem	*iomem;	/* register base used by toe_reg() */
+	GMAC_RXDESC_T	*freeq_ring;
+	spinlock_t	irq_lock;	/* held around interrupt-enable register updates */
+
+	struct net_device *netdev[2];
+	struct device	*dev;	/* device handed to the DMA mapping API */
+	int		irq;
+
+	unsigned int	freeq_frag_order;	/* log2 of the RX buffer fragment size in bytes */
+	unsigned int	freeq_order;
+	unsigned int	freeq_entries;
+	dma_addr_t	freeq_dma_base;
+
+	struct page	*freeq_page;
+	unsigned int	freeq_page_count;
+	unsigned int	alloc_order;
+	unsigned int	freeq_page_offs;
+};
+
+struct gmac_txq {
+	GMAC_TXDESC_T	*ring;	/* this queue's slice of the TX descriptor ring */
+	unsigned int	cptr;	/* next descriptor index to reclaim on TX completion */
+	struct sk_buff	**skb;	/* skb of each packet, stored at the index of its first descriptor */
+	unsigned int	noirq_packets;	/* countdown: the packet that finds it at 1 gets EOFIE_BIT set */
+} ____cacheline_aligned_in_smp;
+
+struct gmac_private {
+	void __iomem		*dma_iomem;	/* register base used by gmac_dma_reg() */
+
+	void __iomem		*rxq_rwptr;	/* RX queue read/write pointer register (word1 of the queue header) */
+	GMAC_RXDESC_T		*rxq_ring;
+	unsigned int		rxq_order;	/* log2 of the number of RX ring entries */
+
+	struct napi_struct	napi;
+	struct gmac_txq		txq[TX_QUEUE_NUM];
+	unsigned int		txq_order;	/* log2 of the number of entries per TX queue */
+	unsigned int		irq_every_tx_packets;	/* reloaded into txq->noirq_packets after a TX interrupt */
+
+	dma_addr_t		rxq_dma_base;
+	dma_addr_t		txq_dma_base;
+
+	unsigned int		msg_enable;
+	spinlock_t		config_lock;	/* serializes read-modify-write of GMAC_CONFIG0 */
+
+	int			in_reset;
+
+	struct u64_stats_sync	tx_stats_syncp;	/* brackets the counters touched in gmac_start_xmit() */
+	struct u64_stats_sync	rx_stats_syncp;	/* brackets the RX counters */
+	struct u64_stats_sync	ir_stats_syncp;	/* brackets the tx_errors update in gmac_tx_interrupt() */
+
+	struct rtnl_link_stats64 stats;
+	u64			hw_stats[RX_STATS_NUM];
+	u64			rx_stats[RX_STATUS_NUM];	/* indexed by RX descriptor status */
+	u64			rx_csum_stats[RX_CHKSUM_NUM];	/* indexed by RX descriptor checksum status */
+	u64			rx_napi_exits;	/* times the poll routine called napi_complete() */
+	u64			tx_frag_stats[TX_MAX_FRAGS];	/* indexed by the skb's nr_frags */
+	u64			tx_frags_linearized;	/* packets linearized before transmission */
+	u64			tx_hw_csummed;	/* packets queued with CHECKSUM_PARTIAL */
+};
+
+#define GMAC_STATS_NUM	( \
+	RX_STATS_NUM + RX_STATUS_NUM + RX_CHKSUM_NUM + 1 + \
+	TX_MAX_FRAGS + 2)
+
+static const char gmac_stats_strings[GMAC_STATS_NUM][ETH_GSTRING_LEN] = {	/* NOTE(review): order presumably mirrors the counter fields of struct gmac_private - confirm against the ethtool stats code */
+	"GMAC_IN_DISCARDS",	/* presumably names for hw_stats[] start here */
+	"GMAC_IN_ERRORS",
+	"GMAC_IN_MCAST",
+	"GMAC_IN_BCAST",
+	"GMAC_IN_MAC1",
+	"GMAC_IN_MAC2",
+	"RX_STATUS_GOOD_FRAME",	/* presumably names for rx_stats[] start here */
+	"RX_STATUS_TOO_LONG_GOOD_CRC",
+	"RX_STATUS_RUNT_FRAME",
+	"RX_STATUS_SFD_NOT_FOUND",
+	"RX_STATUS_CRC_ERROR",
+	"RX_STATUS_TOO_LONG_BAD_CRC",
+	"RX_STATUS_ALIGNMENT_ERROR",
+	"RX_STATUS_TOO_LONG_BAD_ALIGN",
+	"RX_STATUS_RX_ERR",
+	"RX_STATUS_DA_FILTERED",
+	"RX_STATUS_BUFFER_FULL",
+	"RX_STATUS_11",
+	"RX_STATUS_12",
+	"RX_STATUS_13",
+	"RX_STATUS_14",
+	"RX_STATUS_15",
+	"RX_CHKSUM_IP_UDP_TCP_OK",	/* presumably names for rx_csum_stats[] start here */
+	"RX_CHKSUM_IP_OK_ONLY",
+	"RX_CHKSUM_NONE",
+	"RX_CHKSUM_3",
+	"RX_CHKSUM_IP_ERR_UNKNOWN",
+	"RX_CHKSUM_IP_ERR",
+	"RX_CHKSUM_TCP_UDP_ERR",
+	"RX_CHKSUM_7",
+	"RX_NAPI_EXITS",
+	"TX_FRAGS[1]",	/* TX_MAX_FRAGS (8) entries */
+	"TX_FRAGS[2]",
+	"TX_FRAGS[3]",
+	"TX_FRAGS[4]",
+	"TX_FRAGS[5]",
+	"TX_FRAGS[6]",
+	"TX_FRAGS[7]",
+	"TX_FRAGS[8]",
+	"TX_FRAGS_LINEARIZED",
+	"TX_HW_CSUMMED",
+};
+
+/* Return the per-port driver state kept in the netdev private area. */
+static struct gmac_private *netdev_to_gmac(struct net_device *dev)
+{
+	struct gmac_private *gmac = netdev_priv(dev);
+
+	return gmac;
+}
+
+/* Return the device-wide state that the netdev carries in ml_priv. */
+static struct toe_private *netdev_to_toe(struct net_device *dev)
+{
+	struct toe_private *toe = dev->ml_priv;
+
+	return toe;
+}
+
+/* Map an embedded napi_struct back to the gmac_private that contains it. */
+static struct gmac_private *napi_to_gmac(struct napi_struct *napi)
+{
+	struct gmac_private *gmac;
+
+	gmac = container_of(napi, struct gmac_private, napi);
+
+	return gmac;
+}
+
+/* Address of the register at byte offset reg from the toe register base. */
+static void __iomem *toe_reg(struct toe_private *toe, unsigned int reg)
+{
+	void __iomem *base = toe->iomem;
+
+	return base + reg;
+}
+
+/* Address of the register at byte offset reg from the port's DMA base. */
+static void __iomem *gmac_dma_reg(struct net_device *dev, unsigned int reg)
+{
+	struct gmac_private *gmac = netdev_to_gmac(dev);
+
+	return gmac->dma_iomem + reg;
+}
+
+/* Address of the register at byte offset reg from dev->base_addr. */
+static void __iomem *gmac_ctl_reg(struct net_device *dev, unsigned int reg)
+{
+	void __iomem *base = (void __iomem *)dev->base_addr;
+
+	return base + reg;
+}
+
+/*
+ * Undo the DMA mapping of the buffer referenced by an RX descriptor.
+ * Returns the page backing the buffer, or NULL when the descriptor
+ * carries a zero buffer address.
+ */
+static struct page *toe_unmap_rx_desc(struct toe_private *toe,
+	GMAC_RXDESC_T *rx)
+{
+	struct page *page = NULL;
+
+	if (likely(rx->word2.buf_adr)) {
+		page = dma_to_page(toe->dev, rx->word2.buf_adr);
+
+		dma_unmap_page(toe->dev, rx->word2.buf_adr,
+			1 << toe->freeq_frag_order, DMA_FROM_DEVICE);
+	}
+
+	return page;
+}
+
+/*
+ * Turn on the RX and TX DMA engines of a port and program the rd_ and
+ * td_ fields of the DMA control register, including the number of
+ * bytes inserted in front of every received frame.
+ */
+static void gmac_hw_start(struct net_device *dev)
+{
+	void __iomem *ctrl_reg = gmac_dma_reg(dev, GMAC_DMA_CTRL_REG);
+	GMAC_DMA_CTRL_T	ctrl;
+
+	ctrl.bits32 = __raw_readl(ctrl_reg);
+
+	/* modes common to both directions */
+	ctrl.bits.loopback = 0;
+	ctrl.bits.drop_small_ack = 0;
+
+	/* receive side */
+	ctrl.bits.rd_enable = 1;
+	ctrl.bits.rd_prot = 0;
+	ctrl.bits.rd_burst_size = 3;
+	ctrl.bits.rd_insert_bytes = RX_INSERT_BYTES;
+	ctrl.bits.rd_bus = 3;
+
+	/* transmit side */
+	ctrl.bits.td_enable = 1;
+	ctrl.bits.td_prot = 0;
+	ctrl.bits.td_burst_size = 3;
+	ctrl.bits.td_bus = 3;
+
+	__raw_writel(ctrl.bits32, ctrl_reg);
+}
+
+/* Clear the RX and TX enable bits in the port's DMA control register. */
+static void gmac_hw_stop(struct net_device *dev)
+{
+	void __iomem *ctrl_reg = gmac_dma_reg(dev, GMAC_DMA_CTRL_REG);
+	GMAC_DMA_CTRL_T	ctrl;
+
+	ctrl.bits32 = __raw_readl(ctrl_reg);
+	ctrl.bits.td_enable = 0;
+	ctrl.bits.rd_enable = 0;
+	__raw_writel(ctrl.bits32, ctrl_reg);
+}
+
+/*
+ * Read-modify-write GMAC_CONFIG0 under config_lock: the bits selected
+ * by vmask are cleared, then val is ORed in.
+ */
+static void gmac_update_config0_reg(struct net_device *dev, u32 val, u32 vmask)
+{
+	struct gmac_private *gmac = netdev_to_gmac(dev);
+	void __iomem *config0 = gmac_ctl_reg(dev, GMAC_CONFIG0);
+	unsigned long irqflags;
+	u32 cur;
+
+	spin_lock_irqsave(&gmac->config_lock, irqflags);
+
+	cur = __raw_readl(config0);
+	cur &= ~vmask;
+	cur |= val;
+	__raw_writel(cur, config0);
+
+	spin_unlock_irqrestore(&gmac->config_lock, irqflags);
+}
+
+/* Clear the TX/RX disable bits in GMAC_CONFIG0. */
+static void gmac_enable_tx_rx(struct net_device *dev)
+{
+	const u32 disable_bits = CONFIG0_TX_RX_DISABLE;
+
+	gmac_update_config0_reg(dev, 0, disable_bits);
+}
+
+/* Set the TX/RX disable bits in GMAC_CONFIG0, then wait 10 ms. */
+static void gmac_disable_tx_rx(struct net_device *dev)
+{
+	const u32 disable_bits = CONFIG0_TX_RX_DISABLE;
+
+	gmac_update_config0_reg(dev, disable_bits, disable_bits);
+	mdelay(10);	/* let GMAC consume packet */
+}
+
+/* Program the TX and RX flow control bits of GMAC_CONFIG0. */
+static void gmac_set_flow_control(struct net_device *dev, bool tx, bool rx)
+{
+	u32 flow = 0;
+
+	if (tx)
+		flow |= CONFIG0_FLOW_TX;
+	if (rx)
+		flow |= CONFIG0_FLOW_RX;
+
+	gmac_update_config0_reg(dev, flow, CONFIG0_FLOW_CTL);
+}
+
+/*
+ * Link-change handler passed to phy_connect(): copy link, duplex and
+ * speed from the PHY into the GMAC status register and reprogram flow
+ * control.  The status register is only rewritten (with TX/RX briefly
+ * disabled) when its value actually changes.
+ */
+static void gmac_update_link_state(struct net_device *dev)
+{
+	struct phy_device *phy = dev->phydev;
+	struct gmac_private *gmac = netdev_to_gmac(dev);
+	void __iomem *reg = gmac_ctl_reg(dev, GMAC_STATUS);
+	bool rgmii = phy->interface == PHY_INTERFACE_MODE_RGMII;
+	GMAC_STATUS_T prev, next;
+	const char *fc;
+
+	prev.bits32 = __raw_readl(reg);
+	next.bits32 = prev.bits32;
+
+	next.bits.link = phy->link;
+	next.bits.duplex = phy->duplex;
+
+	switch (phy->speed) {
+	case 1000:
+		next.bits.speed = GMAC_SPEED_1000;
+		if (rgmii)
+			next.bits.mii_rmii = GMAC_PHY_RGMII_1000;
+		break;
+	case 100:
+		next.bits.speed = GMAC_SPEED_100;
+		if (rgmii)
+			next.bits.mii_rmii = GMAC_PHY_RGMII_100_10;
+		break;
+	case 10:
+		next.bits.speed = GMAC_SPEED_10;
+		if (rgmii)
+			next.bits.mii_rmii = GMAC_PHY_RGMII_100_10;
+		break;
+	default:
+		dev_warn(&dev->dev, "Not supported PHY speed (%d)\n",
+			phy->speed);
+		break;
+	}
+
+	/* tx follows pause; rx is pause XOR asym_pause */
+	gmac_set_flow_control(dev, phy->pause,
+		phy->pause ^ phy->asym_pause);
+
+	if (next.bits32 == prev.bits32)
+		return;
+
+	if (netif_msg_link(gmac)) {
+		if (phy->pause)
+			fc = phy->asym_pause ? "tx" : "both";
+		else
+			fc = phy->asym_pause ? "rx" : "none";
+
+		phy_print_status(phy);
+		netdev_info(dev, "link flow control: %s\n", fc);
+	}
+
+	gmac_disable_tx_rx(dev);
+	__raw_writel(next.bits32, reg);
+	gmac_enable_tx_rx(dev);
+}
+
+/*
+ * Connect the port to the PHY named in the platform data and program
+ * the interface type into the GMAC status register.
+ *
+ * Returns 0 on success, the phy_connect() error, or -EINVAL when the
+ * PHY interface mode is none of MII, GMII or RGMII.  dev->phydev is
+ * NULL after any failure.
+ */
+static int gmac_setup_phy(struct net_device *dev)
+{
+	struct toe_private *toe = netdev_to_toe(dev);
+	struct gemini_gmac_platform_data *pdata = toe->dev->platform_data;
+	GMAC_STATUS_T status = { .bits32 = 0 };
+	int port = dev->dev_id;
+	struct phy_device *phy;
+	int err = 0;
+
+	phy = phy_connect(dev, pdata->bus_id[port],
+		&gmac_update_link_state, 0, pdata->interface[port]);
+	dev->phydev = phy;
+
+	if (IS_ERR(phy)) {
+		dev->phydev = NULL;
+		return PTR_ERR(phy);
+	}
+
+	phy->supported &= PHY_GBIT_FEATURES|SUPPORTED_Pause;
+	phy->advertising = phy->supported;
+
+	/* set PHY interface type */
+	switch (phy->interface) {
+	case PHY_INTERFACE_MODE_MII:
+		status.bits.mii_rmii = GMAC_PHY_MII;
+		break;
+	case PHY_INTERFACE_MODE_GMII:
+		status.bits.mii_rmii = GMAC_PHY_GMII;
+		break;
+	case PHY_INTERFACE_MODE_RGMII:
+		status.bits.mii_rmii = GMAC_PHY_RGMII_100_10;
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	if (err) {
+		dev_err(&dev->dev, "Unsupported MII interface\n");
+		phy_disconnect(phy);
+		dev->phydev = NULL;
+		return err;
+	}
+
+	__raw_writel(status.bits32, gmac_ctl_reg(dev, GMAC_STATUS));
+
+	return 0;
+}
+
+/*
+ * Pick the index of the smallest hardware frame-length limit that
+ * still fits max_l3_len plus the Ethernet and VLAN headers.
+ * Returns -1 when even the largest limit (10236 bytes) is too small.
+ */
+static int gmac_pick_rx_max_len(int max_l3_len)
+{
+	/* index = CONFIG_MAXLEN_XXX values */
+	static const int hw_limit[8] = {
+		1536, 1518, 1522, 1542,
+		9212, 10236, 1518, 1518
+	};
+	const int frame_len = max_l3_len + ETH_HLEN + VLAN_HLEN;
+	int best = 5;
+	int idx;
+
+	if (frame_len > hw_limit[best])
+		return -1;
+
+	/* entries 0..4 are the candidates smaller than the fallback */
+	for (idx = 0; idx < 5; idx++) {
+		if (hw_limit[idx] < frame_len)
+			continue;
+		if (hw_limit[idx] < hw_limit[best])
+			best = idx;
+	}
+
+	return best;
+}
+
+/*
+ * Program the static per-port configuration (GMAC_CONFIG0..3 and the
+ * two TX weighting registers) and load the default ring orders and TX
+ * interrupt coalescing value.  TX and RX are left disabled in CONFIG0.
+ * Always returns 0.
+ */
+static int gmac_init(struct net_device *dev)
+{
+	struct gmac_private *gmac = netdev_to_gmac(dev);
+	GMAC_CONFIG0_T old_config0;
+
+	GMAC_CONFIG0_T config0 = { .bits = {
+		.dis_tx = 1,
+		.dis_rx = 1,
+		.ipv4_rx_chksum = 1,
+		.ipv6_rx_chksum = 1,
+		.rx_err_detect = 1,
+		.rgmm_edge = 1,
+		.port0_chk_hwq = 1,
+		.port1_chk_hwq = 1,
+		.port0_chk_toeq = 1,
+		.port1_chk_toeq = 1,
+		.port0_chk_classq = 1,
+		.port1_chk_classq = 1,
+	} };
+	GMAC_CONFIG1_T config1 = { .bits = {
+		.set_threshold = 16,
+		.rel_threshold = 24,
+	} };
+	GMAC_CONFIG2_T config2 = { .bits = {
+		.set_threshold = 16,
+		.rel_threshold = 32,
+	} };
+	GMAC_CONFIG3_T config3 = { .bits = {
+		.set_threshold = 0,
+		.rel_threshold = 0,
+	} };
+	GMAC_TX_WCR0_T hw_weigh = { .bits = {
+		.hw_tq3 = 1,
+		.hw_tq2 = 1,
+		.hw_tq1 = 1,
+		.hw_tq0 = 1,
+	} };
+	GMAC_TX_WCR1_T sw_weigh = { .bits = {
+		.sw_tq5 = 1,
+		.sw_tq4 = 1,
+		.sw_tq3 = 1,
+		.sw_tq2 = 1,
+		.sw_tq1 = 1,
+		.sw_tq0 = 1,
+	} };
+
+	config0.bits.max_len = gmac_pick_rx_max_len(dev->mtu);
+
+	/* carry over whatever the reserved field of CONFIG0 currently holds */
+	old_config0.bits32 = __raw_readl(gmac_ctl_reg(dev, GMAC_CONFIG0));
+	config0.bits.reserved = old_config0.bits.reserved;
+
+	__raw_writel(config0.bits32, gmac_ctl_reg(dev, GMAC_CONFIG0));
+	__raw_writel(config1.bits32, gmac_ctl_reg(dev, GMAC_CONFIG1));
+	__raw_writel(config2.bits32, gmac_ctl_reg(dev, GMAC_CONFIG2));
+	__raw_writel(config3.bits32, gmac_ctl_reg(dev, GMAC_CONFIG3));
+
+	__raw_writel(hw_weigh.bits32,
+		gmac_dma_reg(dev, GMAC_TX_WEIGHTING_CTRL_0_REG));
+	__raw_writel(sw_weigh.bits32,
+		gmac_dma_reg(dev, GMAC_TX_WEIGHTING_CTRL_1_REG));
+
+	gmac->irq_every_tx_packets = DEFAULT_TX_COALESCE;
+	gmac->txq_order = DEFAULT_GMAC_TXQ_ORDER;
+	gmac->rxq_order = DEFAULT_GMAC_RXQ_ORDER;
+
+	return 0;
+}
+
+/* Disconnect the PHY, if one is attached to the netdev. */
+static void gmac_uninit(struct net_device *dev)
+{
+	struct phy_device *phy = dev->phydev;
+
+	if (!phy)
+		return;
+
+	phy_disconnect(phy);
+}
+
+/*
+ * Allocate the software TX queues of a port: one skb pointer table and
+ * one coherent descriptor ring, each split evenly between the
+ * dev->num_tx_queues queues, then hand the ring base and order to the
+ * hardware.
+ *
+ * Returns 0 on success or -ENOMEM when either allocation fails.
+ */
+static int gmac_setup_txqs(struct net_device *dev)
+{
+	struct gmac_private *gmac = netdev_to_gmac(dev);
+	struct toe_private *toe = netdev_to_toe(dev);
+	void __iomem *rwptr_reg = gmac_dma_reg(dev, GMAC_SW_TX_QUEUE0_PTR_REG);
+	void __iomem *base_reg = gmac_dma_reg(dev, GMAC_SW_TX_QUEUE_BASE_REG);
+
+	unsigned int n_txq = dev->num_tx_queues;
+	struct gmac_txq *txq = gmac->txq;
+	GMAC_TXDESC_T *desc_ring;
+	struct sk_buff **skb_tab;
+	unsigned int i;
+
+	skb_tab = kzalloc(
+		n_txq * sizeof(*skb_tab) << gmac->txq_order, GFP_KERNEL);
+	if (!skb_tab)
+		return -ENOMEM;
+
+	desc_ring = dma_alloc_coherent(toe->dev,
+		n_txq * sizeof(*desc_ring) << gmac->txq_order,
+		&gmac->txq_dma_base, GFP_KERNEL);
+	if (!desc_ring) {
+		kfree(skb_tab);
+		return -ENOMEM;
+	}
+
+	BUG_ON(gmac->txq_dma_base & ~DMA_Q_BASE_MASK);
+
+	/*
+	 * Give every queue its own slice of both allocations.  txq has to
+	 * advance together with the slices, otherwise only gmac->txq[0]
+	 * would ever be initialised (and overwritten on each pass).
+	 */
+	for (i = 0; i < n_txq; i++, txq++) {
+		netif_info(gmac, ifup, dev,
+			"txq%u: ring %p (dma 0x%08x), skb %p, rwptr %p, len %u (order %u)\n",
+			i, desc_ring, gmac->txq_dma_base, skb_tab, rwptr_reg,
+			1 << gmac->txq_order, gmac->txq_order);
+
+		__raw_writel(0, rwptr_reg);
+		txq->ring = desc_ring;
+		txq->cptr = 0;
+		txq->skb = skb_tab;
+
+		desc_ring += 1 << gmac->txq_order;
+		skb_tab += 1 << gmac->txq_order;
+		rwptr_reg += 4;
+	}
+
+	__raw_writel(gmac->txq_dma_base | gmac->txq_order, base_reg);
+
+	return 0;
+}
+
+/*
+ * Tear down the software TX queues of a port: reset the queue pointer
+ * registers, free every skb still recorded in the skb tables, clear
+ * the ring base register and release the skb table and the descriptor
+ * ring (both are addressed through queue 0).
+ */
+static void gmac_cleanup_txqs(struct net_device *dev)
+{
+	struct gmac_private *gmac = netdev_to_gmac(dev);
+	struct toe_private *toe = netdev_to_toe(dev);
+	void __iomem *ptr_regs = gmac_dma_reg(dev, GMAC_SW_TX_QUEUE0_PTR_REG);
+	void __iomem *base_reg = gmac_dma_reg(dev, GMAC_SW_TX_QUEUE_BASE_REG);
+
+	unsigned queues = dev->num_tx_queues;
+	int q, slot;
+
+	for (q = 0; q < queues; q++) {
+		struct gmac_txq *txq = &gmac->txq[q];
+
+		__raw_writel(0, ptr_regs + 4 * q);
+
+		for (slot = 0; slot < (1 << gmac->txq_order); slot++) {
+			struct sk_buff *skb = txq->skb[slot];
+
+			if (skb)
+				dev_kfree_skb(skb);
+		}
+	}
+
+	__raw_writel(0, base_reg);
+
+	kfree(gmac->txq[0].skb);
+	dma_free_coherent(toe->dev,
+		queues * sizeof(*gmac->txq[0].ring) << gmac->txq_order,
+		gmac->txq[0].ring, gmac->txq_dma_base);
+}
+
+/*
+ * Allocate the coherent RX descriptor ring of a port and program its
+ * base address and order into the port's default queue header.
+ *
+ * Returns 0 on success or -ENOMEM when the ring cannot be allocated.
+ */
+static int gmac_setup_rxq(struct net_device *dev)
+{
+	struct toe_private *toe = netdev_to_toe(dev);
+	struct gmac_private *gmac = netdev_to_gmac(dev);
+	NONTOE_QHDR_T __iomem *hdr;
+
+	hdr = toe_reg(toe, TOE_DEFAULT_Q_HDR_BASE(dev->dev_id));
+	gmac->rxq_rwptr = &hdr->word1;
+
+	gmac->rxq_ring = dma_alloc_coherent(toe->dev,
+		sizeof(*gmac->rxq_ring) << gmac->rxq_order,
+		&gmac->rxq_dma_base, GFP_KERNEL);
+	if (gmac->rxq_ring == NULL)
+		return -ENOMEM;
+
+	BUG_ON(gmac->rxq_dma_base & ~NONTOE_QHDR0_BASE_MASK);
+
+	/* empty queue first, then publish base address and size */
+	__raw_writel(0, gmac->rxq_rwptr);
+	__raw_writel(gmac->rxq_dma_base | gmac->rxq_order, &hdr->word0);
+
+	netif_info(gmac, ifup, dev,
+		"rxq: ring %p (dma 0x%08x), rwptr %p, len %u (order %u)\n",
+		gmac->rxq_ring, gmac->rxq_dma_base, gmac->rxq_rwptr,
+		1 << gmac->rxq_order, gmac->rxq_order);
+
+	return 0;
+}
+
+/*
+ * Tear down the RX queue of a port: clear the queue header registers,
+ * unmap and release the pages of all descriptors between the read and
+ * write pointers, then free the descriptor ring.
+ */
+static void gmac_cleanup_rxq(struct net_device *dev)
+{
+	struct toe_private *toe = netdev_to_toe(dev);
+	struct gmac_private *gmac = netdev_to_gmac(dev);
+
+	NONTOE_QHDR_T __iomem *hdr = toe_reg(toe, TOE_DEFAULT_Q_HDR_BASE(dev->dev_id));
+	void __iomem *base_reg = &hdr->word0;
+	void __iomem *rwptr_reg = &hdr->word1;
+	unsigned mask = __RWPTR_MASK(gmac->rxq_order);
+	unsigned pos, end;
+
+	/* sample both pointers before the registers are cleared */
+	pos = GET_RPTR(rwptr_reg);
+	end = GET_WPTR(rwptr_reg);
+	__raw_writel(0, rwptr_reg);
+	__raw_writel(0, base_reg);
+
+	while (pos != end) {
+		struct page *page = toe_unmap_rx_desc(toe, &gmac->rxq_ring[pos]);
+
+		if (likely(page))
+			put_page(page);
+
+		pos = __RWPTR_NEXT(pos, mask);
+	}
+
+	dma_free_coherent(toe->dev, sizeof(*gmac->rxq_ring) << gmac->rxq_order,
+		gmac->rxq_ring, gmac->rxq_dma_base);
+}
+
+static void __gmac_enable_txfin_irq(struct net_device *, int txq, int enable);
+
+static void gmac_tx_interrupt(struct net_device *dev, unsigned txq_num)	/* reclaim the completed TX descriptors of queue txq_num */
+{
+	struct gmac_private *gmac = netdev_to_gmac(dev);
+	struct toe_private *toe = netdev_to_toe(dev);
+
+	void __iomem *ptr_reg = gmac_dma_reg(dev, GMAC_SW_TX_QUEUE_PTR_REG(txq_num));
+	struct netdev_queue *ntxq = netdev_get_tx_queue(dev, txq_num);
+	struct gmac_txq *txq = &gmac->txq[txq_num];
+
+	unsigned i, n, errs = 0, mask = __RWPTR_MASK(gmac->txq_order);
+	struct sk_buff *skb;
+	GMAC_TXDESC_T *tx;
+
+	netif_info(gmac, tx_done, dev, "txirq%u: %u,%u,%u\n",
+		txq_num, txq->cptr, GET_RPTR(ptr_reg), GET_WPTR(ptr_reg));
+
+	for (i = txq->cptr; i != GET_RPTR(ptr_reg); i = __RWPTR_NEXT(i, mask)) {	/* from the last reclaimed slot up to the hardware read pointer */
+retry:	/* re-entered from the bottom of the function when the read pointer moved meanwhile */
+		tx = &txq->ring[i];
+		skb = txq->skb[i];
+		txq->skb[i] = NULL;	/* the slot no longer owns the skb */
+
+		BUG_ON(!skb);
+
+		dma_unmap_single(toe->dev, tx->word2.buf_adr,
+			tx->word0.bits.buffer_size, DMA_TO_DEVICE);
+
+		if (tx->word0.bits.status_tx_ok) {
+			netif_info(gmac, tx_done, dev,
+				"TX done descriptor: [%u] 0x%08x 0x%08x 0x%08x 0x%08x\n",
+				i, tx->word0.bits32, tx->word1.bits32,
+				tx->word2.bits32, tx->word3.bits32);
+		} else {
+			errs++;
+			netif_err(gmac, tx_err, dev,
+				"TX error descriptor: [%u] 0x%08x 0x%08x 0x%08x 0x%08x\n",
+				i, tx->word0.bits32, tx->word1.bits32,
+				tx->word2.bits32, tx->word3.bits32);
+		}
+
+		n = tx->word0.bits.desc_count;	/* descriptors of this packet, head included; presumably filled in by hardware - confirm */
+		BUG_ON(__RWPTR_DISTANCE(i, GET_RPTR(ptr_reg), mask) < n);
+
+		while (--n) {	/* the remaining n-1 descriptors are page fragments */
+			i = __RWPTR_NEXT(i, mask);
+			dma_unmap_page(toe->dev, txq->ring[i].word2.buf_adr,
+				txq->ring[i].word0.bits.buffer_size,
+				DMA_TO_DEVICE);
+			netif_info(gmac, tx_done, dev,
+				"TX frag descriptor: [%u] 0x%08x 0x%08x 0x%08x 0x%08x\n",
+				i, txq->ring[i].word0.bits32,
+				txq->ring[i].word1.bits32,
+				txq->ring[i].word2.bits32,
+				txq->ring[i].word3.bits32);
+		}
+
+		dev_kfree_skb_irq(skb);
+	}
+
+	spin_lock(&toe->irq_lock);	/* also covers the interrupt-enable register changed below */
+
+	u64_stats_update_begin(&gmac->ir_stats_syncp);
+	gmac->stats.tx_errors += errs;
+	u64_stats_update_end(&gmac->ir_stats_syncp);
+
+	txq->cptr = i;
+	__gmac_enable_txfin_irq(dev, txq_num, 0);	/* enable = 0: mask the TX-finished interrupt again */
+	netif_tx_wake_queue(ntxq);
+
+	spin_unlock(&toe->irq_lock);
+
+	__raw_writel(	/* write this queue's EOF/FIN bits to status register 0; presumably a write-1-to-clear ack - confirm */
+		(GMAC0_SWTQ00_EOF_INT_BIT|GMAC0_SWTQ00_FIN_INT_BIT)
+			<< (6 * dev->dev_id + txq_num),
+		toe_reg(toe, GLOBAL_INTERRUPT_STATUS_0_REG));
+	txq->noirq_packets = gmac->irq_every_tx_packets;	/* restart the coalescing countdown */
+
+	if (unlikely(i != GET_RPTR(ptr_reg))) {	/* more descriptors completed meanwhile: process them too */
+		errs = 0;
+		goto retry;
+	}
+}
+
+/*
+ * Per-segment packet length for a GSO skb: headers up to and including
+ * TCP plus gso_size.  Returns 0 for non-GSO skbs.
+ */
+static inline unsigned tss_pkt_len(struct sk_buff *skb)
+{
+	if (skb_is_gso(skb))
+		return skb_transport_offset(skb) +
+			tcp_hdrlen(skb) + skb_shinfo(skb)->gso_size;
+
+	return 0;
+}
+
+static int gmac_map_tx_bufs(struct net_device *dev, struct sk_buff *skb,	/* DMA-map skb and fill TX descriptors starting at ring index desc */
+	struct gmac_txq *txq, int desc, unsigned tss_flags)	/* returns the ring index after the last descriptor used, or -ENOMEM */
+{
+	struct gmac_private *gmac = netdev_to_gmac(dev);
+	struct device *dma_dev = netdev_to_toe(dev)->dev;
+	skb_frag_t *frag;
+	dma_addr_t mapping;
+	int nfrags, w;
+
+	frag = skb_shinfo(skb)->frags;
+	nfrags = skb_shinfo(skb)->nr_frags;
+	w = desc;
+
+	mapping = dma_map_single(dma_dev, skb->data,	/* the linear part goes into the first descriptor */
+		skb_headlen(skb), DMA_TO_DEVICE);
+	if (dma_mapping_error(dma_dev, mapping))
+		goto map1_error;
+
+	if (!skb_is_gso(skb))
+		tss_flags |= TSS_BYPASS_BIT;
+
+	txq->ring[w].word0.bits32 = skb_headlen(skb);	/* length of this buffer */
+	txq->ring[w].word1.bits32 = skb->len | tss_flags;	/* total packet length plus offload flags */
+	txq->ring[w].word2.bits32 = mapping;	/* DMA address of this buffer */
+	txq->ring[w].word3.bits32 = tss_pkt_len(skb) | SOF_BIT;	/* tss_pkt_len() is 0 for non-GSO skbs */
+
+	/* racing with TX completion irq, harmless */
+	if (txq->noirq_packets == 1) {	/* countdown expired: flag this packet with EOFIE_BIT */
+		txq->noirq_packets = 0;
+		txq->ring[w].word3.bits32 |= EOFIE_BIT;
+	} else if (txq->noirq_packets)
+		txq->noirq_packets--;
+
+	netif_info(gmac, tx_queued, dev,
+		"txq%ld[%u]: 0x%08x 0x%08x 0x%08x 0x%08x, datap %p\n",
+		txq - gmac->txq, w,
+		txq->ring[w].word0.bits32, txq->ring[w].word1.bits32,
+		txq->ring[w].word2.bits32, txq->ring[w].word3.bits32,
+		skb->data);
+
+	while (nfrags--) {	/* one further descriptor per page fragment */
+		mapping = dma_map_page(dma_dev, frag->page,
+			frag->page_offset, frag->size, DMA_TO_DEVICE);
+		if (dma_mapping_error(dma_dev, mapping))
+			goto map_error;
+
+		w = RWPTR_NEXT(w, gmac->txq_order);
+		txq->ring[w].word0.bits32 = frag->size;
+		txq->ring[w].word1.bits32 = 0;
+		txq->ring[w].word2.bits32 = mapping;
+		txq->ring[w].word3.bits32 = 0;
+
+		netif_info(gmac, tx_queued, dev,
+			"txq%ld[%u]: 0x%08x 0x%08x 0x%08x 0x%08x, data %u @ %p+0x%03x\n",
+			txq - gmac->txq, w,
+			txq->ring[w].word0.bits32, txq->ring[w].word1.bits32,
+			txq->ring[w].word2.bits32, txq->ring[w].word3.bits32,
+			frag->size, frag->page, frag->page_offset);
+
+		++frag;
+	}
+
+	txq->ring[w].word3.bits32 |= EOFIE_BIT | EOF_BIT;	/* mark the last descriptor used as end of frame */
+
+	return RWPTR_NEXT(w, gmac->txq_order);
+
+map_error:	/* w is the last descriptor written; unmap page fragments back down to desc */
+	while (w != desc) {
+		dma_unmap_page(dma_dev, txq->ring[w].word2.buf_adr,
+			txq->ring[w].word0.bits.buffer_size, DMA_TO_DEVICE);
+		w = RWPTR_PREV(w, gmac->txq_order);
+	}
+
+	dma_unmap_single(dma_dev, txq->ring[w].word2.buf_adr,	/* then the linear part held by the first descriptor */
+		txq->ring[w].word0.bits.buffer_size, DMA_TO_DEVICE);
+
+map1_error:
+	netif_info(gmac, tx_err, dev,
+		"txq%ld: DMA mapping error\n", txq - gmac->txq);
+
+	return -ENOMEM;
+}
+
+/*
+ * Queue one skb on the TX queue selected by its queue mapping.
+ *
+ * The packet is dropped (and counted in tx_dropped) when it is 64 KiB
+ * or longer, when the ring has no free descriptor, when it cannot be
+ * linearized, or when DMA mapping fails.  In every case the skb is
+ * consumed and NETDEV_TX_OK is returned.
+ */
+static int gmac_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct gmac_private *gmac = netdev_to_gmac(dev);
+	struct toe_private *toe = netdev_to_toe(dev);
+
+	void __iomem *ptr_reg;
+	struct gmac_txq *txq;
+	struct netdev_queue *ntxq;
+	int w, nw, txq_num, nfrags;
+	unsigned long flags;
+
+	SKB_FRAG_ASSERT(skb);
+
+	txq_num = skb_get_queue_mapping(skb);
+	ptr_reg = gmac_dma_reg(dev, GMAC_SW_TX_QUEUE_PTR_REG(txq_num));
+	txq = &gmac->txq[txq_num];
+	ntxq = netdev_get_tx_queue(dev, txq_num);
+
+	netif_info(gmac, tx_queued, dev, "txq%u: %u,%u,%u ? %p (%u @ %p) /%u\n",
+		txq_num, txq->cptr, GET_RPTR(ptr_reg), GET_WPTR(ptr_reg),
+		skb, skb->len, skb->data, skb_shinfo(skb)->gso_size);
+	if (netif_msg_pktdata(gmac))
+		print_hex_dump(KERN_DEBUG, "TX: ", DUMP_PREFIX_OFFSET, 16, 1,
+			skb->data, skb_headlen(skb), true);
+
+	u64_stats_update_begin(&gmac->tx_stats_syncp);
+
+	if (skb->len >= 0x10000)
+		goto out_drop_free;
+
+	/* number of descriptors still free between write pointer and cptr */
+	w = GET_WPTR(ptr_reg);
+	spin_lock_irqsave(&toe->irq_lock, flags);
+	nw = RWPTR_DISTANCE(w, txq->cptr - 1, gmac->txq_order);
+	if (!nw) {
+		netif_tx_stop_queue(ntxq);
+		__gmac_enable_txfin_irq(dev, txq_num, 1);
+		spin_unlock_irqrestore(&toe->irq_lock, flags);
+		goto out_drop_free;
+	}
+	spin_unlock_irqrestore(&toe->irq_lock, flags);
+
+	nfrags = skb_shinfo(skb)->nr_frags;
+	if (nw <= nfrags || nfrags >= TX_MAX_FRAGS) {
+		/*
+		 * skb_linearize() leaves the skb untouched on failure, so
+		 * it still has to be freed here or it would leak.
+		 */
+		if (skb_linearize(skb))
+			goto out_drop_free;
+		gmac->tx_frags_linearized++;
+	} else
+		gmac->tx_frag_stats[nfrags]++;
+
+	txq->skb[w] = skb;
+
+	nw = gmac_map_tx_bufs(dev, skb, txq, w,
+		(skb->ip_summed != CHECKSUM_NONE ? TSS_CHECKUM_ENABLE : 0)
+	);
+
+	if (nw < 0) {
+		/*
+		 * The skb is freed below; do not leave a stale pointer in
+		 * the table for gmac_cleanup_txqs() to free a second time.
+		 */
+		txq->skb[w] = NULL;
+		goto out_drop_free;
+	}
+	w = nw;
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
+		gmac->tx_hw_csummed++;
+
+	ntxq->tx_bytes += skb->len;
+	ntxq->tx_packets++;
+
+	u64_stats_update_end(&gmac->tx_stats_syncp);
+
+	netif_info(gmac, tx_queued, dev, "txq%u: %u,%u,%u + %p\n",
+		txq_num, txq->cptr, GET_RPTR(ptr_reg), w, skb);
+
+	/* advancing the write pointer hands the descriptors to the hardware */
+	SET_WPTR(ptr_reg, w);
+	ntxq->trans_start = jiffies;
+
+	/* stats updated on tx completion */
+	return NETDEV_TX_OK;
+
+out_drop_free:
+	dev_kfree_skb(skb);
+	ntxq->tx_dropped++;
+	u64_stats_update_end(&gmac->tx_stats_syncp);
+	return NETDEV_TX_OK;
+}
+
+/* Undo the DMA mapping of every page fragment attached to skb. */
+static void gmac_unmap_rx_frags(struct toe_private *toe, struct sk_buff *skb)
+{
+	int nr = skb_shinfo(skb)->nr_frags;
+	int idx;
+
+	for (idx = 0; idx < nr; idx++) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];
+		dma_addr_t addr = page_to_dma(toe->dev, frag->page);
+
+		dma_unmap_page(toe->dev, addr + frag->page_offset,
+			frag->size, DMA_FROM_DEVICE);
+	}
+}
+
+/*
+ * Discard the packet currently being assembled in the NAPI frags skb:
+ * unmap its fragments, free it and count one rx_dropped.
+ * Always returns NULL so callers can reset their skb pointer with it.
+ */
+static struct sk_buff *gmac_drop_napi_skb(struct gmac_private *gmac,
+	struct toe_private *toe)
+{
+	struct sk_buff *partial = napi_get_frags(&gmac->napi);
+
+	gmac_unmap_rx_frags(toe, partial);
+	napi_free_frags(&gmac->napi);
+
+	u64_stats_update_begin(&gmac->rx_stats_syncp);
+	gmac->stats.rx_dropped += 1;
+	u64_stats_update_end(&gmac->rx_stats_syncp);
+
+	return NULL;
+}
+
+/*
+ * Inspect the first RX descriptor of a frame, update the RX counters
+ * and, if the frame is error-free, return the NAPI frags skb it should
+ * be assembled into.
+ *
+ * Returns NULL when the descriptor reports an error, the frame is
+ * shorter than ETH_ZLEN, or no skb is available (counted as
+ * rx_dropped).
+ */
+static struct sk_buff *gmac_skb_if_good_frame(struct gmac_private *gmac,
+	GMAC_RXDESC_T *rx)
+{
+	unsigned len = rx->word1.bits.byte_count;
+	unsigned status = rx->word0.bits.status;
+	unsigned csum = rx->word0.bits.chksum_status;
+	struct sk_buff *skb = NULL;
+	bool bad;
+
+	u64_stats_update_begin(&gmac->rx_stats_syncp);
+
+	gmac->rx_stats[status]++;
+	gmac->rx_csum_stats[csum]++;
+
+	bad = rx->word0.bits.derr || rx->word0.bits.perr ||
+	      status || len < ETH_ZLEN ||
+	      csum >= RX_CHKSUM_IP_ERR_UNKNOWN;
+
+	if (bad) {
+		gmac->stats.rx_errors++;
+
+		if (len < ETH_ZLEN || RX_ERROR_LENGTH(status))
+			gmac->stats.rx_length_errors++;
+		if (RX_ERROR_OVER(status))
+			gmac->stats.rx_over_errors++;
+		if (RX_ERROR_CRC(status))
+			gmac->stats.rx_crc_errors++;
+		if (RX_ERROR_FRAME(status))
+			gmac->stats.rx_frame_errors++;
+	} else {
+		skb = napi_get_frags(&gmac->napi);
+		if (skb) {
+			/* hardware verified IP and TCP/UDP checksums */
+			if (csum == RX_CHKSUM_IP_UDP_TCP_OK)
+				skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+			gmac->stats.rx_bytes += len;
+			gmac->stats.rx_packets++;
+		} else {
+			gmac->stats.rx_dropped++;
+		}
+	}
+
+	u64_stats_update_end(&gmac->rx_stats_syncp);
+
+	return skb;
+}
+
+/*
+ * Receive up to budget packets from the port's RX queue.
+ *
+ * Descriptors between the hardware read and write pointers are
+ * consumed one by one; the pages they reference are appended as
+ * fragments to the NAPI frags skb until a descriptor carrying EOF_BIT
+ * completes the packet, which is then passed to napi_gro_frags().
+ * A packet still incomplete when the loop ends is dropped.
+ *
+ * Returns the part of budget that was not used.
+ */
+static unsigned gmac_rx(struct net_device *dev, unsigned budget)
+{
+	struct gmac_private *gmac = netdev_to_gmac(dev);
+	struct toe_private *toe = netdev_to_toe(dev);
+	void __iomem *ptr_reg = gmac->rxq_rwptr;
+
+	unsigned i, mask = __RWPTR_MASK(gmac->rxq_order);
+	GMAC_RXDESC_T *rx = NULL;
+	struct sk_buff *skb = NULL;
+	struct page *page, *last_page = NULL;
+	unsigned page_offs, next_offs = 0;
+	unsigned pkt_size = 0, frag_size;
+	int frag_nr = 0;
+
+	netif_info(gmac, rx_status, dev, "rxq: %u,%u\n",
+		 GET_RPTR(ptr_reg), GET_WPTR(ptr_reg));
+
+	i = GET_RPTR(ptr_reg);
+	for (; budget && i != GET_WPTR(ptr_reg); i = __RWPTR_NEXT(i, mask)) {
+		rx = &gmac->rxq_ring[i];
+
+		page = dma_to_page(toe->dev, rx->word2.buf_adr);
+		page_offs = rx->word2.buf_adr & ~PAGE_MASK;
+
+		netif_info(gmac, rx_status, dev,
+			"rxq[%u]: 0x%08x 0x%08x 0x%08x 0x%08x, page %p, offs 0x%04x\n",
+			i, rx->word0.bits32, rx->word1.bits32,
+			rx->word2.bits32, rx->word3.bits32, page, page_offs);
+
+		if (unlikely(!rx->word2.buf_adr)) {
+			netif_err(gmac, rx_status, dev,
+				"rxq[%u]: HW BUG: zero DMA descriptor\n", i);
+			if (skb)
+				skb = gmac_drop_napi_skb(gmac, toe);
+			continue;
+		}
+
+		if (rx->word3.bits32 & SOF_BIT) {
+			/* previous packet never saw its EOF: discard it */
+			if (skb)
+				gmac_drop_napi_skb(gmac, toe);
+
+			skb = gmac_skb_if_good_frame(gmac, rx);
+			if (!skb) {
+				put_page(page);
+				continue;
+			}
+			skb->dev = dev;
+
+			pkt_size = rx->word1.bits.byte_count;
+			frag_nr = -1;
+			last_page = NULL;
+			/* skip the bytes the DMA engine inserts before the frame */
+			page_offs += RX_INSERT_BYTES;
+		} else if (!skb) {
+			/* continuation of a packet that was already dropped */
+			put_page(page);
+			continue;
+		}
+
+		/* append page frag to skb */
+
+		if (rx->word3.bits32 & EOF_BIT)
+			/*
+			 * Last buffer of the frame: it holds only what the
+			 * earlier buffers (already counted in skb->len) did
+			 * not.  For a single-buffer frame skb->len is still
+			 * 0, so this is the whole packet.
+			 */
+			frag_size = pkt_size - skb->len;
+		else {
+			frag_size = 1 << toe->freeq_frag_order;
+			if (rx->word3.bits32 & SOF_BIT)
+				frag_size -= RX_INSERT_BYTES;
+		}
+
+		if (page == last_page && page_offs == next_offs) {
+			/* contiguous with the previous frag: extend it */
+			skb_shinfo(skb)->frags[frag_nr].size += frag_size;
+			put_page(page);
+		} else if (likely(++frag_nr != MAX_SKB_FRAGS))
+			skb_fill_page_desc(skb, frag_nr,
+				page, page_offs, frag_size);
+		else {
+			/* no frag slot left in the skb */
+			skb = gmac_drop_napi_skb(gmac, toe);
+			put_page(page);
+			continue;
+		}
+
+		last_page = page;
+		next_offs = page_offs + frag_size;
+
+		skb->len += frag_size;
+		skb->data_len += frag_size;
+		skb->truesize += frag_size;
+
+		/* receive */
+
+		if (rx->word3.bits32 & EOF_BIT) {
+			gmac_unmap_rx_frags(toe, skb);
+			napi_gro_frags(&gmac->napi);
+			skb = NULL;
+			--budget;
+		}
+	}
+
+	SET_RPTR(ptr_reg, i);
+
+	if (rx)
+		dev->last_rx = jiffies;
+
+	if (skb)
+		gmac_drop_napi_skb(gmac, toe);
+
+	return budget;
+}
+
+#define GMAC0_IRQ0_2 (GMAC0_TXDERR_INT_BIT|GMAC0_TXPERR_INT_BIT| \
+	GMAC0_RXDERR_INT_BIT|GMAC0_RXPERR_INT_BIT)
+#define GMAC0_IRQ0_6 (GMAC0_SWTQ00_EOF_INT_BIT|GMAC0_SWTQ00_FIN_INT_BIT)
+#define GMAC0_IRQ4_8 (GMAC0_MIB_INT_BIT|GMAC0_RX_OVERRUN_INT_BIT)
+
+/*
+ * Enable (enable != 0) or disable all interrupt sources of one port in
+ * the global interrupt enable registers 0, 1 and 4, under irq_lock.
+ */
+static void gmac_enable_irq(struct net_device *dev, int enable)
+{
+	struct toe_private *toe = netdev_to_toe(dev);
+	void __iomem *en0 = toe_reg(toe, GLOBAL_INTERRUPT_ENABLE_0_REG);
+	void __iomem *en1 = toe_reg(toe, GLOBAL_INTERRUPT_ENABLE_1_REG);
+	void __iomem *en4 = toe_reg(toe, GLOBAL_INTERRUPT_ENABLE_4_REG);
+	unsigned long irqflags;
+	unsigned bits, cur;
+
+	spin_lock_irqsave(&toe->irq_lock, irqflags);
+
+	/* DMA error and software TX queue interrupts */
+	bits = (GMAC0_IRQ0_2 << (dev->dev_id * 2)) |
+		(GMAC0_IRQ0_6 << (dev->dev_id * 6));
+	cur = __raw_readl(en0);
+	if (enable)
+		cur |= bits;
+	else
+		cur &= ~bits;
+	__raw_writel(cur, en0);
+
+	/* default RX queue interrupt */
+	bits = DEFAULT_Q0_INT_BIT << dev->dev_id;
+	cur = __raw_readl(en1);
+	if (enable)
+		cur |= bits;
+	else
+		cur &= ~bits;
+	__raw_writel(cur, en1);
+
+	/* MIB and RX overrun interrupts */
+	bits = GMAC0_IRQ4_8 << (dev->dev_id * 8);
+	cur = __raw_readl(en4);
+	if (enable)
+		cur |= bits;
+	else
+		cur &= ~bits;
+	__raw_writel(cur, en4);
+
+	spin_unlock_irqrestore(&toe->irq_lock, irqflags);
+}
+
+/*
+ * Enable or disable the TX-finished interrupt of one software TX
+ * queue.  Takes no lock itself; the callers in this file hold
+ * toe->irq_lock around it.
+ */
+static void __gmac_enable_txfin_irq(struct net_device *dev, int txq, int enable)
+{
+	struct toe_private *toe = netdev_to_toe(dev);
+	void __iomem *en0 = toe_reg(toe, GLOBAL_INTERRUPT_ENABLE_0_REG);
+	unsigned bit = GMAC0_SWTQ00_FIN_INT_BIT << (6 * dev->dev_id + txq);
+	unsigned cur = __raw_readl(en0);
+
+	if (enable)
+		cur |= bit;
+	else
+		cur &= ~bit;
+
+	__raw_writel(cur, en0);
+}
+
+/*
+ * Enable or disable only the default-RX-queue interrupt of this port
+ * (enable register 1), under toe->irq_lock.
+ */
+static void gmac_enable_rx_irq(struct net_device *dev, int enable)
+{
+	struct toe_private *toe = netdev_to_toe(dev);
+	unsigned long flags;
+	unsigned val, mask;
+
+	spin_lock_irqsave(&toe->irq_lock, flags);
+
+	mask = DEFAULT_Q0_INT_BIT << dev->dev_id;
+	val = __raw_readl(toe_reg(toe, GLOBAL_INTERRUPT_ENABLE_1_REG));
+	if (enable)
+		val |= mask;
+	else
+		val &= ~mask;
+	__raw_writel(val, toe_reg(toe, GLOBAL_INTERRUPT_ENABLE_1_REG));
+
+	spin_unlock_irqrestore(&toe->irq_lock, flags);
+}
+
+/*
+ * NAPI poll callback.
+ *
+ * gmac_rx() returns the part of the budget it did not use.  As long as at
+ * least one frame was received the poll stays scheduled and the amount of
+ * work done is returned.  Only a poll that received nothing completes NAPI
+ * and re-enables the RX interrupt.
+ */
+static int gmac_napi_poll(struct napi_struct *napi, int max_work)
+{
+	struct gmac_private *gmac = napi_to_gmac(napi);
+	unsigned work_left;
+
+	work_left = gmac_rx(napi->dev, max_work);
+
+	if (work_left != max_work) {
+		/* some frames were received in this round */
+		if (work_left) {
+			struct toe_private *toe = netdev_to_toe(napi->dev);
+			/* we've cleared the queue, ack rx interrupt;
+			 * on next poll the interrupt will be enabled
+			 * if the queue stays empty
+			 */
+			__raw_writel(DEFAULT_Q0_INT_BIT << napi->dev->dev_id,
+				toe_reg(toe, GLOBAL_INTERRUPT_STATUS_1_REG));
+		}
+		return max_work - work_left;
+	}
+
+	/* nothing received: leave polling mode and count the exit */
+	napi_complete(napi);
+	u64_stats_update_begin(&gmac->rx_stats_syncp);
+	++gmac->rx_napi_exits;
+	u64_stats_update_end(&gmac->rx_stats_syncp);
+	gmac_enable_rx_irq(napi->dev, 1);
+
+	return 0;
+}
+
+/*
+ * Log (at error level) a snapshot of the interrupt status/enable registers,
+ * the RX and TX DMA registers with their ring pointers, and the software
+ * and hardware free-queue pointers.  Also installed as the .ndo_tx_timeout
+ * handler; it only reports state and performs no recovery.
+ */
+static void gmac_dump_dma_state(struct net_device *dev)
+{
+	struct gmac_private *gmac = netdev_to_gmac(dev);
+	struct toe_private *toe = netdev_to_toe(dev);
+	void __iomem *ptr_reg;
+	unsigned reg[5];
+
+	/* Interrupt status */
+	reg[0] = __raw_readl(toe_reg(toe, GLOBAL_INTERRUPT_STATUS_0_REG));
+	reg[1] = __raw_readl(toe_reg(toe, GLOBAL_INTERRUPT_STATUS_1_REG));
+	reg[2] = __raw_readl(toe_reg(toe, GLOBAL_INTERRUPT_STATUS_2_REG));
+	reg[3] = __raw_readl(toe_reg(toe, GLOBAL_INTERRUPT_STATUS_3_REG));
+	reg[4] = __raw_readl(toe_reg(toe, GLOBAL_INTERRUPT_STATUS_4_REG));
+	netdev_err(dev, "IRQ status: 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x\n",
+		reg[0], reg[1], reg[2], reg[3], reg[4]);
+
+	/* Interrupt enable */
+	reg[0] = __raw_readl(toe_reg(toe, GLOBAL_INTERRUPT_ENABLE_0_REG));
+	reg[1] = __raw_readl(toe_reg(toe, GLOBAL_INTERRUPT_ENABLE_1_REG));
+	reg[2] = __raw_readl(toe_reg(toe, GLOBAL_INTERRUPT_ENABLE_2_REG));
+	reg[3] = __raw_readl(toe_reg(toe, GLOBAL_INTERRUPT_ENABLE_3_REG));
+	reg[4] = __raw_readl(toe_reg(toe, GLOBAL_INTERRUPT_ENABLE_4_REG));
+	netdev_err(dev, "IRQ enable: 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x\n",
+		reg[0], reg[1], reg[2], reg[3], reg[4]);
+
+	/* RX DMA status */
+	reg[0] = __raw_readl(gmac_dma_reg(dev, GMAC_DMA_RX_FIRST_DESC_REG));
+	reg[1] = __raw_readl(gmac_dma_reg(dev, GMAC_DMA_RX_CURR_DESC_REG));
+	reg[2] = GET_RPTR(gmac->rxq_rwptr);
+	reg[3] = GET_WPTR(gmac->rxq_rwptr);
+	netdev_err(dev, "RX DMA regs: 0x%08x 0x%08x, ptr: %u %u\n",
+		reg[0], reg[1], reg[2], reg[3]);
+
+	/* descriptor words currently latched by the RX DMA engine */
+	reg[0] = __raw_readl(gmac_dma_reg(dev, GMAC_DMA_RX_DESC_WORD0_REG));
+	reg[1] = __raw_readl(gmac_dma_reg(dev, GMAC_DMA_RX_DESC_WORD1_REG));
+	reg[2] = __raw_readl(gmac_dma_reg(dev, GMAC_DMA_RX_DESC_WORD2_REG));
+	reg[3] = __raw_readl(gmac_dma_reg(dev, GMAC_DMA_RX_DESC_WORD3_REG));
+	netdev_err(dev, "RX DMA descriptor: 0x%08x 0x%08x 0x%08x 0x%08x\n",
+		reg[0], reg[1], reg[2], reg[3]);
+
+	/* TX DMA status */
+	ptr_reg = gmac_dma_reg(dev, GMAC_SW_TX_QUEUE0_PTR_REG);
+
+	reg[0] = __raw_readl(gmac_dma_reg(dev, GMAC_DMA_TX_FIRST_DESC_REG));
+	reg[1] = __raw_readl(gmac_dma_reg(dev, GMAC_DMA_TX_CURR_DESC_REG));
+	reg[2] = GET_RPTR(ptr_reg);
+	reg[3] = GET_WPTR(ptr_reg);
+	netdev_err(dev, "TX DMA regs: 0x%08x 0x%08x, ptr: %u %u\n",
+		reg[0], reg[1], reg[2], reg[3]);
+
+	/* descriptor words currently latched by the TX DMA engine */
+	reg[0] = __raw_readl(gmac_dma_reg(dev, GMAC_DMA_TX_DESC_WORD0_REG));
+	reg[1] = __raw_readl(gmac_dma_reg(dev, GMAC_DMA_TX_DESC_WORD1_REG));
+	reg[2] = __raw_readl(gmac_dma_reg(dev, GMAC_DMA_TX_DESC_WORD2_REG));
+	reg[3] = __raw_readl(gmac_dma_reg(dev, GMAC_DMA_TX_DESC_WORD3_REG));
+	netdev_err(dev, "TX DMA descriptor: 0x%08x 0x%08x 0x%08x 0x%08x\n",
+		reg[0], reg[1], reg[2], reg[3]);
+
+	/* FREE queues status */
+	ptr_reg = toe_reg(toe, GLOBAL_SWFQ_RWPTR_REG);
+
+	reg[0] = GET_RPTR(ptr_reg);
+	reg[1] = GET_WPTR(ptr_reg);
+
+	ptr_reg = toe_reg(toe, GLOBAL_HWFQ_RWPTR_REG);
+
+	reg[2] = GET_RPTR(ptr_reg);
+	reg[3] = GET_WPTR(ptr_reg);
+	netdev_err(dev, "FQ SW ptr: %u %u, HW ptr: %u %u\n",
+		reg[0], reg[1], reg[2], reg[3]);
+}
+
+/*
+ * Fold the hardware MIB counters into gmac->hw_stats[] and the derived
+ * netdev statistics, then acknowledge this port's MIB interrupt.
+ * Runs under toe->irq_lock and inside an ir_stats_syncp write section.
+ *
+ * NOTE(review): the counters are added, not assigned, so they are presumably
+ * clear-on-read - confirm against the datasheet.
+ */
+static void gmac_update_hw_stats(struct net_device *dev)
+{
+	struct gmac_private *gmac = netdev_to_gmac(dev);
+	struct toe_private *toe = netdev_to_toe(dev);
+	unsigned long flags;
+	unsigned int rx_discards, rx_mcast, rx_bcast;
+
+	spin_lock_irqsave(&toe->irq_lock, flags);
+	u64_stats_update_begin(&gmac->ir_stats_syncp);
+
+	/* keep the values that also feed the generic netdev counters */
+	gmac->hw_stats[0] += rx_discards = __raw_readl(gmac_ctl_reg(dev, GMAC_IN_DISCARDS));
+	gmac->hw_stats[1] += __raw_readl(gmac_ctl_reg(dev, GMAC_IN_ERRORS));
+	gmac->hw_stats[2] += rx_mcast = __raw_readl(gmac_ctl_reg(dev, GMAC_IN_MCAST));
+	gmac->hw_stats[3] += rx_bcast = __raw_readl(gmac_ctl_reg(dev, GMAC_IN_BCAST));
+	gmac->hw_stats[4] += __raw_readl(gmac_ctl_reg(dev, GMAC_IN_MAC1));
+	gmac->hw_stats[5] += __raw_readl(gmac_ctl_reg(dev, GMAC_IN_MAC2));
+
+	gmac->stats.rx_missed_errors += rx_discards;
+	gmac->stats.multicast += rx_mcast;
+	/* NOTE(review): broadcasts are counted as multicast too - confirm intent */
+	gmac->stats.multicast += rx_bcast;
+
+	/* ack the MIB interrupt of this port (status bank 4) */
+	__raw_writel(GMAC0_MIB_INT_BIT << (dev->dev_id * 8),
+		toe_reg(toe, GLOBAL_INTERRUPT_STATUS_4_REG));
+
+	u64_stats_update_end(&gmac->ir_stats_syncp);
+	spin_unlock_irqrestore(&toe->irq_lock, flags);
+}
+
+/*
+ * Return the interrupts of bank @i that are both pending and enabled
+ * (status register ANDed with the matching enable register).  A non-zero
+ * result is also reported through the "intr" message level.
+ */
+static inline unsigned gmac_get_intr_flags(struct net_device *dev, int i)
+{
+	struct gmac_private *gmac = netdev_to_gmac(dev);
+	struct toe_private *toe = netdev_to_toe(dev);
+	unsigned bank_offs, pending;
+
+	/* the banks are laid out at a constant register stride */
+	bank_offs = i * (GLOBAL_INTERRUPT_STATUS_1_REG - GLOBAL_INTERRUPT_STATUS_0_REG);
+
+	pending = __raw_readl(toe_reg(toe, GLOBAL_INTERRUPT_STATUS_0_REG + bank_offs));
+	pending &= __raw_readl(toe_reg(toe, GLOBAL_INTERRUPT_ENABLE_0_REG + bank_offs));
+
+	if (!pending)
+		return 0;
+
+	netif_info(gmac, intr, dev, "irq: val%d&en = 0x%08x\n", i, pending);
+	return pending;
+}
+
+/*
+ * Per-port hard interrupt handler (registered with IRQF_SHARED).
+ *
+ * Looks at the pending-and-enabled bits of banks 0, 1 and 4:
+ *  - bank 0: DMA/protocol errors are treated as fatal (state is dumped and
+ *    all interrupts of the port are disabled); TX queue events are passed
+ *    to gmac_tx_interrupt();
+ *  - bank 1: default RX queue activity masks the RX interrupt and schedules
+ *    NAPI;
+ *  - bank 4: MIB counter events update the statistics, RX overrun is
+ *    acknowledged and counted as rx_fifo_errors.
+ *
+ * Returns IRQ_HANDLED if any bank had a pending bit, IRQ_NONE otherwise.
+ */
+static irqreturn_t gmac_interrupt(int irq, void *data)
+{
+	struct net_device *dev = data;
+	struct gmac_private *gmac = netdev_to_gmac(dev);
+	struct toe_private *toe = netdev_to_toe(dev);
+	unsigned val, orr = 0;
+
+
+	orr |= val = gmac_get_intr_flags(dev, 0);
+
+	if (unlikely(val & (GMAC0_IRQ0_2 << (dev->dev_id * 2)))) {
+		/* oh, crap. */
+		netif_err(gmac, intr, dev, "hw failure/sw bug\n");
+		gmac_dump_dma_state(dev);
+
+		/* don't know how to recover, just reduce losses */
+		gmac_enable_irq(dev, 0);
+		return IRQ_HANDLED;
+	}
+
+	if (val & (GMAC0_IRQ0_6 << (dev->dev_id * 6)))
+		gmac_tx_interrupt(dev, 0);
+
+
+	orr |= val = gmac_get_intr_flags(dev, 1);
+
+	if (val & (DEFAULT_Q0_INT_BIT << dev->dev_id)) {
+		/* keep RX irq masked while NAPI polls the queue */
+		gmac_enable_rx_irq(dev, 0);
+		napi_schedule(&gmac->napi);
+	}
+
+
+	orr |= val = gmac_get_intr_flags(dev, 4);
+
+	if (unlikely(val & (GMAC0_MIB_INT_BIT << (dev->dev_id * 8))))
+		gmac_update_hw_stats(dev);
+
+	if (unlikely(val & (GMAC0_RX_OVERRUN_INT_BIT << (dev->dev_id * 8)))) {
+		/*
+		 * Ack the overrun bit that was just tested.  The previous
+		 * code wrote GMAC0_RXDERR_INT_BIT (a bank 0 bit) here, which
+		 * left the overrun condition pending in bank 4.
+		 */
+		__raw_writel(GMAC0_RX_OVERRUN_INT_BIT << (dev->dev_id * 8),
+			toe_reg(toe, GLOBAL_INTERRUPT_STATUS_4_REG));
+
+		spin_lock(&toe->irq_lock);
+		u64_stats_update_begin(&gmac->ir_stats_syncp);
+		++gmac->stats.rx_fifo_errors;
+		u64_stats_update_end(&gmac->ir_stats_syncp);
+		spin_unlock(&toe->irq_lock);
+	}
+
+	return orr ? IRQ_HANDLED : IRQ_NONE;
+}
+
+/*
+ * Bring the data path up: allocate the RX and TX queues, enable NAPI, start
+ * the hardware, unmask the port interrupts, enable TX/RX and start the
+ * netdev TX queues.  Clears gmac->in_reset on success.
+ *
+ * Returns 0 or the error from queue setup; the RX queue is released again
+ * when TX queue setup fails.
+ */
+static int gmac_open_running(struct net_device *dev)
+{
+	struct gmac_private *gmac = netdev_to_gmac(dev);
+	int err;
+
+	err = gmac_setup_rxq(dev);
+	if (unlikely(err))
+		return err;
+
+	err = gmac_setup_txqs(dev);
+	if (unlikely(err)) {
+		gmac_cleanup_rxq(dev);
+		return err;
+	}
+
+	napi_enable(&gmac->napi);
+	gmac_hw_start(dev);
+	gmac_enable_irq(dev, 1);
+	gmac_enable_tx_rx(dev);
+	netif_tx_start_all_queues(dev);
+
+	gmac->in_reset = 0;
+
+	return 0;
+}
+
+/*
+ * Tear the data path down in the reverse order of gmac_open_running():
+ * stop the netdev TX queues, disable TX/RX and the hardware, disable NAPI,
+ * mask the port interrupts and release the queues.  Sets gmac->in_reset.
+ */
+static void gmac_stop_running(struct net_device *dev)
+{
+	struct gmac_private *gmac = netdev_to_gmac(dev);
+
+	netif_tx_stop_all_queues(dev);
+
+	gmac_disable_tx_rx(dev);
+	gmac_hw_stop(dev);
+
+	napi_disable(&gmac->napi);
+
+	gmac_enable_irq(dev, 0);
+
+	gmac_cleanup_txqs(dev);
+	gmac_cleanup_rxq(dev);
+
+	gmac->in_reset = 1;
+}
+
+/*
+ * .ndo_open: attach the PHY if that has not happened yet, request the port
+ * interrupt, start the PHY and bring up the data path.
+ *
+ * Returns 0 on success or a negative errno; on a data path failure the PHY
+ * is stopped and the interrupt released again.
+ */
+static int gmac_open(struct net_device *dev)
+{
+	struct gmac_private *gmac = netdev_to_gmac(dev);
+	int err;
+
+	/* PHY setup may have been deferred from probe time */
+	if (!dev->phydev) {
+		err = gmac_setup_phy(dev);
+		if (err) {
+			netif_err(gmac, ifup, dev,
+				"PHY init failed: %d\n", err);
+			return err;
+		}
+	}
+
+	err = request_irq(dev->irq, gmac_interrupt,
+		IRQF_SHARED, dev->name, dev);
+	if (unlikely(err))
+		return err;
+
+	netif_carrier_off(dev);
+	phy_start(dev->phydev);
+
+	err = gmac_open_running(dev);
+	if (likely(!err))
+		return 0;
+
+	/* unwind in reverse order */
+	phy_stop(dev->phydev);
+	free_irq(dev->irq, dev);
+	return err;
+}
+
+/*
+ * .ndo_stop: shut the data path down (unless it is already down because
+ * the port is in reset), stop the PHY, release the interrupt and take a
+ * final reading of the hardware counters.  Always returns 0.
+ */
+static int gmac_stop(struct net_device *dev)
+{
+	struct gmac_private *gmac = netdev_to_gmac(dev);
+
+	if (!gmac->in_reset)
+		gmac_stop_running(dev);
+
+	phy_stop(dev->phydev);
+	free_irq(dev->irq, dev);
+
+	gmac_update_hw_stats(dev);
+
+	return 0;
+}
+
+/*
+ * Program the 64-bit multicast hash filter from the device's multicast
+ * list.  Each address selects one bit: the low 6 bits of the inverted
+ * big-endian CRC32 of the address.
+ *
+ * Does nothing when IFF_ALLMULTI is set; gmac_set_rx_mode() opens the
+ * filter completely in that case.
+ */
+static void gmac_set_multicast_list(struct net_device *dev)
+{
+	struct netdev_hw_addr *ha;
+	__u32 mc_filter[2];
+	unsigned bit_nr;
+
+	if (dev->flags & IFF_ALLMULTI)
+		return;
+
+	mc_filter[1] = mc_filter[0] = 0;
+	netdev_for_each_mc_addr(ha, dev) {
+		bit_nr = ~crc32_be(~0, ha->addr, ETH_ALEN) & 0x3f;
+		/* unsigned constant: bit 31 must not overflow a signed int */
+		mc_filter[bit_nr >> 5] |= 1U << (bit_nr & 0x1f);
+	}
+
+	__raw_writel(mc_filter[0], gmac_ctl_reg(dev, GMAC_MCAST_FIL0));
+	__raw_writel(mc_filter[1], gmac_ctl_reg(dev, GMAC_MCAST_FIL1));
+}
+
+/*
+ * .ndo_set_rx_mode: program the RX filter register.  Broadcast, multicast
+ * and unicast reception are always enabled; IFF_PROMISC additionally
+ * enables the promiscuous and error bits, IFF_ALLMULTI opens the multicast
+ * hash completely, otherwise the hash is built from the multicast list.
+ */
+static void gmac_set_rx_mode(struct net_device *dev)
+{
+	GMAC_RX_FLTR_T filter = { .bits = {
+		.broadcast = 1,
+		.multicast = 1,
+		.unicast = 1,
+	} };
+
+	if (dev->flags & IFF_PROMISC) {
+		filter.bits.error = 1;
+		filter.bits.promiscuous = 1;
+	} else if (dev->flags & IFF_ALLMULTI) {
+		/* all 64 hash bits set: accept every multicast address */
+		__raw_writel(~0, gmac_ctl_reg(dev, GMAC_MCAST_FIL0));
+		__raw_writel(~0, gmac_ctl_reg(dev, GMAC_MCAST_FIL1));
+	} else {
+		gmac_set_multicast_list(dev);
+	}
+
+	__raw_writel(filter.bits32, gmac_ctl_reg(dev, GMAC_RX_FLTR));
+}
+
+/*
+ * Write dev->dev_addr into the three station address registers.  The six
+ * address bytes are packed into a zero-padded 12-byte little-endian buffer
+ * and written out as three 32-bit words.
+ */
+static void __gmac_set_mac_address(struct net_device *dev)
+{
+	__le32 addr[3];
+
+	/* bytes past ETH_ALEN stay zero */
+	memset(addr, 0, sizeof(addr));
+	memcpy(addr, dev->dev_addr, ETH_ALEN);
+
+	__raw_writel(le32_to_cpu(addr[0]), gmac_ctl_reg(dev, GMAC_STA_ADD0));
+	__raw_writel(le32_to_cpu(addr[1]), gmac_ctl_reg(dev, GMAC_STA_ADD1));
+	__raw_writel(le32_to_cpu(addr[2]), gmac_ctl_reg(dev, GMAC_STA_ADD2));
+}
+
+/*
+ * .ndo_set_mac_address: validate the requested address, store it in
+ * dev->dev_addr and program it into the hardware.
+ *
+ * @addr points to a struct sockaddr carrying the new address in sa_data.
+ * Returns 0 on success or -EADDRNOTAVAIL if the address is not a valid
+ * unicast Ethernet address (all-zero, multicast or broadcast).
+ */
+static int gmac_set_mac_address(struct net_device *dev, void *addr)
+{
+	struct sockaddr *sa = addr;
+
+	if (!is_valid_ether_addr(sa->sa_data))
+		return -EADDRNOTAVAIL;
+
+	memcpy(dev->dev_addr, sa->sa_data, ETH_ALEN);
+	__gmac_set_mac_address(dev);
+
+	return 0;
+}
+
+/*
+ * Read every hardware MIB counter once and discard the value.
+ * NOTE(review): presumably the counters are clear-on-read, which is what
+ * makes this a "clear" - confirm against the datasheet.
+ */
+static void gmac_clear_hw_stats(struct net_device *dev)
+{
+	__raw_readl(gmac_ctl_reg(dev, GMAC_IN_DISCARDS));
+	__raw_readl(gmac_ctl_reg(dev, GMAC_IN_ERRORS));
+	__raw_readl(gmac_ctl_reg(dev, GMAC_IN_MCAST));
+	__raw_readl(gmac_ctl_reg(dev, GMAC_IN_BCAST));
+	__raw_readl(gmac_ctl_reg(dev, GMAC_IN_MAC1));
+	__raw_readl(gmac_ctl_reg(dev, GMAC_IN_MAC2));
+}
+
+/*
+ * .ndo_get_stats64: refresh the hardware counters, fold the per-queue TX
+ * statistics into @storage and copy the RX and interrupt-side counters
+ * using the u64_stats seqcount retry loops.  rx_missed_errors is also
+ * added to rx_dropped.  Returns @storage.
+ */
+static struct rtnl_link_stats64 *gmac_get_stats64(struct net_device *dev,
+	struct rtnl_link_stats64 *storage)
+{
+	struct gmac_private *gmac = netdev_to_gmac(dev);
+	unsigned int start;
+
+	gmac_update_hw_stats(dev);
+	dev_txq_stats_fold(dev, storage);
+
+	/* racing with RX NAPI */
+	do {
+		start = u64_stats_fetch_begin(&gmac->rx_stats_syncp);
+
+		storage->rx_packets = gmac->stats.rx_packets;
+		storage->rx_bytes = gmac->stats.rx_bytes;
+		storage->rx_errors = gmac->stats.rx_errors;
+		storage->rx_dropped = gmac->stats.rx_dropped;
+
+		storage->rx_length_errors = gmac->stats.rx_length_errors;
+		storage->rx_over_errors = gmac->stats.rx_over_errors;
+		storage->rx_crc_errors = gmac->stats.rx_crc_errors;
+		storage->rx_frame_errors = gmac->stats.rx_frame_errors;
+
+	} while (u64_stats_fetch_retry(&gmac->rx_stats_syncp, start));
+
+	/* racing with MIB and TX completion interrupts */
+	do {
+		start = u64_stats_fetch_begin(&gmac->ir_stats_syncp);
+
+		storage->tx_errors = gmac->stats.tx_errors;
+
+		storage->multicast = gmac->stats.multicast;
+		storage->rx_missed_errors = gmac->stats.rx_missed_errors;
+		storage->rx_fifo_errors = gmac->stats.rx_fifo_errors;
+
+	} while (u64_stats_fetch_retry(&gmac->ir_stats_syncp, start));
+
+	/* frames the hardware discarded count as dropped as well */
+	storage->rx_dropped += storage->rx_missed_errors;
+
+	return storage;
+}
+
+/*
+ * .ndo_change_mtu: translate @new_mtu into a maximum-frame-length setting
+ * and program it into CONFIG0 while TX/RX are briefly disabled.
+ *
+ * Returns 0, or -EINVAL when gmac_pick_rx_max_len() rejects the MTU
+ * (negative result).
+ */
+static int gmac_change_mtu(struct net_device *dev, int new_mtu)
+{
+	int max_len = gmac_pick_rx_max_len(new_mtu);
+
+	if (max_len < 0)
+		return -EINVAL;
+
+	gmac_disable_tx_rx(dev);
+
+	dev->mtu = new_mtu;
+	gmac_update_config0_reg(dev,
+		max_len << CONFIG0_MAXLEN_SHIFT,
+		CONFIG0_MAXLEN_MASK);
+
+	gmac_enable_tx_rx(dev);
+
+	return 0;
+}
+
+/* ethtool: report 1 if RX checksum offload is enabled in CONFIG0, else 0 */
+static u32 gmac_get_rx_csum(struct net_device *dev)
+{
+	u32 config0 = __raw_readl(gmac_ctl_reg(dev, GMAC_CONFIG0));
+
+	if (config0 & CONFIG0_RX_CHKSUM)
+		return 1;
+
+	return 0;
+}
+
+/* ethtool: switch RX checksum offload in CONFIG0 on or off; returns 0 */
+static int gmac_set_rx_csum(struct net_device *dev, u32 enable)
+{
+	if (enable)
+		gmac_update_config0_reg(dev,
+			CONFIG0_RX_CHKSUM, CONFIG0_RX_CHKSUM);
+	else
+		gmac_update_config0_reg(dev, 0, CONFIG0_RX_CHKSUM);
+
+	return 0;
+}
+
+/*
+ * ethtool: set or clear the whole NETIF_TSO_FEATURES group in
+ * dev->features, depending on whether @data is non-zero.  Returns 0.
+ */
+static int gmac_set_tso(struct net_device *dev, u32 data)
+{
+	if (!data) {
+		dev->features &= ~NETIF_TSO_FEATURES;
+		return 0;
+	}
+
+	dev->features |= NETIF_TSO_FEATURES;
+	return 0;
+}
+
+/* ethtool: number of strings in @sset; only ETH_SS_STATS is populated */
+static int gmac_get_sset_count(struct net_device *dev, int sset)
+{
+	if (sset != ETH_SS_STATS)
+		return 0;
+
+	return GMAC_STATS_NUM;
+}
+
+/*
+ * ethtool: copy the statistics name table into @data.  Any string set
+ * other than ETH_SS_STATS leaves @data untouched.
+ */
+static void gmac_get_strings(struct net_device *dev, u32 stringset, u8 *data)
+{
+	if (stringset == ETH_SS_STATS)
+		memcpy(data, gmac_stats_strings, sizeof(gmac_stats_strings));
+}
+
+/*
+ * ethtool: fill @values with the driver statistics, in this order:
+ * hardware MIB counters, RX status counters, RX checksum counters, the
+ * NAPI exit count, per-fragment-count TX counters, the number of
+ * linearized TX frames and the number of hardware-checksummed TX frames.
+ *
+ * Each group is read inside its own u64_stats retry loop.  Every loop
+ * restarts from the saved position in @values and writes through the
+ * cursor @p, so a retry overwrites the same slots instead of appending.
+ */
+static void gmac_get_ethtool_stats(struct net_device *dev,
+	struct ethtool_stats *estats, u64 *values)
+{
+	struct gmac_private *gmac = netdev_to_gmac(dev);
+	unsigned int start;
+	u64 *p;
+	int i;
+
+	gmac_update_hw_stats(dev);
+
+	/* racing with MIB interrupt */
+	do {
+		p = values;
+		start = u64_stats_fetch_begin(&gmac->ir_stats_syncp);
+
+		for (i = 0; i < RX_STATS_NUM; ++i)
+			*p++ = gmac->hw_stats[i];
+
+	} while (u64_stats_fetch_retry(&gmac->ir_stats_syncp, start));
+	values = p;
+
+	/* racing with RX NAPI */
+	do {
+		p = values;
+		start = u64_stats_fetch_begin(&gmac->rx_stats_syncp);
+
+		for (i = 0; i < RX_STATUS_NUM; ++i)
+			*p++ = gmac->rx_stats[i];
+		for (i = 0; i < RX_CHKSUM_NUM; ++i)
+			*p++ = gmac->rx_csum_stats[i];
+		*p++ = gmac->rx_napi_exits;
+
+	} while (u64_stats_fetch_retry(&gmac->rx_stats_syncp, start));
+	values = p;
+
+	/* racing with TX start_xmit */
+	do {
+		p = values;
+		start = u64_stats_fetch_begin(&gmac->tx_stats_syncp);
+
+		/*
+		 * Write through p, not values: advancing values here would
+		 * make a retry continue past the slots already written.
+		 */
+		for (i = 0; i < TX_MAX_FRAGS; ++i)
+			*p++ = gmac->tx_frag_stats[i];
+		*p++ = gmac->tx_frags_linearized;
+		*p++ = gmac->tx_hw_csummed;
+
+	} while (u64_stats_fetch_retry(&gmac->tx_stats_syncp, start));
+}
+
+/* ethtool: read link settings from the PHY; -ENXIO if none is attached */
+static int gmac_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	if (dev->phydev)
+		return phy_ethtool_gset(dev->phydev, cmd);
+
+	return -ENXIO;
+}
+
+/* ethtool: apply link settings to the PHY; -ENXIO if none is attached */
+static int gmac_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	if (dev->phydev)
+		return phy_ethtool_sset(dev->phydev, cmd);
+
+	return -ENXIO;
+}
+
+/* ethtool: restart autonegotiation on the PHY; -ENXIO if none is attached */
+static int gmac_nway_reset(struct net_device *dev)
+{
+	if (dev->phydev)
+		return phy_start_aneg(dev->phydev);
+
+	return -ENXIO;
+}
+
+/*
+ * ethtool: report the RX/TX flow-control enable bits currently set in
+ * CONFIG0.  Pause autonegotiation is always reported as enabled.
+ */
+static void gmac_get_pauseparam(struct net_device *dev,
+	struct ethtool_pauseparam *pparam)
+{
+	GMAC_CONFIG0_T config0;
+
+	config0.bits32 = __raw_readl(gmac_ctl_reg(dev, GMAC_CONFIG0));
+
+	pparam->rx_pause = config0.bits.rx_fc_en;
+	pparam->tx_pause = config0.bits.tx_fc_en;
+	pparam->autoneg = true;
+}
+
+/*
+ * ethtool: report ring sizes.  RX and TX rings hold 1 << order entries
+ * with a maximum order of 15; mini and jumbo rings are not supported.
+ *
+ * The sizes come from driver state only, so no register is read here (the
+ * former CONFIG0 read was never used).
+ */
+static void gmac_get_ringparam(struct net_device *dev,
+	struct ethtool_ringparam *rp)
+{
+	struct gmac_private *gmac = netdev_to_gmac(dev);
+
+	rp->rx_max_pending = 1 << 15;
+	rp->rx_mini_max_pending = 0;
+	rp->rx_jumbo_max_pending = 0;
+	rp->tx_max_pending = 1 << 15;
+
+	rp->rx_pending = 1 << gmac->rxq_order;
+	rp->rx_mini_pending = 0;
+	rp->rx_jumbo_pending = 0;
+	rp->tx_pending = 1 << gmac->txq_order;
+}
+
+/* defined further below; needed here to resize the shared free queue */
+static int toe_resize_freeq(struct toe_private *toe, int changing_dev_id);
+
+/*
+ * ethtool: change the RX/TX ring sizes.  Only allowed while the interface
+ * is down (-EBUSY otherwise).  Requested sizes are rounded up to the next
+ * power of two and capped at order 15; a zero request leaves that ring
+ * unchanged.  Changing the RX size also resizes the shared free queue and
+ * returns its result.
+ */
+static int gmac_set_ringparam(struct net_device *dev,
+	struct ethtool_ringparam *rp)
+{
+	struct gmac_private *gmac = netdev_to_gmac(dev);
+	struct toe_private *toe = netdev_to_toe(dev);
+	int err = 0;
+
+	if (netif_running(dev))
+		return -EBUSY;
+
+	if (rp->rx_pending) {
+		/* ilog2(n - 1) + 1 rounds n up to a power-of-two order */
+		gmac->rxq_order = min(15, ilog2(rp->rx_pending - 1) + 1);
+		err = toe_resize_freeq(toe, dev->dev_id);
+	}
+
+	if (rp->tx_pending)
+		gmac->txq_order = min(15, ilog2(rp->tx_pending - 1) + 1);
+
+	return err;
+}
+
+/*
+ * ethtool: report interrupt coalescing.  RX is fixed at one frame per
+ * interrupt; TX reports the configured irq_every_tx_packets.  Returns 0.
+ */
+static int gmac_get_coalesce(struct net_device *dev,
+	struct ethtool_coalesce *ecmd)
+{
+	ecmd->tx_max_coalesced_frames =
+		netdev_to_gmac(dev)->irq_every_tx_packets;
+	ecmd->rx_max_coalesced_frames = 1;
+
+	return 0;
+}
+
+/*
+ * ethtool: set how many TX packets are sent per completion interrupt.
+ * The value must be at least 1 and smaller than the TX ring size
+ * (1 << txq_order); otherwise -EINVAL is returned.
+ */
+static int gmac_set_coalesce(struct net_device *dev,
+	struct ethtool_coalesce *ecmd)
+{
+	struct gmac_private *gmac = netdev_to_gmac(dev);
+
+	if (ecmd->tx_max_coalesced_frames < 1 ||
+	    ecmd->tx_max_coalesced_frames >= 1 << gmac->txq_order)
+		return -EINVAL;
+
+	gmac->irq_every_tx_packets = ecmd->tx_max_coalesced_frames;
+
+	return 0;
+}
+
+/* ethtool: return the driver's current message-level bitmap */
+static u32 gmac_get_msglevel(struct net_device *dev)
+{
+	return netdev_to_gmac(dev)->msg_enable;
+}
+
+/* ethtool: replace the driver's message-level bitmap with @level */
+static void gmac_set_msglevel(struct net_device *dev, u32 level)
+{
+	netdev_to_gmac(dev)->msg_enable = level;
+}
+
+/*
+ * ethtool: fill in the driver name, version string and bus info.  The bus
+ * info is "0" for the first port and "1" for any non-zero dev_id.
+ */
+static void gmac_get_drvinfo(struct net_device *dev,
+	struct ethtool_drvinfo *info)
+{
+	strcpy(info->driver, "sl351x");
+	strcpy(info->version, "mq-k");
+
+	if (dev->dev_id)
+		strcpy(info->bus_info, "1");
+	else
+		strcpy(info->bus_info, "0");
+}
+
+/*
+ * net_device callbacks of one GMAC port.  The TX timeout handler only
+ * dumps the DMA state; it does not attempt any recovery.
+ */
+static const struct net_device_ops gmac_351x_ops = {
+	.ndo_init		= gmac_init,
+	.ndo_uninit		= gmac_uninit,
+	.ndo_open		= gmac_open,
+	.ndo_stop		= gmac_stop,
+	.ndo_start_xmit		= gmac_start_xmit,
+	.ndo_tx_timeout		= gmac_dump_dma_state,
+	.ndo_set_multicast_list	= gmac_set_multicast_list,
+	.ndo_set_rx_mode	= gmac_set_rx_mode,
+	.ndo_set_mac_address	= gmac_set_mac_address,
+	.ndo_get_stats64	= gmac_get_stats64,
+	.ndo_change_mtu		= gmac_change_mtu,
+};
+
+/*
+ * ethtool callbacks of one GMAC port.  Generic ethtool_op_* helpers are
+ * used where the driver keeps no state of its own.
+ */
+static const struct ethtool_ops gmac_351x_ethtool_ops = {
+	.get_rx_csum	= gmac_get_rx_csum,
+	.set_rx_csum	= gmac_set_rx_csum,
+	.get_tx_csum	= ethtool_op_get_tx_csum,
+	.set_tx_csum	= ethtool_op_set_tx_ipv6_csum,
+	.get_sg		= ethtool_op_get_sg,
+	.set_sg		= ethtool_op_set_sg,
+	.get_tso	= ethtool_op_get_tso,
+	.set_tso	= gmac_set_tso,
+	.get_sset_count	= gmac_get_sset_count,
+	.get_strings	= gmac_get_strings,
+	.get_ethtool_stats = gmac_get_ethtool_stats,
+	.get_settings	= gmac_get_settings,
+	.set_settings	= gmac_set_settings,
+	.get_link	= ethtool_op_get_link,
+	.nway_reset	= gmac_nway_reset,
+	.get_pauseparam	= gmac_get_pauseparam,
+	.get_ringparam	= gmac_get_ringparam,
+	.set_ringparam	= gmac_set_ringparam,
+	.get_coalesce	= gmac_get_coalesce,
+	.set_coalesce	= gmac_set_coalesce,
+	.get_msglevel	= gmac_get_msglevel,
+	.set_msglevel	= gmac_set_msglevel,
+	.get_drvinfo	= gmac_get_drvinfo,
+};
+
+/*
+ * Allocate, initialize and register the net_device for port @num.
+ *
+ * A port without a bus id in the platform data is silently skipped
+ * (returns 0 with toe->netdev[num] left unset).  The MAC address is taken
+ * from the station address registers when they hold a valid address,
+ * otherwise a random one is generated.  A PHY setup failure is only a
+ * warning here; gmac_open() retries it.
+ *
+ * Returns 0 on success or a negative errno; on a registration failure the
+ * net_device is freed and toe->netdev[num] reset to NULL.
+ */
+static int __devinit gmac_init_netdev(struct toe_private *toe, int num,
+	struct platform_device *pdev)
+{
+	struct gemini_gmac_platform_data *pdata = pdev->dev.platform_data;
+	struct gmac_private *gmac;
+	struct net_device *dev;
+	__le32 addr[3];
+	int irq, err;
+
+	if (!pdata->bus_id[num])
+		return 0;
+
+	irq = platform_get_irq(pdev, num);
+	if (irq < 0) {
+		dev_err(toe->dev, "No IRQ for ethernet device #%d\n", num);
+		return irq;
+	}
+
+	dev = alloc_etherdev_mq(sizeof(*gmac), TX_QUEUE_NUM);
+	if (!dev) {
+		dev_err(toe->dev, "Can't allocate ethernet device #%d\n", num);
+		return -ENOMEM;
+	}
+
+	gmac = netdev_priv(dev);
+	dev->ml_priv = toe;
+	SET_NETDEV_DEV(dev, toe->dev);
+
+	toe->netdev[num] = dev;
+	dev->dev_id = num;
+
+	/* per-port register windows inside the shared TOE mapping */
+	gmac->dma_iomem = toe->iomem + TOE_GMAC_DMA_BASE(num);
+	dev->base_addr = (unsigned long)(toe->iomem + TOE_GMAC_BASE(num));
+	dev->irq = irq;
+
+	dev->netdev_ops = &gmac_351x_ops;
+	SET_ETHTOOL_OPS(dev, &gmac_351x_ethtool_ops);
+
+	spin_lock_init(&gmac->config_lock);
+	gmac->msg_enable = debug_level;
+	gmac_clear_hw_stats(dev);
+
+	/* select working offloads by default */
+	/* (SG will be disabled when HW csum is disabled) */
+	/* TSO_ECN untested, TX csum unreliable, TX DMA unreliable */
+	dev->features |= NETIF_F_SG | NETIF_F_GSO | NETIF_F_GRO;
+
+	netif_napi_add(dev, &gmac->napi, gmac_napi_poll, DEFAULT_NAPI_WEIGHT);
+
+	/* dump MAC address regs; CPU is LE anyway */
+	addr[0] = cpu_to_le32(__raw_readl(gmac_ctl_reg(dev, GMAC_STA_ADD0)));
+	addr[1] = cpu_to_le32(__raw_readl(gmac_ctl_reg(dev, GMAC_STA_ADD1)));
+	addr[2] = cpu_to_le32(__raw_readl(gmac_ctl_reg(dev, GMAC_STA_ADD2)));
+	dev_dbg(&pdev->dev, "port %d address regs: %pM %pM\n",
+		num, (char *)addr, (char *)addr + ETH_ALEN);
+
+	/* keep a valid address left in the registers, else pick a random one */
+	if (is_valid_ether_addr((void *)addr))
+		memcpy(dev->dev_addr, addr, ETH_ALEN);
+	else
+		random_ether_addr(dev->dev_addr);
+	__gmac_set_mac_address(dev);
+
+	err = gmac_setup_phy(dev);
+	if (err)
+		netif_warn(gmac, probe, dev,
+			"PHY init failed: %d, deferring to ifup time\n", err);
+
+	err = register_netdev(dev);
+	if (!err)
+		return 0;
+
+	toe->netdev[num] = NULL;
+	free_netdev(dev);
+	return err;
+}
+
+/*
+ * Allocate and DMA-map a fresh block of 1 << toe->alloc_order pages for
+ * the RX free queue and make it the current block (toe->freeq_page,
+ * freeq_page_count, freeq_page_offs).
+ *
+ * The allocation order adapts: every failure halves it, and a success on
+ * the first attempt raises it by one (up to RX_MAX_ALLOC_ORDER) for the
+ * next call.  With @emerg set, the order-0 attempt is made with
+ * __GFP_HIGH.
+ *
+ * Returns the first page of the block, or NULL when even the smallest
+ * allocation failed or the block could not be DMA-mapped.  Earlier
+ * versions retried forever in both cases.
+ *
+ * NOTE(review): the dma_map_page() result is only checked for errors; the
+ * bus address is recomputed elsewhere via page_to_dma().  The block is not
+ * a compound page, so put_page() on the error path looks like it releases
+ * only the first page - confirm.
+ */
+static struct page *toe_alloc_freeq_pages(struct toe_private *toe, bool emerg)
+{
+	gfp_t gfp_mask = GFP_NOIO;
+	bool retried = false;
+	bool min_order_failed;
+
+retry:
+	toe->freeq_page = alloc_pages(gfp_mask, toe->alloc_order);
+	if (!toe->freeq_page) {
+		min_order_failed = !toe->alloc_order;
+		toe->alloc_order >>= 1;
+		if (gfp_mask & __GFP_HIGH)
+			/* even emergency alloc failed */
+			return NULL;
+		if (!toe->alloc_order && emerg)
+			gfp_mask |= __GFP_HIGH;
+		else if (min_order_failed)
+			/* order 0 failed and there is nothing left to try */
+			return NULL;
+		retried = true;
+		goto retry;
+	}
+
+	if (dma_mapping_error(toe->dev, dma_map_page(toe->dev,
+	    toe->freeq_page, 0, PAGE_SIZE << toe->alloc_order,
+	    DMA_FROM_DEVICE))) {
+		/* a mapping failure will not go away by allocating again */
+		put_page(toe->freeq_page);
+		toe->freeq_page = NULL;
+		return NULL;
+	}
+
+	toe->freeq_page_count = 1 << toe->alloc_order;
+	toe->freeq_page_offs = 0;
+
+	if (!retried && toe->alloc_order < RX_MAX_ALLOC_ORDER)
+		toe->alloc_order++;
+
+	return toe->freeq_page;
+}
+
+/*
+ * Advance the free-queue allocation cursor by @eaten_size bytes and return
+ * the page the next buffer will come from.
+ *
+ * While the cursor stays inside @page, that page gets one more reference
+ * and is returned again.  When a page boundary is reached the cursor moves
+ * to the following page of the current block (also taking a reference), or
+ * NULL is returned once the block is used up so that the caller allocates
+ * a new one.
+ */
+static struct page *toe_get_next_page(struct toe_private *toe,
+	struct page *page, unsigned eaten_size)
+{
+	toe->freeq_page_offs += eaten_size;
+	/* non-zero offset within the page: more room in this page */
+	if (toe->freeq_page_offs & ~PAGE_MASK) {
+		get_page(page);
+		return page;
+	}
+
+	if (!--toe->freeq_page_count)
+		return NULL;
+
+	toe->freeq_page_offs = 0;
+
+	toe->freeq_page = ++page;
+	get_page(page);
+
+	return page;
+}
+
+/*
+ * Fill free-queue descriptors [@begin, @end) with RX buffers carved from
+ * the current page block, allocating new blocks as needed, then publish
+ * the new write pointer to the hardware.
+ *
+ * Stops early when no more pages can be allocated.  Returns the number of
+ * descriptors actually filled.
+ */
+static unsigned int toe_fill_freeq_range(struct toe_private *toe,
+	unsigned int begin, unsigned int end)
+{
+	void __iomem *ptr_reg = toe_reg(toe, GLOBAL_SWFQ_RWPTR_REG);
+	GMAC_RXDESC_T *desc, *dend;
+	struct page *page;
+	unsigned count;
+
+	dev_dbg(toe->dev, "freeq: filling <%u,%u)  (ptr: %u %u)\n",
+		begin, end, GET_RPTR(ptr_reg), GET_WPTR(ptr_reg));
+
+	/* continue in the partially used block, if there is one */
+	if (toe->freeq_page_count)
+		page = toe->freeq_page;
+	else
+		page = NULL;
+
+	desc = toe->freeq_ring + begin;
+	dend = toe->freeq_ring + end;
+
+	for (count = 0; desc != dend; ++desc, ++count) {
+		if (!page) {
+			page = toe_alloc_freeq_pages(toe, 0);
+			if (!page)
+				break;
+		}
+
+		/* only word2 gets copied to rxq descriptor */
+		/* buffer size is taken from DMA_SKB_SIZE_REG */
+		desc->word2.buf_adr = page_to_dma(toe->dev, page) +
+			toe->freeq_page_offs;
+
+		/* log only the first descriptor of each fill */
+		if (unlikely(!count))
+			dev_dbg(toe->dev,
+				"freeq[%zu]: 0x%08x 0x%08x 0x%08x 0x%08x, page %p, offs 0x%04x\n",
+				desc - toe->freeq_ring,
+				desc->word0.bits32, desc->word1.bits32,
+				desc->word2.bits32, desc->word3.bits32,
+				page, toe->freeq_page_offs);
+
+		page = toe_get_next_page(toe, page,
+			1 << toe->freeq_frag_order);
+	}
+
+	/* wrap the index of the first unfilled descriptor into the ring */
+	end = (desc - toe->freeq_ring) & __RWPTR_MASK(toe->freeq_order);
+	SET_WPTR(ptr_reg, end);
+
+	return count;
+}
+
+/*
+ * Enable or disable the "software free queue empty" interrupt in enable
+ * register 4.  Takes toe->irq_lock with interrupts disabled, so it must
+ * not be called with that lock already held.
+ */
+static void toe_enable_irq(struct toe_private *toe, int enable)
+{
+	unsigned long flags;
+	unsigned val;
+
+	spin_lock_irqsave(&toe->irq_lock, flags);
+
+	val = __raw_readl(toe_reg(toe, GLOBAL_INTERRUPT_ENABLE_4_REG));
+	val = enable ? (val | SWFQ_EMPTY_INT_BIT) : (val & ~SWFQ_EMPTY_INT_BIT);
+	__raw_writel(val, toe_reg(toe, GLOBAL_INTERRUPT_ENABLE_4_REG));
+
+	spin_unlock_irqrestore(&toe->irq_lock, flags);
+}
+
+/*
+ * Hard-IRQ half of the free-queue interrupt.
+ *
+ * When the "software free queue empty" interrupt is pending and enabled,
+ * it is masked and the threaded handler is woken to refill the queue;
+ * the thread unmasks it again when done.
+ *
+ * The mask update is done inline while toe->irq_lock is held.  Calling
+ * toe_enable_irq() from here would try to take the same non-recursive
+ * spinlock a second time and deadlock.
+ *
+ * Returns IRQ_WAKE_THREAD if the refill thread is needed, else IRQ_NONE.
+ */
+static irqreturn_t toe_interrupt(int irq, void *data)
+{
+	struct toe_private *toe = data;
+
+	void __iomem *irqif_reg = toe_reg(toe, GLOBAL_INTERRUPT_STATUS_4_REG);
+	void __iomem *irqen_reg = toe_reg(toe, GLOBAL_INTERRUPT_ENABLE_4_REG);
+	unsigned val, en;
+	irqreturn_t ret = IRQ_NONE;
+
+	spin_lock(&toe->irq_lock);
+
+	val = __raw_readl(irqif_reg);
+	en = __raw_readl(irqen_reg);
+	val &= en;
+
+	if (val & SWFQ_EMPTY_INT_BIT) {
+		/* mask the source until the thread has refilled the queue */
+		__raw_writel(en & ~SWFQ_EMPTY_INT_BIT, irqen_reg);
+		ret = IRQ_WAKE_THREAD;
+	}
+
+	spin_unlock(&toe->irq_lock);
+
+	return ret;
+}
+
+/*
+ * Threaded half of the free-queue interrupt: refill the software free
+ * queue up to freeq_entries buffers, acknowledge the "empty" interrupt and
+ * unmask it again.
+ *
+ * The ring is filled from the write pointer up to the end of the ring
+ * first; if that part was filled completely the loop runs again to
+ * continue from index 0.
+ *
+ * NOTE(review): if the queue is empty and page allocation keeps failing,
+ * the "goto retry" after the ack appears to loop without bound - confirm.
+ */
+static irqreturn_t toe_interrupt_thread(int irq, void *data)
+{
+	struct toe_private *toe = data;
+	void __iomem *rwptr_reg = toe_reg(toe, GLOBAL_SWFQ_RWPTR_REG);
+	void __iomem *irqif_reg = toe_reg(toe, GLOBAL_INTERRUPT_STATUS_4_REG);
+	unsigned int r, w, d, end, count = 0;
+
+retry:
+	w = GET_WPTR(rwptr_reg);
+	r = GET_RPTR(rwptr_reg);
+
+	/* d: buffers currently queued, then buffers still missing */
+	d = RWPTR_DISTANCE(r, w, toe->freeq_order);
+	if (unlikely(d >= toe->freeq_entries))
+		goto full;
+	d = toe->freeq_entries - d;
+
+	/* never fill past the end of the ring in one go */
+	end = min(w + d, 1u << toe->freeq_order);
+	count += toe_fill_freeq_range(toe, w, end);
+
+	/* filled right up to the ring end: wrap around and continue */
+	if (count == end - w && !(end & __RWPTR_MASK(toe->freeq_order)))
+		goto retry;
+
+full:
+	dev_dbg(toe->dev, "freeq: filled %u buffers\n", count);
+
+	__raw_writel(SWFQ_EMPTY_INT_BIT, irqif_reg);
+	/* queue drained again while the interrupt was being acked */
+	if (unlikely(GET_WPTR(rwptr_reg) == GET_RPTR(rwptr_reg))) {
+		if (unlikely(!count))
+			dev_warn(toe->dev, "freeq: full, but empty?\n");
+		count = 0;
+		goto retry;
+	}
+
+	toe_enable_irq(toe, 1);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Allocate the software free-queue descriptor ring (1 << freeq_order
+ * descriptors), program its base/size and the RX buffer size into the
+ * hardware, fill it with up to freeq_entries buffers and set the "queue
+ * empty" threshold to half of what was filled (capped at 255).
+ *
+ * Returns 0 on success (a partially filled ring only warns) or -ENOMEM if
+ * the ring or not even a single buffer could be allocated.
+ */
+static int toe_setup_freeq(struct toe_private *toe)
+{
+	void __iomem *dma_reg = toe_reg(toe, GLOBAL_SW_FREEQ_BASE_SIZE_REG);
+	QUEUE_THRESHOLD_T qt;
+	DMA_SKB_SIZE_T skbsz = { .bits = { .sw_skb_size = 1 << toe->freeq_frag_order } };
+	unsigned n;
+
+	toe->freeq_ring = dma_alloc_coherent(toe->dev,
+		sizeof(*toe->freeq_ring) << toe->freeq_order,
+		&toe->freeq_dma_base, GFP_KERNEL);
+	if (!toe->freeq_ring)
+		return -ENOMEM;
+
+	/* the low bits of the base register carry the ring order */
+	BUG_ON(toe->freeq_dma_base & ~DMA_Q_BASE_MASK);
+
+	__raw_writel(skbsz.bits32, toe_reg(toe, GLOBAL_DMA_SKB_SIZE_REG));
+	__raw_writel(toe->freeq_dma_base | toe->freeq_order, dma_reg);
+
+	/* fill ring */
+	n = toe_fill_freeq_range(toe, 0, toe->freeq_entries);
+	if (!n)
+		goto err_freeq;
+	if (n != toe->freeq_entries)
+		dev_warn(toe->dev, "Allocated only %u of %u RX buffers\n",
+			n, toe->freeq_entries);
+
+	qt.bits32 = __raw_readl(toe_reg(toe, GLOBAL_QUEUE_THRESHOLD_REG));
+	qt.bits.swfq_empty = min_t(unsigned, (n + 1) >> 1, 255);
+	__raw_writel(qt.bits32, toe_reg(toe, GLOBAL_QUEUE_THRESHOLD_REG));
+
+	dev_dbg(toe->dev, "freeq: ring %p (dma 0x%08x), len %u (order %u), thr %u\n",
+		toe->freeq_ring, toe->freeq_dma_base,
+		toe->freeq_entries, toe->freeq_order, qt.bits.swfq_empty);
+
+	return 0;
+
+err_freeq:
+	/* detach the ring from the hardware before freeing it */
+	__raw_writel(0, dma_reg);
+	dma_free_coherent(toe->dev,
+		sizeof(*toe->freeq_ring) << toe->freeq_order,
+		toe->freeq_ring, toe->freeq_dma_base);
+	toe->freeq_ring = NULL;
+
+	return -ENOMEM;
+}
+
+/*
+ * Detach the software free queue from the hardware, release every buffer
+ * still queued between the read and write pointers and free the
+ * descriptor ring.
+ *
+ * Must only be called while a ring exists: toe->freeq_ring and
+ * toe->freeq_order are used without any check.
+ */
+static void toe_cleanup_freeq(struct toe_private *toe)
+{
+	void __iomem *dma_reg = toe_reg(toe, GLOBAL_SW_FREEQ_BASE_SIZE_REG);
+	void __iomem *ptr_reg = toe_reg(toe, GLOBAL_SWFQ_RWPTR_REG);
+	unsigned i, e, mask = __RWPTR_MASK(toe->freeq_order);
+
+	/* sample the pointers before resetting the hardware registers */
+	i = GET_RPTR(ptr_reg);
+	e = GET_WPTR(ptr_reg);
+	__raw_writel(0, ptr_reg);
+	__raw_writel(0, dma_reg);
+
+	for (; i != e; i = __RWPTR_NEXT(i, mask))
+		put_page(toe_unmap_rx_desc(toe, &toe->freeq_ring[i]));
+
+	dma_free_coherent(toe->dev,
+		sizeof(*toe->freeq_ring) << toe->freeq_order,
+		toe->freeq_ring, toe->freeq_dma_base);
+
+	toe->freeq_ring = NULL;
+}
+
+/*
+ * Rebuild the shared RX free queue so that it can feed the RX rings of
+ * both ports: the new size is the sum of the ports' RX ring sizes, the
+ * ring order is the next power of two (at most 15), and the size is capped
+ * at one less than the ring capacity.
+ *
+ * @changing_dev_id is the port whose RX ring size changed; the other port
+ * must not be running, otherwise -EBUSY is returned.  On a setup failure
+ * the error is returned with the free-queue interrupt left disabled.
+ */
+static int toe_resize_freeq(struct toe_private *toe, int changing_dev_id)
+{
+	struct net_device *other = toe->netdev[1 - changing_dev_id];
+	unsigned new_size = 0;
+	unsigned new_order;
+	int err;
+
+	if (other && netif_running(other))
+		return -EBUSY;
+
+	if (toe->netdev[0])
+		new_size  = 1 << netdev_to_gmac(toe->netdev[0])->rxq_order;
+
+	if (toe->netdev[1])
+		new_size += 1 << netdev_to_gmac(toe->netdev[1])->rxq_order;
+
+	new_order = min(15, ilog2(new_size - 1) + 1);
+	/* keep one slot unused so the ring is never filled completely */
+	if (new_size >= 1 << new_order)
+		new_size = (1 << new_order) - 1;
+
+	toe_enable_irq(toe, 0);
+	if (toe->freeq_ring)
+		toe_cleanup_freeq(toe);
+
+	toe->freeq_order = new_order;
+	toe->freeq_entries = new_size;
+
+	err = toe_setup_freeq(toe);
+	if (unlikely(err))
+		return err;
+
+	toe_enable_irq(toe, 1);
+
+	return 0;
+}
+
+
+/*
+ * Interrupt config:
+ *
+ *	GMAC0 intr bits ------> int0 ----> eth0
+ *	GMAC1 intr bits ------> int1 ----> eth1
+ *	TOE intr -------------> int1 ----> eth1
+ *	Classification Intr --> int0 ----> eth0
+ *	Default Q0 -----------> int0 ----> eth0
+ *	Default Q1 -----------> int1 ----> eth1
+ *	FreeQ intr -----------> int1 ----> eth1
+ *
+ * toe_prepare_irq() puts the interrupt block into a known state: all five
+ * enable registers are cleared, the select registers are loaded with the
+ * routing constants below, and every status bit is acknowledged by writing
+ * all-ones to the status registers.
+ */
+static void toe_prepare_irq(struct toe_private *toe)
+{
+	/* mask everything first */
+	__raw_writel(0, toe_reg(toe, GLOBAL_INTERRUPT_ENABLE_0_REG));
+	__raw_writel(0, toe_reg(toe, GLOBAL_INTERRUPT_ENABLE_1_REG));
+	__raw_writel(0, toe_reg(toe, GLOBAL_INTERRUPT_ENABLE_2_REG));
+	__raw_writel(0, toe_reg(toe, GLOBAL_INTERRUPT_ENABLE_3_REG));
+	__raw_writel(0, toe_reg(toe, GLOBAL_INTERRUPT_ENABLE_4_REG));
+
+	/* route each source to int0 or int1 as listed above */
+	__raw_writel(0xCCFC0FC0, toe_reg(toe, GLOBAL_INTERRUPT_SELECT_0_REG));
+	__raw_writel(0x00F00002, toe_reg(toe, GLOBAL_INTERRUPT_SELECT_1_REG));
+	__raw_writel(0xFFFFFFFF, toe_reg(toe, GLOBAL_INTERRUPT_SELECT_2_REG));
+	__raw_writel(0xFFFFFFFF, toe_reg(toe, GLOBAL_INTERRUPT_SELECT_3_REG));
+	__raw_writel(0xFF000003, toe_reg(toe, GLOBAL_INTERRUPT_SELECT_4_REG));
+
+	/* edge-triggered interrupts packed to level-triggered one... */
+	__raw_writel(~0, toe_reg(toe, GLOBAL_INTERRUPT_STATUS_0_REG));
+	__raw_writel(~0, toe_reg(toe, GLOBAL_INTERRUPT_STATUS_1_REG));
+	__raw_writel(~0, toe_reg(toe, GLOBAL_INTERRUPT_STATUS_2_REG));
+	__raw_writel(~0, toe_reg(toe, GLOBAL_INTERRUPT_STATUS_3_REG));
+	__raw_writel(~0, toe_reg(toe, GLOBAL_INTERRUPT_STATUS_4_REG));
+}
+
+/*
+ * One-time TOE setup at probe: clear the software/hardware free-queue base
+ * and pointer registers, set the default RX buffer size order, reset the
+ * interrupt block and install the free-queue interrupt handlers.
+ *
+ * No free queue is allocated here (freeq_order is set to the invalid
+ * value ~0 and freeq_ring stays NULL); toe_resize_freeq() creates it
+ * later.  Consequently there is nothing to clean up if requesting the
+ * interrupt fails, and toe_cleanup_freeq() must not be called: it would
+ * operate on a NULL ring with an invalid order.
+ *
+ * Returns 0 or the error from request_threaded_irq().
+ */
+static __devinit int toe_init(struct toe_private *toe,
+	struct platform_device *pdev)
+{
+	__raw_writel(0, toe_reg(toe, GLOBAL_SW_FREEQ_BASE_SIZE_REG));
+	__raw_writel(0, toe_reg(toe, GLOBAL_HW_FREEQ_BASE_SIZE_REG));
+	__raw_writel(0, toe_reg(toe, GLOBAL_SWFQ_RWPTR_REG));
+	__raw_writel(0, toe_reg(toe, GLOBAL_HWFQ_RWPTR_REG));
+
+	toe->freeq_frag_order = DEFAULT_RX_BUF_ORDER;
+	toe->freeq_order = ~0;
+
+	toe_prepare_irq(toe);
+
+	return request_threaded_irq(toe->irq, toe_interrupt,
+		toe_interrupt_thread, IRQF_SHARED, "sl351x-TOE", toe);
+}
+
+/*
+ * Undo toe_init(): mask and acknowledge all interrupts, release the
+ * free-queue interrupt, free the free queue if one was ever set up and
+ * drop the reference on a partially consumed page block.
+ *
+ * The free queue may legitimately be missing (probe failed before
+ * toe_resize_freeq() ran, or the last resize failed), so it is only
+ * cleaned up when toe->freeq_ring is set.
+ */
+static void toe_deinit(struct toe_private *toe)
+{
+	toe_prepare_irq(toe);
+	free_irq(toe->irq, toe);
+
+	if (toe->freeq_ring)
+		toe_cleanup_freeq(toe);
+
+	if (toe->freeq_page_count)
+		put_page(toe->freeq_page);
+}
+
+/*
+ * Reset both GMACs through the global reset register, then poll the TOE
+ * version register (up to 5 times, 2 us apart) until it reads non-zero.
+ *
+ * Returns 0 once a version was read, -EIO if the register stayed zero.
+ * The version value is logged either way.
+ */
+static int toe_reset(struct toe_private *toe)
+{
+	unsigned int reg, retry = 5;
+
+	reg = __raw_readl(IO_ADDRESS(GEMINI_GLOBAL_BASE) + GLOBAL_RESET);
+	reg |= RESET_GMAC1 | RESET_GMAC0;
+	__raw_writel(reg, IO_ADDRESS(GEMINI_GLOBAL_BASE) + GLOBAL_RESET);
+
+	/* a zero version means the block has not come out of reset yet */
+	do {
+		udelay(2);
+		reg = __raw_readl(toe_reg(toe, GLOBAL_TOE_VERSION_REG));
+		barrier();
+	} while (!reg && --retry);
+
+	dev_info(toe->dev, "Gemini GMAC version 0x%x\n", reg);
+
+	return reg ? 0 : -EIO;
+}
+
+/*
+ * Platform probe: map the register block, reset the hardware, set up the
+ * shared TOE state and its interrupt, create the net_devices of both ports
+ * and size the shared RX free queue for them.
+ *
+ * Returns 0 on success or a negative errno.  On failure everything set up
+ * so far is undone; this includes freeing the already registered port 0
+ * net_device (unregister_netdev() alone does not free it) and clearing
+ * the driver data pointer.
+ *
+ * A failure to set up the free queue is not fatal for probing, but it is
+ * reported so that it does not go unnoticed.
+ */
+static int __devinit gemini_gmac_probe(struct platform_device *pdev)
+{
+	struct resource *res;
+	struct toe_private *toe;
+	int retval;
+
+	if (!pdev->dev.platform_data)
+		return -EINVAL;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(&pdev->dev, "can't get device resources\n");
+		return -ENODEV;
+	}
+
+	toe = kzalloc(sizeof(*toe), GFP_KERNEL);
+	if (!toe)
+		return -ENOMEM;
+
+	platform_set_drvdata(pdev, toe);
+	toe->dev = &pdev->dev;
+	toe->iomem = ioremap(res->start, resource_size(res));
+	if (!toe->iomem) {
+		dev_err(toe->dev, "ioremap failed\n");
+		retval = -EIO;
+		goto err_data;
+	}
+
+	retval = toe_reset(toe);
+	if (retval < 0)
+		goto err_unmap;
+
+	/* the second platform IRQ doubles as the TOE/free-queue interrupt */
+	retval = toe->irq = platform_get_irq(pdev, 1);
+	if (retval < 0)
+		goto err_unmap;
+
+	spin_lock_init(&toe->irq_lock);
+
+	retval = toe_init(toe, pdev);
+	if (retval)
+		goto err_unmap;
+
+	retval = gmac_init_netdev(toe, 0, pdev);
+	if (retval)
+		goto err_uninit;
+
+	retval = gmac_init_netdev(toe, 1, pdev);
+	if (retval)
+		goto err_uninit;
+
+	retval = toe_resize_freeq(toe, 0);
+	if (retval)
+		dev_warn(&pdev->dev,
+			"RX free queue setup failed: %d\n", retval);
+
+	dev_dbg(&pdev->dev, "initialized.\n");
+	return 0;
+
+err_uninit:
+	/* netdev[1] is never left set when its init fails */
+	if (toe->netdev[0]) {
+		unregister_netdev(toe->netdev[0]);
+		free_netdev(toe->netdev[0]);
+		toe->netdev[0] = NULL;
+	}
+	toe_deinit(toe);
+err_unmap:
+	iounmap(toe->iomem);
+err_data:
+	platform_set_drvdata(pdev, NULL);
+	kfree(toe);
+	return retval;
+}
+
+/*
+ * Platform remove: unregister and free both port net_devices, tear down
+ * the shared TOE state, unmap the registers and free the driver data.
+ * Always returns 0.
+ *
+ * NOTE(review): free_netdev() is added because nothing visible here frees
+ * the devices after unregister_netdev(); confirm that gmac_uninit() or a
+ * netdev destructor does not already do so.
+ */
+static int __devexit gemini_gmac_remove(struct platform_device *pdev)
+{
+	struct toe_private *toe = platform_get_drvdata(pdev);
+	int i;
+
+	for (i = 0; i < 2; i++) {
+		struct net_device *dev = toe->netdev[i];
+
+		if (!dev)
+			continue;
+
+		unregister_netdev(dev);
+		free_netdev(dev);
+		toe->netdev[i] = NULL;
+	}
+	toe_deinit(toe);
+
+	iounmap(toe->iomem);
+	kfree(toe);
+
+	return 0;
+}
+
+/* platform driver glue; binds to platform devices named "gemini-gmac" */
+static struct platform_driver gemini_gmac_driver = {
+	.probe		= gemini_gmac_probe,
+	.remove		= __devexit_p(gemini_gmac_remove),
+	.driver.name	= "gemini-gmac",
+	.driver.owner	= THIS_MODULE,
+};
+
+/*
+ * Module init: when mdio-gpio is built as a module, ask for it to be
+ * loaded first (the result is not checked), then register the platform
+ * driver and return its result.
+ */
+static int __init gemini_gmac_init(void)
+{
+#ifdef CONFIG_MDIO_GPIO_MODULE
+	request_module("mdio-gpio");
+#endif
+	return platform_driver_register(&gemini_gmac_driver);
+}
+
+/* Module exit: unregister the platform driver. */
+static void __exit gemini_gmac_exit(void)
+{
+	platform_driver_unregister(&gemini_gmac_driver);
+}
+
+module_init(gemini_gmac_init);
+module_exit(gemini_gmac_exit);
+
+MODULE_AUTHOR("Michał Mirosław");
+MODULE_DESCRIPTION("StorLink SL351x (Gemini) ethernet driver");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:gemini-gmac");
diff --git a/drivers/net/sl351x_hw.h b/drivers/net/sl351x_hw.h
new file mode 100644
index 0000000..05b1b36
--- /dev/null
+++ b/drivers/net/sl351x_hw.h
@@ -0,0 +1,1436 @@
+/*
+ *  Register definitions for Gemini LEPUS GMAC Ethernet device driver.
+ *
+ *  Copyright (C) 2006, Storlink, Corp.
+ *  Copyright (C) 2008-2009, Paulius Zaleckas <paulius.zaleckas@teltonika.lt>
+ *  Copyright (C) 2010, Michał Mirosław <mirq-linux@rere.qmqm.pl>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#ifndef _GMAC_HW_H
+#define _GMAC_HW_H
+
+#include <linux/bitops.h>
+
+/*
+ * Base Registers
+ */
+#define TOE_NONTOE_QUE_HDR_BASE		0x2000
+#define TOE_TOE_QUE_HDR_BASE		0x3000
+#define TOE_V_BIT_BASE			0x4000
+#define TOE_A_BIT_BASE			0x6000
+#define TOE_GMAC_DMA_BASE(x)		(0x8000 + 0x4000 * (x))
+#define TOE_GMAC_BASE(x)		(0xA000 + 0x4000 * (x))
+
+/*
+ * Queue ID
+ */
+#define TOE_SW_FREE_QID			0x00
+#define TOE_HW_FREE_QID			0x01
+#define TOE_GMAC0_SW_TXQ0_QID		0x02
+#define TOE_GMAC0_SW_TXQ1_QID		0x03
+#define TOE_GMAC0_SW_TXQ2_QID		0x04
+#define TOE_GMAC0_SW_TXQ3_QID		0x05
+#define TOE_GMAC0_SW_TXQ4_QID		0x06
+#define TOE_GMAC0_SW_TXQ5_QID		0x07
+#define TOE_GMAC0_HW_TXQ0_QID		0x08
+#define TOE_GMAC0_HW_TXQ1_QID		0x09
+#define TOE_GMAC0_HW_TXQ2_QID		0x0A
+#define TOE_GMAC0_HW_TXQ3_QID		0x0B
+#define TOE_GMAC1_SW_TXQ0_QID		0x12
+#define TOE_GMAC1_SW_TXQ1_QID		0x13
+#define TOE_GMAC1_SW_TXQ2_QID		0x14
+#define TOE_GMAC1_SW_TXQ3_QID		0x15
+#define TOE_GMAC1_SW_TXQ4_QID		0x16
+#define TOE_GMAC1_SW_TXQ5_QID		0x17
+#define TOE_GMAC1_HW_TXQ0_QID		0x18
+#define TOE_GMAC1_HW_TXQ1_QID		0x19
+#define TOE_GMAC1_HW_TXQ2_QID		0x1A
+#define TOE_GMAC1_HW_TXQ3_QID		0x1B
+#define TOE_GMAC0_DEFAULT_QID		0x20
+#define TOE_GMAC1_DEFAULT_QID		0x21
+#define TOE_CLASSIFICATION_QID(x)	(0x22 + (x))	/* 0x22 ~ 0x2F */
+#define TOE_TOE_QID(x)			(0x40 + (x))	/* 0x40 ~ 0x7F */
+
+/*
+ * old info:
+ * TOE DMA Queue Size should be 2^n, n = 6...12
+ * TOE DMA Queues are the following queue types:
+ *		SW Free Queue, HW Free Queue,
+ *		GMAC 0/1 SW TX Q0-5, and GMAC 0/1 HW TX Q0-5
+ * The base address and descriptor number are configured at
+ * DMA Queues Descriptor Ring Base Address/Size Register (offset 0x0004)
+ */
+
+#define GET_WPTR(addr)			__raw_readw((addr) + 2)
+#define GET_RPTR(addr)			__raw_readw((addr))
+#define SET_WPTR(addr, data)		__raw_writew((data), (addr) + 2)
+#define SET_RPTR(addr, data)		__raw_writew((data), (addr))
+#define __RWPTR_NEXT(x, mask)		(((unsigned int)(x) + 1) & (mask))
+#define __RWPTR_PREV(x, mask)		(((unsigned int)(x) - 1) & (mask))
+#define __RWPTR_DISTANCE(r, w, mask)	(((unsigned int)(w) - (r)) & (mask))
+#define __RWPTR_MASK(order)		((1 << (order)) - 1)
+#define RWPTR_NEXT(x, order)		__RWPTR_NEXT((x), __RWPTR_MASK((order)))
+#define RWPTR_PREV(x, order)		__RWPTR_PREV((x), __RWPTR_MASK((order)))
+#define RWPTR_DISTANCE(r, w, order)	__RWPTR_DISTANCE((r), (w), \
+						__RWPTR_MASK((order)))
+
+/*
+ * Global registers
+ * #define TOE_GLOBAL_BASE			(TOE_BASE + 0x0000)
+ * Base 0x60000000
+ */
+#define GLOBAL_TOE_VERSION_REG		0x0000
+#define GLOBAL_SW_FREEQ_BASE_SIZE_REG	0x0004
+#define GLOBAL_HW_FREEQ_BASE_SIZE_REG	0x0008
+#define GLOBAL_DMA_SKB_SIZE_REG		0x0010
+#define GLOBAL_SWFQ_RWPTR_REG		0x0014
+#define GLOBAL_HWFQ_RWPTR_REG		0x0018
+#define GLOBAL_INTERRUPT_STATUS_0_REG	0x0020
+#define GLOBAL_INTERRUPT_ENABLE_0_REG	0x0024
+#define GLOBAL_INTERRUPT_SELECT_0_REG	0x0028
+#define GLOBAL_INTERRUPT_STATUS_1_REG	0x0030
+#define GLOBAL_INTERRUPT_ENABLE_1_REG	0x0034
+#define GLOBAL_INTERRUPT_SELECT_1_REG	0x0038
+#define GLOBAL_INTERRUPT_STATUS_2_REG	0x0040
+#define GLOBAL_INTERRUPT_ENABLE_2_REG	0x0044
+#define GLOBAL_INTERRUPT_SELECT_2_REG	0x0048
+#define GLOBAL_INTERRUPT_STATUS_3_REG	0x0050
+#define GLOBAL_INTERRUPT_ENABLE_3_REG	0x0054
+#define GLOBAL_INTERRUPT_SELECT_3_REG	0x0058
+#define GLOBAL_INTERRUPT_STATUS_4_REG	0x0060
+#define GLOBAL_INTERRUPT_ENABLE_4_REG	0x0064
+#define GLOBAL_INTERRUPT_SELECT_4_REG	0x0068
+#define GLOBAL_HASH_TABLE_BASE_REG	0x006C
+#define GLOBAL_QUEUE_THRESHOLD_REG	0x0070
+
+/*
+ * GMAC 0/1 DMA/TOE register
+ * #define TOE_GMAC0_DMA_BASE		(TOE_BASE + 0x8000)
+ * #define TOE_GMAC1_DMA_BASE		(TOE_BASE + 0xC000)
+ * Base 0x60008000 or 0x6000C000
+ */
+#define GMAC_DMA_CTRL_REG		0x0000
+#define GMAC_TX_WEIGHTING_CTRL_0_REG	0x0004
+#define GMAC_TX_WEIGHTING_CTRL_1_REG	0x0008
+#define GMAC_SW_TX_QUEUE0_PTR_REG	0x000C
+#define GMAC_SW_TX_QUEUE1_PTR_REG	0x0010
+#define GMAC_SW_TX_QUEUE2_PTR_REG	0x0014
+#define GMAC_SW_TX_QUEUE3_PTR_REG	0x0018
+#define GMAC_SW_TX_QUEUE4_PTR_REG	0x001C
+#define GMAC_SW_TX_QUEUE5_PTR_REG	0x0020
+#define GMAC_SW_TX_QUEUE_PTR_REG(i)	(GMAC_SW_TX_QUEUE0_PTR_REG + 4 * (i))
+#define GMAC_HW_TX_QUEUE0_PTR_REG	0x0024
+#define GMAC_HW_TX_QUEUE1_PTR_REG	0x0028
+#define GMAC_HW_TX_QUEUE2_PTR_REG	0x002C
+#define GMAC_HW_TX_QUEUE3_PTR_REG	0x0030
+#define GMAC_HW_TX_QUEUE_PTR_REG(i)	(GMAC_HW_TX_QUEUE0_PTR_REG + 4 * (i))
+#define GMAC_DMA_TX_FIRST_DESC_REG	0x0038
+#define GMAC_DMA_TX_CURR_DESC_REG	0x003C
+#define GMAC_DMA_TX_DESC_WORD0_REG	0x0040
+#define GMAC_DMA_TX_DESC_WORD1_REG	0x0044
+#define GMAC_DMA_TX_DESC_WORD2_REG	0x0048
+#define GMAC_DMA_TX_DESC_WORD3_REG	0x004C
+#define GMAC_SW_TX_QUEUE_BASE_REG	0x0050
+#define GMAC_HW_TX_QUEUE_BASE_REG	0x0054
+#define GMAC_DMA_RX_FIRST_DESC_REG	0x0058
+#define GMAC_DMA_RX_CURR_DESC_REG	0x005C
+#define GMAC_DMA_RX_DESC_WORD0_REG	0x0060
+#define GMAC_DMA_RX_DESC_WORD1_REG	0x0064
+#define GMAC_DMA_RX_DESC_WORD2_REG	0x0068
+#define GMAC_DMA_RX_DESC_WORD3_REG	0x006C
+#define GMAC_HASH_ENGINE_REG0		0x0070
+#define GMAC_HASH_ENGINE_REG1		0x0074
+/* matching rule 0 Control register 0 */
+#define GMAC_MR0CR0			0x0078
+#define GMAC_MR0CR1			0x007C
+#define GMAC_MR0CR2			0x0080
+#define GMAC_MR1CR0			0x0084
+#define GMAC_MR1CR1			0x0088
+#define GMAC_MR1CR2			0x008C
+#define GMAC_MR2CR0			0x0090
+#define GMAC_MR2CR1			0x0094
+#define GMAC_MR2CR2			0x0098
+#define GMAC_MR3CR0			0x009C
+#define GMAC_MR3CR1			0x00A0
+#define GMAC_MR3CR2			0x00A4
+/* Support Protocol Register 0 */
+#define GMAC_SPR0			0x00A8
+#define GMAC_SPR1			0x00AC
+#define GMAC_SPR2			0x00B0
+#define GMAC_SPR3			0x00B4
+#define GMAC_SPR4			0x00B8
+#define GMAC_SPR5			0x00BC
+#define GMAC_SPR6			0x00C0
+#define GMAC_SPR7			0x00C4
+/* GMAC Hash/Rx/Tx AHB Weighting register */
+#define GMAC_AHB_WEIGHT_REG		0x00C8
+
+/*
+ * TOE GMAC 0/1 register
+ * #define TOE_GMAC0_BASE				(TOE_BASE + 0xA000)
+ * #define TOE_GMAC1_BASE				(TOE_BASE + 0xE000)
+ * Base 0x6000A000 or 0x6000E000
+ */
+enum GMAC_REGISTER {
+	GMAC_STA_ADD0	= 0x0000,
+	GMAC_STA_ADD1	= 0x0004,
+	GMAC_STA_ADD2	= 0x0008,
+	GMAC_RX_FLTR	= 0x000c,
+	GMAC_MCAST_FIL0 = 0x0010,
+	GMAC_MCAST_FIL1 = 0x0014,
+	GMAC_CONFIG0	= 0x0018,
+	GMAC_CONFIG1	= 0x001c,
+	GMAC_CONFIG2	= 0x0020,
+	GMAC_CONFIG3	= 0x0024,
+	GMAC_RESERVED	= 0x0028,
+	GMAC_STATUS	= 0x002c,
+	GMAC_IN_DISCARDS= 0x0030,
+	GMAC_IN_ERRORS  = 0x0034,
+	GMAC_IN_MCAST   = 0x0038,
+	GMAC_IN_BCAST   = 0x003c,
+	GMAC_IN_MAC1    = 0x0040,	/* for STA 1 MAC Address */
+	GMAC_IN_MAC2    = 0x0044	/* for STA 2 MAC Address */
+};
+
+#define RX_STATS_NUM	6
+
+/*
+ * DMA Queues Descriptor Ring Base Address/Size Register (offset 0x0004)
+ */
+typedef union {
+	unsigned int bits32;
+	unsigned int base_size;
+} DMA_Q_BASE_SIZE_T;
+#define DMA_Q_BASE_MASK		(~0x0f)
+
+/*
+ * DMA SKB Buffer register (offset 0x0010, GLOBAL_DMA_SKB_SIZE_REG)
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_0008 {
+		unsigned int sw_skb_size : 16;	/* SW Free poll SKB Size */
+		unsigned int hw_skb_size : 16;	/* HW Free poll SKB Size */
+	} bits;
+} DMA_SKB_SIZE_T;
+
+/*
+ * DMA SW Free Queue Read/Write Pointer Register (offset 0x0014)
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_000c {
+		unsigned int rptr	: 16;	/* Read Ptr, RO */
+		unsigned int wptr	: 16;	/* Write Ptr, RW */
+	} bits;
+} DMA_RWPTR_T;
+
+/*
+ * DMA HW Free Queue Read/Write Pointer Register (offset 0x0018)
+ * see DMA_RWPTR_T structure
+ */
+
+/*
+ * Interrupt Status Register 0	(offset 0x0020)
+ * Interrupt Mask Register 0	(offset 0x0024)
+ * Interrupt Select Register 0	(offset 0x0028)
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_0020 {
+		/* GMAC0 SW Tx Queue 0 EOF Interrupt */
+		unsigned int swtq00_eof	: 1;
+		unsigned int swtq01_eof	: 1;
+		unsigned int swtq02_eof	: 1;
+		unsigned int swtq03_eof	: 1;
+		unsigned int swtq04_eof	: 1;
+		unsigned int swtq05_eof	: 1;
+		/* GMAC1 SW Tx Queue 0 EOF Interrupt */
+		unsigned int swtq10_eof	: 1;
+		unsigned int swtq11_eof	: 1;
+		unsigned int swtq12_eof	: 1;
+		unsigned int swtq13_eof	: 1;
+		unsigned int swtq14_eof	: 1;
+		unsigned int swtq15_eof	: 1;
+		/* GMAC0 SW Tx Queue 0 Finish Interrupt */
+		unsigned int swtq00_fin	: 1;
+		unsigned int swtq01_fin	: 1;
+		unsigned int swtq02_fin	: 1;
+		unsigned int swtq03_fin	: 1;
+		unsigned int swtq04_fin	: 1;
+		unsigned int swtq05_fin	: 1;
+		/* GMAC1 SW Tx Queue 0 Finish Interrupt */
+		unsigned int swtq10_fin	: 1;
+		unsigned int swtq11_fin	: 1;
+		unsigned int swtq12_fin	: 1;
+		unsigned int swtq13_fin	: 1;
+		unsigned int swtq14_fin	: 1;
+		unsigned int swtq15_fin	: 1;
+		/* GMAC0 Rx Descriptor Protocol Error */
+		unsigned int rxPerr0	: 1;
+		/* GMAC0 AHB Bus Error while Rx */
+		unsigned int rxDerr0	: 1;
+		/* GMAC1 Rx Descriptor Protocol Error */
+		unsigned int rxPerr1	: 1;
+		/* GMAC1 AHB Bus Error while Rx */
+		unsigned int rxDerr1	: 1;
+		/* GMAC0 Tx Descriptor Protocol Error */
+		unsigned int txPerr0	: 1;
+		/* GMAC0 AHB Bus Error while Tx */
+		unsigned int txDerr0	: 1;
+		/* GMAC1 Tx Descriptor Protocol Error */
+		unsigned int txPerr1	: 1;
+		/* GMAC1 AHB Bus Error while Tx */
+		unsigned int txDerr1	: 1;
+	} bits;
+} INTR_REG0_T;
+
+#define GMAC1_TXDERR_INT_BIT		BIT(31)
+#define GMAC1_TXPERR_INT_BIT		BIT(30)
+#define GMAC0_TXDERR_INT_BIT		BIT(29)
+#define GMAC0_TXPERR_INT_BIT		BIT(28)
+#define GMAC1_RXDERR_INT_BIT		BIT(27)
+#define GMAC1_RXPERR_INT_BIT		BIT(26)
+#define GMAC0_RXDERR_INT_BIT		BIT(25)
+#define GMAC0_RXPERR_INT_BIT		BIT(24)
+#define GMAC1_SWTQ15_FIN_INT_BIT	BIT(23)
+#define GMAC1_SWTQ14_FIN_INT_BIT	BIT(22)
+#define GMAC1_SWTQ13_FIN_INT_BIT	BIT(21)
+#define GMAC1_SWTQ12_FIN_INT_BIT	BIT(20)
+#define GMAC1_SWTQ11_FIN_INT_BIT	BIT(19)
+#define GMAC1_SWTQ10_FIN_INT_BIT	BIT(18)
+#define GMAC0_SWTQ05_FIN_INT_BIT	BIT(17)
+#define GMAC0_SWTQ04_FIN_INT_BIT	BIT(16)
+#define GMAC0_SWTQ03_FIN_INT_BIT	BIT(15)
+#define GMAC0_SWTQ02_FIN_INT_BIT	BIT(14)
+#define GMAC0_SWTQ01_FIN_INT_BIT	BIT(13)
+#define GMAC0_SWTQ00_FIN_INT_BIT	BIT(12)
+#define GMAC1_SWTQ15_EOF_INT_BIT	BIT(11)
+#define GMAC1_SWTQ14_EOF_INT_BIT	BIT(10)
+#define GMAC1_SWTQ13_EOF_INT_BIT	BIT(9)
+#define GMAC1_SWTQ12_EOF_INT_BIT	BIT(8)
+#define GMAC1_SWTQ11_EOF_INT_BIT	BIT(7)
+#define GMAC1_SWTQ10_EOF_INT_BIT	BIT(6)
+#define GMAC0_SWTQ05_EOF_INT_BIT	BIT(5)
+#define GMAC0_SWTQ04_EOF_INT_BIT	BIT(4)
+#define GMAC0_SWTQ03_EOF_INT_BIT	BIT(3)
+#define GMAC0_SWTQ02_EOF_INT_BIT	BIT(2)
+#define GMAC0_SWTQ01_EOF_INT_BIT	BIT(1)
+#define GMAC0_SWTQ00_EOF_INT_BIT	BIT(0)
+
+/*
+ * Interrupt Status Register 1	(offset 0x0030)
+ * Interrupt Mask Register 1	(offset 0x0034)
+ * Interrupt Select Register 1	(offset 0x0038)
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_0030 {
+		unsigned int default_q0_eof	: 1;	/* Default Queue 0 EOF Interrupt */
+		unsigned int default_q1_eof	: 1;	/* Default Queue 1 EOF Interrupt */
+		unsigned int class_rx		: 14;	/* Classification Queue Rx Interrupt */
+		unsigned int hwtq00_eof		: 1;	/* GMAC0 HW Tx Queue0 EOF Interrupt */
+		unsigned int hwtq01_eof		: 1;	/* GMAC0 HW Tx Queue1 EOF Interrupt */
+		unsigned int hwtq02_eof		: 1;	/* GMAC0 HW Tx Queue2 EOF Interrupt */
+		unsigned int hwtq03_eof		: 1;	/* GMAC0 HW Tx Queue3 EOF Interrupt */
+		unsigned int hwtq10_eof		: 1;	/* GMAC1 HW Tx Queue0 EOF Interrupt */
+		unsigned int hwtq11_eof		: 1;	/* GMAC1 HW Tx Queue1 EOF Interrupt */
+		unsigned int hwtq12_eof		: 1;	/* GMAC1 HW Tx Queue2 EOF Interrupt */
+		unsigned int hwtq13_eof		: 1;	/* GMAC1 HW Tx Queue3 EOF Interrupt */
+		unsigned int toe_iq0_intr	: 1;	/* TOE Interrupt Queue 0 with Interrupts */
+		unsigned int toe_iq1_intr	: 1;	/* TOE Interrupt Queue 1 with Interrupts */
+		unsigned int toe_iq2_intr	: 1;	/* TOE Interrupt Queue 2 with Interrupts */
+		unsigned int toe_iq3_intr	: 1;	/* TOE Interrupt Queue 3 with Interrupts */
+		unsigned int toe_iq0_full	: 1;	/* TOE Interrupt Queue 0 Full Interrupt */
+		unsigned int toe_iq1_full	: 1;	/* TOE Interrupt Queue 1 Full Interrupt */
+		unsigned int toe_iq2_full	: 1;	/* TOE Interrupt Queue 2 Full Interrupt */
+		unsigned int toe_iq3_full	: 1;	/* TOE Interrupt Queue 3 Full Interrupt */
+	} bits;
+} INTR_REG1_T;
+
+#define TOE_IQ3_FULL_INT_BIT		BIT(31)
+#define TOE_IQ2_FULL_INT_BIT		BIT(30)
+#define TOE_IQ1_FULL_INT_BIT		BIT(29)
+#define TOE_IQ0_FULL_INT_BIT		BIT(28)
+#define TOE_IQ3_INT_BIT			BIT(27)
+#define TOE_IQ2_INT_BIT			BIT(26)
+#define TOE_IQ1_INT_BIT			BIT(25)
+#define TOE_IQ0_INT_BIT			BIT(24)
+#define GMAC1_HWTQ13_EOF_INT_BIT	BIT(23)
+#define GMAC1_HWTQ12_EOF_INT_BIT	BIT(22)
+#define GMAC1_HWTQ11_EOF_INT_BIT	BIT(21)
+#define GMAC1_HWTQ10_EOF_INT_BIT	BIT(20)
+#define GMAC0_HWTQ03_EOF_INT_BIT	BIT(19)
+#define GMAC0_HWTQ02_EOF_INT_BIT	BIT(18)
+#define GMAC0_HWTQ01_EOF_INT_BIT	BIT(17)
+#define GMAC0_HWTQ00_EOF_INT_BIT	BIT(16)
+#define CLASS_RX_INT_BIT(x)		BIT((x) + 2)	/* class_rx field starts at bit 2 */
+#define DEFAULT_Q1_INT_BIT		BIT(1)
+#define DEFAULT_Q0_INT_BIT		BIT(0)
+
+#define TOE_IQ_INT_BITS		(TOE_IQ0_INT_BIT | TOE_IQ1_INT_BIT | \
+				 TOE_IQ2_INT_BIT | TOE_IQ3_INT_BIT)
+#define	TOE_IQ_FULL_BITS	(TOE_IQ0_FULL_INT_BIT | TOE_IQ1_FULL_INT_BIT | \
+		                 TOE_IQ2_FULL_INT_BIT | TOE_IQ3_FULL_INT_BIT)
+#define	TOE_IQ_ALL_BITS		(TOE_IQ_INT_BITS | TOE_IQ_FULL_BITS)
+#define TOE_CLASS_RX_INT_BITS	0xfffc
+
+/*
+ * Interrupt Status Register 2	(offset 0x0040)
+ * Interrupt Mask Register 2	(offset 0x0044)
+ * Interrupt Select Register 2	(offset 0x0048)
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_0040 {
+		unsigned int toe_q0_full	: 1;	/* bit 0	TOE Queue 0 Full Interrupt */
+		unsigned int toe_q1_full	: 1;	/* bit 1	TOE Queue 1 Full Interrupt */
+		unsigned int toe_q2_full	: 1;	/* bit 2	TOE Queue 2 Full Interrupt */
+		unsigned int toe_q3_full	: 1;	/* bit 3	TOE Queue 3 Full Interrupt */
+		unsigned int toe_q4_full	: 1;	/* bit 4	TOE Queue 4 Full Interrupt */
+		unsigned int toe_q5_full	: 1;	/* bit 5	TOE Queue 5 Full Interrupt */
+		unsigned int toe_q6_full	: 1;	/* bit 6	TOE Queue 6 Full Interrupt */
+		unsigned int toe_q7_full	: 1;	/* bit 7	TOE Queue 7 Full Interrupt */
+		unsigned int toe_q8_full	: 1;	/* bit 8	TOE Queue 8 Full Interrupt */
+		unsigned int toe_q9_full	: 1;	/* bit 9	TOE Queue 9 Full Interrupt */
+		unsigned int toe_q10_full	: 1;	/* bit 10	TOE Queue 10 Full Interrupt */
+		unsigned int toe_q11_full	: 1;	/* bit 11	TOE Queue 11 Full Interrupt */
+		unsigned int toe_q12_full	: 1;	/* bit 12	TOE Queue 12 Full Interrupt */
+		unsigned int toe_q13_full	: 1;	/* bit 13	TOE Queue 13 Full Interrupt */
+		unsigned int toe_q14_full	: 1;	/* bit 14	TOE Queue 14 Full Interrupt */
+		unsigned int toe_q15_full	: 1;	/* bit 15	TOE Queue 15 Full Interrupt */
+		unsigned int toe_q16_full	: 1;	/* bit 16	TOE Queue 16 Full Interrupt */
+		unsigned int toe_q17_full	: 1;	/* bit 17	TOE Queue 17 Full Interrupt */
+		unsigned int toe_q18_full	: 1;	/* bit 18	TOE Queue 18 Full Interrupt */
+		unsigned int toe_q19_full	: 1;	/* bit 19	TOE Queue 19 Full Interrupt */
+		unsigned int toe_q20_full	: 1;	/* bit 20	TOE Queue 20 Full Interrupt */
+		unsigned int toe_q21_full	: 1;	/* bit 21	TOE Queue 21 Full Interrupt */
+		unsigned int toe_q22_full	: 1;	/* bit 22	TOE Queue 22 Full Interrupt */
+		unsigned int toe_q23_full	: 1;	/* bit 23	TOE Queue 23 Full Interrupt */
+		unsigned int toe_q24_full	: 1;	/* bit 24	TOE Queue 24 Full Interrupt */
+		unsigned int toe_q25_full	: 1;	/* bit 25	TOE Queue 25 Full Interrupt */
+		unsigned int toe_q26_full	: 1;	/* bit 26	TOE Queue 26 Full Interrupt */
+		unsigned int toe_q27_full	: 1;	/* bit 27	TOE Queue 27 Full Interrupt */
+		unsigned int toe_q28_full	: 1;	/* bit 28	TOE Queue 28 Full Interrupt */
+		unsigned int toe_q29_full	: 1;	/* bit 29	TOE Queue 29 Full Interrupt */
+		unsigned int toe_q30_full	: 1;	/* bit 30	TOE Queue 30 Full Interrupt */
+		unsigned int toe_q31_full	: 1;	/* bit 31	TOE Queue 31 Full Interrupt */
+	} bits;
+} INTR_REG2_T;
+
+#define TOE_QL_FULL_INT_BIT(x)		BIT(x)
+
+/*
+ * Interrupt Status Register 3	(offset 0x0050)
+ * Interrupt Mask Register 3	(offset 0x0054)
+ * Interrupt Select Register 3	(offset 0x0058)
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_0050 {
+		unsigned int toe_q32_full	: 1;	/* bit 32	TOE Queue 32 Full Interrupt */
+		unsigned int toe_q33_full	: 1;	/* bit 33	TOE Queue 33 Full Interrupt */
+		unsigned int toe_q34_full	: 1;	/* bit 34	TOE Queue 34 Full Interrupt */
+		unsigned int toe_q35_full	: 1;	/* bit 35	TOE Queue 35 Full Interrupt */
+		unsigned int toe_q36_full	: 1;	/* bit 36	TOE Queue 36 Full Interrupt */
+		unsigned int toe_q37_full	: 1;	/* bit 37	TOE Queue 37 Full Interrupt */
+		unsigned int toe_q38_full	: 1;	/* bit 38	TOE Queue 38 Full Interrupt */
+		unsigned int toe_q39_full	: 1;	/* bit 39	TOE Queue 39 Full Interrupt */
+		unsigned int toe_q40_full	: 1;	/* bit 40	TOE Queue 40 Full Interrupt */
+		unsigned int toe_q41_full	: 1;	/* bit 41	TOE Queue 41 Full Interrupt */
+		unsigned int toe_q42_full	: 1;	/* bit 42	TOE Queue 42 Full Interrupt */
+		unsigned int toe_q43_full	: 1;	/* bit 43	TOE Queue 43 Full Interrupt */
+		unsigned int toe_q44_full	: 1;	/* bit 44	TOE Queue 44 Full Interrupt */
+		unsigned int toe_q45_full	: 1;	/* bit 45	TOE Queue 45 Full Interrupt */
+		unsigned int toe_q46_full	: 1;	/* bit 46	TOE Queue 46 Full Interrupt */
+		unsigned int toe_q47_full	: 1;	/* bit 47	TOE Queue 47 Full Interrupt */
+		unsigned int toe_q48_full	: 1;	/* bit 48	TOE Queue 48 Full Interrupt */
+		unsigned int toe_q49_full	: 1;	/* bit 49	TOE Queue 49 Full Interrupt */
+		unsigned int toe_q50_full	: 1;	/* bit 50	TOE Queue 50 Full Interrupt */
+		unsigned int toe_q51_full	: 1;	/* bit 51	TOE Queue 51 Full Interrupt */
+		unsigned int toe_q52_full	: 1;	/* bit 52	TOE Queue 52 Full Interrupt */
+		unsigned int toe_q53_full	: 1;	/* bit 53	TOE Queue 53 Full Interrupt */
+		unsigned int toe_q54_full	: 1;	/* bit 54	TOE Queue 54 Full Interrupt */
+		unsigned int toe_q55_full	: 1;	/* bit 55	TOE Queue 55 Full Interrupt */
+		unsigned int toe_q56_full	: 1;	/* bit 56	TOE Queue 56 Full Interrupt */
+		unsigned int toe_q57_full	: 1;	/* bit 57	TOE Queue 57 Full Interrupt */
+		unsigned int toe_q58_full	: 1;	/* bit 58	TOE Queue 58 Full Interrupt */
+		unsigned int toe_q59_full	: 1;	/* bit 59	TOE Queue 59 Full Interrupt */
+		unsigned int toe_q60_full	: 1;	/* bit 60	TOE Queue 60 Full Interrupt */
+		unsigned int toe_q61_full	: 1;	/* bit 61	TOE Queue 61 Full Interrupt */
+		unsigned int toe_q62_full	: 1;	/* bit 62	TOE Queue 62 Full Interrupt */
+		unsigned int toe_q63_full	: 1;	/* bit 63	TOE Queue 63 Full Interrupt */
+	} bits;
+} INTR_REG3_T;
+
+#define TOE_QH_FULL_INT_BIT(x)		BIT((x) - 32)	/* x: TOE queue 32..63 */
+
+/*
+ * Interrupt Status Register 4	(offset 0x0060)
+ * Interrupt Mask Register 4	(offset 0x0064)
+ * Interrupt Select Register 4	(offset 0x0068)
+ */
+typedef union {
+	unsigned char byte;
+	struct bit_0060 {
+		unsigned char status_changed	: 1;	/* Status Changed Intr for RGMII Mode */
+		unsigned char rx_overrun	: 1;   /* GMAC Rx FIFO overrun interrupt */
+		unsigned char tx_pause_off	: 1;	/* transmit pause off frame interrupt */
+		unsigned char rx_pause_off	: 1;	/* received pause off frame interrupt */
+		unsigned char tx_pause_on	: 1;	/* transmit pause on frame interrupt */
+		unsigned char rx_pause_on	: 1;	/* received pause on frame interrupt */
+		unsigned char cnt_full		: 1;	/* MIB counters half full interrupt */
+		unsigned char reserved		: 1;	/* reserved */
+	} __packed bits;
+} __packed GMAC_INTR_T;
+
+typedef union {
+	unsigned int bits32;
+	struct bit_0060_2 {
+		unsigned int    swfq_empty	: 1;	/* bit 0	Software Free Queue Empty Intr. */
+		unsigned int    hwfq_empty	: 1;	/* bit 1	Hardware Free Queue Empty Intr. */
+		unsigned int	class_qf_int	: 14;	/* bit 15:2 Classification Rx Queue13-0 Full Intr. */
+		GMAC_INTR_T	gmac0;
+		GMAC_INTR_T	gmac1;
+	} bits;
+} INTR_REG4_T;
+
+#define GMAC1_RESERVED_INT_BIT		BIT(31)
+#define GMAC1_MIB_INT_BIT		BIT(30)
+#define GMAC1_RX_PAUSE_ON_INT_BIT	BIT(29)
+#define GMAC1_TX_PAUSE_ON_INT_BIT	BIT(28)
+#define GMAC1_RX_PAUSE_OFF_INT_BIT	BIT(27)
+#define GMAC1_TX_PAUSE_OFF_INT_BIT	BIT(26)
+#define GMAC1_RX_OVERRUN_INT_BIT	BIT(25)
+#define GMAC1_STATUS_CHANGE_INT_BIT	BIT(24)
+#define GMAC0_RESERVED_INT_BIT		BIT(23)
+#define GMAC0_MIB_INT_BIT		BIT(22)
+#define GMAC0_RX_PAUSE_ON_INT_BIT	BIT(21)
+#define GMAC0_TX_PAUSE_ON_INT_BIT	BIT(20)
+#define GMAC0_RX_PAUSE_OFF_INT_BIT	BIT(19)
+#define GMAC0_TX_PAUSE_OFF_INT_BIT	BIT(18)
+#define GMAC0_RX_OVERRUN_INT_BIT	BIT(17)
+#define GMAC0_STATUS_CHANGE_INT_BIT	BIT(16)
+#define CLASS_RX_FULL_INT_BIT(x)	BIT((x) + 2)	/* class_qf_int field starts at bit 2 */
+#define HWFQ_EMPTY_INT_BIT		BIT(1)
+#define SWFQ_EMPTY_INT_BIT		BIT(0)
+
+#define GMAC0_INT_BITS		(GMAC0_RESERVED_INT_BIT | GMAC0_MIB_INT_BIT | \
+				 GMAC0_RX_PAUSE_ON_INT_BIT | GMAC0_TX_PAUSE_ON_INT_BIT |	\
+				 GMAC0_RX_PAUSE_OFF_INT_BIT | GMAC0_TX_PAUSE_OFF_INT_BIT |	\
+				 GMAC0_RX_OVERRUN_INT_BIT | GMAC0_STATUS_CHANGE_INT_BIT)
+#define GMAC1_INT_BITS		(GMAC1_RESERVED_INT_BIT | GMAC1_MIB_INT_BIT | \
+				 GMAC1_RX_PAUSE_ON_INT_BIT | GMAC1_TX_PAUSE_ON_INT_BIT |	\
+				 GMAC1_RX_PAUSE_OFF_INT_BIT | GMAC1_TX_PAUSE_OFF_INT_BIT |	\
+				 GMAC1_RX_OVERRUN_INT_BIT | GMAC1_STATUS_CHANGE_INT_BIT)
+
+#define CLASS_RX_FULL_INT_BITS		0xfffc
+
+/*
+ * GLOBAL_QUEUE_THRESHOLD_REG	(offset 0x0070)
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_0070_2 {
+		unsigned int    swfq_empty	: 8;	/*  7:0		Software Free Queue Empty Threshold */
+		unsigned int    hwfq_empty	: 8;	/* 15:8		Hardware Free Queue Empty Threshold */
+		unsigned int	intrq		: 8;	/* 23:16 */
+		unsigned int	toe_class	: 8;	/* 31:24 */
+	} bits;
+} QUEUE_THRESHOLD_T;
+
+
+/*
+ * GMAC DMA Control Register
+ * GMAC0 offset 0x8000
+ * GMAC1 offset 0xC000
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_8000 {
+		unsigned int	td_bus		: 2;	/* bit 1:0	Peripheral Bus Width */
+		unsigned int	td_burst_size	: 2;	/* bit 3:2	TxDMA max burst size for every AHB request */
+		unsigned int	td_prot		: 4;	/* bit 7:4	TxDMA protection control */
+		unsigned int	rd_bus		: 2;	/* bit 9:8	Peripheral Bus Width */
+		unsigned int	rd_burst_size	: 2;	/* bit 11:10	DMA max burst size for every AHB request */
+		unsigned int	rd_prot		: 4;	/* bit 15:12	DMA Protection Control */
+		unsigned int	rd_insert_bytes	: 2;	/* bit 17:16 */
+		unsigned int	reserved	: 10;	/* bit 27:18 */
+		unsigned int    drop_small_ack	: 1;	/* bit 28	1: Drop, 0: Accept */
+		unsigned int    loopback	: 1;	/* bit 29	Loopback TxDMA to RxDMA */
+		unsigned int    td_enable	: 1;	/* bit 30	Tx DMA Enable */
+		unsigned int    rd_enable	: 1;	/* bit 31	Rx DMA Enable */
+	} bits;
+} GMAC_DMA_CTRL_T;
+
+/*
+ * GMAC Tx Weighting Control Register 0
+ * GMAC0 offset 0x8004
+ * GMAC1 offset 0xC004
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_8004 {
+		unsigned int    hw_tq0		: 6;	/* bit 5:0	HW TX Queue 3 */
+		unsigned int    hw_tq1		: 6;	/* bit 11:6	HW TX Queue 2 */
+		unsigned int    hw_tq2		: 6;	/* bit 17:12	HW TX Queue 1 */
+		unsigned int    hw_tq3		: 6;	/* bit 23:18	HW TX Queue 0 */
+		unsigned int    reserved	: 8;	/* bit 31:24 */
+	} bits;
+} GMAC_TX_WCR0_T;	/* Weighting Control Register 0. NOTE(review): hw_tqN names and "Queue N" comments run in opposite order - confirm */
+
+/*
+ * GMAC Tx Weighting Control Register 1
+ * GMAC0 offset 0x8008
+ * GMAC1 offset 0xC008
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_8008 {
+		unsigned int    sw_tq0		: 5;	/* bit 4:0	SW TX Queue 0 */
+		unsigned int    sw_tq1		: 5;	/* bit 9:5	SW TX Queue 1 */
+		unsigned int    sw_tq2		: 5;	/* bit 14:10	SW TX Queue 2 */
+		unsigned int    sw_tq3		: 5;	/* bit 19:15	SW TX Queue 3 */
+		unsigned int    sw_tq4		: 5;	/* bit 24:20	SW TX Queue 4 */
+		unsigned int    sw_tq5		: 5;	/* bit 29:25	SW TX Queue 5 */
+		unsigned int    reserved	: 2;	/* bit 31:30 */
+	} bits;
+} GMAC_TX_WCR1_T;	/* Weighting Control Register 1 */
+
+/*
+ * Queue Read/Write Pointer
+ * GMAC SW TX Queue 0~5 Read/Write Pointer register
+ * GMAC0 offset 0x800C ~ 0x8020
+ * GMAC1 offset 0xC00C ~ 0xC020
+ * GMAC HW TX Queue 0~3 Read/Write Pointer register
+ * GMAC0 offset 0x8024 ~ 0x8030
+ * GMAC1 offset 0xC024 ~ 0xC030
+ *
+ * see DMA_RWPTR_T structure
+ */
+
+/*
+ * GMAC DMA Tx First Descriptor Address Register
+ * GMAC0 offset 0x8038
+ * GMAC1 offset 0xC038
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_8038 {
+		unsigned int reserved		:  3;
+		unsigned int td_busy		:  1;	/* bit 3	1: TxDMA busy; 0: TxDMA idle */
+		unsigned int td_first_des_ptr	: 28;	/* bit 31:4	first descriptor address */
+	} bits;
+} GMAC_TXDMA_FIRST_DESC_T;
+
+/*
+ * GMAC DMA Tx Current Descriptor Address Register
+ * GMAC0 offset 0x803C
+ * GMAC1 offset 0xC03C
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_803C {
+		unsigned int reserved		:  4;
+		unsigned int td_curr_desc_ptr	: 28;	/* bit 31:4	current descriptor address */
+	} bits;
+} GMAC_TXDMA_CURR_DESC_T;
+
+/*
+ * GMAC DMA Tx Descriptor Word 0 Register
+ * GMAC0 offset 0x8040
+ * GMAC1 offset 0xC040
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_8040 {
+		unsigned int buffer_size	: 16;	/* bit 15:0	Transfer size */
+		unsigned int desc_count		: 6;	/* bit 21:16	number of descriptors used for the current frame */
+		unsigned int status_tx_ok	: 1;	/* bit 22	Tx Status, 1: Successful 0: Failed */
+		unsigned int status_rvd		: 6;	/* bit 28:23	Tx Status, Reserved bits */
+		unsigned int perr		: 1;	/* bit 29	protocol error during processing this descriptor */
+		unsigned int derr		: 1;	/* bit 30	data error during processing this descriptor */
+		unsigned int reserved		: 1;	/* bit 31 */
+	} bits;
+} GMAC_TXDESC_0_T;
+
+/*
+ * GMAC DMA Tx Descriptor Word 1 Register
+ * GMAC0 offset 0x8044
+ * GMAC1 offset 0xC044
+ */
+typedef union {
+	unsigned int bits32;
+	struct txdesc_word1 {
+		unsigned int	byte_count	: 16;	/* bit 15: 0	Tx Frame Byte Count */
+		unsigned int	mtu_enable	: 1;	/* bit 16	TSS segmentation use MTU setting */
+		unsigned int	ip_chksum	: 1;	/* bit 17	IPV4 Header Checksum Enable */
+		unsigned int	ipv6_enable	: 1;	/* bit 18	IPV6 Tx Enable */
+		unsigned int	tcp_chksum	: 1;	/* bit 19	TCP Checksum Enable */
+		unsigned int	udp_chksum	: 1;	/* bit 20	UDP Checksum Enable */
+		unsigned int	bypass_tss	: 1;	/* bit 21 */
+		unsigned int	ip_fixed_len	: 1;	/* bit 22 */
+		unsigned int	reserved	: 9;	/* bit 31:23	Tx Flag, Reserved */
+	} bits;
+} GMAC_TXDESC_1_T;
+
+#define TSS_IP_FIXED_LEN_BIT	BIT(22)
+#define TSS_BYPASS_BIT		BIT(21)
+#define TSS_UDP_CHKSUM_BIT	BIT(20)
+#define TSS_TCP_CHKSUM_BIT	BIT(19)
+#define TSS_IPV6_ENABLE_BIT	BIT(18)
+#define TSS_IP_CHKSUM_BIT	BIT(17)
+#define TSS_MTU_ENABLE_BIT	BIT(16)
+
+#define TSS_CHECKUM_ENABLE	\
+	(TSS_IP_CHKSUM_BIT|TSS_IPV6_ENABLE_BIT| \
+	 TSS_TCP_CHKSUM_BIT|TSS_UDP_CHKSUM_BIT)
+
+/*
+ * GMAC DMA Tx Descriptor Word 2 Register
+ * GMAC0 offset 0x8048
+ * GMAC1 offset 0xC048
+ */
+typedef union {
+	unsigned int	bits32;
+	unsigned int	buf_adr;
+} GMAC_TXDESC_2_T;
+
+/*
+ * GMAC DMA Tx Descriptor Word 3 Register
+ * GMAC0 offset 0x804C
+ * GMAC1 offset 0xC04C
+ */
+typedef union {
+	unsigned int bits32;
+	struct txdesc_word3 {
+		unsigned int	mtu_size	: 11;	/* bit 10: 0	MTU size (cf. MTU_SIZE_BIT_MASK) */
+		unsigned int	reserved	: 18;	/* bit 28:11 */
+		unsigned int	eofie		: 1;	/* bit 29	End of frame interrupt enable */
+		unsigned int	sof_eof		: 2;	/* bit 31:30	11: only one, 10: first, 01: last, 00: linking */
+	} bits;
+} GMAC_TXDESC_3_T;
+#define SOF_EOF_BIT_MASK	0x3fffffff
+#define SOF_BIT			0x80000000
+#define EOF_BIT			0x40000000
+#define EOFIE_BIT		BIT(29)
+#define MTU_SIZE_BIT_MASK	0x7ff
+
+/*
+ * GMAC Tx Descriptor
+ */
+typedef struct {
+	GMAC_TXDESC_0_T	word0;
+	GMAC_TXDESC_1_T	word1;
+	GMAC_TXDESC_2_T	word2;
+	GMAC_TXDESC_3_T	word3;
+} GMAC_TXDESC_T;
+
+/*
+ * GMAC DMA Rx First Descriptor Address Register
+ * GMAC0 offset 0x8058
+ * GMAC1 offset 0xC058
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_8058 {
+		unsigned int reserved		:  3;	/* bit 2:0 */
+		unsigned int rd_busy		:  1;	/* bit 3	1-RxDMA busy; 0-RxDMA idle */
+		unsigned int rd_first_des_ptr	: 28;	/* bit 31:4 first descriptor address */
+	} bits;
+} GMAC_RXDMA_FIRST_DESC_T;
+
+/*
+ * GMAC DMA Rx Current Descriptor Address Register
+ * GMAC0 offset 0x805C
+ * GMAC1 offset 0xC05C
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_805C {
+		unsigned int reserved		:  4;	/* bit 3:0 */
+		unsigned int rd_curr_des_ptr	: 28;	/* bit 31:4 current descriptor address */
+	} bits;
+} GMAC_RXDMA_CURR_DESC_T;
+
+/*
+ * GMAC DMA Rx Descriptor Word 0 Register
+ * GMAC0 offset 0x8060
+ * GMAC1 offset 0xC060
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_8060 {
+		unsigned int buffer_size	: 16;	/* bit 15:0  Buffer size */
+		unsigned int desc_count		: 6;	/* bit 21:16 number of descriptors used for the current frame */
+		unsigned int status		: 4;	/* bit 25:22 Status of rx frame (cf. RX_STATUS_*) */
+		unsigned int chksum_status	: 3;	/* bit 28:26 Check Sum Status (cf. RX_CHKSUM_*) */
+		unsigned int perr		: 1;	/* bit 29	 protocol error during processing this descriptor */
+		unsigned int derr		: 1;	/* bit 30	 data error during processing this descriptor */
+		unsigned int drop		: 1;	/* bit 31	 TOE/CIS Queue Full dropped packet to default queue */
+	} bits;
+} GMAC_RXDESC_0_T;
+
+#define		GMAC_RXDESC_0_T_derr			BIT(30)
+#define		GMAC_RXDESC_0_T_perr			BIT(29)
+#define		GMAC_RXDESC_0_T_chksum_status(x)	BIT((x) + 26)	/* field starts at bit 26 */
+#define		GMAC_RXDESC_0_T_status(x)		BIT((x) + 22)	/* field starts at bit 22 */
+#define		GMAC_RXDESC_0_T_desc_count(x)		BIT((x) + 16)	/* field starts at bit 16 */
+
+#define	RX_CHKSUM_IP_UDP_TCP_OK			0
+#define	RX_CHKSUM_IP_OK_ONLY			1
+#define	RX_CHKSUM_NONE				2
+#define	RX_CHKSUM_IP_ERR_UNKNOWN		4
+#define	RX_CHKSUM_IP_ERR			5
+#define	RX_CHKSUM_TCP_UDP_ERR			6
+#define RX_CHKSUM_NUM				8
+
+#define RX_STATUS_GOOD_FRAME			0
+#define RX_STATUS_TOO_LONG_GOOD_CRC		1
+#define RX_STATUS_RUNT_FRAME			2
+#define RX_STATUS_SFD_NOT_FOUND			3
+#define RX_STATUS_CRC_ERROR			4
+#define RX_STATUS_TOO_LONG_BAD_CRC		5
+#define RX_STATUS_ALIGNMENT_ERROR		6
+#define RX_STATUS_TOO_LONG_BAD_ALIGN		7
+#define RX_STATUS_RX_ERR			8
+#define RX_STATUS_DA_FILTERED			9
+#define RX_STATUS_BUFFER_FULL			10
+#define RX_STATUS_NUM				16
+
+#define RX_ERROR_LENGTH(s) \
+	((s) == RX_STATUS_TOO_LONG_GOOD_CRC || \
+	 (s) == RX_STATUS_TOO_LONG_BAD_CRC || \
+	 (s) == RX_STATUS_TOO_LONG_BAD_ALIGN)
+#define RX_ERROR_OVER(s) \
+	((s) == RX_STATUS_BUFFER_FULL)
+#define RX_ERROR_CRC(s) \
+	((s) == RX_STATUS_CRC_ERROR || \
+	 (s) == RX_STATUS_TOO_LONG_BAD_CRC)
+#define RX_ERROR_FRAME(s) \
+	((s) == RX_STATUS_ALIGNMENT_ERROR || \
+	 (s) == RX_STATUS_TOO_LONG_BAD_ALIGN)
+#define RX_ERROR_FIFO(s) \
+	(0)
+
+/*
+ * GMAC DMA Rx Descriptor Word 1 Register
+ * GMAC0 offset 0x8064
+ * GMAC1 offset 0xC064
+ */
+typedef union {
+	unsigned int bits32;
+	struct rxdesc_word1 {
+		unsigned int	byte_count	: 16;	/* bit 15: 0	Rx Frame Byte Count */
+		unsigned int	sw_id		: 16;	/* bit 31:16	Software ID */
+	} bits;
+} GMAC_RXDESC_1_T;
+
+/*
+ * GMAC DMA Rx Descriptor Word 2 Register
+ * GMAC0 offset 0x8068
+ * GMAC1 offset 0xC068
+ */
+typedef union {
+	unsigned int	bits32;
+	unsigned int	buf_adr;
+} GMAC_RXDESC_2_T;
+
+#define RX_INSERT_NONE		0
+#define RX_INSERT_1_BYTE	1
+#define RX_INSERT_2_BYTE	2
+#define RX_INSERT_3_BYTE	3
+
+/*
+ * GMAC DMA Rx Descriptor Word 3 Register
+ * GMAC0 offset 0x806C
+ * GMAC1 offset 0xC06C
+ */
+typedef union {
+	unsigned int bits32;
+	struct rxdesc_word3 {
+		unsigned int	l3_offset	: 8;	/* bit 7: 0	L3 data offset */
+		unsigned int	l4_offset	: 8;	/* bit 15: 8	L4 data offset */
+		unsigned int	l7_offset	: 8;	/* bit 23: 16	L7 data offset */
+		unsigned int	dup_ack		: 1;	/* bit 24	Duplicated ACK detected */
+		unsigned int	abnormal	: 1;	/* bit 25	abnormal case found */
+		unsigned int	option		: 1;	/* bit 26	IPV4 option or IPV6 extension header */
+		unsigned int	out_of_seq	: 1;	/* bit 27	Out of Sequence packet */
+		unsigned int	ctrl_flag	: 1;	/* bit 28	Control Flag is present */
+		unsigned int	eofie		: 1;	/* bit 29	End of frame interrupt enable */
+		unsigned int	sof_eof		: 2;	/* bit 31:30	11: only one, 10: first, 01: last, 00: linking */
+	} bits;
+} GMAC_RXDESC_3_T;
+
+/*
+ * GMAC Rx Descriptor
+ */
+typedef struct {
+	GMAC_RXDESC_0_T	word0;
+	GMAC_RXDESC_1_T	word1;
+	GMAC_RXDESC_2_T	word2;
+	GMAC_RXDESC_3_T	word3;
+} GMAC_RXDESC_T;
+
+/*
+ * GMAC Hash Engine Enable/Action Register 0 Offset Register
+ * GMAC0 offset 0x8070
+ * GMAC1 offset 0xC070
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_8070 {
+		unsigned int	mr0hel		: 6;	/* bit 5:0	match rule 0 hash entry size */
+		unsigned int	mr0_action	: 5;	/* bit 10:6	Matching Rule 0 action offset */
+		unsigned int	reserved0	: 4;	/* bit 14:11 */
+		unsigned int	mr0en		: 1;	/* bit 15	Enable Matching Rule 0 */
+		unsigned int	mr1hel		: 6;	/* bit 21:16	match rule 1 hash entry size */
+		unsigned int	mr1_action	: 5;	/* bit 26:22	Matching Rule 1 action offset */
+		unsigned int	timing		: 3;	/* bit 29:27 */
+		unsigned int	reserved1	: 1;	/* bit 30 */
+		unsigned int	mr1en		: 1;	/* bit 31	Enable Matching Rule 1 */
+	} bits;
+} GMAC_HASH_ENABLE_REG0_T;
+
+/*
+ * GMAC Hash Engine Enable/Action Register 1 Offset Register
+ * GMAC0 offset 0x8074
+ * GMAC1 offset 0xC074
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_8074 {
+		unsigned int	mr2hel		: 6;	/* bit 5:0	match rule 2 hash entry size */
+		unsigned int	mr2_action	: 5;	/* bit 10:6	Matching Rule 2 action offset */
+		unsigned int	reserved2	: 4;	/* bit 14:11 */
+		unsigned int	mr2en		: 1;	/* bit 15	Enable Matching Rule 2 */
+		unsigned int	mr3hel		: 6;	/* bit 21:16	match rule 3 hash entry size */
+		unsigned int	mr3_action	: 5;	/* bit 26:22	Matching Rule 3 action offset */
+		unsigned int	reserved1	: 4;	/* bit 30:27 */
+		unsigned int	mr3en		: 1;	/* bit 31	Enable Matching Rule 3 */
+	} bits;
+} GMAC_HASH_ENABLE_REG1_T;
+
+/*
+ * GMAC Matching Rule Control Register 0
+ * GMAC0 offset 0x8078
+ * GMAC1 offset 0xC078
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_8078 {
+		unsigned int	sprx		: 8;	/* bit 7:0	Support Protocol Register 7:0 */
+		unsigned int	reserved2	: 4;	/* bit 11:8 */
+		unsigned int	tos_traffic	: 1;	/* bit 12	IPV4 TOS or IPV6 Traffic Class */
+		unsigned int	flow_lable	: 1;	/* bit 13	IPV6 Flow label */
+		unsigned int	ip_hdr_len	: 1;	/* bit 14	IPV4 Header length */
+		unsigned int	ip_version	: 1;	/* bit 15	0: IPV4, 1: IPV6 */
+		unsigned int	reserved1	: 3;	/* bit 18:16 */
+		unsigned int	pppoe		: 1;	/* bit 19	PPPoE Session ID enable */
+		unsigned int	vlan		: 1;	/* bit 20	VLAN ID enable */
+		unsigned int	ether_type	: 1;	/* bit 21	Ethernet type enable */
+		unsigned int	sa		: 1;	/* bit 22	MAC SA enable */
+		unsigned int	da		: 1;	/* bit 23	MAC DA enable */
+		unsigned int	priority	: 3;	/* bit 26:24	priority if multi-rules matched */
+		unsigned int	port		: 1;	/* bit 27	PORT ID matching enable */
+		unsigned int	l7		: 1;	/* bit 28	L7 matching enable */
+		unsigned int	l4		: 1;	/* bit 29	L4 matching enable */
+		unsigned int	l3		: 1;	/* bit 30	L3 matching enable */
+		unsigned int	l2		: 1;	/* bit 31	L2 matching enable */
+	} bits;
+} GMAC_MRxCR0_T;
+
+#define MR_L2_BIT		BIT(31)
+#define MR_L3_BIT		BIT(30)
+#define MR_L4_BIT		BIT(29)
+#define MR_L7_BIT		BIT(28)
+#define MR_PORT_BIT		BIT(27)
+#define MR_PRIORITY_BIT		BIT(26)
+#define MR_DA_BIT		BIT(23)
+#define MR_SA_BIT		BIT(22)
+#define MR_ETHER_TYPE_BIT	BIT(21)
+#define MR_VLAN_BIT		BIT(20)
+#define MR_PPPOE_BIT		BIT(19)
+#define MR_IP_VER_BIT		BIT(15)
+#define MR_IP_HDR_LEN_BIT	BIT(14)
+#define MR_FLOW_LABLE_BIT	BIT(13)
+#define MR_TOS_TRAFFIC_BIT	BIT(12)
+#define MR_SPR_BIT(x)		BIT(x)
+#define MR_SPR_BITS		0xff
+
+/*
+ * GMAC Matching Rule Control Register 1
+ * GMAC0 offset 0x807C
+ * GMAC1 offset 0xC07C
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_807C {
+		unsigned int    l4_byte0_15	: 16;	/* bit 15: 0 */
+		unsigned int	dip_netmask	: 7;	/* bit 22:16	Dest IP net mask, number of mask bits */
+		unsigned int	dip		: 1;	/* bit 23		Dest IP */
+		unsigned int	sip_netmask	: 7;	/* bit 30:24	Srce IP net mask, number of mask bits */
+		unsigned int	sip		: 1;	/* bit 31		Srce IP */
+	} bits;
+} GMAC_MRxCR1_T;
+
+/*
+ * GMAC Matching Rule Control Register 2
+ * GMAC0 offset 0x8080
+ * GMAC1 offset 0xC080
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_8080 {
+		unsigned int    l7_byte0_23	: 24;	/* bit 23:0 */
+		unsigned int    l4_byte16_24	: 8;	/* bit 31: 24 */
+	} bits;
+} GMAC_MRxCR2_T;
+
+/*
+ * GMAC Support registers
+ * GMAC0 offset 0x80A8
+ * GMAC1 offset 0xC0A8
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_80A8 {
+		unsigned int    protocol	: 8;	/* bit 7:0		Supported protocol */
+		unsigned int    swap		: 3;	/* bit 10:8		Swap */
+		unsigned int    reserved	: 21;	/* bit 31:11 */
+	} bits;
+} GMAC_SPR_T;
+
+/*
+ * GMAC_AHB_WEIGHT registers
+ * GMAC0 offset 0x80C8
+ * GMAC1 offset 0xC0C8
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_80C8 {
+		unsigned int    hash_weight	: 5;	/* 4:0 */
+		unsigned int    rx_weight	: 5;	/* 9:5 */
+		unsigned int    tx_weight	: 5;	/* 14:10 */
+		unsigned int    pre_req		: 5;	/* 19:15 Rx Data Pre Request FIFO Threshold */
+		unsigned int    tqDV_threshold	: 5;	/* 24:20 DMA TqCtrl to Start tqDV FIFO Threshold */
+		unsigned int    reserved	: 7;	/* 31:25 */
+	} bits;
+} GMAC_AHB_WEIGHT_T;
+
+/*
+ * the register structure of GMAC
+ */
+
+/*
+ * GMAC RX FLTR
+ * GMAC0 Offset 0xA00C
+ * GMAC1 Offset 0xE00C
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit1_000c {
+		unsigned int unicast		:  1;	/* enable receive of unicast frames that are sent to STA address */
+		unsigned int multicast		:  1;	/* enable receive of multicast frames that pass multicast filter */
+		unsigned int broadcast		:  1;	/* enable receive of broadcast frames */
+		unsigned int promiscuous	:  1;   /* enable receive of all frames */
+		unsigned int error		:  1;	/* enable receive of all error frames */
+		unsigned int			: 27;
+	} bits;
+} GMAC_RX_FLTR_T;
+
+/*
+ * GMAC Configuration 0
+ * GMAC0 Offset 0xA018
+ * GMAC1 Offset 0xE018
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit1_0018 {
+		unsigned int dis_tx		:  1;	/* 0: disable transmit */
+		unsigned int dis_rx		:  1;	/* 1: disable receive */
+		unsigned int loop_back		:  1;	/* 2: transmit data loopback enable */
+		unsigned int flow_ctrl		:  1;   /* 3: flow control also triggered by Rx queues */
+		unsigned int adj_ifg		:  4;	/* 4-7: adjust IFG from 96+/-56 */
+		unsigned int max_len		:  3;	/* 8-10 maximum receive frame length allowed */
+		unsigned int dis_bkoff		:  1;	/* 11: disable back-off function */
+		unsigned int dis_col		:  1;	/* 12: disable 16 collisions abort function */
+		unsigned int sim_test		:  1;	/* 13: speed up timers in simulation */
+		unsigned int rx_fc_en		:  1;	/* 14: RX flow control enable */
+		unsigned int tx_fc_en		:  1;	/* 15: TX flow control enable */
+		unsigned int rgmii_en		:  1;   /* 16: RGMII in-band status enable */
+		unsigned int ipv4_rx_chksum	:  1;   /* 17: IPv4 RX Checksum enable */
+		unsigned int ipv6_rx_chksum	:  1;   /* 18: IPv6 RX Checksum enable */
+		unsigned int rx_tag_remove	:  1;   /* 19: Remove Rx VLAN tag */
+		unsigned int rgmm_edge		:  1;	/* 20 */
+		unsigned int rxc_inv		:  1;	/* 21 */
+		unsigned int ipv6_exthdr_order	:  1;	/* 22 */
+		unsigned int rx_err_detect	:  1;	/* 23 */
+		unsigned int port0_chk_hwq	:  1;	/* 24 */
+		unsigned int port1_chk_hwq	:  1;	/* 25 */
+		unsigned int port0_chk_toeq	:  1;	/* 26 */
+		unsigned int port1_chk_toeq	:  1;	/* 27 */
+		unsigned int port0_chk_classq	:  1;	/* 28 */
+		unsigned int port1_chk_classq	:  1;	/* 29 */
+		unsigned int reserved		:  2;	/* 30-31 */
+	} bits;
+} GMAC_CONFIG0_T;
+
+#define CONFIG0_TX_RX_DISABLE	(BIT(1)|BIT(0))
+#define CONFIG0_RX_CHKSUM	(BIT(18)|BIT(17))
+#define CONFIG0_FLOW_RX		(BIT(14))
+#define CONFIG0_FLOW_TX		(BIT(15))
+#define CONFIG0_FLOW_TX_RX	(BIT(14)|BIT(15))
+#define CONFIG0_FLOW_CTL	(BIT(14)|BIT(15))
+
+#define CONFIG0_MAXLEN_SHIFT	8
+#define CONFIG0_MAXLEN_MASK	(7 << CONFIG0_MAXLEN_SHIFT)
+#define  CONFIG0_MAXLEN_1536	0
+#define  CONFIG0_MAXLEN_1518	1
+#define  CONFIG0_MAXLEN_1522	2
+#define  CONFIG0_MAXLEN_1542	3
+#define  CONFIG0_MAXLEN_9k	4	/* 9212 */
+#define  CONFIG0_MAXLEN_10k	5	/* 10236 */
+#define  CONFIG0_MAXLEN_1518__6	6
+#define  CONFIG0_MAXLEN_1518__7	7
+
+/*
+ * GMAC Configuration 1
+ * GMAC0 Offset 0xA01C
+ * GMAC1 Offset 0xE01C
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit1_001c {
+		unsigned int set_threshold	: 8;	/* flow control set threshold */
+		unsigned int rel_threshold	: 8;	/* flow control release threshold */
+		unsigned int reserved		: 16;
+	} bits;
+} GMAC_CONFIG1_T;
+
+#define GMAC_FLOWCTRL_SET_MAX		32
+#define GMAC_FLOWCTRL_SET_MIN		0
+#define GMAC_FLOWCTRL_RELEASE_MAX	32
+#define GMAC_FLOWCTRL_RELEASE_MIN	0
+
+/*
+ * GMAC Configuration 2
+ * GMAC0 Offset 0xA020
+ * GMAC1 Offset 0xE020
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit1_0020 {
+		unsigned int set_threshold	: 16;	/* flow control set threshold */
+		unsigned int rel_threshold	: 16;	/* flow control release threshold */
+	} bits;
+} GMAC_CONFIG2_T;
+
+/*
+ * GMAC Configuration 3
+ * GMAC0 Offset 0xA024
+ * GMAC1 Offset 0xE024
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit1_0024 {
+		unsigned int set_threshold	: 16;	/* flow control set threshold */
+		unsigned int rel_threshold	: 16;	/* flow control release threshold */
+	} bits;
+} GMAC_CONFIG3_T;
+
+
+/*
+ * GMAC STATUS
+ * GMAC0 Offset 0xA02C
+ * GMAC1 Offset 0xE02C
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit1_002c {
+		unsigned int link		:  1;	/* link status */
+		unsigned int speed		:  2;	/* link speed(00->2.5M 01->25M 10->125M) */
+		unsigned int duplex		:  1;	/* duplex mode */
+		unsigned int reserved		:  1;
+		unsigned int mii_rmii		:  2;   /* PHY interface type */
+		unsigned int			: 25;
+	} bits;
+} GMAC_STATUS_T;
+
+#define GMAC_SPEED_10			0
+#define GMAC_SPEED_100			1
+#define GMAC_SPEED_1000			2
+
+#define GMAC_PHY_MII			0
+#define GMAC_PHY_GMII			1
+#define GMAC_PHY_RGMII_100_10		2
+#define GMAC_PHY_RGMII_1000		3
+
+/*
+ * Queue Header
+ *	(1) TOE Queue Header
+ *	(2) Non-TOE Queue Header
+ *	(3) Interrupt Queue Header
+ *
+ * memory Layout
+ *	TOE Queue Header
+ *		     0x60003000 +---------------------------+ 0x0000
+ *				|     TOE Queue 0 Header    |
+ *				|         8 * 4 Bytes	    |
+ *				+---------------------------+ 0x0020
+ *				|     TOE Queue 1 Header    |
+ *				|         8 * 4 Bytes	    |
+ *				+---------------------------+ 0x0040
+ *				|          ......           |
+ *				|                           |
+ *				+---------------------------+
+ *
+ *	Non TOE Queue Header
+ *		     0x60002000 +---------------------------+ 0x0000
+ *				|   Default Queue 0 Header  |
+ *				|         2 * 4 Bytes       |
+ *				+---------------------------+ 0x0008
+ *				|   Default Queue 1 Header  |
+ *				|         2 * 4 Bytes       |
+ *				+---------------------------+ 0x0010
+ *				|   Classification Queue 0  |
+ *				|	  2 * 4 Bytes       |
+ *				+---------------------------+
+ *				|   Classification Queue 1  |
+ *				|	  2 * 4 Bytes       |
+ *				+---------------------------+ (n * 8 + 0x10)
+ *				|		...	    |
+ *				|	  2 * 4 Bytes	    |
+ *				+---------------------------+ (13 * 8 + 0x10)
+ *				|   Classification Queue 13 |
+ *				|	  2 * 4 Bytes	    |
+ *				+---------------------------+ 0x80
+ *				|      Interrupt Queue 0    |
+ *				|	  2 * 4 Bytes	    |
+ *				+---------------------------+
+ *				|      Interrupt Queue 1    |
+ *				|	  2 * 4 Bytes	    |
+ *				+---------------------------+
+ *				|      Interrupt Queue 2    |
+ *				|	  2 * 4 Bytes	    |
+ *				+---------------------------+
+ *				|      Interrupt Queue 3    |
+ *				|	  2 * 4 Bytes	    |
+ *				+---------------------------+
+ *
+ */
+#define TOE_QUEUE_HDR_ADDR(n)		(TOE_TOE_QUE_HDR_BASE + (n) * 32)	/* TOE queue n header: 8 * 4 bytes each */
+#define TOE_Q_HDR_AREA_END		(TOE_QUEUE_HDR_ADDR(TOE_TOE_QUEUE_MAX + 1))	/* first address past the last TOE queue header */
+#define TOE_DEFAULT_Q_HDR_BASE(x)	(TOE_NONTOE_QUE_HDR_BASE + 0x08 * (x))	/* default queue x header: 2 * 4 bytes each */
+#define TOE_CLASS_Q_HDR_BASE		(TOE_NONTOE_QUE_HDR_BASE + 0x10)	/* classification queue 0 header */
+#define TOE_INTR_Q_HDR_BASE		(TOE_NONTOE_QUE_HDR_BASE + 0x80)	/* interrupt queue 0 header */
+#define INTERRUPT_QUEUE_HDR_ADDR(n)	(TOE_INTR_Q_HDR_BASE + (n) * 8)	/* n is parenthesized so expression arguments scale correctly */
+#define NONTOE_Q_HDR_AREA_END		(INTERRUPT_QUEUE_HDR_ADDR(TOE_INTR_QUEUE_MAX + 1))	/* first address past the last interrupt queue header */
+/*
+ * TOE Queue Header Word 0
+ */
+typedef union {
+	unsigned int bits32;
+	unsigned int base_size;
+} TOE_QHDR0_T;
+
+#define TOE_QHDR0_BASE_MASK	(~0x0f)
+
+/*
+ * TOE Queue Header Word 1
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_qhdr1 {
+		unsigned int rptr	: 16;	/* bit 15:0 */
+		unsigned int wptr	: 16;	/* bit 31:16 */
+	} bits;
+} TOE_QHDR1_T;
+
+/*
+ * TOE Queue Header Word 2
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_qhdr2 {
+		unsigned int TotalPktSize	: 17;	/* bit 16: 0	Total packet size */
+		unsigned int reserved		: 7;	/* bit 23:17 */
+		unsigned int dack		: 1;	/* bit 24	1: Duplicated ACK */
+		unsigned int abn		: 1;	/* bit 25	1: Abnormal case Found */
+		unsigned int tcp_opt		: 1;	/* bit 26	1: Have TCP option */
+		unsigned int ip_opt		: 1;	/* bit 27	1: have IPV4 option or IPV6 Extension header */
+		unsigned int sat		: 1;	/* bit 28	1: SeqCnt > SeqThreshold, or AckCnt > AckThreshold */
+		unsigned int osq		: 1;	/* bit 29	1: out of sequence */
+		unsigned int ctl		: 1;	/* bit 30	1: have control flag bits (except ack) */
+		unsigned int usd		: 1;	/* bit 31	0: if no data assembled yet */
+	} bits;
+} TOE_QHDR2_T;
+
+/*
+ * TOE Queue Header Word 3
+ */
+typedef union {
+	unsigned int bits32;
+	unsigned int seq_num;
+} TOE_QHDR3_T;
+
+/*
+ * TOE Queue Header Word 4
+ */
+typedef union {
+	unsigned int bits32;
+	unsigned int ack_num;
+} TOE_QHDR4_T;
+
+/*
+ * TOE Queue Header Word 5
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_qhdr5 {
+		unsigned int AckCnt	: 16;	/* bit 15:0 */
+		unsigned int SeqCnt	: 16;	/* bit 31:16 */
+	} bits;
+} TOE_QHDR5_T;
+
+/*
+ * TOE Queue Header Word 6
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_qhdr6 {
+		unsigned int WinSize	: 16;	/* bit 15:0 */
+		unsigned int iq_num	: 2;	/* bit 17:16 */
+		unsigned int MaxPktSize	: 14;	/* bit 31:18 */
+	} bits;
+} TOE_QHDR6_T;
+
+/*
+ * TOE Queue Header Word 7
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_qhdr7 {
+		unsigned int AckThreshold	: 16;	/* bit 15:0 */
+		unsigned int SeqThreshold	: 16;	/* bit 31:16 */
+	} bits;
+} TOE_QHDR7_T;
+
+/*
+ * TOE Queue Header
+ */
+typedef struct {
+	TOE_QHDR0_T		word0;
+	TOE_QHDR1_T		word1;
+	TOE_QHDR2_T		word2;
+	TOE_QHDR3_T		word3;
+	TOE_QHDR4_T		word4;
+	TOE_QHDR5_T		word5;
+	TOE_QHDR6_T		word6;
+	TOE_QHDR7_T		word7;
+} TOE_QHDR_T;
+
+/*
+ * NONTOE Queue Header Word 0
+ */
+typedef union {
+	unsigned int bits32;
+	unsigned int base_size;
+} NONTOE_QHDR0_T;
+
+#define NONTOE_QHDR0_BASE_MASK	(~0x0f)
+
+/*
+ * NONTOE Queue Header Word 1
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_nonqhdr1 {
+		unsigned int rptr	: 16;	/* bit 15:0 */
+		unsigned int wptr	: 16;	/* bit 31:16 */
+	} bits;
+} NONTOE_QHDR1_T;
+
+/*
+ * Non-TOE Queue Header
+ */
+typedef struct {
+	NONTOE_QHDR0_T		word0;
+	NONTOE_QHDR1_T		word1;
+} NONTOE_QHDR_T;
+
+/*
+ * Interrupt Queue Header Word 0
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_intrqhdr0 {
+		unsigned int win_size	: 16;	/* bit 15:0	Descriptor Ring Size */
+		unsigned int wptr	: 16;	/* bit 31:16	Write Pointer where hw stopped */
+	} bits;
+} INTR_QHDR0_T;
+
+/*
+ * Interrupt Queue Header Word 1
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_intrqhdr1 {
+		unsigned int TotalPktSize	: 17;	/* bit 16: 0	Total packet size */
+		unsigned int tcp_qid		: 8;	/* bit 24:17	TCP Queue ID */
+		unsigned int dack		: 1;	/* bit 25	1: Duplicated ACK */
+		unsigned int abn		: 1;	/* bit 26	1: Abnormal case Found */
+		unsigned int tcp_opt		: 1;	/* bit 27	1: Have TCP option */
+		unsigned int ip_opt		: 1;	/* bit 28	1: have IPV4 option or IPV6 Extension header */
+		unsigned int sat		: 1;	/* bit 29	1: SeqCnt > SeqThreshold, or AckCnt > AckThreshold */
+		unsigned int osq		: 1;	/* bit 30	1: out of sequence */
+		unsigned int ctl		: 1;	/* bit 31	1: have control flag bits (except ack) */
+	} bits;
+} INTR_QHDR1_T;
+
+/*
+ * Interrupt Queue Header Word 2
+ */
+typedef union {
+	unsigned int bits32;
+	unsigned int seq_num;
+} INTR_QHDR2_T;
+
+/*
+ * Interrupt Queue Header Word 3
+ */
+typedef union {
+	unsigned int bits32;
+	unsigned int ack_num;
+} INTR_QHDR3_T;
+
+/*
+ * Interrupt Queue Header Word 4
+ */
+typedef union {
+	unsigned int bits32;
+	struct bit_intrqhdr4 {
+		unsigned int AckCnt		: 16;	/* bit 15:0	Ack# change since last ack# intr. */
+		unsigned int SeqCnt		: 16;	/* bit 31:16	Seq# change since last seq# intr. */
+	} bits;
+} INTR_QHDR4_T;
+
+/*
+ * Interrupt Queue Header
+ */
+typedef struct {
+	INTR_QHDR0_T		word0;
+	INTR_QHDR1_T		word1;
+	INTR_QHDR2_T		word2;
+	INTR_QHDR3_T		word3;
+	INTR_QHDR4_T		word4;
+	unsigned int		word5;
+	unsigned int		word6;
+	unsigned int		word7;
+} INTR_QHDR_T;
+
+#endif /* _GMAC_SL351x_H */


^ permalink raw reply related

* Re: [PATCH 03/10] net/fec: add mac field into platform data and consolidate fec_get_mac
From: Uwe Kleine-König @ 2010-12-30  8:04 UTC (permalink / raw)
  To: Shawn Guo
  Cc: Baruch Siach, davem, gerg, eric, bryan.wu, r64343, B32542, lw,
	w.sang, s.hauer, linux-arm-kernel, netdev
In-Reply-To: <20101230021243.GA20155@freescale.com>

Hello Shawn,

On Thu, Dec 30, 2010 at 10:12:44AM +0800, Shawn Guo wrote:
> On Wed, Dec 29, 2010 at 01:42:21PM +0100, Uwe Kleine-König wrote:
> > On Wed, Dec 29, 2010 at 07:58:09PM +0800, Shawn Guo wrote:
> > > On Wed, Dec 29, 2010 at 11:31:38AM +0100, Uwe Kleine-König wrote:
> > > > On Wed, Dec 29, 2010 at 06:05:21PM +0800, Shawn Guo wrote:
> > > > > On Wed, Dec 29, 2010 at 08:53:30AM +0200, Baruch Siach wrote:
> > > > > 	if (iap == fec_mac_default)
> > > > > 		dev->dev_addr[ETH_ALEN-1] = fec_mac_default[ETH_ALEN-1] + fep->pdev->id;
> > > > Can this overflow?  (I didn't check the code, so my concern might be
> > > > completely stupid here.)
> > > No. dev->dev_addr points to netdev_hw_addr->addr, which is a 32 bytes array.
> > I didn't mean an out-of-bound access, but what happens if
> > fec_mac_default[ETH_ALEN-1] is 0xff and you add 1?  Does that result in
> > 0x100 or 0?  What if id is <0?  For big ids you might even handle a
> > carry to indices <ETH_ALEN-2.
> > 
> First of all, all my patch did was change fep->index to
> fep->pdev->id, which should not introduce any of the problems you are concerned about.
> 
> Secondly, I do not understand how the overflow on 
> fec_mac_default[ETH_ALEN-1] can result in a carry on the next array
> element. Here is what I'm seeing with fec_mac=00:04:9f:01:30:ff.
There is no automatic carry to the next array element.  I just wondered
how overflow should be handled ...
 
> eth0      Link encap:Ethernet  HWaddr 00:04:9F:01:30:FF
> eth1      Link encap:Ethernet  HWaddr 00:04:9F:01:30:00
If this is intended, it's totally OK for me.

Best regards
Uwe

-- 
Pengutronix e.K.                           | Uwe Kleine-König            |
Industrial Linux Solutions                 | http://www.pengutronix.de/  |

^ permalink raw reply

* [PATCH] mac80211: fix mesh forwarding when ratelimited too
From: Milton Miller @ 2010-12-30  8:01 UTC (permalink / raw)
  To: John W. Linville, Johannes Berg, Javier Cardona
  Cc: David S. Miller, linux-wireless, netdev, linux-kernel
In-Reply-To: <201012262159.oBQLxOsw008865@hera.kernel.org>

Commit b51aff057c9d0ef6c529dc25fd9f775faf7b6c63 said:

    Under memory pressure, the mac80211 mesh code
    may helpfully print a message that it failed
    to clone a mesh frame and then will proceed
    to crash trying to use it anyway. Fix that.
    
Avoid the reference whenever the frame copy is unsuccessful
regardless of the debug message being suppressed or printed.

Cc: stable@kernel.org [2.6.27+] 
Signed-off-by: Milton Miller <miltonm@bga.com>
---
I chose a separate if vs nesting the ratelimit check to avoid shifting
the printk further to the right.

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index b01e467..e98668f 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1788,11 +1788,11 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
 
 			fwd_skb = skb_copy(skb, GFP_ATOMIC);
 
-			if (!fwd_skb && net_ratelimit()) {
+			if (!fwd_skb && net_ratelimit())
 				printk(KERN_DEBUG "%s: failed to clone mesh frame\n",
 						   sdata->name);
+			if (!fwd_skb)
 				goto out;
-			}
 
 			fwd_hdr =  (struct ieee80211_hdr *) fwd_skb->data;
 			memcpy(fwd_hdr->addr2, sdata->vif.addr, ETH_ALEN);

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox