Netdev List
 help / color / mirror / Atom feed
* [PATCH 07/14] batman-adv: split neigh_new function into generic and batman iv specific parts
From: Antonio Quartulli @ 2012-05-09 11:12 UTC (permalink / raw)
  To: davem; +Cc: netdev, b.a.t.m.a.n, Marek Lindner, Antonio Quartulli
In-Reply-To: <1336561976-16088-1-git-send-email-ordex@autistici.org>

From: Marek Lindner <lindner_marek@yahoo.de>

Signed-off-by: Marek Lindner <lindner_marek@yahoo.de>
Acked-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Antonio Quartulli <ordex@autistici.org>
---
 net/batman-adv/bat_iv_ogm.c |   40 ++++++++++++++++++++++++++++++++++------
 net/batman-adv/originator.c |   27 ++++++++++-----------------
 net/batman-adv/originator.h |    6 ++----
 3 files changed, 46 insertions(+), 27 deletions(-)

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 8652a75..4baabf9 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -30,6 +30,32 @@
 #include "send.h"
 #include "bat_algo.h"
 
+static struct neigh_node *bat_iv_ogm_neigh_new(struct hard_iface *hard_iface,
+					       const uint8_t *neigh_addr,
+					       struct orig_node *orig_node,
+					       struct orig_node *orig_neigh,
+					       uint32_t seqno)
+{
+	struct neigh_node *neigh_node;
+
+	neigh_node = neigh_node_new(hard_iface, neigh_addr, seqno);
+	if (!neigh_node)
+		goto out;
+
+	INIT_LIST_HEAD(&neigh_node->bonding_list);
+	spin_lock_init(&neigh_node->tq_lock);
+
+	neigh_node->orig_node = orig_neigh;
+	neigh_node->if_incoming = hard_iface;
+
+	spin_lock_bh(&orig_node->neigh_list_lock);
+	hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list);
+	spin_unlock_bh(&orig_node->neigh_list_lock);
+
+out:
+	return neigh_node;
+}
+
 static int bat_iv_ogm_iface_enable(struct hard_iface *hard_iface)
 {
 	struct batman_ogm_packet *batman_ogm_packet;
@@ -638,8 +664,9 @@ static void bat_iv_ogm_orig_update(struct bat_priv *bat_priv,
 		if (!orig_tmp)
 			goto unlock;
 
-		neigh_node = create_neighbor(orig_node, orig_tmp,
-					     ethhdr->h_source, if_incoming);
+		neigh_node = bat_iv_ogm_neigh_new(if_incoming, ethhdr->h_source,
+						  orig_node, orig_tmp,
+						  batman_ogm_packet->seqno);
 
 		orig_node_free_ref(orig_tmp);
 		if (!neigh_node)
@@ -764,10 +791,11 @@ static int bat_iv_ogm_calc_tq(struct orig_node *orig_node,
 	rcu_read_unlock();
 
 	if (!neigh_node)
-		neigh_node = create_neighbor(orig_neigh_node,
-					     orig_neigh_node,
-					     orig_neigh_node->orig,
-					     if_incoming);
+		neigh_node = bat_iv_ogm_neigh_new(if_incoming,
+						  orig_neigh_node->orig,
+						  orig_neigh_node,
+						  orig_neigh_node,
+						  batman_ogm_packet->seqno);
 
 	if (!neigh_node)
 		goto out;
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 962636b..1bcc9c5 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -85,35 +85,28 @@ struct neigh_node *orig_node_get_router(struct orig_node *orig_node)
 	return router;
 }
 
-struct neigh_node *create_neighbor(struct orig_node *orig_node,
-				   struct orig_node *orig_neigh_node,
-				   const uint8_t *neigh,
-				   struct hard_iface *if_incoming)
+struct neigh_node *neigh_node_new(struct hard_iface *hard_iface,
+				  const uint8_t *neigh_addr, uint32_t seqno)
 {
-	struct bat_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+	struct bat_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
 	struct neigh_node *neigh_node;
 
-	bat_dbg(DBG_BATMAN, bat_priv,
-		"Creating new last-hop neighbor of originator\n");
-
 	neigh_node = kzalloc(sizeof(*neigh_node), GFP_ATOMIC);
 	if (!neigh_node)
-		return NULL;
+		goto out;
 
 	INIT_HLIST_NODE(&neigh_node->list);
-	INIT_LIST_HEAD(&neigh_node->bonding_list);
-	spin_lock_init(&neigh_node->tq_lock);
 
-	memcpy(neigh_node->addr, neigh, ETH_ALEN);
-	neigh_node->orig_node = orig_neigh_node;
-	neigh_node->if_incoming = if_incoming;
+	memcpy(neigh_node->addr, neigh_addr, ETH_ALEN);
 
 	/* extra reference for return */
 	atomic_set(&neigh_node->refcount, 2);
 
-	spin_lock_bh(&orig_node->neigh_list_lock);
-	hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list);
-	spin_unlock_bh(&orig_node->neigh_list_lock);
+	bat_dbg(DBG_BATMAN, bat_priv,
+		"Creating new neighbor %pM, initial seqno %d\n",
+		neigh_addr, seqno);
+
+out:
 	return neigh_node;
 }
 
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index 3fe2eda..64c5d94 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -29,10 +29,8 @@ void originator_free(struct bat_priv *bat_priv);
 void purge_orig_ref(struct bat_priv *bat_priv);
 void orig_node_free_ref(struct orig_node *orig_node);
 struct orig_node *get_orig_node(struct bat_priv *bat_priv, const uint8_t *addr);
-struct neigh_node *create_neighbor(struct orig_node *orig_node,
-				   struct orig_node *orig_neigh_node,
-				   const uint8_t *neigh,
-				   struct hard_iface *if_incoming);
+struct neigh_node *neigh_node_new(struct hard_iface *hard_iface,
+				  const uint8_t *neigh_addr, uint32_t seqno);
 void neigh_node_free_ref(struct neigh_node *neigh_node);
 struct neigh_node *orig_node_get_router(struct orig_node *orig_node);
 int orig_seq_print_text(struct seq_file *seq, void *offset);
-- 
1.7.9.4

^ permalink raw reply related

* pull request: batman-adv 2012-05-09
From: Antonio Quartulli @ 2012-05-09 11:12 UTC (permalink / raw)
  To: davem-fT/PcQaiUtIeIZ0/mPfg9Q
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
	b.a.t.m.a.n-ZwoEplunGu2X36UT3dwllkB+6BGkLq7r

Dear David,

here is a new set of changes I would like to propose for next-next/linux-3.5.
Changes proposed here are the same of the last unsuccessful pull request, issued
on 2012-05-01, plus some other minor fixes.

The patchset is based on your net-next tree.
Please, let me know if there is any problem.


Sorry again for the touble I caused last time, but it was really a mistake, it
was not my intention to let you pull an entire wrong tree.



Thank you very much,
		Antonio


The following changes since commit 9bb862beb6e5839e92f709d33fda07678f062f20:

  Merge branch 'master' of git://1984.lsi.us.es/net-next (2012-05-08 14:40:21 -0400)

are available in the git repository at:


  git://git.open-mesh.org/linux-merge.git tags/batman-adv-for-davem

for you to fetch changes up to ab74e43c05964efe75472b5433ede8800ecbfc43:

  batman-adv: add contributor name (2012-05-09 09:54:55 +0200)

----------------------------------------------------------------
Included changes:

* fix a little bug in the DHCP packet snooping introduced so far
* minor fixes and cleanups
* minor routing protocol API cleanups
* add a new contributor name to translation-table.{c,h}
* update copyright years in file headers

----------------------------------------------------------------
Antonio Quartulli (3):
      batman-adv: fix wrong dhcp option list browsing
      batman-adv: update copyright years
      batman-adv: add contributor name

Linus Luessing (1):
      batman-adv: Adding hard_iface specific sysfs wrapper macros for UINT

Marek Lindner (10):
      batman-adv: introduce is_single_hop_neigh variable to increase readability
      batman-adv: introduce packet type handler array for incoming packets
      batman-adv: register batman ogm receive function during protocol init
      batman-adv: rename last_valid to last_seen
      batman-adv: replace HZ calculations with jiffies_to_msecs()
      batman-adv: split neigh_new function into generic and batman iv specific parts
      batman-adv: ignore protocol packets if the interface did not enable this protocol
      batman-adv: refactoring API: find generalized name for bat_ogm_update_mac callback
      batman-adv: rename sysfs macros to reflect the soft-interface dependency
      batman-adv: fix checkpatch string complaint

 net/batman-adv/bat_debugfs.c           |    4 +-
 net/batman-adv/bat_iv_ogm.c            |  176 +++++++++++++++++++++-----------
 net/batman-adv/bat_sysfs.c             |  100 +++++++++++++-----
 net/batman-adv/bridge_loop_avoidance.c |    2 +-
 net/batman-adv/bridge_loop_avoidance.h |    2 +-
 net/batman-adv/gateway_client.c        |    6 +-
 net/batman-adv/hard-interface.c        |  117 +--------------------
 net/batman-adv/main.c                  |  124 +++++++++++++++++++++-
 net/batman-adv/main.h                  |    6 ++
 net/batman-adv/originator.c            |   50 +++++----
 net/batman-adv/originator.h            |    6 +-
 net/batman-adv/packet.h                |    1 +
 net/batman-adv/routing.c               |   22 ++--
 net/batman-adv/routing.h               |    4 +-
 net/batman-adv/send.c                  |    2 +-
 net/batman-adv/translation-table.c     |    2 +-
 net/batman-adv/translation-table.h     |    2 +-
 net/batman-adv/types.h                 |   16 ++-
 18 files changed, 377 insertions(+), 265 deletions(-)

^ permalink raw reply

* [PATCH 06/14] batman-adv: replace HZ calculations with jiffies_to_msecs()
From: Antonio Quartulli @ 2012-05-09 11:12 UTC (permalink / raw)
  To: davem; +Cc: netdev, b.a.t.m.a.n, Marek Lindner, Antonio Quartulli
In-Reply-To: <1336561976-16088-1-git-send-email-ordex@autistici.org>

From: Marek Lindner <lindner_marek@yahoo.de>

Signed-off-by: Marek Lindner <lindner_marek@yahoo.de>
Acked-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Antonio Quartulli <ordex@autistici.org>
---
 net/batman-adv/bat_debugfs.c |    4 ++--
 net/batman-adv/originator.c  |   15 ++++++++++-----
 net/batman-adv/send.c        |    2 +-
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/net/batman-adv/bat_debugfs.c b/net/batman-adv/bat_debugfs.c
index 916380c..3b588f8 100644
--- a/net/batman-adv/bat_debugfs.c
+++ b/net/batman-adv/bat_debugfs.c
@@ -83,8 +83,8 @@ int debug_log(struct bat_priv *bat_priv, const char *fmt, ...)
 
 	va_start(args, fmt);
 	vscnprintf(tmp_log_buf, sizeof(tmp_log_buf), fmt, args);
-	fdebug_log(bat_priv->debug_log, "[%10lu] %s",
-		   (jiffies / HZ), tmp_log_buf);
+	fdebug_log(bat_priv->debug_log, "[%10u] %s",
+		   jiffies_to_msecs(jiffies), tmp_log_buf);
 	va_end(args);
 
 	return 0;
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 21c1f83..962636b 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -35,7 +35,8 @@ static void purge_orig(struct work_struct *work);
 static void start_purge_timer(struct bat_priv *bat_priv)
 {
 	INIT_DELAYED_WORK(&bat_priv->orig_work, purge_orig);
-	queue_delayed_work(bat_event_workqueue, &bat_priv->orig_work, 1 * HZ);
+	queue_delayed_work(bat_event_workqueue,
+			   &bat_priv->orig_work, msecs_to_jiffies(1000));
 }
 
 /* returns 1 if they are the same originator */
@@ -274,6 +275,7 @@ static bool purge_orig_neighbors(struct bat_priv *bat_priv,
 	struct hlist_node *node, *node_tmp;
 	struct neigh_node *neigh_node;
 	bool neigh_purged = false;
+	unsigned long last_seen;
 
 	*best_neigh_node = NULL;
 
@@ -288,6 +290,8 @@ static bool purge_orig_neighbors(struct bat_priv *bat_priv,
 		    (neigh_node->if_incoming->if_status == IF_NOT_IN_USE) ||
 		    (neigh_node->if_incoming->if_status == IF_TO_BE_REMOVED)) {
 
+			last_seen = neigh_node->last_seen;
+
 			if ((neigh_node->if_incoming->if_status ==
 								IF_INACTIVE) ||
 			    (neigh_node->if_incoming->if_status ==
@@ -300,9 +304,9 @@ static bool purge_orig_neighbors(struct bat_priv *bat_priv,
 					neigh_node->if_incoming->net_dev->name);
 			else
 				bat_dbg(DBG_BATMAN, bat_priv,
-					"neighbor timeout: originator %pM, neighbor: %pM, last_seen: %lu\n",
+					"neighbor timeout: originator %pM, neighbor: %pM, last_seen: %u\n",
 					orig_node->orig, neigh_node->addr,
-					(neigh_node->last_seen / HZ));
+					jiffies_to_msecs(last_seen));
 
 			neigh_purged = true;
 
@@ -327,8 +331,9 @@ static bool purge_orig_node(struct bat_priv *bat_priv,
 
 	if (has_timed_out(orig_node->last_seen, 2 * PURGE_TIMEOUT)) {
 		bat_dbg(DBG_BATMAN, bat_priv,
-			"Originator timeout: originator %pM, last_seen %lu\n",
-			orig_node->orig, (orig_node->last_seen / HZ));
+			"Originator timeout: originator %pM, last_seen %u\n",
+			orig_node->orig,
+			jiffies_to_msecs(orig_node->last_seen));
 		return true;
 	} else {
 		if (purge_orig_neighbors(bat_priv, orig_node,
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index 7c66b61..8e74d97 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -292,7 +292,7 @@ static void send_outstanding_bcast_packet(struct work_struct *work)
 	/* if we still have some more bcasts to send */
 	if (forw_packet->num_packets < 3) {
 		_add_bcast_packet_to_list(bat_priv, forw_packet,
-					  ((5 * HZ) / 1000));
+					  msecs_to_jiffies(5));
 		return;
 	}
 
-- 
1.7.9.4

^ permalink raw reply related

* [PATCH 05/14] batman-adv: rename last_valid to last_seen
From: Antonio Quartulli @ 2012-05-09 11:12 UTC (permalink / raw)
  To: davem; +Cc: netdev, b.a.t.m.a.n, Marek Lindner, Antonio Quartulli
In-Reply-To: <1336561976-16088-1-git-send-email-ordex@autistici.org>

From: Marek Lindner <lindner_marek@yahoo.de>

Signed-off-by: Marek Lindner <lindner_marek@yahoo.de>
Acked-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Antonio Quartulli <ordex@autistici.org>
---
 net/batman-adv/bat_iv_ogm.c |    8 ++++----
 net/batman-adv/originator.c |   16 ++++++++--------
 net/batman-adv/types.h      |    8 ++++----
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index e0aaf8c..8652a75 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -651,7 +651,7 @@ static void bat_iv_ogm_orig_update(struct bat_priv *bat_priv,
 	rcu_read_unlock();
 
 	orig_node->flags = batman_ogm_packet->flags;
-	neigh_node->last_valid = jiffies;
+	neigh_node->last_seen = jiffies;
 
 	spin_lock_bh(&neigh_node->tq_lock);
 	ring_buffer_set(neigh_node->tq_recv,
@@ -772,11 +772,11 @@ static int bat_iv_ogm_calc_tq(struct orig_node *orig_node,
 	if (!neigh_node)
 		goto out;
 
-	/* if orig_node is direct neighbor update neigh_node last_valid */
+	/* if orig_node is direct neighbor update neigh_node last_seen */
 	if (orig_node == orig_neigh_node)
-		neigh_node->last_valid = jiffies;
+		neigh_node->last_seen = jiffies;
 
-	orig_node->last_valid = jiffies;
+	orig_node->last_seen = jiffies;
 
 	/* find packet count of corresponding one hop neighbor */
 	spin_lock_bh(&orig_node->ogm_cnt_lock);
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index ce49698..21c1f83 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -283,7 +283,7 @@ static bool purge_orig_neighbors(struct bat_priv *bat_priv,
 	hlist_for_each_entry_safe(neigh_node, node, node_tmp,
 				  &orig_node->neigh_list, list) {
 
-		if ((has_timed_out(neigh_node->last_valid, PURGE_TIMEOUT)) ||
+		if ((has_timed_out(neigh_node->last_seen, PURGE_TIMEOUT)) ||
 		    (neigh_node->if_incoming->if_status == IF_INACTIVE) ||
 		    (neigh_node->if_incoming->if_status == IF_NOT_IN_USE) ||
 		    (neigh_node->if_incoming->if_status == IF_TO_BE_REMOVED)) {
@@ -300,9 +300,9 @@ static bool purge_orig_neighbors(struct bat_priv *bat_priv,
 					neigh_node->if_incoming->net_dev->name);
 			else
 				bat_dbg(DBG_BATMAN, bat_priv,
-					"neighbor timeout: originator %pM, neighbor: %pM, last_valid: %lu\n",
+					"neighbor timeout: originator %pM, neighbor: %pM, last_seen: %lu\n",
 					orig_node->orig, neigh_node->addr,
-					(neigh_node->last_valid / HZ));
+					(neigh_node->last_seen / HZ));
 
 			neigh_purged = true;
 
@@ -325,10 +325,10 @@ static bool purge_orig_node(struct bat_priv *bat_priv,
 {
 	struct neigh_node *best_neigh_node;
 
-	if (has_timed_out(orig_node->last_valid, 2 * PURGE_TIMEOUT)) {
+	if (has_timed_out(orig_node->last_seen, 2 * PURGE_TIMEOUT)) {
 		bat_dbg(DBG_BATMAN, bat_priv,
-			"Originator timeout: originator %pM, last_valid %lu\n",
-			orig_node->orig, (orig_node->last_valid / HZ));
+			"Originator timeout: originator %pM, last_seen %lu\n",
+			orig_node->orig, (orig_node->last_seen / HZ));
 		return true;
 	} else {
 		if (purge_orig_neighbors(bat_priv, orig_node,
@@ -446,9 +446,9 @@ int orig_seq_print_text(struct seq_file *seq, void *offset)
 				goto next;
 
 			last_seen_secs = jiffies_to_msecs(jiffies -
-						orig_node->last_valid) / 1000;
+						orig_node->last_seen) / 1000;
 			last_seen_msecs = jiffies_to_msecs(jiffies -
-						orig_node->last_valid) % 1000;
+						orig_node->last_seen) % 1000;
 
 			seq_printf(seq, "%pM %4i.%03is   (%3i) %pM [%10s]:",
 				   orig_node->orig, last_seen_secs,
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 50e1895..9fa8b73 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -52,7 +52,7 @@ struct hard_iface {
 /**
  *	orig_node - structure for orig_list maintaining nodes of mesh
  *	@primary_addr: hosts primary interface address
- *	@last_valid: when last packet from this node was received
+ *	@last_seen: when last packet from this node was received
  *	@bcast_seqno_reset: time when the broadcast seqno window was reset
  *	@batman_seqno_reset: time when the batman seqno window was reset
  *	@gw_flags: flags related to gateway class
@@ -70,7 +70,7 @@ struct orig_node {
 	struct neigh_node __rcu *router; /* rcu protected pointer */
 	unsigned long *bcast_own;
 	uint8_t *bcast_own_sum;
-	unsigned long last_valid;
+	unsigned long last_seen;
 	unsigned long bcast_seqno_reset;
 	unsigned long batman_seqno_reset;
 	uint8_t gw_flags;
@@ -120,7 +120,7 @@ struct gw_node {
 
 /**
  *	neigh_node
- *	@last_valid: when last packet via this neighbor was received
+ *	@last_seen: when last packet via this neighbor was received
  */
 struct neigh_node {
 	struct hlist_node list;
@@ -131,7 +131,7 @@ struct neigh_node {
 	uint8_t tq_avg;
 	uint8_t last_ttl;
 	struct list_head bonding_list;
-	unsigned long last_valid;
+	unsigned long last_seen;
 	DECLARE_BITMAP(real_bits, TQ_LOCAL_WINDOW_SIZE);
 	atomic_t refcount;
 	struct rcu_head rcu;
-- 
1.7.9.4

^ permalink raw reply related

* [PATCH 04/14] batman-adv: register batman ogm receive function during protocol init
From: Antonio Quartulli @ 2012-05-09 11:12 UTC (permalink / raw)
  To: davem; +Cc: netdev, b.a.t.m.a.n, Marek Lindner, Antonio Quartulli
In-Reply-To: <1336561976-16088-1-git-send-email-ordex@autistici.org>

From: Marek Lindner <lindner_marek@yahoo.de>

The B.A.T.M.A.N. IV OGM receive function still was hard-coded although
it is a routing protocol specific function. This patch takes advantage
of the dynamic packet handler registration to remove the hard-coded
function calls.

Signed-off-by: Marek Lindner <lindner_marek@yahoo.de>
Acked-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Antonio Quartulli <ordex@autistici.org>
---
 net/batman-adv/bat_iv_ogm.c |   31 +++++++++++++++++++++++++++----
 net/batman-adv/main.c       |    5 +----
 net/batman-adv/routing.c    |   22 ++++++++++------------
 net/batman-adv/routing.h    |    4 +++-
 net/batman-adv/types.h      |    3 ---
 5 files changed, 41 insertions(+), 24 deletions(-)

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index cd8f473..e0aaf8c 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -1155,13 +1155,18 @@ out:
 	orig_node_free_ref(orig_node);
 }
 
-static void bat_iv_ogm_receive(struct hard_iface *if_incoming,
-			       struct sk_buff *skb)
+static int bat_iv_ogm_receive(struct sk_buff *skb,
+			      struct hard_iface *if_incoming)
 {
 	struct batman_ogm_packet *batman_ogm_packet;
 	struct ethhdr *ethhdr;
 	int buff_pos = 0, packet_len;
 	unsigned char *tt_buff, *packet_buff;
+	bool ret;
+
+	ret = check_management_packet(skb, if_incoming, BATMAN_OGM_HLEN);
+	if (!ret)
+		return NET_RX_DROP;
 
 	packet_len = skb_headlen(skb);
 	ethhdr = (struct ethhdr *)skb_mac_header(skb);
@@ -1187,6 +1192,9 @@ static void bat_iv_ogm_receive(struct hard_iface *if_incoming,
 						(packet_buff + buff_pos);
 	} while (bat_iv_ogm_aggr_packet(buff_pos, packet_len,
 					batman_ogm_packet->tt_num_changes));
+
+	kfree_skb(skb);
+	return NET_RX_SUCCESS;
 }
 
 static struct bat_algo_ops batman_iv __read_mostly = {
@@ -1197,10 +1205,25 @@ static struct bat_algo_ops batman_iv __read_mostly = {
 	.bat_ogm_update_mac = bat_iv_ogm_update_mac,
 	.bat_ogm_schedule = bat_iv_ogm_schedule,
 	.bat_ogm_emit = bat_iv_ogm_emit,
-	.bat_ogm_receive = bat_iv_ogm_receive,
 };
 
 int __init bat_iv_init(void)
 {
-	return bat_algo_register(&batman_iv);
+	int ret;
+
+	/* batman originator packet */
+	ret = recv_handler_register(BAT_IV_OGM, bat_iv_ogm_receive);
+	if (ret < 0)
+		goto out;
+
+	ret = bat_algo_register(&batman_iv);
+	if (ret < 0)
+		goto handler_unregister;
+
+	goto out;
+
+handler_unregister:
+	recv_handler_unregister(BAT_IV_OGM);
+out:
+	return ret;
 }
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index d19b935..f80c447 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -266,8 +266,6 @@ static void recv_handler_init(void)
 	for (i = 0; i < ARRAY_SIZE(recv_packet_handler); i++)
 		recv_packet_handler[i] = recv_unhandled_packet;
 
-	/* batman originator packet */
-	recv_packet_handler[BAT_IV_OGM] = recv_bat_ogm_packet;
 	/* batman icmp packet */
 	recv_packet_handler[BAT_ICMP] = recv_icmp_packet;
 	/* unicast packet */
@@ -334,8 +332,7 @@ int bat_algo_register(struct bat_algo_ops *bat_algo_ops)
 	    !bat_algo_ops->bat_primary_iface_set ||
 	    !bat_algo_ops->bat_ogm_update_mac ||
 	    !bat_algo_ops->bat_ogm_schedule ||
-	    !bat_algo_ops->bat_ogm_emit ||
-	    !bat_algo_ops->bat_ogm_receive) {
+	    !bat_algo_ops->bat_ogm_emit) {
 		pr_info("Routing algo '%s' does not implement required ops\n",
 			bat_algo_ops->name);
 		goto out;
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index ff56086..7ed9d8f 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -248,37 +248,35 @@ int window_protected(struct bat_priv *bat_priv, int32_t seq_num_diff,
 	return 0;
 }
 
-int recv_bat_ogm_packet(struct sk_buff *skb, struct hard_iface *hard_iface)
+bool check_management_packet(struct sk_buff *skb,
+			     struct hard_iface *hard_iface,
+			     int header_len)
 {
-	struct bat_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
 	struct ethhdr *ethhdr;
 
 	/* drop packet if it has not necessary minimum size */
-	if (unlikely(!pskb_may_pull(skb, BATMAN_OGM_HLEN)))
-		return NET_RX_DROP;
+	if (unlikely(!pskb_may_pull(skb, header_len)))
+		return false;
 
 	ethhdr = (struct ethhdr *)skb_mac_header(skb);
 
 	/* packet with broadcast indication but unicast recipient */
 	if (!is_broadcast_ether_addr(ethhdr->h_dest))
-		return NET_RX_DROP;
+		return false;
 
 	/* packet with broadcast sender address */
 	if (is_broadcast_ether_addr(ethhdr->h_source))
-		return NET_RX_DROP;
+		return false;
 
 	/* create a copy of the skb, if needed, to modify it. */
 	if (skb_cow(skb, 0) < 0)
-		return NET_RX_DROP;
+		return false;
 
 	/* keep skb linear */
 	if (skb_linearize(skb) < 0)
-		return NET_RX_DROP;
+		return false;
 
-	bat_priv->bat_algo_ops->bat_ogm_receive(hard_iface, skb);
-
-	kfree_skb(skb);
-	return NET_RX_SUCCESS;
+	return true;
 }
 
 static int recv_my_icmp_packet(struct bat_priv *bat_priv,
diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h
index 3d729cb..d6bbbeb 100644
--- a/net/batman-adv/routing.h
+++ b/net/batman-adv/routing.h
@@ -23,6 +23,9 @@
 #define _NET_BATMAN_ADV_ROUTING_H_
 
 void slide_own_bcast_window(struct hard_iface *hard_iface);
+bool check_management_packet(struct sk_buff *skb,
+			     struct hard_iface *hard_iface,
+			     int header_len);
 void update_route(struct bat_priv *bat_priv, struct orig_node *orig_node,
 		  struct neigh_node *neigh_node);
 int recv_icmp_packet(struct sk_buff *skb, struct hard_iface *recv_if);
@@ -30,7 +33,6 @@ int recv_unicast_packet(struct sk_buff *skb, struct hard_iface *recv_if);
 int recv_ucast_frag_packet(struct sk_buff *skb, struct hard_iface *recv_if);
 int recv_bcast_packet(struct sk_buff *skb, struct hard_iface *recv_if);
 int recv_vis_packet(struct sk_buff *skb, struct hard_iface *recv_if);
-int recv_bat_ogm_packet(struct sk_buff *skb, struct hard_iface *recv_if);
 int recv_tt_query(struct sk_buff *skb, struct hard_iface *recv_if);
 int recv_roam_adv(struct sk_buff *skb, struct hard_iface *recv_if);
 struct neigh_node *find_router(struct bat_priv *bat_priv,
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 2f4848b..50e1895 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -390,9 +390,6 @@ struct bat_algo_ops {
 				 int tt_num_changes);
 	/* send scheduled OGM */
 	void (*bat_ogm_emit)(struct forw_packet *forw_packet);
-	/* receive incoming OGM */
-	void (*bat_ogm_receive)(struct hard_iface *if_incoming,
-				struct sk_buff *skb);
 };
 
 #endif /* _NET_BATMAN_ADV_TYPES_H_ */
-- 
1.7.9.4

^ permalink raw reply related

* [PATCH 02/14] batman-adv: introduce is_single_hop_neigh variable to increase readability
From: Antonio Quartulli @ 2012-05-09 11:12 UTC (permalink / raw)
  To: davem; +Cc: netdev, b.a.t.m.a.n, Marek Lindner, Antonio Quartulli
In-Reply-To: <1336561976-16088-1-git-send-email-ordex@autistici.org>

From: Marek Lindner <lindner_marek@yahoo.de>

Signed-off-by: Marek Lindner <lindner_marek@yahoo.de>
Acked-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Antonio Quartulli <ordex@autistici.org>
---
 net/batman-adv/bat_iv_ogm.c |   16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 8b2db2e..cd8f473 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -480,7 +480,8 @@ static void bat_iv_ogm_queue_add(struct bat_priv *bat_priv,
 static void bat_iv_ogm_forward(struct orig_node *orig_node,
 			       const struct ethhdr *ethhdr,
 			       struct batman_ogm_packet *batman_ogm_packet,
-			       int directlink, struct hard_iface *if_incoming)
+			       bool is_single_hop_neigh,
+			       struct hard_iface *if_incoming)
 {
 	struct bat_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
 	struct neigh_node *router;
@@ -533,7 +534,7 @@ static void bat_iv_ogm_forward(struct orig_node *orig_node,
 
 	/* switch of primaries first hop flag when forwarding */
 	batman_ogm_packet->flags &= ~PRIMARIES_FIRST_HOP;
-	if (directlink)
+	if (is_single_hop_neigh)
 		batman_ogm_packet->flags |= DIRECTLINK;
 	else
 		batman_ogm_packet->flags &= ~DIRECTLINK;
@@ -918,7 +919,8 @@ static void bat_iv_ogm_process(const struct ethhdr *ethhdr,
 	struct neigh_node *orig_neigh_router = NULL;
 	int has_directlink_flag;
 	int is_my_addr = 0, is_my_orig = 0, is_my_oldorig = 0;
-	int is_broadcast = 0, is_bidirectional, is_single_hop_neigh;
+	int is_broadcast = 0, is_bidirectional;
+	bool is_single_hop_neigh = false;
 	int is_duplicate;
 	uint32_t if_incoming_seqno;
 
@@ -942,8 +944,8 @@ static void bat_iv_ogm_process(const struct ethhdr *ethhdr,
 
 	has_directlink_flag = (batman_ogm_packet->flags & DIRECTLINK ? 1 : 0);
 
-	is_single_hop_neigh = (compare_eth(ethhdr->h_source,
-					   batman_ogm_packet->orig) ? 1 : 0);
+	if (compare_eth(ethhdr->h_source, batman_ogm_packet->orig))
+		is_single_hop_neigh = true;
 
 	bat_dbg(DBG_BATMAN, bat_priv,
 		"Received BATMAN packet via NB: %pM, IF: %s [%pM] (from OG: %pM, via prev OG: %pM, seqno %u, ttvn %u, crc %u, changes %u, td %d, TTL %d, V %d, IDF %d)\n",
@@ -1114,7 +1116,7 @@ static void bat_iv_ogm_process(const struct ethhdr *ethhdr,
 
 		/* mark direct link on incoming interface */
 		bat_iv_ogm_forward(orig_node, ethhdr, batman_ogm_packet,
-				   1, if_incoming);
+				   is_single_hop_neigh, if_incoming);
 
 		bat_dbg(DBG_BATMAN, bat_priv,
 			"Forwarding packet: rebroadcast neighbor packet with direct link flag\n");
@@ -1137,7 +1139,7 @@ static void bat_iv_ogm_process(const struct ethhdr *ethhdr,
 	bat_dbg(DBG_BATMAN, bat_priv,
 		"Forwarding packet: rebroadcast originator packet\n");
 	bat_iv_ogm_forward(orig_node, ethhdr, batman_ogm_packet,
-			   0, if_incoming);
+			   is_single_hop_neigh, if_incoming);
 
 out_neigh:
 	if ((orig_neigh_node) && (!is_single_hop_neigh))
-- 
1.7.9.4

^ permalink raw reply related

* [PATCH v3][resend] bonding: don't increase rx_dropped after processing LACPDUs
From: Jiri Bohac @ 2012-05-09 11:01 UTC (permalink / raw)
  To: David Miller; +Cc: andy, netdev, fubar

Since commit 3aba891d, bonding processes LACP frames (802.3ad
mode) with bond_handle_frame(). Currently a copy of the skb is
made and the original is left to be processed by other
rx_handlers and the rest of the network stack by returning
RX_HANDLER_ANOTHER.  As there is no protocol handler for
PKT_TYPE_LACPDU, the frame is dropped and dev->rx_dropped
increased.

Fix this by making bond_handle_frame() return RX_HANDLER_CONSUMED
if bonding has processed the LACP frame.

Signed-off-by: Jiri Bohac <jbohac@suse.cz>

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -2173,9 +2173,10 @@ re_arm:
  * received frames (loopback). Since only the payload is given to this
  * function, it check for loopback.
  */
-static void bond_3ad_rx_indication(struct lacpdu *lacpdu, struct slave *slave, u16 length)
+static int bond_3ad_rx_indication(struct lacpdu *lacpdu, struct slave *slave, u16 length)
 {
 	struct port *port;
+	int ret = RX_HANDLER_ANOTHER;
 
 	if (length >= sizeof(struct lacpdu)) {
 
@@ -2184,11 +2185,12 @@ static void bond_3ad_rx_indication(struct lacpdu *lacpdu, struct slave *slave, u
 		if (!port->slave) {
 			pr_warning("%s: Warning: port of slave %s is uninitialized\n",
 				   slave->dev->name, slave->dev->master->name);
-			return;
+			return ret;
 		}
 
 		switch (lacpdu->subtype) {
 		case AD_TYPE_LACPDU:
+			ret = RX_HANDLER_CONSUMED;
 			pr_debug("Received LACPDU on port %d\n",
 				 port->actor_port_number);
 			/* Protect against concurrent state machines */
@@ -2198,6 +2200,7 @@ static void bond_3ad_rx_indication(struct lacpdu *lacpdu, struct slave *slave, u
 			break;
 
 		case AD_TYPE_MARKER:
+			ret = RX_HANDLER_CONSUMED;
 			// No need to convert fields to Little Endian since we don't use the marker's fields.
 
 			switch (((struct bond_marker *)lacpdu)->tlv_type) {
@@ -2219,6 +2222,7 @@ static void bond_3ad_rx_indication(struct lacpdu *lacpdu, struct slave *slave, u
 			}
 		}
 	}
+	return ret;
 }
 
 /**
@@ -2456,18 +2460,20 @@ out:
 	return NETDEV_TX_OK;
 }
 
-void bond_3ad_lacpdu_recv(struct sk_buff *skb, struct bonding *bond,
+int bond_3ad_lacpdu_recv(struct sk_buff *skb, struct bonding *bond,
 			  struct slave *slave)
 {
+	int ret = RX_HANDLER_ANOTHER;
 	if (skb->protocol != PKT_TYPE_LACPDU)
-		return;
+		return ret;
 
 	if (!pskb_may_pull(skb, sizeof(struct lacpdu)))
-		return;
+		return ret;
 
 	read_lock(&bond->lock);
-	bond_3ad_rx_indication((struct lacpdu *) skb->data, slave, skb->len);
+	ret = bond_3ad_rx_indication((struct lacpdu *) skb->data, slave, skb->len);
 	read_unlock(&bond->lock);
+	return ret;
 }
 
 /*
diff --git a/drivers/net/bonding/bond_3ad.h b/drivers/net/bonding/bond_3ad.h
index 235b2cc..5ee7e3c 100644
--- a/drivers/net/bonding/bond_3ad.h
+++ b/drivers/net/bonding/bond_3ad.h
@@ -274,7 +274,7 @@ void bond_3ad_adapter_duplex_changed(struct slave *slave);
 void bond_3ad_handle_link_change(struct slave *slave, char link);
 int  bond_3ad_get_active_agg_info(struct bonding *bond, struct ad_info *ad_info);
 int bond_3ad_xmit_xor(struct sk_buff *skb, struct net_device *dev);
-void bond_3ad_lacpdu_recv(struct sk_buff *skb, struct bonding *bond,
+int bond_3ad_lacpdu_recv(struct sk_buff *skb, struct bonding *bond,
 			  struct slave *slave);
 int bond_3ad_set_carrier(struct bonding *bond);
 void bond_3ad_update_lacp_rate(struct bonding *bond);
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 62d2409..0a0f4a6 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1444,8 +1444,9 @@ static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb)
 	struct sk_buff *skb = *pskb;
 	struct slave *slave;
 	struct bonding *bond;
-	void (*recv_probe)(struct sk_buff *, struct bonding *,
+	int (*recv_probe)(struct sk_buff *, struct bonding *,
 				struct slave *);
+	int ret = RX_HANDLER_ANOTHER;
 
 	skb = skb_share_check(skb, GFP_ATOMIC);
 	if (unlikely(!skb))
@@ -1464,8 +1465,12 @@ static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb)
 		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
 
 		if (likely(nskb)) {
-			recv_probe(nskb, bond, slave);
+			ret = recv_probe(nskb, bond, slave);
 			dev_kfree_skb(nskb);
+			if (ret == RX_HANDLER_CONSUMED) {
+				consume_skb(skb);
+				return ret;
+			}
 		}
 	}
 
@@ -1487,7 +1492,7 @@ static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb)
 		memcpy(eth_hdr(skb)->h_dest, bond->dev->dev_addr, ETH_ALEN);
 	}
 
-	return RX_HANDLER_ANOTHER;
+	return ret;
 }
 
 /* enslave device <slave> to bond device <master> */
@@ -2723,7 +2728,7 @@ static void bond_validate_arp(struct bonding *bond, struct slave *slave, __be32
 	}
 }
 
-static void bond_arp_rcv(struct sk_buff *skb, struct bonding *bond,
+static int bond_arp_rcv(struct sk_buff *skb, struct bonding *bond,
 			 struct slave *slave)
 {
 	struct arphdr *arp;
@@ -2731,7 +2736,7 @@ static void bond_arp_rcv(struct sk_buff *skb, struct bonding *bond,
 	__be32 sip, tip;
 
 	if (skb->protocol != __cpu_to_be16(ETH_P_ARP))
-		return;
+		return RX_HANDLER_ANOTHER;
 
 	read_lock(&bond->lock);
 
@@ -2776,6 +2781,7 @@ static void bond_arp_rcv(struct sk_buff *skb, struct bonding *bond,
 
 out_unlock:
 	read_unlock(&bond->lock);
+	return RX_HANDLER_ANOTHER;
 }
 
 /*

-- 
Jiri Bohac <jbohac@suse.cz>
SUSE Labs, SUSE CZ

^ permalink raw reply related

* Re: [v12 PATCH 1/3] NETFILTER added flags to ipv6_find_hdr()
From: Pablo Neira Ayuso @ 2012-05-09 11:01 UTC (permalink / raw)
  To: Hans Schillstrom; +Cc: kaber, jengelh, netfilter-devel, netdev, hans
In-Reply-To: <1335188128-23645-2-git-send-email-hans.schillstrom@ericsson.com>

I have applied this with minor changes.

BTW, please use the following patch tagging next time, I'll save time:

netfilter: ip6_tables: add flags parameter to ipv6_find_hdr()

note the initial netfilter, then ip6_tables, then the description.

This is useful for grepping.

More minor glitches:

On Mon, Apr 23, 2012 at 03:35:26PM +0200, Hans Schillstrom wrote:
> Two new flags to ipv6_find_hdr,
> One that tells us that this is a fragment.
> One that stops at AH if any i.e. treat it like a transport header.
> i.e. make handling of ESP and AH the same.
> Param offset can now point to an inner icmp ipv5 header.
> 
> Version 3:
>     offset param into ipv6_find_hdr set to zero.
> 
> Version 2:
>     wrapper removed and changes made at every call.
> 
> Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
> ---
>  include/linux/netfilter_ipv6/ip6_tables.h |   12 +++++++++-
>  net/ipv6/netfilter/ip6_tables.c           |   35 ++++++++++++++++++++++++----
>  net/ipv6/netfilter/ip6t_ah.c              |    4 +-
>  net/ipv6/netfilter/ip6t_frag.c            |    4 +-
>  net/ipv6/netfilter/ip6t_hbh.c             |    4 +-
>  net/ipv6/netfilter/ip6t_rt.c              |    4 +-
>  net/netfilter/xt_TPROXY.c                 |    4 +-
>  net/netfilter/xt_socket.c                 |    4 +-
>  8 files changed, 53 insertions(+), 18 deletions(-)
> 
> diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h
> index 1bc898b..d96a39d 100644
> --- a/include/linux/netfilter_ipv6/ip6_tables.h
> +++ b/include/linux/netfilter_ipv6/ip6_tables.h
> @@ -287,6 +287,7 @@ extern unsigned int ip6t_do_table(struct sk_buff *skb,
>  				  struct xt_table *table);
>  
>  /* Check for an extension */
> +

removed this extra line.

>  static inline int
>  ip6t_ext_hdr(u8 nexthdr)
>  {	return (nexthdr == IPPROTO_HOPOPTS) ||
> @@ -298,9 +299,18 @@ ip6t_ext_hdr(u8 nexthdr)
>  	       (nexthdr == IPPROTO_DSTOPTS);
>  }
>  
> +

removed double extra line.

> +extern int ip6t_ext_hdr(u8 nexthdr);
> +enum {
> +	IP6T_FH_FRAG,
> +	IP6T_FH_AUTH,

removed these two above, the are not used anywhere in the code.

> +	IP6T_FH_F_FRAG = 1 << IP6T_FH_FRAG,
> +	IP6T_FH_F_AUTH = 1 << IP6T_FH_AUTH,
> +};
> +
>  /* find specified header and get offset to it */
>  extern int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
> -			 int target, unsigned short *fragoff);
> +			 int target, unsigned short *fragoff, int *fragflg);
>  
>  #ifdef CONFIG_COMPAT
>  #include <net/compat.h>
> diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
> index d4e350f..1f18662 100644
> --- a/net/ipv6/netfilter/ip6_tables.c
> +++ b/net/ipv6/netfilter/ip6_tables.c
> @@ -133,7 +133,7 @@ ip6_packet_match(const struct sk_buff *skb,
>  		int protohdr;
>  		unsigned short _frag_off;
>  
> -		protohdr = ipv6_find_hdr(skb, protoff, -1, &_frag_off);
> +		protohdr = ipv6_find_hdr(skb, protoff, -1, &_frag_off, NULL);
>  		if (protohdr < 0) {
>  			if (_frag_off == 0)
>  				*hotdrop = true;
> @@ -362,6 +362,7 @@ ip6t_do_table(struct sk_buff *skb,
>  		const struct xt_entry_match *ematch;
>  
>  		IP_NF_ASSERT(e);
> +		acpar.thoff = 0;
>  		if (!ip6_packet_match(skb, indev, outdev, &e->ipv6,
>  		    &acpar.thoff, &acpar.fragoff, &acpar.hotdrop)) {
>   no_match:
> @@ -2277,6 +2278,8 @@ static void __exit ip6_tables_fini(void)
>   * find the offset to specified header or the protocol number of last header
>   * if target < 0. "last header" is transport protocol header, ESP, or
>   * "No next header".
> + * Note, *offset is used as input param. an if != 0
> + * it must be an offset to an inner ipv6 header ex. icmp error
>   *
>   * If target header is found, its offset is set in *offset and return protocol
>   * number. Otherwise, return -1.
> @@ -2289,17 +2292,34 @@ static void __exit ip6_tables_fini(void)
>   * *offset is meaningless and fragment offset is stored in *fragoff if fragoff
>   * isn't NULL.
>   *
> + * if flags != NULL AND
> + *    it's a fragment the frag flag "IP6T_FH_F_FRAG" will be set
> + *    it's an AH header and IP6T_FH_F_AUTH is set and target < 0
> + *      stop at AH (i.e. treat is as a transport header)

I've cleaned up these comments. The format does not look very orthodox
(I'm not blaming your English, but the way the text is organized).

^ permalink raw reply

* RE: [PATCH] pch_gbe: Adding read memory barriers
From: David Laight @ 2012-05-09 10:47 UTC (permalink / raw)
  To: Erwan Velu, netdev, linux-kernel; +Cc: tshimizu818
In-Reply-To: <4FA822C4.60809@gmail.com>

 
> Under a strong incoming packet stream and/or high cpu usage,
> the pch_gbe driver reports "Receive CRC Error" and discards packet.
> 
> It occurred on an Intel ATOM E620T while running a 
> 300mbit/sec multicast
> network stream leading to a ~100% cpu usage.
> 
> Adding rmb() calls before considering the network card's status solve
> this issue.
> 
> Getting it into stable would be perfect as it solves 
> reliability issues.
> 
> Signed-off-by: Erwan Velu <erwan.velu@zodiacaerospace.com>
> ---
>   .../net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c   |    3 +++
>   1 files changed, 3 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c 
> b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
> index 8035e5f..ace3654 100644
> --- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
> +++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
> @@ -1413,6 +1413,7 @@ static irqreturn_t pch_gbe_intr(int 
> irq, void *data)
>   			pch_gbe_mac_set_pause_packet(hw);
>   		}
>   	}
> +	smp_rmb(); /* prevent any other reads before*/

Under the assumption that your memory references are uncached,
you only need to stop gcc reordering the object code,
Rather than actually adding one of the 'fence' instructions.

So you should only need: asm volatile(:::"memory")
NFI which define generates that, the defines in the copy of
sysdep.h I just looked at always include one of the fences.

	David

^ permalink raw reply

* Re: [PATCH] net: skb_set_dev do not unconditionally drop ref to dst
From: Frank Blaschka @ 2012-05-09 10:48 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, arnd, linux-s390, eric.dumazet
In-Reply-To: <20120503.022837.2021054007853235758.davem@davemloft.net>

On Thu, May 03, 2012 at 02:28:37AM -0400, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Wed, 02 May 2012 08:59:21 +0200
> 
> > If Arnd doesnt care to even reply, we can just revert his buggy
> > patch, instead of trying to understand and fix all issues.
> 
> Agreed, it's the best sounding solution by far.
Hi Dave,

looks like discussion is done and everybody is ok with reverting
Arnds patch. Can you revert commit 8a83a00b0735190384a348156837918271034144 ?
Thx

Fran ?
Thx

Frank

> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply

* [PATCH v5] tilegx network driver: initial support
From: Chris Metcalf @ 2012-05-09 10:42 UTC (permalink / raw)
  To: David Miller, arnd, linux-kernel, netdev
In-Reply-To: <20120504.024216.1730395431103337434.davem@davemloft.net>

This change adds support for the tilegx network driver based on the
GXIO IORPC support in the tilegx software stack, using the on-chip
mPIPE packet processing engine.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
This version removes some "generic" comments on network driver
infrastructure, removes runs of multiple blank lines, and eliminates
the hand-rolled ROUND_UP() macro in favor of ALIGN().  There are also
a number of additional cleanups.

 drivers/net/ethernet/tile/Kconfig  |    1 +
 drivers/net/ethernet/tile/Makefile |    4 +-
 drivers/net/ethernet/tile/tilegx.c | 1928 ++++++++++++++++++++++++++++++++++++
 3 files changed, 1931 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/tile/tilegx.c

diff --git a/drivers/net/ethernet/tile/Kconfig b/drivers/net/ethernet/tile/Kconfig
index 2d9218f..9184b61 100644
--- a/drivers/net/ethernet/tile/Kconfig
+++ b/drivers/net/ethernet/tile/Kconfig
@@ -7,6 +7,7 @@ config TILE_NET
 	depends on TILE
 	default y
 	select CRC32
+	select TILE_GXIO_MPIPE if TILEGX
 	---help---
 	  This is a standard Linux network device driver for the
 	  on-chip Tilera Gigabit Ethernet and XAUI interfaces.
diff --git a/drivers/net/ethernet/tile/Makefile b/drivers/net/ethernet/tile/Makefile
index f634f14..0ef9eef 100644
--- a/drivers/net/ethernet/tile/Makefile
+++ b/drivers/net/ethernet/tile/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_TILE_NET) += tile_net.o
 ifdef CONFIG_TILEGX
-tile_net-objs := tilegx.o mpipe.o iorpc_mpipe.o dma_queue.o
+tile_net-y := tilegx.o
 else
-tile_net-objs := tilepro.o
+tile_net-y := tilepro.o
 endif
diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
new file mode 100644
index 0000000..1ba52a7
--- /dev/null
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -0,0 +1,1928 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>      /* printk() */
+#include <linux/slab.h>        /* kmalloc() */
+#include <linux/errno.h>       /* error codes */
+#include <linux/types.h>       /* size_t */
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/irq.h>
+#include <linux/netdevice.h>   /* struct device, and other headers */
+#include <linux/etherdevice.h> /* eth_type_trans */
+#include <linux/skbuff.h>
+#include <linux/ioctl.h>
+#include <linux/cdev.h>
+#include <linux/hugetlb.h>
+#include <linux/in6.h>
+#include <linux/timer.h>
+#include <linux/io.h>
+#include <linux/ctype.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+
+#include <asm/checksum.h>
+#include <asm/homecache.h>
+#include <gxio/mpipe.h>
+#include <arch/sim.h>
+
+/* Define to support GSO. */
+#undef TILE_NET_GSO
+
+/* Define to support TSO. */
+#define TILE_NET_TSO
+
+/* Use 3000 to enable the Linux Traffic Control (QoS) layer, else 0. */
+#define TILE_NET_TX_QUEUE_LEN 0
+
+/* Define to dump packets (prints out the whole packet on tx and rx). */
+#undef TILE_NET_DUMP_PACKETS
+
+/* Default transmit lockup timeout period, in jiffies. */
+#define TILE_NET_TIMEOUT (5 * HZ)
+
+/* The maximum number of distinct channels (idesc.channel is 5 bits). */
+#define TILE_NET_CHANNELS 32
+
+/* Maximum number of idescs to handle per "poll". */
+#define TILE_NET_BATCH 128
+
+/* Maximum number of packets to handle per "poll". */
+#define TILE_NET_WEIGHT 64
+
+/* Number of entries in each iqueue. */
+#define IQUEUE_ENTRIES 512
+
+/* Number of entries in each equeue. */
+#define EQUEUE_ENTRIES 2048
+
+/* Total header bytes per equeue slot.  Must be big enough for 2 bytes
+ * of NET_IP_ALIGN alignment, plus 14 bytes (?) of L2 header, plus up to
+ * 60 bytes of actual TCP header.  We round up to align to cache lines.
+ */
+#define HEADER_BYTES 128
+
+/* Maximum completions per cpu per device (must be a power of two).
+ * ISSUE: What is the right number here?
+ */
+#define TILE_NET_MAX_COMPS 64
+
+#define MAX_FRAGS (65536 / PAGE_SIZE + 2 + 1)
+
+MODULE_AUTHOR("Tilera Corporation");
+MODULE_LICENSE("GPL");
+
+/* A "packet fragment" (a chunk of memory). */
+struct frag {
+	void *buf;
+	size_t length;
+};
+
+/* A single completion. */
+struct tile_net_comp {
+	/* The "complete_count" when the completion will be complete. */
+	s64 when;
+	/* The buffer to be freed when the completion is complete. */
+	struct sk_buff *skb;
+};
+
+/* The completions for a given cpu and device. */
+struct tile_net_comps {
+	/* The completions. */
+	struct tile_net_comp comp_queue[TILE_NET_MAX_COMPS];
+	/* The number of completions used. */
+	unsigned long comp_next;
+	/* The number of completions freed. */
+	unsigned long comp_last;
+};
+
+/* Info for a specific cpu. */
+struct tile_net_info {
+	/* The NAPI struct. */
+	struct napi_struct napi;
+	/* Packet queue. */
+	gxio_mpipe_iqueue_t iqueue;
+	/* Our cpu. */
+	int my_cpu;
+	/* True if iqueue is valid. */
+	bool has_iqueue;
+	/* NAPI flags. */
+	bool napi_added;
+	bool napi_enabled;
+	/* Number of small sk_buffs which must still be provided. */
+	unsigned int num_needed_small_buffers;
+	/* Number of large sk_buffs which must still be provided. */
+	unsigned int num_needed_large_buffers;
+	/* A timer for handling egress completions. */
+	struct timer_list egress_timer;
+	/* True if "egress_timer" is scheduled. */
+	bool egress_timer_scheduled;
+	/* Comps for each egress channel. */
+	struct tile_net_comps *comps_for_echannel[TILE_NET_CHANNELS];
+};
+
+/* Info for egress on a particular egress channel. */
+struct tile_net_egress {
+	/* The "equeue". */
+	gxio_mpipe_equeue_t *equeue;
+	/* The headers for TSO. */
+	unsigned char *headers;
+};
+
+/* Info for a specific device. */
+struct tile_net_priv {
+	/* Our network device. */
+	struct net_device *dev;
+	/* The primary link. */
+	gxio_mpipe_link_t link;
+	/* The primary channel, if open, else -1. */
+	int channel;
+	/* The "loopify" egress link, if needed. */
+	gxio_mpipe_link_t loopify_link;
+	/* The "loopify" egress channel, if open, else -1. */
+	int loopify_channel;
+	/* The egress channel (channel or loopify_channel). */
+	int echannel;
+	/* Total stats. */
+	struct net_device_stats stats;
+};
+
+/* Egress info, indexed by "priv->echannel" (lazily created as needed). */
+static struct tile_net_egress egress_for_echannel[TILE_NET_CHANNELS];
+
+/* Devices currently associated with each channel.
+ * NOTE: The array entry can become NULL after ifconfig down, but
+ * we do not free the underlying net_device structures, so it is
+ * safe to use a pointer after reading it from this array.
+ */
+static struct net_device *tile_net_devs_for_channel[TILE_NET_CHANNELS];
+
+/* A mutex for "tile_net_devs_for_channel". */
+static DEFINE_MUTEX(tile_net_devs_for_channel_mutex);
+
+/* The per-cpu info. */
+static DEFINE_PER_CPU(struct tile_net_info, per_cpu_info);
+
+/* The "context" for all devices. */
+static gxio_mpipe_context_t context;
+
+/* The small/large "buffer stacks". */
+static int small_buffer_stack = -1;
+static int large_buffer_stack = -1;
+
+/* The buckets. */
+static int first_bucket = -1;
+static int num_buckets = 1;
+
+/* The ingress irq. */
+static int ingress_irq = -1;
+
+/* Text value of tile_net.cpus if passed as a module parameter. */
+static char *network_cpus_string;
+
+/* The actual cpus in "network_cpus". */
+static struct cpumask network_cpus_map;
+
+/* If "loopify=LINK" was specified, this is "LINK". */
+static char *loopify_link_name;
+
+/* The "tile_net.cpus" argument specifies the cpus that are dedicated
+ * to handle ingress packets.
+ *
+ * The parameter should be in the form "tile_net.cpus=m-n[,x-y]", where
+ * m, n, x, y are integer numbers that represent the cpus that can be
+ * neither a dedicated cpu nor a dataplane cpu.
+ */
+static bool network_cpus_init(void)
+{
+	char buf[1024];
+	int rc;
+
+	if (network_cpus_string == NULL)
+		return false;
+
+	rc = cpulist_parse_crop(network_cpus_string, &network_cpus_map);
+	if (rc != 0) {
+		pr_warn("tile_net.cpus=%s: malformed cpu list\n",
+			network_cpus_string);
+		return false;
+	}
+
+	/* Remove dedicated cpus. */
+	cpumask_and(&network_cpus_map, &network_cpus_map, cpu_possible_mask);
+
+	if (cpumask_empty(&network_cpus_map)) {
+		pr_warn("Ignoring empty tile_net.cpus='%s'.\n",
+			network_cpus_string);
+		return false;
+	}
+
+	cpulist_scnprintf(buf, sizeof(buf), &network_cpus_map);
+	pr_info("Linux network CPUs: %s\n", buf);
+	return true;
+}
+
+module_param_named(cpus, network_cpus_string, charp, 0444);
+MODULE_PARM_DESC(cpus, "cpulist of cores that handle network interrupts");
+
+/* The "tile_net.loopify=LINK" argument causes the named device to
+ * actually use "loop0" for ingress, and "loop1" for egress.  This
+ * allows an app to sit between the actual link and linux, passing
+ * (some) packets along to linux, and forwarding (some) packets sent
+ * out by linux.
+ */
+module_param_named(loopify, loopify_link_name, charp, 0444);
+MODULE_PARM_DESC(loopify, "name the device to use loop0/1 for ingress/egress");
+
+#ifdef TILE_NET_DUMP_PACKETS
+/* Dump a packet. */
+static void dump_packet(unsigned char *data, unsigned long length, char *s)
+{
+	unsigned long i;
+	static unsigned int count;
+	char buf[128];
+
+	pr_info("Dumping %s packet of 0x%lx bytes at %p [%d]\n",
+		s, length, data, count++);
+
+	pr_info("\n");
+
+	for (i = 0; i < length; i++) {
+		if ((i & 0xf) == 0)
+			sprintf(buf, "%8.8lx:", i);
+		sprintf(buf + strlen(buf), " %02x", data[i]);
+		if ((i & 0xf) == 0xf || i == length - 1)
+			pr_info("%s\n", buf);
+	}
+
+	pr_info("\n");
+}
+#endif
+
+/* Allocate and push a buffer. */
+static bool tile_net_provide_buffer(bool small)
+{
+	int stack = small ? small_buffer_stack : large_buffer_stack;
+
+	/* Buffers must be aligned. */
+	const unsigned long align = 128;
+
+	/* Note that "dev_alloc_skb()" adds NET_SKB_PAD more bytes,
+	 * and also "reserves" that many bytes.
+	 */
+	int len = sizeof(struct sk_buff **) + align + (small ? 128 : 1664);
+
+	/* Allocate (or fail). */
+	struct sk_buff *skb = dev_alloc_skb(len);
+	if (skb == NULL)
+		return false;
+
+	/* Make room for a back-pointer to 'skb'. */
+	skb_reserve(skb, sizeof(struct sk_buff **));
+
+	/* Make sure we are aligned. */
+	skb_reserve(skb, -(long)skb->data & (align - 1));
+
+	/* Save a back-pointer to 'skb'. */
+	*(struct sk_buff **)(skb->data - sizeof(struct sk_buff **)) = skb;
+
+	/* Make sure "skb" and the back-pointer have been flushed. */
+	wmb();
+
+	gxio_mpipe_push_buffer(&context, stack,
+			       (void *)va_to_tile_io_addr(skb->data));
+
+	return true;
+}
+
+static void tile_net_pop_all_buffers(int stack)
+{
+	void *va;
+	while ((va = gxio_mpipe_pop_buffer(&context, stack)) != NULL) {
+		struct sk_buff **skb_ptr = va - sizeof(*skb_ptr);
+		struct sk_buff *skb = *skb_ptr;
+		dev_kfree_skb_irq(skb);
+	}
+}
+
+/* Provide linux buffers to mPIPE. */
+static void tile_net_provide_needed_buffers(struct tile_net_info *info)
+{
+	while (info->num_needed_small_buffers != 0) {
+		if (!tile_net_provide_buffer(true))
+			goto oops;
+		info->num_needed_small_buffers--;
+	}
+
+	while (info->num_needed_large_buffers != 0) {
+		if (!tile_net_provide_buffer(false))
+			goto oops;
+		info->num_needed_large_buffers--;
+	}
+
+	return;
+
+oops:
+	/* Add a description to the page allocation failure dump. */
+	pr_notice("Tile %d still needs some buffers\n", info->my_cpu);
+}
+
+/* Handle a packet.  Return true if "processed", false if "filtered". */
+static bool tile_net_handle_packet(struct tile_net_info *info,
+				   gxio_mpipe_idesc_t *idesc)
+{
+	struct net_device *dev = tile_net_devs_for_channel[idesc->channel];
+	uint8_t l2_offset = gxio_mpipe_idesc_get_l2_offset(idesc);
+	void *va;
+	void *buf;
+	unsigned long len;
+	int filter = 0;
+
+	/* Drop packets for which no buffer was available.
+	 * NOTE: This happens under heavy load.
+	 */
+	if (idesc->be) {
+		gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+		if (net_ratelimit())
+			pr_info("Dropping packet (insufficient buffers).\n");
+		return false;
+	}
+
+	/* Get the raw buffer VA. */
+	va = tile_io_addr_to_va((unsigned long)gxio_mpipe_idesc_get_va(idesc));
+
+	/* Get the actual packet start/length. */
+	buf = va + l2_offset;
+	len = gxio_mpipe_idesc_get_l2_length(idesc);
+
+	/* Point "va" at the raw buffer. */
+	va -= NET_IP_ALIGN;
+
+#ifdef TILE_NET_DUMP_PACKETS
+	dump_packet(buf, len, "rx");
+#endif
+
+	if (dev != NULL) {
+		/* ISSUE: Is this needed? */
+		dev->last_rx = jiffies;
+	}
+
+	if (dev == NULL || !(dev->flags & IFF_UP)) {
+		/* Filter packets received before we're up. */
+		filter = 1;
+	} else if (!(dev->flags & IFF_PROMISC)) {
+		/* ISSUE: "eth_type_trans()" implies that "IFF_PROMISC"
+		 * is set for "all silly devices", however, it appears
+		 * to NOT be set for us, so this code here DOES run.
+		 */
+		if (!is_multicast_ether_addr(buf)) {
+			/* Filter packets not for our address. */
+			const u8 *mine = dev->dev_addr;
+			filter = compare_ether_addr(mine, buf);
+		}
+	}
+
+	if (filter) {
+
+		/* ISSUE: Update "drop" statistics? */
+
+		gxio_mpipe_iqueue_drop(&info->iqueue, idesc);
+
+	} else {
+
+		struct tile_net_priv *priv = netdev_priv(dev);
+
+		/* Acquire the associated "skb". */
+		struct sk_buff **skb_ptr = va - sizeof(*skb_ptr);
+		struct sk_buff *skb = *skb_ptr;
+
+		/* Paranoia. */
+		if (skb->data != va) {
+			/* Panic here since there's a reasonable chance
+			 * that corrupt buffers means generic memory
+			 * corruption, with unpredictable system effects.
+			 */
+			panic("Corrupt linux buffer! "
+			      "buf=%p, skb=%p, skb->data=%p",
+			      buf, skb, skb->data);
+		}
+
+		/* Skip headroom, and any custom header. */
+		skb_reserve(skb, NET_IP_ALIGN + l2_offset);
+
+		/* Encode the actual packet length. */
+		skb_put(skb, len);
+
+		/* NOTE: This call also sets "skb->dev = dev".
+		 * ISSUE: The classifier provides us with "eth_type"
+		 * (aka "eth->h_proto"), which is basically the value
+		 * returned by "eth_type_trans()".
+		 * Note that "eth_type_trans()" computes "skb->pkt_type",
+		 * which would be useful for the "filter" check above,
+		 * if we had a (modifiable) "skb" to work with.
+		 */
+		skb->protocol = eth_type_trans(skb, dev);
+
+		/* Acknowledge "good" hardware checksums. */
+		if (idesc->cs && idesc->csum_seed_val == 0xFFFF)
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+		netif_receive_skb(skb);
+
+		/* Update stats. */
+		atomic_add(1, (atomic_t *)&priv->stats.rx_packets);
+		atomic_add(len, (atomic_t *)&priv->stats.rx_bytes);
+
+		/* Need a new buffer. */
+		if (idesc->size == GXIO_MPIPE_BUFFER_SIZE_128)
+			info->num_needed_small_buffers++;
+		else
+			info->num_needed_large_buffers++;
+	}
+
+	gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+
+	return !filter;
+}
+
+/* Handle some packets for the current CPU.
+ *
+ * This function handles up to TILE_NET_BATCH idescs per call.
+ *
+ * ISSUE: Since we do not provide new buffers until this function is
+ * complete, we must initially provide enough buffers for each network
+ * cpu to fill its iqueue and also its batched idescs.
+ *
+ * ISSUE: The "rotting packet" race condition occurs if a packet
+ * arrives after the queue appears to be empty, and before the
+ * hypervisor interrupt is re-enabled.
+ */
+static int tile_net_poll(struct napi_struct *napi, int budget)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	unsigned int work = 0;
+	gxio_mpipe_idesc_t *idesc;
+	int i, n;
+
+	/* Process packets. */
+	while ((n = gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc)) > 0) {
+		for (i = 0; i < n; i++) {
+			if (i == TILE_NET_BATCH)
+				goto done;
+			if (tile_net_handle_packet(info, idesc + i)) {
+				if (++work >= budget)
+					goto done;
+			}
+		}
+	}
+
+	/* There are no packets left. */
+	napi_complete(&info->napi);
+
+	/* Re-enable hypervisor interrupts. */
+	gxio_mpipe_enable_notif_ring_interrupt(&context, info->iqueue.ring);
+
+	/* HACK: Avoid the "rotting packet" problem. */
+	if (gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc) > 0)
+		napi_schedule(&info->napi);
+
+	/* ISSUE: Handle completions? */
+
+done:
+	tile_net_provide_needed_buffers(info);
+
+	return work;
+}
+
+/* Handle an ingress interrupt on the current cpu. */
+static irqreturn_t tile_net_handle_ingress_irq(int irq, void *unused)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	napi_schedule(&info->napi);
+	return IRQ_HANDLED;
+}
+
+/* Free some completions.  This must be called with interrupts blocked. */
+static void tile_net_free_comps(gxio_mpipe_equeue_t *equeue,
+				struct tile_net_comps *comps,
+				int limit, bool force_update)
+{
+	int n = 0;
+	while (comps->comp_last < comps->comp_next) {
+		unsigned int cid = comps->comp_last % TILE_NET_MAX_COMPS;
+		struct tile_net_comp *comp = &comps->comp_queue[cid];
+		if (!gxio_mpipe_equeue_is_complete(equeue, comp->when,
+						   force_update || n == 0))
+			return;
+		dev_kfree_skb_irq(comp->skb);
+		comps->comp_last++;
+		if (++n == limit)
+			return;
+	}
+}
+
+/* Make sure the egress timer is scheduled.
+ *
+ * Note that we use "schedule if not scheduled" logic instead of the more
+ * obvious "reschedule" logic, because "reschedule" is fairly expensive.
+ */
+static void tile_net_schedule_egress_timer(struct tile_net_info *info)
+{
+	if (!info->egress_timer_scheduled) {
+		mod_timer_pinned(&info->egress_timer, jiffies + 1);
+		info->egress_timer_scheduled = true;
+	}
+}
+
+/* The "function" for "info->egress_timer".
+ *
+ * This timer will reschedule itself as long as there are any pending
+ * completions expected for this tile.
+ */
+static void tile_net_handle_egress_timer(unsigned long arg)
+{
+	struct tile_net_info *info = (struct tile_net_info *)arg;
+	unsigned long irqflags;
+	bool pending = false;
+	int i;
+
+	local_irq_save(irqflags);
+
+	/* The timer is no longer scheduled. */
+	info->egress_timer_scheduled = false;
+
+	/* Free all possible comps for this tile. */
+	for (i = 0; i < TILE_NET_CHANNELS; i++) {
+		struct tile_net_egress *egress = &egress_for_echannel[i];
+		struct tile_net_comps *comps = info->comps_for_echannel[i];
+		if (comps->comp_last >= comps->comp_next)
+			continue;
+		tile_net_free_comps(egress->equeue, comps, -1, true);
+		pending = pending || (comps->comp_last < comps->comp_next);
+	}
+
+	/* Reschedule timer if needed. */
+	if (pending)
+		tile_net_schedule_egress_timer(info);
+
+	local_irq_restore(irqflags);
+}
+
+/* Prepare each CPU. */
+static void tile_net_prepare_cpu(void *unused)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	int my_cpu = smp_processor_id();
+
+	info->has_iqueue = false;
+
+	info->my_cpu = my_cpu;
+
+	/* Initialize the egress timer. */
+	init_timer(&info->egress_timer);
+	info->egress_timer.data = (long)info;
+	info->egress_timer.function = tile_net_handle_egress_timer;
+}
+
+/* Helper function for "tile_net_update()". */
+static void tile_net_update_cpu(void *arg)
+{
+	struct net_device *dev = arg;
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+
+	if (info->has_iqueue) {
+		if (dev != NULL) {
+			if (!info->napi_added) {
+				netif_napi_add(dev, &info->napi,
+					       tile_net_poll, TILE_NET_WEIGHT);
+				info->napi_added = true;
+			}
+			if (!info->napi_enabled) {
+				napi_enable(&info->napi);
+				info->napi_enabled = true;
+			}
+			enable_percpu_irq(ingress_irq, 0);
+		} else {
+			disable_percpu_irq(ingress_irq);
+			if (info->napi_enabled) {
+				napi_disable(&info->napi);
+				info->napi_enabled = false;
+			}
+			/* FIXME: Drain the iqueue. */
+		}
+	}
+}
+
+/* Helper function for tile_net_open() and tile_net_stop().
+ * Always called under tile_net_devs_for_channel_mutex.
+ */
+static int tile_net_update(struct net_device *dev)
+{
+	int channel;
+	long count = 0;
+	int cpu;
+	int rc;
+	bool saw_channel;
+	static gxio_mpipe_rules_t rules;
+
+	gxio_mpipe_rules_init(&rules, &context);
+
+	saw_channel = false;
+	for (channel = 0; channel < TILE_NET_CHANNELS; channel++) {
+		if (tile_net_devs_for_channel[channel] == NULL)
+			continue;
+		if (!saw_channel) {
+			saw_channel = true;
+			gxio_mpipe_rules_begin(&rules, first_bucket,
+					       num_buckets, NULL);
+			gxio_mpipe_rules_set_headroom(&rules, NET_IP_ALIGN);
+		}
+		gxio_mpipe_rules_add_channel(&rules, channel);
+	}
+
+	/* NOTE: This can fail if there is no classifier.
+	 * ISSUE: Can anything else cause it to fail?
+	 */
+	rc = gxio_mpipe_rules_commit(&rules);
+	if (rc != 0) {
+		netdev_warn(dev, "gxio_mpipe_rules_commit failed: %d\n", rc);
+		return -EIO;
+	}
+
+	/* Update all cpus, sequentially (to protect "netif_napi_add()"). */
+	for_each_online_cpu(cpu)
+		smp_call_function_single(cpu, tile_net_update_cpu,
+					 (saw_channel ? dev : NULL), 1);
+
+	/* HACK: Allow packets to flow in the simulator. */
+	if (count != 0)
+		sim_enable_mpipe_links(0, -1);
+
+	return 0;
+}
+
+/* The first time any tilegx network device is opened, we initialize
+ * the global mpipe state.  If this step fails, we fail to open the
+ * device, but if it succeeds, we never need to do it again, and since
+ * tile_net can't be unloaded, we never undo it.
+ *
+ * Note that some resources in this path (buffer stack indices,
+ * bindings from init_buffer_stack, etc.) are hypervisor resources
+ * that are freed simply via gxio_mpipe_destroy().
+ */
+static int tile_net_init_mpipe(struct net_device *dev)
+{
+	gxio_mpipe_buffer_size_enum_t small_buf_size =
+		GXIO_MPIPE_BUFFER_SIZE_128;
+	gxio_mpipe_buffer_size_enum_t large_buf_size =
+		GXIO_MPIPE_BUFFER_SIZE_1664;
+	size_t stack_bytes;
+	pte_t pte = { 0 };
+	void *small = NULL;
+	void *large = NULL;
+	int i, num_buffers, rc;
+	int network_cpus_count, cpu;
+	int ring, group, next_ring;
+	size_t comps_size = 0;
+	size_t notif_ring_size = 0;
+
+	if (!hash_default) {
+		netdev_err(dev, "Networking requires hash_default!\n");
+		return -EIO;
+	}
+
+	rc =  gxio_mpipe_init(&context, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init failed: %d\n", rc);
+		return -EIO;
+	}
+
+	network_cpus_count = cpus_weight(network_cpus_map);
+
+	num_buffers =
+		network_cpus_count * (IQUEUE_ENTRIES + TILE_NET_BATCH);
+
+	/* Compute stack bytes; we round up to 64KB and then use
+	 * alloc_pages() so we get the required 64KB alignment as well.
+	 */
+	stack_bytes = ALIGN(gxio_mpipe_calc_buffer_stack_bytes(num_buffers),
+			    64 * 1024);
+
+	/* Allocate two buffer stack indices. */
+	rc = gxio_mpipe_alloc_buffer_stacks(&context, 2, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_buffer_stacks failed: %d\n",
+			   rc);
+		goto fail;
+	}
+	small_buffer_stack = rc;
+	large_buffer_stack = rc + 1;
+
+	/* Allocate the small memory stack. */
+	small = alloc_pages_exact(stack_bytes, GFP_KERNEL);
+	if (small == NULL) {
+		netdev_err(dev,
+			   "Could not alloc %zd bytes for buffer stacks\n",
+			   stack_bytes);
+		rc = -ENOMEM;
+		goto fail;
+	}
+	rc = gxio_mpipe_init_buffer_stack(&context, small_buffer_stack,
+					  small_buf_size,
+					  small, stack_bytes, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init_buffer_stack: %d\n", rc);
+		goto fail;
+	}
+
+	/* Allocate the large buffer stack. */
+	large = alloc_pages_exact(stack_bytes, GFP_KERNEL);
+	if (large == NULL) {
+		netdev_err(dev,
+			   "Could not alloc %zd bytes for buffer stacks\n",
+			   stack_bytes);
+		rc = -ENOMEM;
+		goto fail;
+	}
+	rc = gxio_mpipe_init_buffer_stack(&context, large_buffer_stack,
+					  large_buf_size,
+					  large, stack_bytes, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init_buffer_stack failed: %d\n",
+			   rc);
+		goto fail;
+	}
+
+	/* Register all the client memory in mpipe TLBs. */
+	pte = pte_set_home(pte, PAGE_HOME_HASH);
+	rc = gxio_mpipe_register_client_memory(&context, small_buffer_stack,
+					       pte, 0);
+	if (rc != 0) {
+		netdev_err(dev,
+			   "gxio_mpipe_register_buffer_memory failed: %d\n",
+			   rc);
+		goto fail;
+	}
+	rc = gxio_mpipe_register_client_memory(&context, large_buffer_stack,
+					       pte, 0);
+	if (rc != 0) {
+		netdev_err(dev,
+			   "gxio_mpipe_register_buffer_memory failed: %d\n",
+			   rc);
+		goto fail;
+	}
+
+	/* Provide initial buffers. */
+	rc = -ENOMEM;
+	for (i = 0; i < num_buffers; i++) {
+		if (!tile_net_provide_buffer(true)) {
+			netdev_err(dev, "Cannot allocate initial sk_bufs!\n");
+			goto fail_pop;
+		}
+	}
+	for (i = 0; i < num_buffers; i++) {
+		if (!tile_net_provide_buffer(false)) {
+			netdev_err(dev, "Cannot allocate initial sk_bufs!\n");
+			goto fail_pop;
+		}
+	}
+
+	/* Allocate one NotifRing for each network cpu. */
+	rc = gxio_mpipe_alloc_notif_rings(&context, network_cpus_count,
+					  0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_notif_rings failed %d\n",
+			   rc);
+		goto fail_pop;
+	}
+
+	/* Init NotifRings. */
+	ring = rc;
+	next_ring = rc;
+
+	/* ISSUE: This is more than strictly necessary. */
+	comps_size = TILE_NET_CHANNELS * sizeof(struct tile_net_comps);
+
+	notif_ring_size = IQUEUE_ENTRIES * sizeof(gxio_mpipe_idesc_t);
+
+	for_each_online_cpu(cpu) {
+
+		int order;
+		struct page *page;
+		void *addr;
+
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+
+		/* Allocate the "comps". */
+		order = get_order(comps_size);
+		page = homecache_alloc_pages(GFP_KERNEL, order, cpu);
+		if (page == NULL) {
+			netdev_err(dev,
+				   "Failed to alloc %zd bytes comps memory\n",
+				   comps_size);
+			rc = -ENOMEM;
+			goto fail_pop;
+		}
+
+		addr = pfn_to_kaddr(page_to_pfn(page));
+		memset(addr, 0, comps_size);
+		for (i = 0; i < TILE_NET_CHANNELS; i++)
+			info->comps_for_echannel[i] =
+				addr + i * sizeof(struct tile_net_comps);
+
+		/* Only network cpus can receive packets. */
+		if (!cpu_isset(cpu, network_cpus_map))
+			continue;
+
+		/* Allocate the actual idescs array. */
+		order = get_order(notif_ring_size);
+		page = homecache_alloc_pages(GFP_KERNEL, order, cpu);
+		if (page == NULL) {
+			netdev_err(dev,
+				   "Failed to alloc %zd bytes iqueue memory\n",
+				   notif_ring_size);
+			rc = -ENOMEM;
+			goto fail_pop;
+		}
+		addr = pfn_to_kaddr(page_to_pfn(page));
+		rc = gxio_mpipe_iqueue_init(&info->iqueue, &context, next_ring,
+					    addr, notif_ring_size, 0);
+		if (rc != 0) {
+			netdev_err(dev,
+				   "gxio_mpipe_iqueue_init failed: %d\n", rc);
+			goto fail_pop;
+		}
+
+		info->has_iqueue = true;
+
+		next_ring++;
+	}
+
+	/* Allocate one NotifGroup. */
+	rc = gxio_mpipe_alloc_notif_groups(&context, 1, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_notif_groups failed: %d\n",
+			   rc);
+		goto fail_pop;
+	}
+	group = rc;
+
+	if (network_cpus_count > 4)
+		num_buckets = 256;
+	else if (network_cpus_count > 1)
+		num_buckets = 16;
+
+	/* Allocate some buckets. */
+	rc = gxio_mpipe_alloc_buckets(&context, num_buckets, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_buckets failed: %d\n", rc);
+		goto fail_pop;
+	}
+	first_bucket = rc;
+
+	/* Init group and buckets. */
+	rc = gxio_mpipe_init_notif_group_and_buckets(
+		&context, group, ring, network_cpus_count,
+		first_bucket, num_buckets,
+		GXIO_MPIPE_BUCKET_STICKY_FLOW_LOCALITY);
+
+	if (rc != 0) {
+		netdev_err(
+			dev,
+			"gxio_mpipe_init_notif_group_and_buckets failed: %d\n",
+			rc);
+		goto fail_pop;
+	}
+
+	/* Create an irq and register it. Note that "ingress_irq" being
+	 * initialized is how we know not to call this function again.
+	 */
+	rc = create_irq();
+	if (rc < 0) {
+		netdev_err(dev, "create_irq failed: %d\n", rc);
+		goto fail_pop;
+
+	}
+	ingress_irq = rc;
+	tile_irq_activate(ingress_irq, TILE_IRQ_PERCPU);
+	rc = request_irq(ingress_irq, tile_net_handle_ingress_irq,
+			 0, NULL, NULL);
+	if (rc != 0) {
+		netdev_err(dev, "request_irq failed: %d\n", rc);
+		destroy_irq(ingress_irq);
+		ingress_irq = -1;
+		goto fail_pop;
+	}
+
+	for_each_online_cpu(cpu) {
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+		if (info->has_iqueue) {
+			gxio_mpipe_request_notif_ring_interrupt(
+				&context, cpu_x(cpu), cpu_y(cpu),
+				1, ingress_irq, info->iqueue.ring);
+		}
+	}
+
+	return 0;
+
+fail_pop:
+	/* Do cleanups that require the mpipe context first. */
+	tile_net_pop_all_buffers(small_buffer_stack);
+	tile_net_pop_all_buffers(large_buffer_stack);
+
+fail:
+	/* Destroy mpipe context so the hardware no longer owns any memory. */
+	gxio_mpipe_destroy(&context);
+
+	for_each_online_cpu(cpu) {
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+		free_pages((unsigned long)(info->comps_for_echannel[0]),
+			   get_order(comps_size));
+		info->comps_for_echannel[0] = NULL;
+		free_pages((unsigned long)(info->iqueue.idescs),
+			   get_order(notif_ring_size));
+		info->iqueue.idescs = NULL;
+	}
+
+	if (small)
+		free_pages_exact(small, stack_bytes);
+	if (large)
+		free_pages_exact(large, stack_bytes);
+
+	large_buffer_stack = -1;
+	small_buffer_stack = -1;
+	first_bucket = -1;
+
+	return rc;
+}
+
+/* Create persistent egress info for a given egress channel.
+ *
+ * Note that this may be shared between, say, "gbe0" and "xgbe0".
+ *
+ * ISSUE: Defer header allocation until TSO is actually needed?
+ */
+static int tile_net_init_egress(struct net_device *dev, int echannel)
+{
+	size_t headers_order;
+	struct page *headers_page;
+	unsigned char *headers;
+
+	size_t edescs_size;
+	int edescs_order;
+	struct page *edescs_page;
+	gxio_mpipe_edesc_t *edescs;
+
+	int equeue_order;
+	struct page *equeue_page;
+	gxio_mpipe_equeue_t *equeue;
+	int edma;
+
+	int rc = -ENOMEM;
+
+	/* Only initialize once. */
+	if (egress_for_echannel[echannel].equeue != NULL)
+		return 0;
+
+	/* Allocate memory for the "headers". */
+	headers_order = get_order(EQUEUE_ENTRIES * HEADER_BYTES);
+	headers_page = alloc_pages(GFP_KERNEL, headers_order);
+	if (headers_page == NULL) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for TSO headers.\n",
+			    PAGE_SIZE << headers_order);
+		goto fail;
+	}
+	headers = pfn_to_kaddr(page_to_pfn(headers_page));
+
+	/* Allocate memory for the "edescs". */
+	edescs_size = EQUEUE_ENTRIES * sizeof(*edescs);
+	edescs_order = get_order(edescs_size);
+	edescs_page = alloc_pages(GFP_KERNEL, edescs_order);
+	if (edescs_page == NULL) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for eDMA ring.\n",
+			    edescs_size);
+		goto fail_headers;
+	}
+	edescs = pfn_to_kaddr(page_to_pfn(edescs_page));
+
+	/* Allocate memory for the "equeue". */
+	equeue_order = get_order(sizeof(*equeue));
+	equeue_page = alloc_pages(GFP_KERNEL, equeue_order);
+	if (equeue_page == NULL) {
+		netdev_warn(dev,
+			    "Could not alloc %zd bytes for equeue info.\n",
+			    PAGE_SIZE << equeue_order);
+		goto fail_edescs;
+	}
+	equeue = pfn_to_kaddr(page_to_pfn(equeue_page));
+
+	/* Allocate an edma ring.  Note that in practice this can't
+	 * fail, which is good, because we will leak an edma ring if so.
+	 */
+	rc = gxio_mpipe_alloc_edma_rings(&context, 1, 0, 0);
+	if (rc < 0) {
+		netdev_warn(dev, "gxio_mpipe_alloc_edma_rings failed: %d\n",
+			    rc);
+		goto fail_equeue;
+	}
+	edma = rc;
+
+	/* Initialize the equeue. */
+	rc = gxio_mpipe_equeue_init(equeue, &context, edma, echannel,
+				    edescs, edescs_size, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_equeue_init failed: %d\n", rc);
+		goto fail_equeue;
+	}
+
+	/* Done. */
+	egress_for_echannel[echannel].equeue = equeue;
+	egress_for_echannel[echannel].headers = headers;
+	return 0;
+
+fail_equeue:
+	__free_pages(equeue_page, equeue_order);
+
+fail_edescs:
+	__free_pages(edescs_page, edescs_order);
+
+fail_headers:
+	__free_pages(headers_page, headers_order);
+
+fail:
+	return rc;
+}
+
+/* Return channel number for a newly-opened link. */
+static int tile_net_link_open(struct net_device *dev, gxio_mpipe_link_t *link,
+			      const char *link_name)
+{
+	int rc = gxio_mpipe_link_open(link, &context, link_name, 0);
+	if (rc < 0) {
+		netdev_err(dev, "Failed to open '%s'\n", link_name);
+		return rc;
+	}
+	rc = gxio_mpipe_link_channel(link);
+	if (rc < 0 || rc >= TILE_NET_CHANNELS) {
+		netdev_err(dev, "gxio_mpipe_link_channel bad value: %d\n", rc);
+		gxio_mpipe_link_close(link);
+		return -EINVAL;
+	}
+	return rc;
+}
+
+/* Help the kernel activate the given network interface. */
+static int tile_net_open(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	int rc;
+
+	mutex_lock(&tile_net_devs_for_channel_mutex);
+
+	/* Do one-time initialization the first time any device is opened. */
+	if (ingress_irq < 0) {
+		rc = tile_net_init_mpipe(dev);
+		if (rc != 0)
+			goto fail;
+	}
+
+	/* Determine if this is the "loopify" device. */
+	if (unlikely((loopify_link_name != NULL) &&
+		     !strcmp(dev->name, loopify_link_name))) {
+		rc = tile_net_link_open(dev, &priv->link, "loop0");
+		if (rc < 0)
+			goto fail;
+		priv->channel = rc;
+		rc = tile_net_link_open(dev, &priv->loopify_link, "loop1");
+		if (rc < 0)
+			goto fail;
+		priv->loopify_channel = rc;
+		priv->echannel = rc;
+	} else {
+		rc = tile_net_link_open(dev, &priv->link, dev->name);
+		if (rc < 0)
+			goto fail;
+		priv->channel = rc;
+		priv->echannel = rc;
+	}
+
+	/* Initialize egress info (if needed).  Once ever, per echannel. */
+	rc = tile_net_init_egress(dev, priv->echannel);
+	if (rc != 0)
+		goto fail;
+
+	tile_net_devs_for_channel[priv->channel] = dev;
+
+	rc = tile_net_update(dev);
+	if (rc != 0)
+		goto fail;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	/* Start our transmit queue. */
+	netif_start_queue(dev);
+
+	netif_carrier_on(dev);
+
+	return 0;
+
+fail:
+	if (priv->loopify_channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->loopify_link) != 0)
+			netdev_warn(dev, "Failed to close loopify link!\n");
+		priv->loopify_channel = -1;
+	}
+	if (priv->channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->link) != 0)
+			netdev_warn(dev, "Failed to close link!\n");
+		priv->channel = -1;
+	}
+
+	priv->echannel = -1;
+
+	tile_net_devs_for_channel[priv->channel] = NULL;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	/* Don't return raw gxio error codes to generic Linux. */
+	return (rc > -512) ? rc : -EIO;
+}
+
+/* Help the kernel deactivate the given network interface. */
+static int tile_net_stop(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	/* Stop our transmit queue. */
+	netif_stop_queue(dev);
+
+	mutex_lock(&tile_net_devs_for_channel_mutex);
+
+	tile_net_devs_for_channel[priv->channel] = NULL;
+
+	(void)tile_net_update(dev);
+
+	if (priv->loopify_channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->loopify_link) != 0)
+			netdev_warn(dev, "Failed to close loopify link!\n");
+		priv->loopify_channel = -1;
+	}
+
+	if (priv->channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->link) != 0)
+			netdev_warn(dev, "Failed to close link!\n");
+		priv->channel = -1;
+	}
+
+	priv->echannel = -1;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	return 0;
+}
+
+/* Determine the VA for a fragment. */
+static inline void *tile_net_frag_buf(skb_frag_t *f)
+{
+	unsigned long pfn = page_to_pfn(skb_frag_page(f));
+	return pfn_to_kaddr(pfn) + f->page_offset;
+}
+
+/* Used for paranoia to make sure we handle no ill-formed packets. */
+#define TSO_DROP_IF(cond) \
+	do { if (WARN_ON(cond)) return NETDEV_TX_OK; } while (0)
+
+
+/* This function takes "skb", consisting of a header template and a
+ * (presumably) huge payload, and egresses it as one or more segments
+ * (aka packets), each consisting of a (possibly modified) copy of the
+ * header plus a piece of the payload, via "tcp segmentation offload".
+ *
+ * Usually, "data" will contain the header template, of size "sh_len",
+ * and "sh->frags" will contain "skb->data_len" bytes of payload, and
+ * there will be "sh->gso_segs" segments.
+ *
+ * Sometimes, if "sendfile()" requires copying, we will be called with
+ * "data" containing the header and payload, with "frags" being empty.
+ *
+ * Sometimes, for example when using NFS over TCP, a single segment can
+ * span 3 fragments.  This requires special care below.
+ *
+ * See "emulate_large_send_offload()" for some reference code, which
+ * does not handle checksumming.
+ */
+static int tile_net_tx_tso(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+
+	struct tile_net_egress *egress = &egress_for_echannel[priv->echannel];
+	gxio_mpipe_equeue_t *equeue = egress->equeue;
+
+	struct tile_net_comps *comps =
+		info->comps_for_echannel[priv->echannel];
+
+	unsigned int len = skb->len;
+	unsigned char *data = skb->data;
+
+	/* The ip header follows the ethernet header. */
+	struct iphdr *ih = ip_hdr(skb);
+	unsigned int ih_len = ih->ihl * 4;
+
+	/* Note that "nh == iph", by definition. */
+	unsigned char *nh = skb_network_header(skb);
+	unsigned int eh_len = nh - data;
+
+	/* The tcp header follows the ip header. */
+	struct tcphdr *th = (struct tcphdr *)(nh + ih_len);
+	unsigned int th_len = th->doff * 4;
+
+	/* The total number of header bytes. */
+	unsigned int sh_len = eh_len + ih_len + th_len;
+
+	/* Help compute "jh->check". */
+	unsigned int isum_hack =
+		((0xFFFF - ih->check) +
+		 (0xFFFF - ih->tot_len) +
+		 (0xFFFF - ih->id));
+
+	/* Help compute "uh->check". */
+	unsigned int tsum_hack = th->check + (0xFFFF ^ htons(len));
+
+	struct skb_shared_info *sh = skb_shinfo(skb);
+
+	/* The maximum payload size. */
+	unsigned int gso_size = sh->gso_size;
+
+	/* The size of the initial segments (including header). */
+	unsigned int mtu = sh_len + gso_size;
+
+	/* The size of the final segment (including header). */
+	unsigned int mtu2 = len - ((sh->gso_segs - 1) * gso_size);
+
+	/* Track tx stats. */
+	unsigned int tx_packets = 0;
+	unsigned int tx_bytes = 0;
+
+	/* Which segment are we on. */
+	unsigned int segment;
+
+	/* Get the initial ip "id". */
+	u16 id = ntohs(ih->id);
+
+	/* Get the initial tcp "seq". */
+	u32 seq = ntohl(th->seq);
+
+	/* The id of the current fragment (or -1). */
+	long f_id;
+
+	/* The size of the current fragment (or -1). */
+	long f_size;
+
+	/* The bytes used from the current fragment (or -1). */
+	long f_used;
+
+	/* The size of the current piece of payload. */
+	long n;
+
+	/* Prepare checksum info. */
+	unsigned int csum_start = skb_checksum_start_offset(skb);
+
+	/* The header/payload edesc's. */
+	gxio_mpipe_edesc_t edesc_head = { { 0 } };
+	gxio_mpipe_edesc_t edesc_body = { { 0 } };
+
+	/* Total number of edescs needed. */
+	unsigned int num_edescs = 0;
+
+	unsigned long irqflags;
+
+	/* First reserved egress slot. */
+	s64 slot;
+
+	int cid;
+
+	/* Empty packets (etc) would cause trouble below. */
+	TSO_DROP_IF(skb->data_len == 0);
+	TSO_DROP_IF(sh->nr_frags == 0);
+	TSO_DROP_IF(sh->gso_segs == 0);
+
+	/* We assume the frags contain the entire payload. */
+	TSO_DROP_IF(skb_headlen(skb) != sh_len);
+	TSO_DROP_IF(len != sh_len + skb->data_len);
+
+	/* Implicitly verify "gso_segs" and "gso_size". */
+	TSO_DROP_IF(mtu2 > mtu);
+
+	/* We only have HEADER_BYTES for each header. */
+	TSO_DROP_IF(NET_IP_ALIGN + sh_len > HEADER_BYTES);
+
+	/* Paranoia. */
+	TSO_DROP_IF(skb->protocol != htons(ETH_P_IP));
+	TSO_DROP_IF(ih->protocol != IPPROTO_TCP);
+	TSO_DROP_IF(skb->ip_summed != CHECKSUM_PARTIAL);
+	TSO_DROP_IF(csum_start != eh_len + ih_len);
+
+	/* NOTE: ".hwb = 0", so ".size" is unused.
+	 * NOTE: ".stack_idx" determines the TLB.
+	 */
+
+	/* Prepare to egress the headers. */
+	edesc_head.csum = 1;
+	edesc_head.csum_start = csum_start;
+	edesc_head.csum_dest = csum_start + skb->csum_offset;
+	edesc_head.xfer_size = sh_len;
+	edesc_head.stack_idx = large_buffer_stack;
+
+	/* Prepare to egress the body. */
+	edesc_body.stack_idx = large_buffer_stack;
+
+	/* Reset. */
+	f_id = -1;
+	f_size = -1;
+	f_used = -1;
+
+	/* Determine how many edesc's are needed. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+
+		/* Detect the final segment. */
+		bool final = (segment == sh->gso_segs - 1);
+
+		/* The segment size (including header). */
+		unsigned int s_len = final ? mtu2 : mtu;
+
+		/* The size of the payload. */
+		unsigned int p_len = s_len - sh_len;
+
+		/* The bytes used from the payload. */
+		unsigned int p_used = 0;
+
+		/* One edesc for the header. */
+		num_edescs++;
+
+		/* One edesc for each piece of the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			num_edescs++;
+		}
+	}
+
+	/* Verify all fragments consumed. */
+	TSO_DROP_IF(f_id + 1 != sh->nr_frags);
+	TSO_DROP_IF(f_used != f_size);
+
+	local_irq_save(irqflags);
+
+	/* Reserve slots, or return NETDEV_TX_BUSY if "full". */
+	slot = gxio_mpipe_equeue_try_reserve(equeue, num_edescs);
+	if (slot < 0) {
+		local_irq_restore(irqflags);
+		/* ISSUE: "Virtual device xxx asks to queue packet". */
+		return NETDEV_TX_BUSY;
+	}
+
+	/* Reset. */
+	f_id = -1;
+	f_size = -1;
+	f_used = -1;
+
+	/* Prepare all the headers. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+
+		/* Detect the final segment. */
+		bool final = (segment == sh->gso_segs - 1);
+
+		/* The segment size (including header). */
+		unsigned int s_len = final ? mtu2 : mtu;
+
+		/* The size of the payload. */
+		unsigned int p_len = s_len - sh_len;
+
+		/* The bytes used from the payload. */
+		unsigned int p_used = 0;
+
+		/* Access the header memory for this segment. */
+		unsigned int bn = slot % EQUEUE_ENTRIES;
+		unsigned char *buf =
+			egress->headers + bn * HEADER_BYTES + NET_IP_ALIGN;
+
+		/* The soon-to-be copied "ip" header. */
+		struct iphdr *jh = (struct iphdr *)(buf + eh_len);
+
+		/* The soon-to-be copied "tcp" header. */
+		struct tcphdr *uh = (struct tcphdr *)(buf + eh_len + ih_len);
+
+		unsigned int jsum;
+
+		/* Copy the header. */
+		memcpy(buf, data, sh_len);
+
+		/* The packet size, not including ethernet header. */
+		jh->tot_len = htons(s_len - eh_len);
+
+		/* Update the ip "id". */
+		jh->id = htons(id);
+
+		/* Compute the "ip checksum". */
+		jsum = isum_hack + htons(s_len - eh_len) + htons(id);
+		jh->check = csum_long(jsum) ^ 0xffff;
+
+		/* Update the tcp "seq". */
+		uh->seq = htonl(seq);
+
+		/* Update some flags. */
+		if (!final) {
+			uh->fin = 0;
+			uh->psh = 0;
+		}
+
+		/* Compute the tcp pseudo-header checksum. */
+		uh->check = csum_long(tsum_hack + htons(s_len));
+
+		/* Skip past the header. */
+		slot++;
+
+		/* Skip past the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			slot++;
+		}
+
+		id++;
+		seq += p_len;
+	}
+
+	/* Reset "slot". */
+	slot -= num_edescs;
+
+	/* Flush the headers. */
+	wmb();
+
+	/* Reset. */
+	f_id = -1;
+	f_size = -1;
+	f_used = -1;
+
+	/* Egress all the edescs. */
+	for (segment = 0; segment < sh->gso_segs; segment++) {
+
+		/* Detect the final segment. */
+		bool final = (segment == sh->gso_segs - 1);
+
+		/* The segment size (including header). */
+		unsigned int s_len = final ? mtu2 : mtu;
+
+		/* The size of the payload. */
+		unsigned int p_len = s_len - sh_len;
+
+		/* The bytes used from the payload. */
+		unsigned int p_used = 0;
+
+		/* Access the header memory for this segment. */
+		unsigned int bn = slot % EQUEUE_ENTRIES;
+		unsigned char *buf =
+			egress->headers + bn * HEADER_BYTES + NET_IP_ALIGN;
+
+		void *va;
+
+		/* Egress the header. */
+		edesc_head.va = va_to_tile_io_addr(buf);
+		gxio_mpipe_equeue_put_at(equeue, edesc_head, slot);
+		slot++;
+
+		/* Egress the payload. */
+		while (p_used < p_len) {
+
+			/* Advance as needed. */
+			while (f_used >= f_size) {
+				f_id++;
+				f_size = sh->frags[f_id].size;
+				f_used = 0;
+			}
+
+			va = tile_net_frag_buf(&sh->frags[f_id]) + f_used;
+
+			/* Use bytes from the current fragment. */
+			n = p_len - p_used;
+			if (n > f_size - f_used)
+				n = f_size - f_used;
+			f_used += n;
+			p_used += n;
+
+			/* Egress a piece of the payload. */
+			edesc_body.va = va_to_tile_io_addr(va);
+			edesc_body.xfer_size = n;
+			edesc_body.bound = !(p_used < p_len);
+			gxio_mpipe_equeue_put_at(equeue, edesc_body, slot);
+			slot++;
+		}
+
+		tx_packets++;
+		tx_bytes += s_len;
+	}
+
+	/* Wait for a free completion entry.
+	 * ISSUE: Is this the best logic?
+	 * ISSUE: Can this cause undesirable "blocking"?
+	 */
+	while (comps->comp_next - comps->comp_last >= TILE_NET_MAX_COMPS - 1)
+		tile_net_free_comps(equeue, comps, 32, false);
+
+	/* Update the completions array. */
+	cid = comps->comp_next % TILE_NET_MAX_COMPS;
+	comps->comp_queue[cid].when = slot;
+	comps->comp_queue[cid].skb = skb;
+	comps->comp_next++;
+
+	/* Update stats. */
+	atomic_add(tx_packets, (atomic_t *)&priv->stats.tx_packets);
+	atomic_add(tx_bytes, (atomic_t *)&priv->stats.tx_bytes);
+
+	local_irq_restore(irqflags);
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer(info);
+
+	return NETDEV_TX_OK;
+}
+
+/* Analyze the body and frags for a transmit request. */
+static unsigned int tile_net_tx_frags(struct frag *frags,
+				       struct sk_buff *skb,
+				       void *b_data, unsigned int b_len)
+{
+	unsigned int i, n = 0;
+
+	struct skb_shared_info *sh = skb_shinfo(skb);
+
+	if (b_len != 0) {
+		frags[n].buf = b_data;
+		frags[n++].length = b_len;
+	}
+
+	for (i = 0; i < sh->nr_frags; i++) {
+		skb_frag_t *f = &sh->frags[i];
+		frags[n].buf = tile_net_frag_buf(f);
+		frags[n++].length = skb_frag_size(f);
+	}
+
+	return n;
+}
+
+/* Help the kernel transmit a packet. */
+static int tile_net_tx(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+
+	struct tile_net_egress *egress = &egress_for_echannel[priv->echannel];
+	gxio_mpipe_equeue_t *equeue = egress->equeue;
+
+	struct tile_net_comps *comps =
+		info->comps_for_echannel[priv->echannel];
+
+	struct skb_shared_info *sh = skb_shinfo(skb);
+
+	unsigned int len = skb->len;
+	unsigned char *data = skb->data;
+
+	unsigned int num_frags;
+	struct frag frags[MAX_FRAGS];
+	gxio_mpipe_edesc_t edescs[MAX_FRAGS];
+
+	unsigned int i;
+
+	int cid;
+
+	s64 slot;
+
+	unsigned long irqflags;
+
+	/* Save the timestamp. */
+	dev->trans_start = jiffies;
+
+#ifdef TILE_NET_DUMP_PACKETS
+	/* ISSUE: Does not dump the "frags". */
+	dump_packet(data, skb_headlen(skb), "tx");
+#endif
+
+	if (sh->gso_size != 0)
+		return tile_net_tx_tso(skb, dev);
+
+	/* NOTE: This is usually 2, sometimes 3, for big writes. */
+	num_frags = tile_net_tx_frags(frags, skb, data, skb_headlen(skb));
+
+	/* Prepare the edescs. */
+	for (i = 0; i < num_frags; i++) {
+
+		/* NOTE: ".hwb = 0", so ".size" is unused.
+		 * NOTE: ".stack_idx" determines the TLB.
+		 */
+
+		gxio_mpipe_edesc_t edesc = { { 0 } };
+
+		/* Prepare the basic command. */
+		edesc.bound = (i == num_frags - 1);
+		edesc.xfer_size = frags[i].length;
+		edesc.va = va_to_tile_io_addr(frags[i].buf);
+		edesc.stack_idx = large_buffer_stack;
+
+		edescs[i] = edesc;
+	}
+
+	/* Add checksum info if needed. */
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		unsigned int csum_start = skb->csum_start - skb_headroom(skb);
+		edescs[0].csum = 1;
+		edescs[0].csum_start = csum_start;
+		edescs[0].csum_dest = csum_start + skb->csum_offset;
+	}
+
+	local_irq_save(irqflags);
+
+	/* Reserve slots, or return NETDEV_TX_BUSY if "full". */
+	slot = gxio_mpipe_equeue_try_reserve(equeue, num_frags);
+	if (slot < 0) {
+		local_irq_restore(irqflags);
+		/* ISSUE: "Virtual device xxx asks to queue packet". */
+		return NETDEV_TX_BUSY;
+	}
+
+	for (i = 0; i < num_frags; i++)
+		gxio_mpipe_equeue_put_at(equeue, edescs[i], slot + i);
+
+	/* Wait for a free completion entry.
+	 * ISSUE: Is this the best logic?
+	 * ISSUE: Can this cause undesirable "blocking"?
+	 */
+	while (comps->comp_next - comps->comp_last >= TILE_NET_MAX_COMPS - 1)
+		tile_net_free_comps(equeue, comps, 32, false);
+
+	/* Update the completions array. */
+	cid = comps->comp_next % TILE_NET_MAX_COMPS;
+	comps->comp_queue[cid].when = slot + num_frags;
+	comps->comp_queue[cid].skb = skb;
+	comps->comp_next++;
+
+	/* HACK: Track "expanded" size for short packets (e.g. 42 < 60). */
+	atomic_add(1, (atomic_t *)&priv->stats.tx_packets);
+	atomic_add((len >= ETH_ZLEN) ? len : ETH_ZLEN,
+		   (atomic_t *)&priv->stats.tx_bytes);
+
+	local_irq_restore(irqflags);
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer(info);
+
+	return NETDEV_TX_OK;
+}
+
+/* Deal with a transmit timeout. */
+static void tile_net_tx_timeout(struct net_device *dev)
+{
+	/* ISSUE: This doesn't seem useful for us. */
+	netif_wake_queue(dev);
+}
+
+/* Ioctl commands. */
+static int tile_net_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+	return -EOPNOTSUPP;
+}
+
+/* Get system network statistics for device. */
+static struct net_device_stats *tile_net_get_stats(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	return &priv->stats;
+}
+
+/* Change the MTU. */
+static int tile_net_change_mtu(struct net_device *dev, int new_mtu)
+{
+	/* Check ranges. */
+	if ((new_mtu < 68) || (new_mtu > 1500))
+		return -EINVAL;
+
+	/* Accept the value. */
+	dev->mtu = new_mtu;
+
+	return 0;
+}
+
+/* Change the Ethernet address of the NIC.
+ *
+ * The hypervisor driver does not support changing MAC address.  However,
+ * the hardware does not do anything with the MAC address, so the address
+ * which gets used on outgoing packets, and which is accepted on incoming
+ * packets, is completely up to us.
+ *
+ * Returns 0 on success, negative on failure.
+ */
+static int tile_net_set_mac_address(struct net_device *dev, void *p)
+{
+	struct sockaddr *addr = p;
+
+	if (!is_valid_ether_addr(addr->sa_data))
+		return -EINVAL;
+
+	memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
+
+	return 0;
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+/* Polling 'interrupt' - used by things like netconsole to send skbs
+ * without having to re-enable interrupts. It's not called while
+ * the interrupt routine is executing.
+ */
+static void tile_net_netpoll(struct net_device *dev)
+{
+	disable_percpu_irq(ingress_irq);
+	tile_net_handle_ingress_irq(ingress_irq, NULL);
+	enable_percpu_irq(ingress_irq, 0);
+}
+#endif
+
+static const struct net_device_ops tile_net_ops = {
+	.ndo_open = tile_net_open,
+	.ndo_stop = tile_net_stop,
+	.ndo_start_xmit = tile_net_tx,
+	.ndo_do_ioctl = tile_net_ioctl,
+	.ndo_get_stats = tile_net_get_stats,
+	.ndo_change_mtu = tile_net_change_mtu,
+	.ndo_tx_timeout = tile_net_tx_timeout,
+	.ndo_set_mac_address = tile_net_set_mac_address,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_poll_controller = tile_net_netpoll,
+#endif
+};
+
+/* The setup function.
+ *
+ * This uses ether_setup() to assign various fields in dev, including
+ * setting IFF_BROADCAST and IFF_MULTICAST, then sets some extra fields.
+ */
+static void tile_net_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+
+	dev->netdev_ops = &tile_net_ops;
+	dev->watchdog_timeo = TILE_NET_TIMEOUT;
+
+	/* We want lockless xmit. */
+	dev->features |= NETIF_F_LLTX;
+
+	/* We support hardware tx checksums. */
+	dev->features |= NETIF_F_HW_CSUM;
+
+	/* We support scatter/gather. */
+	dev->features |= NETIF_F_SG;
+
+#ifdef TILE_NET_GSO
+	/* We support GSO. */
+	dev->features |= NETIF_F_GSO;
+#endif
+
+#ifdef TILE_NET_TSO
+	/* We support TSO. */
+	dev->features |= NETIF_F_TSO;
+#endif
+
+	dev->tx_queue_len = TILE_NET_TX_QUEUE_LEN;
+
+	dev->mtu = 1500;
+}
+
+/* Allocate the device structure, register the device, and obtain the
+ * MAC address from the hypervisor.
+ */
+static void tile_net_dev_init(const char *name, const uint8_t *mac)
+{
+	int ret;
+	int i;
+	int nz_addr = 0;
+	struct net_device *dev;
+	struct tile_net_priv *priv;
+
+	/* HACK: Ignore "loop" links. */
+	if (strncmp(name, "loop", 4) == 0)
+		return;
+
+	/* Allocate the device structure.  This allocates "priv", calls
+	 * tile_net_setup(), and saves "name".  Normally, "name" is a
+	 * template, instantiated by register_netdev(), but not for us.
+	 */
+	dev = alloc_netdev(sizeof(*priv), name, tile_net_setup);
+	if (!dev) {
+		pr_err("alloc_netdev(%s) failed\n", name);
+		return;
+	}
+
+	priv = netdev_priv(dev);
+
+	/* Initialize "priv". */
+
+	memset(priv, 0, sizeof(*priv));
+
+	priv->dev = dev;
+
+	priv->channel = -1;
+	priv->loopify_channel = -1;
+	priv->echannel = -1;
+
+	/* Register the network device. */
+	ret = register_netdev(dev);
+	if (ret) {
+		netdev_err(dev, "register_netdev failed %d\n", ret);
+		free_netdev(dev);
+		return;
+	}
+
+	/* Get the MAC address and set it in the device struct; this must
+	 * be done before the device is opened.  If the MAC is all zeroes,
+	 * we use a random address, since we're probably on the simulator.
+	 */
+	for (i = 0; i < 6; i++)
+		nz_addr |= mac[i];
+
+	if (nz_addr) {
+		memcpy(dev->dev_addr, mac, 6);
+		dev->addr_len = 6;
+	} else {
+		random_ether_addr(dev->dev_addr);
+	}
+}
+
+/* Module initialization. */
+static int __init tile_net_init_module(void)
+{
+	int i;
+	char name[GXIO_MPIPE_LINK_NAME_LEN];
+	uint8_t mac[6];
+
+	pr_info("Tilera Network Driver\n");
+
+	mutex_init(&tile_net_devs_for_channel_mutex);
+
+	/* Initialize each CPU. */
+	on_each_cpu(tile_net_prepare_cpu, NULL, 1);
+
+	/* Find out what devices we have, and initialize them. */
+	for (i = 0; gxio_mpipe_link_enumerate_mac(i, name, mac) >= 0; i++)
+		tile_net_dev_init(name, mac);
+
+	if (!network_cpus_init())
+		network_cpus_map = *cpu_online_mask;
+
+	return 0;
+}
+
+module_init(tile_net_init_module);
-- 
1.6.5.2

^ permalink raw reply related

* Re: [v12 PATCH 2/3] NETFILTER module xt_hmark, new target for HASH based fwmark
From: Pablo Neira Ayuso @ 2012-05-09 10:38 UTC (permalink / raw)
  To: Hans Schillstrom
  Cc: kaber@trash.net, jengelh@medozas.de,
	netfilter-devel@vger.kernel.org, netdev@vger.kernel.org,
	hans@schillstrom.com
In-Reply-To: <201205080937.36853.hans.schillstrom@ericsson.com>

On Tue, May 08, 2012 at 09:37:35AM +0200, Hans Schillstrom wrote:
> From d5065af3988cc7561a02f30bae8342e1a89126a4 Mon Sep 17 00:00:00 2001
> From: Hans Schillstrom <hans.schillstrom@ericsson.com>
> Date: Wed, 2 May 2012 07:49:47 +0000
> Subject: netfilter: add xt_hmark target for hash-based skb
>  marking
> 
> The target allows you to create rules in the "raw" and "mangle" tables
> which set the skbuff mark by means of hash calculation within a given
> range. The nfmark can influence the routing method (see "Use netfilter
> MARK value as routing key") and can also be used by other subsystems to
> change their behaviour.
> 
> Some examples:
> 
> * Default rule handles all TCP, UDP, SCTP, ESP & AH
> 
>  iptables -t mangle -A PREROUTING -m state --state NEW,ESTABLISHED,RELATED \
> 	-j HMARK --hmark-offset 10000 --hmark-mod 10
> 
> * Handle SCTP and hash dest port only and produce a nfmark between 100-119.
> 
>  iptables -t mangle -A PREROUTING -p SCTP -j HMARK --src-mask 0 --dst-mask 0 \
> 	--sp-mask 0 --offset 100 --mod 20
> 
> * Fragment safe Layer 3 only, that keep a class C network flow together
> 
>  iptables -t mangle -A PREROUTING -j HMARK --method L3 \
> 	--src-mask 24 --mod 20 --offset 100

I have removed these examples. Just in case we make changes to the
user-space part. We'll have the time for this (the entire 3.5 cycle).

Some minor glitches I made on this patch:

>  include/linux/netfilter/xt_HMARK.h |   48 +++++
>  net/netfilter/Kconfig              |   15 ++
>  net/netfilter/Makefile             |    1 +
>  net/netfilter/xt_HMARK.c           |  358 ++++++++++++++++++++++++++++++++++++
>  4 files changed, 422 insertions(+)
>  create mode 100644 include/linux/netfilter/xt_HMARK.h
>  create mode 100644 net/netfilter/xt_HMARK.c
> 
> diff --git a/include/linux/netfilter/xt_HMARK.h b/include/linux/netfilter/xt_HMARK.h
> new file mode 100644
> index 0000000..05e43ba
> --- /dev/null
> +++ b/include/linux/netfilter/xt_HMARK.h
> @@ -0,0 +1,46 @@
> +#ifndef XT_HMARK_H_
> +#define XT_HMARK_H_
> +
> +#include <linux/types.h>
> +
> +enum {
> +	XT_HMARK_NONE,

this means (1 << 0) is unused. I have removed this _NONE.

> +	XT_HMARK_SADR_MASK,
> +	XT_HMARK_DADR_MASK,
> +	XT_HMARK_SPI_MASK,
> +	XT_HMARK_SPI,
> +	XT_HMARK_SPORT_MASK,
> +	XT_HMARK_DPORT_MASK,
> +	XT_HMARK_SPORT,
> +	XT_HMARK_DPORT,
> +	XT_HMARK_PROTO_MASK,
> +	XT_HMARK_RND,
> +	XT_HMARK_MODULUS,
> +	XT_HMARK_OFFSET,
> +	XT_HMARK_CT,
> +	XT_HMARK_METHOD_L3,
> +	XT_HMARK_METHOD_L3_4,

I have also rearrange the order of the flags:

enum {
        XT_HMARK_SADDR_MASK,
        XT_HMARK_DADDR_MASK,
        XT_HMARK_SPI,       
        XT_HMARK_SPI_MASK,  
        XT_HMARK_SPORT,     
        XT_HMARK_DPORT,     
        XT_HMARK_SPORT_MASK,
        XT_HMARK_DPORT_MASK,
        XT_HMARK_PROTO_MASK,
        XT_HMARK_RND,       
        XT_HMARK_MODULUS,   
        XT_HMARK_OFFSET,    
        XT_HMARK_CT,        
        XT_HMARK_METHOD_L3, 
        XT_HMARK_METHOD_L3_4,
};

I don't want people to ask me why we where using some strange order in
the flag definition in the future (yes, you'll have to recompile your
iptables HMARK support in your setups, sorry)

> +};
> +#define XT_HMARK_FLAG(flag)	(1 << flag)
> +
> +union hmark_ports {
> +	struct {
> +		__u16	src;
> +		__u16	dst;
> +	} p16;
> +	__u32	v32;
> +};
> +
> +struct xt_hmark_info {
> +	union nf_inet_addr	src_mask;	/* Source address mask */
> +	union nf_inet_addr	dst_mask;	/* Dest address mask */
> +	union hmark_ports	port_mask;
> +	union hmark_ports	port_set;
> +	__u32			flags;		/* Print out only */
> +	__u16			proto_mask;	/* L4 Proto mask */
> +	__u32			hashrnd;
> +	__u32			hmodulus;	/* Modulus */
> +	__u32			hoffset;	/* Offset */

I've removed these comments, they provide no extra information. Still
I left the one that described hoffset, that may seem not obvious.

> +#endif /* XT_HMARK_H_ */

I have applied this, I'm going to pass it to davem.

^ permalink raw reply

* Re: [PATCH 7/9] net: add skb_orphan_frags to copy aside frags with destructors
From: Ian Campbell @ 2012-05-09  9:36 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev@vger.kernel.org, David Miller, Eric Dumazet
In-Reply-To: <20120507102441.GA18591@redhat.com>


> > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > index 945b807..f009abb 100644
> > --- a/net/core/skbuff.c
> > +++ b/net/core/skbuff.c
> > @@ -697,31 +697,25 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
> >  }
> >  EXPORT_SYMBOL_GPL(skb_morph);
> >  
> > -/*	skb_copy_ubufs	-	copy userspace skb frags buffers to kernel
> > - *	@skb: the skb to modify
> > - *	@gfp_mask: allocation priority
> > - *
> > - *	This must be called on SKBTX_DEV_ZEROCOPY skb.
> > - *	It will copy all frags into kernel and drop the reference
> > - *	to userspace pages.
> > - *
> > - *	If this function is called from an interrupt gfp_mask() must be
> > - *	%GFP_ATOMIC.
> > - *
> > - *	Returns 0 on success or a negative error code on failure
> > - *	to allocate kernel memory to copy to.
> > +/*
> > + * If uarg != NULL copy and replace all frags.
> > + * If uarg == NULL then only copy and replace those which have a destructor
> > + * pointer.
> >   */
> > -int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
> > +static int skb_copy_frags(struct sk_buff *skb, gfp_t gfp_mask,
> > +			  struct ubuf_info *uarg)
> >  {
> >  	int i;
> >  	int num_frags = skb_shinfo(skb)->nr_frags;
> >  	struct page *page, *head = NULL;
> > -	struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;
> >  
> >  	for (i = 0; i < num_frags; i++) {
> >  		u8 *vaddr;
> >  		skb_frag_t *f = &skb_shinfo(skb)->frags[i];
> >  
> > +		if (!uarg && !f->page.destructor)
> > +			continue;
> > +
> >  		page = alloc_page(GFP_ATOMIC);
> >  		if (!page) {
> >  			while (head) {
> > @@ -739,11 +733,16 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
> >  		head = page;
> >  	}
> >  
> > -	/* skb frags release userspace buffers */
> > -	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
> > +	/* skb frags release buffers */
> > +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
> > +		skb_frag_t *f = &skb_shinfo(skb)->frags[i];
> > +		if (!uarg && !f->page.destructor)
> > +			continue;
> >  		skb_frag_unref(skb, i);
> > +	}
> >  
> > -	uarg->callback(uarg);
> > +	if (uarg)
> > +		uarg->callback(uarg);
> >  
> 
> So above we only linked up copied pages, but below we
> try to use the list for all frags. Looks like a bug,
> I think it needs to check destructor and uarg too.

You are absolutely correct. We need the same check in all three loops.

> 
> 
> >  	/* skb frags point to kernel buffers */
> >  	for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) {
> > @@ -752,10 +751,41 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
> >  		head = (struct page *)head->private;
> >  	}
> >  
> > -	skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
> >  	return 0;
> >  }

^ permalink raw reply

* Re: IP_MULTICAST_IF getsockopt man part
From: Jiri Pirko @ 2012-05-09  9:34 UTC (permalink / raw)
  To: Michael Kerrisk (man-pages)
  Cc: linux-man-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <CAKgNAkgBz_dRV97JVV1Np0YuEd0EVqoLPZkajiE_KJ2Lk20YRA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

Sat, May 05, 2012 at 01:57:35PM CEST, mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org wrote:
>On Fri, May 4, 2012 at 9:00 PM, Jiri Pirko <jpirko-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
>> Hi.
>>
>> <quote>
>> IP_MULTICAST_IF (since Linux 1.2)
>>                      Set the local device for a multicast socket.
>>                      Argument is an ip_mreqn or ip_mreq structure
>>                      similar to IP_ADD_MEMBERSHIP.
>> </quote>
>>
>> That is not true. Setsockopt recognizes only ip_mreqn and in_addr. I
>> made patch which makes it recognize ip_mreq as well. So that would be
>> probably ok.
>> http://patchwork.ozlabs.org/patch/156815/
>>
>> On the other hand, getsockopt works only with in_addr. That I think is
>> good behaviour but manpages here needs to be corrected in this way (read
>> part needs to be added here)
>
>Jirka,
>
>I'm having trouble to understand what you mean. Perhaps it would be
>simplest if you showed your proposed replacement text for the text
>quoted above.

<orig>
IP_MULTICAST_IF (since Linux 1.2)
Set the local device for a multicast socket. Argument is an ip_mreqn
or ip_mreq structure similar to IP_ADD_MEMBERSHIP.
</orig>
<new>
IP_MULTICAST_IF (since Linux 1.2)
Set or read the local device for a multicast socket.
Argument for set is an ip_mreqn or ip_mreq or addr_in structure.
Argument for read is an addr_in structure.
</new>


Jirka



>
>Thanks,
>
>Michael
>
>
>
>-- 
>Michael Kerrisk
>Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
>Author of "The Linux Programming Interface"; http://man7.org/tlpi/
--
To unsubscribe from this list: send the line "unsubscribe linux-man" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [net-next] e1000e: Fix merge conflict (net->net-next)
From: Jeff Kirsher @ 2012-05-09  9:23 UTC (permalink / raw)
  To: davem; +Cc: Jeff Kirsher, netdev, gospo, sassmann

During merge of net to net-next the changes in patch:

e1000e: Fix default interrupt throttle rate not set in NIC HW

got munged in param.c of the e1000e driver.  This rectifies the
merge issues.

Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

This patch is also available on my net-next tree on kernel.org if you
want to pull this patch.

---
 drivers/net/ethernet/intel/e1000e/param.c |   56 ++++++-----------------------
 1 files changed, 11 insertions(+), 45 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/param.c b/drivers/net/ethernet/intel/e1000e/param.c
index 42444d1..55cc156 100644
--- a/drivers/net/ethernet/intel/e1000e/param.c
+++ b/drivers/net/ethernet/intel/e1000e/param.c
@@ -344,50 +344,16 @@ void __devinit e1000e_check_options(struct e1000_adapter *adapter)
 
 		if (num_InterruptThrottleRate > bd) {
 			adapter->itr = InterruptThrottleRate[bd];
-			switch (adapter->itr) {
-			case 0:
-				e_info("%s turned off\n", opt.name);
-				break;
-			case 1:
-				e_info("%s set to dynamic mode\n", opt.name);
-				adapter->itr_setting = adapter->itr;
-				adapter->itr = 20000;
-				break;
-			case 3:
-				e_info("%s set to dynamic conservative mode\n",
-					opt.name);
-				adapter->itr_setting = adapter->itr;
-				adapter->itr = 20000;
-				break;
-			case 4:
-				e_info("%s set to simplified (2000-8000 ints) mode\n",
-				       opt.name);
-				adapter->itr_setting = 4;
-				break;
-			default:
-				/*
-				 * Save the setting, because the dynamic bits
-				 * change itr.
-				 */
-				if (e1000_validate_option(&adapter->itr, &opt,
-							  adapter) &&
-				    (adapter->itr == 3)) {
-					/*
-					 * In case of invalid user value,
-					 * default to conservative mode.
-					 */
-					adapter->itr_setting = adapter->itr;
-					adapter->itr = 20000;
-				} else {
-					/*
-					 * Clear the lower two bits because
-					 * they are used as control.
-					 */
-					adapter->itr_setting =
-						adapter->itr & ~3;
-				}
-				break;
-			}
+
+			/*
+			 * Make sure a message is printed for non-special
+			 * values. And in case of an invalid option, display
+			 * warning, use default and go through itr/itr_setting
+			 * adjustment logic below
+			 */
+			if ((adapter->itr > 4) &&
+			    e1000_validate_option(&adapter->itr, &opt, adapter))
+				adapter->itr = opt.def;
 		} else {
 			/*
 			 * If no option specified, use default value and go
@@ -399,7 +365,7 @@ void __devinit e1000e_check_options(struct e1000_adapter *adapter)
 			 * Make sure a message is printed for non-special
 			 * default values
 			 */
-			if (adapter->itr > 40)
+			if (adapter->itr > 4)
 				e_info("%s set to default %d\n", opt.name,
 				       adapter->itr);
 		}
-- 
1.7.7.6

^ permalink raw reply related

* Memory exhaust issue with only IPsec policies configured on continuous traffic
From: Agarwal Nikhil-B38457 @ 2012-05-09  9:23 UTC (permalink / raw)
  To: linux-kernel@vger.kernel.org, netdev@vger.kernel.org
In-Reply-To: <F0B93D0554724543876B9A180DC988575B70E7@039-SN1MPN1-001.039d.mgd.msft.net>

Hi all,
               In a typical scenario, when IPSEC policies are configured in the system but SA is not present or negotiation fails or IKE daemon is not running.  The current behavior of xfrm is to send those matching packets to blackhole route.  i.e. xfrm_bundle_lookup returns a bundle with null route and xfrm_lookup returns a blackhole route.

For each of these packet a dst_alloc is called in ipv4_blackhole_route. However when these skbs get free and their dst's get discarded using dst_free and the garbage collector is scheduled using cancel_delayed_work and schedule_delayed_work.

If the packets are coming continuously garbage collector may not get scheduled and large amount of memory is stuck to be freed causing the system to go into non-recoverable state.

Any ideas? 

Regards
Nikhil

^ permalink raw reply

* Memory exhaust issue with only IPsec policies configured on continuous traffic
From: Agarwal Nikhil-B38457 @ 2012-05-09  9:10 UTC (permalink / raw)
  To: linux-kernel@vger.kernel.org, linux-crypto@vger.kernel.org,
	linux-netdev@vger.kernel.org, netdev@vger.kernel.org

Hi all,
               In a typical scenario, when IPSEC policies are configured in the system but SA is not present or negotiation fails or IKE daemon is not running.  The current behavior of xfrm is to send those matching packets to blackhole route.  i.e. xfrm_bundle_lookup returns a bundle with null route and xfrm_lookup returns a blackhole route.

For each of these packet a dst_alloc is called in ipv4_blackhole_route. However when these skbs get free and their dst's get discarded using dst_free and the garbage collector is scheduled using cancel_delayed_work and schedule_delayed_work.

If the packets are coming continuously garbage collector may not get scheduled and large amount of memory is stuck to be freed causing the system to go into non-recoverable state.

Any ideas? 

Regards
Nikhil

^ permalink raw reply

* Re: [PATCH 0/2] Add and use vsprintf extension %pMbt for bluetooth macs
From: Andrei Emeltchenko @ 2012-05-09  9:01 UTC (permalink / raw)
  To: Joe Perches
  Cc: Marcel Holtmann, Gustavo F. Padovan, linux-bluetooth, netdev,
	linux-kernel
In-Reply-To: <cover.1291419007.git.joe@perches.com>

Hi Joe

On Fri, Dec 03, 2010 at 06:33:02PM -0800, Joe Perches wrote:
> Using vsprintf extensions can save text and data.
> Add %pMbt for the byte reversed output for bluetooth addresses.
> 
> Joe Perches (2):
>   vsprintf: Add %pMbt, bluetooth mac address
>   bluetooth: Use printf extension %pMbt

I think this would be the best way to solve our issues with batostr.

BTW: What is the status with this patch series? I saw that it was acked by
Gustavo but not applied for some reason.

Best regards 
Andrei Emeltchenko 

^ permalink raw reply

* [PATCH] xfrm: make xfrm_algo.c a module
From: Jan Beulich @ 2012-05-09  7:53 UTC (permalink / raw)
  To: davem, netdev; +Cc: linux-kernel

By making this a standalone config option (selected as needed),
selecting CRYPTO from here rather than from XFRM (which is boolean)
allows the core crypto code to become a module again even when XFRM=y.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

---
 net/ipv4/Kconfig     |    4 ++--
 net/ipv6/Kconfig     |    4 ++--
 net/xfrm/Kconfig     |   13 +++++++++----
 net/xfrm/Makefile    |    3 ++-
 net/xfrm/xfrm_algo.c |    5 ++---
 5 files changed, 17 insertions(+), 12 deletions(-)

--- 3.4-rc6/net/ipv4/Kconfig
+++ 3.4-rc6-xfrm-algo-module/net/ipv4/Kconfig
@@ -312,7 +312,7 @@ config SYN_COOKIES
 
 config INET_AH
 	tristate "IP: AH transformation"
-	select XFRM
+	select XFRM_ALGO
 	select CRYPTO
 	select CRYPTO_HMAC
 	select CRYPTO_MD5
@@ -324,7 +324,7 @@ config INET_AH
 
 config INET_ESP
 	tristate "IP: ESP transformation"
-	select XFRM
+	select XFRM_ALGO
 	select CRYPTO
 	select CRYPTO_AUTHENC
 	select CRYPTO_HMAC
--- 3.4-rc6/net/ipv6/Kconfig
+++ 3.4-rc6-xfrm-algo-module/net/ipv6/Kconfig
@@ -69,7 +69,7 @@ config IPV6_OPTIMISTIC_DAD
 
 config INET6_AH
 	tristate "IPv6: AH transformation"
-	select XFRM
+	select XFRM_ALGO
 	select CRYPTO
 	select CRYPTO_HMAC
 	select CRYPTO_MD5
@@ -81,7 +81,7 @@ config INET6_AH
 
 config INET6_ESP
 	tristate "IPv6: ESP transformation"
-	select XFRM
+	select XFRM_ALGO
 	select CRYPTO
 	select CRYPTO_AUTHENC
 	select CRYPTO_HMAC
--- 3.4-rc6/net/xfrm/Kconfig
+++ 3.4-rc6-xfrm-algo-module/net/xfrm/Kconfig
@@ -3,12 +3,17 @@
 #
 config XFRM
        bool
-       select CRYPTO
        depends on NET
 
+config XFRM_ALGO
+	tristate
+	select XFRM
+	select CRYPTO
+
 config XFRM_USER
 	tristate "Transformation user configuration interface"
-	depends on INET && XFRM
+	depends on INET
+	select XFRM_ALGO
 	---help---
 	  Support for Transformation(XFRM) user configuration interface
 	  like IPsec used by native Linux tools.
@@ -48,13 +53,13 @@ config XFRM_STATISTICS
 
 config XFRM_IPCOMP
 	tristate
-	select XFRM
+	select XFRM_ALGO
 	select CRYPTO
 	select CRYPTO_DEFLATE
 
 config NET_KEY
 	tristate "PF_KEY sockets"
-	select XFRM
+	select XFRM_ALGO
 	---help---
 	  PF_KEYv2 socket family, compatible to KAME ones.
 	  They are required if you are going to use IPsec tools ported
--- 3.4-rc6/net/xfrm/Makefile
+++ 3.4-rc6-xfrm-algo-module/net/xfrm/Makefile
@@ -3,8 +3,9 @@
 #
 
 obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \
-		      xfrm_input.o xfrm_output.o xfrm_algo.o \
+		      xfrm_input.o xfrm_output.o \
 		      xfrm_sysctl.o xfrm_replay.o
 obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o
+obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o
 obj-$(CONFIG_XFRM_USER) += xfrm_user.o
 obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o
--- 3.4-rc6/net/xfrm/xfrm_algo.c
+++ 3.4-rc6-xfrm-algo-module/net/xfrm/xfrm_algo.c
@@ -15,9 +15,6 @@
 #include <linux/crypto.h>
 #include <linux/scatterlist.h>
 #include <net/xfrm.h>
-#if defined(CONFIG_INET_AH) || defined(CONFIG_INET_AH_MODULE) || defined(CONFIG_INET6_AH) || defined(CONFIG_INET6_AH_MODULE)
-#include <net/ah.h>
-#endif
 #if defined(CONFIG_INET_ESP) || defined(CONFIG_INET_ESP_MODULE) || defined(CONFIG_INET6_ESP) || defined(CONFIG_INET6_ESP_MODULE)
 #include <net/esp.h>
 #endif
@@ -752,3 +749,5 @@ void *pskb_put(struct sk_buff *skb, stru
 }
 EXPORT_SYMBOL_GPL(pskb_put);
 #endif
+
+MODULE_LICENSE("GPL");

^ permalink raw reply

* Re: [PATCH 3.3.0] phy:icplus:fix Auto Power Saving in ip101a_config_init.
From: Jonathan Nieder @ 2012-05-09  7:25 UTC (permalink / raw)
  To: David Miller; +Cc: srinivas.kandagatla, netdev, peppe.cavallaro
In-Reply-To: <20120403.184937.744089580257691570.davem@davemloft.net>

Hi Dave,

David Miller wrote:
> From: Srinivas KANDAGATLA <srinivas.kandagatla@st.com>

>> This patch fixes Auto Power Saving configuration in ip101a_config_init
>> which was broken as there is no phy register write followed after
>> setting IP101A_APS_ON flag.
>>
>> This patch also fixes the return value of ip101a_config_init.
>>
>> Without this patch ip101a_config_init returns 2 which is not an error
>> accroding to IS_ERR and the mac driver will continue accessing 2 as
>> valid pointer to phy_dev resulting in memory fault.
>>
>> Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@st.com>
>
> Applied and queued up for -stable, thanks.

I'm guessing 3.2.y needs this, too, since it also includes
v3.2-rc1~129^2~247 ("net/phy: add IC+ IP101A and support APS").

Here's a quick backport made by adjusting the context line to
use IP101A_APS_ON instead of IP101A_G_APS_ON.

Please include it in some later stable update if appropriate.

Thanks,
Jonathan

-- >8 --
From: Srinivas Kandagatla <srinivas.kandagatla@st.com>
Date: Mon, 2 Apr 2012 00:02:09 +0000

[ Upstream commit b3300146aa8efc5d3937fd33f3cfdc580a3843bc ]

This patch fixes Auto Power Saving configuration in ip101a_config_init
which was broken as there is no phy register write followed after
setting IP101A_APS_ON flag.

This patch also fixes the return value of ip101a_config_init.

Without this patch ip101a_config_init returns 2 which is not an error
accroding to IS_ERR and the mac driver will continue accessing 2 as
valid pointer to phy_dev resulting in memory fault.

Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jonathan Nieder <jrnieder@gmail.com>
---
 drivers/net/phy/icplus.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/phy/icplus.c b/drivers/net/phy/icplus.c
index c81f136ae670..b14230016d9b 100644
--- a/drivers/net/phy/icplus.c
+++ b/drivers/net/phy/icplus.c
@@ -150,7 +150,8 @@ static int ip101a_config_init(struct phy_device *phydev)
 	/* Enable Auto Power Saving mode */
 	c = phy_read(phydev, IP10XX_SPEC_CTRL_STATUS);
 	c |= IP101A_APS_ON;
-	return c;
+
+	return phy_write(phydev, IP10XX_SPEC_CTRL_STATUS, c);
 }
 
 static int ip175c_read_status(struct phy_device *phydev)
-- 
1.7.10.1

^ permalink raw reply related

* Re: [PATCH net-next 1/2] net_sched/mqprio: add support for different pgroup types
From: John Fastabend @ 2012-05-09  6:36 UTC (permalink / raw)
  To: Amir Vadai; +Cc: David S. Miller, netdev, Oren Duer, Liran Liss
In-Reply-To: <1336287910-12010-2-git-send-email-amirv@mellanox.com>

On 5/6/2012 12:05 AM, Amir Vadai wrote:
> Currently, HW based QoS mechanisms use the framework and means introduced in
> commits 4f57c087d "net: implement mechanism for HW based QOS" and b8970f0bfc
> "net_sched: implement a root container qdisc sch_mqprio".
> 
> The approach present in these patches is strongly orientated to the extended
> transmission selection (ETS) algorithm traffic classes (TC).
> 

I would argue the strongly orientated part per our other thread [0/2]

> This patch enhances the current scheme to allow for these mechanisms to be used
> also with hardware who has queues per UP - user priority (Linux has well
> established mechanisms to set UP for both tagged and untagged traffic).
> 

also per other thread I think this is only needed if you have many different
egress map configurations on vlans.

> Now, __skb_tx_hash() will direct a flow to a tx ring from a range of tx rings.
> This range is defined by the admin through the mqprio scheduler for the
> specific HW. For TC based queues, the range is by TC number and for UP based
> queues, the range is by UP.
> 
> Signed-off-by: Amir Vadai <amirv@mellanox.com>
> ---
>  include/linux/netdevice.h |   27 +++++++++++++++++++++++++++
>  include/linux/pkt_sched.h |    3 ++-
>  net/core/dev.c            |   12 +++++++++---
>  net/sched/sch_mqprio.c    |   11 +++++++++--
>  4 files changed, 47 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 7f377fb..ecdd953 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -835,6 +835,9 @@ struct netdev_fcoe_hbainfo {
>   * 	is always called from the stack with the rtnl lock held and netif tx
>   * 	queues stopped. This allows the netdevice to perform queue management
>   * 	safely.
> + * int (*ndo_set_pg_type)(struct net_device *dev, u8 pg_type)
> + *	Called to setup 'tc' type. According to this type, traffic is
> + *	distributed across tx rings. If not set, ETS TC is in use.
>   *
>   *	Fiber Channel over Ethernet (FCoE) offload functions.
>   * int (*ndo_fcoe_enable)(struct net_device *dev);
> @@ -973,6 +976,8 @@ struct net_device_ops {
>  	int			(*ndo_get_vf_port)(struct net_device *dev,
>  						   int vf, struct sk_buff *skb);
>  	int			(*ndo_setup_tc)(struct net_device *dev, u8 tc);
> +	int			(*ndo_set_pg_type)(struct net_device *dev,
> +						   u8 pg_type);

expand ndo_setup_tc() to take either another parameter 'pg_type' or just
start passing in the entire tc_mqprio_opt that way we get the number of
queues as well. I would prefer passing tc_mpqrio_opt to adding more parameters.

This avoids adding another ndo op.


>  #if IS_ENABLED(CONFIG_FCOE)
>  	int			(*ndo_fcoe_enable)(struct net_device *dev);
>  	int			(*ndo_fcoe_disable)(struct net_device *dev);
> @@ -1307,6 +1312,11 @@ struct net_device {
>  	/* Data Center Bridging netlink ops */
>  	const struct dcbnl_rtnl_ops *dcbnl_ops;
>  #endif
> +	enum {
> +		PGROUP_TC,
> +		PGROUP_UP,
> +		PGROUP_MAX,
> +	} pg_type:8;
>  	u8 num_tc;
>  	struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE];
>  	u8 prio_tc_map[TC_BITMASK + 1];
> @@ -1329,6 +1339,23 @@ struct net_device {
>  #define	NETDEV_ALIGN		32
>  
>  static inline
> +int netdev_get_pg_type(const struct net_device *dev)
> +{
> +	return dev->pg_type;
> +}
> +
> +static inline
> +int netdev_set_pg_type(struct net_device *dev, u8 pg_type)
> +{
> +	if (pg_type >= PGROUP_MAX)
> +		return -EINVAL;
> +
> +	dev->pg_type = pg_type;
> +
> +	return 0;
> +}
> +
> +static inline
>  int netdev_get_prio_tc_map(const struct net_device *dev, u32 prio)
>  {
>  	return dev->prio_tc_map[prio & TC_BITMASK];
> diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
> index ffe975c..1ae7d3c 100644
> --- a/include/linux/pkt_sched.h
> +++ b/include/linux/pkt_sched.h
> @@ -596,7 +596,8 @@ struct tc_drr_stats {
>  struct tc_mqprio_qopt {
>  	__u8	num_tc;
>  	__u8	prio_tc_map[TC_QOPT_BITMASK + 1];
> -	__u8	hw;
> +	__u8	hw;	/* bit 0: hw owned, bits 1-7: hw queuing type.
> +			 * valid types: 0 - ETS TC, 1 - UP */
>  	__u16	count[TC_QOPT_MAX_QUEUE];
>  	__u16	offset[TC_QOPT_MAX_QUEUE];
>  };
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 09024fd..72ac4bf 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2325,9 +2325,15 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
>  	}
>  
>  	if (dev->num_tc) {
> -		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
> -		qoffset = dev->tc_to_txq[tc].offset;
> -		qcount = dev->tc_to_txq[tc].count;
> +		u8 pgroup;
> +
> +		if (dev->pg_type == PGROUP_TC || !vlan_tx_tag_present(skb))
> +			pgroup = netdev_get_prio_tc_map(dev, skb->priority);
> +		else
> +			pgroup = (vlan_tx_tag_get(skb) >> 13);
> +
> +		qoffset = dev->tc_to_txq[pgroup].offset;
> +		qcount = dev->tc_to_txq[pgroup].count;
>  	}
>  
>  	if (skb->sk && skb->sk->sk_hash)
> diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
> index d1831ca..2149cbb 100644
> --- a/net/sched/sch_mqprio.c
> +++ b/net/sched/sch_mqprio.c
> @@ -134,11 +134,18 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
>  		priv->qdiscs[i] = qdisc;
>  	}
>  
> +	if (dev->netdev_ops->ndo_set_pg_type)
> +		err = dev->netdev_ops->ndo_set_pg_type(dev, qopt->hw >> 1);
> +	else
> +		err = netdev_set_pg_type(dev, PGROUP_TC);

Software should still be allowed to set PGROUP_UP even though hardware may
not support it.

> +	if (err)
> +		goto err;
> +
>  	/* If the mqprio options indicate that hardware should own
>  	 * the queue mapping then run ndo_setup_tc otherwise use the
>  	 * supplied and verified mapping
>  	 */
> -	if (qopt->hw) {
> +	if (qopt->hw & 1) {
>  		priv->hw_owned = 1;
>  		err = dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc);
>  		if (err)

^ permalink raw reply

* Re: [PATCH net-next 0/2] extend sch_mqprio to distribute traffic not only by ETS TC
From: John Fastabend @ 2012-05-09  6:22 UTC (permalink / raw)
  To: Amir Vadai
  Cc: David S. Miller, netdev, Oren Duer, Liran Liss, Jamal Hadi Salim
In-Reply-To: <4FA92606.6070401@mellanox.com>

On 5/8/2012 6:56 AM, Amir Vadai wrote:
> On 05/08/2012 03:54 AM, John Fastabend wrote:
>> On 5/6/2012 12:05 AM, Amir Vadai wrote:
>>> This series comes to revive the discussion initiated on the thread "net:
>>> support tx_ring per UP in HW based QoS mechanism" (see
>>> http://marc.info/?t=133165957200004&r=1&w=2) with the major issue to be address
>>> is - how should sk_prio<=>   TC be done, for both, tagged and untagged traffic.
>>> Following is a staged description addressing the background, problem
>>> description, current situation, suggestion for the change and implementation of
>>> it.
>>
>> OK but the mqprio qdisc is only concerned with mapping skb->priority to
>> queue sets I perhaps unfortunately called the queue sets tc's. Try not
>> to get hung up on my perhaps limiting naming of variables and functions.
> I understand that.

I figured you did. Just wanted to state it again.

> 
>>
>> mqprio is used outside of 802.1Q as well in these cases a traffic class
>> is not usually even defined.
>>
>>>

[...]

>>> Implementation
>>> --------------
>>> Extended mqprio hw attribute:
>>> * Bit 1: is queue offset/count owned by HW
>>> * Bits 2-7: HW queueing type.
>>>    * 0 - by ETS TC
>>>    * 1 - by UP
>>>
>>> __skb_tx_hash() is now aware to the HW queuing type (pg_type): for pg_type
>>> being ETS TC, traffic is distributed as it was before - tagged and untagged
>>> packets are distributed by netdev_get_prio_tc_map. For pg_type being UP, tagged
>>> and untagged packets are distributed by UP (taken from egress map for tagged
>>> traffic, or netdev_get_prio_tc_map for untagged).
>>
>> I guess I don't see why we need this. If you keep the mqprio priority to
>> queue set mapping set to 1:1. Then modify the egress map accordingly then
>> this should work right?
>>
>> For example:
>>
>> If we want to map 8 802.1Q priority code points onto 4 traffic classes this
>> should work,
>>
>> egress map: 0:0 1:0 2:1 3:1 4:2 5:2 6:3 7:3<-- vlan layer inserts correct tag
>> mqprio up2tc: 0:0 1:1 2:2 3:3 4:4 5:5 6:6 7:7<-- mqprio with unfortunate 'tc' name maps priority to queues
> But mqprio is mapping sk_priority to queue set, which is different from UP to queue set.
> The term UP which we use, is the 8021q PCP in tagged traffic, and a priority mapped by the host admin for untagged traffic.
> What you wrote, is actually, that the application will set the priority without enabling the host admin to control it.
>> dcbnl up2tc: 0:0 1:0 2:1 3:1 4:2 5:2 6:3 7:3<-- dcbx pushes up2tc mapping correctly
> 
> For example, lets take an application that calls setsockopt(SO_PRIORITY, 2):
> according to egress map: 8021q PCP field in vlan tag should be set to 1 (=UP)
> according to mqprio: a tx ring belonging to UP 2 will be selected.
> according to dcbnl: traffic will have ETS attributes of TC 1
> 
> Except some conceptual problems, it won't work:
> 8021q PCP field is set by HW according to the tx ring (UP=2). which
> is different from the one that the user configured in the egress_map
> (UP=1).

sorry I set it up wrong above but it _can_ be made to work as best I
can tell (for a single egress_map at least)

egress map: 0:0 1:1 2:2 3:3 4:4 5:5 6:6 7:7     <-- ignored
mqprio    : 0:0 1:1 2:2 3:3 4:4 5:5 6:6 7:7     <-- map priority to up queue sets
up2tc     : 0:0 1:0 2:1 3:1 4:2 5:2 6:3 7:3     <-- dcbx negotiated up2tc map

With your example application does setsockopt(SO_PRIORITY, 2):
	according to egress map : insert PCP 2
	according to mqprio	: tx ring belonging to UP2 is selected
	accroding to dcbnl	: traffic will have ETS attributes of TC1

The point is either you use the skb->priority to PCP map and then a 1:1
mqprio map or you use the mqprio map directly. Agree?

One more example translating the case with these patches onto a case
without these patches as I understand them.

first with these patches mapping:

up2tc     : 0:0 1:0 2:1 3:1 4:2 5:2 6:3 7:3     <-- dcbnl up2tc map 
egress map: 0:0 1:0 2:1 3:1 4:2 5:2 6:3 7:3     <-- sets pcp bits
mqprio    : 0:0 1:1 2:2 3:3 4:4 5:5 6:6 7:7     <-- with PGROUP_UP set

equivalent mapping without PGROUP_UP set

up2tc     : 0:0 1:0 2:1 3:1 4:2 5:2 6:3 7:3     <-- dcbnl negotiated up2tc map 
egress map: 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:0     <-- default unset egress map
mqprio    : 0:0 1:0 2:1 3:1 4:2 5:2 6:3 7:3     <-- with PGROUP_UP set

Application sets skb->priority to 2, in the first case egress map
sets the PCP to 1 and mqprio maps it to queues for UP1 based on PCP bits.

In case two egress map does nothing but skb->priority is still 2 so the
mqprio maps these to queues associated with UP1 so everything works
still.

I apologize if I'm missing something here but it seems correct to me. Spell
it out for me if I am still wrong.

> 
>>
>> We may need to fixup userspace tools some but I think the kernel mechanisms
>> are in place.
>>
>> BTW I did think about this while implementing the existing code and decided
>> that rather than create more if/else branches the case you are describing
>> could be handled by independently controlling the priority to queue set mappings
>> in mqprio and the egress map.
>>
>> Feel free to let me know where I went wrong or why this doesn't work on
>> your hardware. I agree though we may need to fixup lldpad and maybe even
>> 'tc' some to get this to look correct in user space and get automagically
>> setup correctly.
>>
>> Thanks,
>> .John
> 
> I think I understand your stand, that mqprio mapping should be generic
> and should not be TC nor UP oriented.
> 

Right. Also I want to avoid user space having to somehow "know" which
mode to put the qdisc in. Checking if the hardware is a mellanox card
does not scale.

> The problem is that our HW needs the queues to be selected by UP. ETS
> TC mapping is configured before traffic starts, and therefore ETS
> attributes are enforced by HW according to the UP, which the tx ring
> belongs to. Same thing for vlan tag in tagged traffic. 8021q PCP is
> placed by HW according to the tx ring.

OK. Can you agree though in the case where we restrict the egress_map
to be the same for all vlan's on a real_dev the existing mqprio map
_can_ work?

> 
> For backward compatibility, tagged traffic should be steered to a tx
> ring by UP taken from egress map - skb_tx_hash() as it is today, can't
> do it. Even if we implement ndo_select_queue(), we will need to
> duplicate most of the code from skb_tx_hash(), because there are more
> than one tx ring per UP, and we need skb_tx_hash() to be able to select
> a ring from a range of rings belonging to a UP.

agreed implementing this in select_queue() is no good.

> 
> Also, there are some configurations that can't be done by mqprio and
> egress map. For example when having two vlans with different egress
> maps.

This is a good example thanks. This currently doesn't work for hardware
that maps tx_rings to traffic classes either. So if we need/want to
solve this case then I guess we need something like your patches. Note
I think the patch you submitted should work regardless if you map
the PCP bits to UP queues or PCP bits to TC queues. We could use this
in both cases which helps my user space concerns.

> 
> And in the conceptual level, I think that kernel should not accept bad
> configurations and rely on user space to protect it.

I disagree policy should be managed in user space. Also I see no reason
to create a strong coupling between egress maps and mqprio maps. I can
imagine a case where they could be used independently. DCB is just one
usage model for mqprio. Using a bit like you did seems fine though.

> 
> - Amir

Assuming we want the multiple different egress map case to work I guess
we will have to do something like this. At least right now I can't think
of anything better but I'll think on it tomorrow some more.

The other thought would be to provide a qdisc hook before queue selection
to attach 'tc filters' and provide a new action to hash a skb across
queue sets.

Thanks,
John

^ permalink raw reply

* Re: [PATCH] net: update the usage of CHECKSUM_UNNECESSARY
From: Michael S. Tsirkin @ 2012-05-09  6:03 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: Yi Zou, netdev, devel, jeffrey.t.kirsher
In-Reply-To: <1336501258.2749.3.camel@bwh-desktop.uk.solarflarecom.com>

On Tue, May 08, 2012 at 07:20:58PM +0100, Ben Hutchings wrote:
> On Tue, 2012-05-08 at 20:48 +0300, Michael S. Tsirkin wrote:
> > On Mon, Mar 19, 2012 at 02:12:41PM -0700, Yi Zou wrote:
> > > As suggested by Ben, this adds the clarification on the usage of
> > > CHECKSUM_UNNECESSARY on the outgoing patch. Also add the usage
> > > description of NETIF_F_FCOE_CRC and CHECKSUM_UNNECESSARY
> > > for the kernel FCoE protocol driver.
> > > 
> > > This is a follow-up to the following:
> > > http://patchwork.ozlabs.org/patch/147315/
> > > 
> > > Signed-off-by: Yi Zou <yi.zou@intel.com>
> > > Cc: Ben Hutchings <bhutchings@solarflare.com>
> > > Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> > > Cc: www.Open-FCoE.org <devel@open-fcoe.org>
> > > ---
> > > 
> > >  include/linux/skbuff.h |    7 +++++++
> > >  1 files changed, 7 insertions(+), 0 deletions(-)
> > > 
> > > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> > > index 8dc8257..a2b9953 100644
> > > --- a/include/linux/skbuff.h
> > > +++ b/include/linux/skbuff.h
> > > @@ -94,6 +94,13 @@
> > >   *			  about CHECKSUM_UNNECESSARY. 8)
> > >   *	NETIF_F_IPV6_CSUM about as dumb as the last one but does IPv6 instead.
> > >   *
> > > + *	UNNECESSARY: device will do per protocol specific csum. Protocol drivers
> > > + *	that do not want net to perform the checksum calculation should use
> > > + *	this flag in their outgoing skbs.
> > > + *	NETIF_F_FCOE_CRC  this indicates the device can do FCoE FC CRC
> > > + *			  offload. Correspondingly, the FCoE protocol driver
> > > + *			  stack should use CHECKSUM_UNNECESSARY.
> > > + *
> > >   *	Any questions? No questions, good. 		--ANK
> > >   */
> > >  
> > 
> > So just to make sure I understand, you never get
> > UNNECESSARY packets on tx unless you declared NETIF_F_FCOE_CRC?
> > 
> > Maybe the comment says this somehow but could not figure it out.
> 
> That's what should happen now.  In future CHECKSUM_UNNECESSARY could be
> used on output by other protocols which don't use TCP/IP-style
> checksums, but always dependent on the output device supporting the
> relevant offload feature.
> 
> Ben.

Isn't there another case: a device passes UNNECESSARY in on rx,
and the skb is forwarded to another device?
For example it is handled this way by tun, giving a nice
performance boost for VMs, see 10a8d94a95742bb15b4e617ee9884bb4381362be

> -- 
> Ben Hutchings, Staff Engineer, Solarflare
> Not speaking for my employer; that's the marketing department's job.
> They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* RE: [PATCH net-next 3/4] netxen: added miniDIMM support in driver.
From: Rajesh Borundia @ 2012-05-09  5:59 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Ameen Rahman, Sony Chacko, Sucheta Chakraborty
In-Reply-To: <20120508.125311.1669426371305176333.davem@davemloft.net>


> -----Original Message-----
> From: David Miller [mailto:davem@davemloft.net]
> Sent: Tuesday, May 08, 2012 10:23 PM
> To: Rajesh Borundia
> Cc: netdev; Ameen Rahman; Sony Chacko; Sucheta Chakraborty
> Subject: Re: [PATCH net-next 3/4] netxen: added miniDIMM support in
> driver.
> 
> From: Rajesh Borundia <rajesh.borundia@qlogic.com>
> Date: Tue, 8 May 2012 08:01:27 -0700
> 
> > +static struct bin_attribute bin_attr_dimm = {
> > +	.attr = {.name = "dimm", .mode = (S_IRUGO | S_IWUSR)},
> 
> How about a space next to the braces?
> 

I will resend the patches with review changes.

Rajesh

^ permalink raw reply

* [PATCH net-next] be2net: avoid disabling sriov while VFs are assigned
From: Sathya Perla @ 2012-05-09  5:41 UTC (permalink / raw)
  To: netdev; +Cc: Sathya Perla

Calling pci_disable_sriov() while VFs are assigned to VMs causes
kernel panic. This patch uses PCI_DEV_FLAGS_ASSIGNED bit state of the
VF's pci_dev to avoid this. Also, the unconditional function reset cmd
issued on a PF probe can delete the VF configuration for the
previously enabled VFs. A scratchpad register is now used to issue a
function reset only when needed (i.e., in a crash dump scenario.)

Signed-off-by: Sathya Perla <sathya.perla@emulex.com>
---
 drivers/net/ethernet/emulex/benet/be.h      |   22 ++--
 drivers/net/ethernet/emulex/benet/be_hw.h   |    2 +
 drivers/net/ethernet/emulex/benet/be_main.c |  199 ++++++++++++++++-----------
 3 files changed, 134 insertions(+), 89 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h
index c3ee910..ecf1a81 100644
--- a/drivers/net/ethernet/emulex/benet/be.h
+++ b/drivers/net/ethernet/emulex/benet/be.h
@@ -313,6 +313,11 @@ struct be_vf_cfg {
 	u32 tx_rate;
 };
 
+enum vf_state {
+	ENABLED = 0,
+	ASSIGNED = 1
+};
+
 #define BE_FLAGS_LINK_STATUS_INIT		1
 #define BE_FLAGS_WORKER_SCHEDULED		(1 << 3)
 #define BE_UC_PMAC_COUNT		30
@@ -403,8 +408,9 @@ struct be_adapter {
 	u32 flash_status;
 	struct completion flash_compl;
 
-	u32 num_vfs;
-	u8 is_virtfn;
+	u32 num_vfs;		/* Number of VFs provisioned by PF driver */
+	u32 dev_num_vfs;	/* Number of VFs supported by HW */
+	u8 virtfn;
 	struct be_vf_cfg *vf_cfg;
 	bool be3_native;
 	u32 sli_family;
@@ -417,8 +423,10 @@ struct be_adapter {
 	u32 uc_macs;		/* Count of secondary UC MAC programmed */
 };
 
-#define be_physfn(adapter) (!adapter->is_virtfn)
+#define be_physfn(adapter)		(!adapter->virtfn)
 #define	sriov_enabled(adapter)		(adapter->num_vfs > 0)
+#define	sriov_want(adapter)		(adapter->dev_num_vfs && num_vfs && \
+					 be_physfn(adapter))
 #define for_all_vfs(adapter, vf_cfg, i)					\
 	for (i = 0, vf_cfg = &adapter->vf_cfg[i]; i < adapter->num_vfs;	\
 		i++, vf_cfg++)
@@ -547,14 +555,6 @@ static inline u8 is_udp_pkt(struct sk_buff *skb)
 	return val;
 }
 
-static inline void be_check_sriov_fn_type(struct be_adapter *adapter)
-{
-	u32 sli_intf;
-
-	pci_read_config_dword(adapter->pdev, SLI_INTF_REG_OFFSET, &sli_intf);
-	adapter->is_virtfn = (sli_intf & SLI_INTF_FT_MASK) ? 1 : 0;
-}
-
 static inline void be_vf_eth_addr_generate(struct be_adapter *adapter, u8 *mac)
 {
 	u32 addr;
diff --git a/drivers/net/ethernet/emulex/benet/be_hw.h b/drivers/net/ethernet/emulex/benet/be_hw.h
index 0949aa6..f38b58c 100644
--- a/drivers/net/ethernet/emulex/benet/be_hw.h
+++ b/drivers/net/ethernet/emulex/benet/be_hw.h
@@ -58,6 +58,8 @@
 
 #define SLI_PORT_CONTROL_IP_MASK	0x08000000
 
+#define PCICFG_CUST_SCRATCHPAD_CSR	0x1EC
+
 /********* Memory BAR register ************/
 #define PCICFG_MEMBAR_CTRL_INT_CTRL_OFFSET 	0xfc
 /* Host Interrupt Enable, if set interrupts are enabled although "PCI Interrupt
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index 6d5d30b..a01f734 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -1049,6 +1049,29 @@ static int be_set_vf_tx_rate(struct net_device *netdev,
 	return status;
 }
 
+static int be_find_vfs(struct be_adapter *adapter, int vf_state)
+{
+	struct pci_dev *dev, *pdev = adapter->pdev;
+	int vfs = 0, assigned_vfs = 0, pos, vf_fn;
+	u16 offset, stride;
+
+	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV);
+	pci_read_config_word(pdev, pos + PCI_SRIOV_VF_OFFSET, &offset);
+	pci_read_config_word(pdev, pos + PCI_SRIOV_VF_STRIDE, &stride);
+
+	dev = pci_get_device(pdev->vendor, PCI_ANY_ID, NULL);
+	while (dev) {
+		vf_fn = (pdev->devfn + offset + stride * vfs) & 0xFFFF;
+		if (dev->is_virtfn && dev->devfn == vf_fn) {
+			vfs++;
+			if (dev->dev_flags & PCI_DEV_FLAGS_ASSIGNED)
+				assigned_vfs++;
+		}
+		dev = pci_get_device(pdev->vendor, PCI_ANY_ID, dev);
+	}
+	return (vf_state == ASSIGNED) ? assigned_vfs : vfs;
+}
+
 static void be_eqd_update(struct be_adapter *adapter, struct be_eq_obj *eqo)
 {
 	struct be_rx_stats *stats = rx_stats(&adapter->rx_obj[eqo->idx]);
@@ -1789,9 +1812,9 @@ static void be_tx_queues_destroy(struct be_adapter *adapter)
 
 static int be_num_txqs_want(struct be_adapter *adapter)
 {
-	if (sriov_enabled(adapter) || be_is_mc(adapter) ||
-		lancer_chip(adapter) || !be_physfn(adapter) ||
-		adapter->generation == BE_GEN2)
+	if (sriov_want(adapter) || be_is_mc(adapter) ||
+	    lancer_chip(adapter) || !be_physfn(adapter) ||
+	    adapter->generation == BE_GEN2)
 		return 1;
 	else
 		return MAX_TX_QS;
@@ -2118,7 +2141,7 @@ static void be_msix_disable(struct be_adapter *adapter)
 static uint be_num_rss_want(struct be_adapter *adapter)
 {
 	if ((adapter->function_caps & BE_FUNCTION_CAPS_RSS) &&
-	     adapter->num_vfs == 0 && be_physfn(adapter) &&
+	     !sriov_want(adapter) && be_physfn(adapter) &&
 	     !be_is_mc(adapter))
 		return (adapter->be3_native) ? BE3_MAX_RSS_QS : BE2_MAX_RSS_QS;
 	else
@@ -2152,53 +2175,6 @@ done:
 	return;
 }
 
-static int be_sriov_enable(struct be_adapter *adapter)
-{
-	be_check_sriov_fn_type(adapter);
-
-#ifdef CONFIG_PCI_IOV
-	if (be_physfn(adapter) && num_vfs) {
-		int status, pos;
-		u16 dev_vfs;
-
-		pos = pci_find_ext_capability(adapter->pdev,
-						PCI_EXT_CAP_ID_SRIOV);
-		pci_read_config_word(adapter->pdev,
-				     pos + PCI_SRIOV_TOTAL_VF, &dev_vfs);
-
-		adapter->num_vfs = min_t(u16, num_vfs, dev_vfs);
-		if (adapter->num_vfs != num_vfs)
-			dev_info(&adapter->pdev->dev,
-				 "Device supports %d VFs and not %d\n",
-				 adapter->num_vfs, num_vfs);
-
-		status = pci_enable_sriov(adapter->pdev, adapter->num_vfs);
-		if (status)
-			adapter->num_vfs = 0;
-
-		if (adapter->num_vfs) {
-			adapter->vf_cfg = kcalloc(num_vfs,
-						sizeof(struct be_vf_cfg),
-						GFP_KERNEL);
-			if (!adapter->vf_cfg)
-				return -ENOMEM;
-		}
-	}
-#endif
-	return 0;
-}
-
-static void be_sriov_disable(struct be_adapter *adapter)
-{
-#ifdef CONFIG_PCI_IOV
-	if (sriov_enabled(adapter)) {
-		pci_disable_sriov(adapter->pdev);
-		kfree(adapter->vf_cfg);
-		adapter->num_vfs = 0;
-	}
-#endif
-}
-
 static inline int be_msix_vec_get(struct be_adapter *adapter,
 				struct be_eq_obj *eqo)
 {
@@ -2500,6 +2476,11 @@ static void be_vf_clear(struct be_adapter *adapter)
 	struct be_vf_cfg *vf_cfg;
 	u32 vf;
 
+	if (be_find_vfs(adapter, ASSIGNED)) {
+		dev_warn(&adapter->pdev->dev, "VFs are assigned to VMs\n");
+		goto done;
+	}
+
 	for_all_vfs(adapter, vf_cfg, vf) {
 		if (lancer_chip(adapter))
 			be_cmd_set_mac_list(adapter, NULL, 0, vf + 1);
@@ -2509,6 +2490,10 @@ static void be_vf_clear(struct be_adapter *adapter)
 
 		be_cmd_if_destroy(adapter, vf_cfg->if_handle, vf + 1);
 	}
+	pci_disable_sriov(adapter->pdev);
+done:
+	kfree(adapter->vf_cfg);
+	adapter->num_vfs = 0;
 }
 
 static int be_clear(struct be_adapter *adapter)
@@ -2538,29 +2523,60 @@ static int be_clear(struct be_adapter *adapter)
 	be_cmd_fw_clean(adapter);
 
 	be_msix_disable(adapter);
-	kfree(adapter->pmac_id);
+	pci_write_config_dword(adapter->pdev, PCICFG_CUST_SCRATCHPAD_CSR, 0);
 	return 0;
 }
 
-static void be_vf_setup_init(struct be_adapter *adapter)
+static int be_vf_setup_init(struct be_adapter *adapter)
 {
 	struct be_vf_cfg *vf_cfg;
 	int vf;
 
+	adapter->vf_cfg = kcalloc(adapter->num_vfs, sizeof(*vf_cfg),
+				  GFP_KERNEL);
+	if (!adapter->vf_cfg)
+		return -ENOMEM;
+
 	for_all_vfs(adapter, vf_cfg, vf) {
 		vf_cfg->if_handle = -1;
 		vf_cfg->pmac_id = -1;
 	}
+	return 0;
 }
 
 static int be_vf_setup(struct be_adapter *adapter)
 {
 	struct be_vf_cfg *vf_cfg;
+	struct device *dev = &adapter->pdev->dev;
 	u32 cap_flags, en_flags, vf;
 	u16 def_vlan, lnk_speed;
-	int status;
+	int status, enabled_vfs;
+
+	enabled_vfs = be_find_vfs(adapter, ENABLED);
+	if (enabled_vfs) {
+		dev_warn(dev, "%d VFs are already enabled\n", enabled_vfs);
+		dev_warn(dev, "Ignoring num_vfs=%d setting\n", num_vfs);
+		return 0;
+	}
 
-	be_vf_setup_init(adapter);
+	if (num_vfs > adapter->dev_num_vfs) {
+		dev_warn(dev, "Device supports %d VFs and not %d\n",
+			 adapter->dev_num_vfs, num_vfs);
+		num_vfs = adapter->dev_num_vfs;
+	}
+
+	status = pci_enable_sriov(adapter->pdev, num_vfs);
+	if (!status) {
+		adapter->num_vfs = num_vfs;
+	} else {
+		/* Platform doesn't support SRIOV though device supports it */
+		dev_warn(dev, "SRIOV enable failed\n");
+		return 0;
+	}
+
+	status = be_vf_setup_init(adapter);
+	if (status)
+		goto err;
 
 	cap_flags = en_flags = BE_IF_FLAGS_UNTAGGED | BE_IF_FLAGS_BROADCAST |
 				BE_IF_FLAGS_MULTICAST;
@@ -2571,9 +2587,11 @@ static int be_vf_setup(struct be_adapter *adapter)
 			goto err;
 	}
 
-	status = be_vf_eth_addr_config(adapter);
-	if (status)
-		goto err;
+	if (!enabled_vfs) {
+		status = be_vf_eth_addr_config(adapter);
+		if (status)
+			goto err;
+	}
 
 	for_all_vfs(adapter, vf_cfg, vf) {
 		status = be_cmd_link_status_query(adapter, NULL, &lnk_speed,
@@ -2630,9 +2648,25 @@ do_none:
 	return status;
 }
 
+/* Routine to query per function resource limits */
+static int be_get_config(struct be_adapter *adapter)
+{
+	int pos;
+	u16 dev_num_vfs;
+
+	pos = pci_find_ext_capability(adapter->pdev, PCI_EXT_CAP_ID_SRIOV);
+	if (pos) {
+		pci_read_config_word(adapter->pdev, pos + PCI_SRIOV_TOTAL_VF,
+				     &dev_num_vfs);
+		adapter->dev_num_vfs = dev_num_vfs;
+	}
+	return 0;
+}
+
 static int be_setup(struct be_adapter *adapter)
 {
 	struct net_device *netdev = adapter->netdev;
+	struct device *dev = &adapter->pdev->dev;
 	u32 cap_flags, en_flags;
 	u32 tx_fc, rx_fc;
 	int status;
@@ -2640,6 +2674,8 @@ static int be_setup(struct be_adapter *adapter)
 
 	be_setup_init(adapter);
 
+	be_get_config(adapter);
+
 	be_cmd_req_native_mode(adapter);
 
 	be_msix_enable(adapter);
@@ -2718,10 +2754,11 @@ static int be_setup(struct be_adapter *adapter)
 
 	pcie_set_readrq(adapter->pdev, 4096);
 
-	if (sriov_enabled(adapter)) {
-		status = be_vf_setup(adapter);
-		if (status)
-			goto err;
+	if (be_physfn(adapter) && num_vfs) {
+		if (adapter->dev_num_vfs)
+			be_vf_setup(adapter);
+		else
+			dev_warn(dev, "device doesn't support SRIOV\n");
 	}
 
 	be_cmd_get_phy_info(adapter);
@@ -2731,6 +2768,7 @@ static int be_setup(struct be_adapter *adapter)
 	schedule_delayed_work(&adapter->work, msecs_to_jiffies(1000));
 	adapter->flags |= BE_FLAGS_WORKER_SCHEDULED;
 
+	pci_write_config_dword(adapter->pdev, PCICFG_CUST_SCRATCHPAD_CSR, 1);
 	return 0;
 err:
 	be_clear(adapter);
@@ -3352,8 +3390,6 @@ static void __devexit be_remove(struct pci_dev *pdev)
 
 	be_ctrl_cleanup(adapter);
 
-	be_sriov_disable(adapter);
-
 	pci_set_drvdata(pdev, NULL);
 	pci_release_regions(pdev);
 	pci_disable_device(pdev);
@@ -3367,7 +3403,7 @@ bool be_is_wol_supported(struct be_adapter *adapter)
 		!be_is_wol_excluded(adapter)) ? true : false;
 }
 
-static int be_get_config(struct be_adapter *adapter)
+static int be_get_initial_config(struct be_adapter *adapter)
 {
 	int status;
 
@@ -3410,7 +3446,7 @@ static int be_get_config(struct be_adapter *adapter)
 	return 0;
 }
 
-static int be_dev_family_check(struct be_adapter *adapter)
+static int be_dev_type_check(struct be_adapter *adapter)
 {
 	struct pci_dev *pdev = adapter->pdev;
 	u32 sli_intf = 0, if_type;
@@ -3443,6 +3479,9 @@ static int be_dev_family_check(struct be_adapter *adapter)
 	default:
 		adapter->generation = 0;
 	}
+
+	pci_read_config_dword(adapter->pdev, SLI_INTF_REG_OFFSET, &sli_intf);
+	adapter->virtfn = (sli_intf & SLI_INTF_FT_MASK) ? 1 : 0;
 	return 0;
 }
 
@@ -3586,6 +3625,14 @@ reschedule:
 	schedule_delayed_work(&adapter->work, msecs_to_jiffies(1000));
 }
 
+static bool be_reset_required(struct be_adapter *adapter)
+{
+	u32 reg;
+
+	pci_read_config_dword(adapter->pdev, PCICFG_CUST_SCRATCHPAD_CSR, &reg);
+	return reg;
+}
+
 static int __devinit be_probe(struct pci_dev *pdev,
 			const struct pci_device_id *pdev_id)
 {
@@ -3611,7 +3658,7 @@ static int __devinit be_probe(struct pci_dev *pdev,
 	adapter->pdev = pdev;
 	pci_set_drvdata(pdev, adapter);
 
-	status = be_dev_family_check(adapter);
+	status = be_dev_type_check(adapter);
 	if (status)
 		goto free_netdev;
 
@@ -3629,13 +3676,9 @@ static int __devinit be_probe(struct pci_dev *pdev,
 		}
 	}
 
-	status = be_sriov_enable(adapter);
-	if (status)
-		goto free_netdev;
-
 	status = be_ctrl_init(adapter);
 	if (status)
-		goto disable_sriov;
+		goto free_netdev;
 
 	if (lancer_chip(adapter)) {
 		status = lancer_wait_ready(adapter);
@@ -3662,9 +3705,11 @@ static int __devinit be_probe(struct pci_dev *pdev,
 	if (status)
 		goto ctrl_clean;
 
-	status = be_cmd_reset_function(adapter);
-	if (status)
-		goto ctrl_clean;
+	if (be_reset_required(adapter)) {
+		status = be_cmd_reset_function(adapter);
+		if (status)
+			goto ctrl_clean;
+	}
 
 	/* The INTR bit may be set in the card when probed by a kdump kernel
 	 * after a crash.
@@ -3676,7 +3721,7 @@ static int __devinit be_probe(struct pci_dev *pdev,
 	if (status)
 		goto ctrl_clean;
 
-	status = be_get_config(adapter);
+	status = be_get_initial_config(adapter);
 	if (status)
 		goto stats_clean;
 
@@ -3705,8 +3750,6 @@ stats_clean:
 	be_stats_cleanup(adapter);
 ctrl_clean:
 	be_ctrl_cleanup(adapter);
-disable_sriov:
-	be_sriov_disable(adapter);
 free_netdev:
 	free_netdev(netdev);
 	pci_set_drvdata(pdev, NULL);
-- 
1.7.4

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox