Netdev List

Netdev List
 help / color / mirror / Atom feed

* [RFC net-next PATCH V1 7/9] net: frag queue locking per hash bucket
From: Jesper Dangaard Brouer @ 2012-11-23 13:08 UTC (permalink / raw)
  To: Eric Dumazet, David S. Miller, Florian Westphal
  Cc: Jesper Dangaard Brouer, netdev, Pablo Neira Ayuso, Thomas Graf,
	Cong Wang, Patrick McHardy, Paul E. McKenney, Herbert Xu
In-Reply-To: <20121123130749.18764.25962.stgit@dragon>

DO NOT apply - patch not finished, can cause an OOPS/PANIC during hash rebuild

This patch implements per hash bucket locking for the frag queue
hash.  This removes two write locks, and the only remaining write
lock is for protecting hash rebuild.  This essentially reduces the
readers-writer lock to a rebuild lock.

NOT-Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---

 include/net/inet_frag.h  |   10 +++++++-
 net/ipv4/inet_fragment.c |   56 +++++++++++++++++++++++++++++++++++-----------
 2 files changed, 51 insertions(+), 15 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 9938ea4..1efec6b 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -51,9 +51,15 @@ struct inet_frag_queue {
 
 #define INETFRAGS_HASHSZ		64
 
+
+struct inet_frag_bucket {
+	struct hlist_head	chain;
+	spinlock_t		chain_lock;
+};
+
 struct inet_frags {
-	struct hlist_head	hash[INETFRAGS_HASHSZ];
-	rwlock_t		lock;
+	struct inet_frag_bucket	hash[INETFRAGS_HASHSZ];
+	rwlock_t		lock; /* Rebuild lock */
 	u32			rnd;
 	int			qsize;
 	int			secret_interval;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 1620a21..447423f 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -35,20 +35,27 @@ static void inet_frag_secret_rebuild(unsigned long dummy)
 	unsigned long now = jiffies;
 	int i;
 
+	/* Per bucket lock NOT needed here, due to write lock protection */
 	write_lock(&f->lock);
+
 	get_random_bytes(&f->rnd, sizeof(u32));
 	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
+		struct inet_frag_bucket *hb;
 		struct inet_frag_queue *q;
 		struct hlist_node *p, *n;
 
-		hlist_for_each_entry_safe(q, p, n, &f->hash[i], list) {
+		hb = &f->hash[i];
+		hlist_for_each_entry_safe(q, p, n, &hb->chain, list) {
 			unsigned int hval = f->hashfn(q);
 
 			if (hval != i) {
+				struct inet_frag_bucket *hb_dest;
+
 				hlist_del(&q->list);
 
 				/* Relink to new hash chain. */
-				hlist_add_head(&q->list, &f->hash[hval]);
+				hb_dest = &f->hash[hval];
+				hlist_add_head(&q->list, &hb_dest->chain);
 			}
 		}
 	}
@@ -61,9 +68,12 @@ void inet_frags_init(struct inet_frags *f)
 {
 	int i;
 
-	for (i = 0; i < INETFRAGS_HASHSZ; i++)
-		INIT_HLIST_HEAD(&f->hash[i]);
+	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
+		struct inet_frag_bucket *hb = &f->hash[i];
 
+		spin_lock_init(&hb->chain_lock);
+		INIT_HLIST_HEAD(&hb->chain);
+	}
 	rwlock_init(&f->lock);
 
 	f->rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
@@ -102,9 +112,17 @@ EXPORT_SYMBOL(inet_frags_exit_net);
 
 static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
 {
-	write_lock(&f->lock);
+	struct inet_frag_bucket *hb;
+	unsigned int hash;
+
+	read_lock(&f->lock);
+	hash = f->hashfn(fq);
+	hb = &f->hash[hash];
+
+	spin_lock_bh(&hb->chain_lock);
 	hlist_del(&fq->list);
-	write_unlock(&f->lock);
+	spin_unlock_bh(&hb->chain_lock);
+	read_unlock(&f->lock);
 	inet_frag_lru_del(fq);
 }
 
@@ -224,28 +242,33 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
 		struct inet_frag_queue *qp_in, struct inet_frags *f,
 		void *arg)
 {
+	struct inet_frag_bucket *hb;
 	struct inet_frag_queue *qp;
 #ifdef CONFIG_SMP
 	struct hlist_node *n;
 #endif
 	unsigned int hash;
 
-	write_lock(&f->lock);
+	read_lock(&f->lock); /* Protects against hash rebuild */
 	/*
 	 * While we stayed w/o the lock other CPU could update
 	 * the rnd seed, so we need to re-calculate the hash
 	 * chain. Fortunatelly the qp_in can be used to get one.
 	 */
 	hash = f->hashfn(qp_in);
+	hb = &f->hash[hash];
+	spin_lock_bh(&hb->chain_lock);
+
 #ifdef CONFIG_SMP
 	/* With SMP race we have to recheck hash table, because
 	 * such entry could be created on other cpu, while we
-	 * promoted read lock to write lock.
+	 * promoted read lock to write lock. ***Comment FIXME***
 	 */
-	hlist_for_each_entry(qp, n, &f->hash[hash], list) {
+	hlist_for_each_entry(qp, n, &hb->chain, list) {
 		if (qp->net == nf && f->match(qp, arg)) {
 			atomic_inc(&qp->refcnt);
-			write_unlock(&f->lock);
+			spin_unlock_bh(&hb->chain_lock);
+			read_unlock(&f->lock);
 			qp_in->last_in |= INET_FRAG_COMPLETE;
 			inet_frag_put(qp_in, f);
 			return qp;
@@ -257,8 +280,9 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
 		atomic_inc(&qp->refcnt);
 
 	atomic_inc(&qp->refcnt);
-	hlist_add_head(&qp->list, &f->hash[hash]);
-	write_unlock(&f->lock);
+	hlist_add_head(&qp->list, &hb->chain);
+	spin_unlock_bh(&hb->chain_lock);
+	read_unlock(&f->lock);
 	inet_frag_lru_add(nf, qp);
 	return qp;
 }
@@ -307,16 +331,22 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
 		struct inet_frags *f, void *key, unsigned int hash)
 	__releases(&f->lock)
 {
+	struct inet_frag_bucket *hb;
 	struct inet_frag_queue *q;
 	struct hlist_node *n;
 
-	hlist_for_each_entry(q, n, &f->hash[hash], list) {
+	hb = &f->hash[hash];
+
+	spin_lock_bh(&hb->chain_lock);
+	hlist_for_each_entry(q, n, &hb->chain, list) {
 		if (q->net == nf && f->match(q, key)) {
 			atomic_inc(&q->refcnt);
+			spin_unlock_bh(&hb->chain_lock);
 			read_unlock(&f->lock);
 			return q;
 		}
 	}
+	spin_unlock_bh(&hb->chain_lock);
 	read_unlock(&f->lock);
 
 	return inet_frag_create(nf, f, key);

^ permalink raw reply related

* Re: [PATCH 1/5] smsc95xx: fix error checking of usbnet_resume
From: Steve Glendinning @ 2012-11-23 12:55 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20121122.153203.449492215100737957.davem@davemloft.net>

On 22 November 2012 20:32, David Miller <davem@davemloft.net> wrote:
>
> I only see patches #1 and #2, where are the other 3?

That's odd, they seem to have all made it to patchwork so they must have been sent:

http://patchwork.ozlabs.org/project/netdev/list/

Spam filtering problem maybe?  Let me know if you need me to re-send.

^ permalink raw reply

* Re: [PATCH][RESEND] bonding: delete migrated IP addresses from the rlb hash table
From: Jiri Bohac @ 2012-11-23 12:44 UTC (permalink / raw)
  To: Jay Vosburgh; +Cc: Jiri Bohac, Andy Gospodarek, netdev
In-Reply-To: <2699.1340319919@death.nxdomain>

Hi, 

This is another resend of the patch discussed in June. The only
changes over the previous version are improved comments.

Bonding with balance_rlb keeps poisoning other machines' ARP
caches and I think we need to fix this.

On Thu, Jun 21, 2012 at 04:05:19PM -0700, Jay Vosburgh wrote:
> Jiri Bohac <jbohac@suse.cz> wrote:
> 
> >Hi, this is a resend of the patch discussed here:
> >	http://thread.gmane.org/gmane.linux.network/228076
> >It has been updated to apply to the lastest net-next.
> [...]
> >The hash table is hashed by ip_dst. To be able to do the above
> >check efficiently (not walking the whole hash table), we need a
> >reverse mapping (by ip_src).
> 
> 	Just a note that I'm doing some testing with this patch.  Seems
> to be ok for the "direct" case (wherein the IP in question is assigned
> to the local system); I haven't tried the "bridge" case yet.  I've
> extended some of the debugfs stuff to dump the new information, and I'm
> trying some of the corner cases (e.g., breaking the linkages in the
> middle) to see if it all hangs together.

Were there any results of your testing?  Good or bad?

> 	I am thinking that the layout of the "hash"-ish table is now
> sufficiently complicated that there should be a comment block somewhere
> describing what's going on (because I didn't really quite get it until I
> dumped the whole thing and looked at it).  With this patch, there is one
> "used" linkage for all of the elements in use, plus some number of "src"
> linkages, one for each active source hash.  The "src" linkages are also
> notable in that they are separate from the "assigned" state.

I updated the comments in drivers/net/bonding/bond_alb.h to
describe the structure.

> >+	 * have a dirrerent mac_src.
> 
> 	Typo here; should be "different."

Fixed. 
Any chance we could finally get this merged?:






Bonding in balance-alb mode records information from ARP packets
passing through the bond in a hash table (rx_hashtbl).

In certain situations (e.g. link change of a slave),
rlb_update_rx_clients() will send out ARP packets to update ARP
caches of other hosts on the network to achieve RX load
balancing.

The problem is that once an IP address is recorded in the hash
table, it stays there indefinitely. If this IP address is
migrated to a different host in the network, bonding still sends
out ARP packets that poison other systems' ARP caches with
invalid information.

This patch solves this by looking at all incoming ARP packets,
and checking if the source IP address is one of the source
addresses stored in the rx_hashtbl. If it is, but the MAC
addresses differ, the corresponding hash table entries are
removed. Thus, when an IP address is migrated, the first ARP
broadcast by its new owner will purge the offending entries of
rx_hashtbl.

The hash table is hashed by ip_dst. To be able to do the above
check efficiently (not walking the whole hash table), we need a
reverse mapping (by ip_src).

I added three new members in struct rlb_client_info:
   rx_hashtbl[x].src_first will point to the start of a list of
      entries for which hash(ip_src) == x.
   The list is linked with src_next and src_prev.

When an incoming ARP packet arrives at rlb_arp_recv()
rlb_purge_src_ip() can quickly walk only the entries on the
corresponding lists, i.e. the entries that are likely to contain
the offending IP address.

To avoid confusion, I renamed these existing fields of struct 
rlb_client_info:
	next -> used_next
	prev -> used_prev
	rx_hashtbl_head -> rx_hashtbl_used_head

(The current linked list is _not_ a list of hash table
entries with colliding ip_dst. It's a list of entries that are
being used; its purpose is to avoid walking the whole hash table
when looking for used entries.)

Signed-off-by: Jiri Bohac <jbohac@suse.cz>

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index e15cc11..8505a24 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -84,6 +84,9 @@ static inline struct arp_pkt *arp_pkt(const struct sk_buff *skb)
 
 /* Forward declaration */
 static void alb_send_learning_packets(struct slave *slave, u8 mac_addr[]);
+static void rlb_purge_src_ip(struct bonding *bond, struct arp_pkt *arp);
+static void rlb_src_unlink(struct bonding *bond, u32 index);
+static void rlb_src_link(struct bonding *bond, u32 ip_src_hash, u32 ip_dst_hash);
 
 static inline u8 _simple_hash(const u8 *hash_start, int hash_size)
 {
@@ -354,6 +357,17 @@ static int rlb_arp_recv(const struct sk_buff *skb, struct bonding *bond,
 	if (!arp)
 		goto out;
 
+	/* We received an ARP from arp->ip_src.
+	 * We might have used this IP address previously (on the bonding host
+	 * itself or on a system that is bridged together with the bond).
+	 * However, if arp->mac_src is different than what is stored in
+	 * rx_hashtbl, some other host is now using the IP and we must prevent
+	 * sending out client updates with this IP address and the old MAC address.
+	 * Clean up all hash table entries that have this address as ip_src but
+	 * have a different mac_src.
+	 */
+	rlb_purge_src_ip(bond, arp);
+
 	if (arp->op_code == htons(ARPOP_REPLY)) {
 		/* update rx hash table for this ARP */
 		rlb_update_entry_from_arp(bond, arp);
@@ -432,9 +446,9 @@ static void rlb_clear_slave(struct bonding *bond, struct slave *slave)
 	_lock_rx_hashtbl_bh(bond);
 
 	rx_hash_table = bond_info->rx_hashtbl;
-	index = bond_info->rx_hashtbl_head;
+	index = bond_info->rx_hashtbl_used_head;
 	for (; index != RLB_NULL_INDEX; index = next_index) {
-		next_index = rx_hash_table[index].next;
+		next_index = rx_hash_table[index].used_next;
 		if (rx_hash_table[index].slave == slave) {
 			struct slave *assigned_slave = rlb_next_rx_slave(bond);
 
@@ -519,8 +533,8 @@ static void rlb_update_rx_clients(struct bonding *bond)
 
 	_lock_rx_hashtbl_bh(bond);
 
-	hash_index = bond_info->rx_hashtbl_head;
-	for (; hash_index != RLB_NULL_INDEX; hash_index = client_info->next) {
+	hash_index = bond_info->rx_hashtbl_used_head;
+	for (; hash_index != RLB_NULL_INDEX; hash_index = client_info->used_next) {
 		client_info = &(bond_info->rx_hashtbl[hash_index]);
 		if (client_info->ntt) {
 			rlb_update_client(client_info);
@@ -548,8 +562,8 @@ static void rlb_req_update_slave_clients(struct bonding *bond, struct slave *sla
 
 	_lock_rx_hashtbl_bh(bond);
 
-	hash_index = bond_info->rx_hashtbl_head;
-	for (; hash_index != RLB_NULL_INDEX; hash_index = client_info->next) {
+	hash_index = bond_info->rx_hashtbl_used_head;
+	for (; hash_index != RLB_NULL_INDEX; hash_index = client_info->used_next) {
 		client_info = &(bond_info->rx_hashtbl[hash_index]);
 
 		if ((client_info->slave == slave) &&
@@ -578,8 +592,8 @@ static void rlb_req_update_subnet_clients(struct bonding *bond, __be32 src_ip)
 
 	_lock_rx_hashtbl(bond);
 
-	hash_index = bond_info->rx_hashtbl_head;
-	for (; hash_index != RLB_NULL_INDEX; hash_index = client_info->next) {
+	hash_index = bond_info->rx_hashtbl_used_head;
+	for (; hash_index != RLB_NULL_INDEX; hash_index = client_info->used_next) {
 		client_info = &(bond_info->rx_hashtbl[hash_index]);
 
 		if (!client_info->slave) {
@@ -625,6 +639,7 @@ static struct slave *rlb_choose_channel(struct sk_buff *skb, struct bonding *bon
 				/* update mac address from arp */
 				memcpy(client_info->mac_dst, arp->mac_dst, ETH_ALEN);
 			}
+			memcpy(client_info->mac_src, arp->mac_src, ETH_ALEN);
 
 			assigned_slave = client_info->slave;
 			if (assigned_slave) {
@@ -647,6 +662,13 @@ static struct slave *rlb_choose_channel(struct sk_buff *skb, struct bonding *bon
 	assigned_slave = rlb_next_rx_slave(bond);
 
 	if (assigned_slave) {
+		if (!(client_info->assigned && client_info->ip_src == arp->ip_src)) {
+			/* ip_src is going to be updated, fix the src hash list */
+			u32 hash_src = _simple_hash((u8 *)&arp->ip_src, sizeof(arp->ip_src));
+			rlb_src_unlink(bond, hash_index);
+			rlb_src_link(bond, hash_src, hash_index);
+		}
+
 		client_info->ip_src = arp->ip_src;
 		client_info->ip_dst = arp->ip_dst;
 		/* arp->mac_dst is broadcast for arp reqeusts.
@@ -654,6 +676,7 @@ static struct slave *rlb_choose_channel(struct sk_buff *skb, struct bonding *bon
 		 * upon receiving an arp reply.
 		 */
 		memcpy(client_info->mac_dst, arp->mac_dst, ETH_ALEN);
+		memcpy(client_info->mac_src, arp->mac_src, ETH_ALEN);
 		client_info->slave = assigned_slave;
 
 		if (!ether_addr_equal_64bits(client_info->mac_dst, mac_bcast)) {
@@ -669,11 +692,11 @@ static struct slave *rlb_choose_channel(struct sk_buff *skb, struct bonding *bon
 		}
 
 		if (!client_info->assigned) {
-			u32 prev_tbl_head = bond_info->rx_hashtbl_head;
-			bond_info->rx_hashtbl_head = hash_index;
-			client_info->next = prev_tbl_head;
+			u32 prev_tbl_head = bond_info->rx_hashtbl_used_head;
+			bond_info->rx_hashtbl_used_head = hash_index;
+			client_info->used_next = prev_tbl_head;
 			if (prev_tbl_head != RLB_NULL_INDEX) {
-				bond_info->rx_hashtbl[prev_tbl_head].prev =
+				bond_info->rx_hashtbl[prev_tbl_head].used_prev =
 					hash_index;
 			}
 			client_info->assigned = 1;
@@ -740,8 +763,8 @@ static void rlb_rebalance(struct bonding *bond)
 	_lock_rx_hashtbl_bh(bond);
 
 	ntt = 0;
-	hash_index = bond_info->rx_hashtbl_head;
-	for (; hash_index != RLB_NULL_INDEX; hash_index = client_info->next) {
+	hash_index = bond_info->rx_hashtbl_used_head;
+	for (; hash_index != RLB_NULL_INDEX; hash_index = client_info->used_next) {
 		client_info = &(bond_info->rx_hashtbl[hash_index]);
 		assigned_slave = rlb_next_rx_slave(bond);
 		if (assigned_slave && (client_info->slave != assigned_slave)) {
@@ -759,11 +782,113 @@ static void rlb_rebalance(struct bonding *bond)
 }
 
 /* Caller must hold rx_hashtbl lock */
+static void rlb_init_table_entry_dst(struct rlb_client_info *entry)
+{
+	entry->used_next = RLB_NULL_INDEX;
+	entry->used_prev = RLB_NULL_INDEX;
+	entry->assigned = 0;
+	entry->slave = NULL;
+	entry->tag = 0;
+}
+static void rlb_init_table_entry_src(struct rlb_client_info *entry)
+{
+	entry->src_first = RLB_NULL_INDEX;
+	entry->src_prev = RLB_NULL_INDEX;
+	entry->src_next = RLB_NULL_INDEX;
+}
+
 static void rlb_init_table_entry(struct rlb_client_info *entry)
 {
 	memset(entry, 0, sizeof(struct rlb_client_info));
-	entry->next = RLB_NULL_INDEX;
-	entry->prev = RLB_NULL_INDEX;
+	rlb_init_table_entry_dst(entry);
+	rlb_init_table_entry_src(entry);
+}
+
+static void rlb_delete_table_entry_dst(struct bonding *bond, u32 index)
+{
+	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
+	u32 next_index = bond_info->rx_hashtbl[index].used_next;
+	u32 prev_index = bond_info->rx_hashtbl[index].used_prev;
+
+	if (index == bond_info->rx_hashtbl_used_head)
+		bond_info->rx_hashtbl_used_head = next_index;
+	if (prev_index != RLB_NULL_INDEX)
+		bond_info->rx_hashtbl[prev_index].used_next = next_index;
+	if (next_index != RLB_NULL_INDEX)
+		bond_info->rx_hashtbl[next_index].used_prev = prev_index;
+}
+
+/* unlink a rlb hash table entry from the src list */
+static void rlb_src_unlink(struct bonding *bond, u32 index)
+{
+	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
+	u32 next_index = bond_info->rx_hashtbl[index].src_next;
+	u32 prev_index = bond_info->rx_hashtbl[index].src_prev;
+
+	bond_info->rx_hashtbl[index].src_next = RLB_NULL_INDEX;
+	bond_info->rx_hashtbl[index].src_prev = RLB_NULL_INDEX;
+
+	if (next_index != RLB_NULL_INDEX)
+		bond_info->rx_hashtbl[next_index].src_prev = prev_index;
+
+	if (prev_index == RLB_NULL_INDEX)
+		return;
+
+	/* is prev_index pointing to the head of this list? */
+	if (bond_info->rx_hashtbl[prev_index].src_first == index)
+		bond_info->rx_hashtbl[prev_index].src_first = next_index;
+	else
+		bond_info->rx_hashtbl[prev_index].src_next = next_index;
+
+}
+
+static void rlb_delete_table_entry(struct bonding *bond, u32 index)
+{
+	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
+	struct rlb_client_info *entry = &(bond_info->rx_hashtbl[index]);
+
+	rlb_delete_table_entry_dst(bond, index);
+	rlb_init_table_entry_dst(entry);
+
+	rlb_src_unlink(bond, index);
+}
+
+/* add the rx_hashtbl[ip_dst_hash] entry to the list
+ * of entries with identical ip_src_hash
+ */
+static void rlb_src_link(struct bonding *bond, u32 ip_src_hash, u32 ip_dst_hash)
+{
+	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
+	u32 next;
+
+	bond_info->rx_hashtbl[ip_dst_hash].src_prev = ip_src_hash;
+	next = bond_info->rx_hashtbl[ip_src_hash].src_first;
+	bond_info->rx_hashtbl[ip_dst_hash].src_next = next;
+	if (next != RLB_NULL_INDEX)
+		bond_info->rx_hashtbl[next].src_prev = ip_dst_hash;
+	bond_info->rx_hashtbl[ip_src_hash].src_first = ip_dst_hash;
+}
+
+/* deletes all rx_hashtbl entries with  arp->ip_src if their mac_src does
+ * not match arp->mac_src */
+static void rlb_purge_src_ip(struct bonding *bond, struct arp_pkt *arp)
+{
+	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
+	u32 ip_src_hash = _simple_hash((u8*)&(arp->ip_src), sizeof(arp->ip_src));
+	u32 index;
+
+	_lock_rx_hashtbl_bh(bond);
+
+	index = bond_info->rx_hashtbl[ip_src_hash].src_first;
+	while (index != RLB_NULL_INDEX) {
+		struct rlb_client_info *entry = &(bond_info->rx_hashtbl[index]);
+		u32 next_index = entry->src_next;
+		if (entry->ip_src == arp->ip_src &&
+		    !ether_addr_equal_64bits(arp->mac_src, entry->mac_src))
+				rlb_delete_table_entry(bond, index);
+		index = next_index;
+	}
+	_unlock_rx_hashtbl_bh(bond);
 }
 
 static int rlb_initialize(struct bonding *bond)
@@ -781,7 +906,7 @@ static int rlb_initialize(struct bonding *bond)
 
 	bond_info->rx_hashtbl = new_hashtbl;
 
-	bond_info->rx_hashtbl_head = RLB_NULL_INDEX;
+	bond_info->rx_hashtbl_used_head = RLB_NULL_INDEX;
 
 	for (i = 0; i < RLB_HASH_TABLE_SIZE; i++) {
 		rlb_init_table_entry(bond_info->rx_hashtbl + i);
@@ -803,7 +928,7 @@ static void rlb_deinitialize(struct bonding *bond)
 
 	kfree(bond_info->rx_hashtbl);
 	bond_info->rx_hashtbl = NULL;
-	bond_info->rx_hashtbl_head = RLB_NULL_INDEX;
+	bond_info->rx_hashtbl_used_head = RLB_NULL_INDEX;
 
 	_unlock_rx_hashtbl_bh(bond);
 }
@@ -815,25 +940,13 @@ static void rlb_clear_vlan(struct bonding *bond, unsigned short vlan_id)
 
 	_lock_rx_hashtbl_bh(bond);
 
-	curr_index = bond_info->rx_hashtbl_head;
+	curr_index = bond_info->rx_hashtbl_used_head;
 	while (curr_index != RLB_NULL_INDEX) {
 		struct rlb_client_info *curr = &(bond_info->rx_hashtbl[curr_index]);
-		u32 next_index = bond_info->rx_hashtbl[curr_index].next;
-		u32 prev_index = bond_info->rx_hashtbl[curr_index].prev;
-
-		if (curr->tag && (curr->vlan_id == vlan_id)) {
-			if (curr_index == bond_info->rx_hashtbl_head) {
-				bond_info->rx_hashtbl_head = next_index;
-			}
-			if (prev_index != RLB_NULL_INDEX) {
-				bond_info->rx_hashtbl[prev_index].next = next_index;
-			}
-			if (next_index != RLB_NULL_INDEX) {
-				bond_info->rx_hashtbl[next_index].prev = prev_index;
-			}
+		u32 next_index = bond_info->rx_hashtbl[curr_index].used_next;
 
-			rlb_init_table_entry(curr);
-		}
+		if (curr->tag && (curr->vlan_id == vlan_id))
+			rlb_delete_table_entry(bond, curr_index);
 
 		curr_index = next_index;
 	}
diff --git a/drivers/net/bonding/bond_alb.h b/drivers/net/bonding/bond_alb.h
index 90f140a..de831ba 100644
--- a/drivers/net/bonding/bond_alb.h
+++ b/drivers/net/bonding/bond_alb.h
@@ -94,15 +94,35 @@ struct tlb_client_info {
 
 /* -------------------------------------------------------------------------
  * struct rlb_client_info contains all info related to a specific rx client
- * connection. This is the Clients Hash Table entry struct
+ * connection. This is the Clients Hash Table entry struct.
+ * Note that this is not a proper hash table; if a new client's IP address
+ * hash collides with an existing client entry, the old entry is replaced.
+ *
+ * There is a linked list (linked by the used_next and used_prev members)
+ * linking all the used entries of the hash table. This allows updating
+ * all the clients without walking over all the unused elements of the table.
+ *
+ * There are also linked lists of entries with identical hash(ip_src). These
+ * allow cleaning up the table from ip_src<->mac_src associations that have
+ * become outdated and would cause sending out invalid ARP updates to the
+ * network. These are linked by the (src_next and src_prev members).
  * -------------------------------------------------------------------------
  */
 struct rlb_client_info {
 	__be32 ip_src;		/* the server IP address */
 	__be32 ip_dst;		/* the client IP address */
+	u8  mac_src[ETH_ALEN];	/* the server MAC address */
 	u8  mac_dst[ETH_ALEN];	/* the client MAC address */
-	u32 next;		/* The next Hash table entry index */
-	u32 prev;		/* The previous Hash table entry index */
+
+	/* list of used hash table entries, starting at rx_hashtbl_used_head */
+	u32 used_next;
+	u32 used_prev;
+
+	/* ip_src based hashing */
+	u32 src_next;	/* next entry with same hash(ip_src) */
+	u32 src_prev;	/* prev entry with same hash(ip_src) */
+	u32 src_first;	/* first entry with hash(ip_src) == this entry's index */
+
 	u8  assigned;		/* checking whether this entry is assigned */
 	u8  ntt;		/* flag - need to transmit client info */
 	struct slave *slave;	/* the slave assigned to this client */
@@ -131,7 +151,7 @@ struct alb_bond_info {
 	int rlb_enabled;
 	struct rlb_client_info	*rx_hashtbl;	/* Receive hash table */
 	spinlock_t		rx_hashtbl_lock;
-	u32			rx_hashtbl_head;
+	u32			rx_hashtbl_used_head;
 	u8			rx_ntt;	/* flag - need to transmit
 					 * to all rx clients
 					 */
diff --git a/drivers/net/bonding/bond_debugfs.c b/drivers/net/bonding/bond_debugfs.c
index 2cf084e..6ac855f 100644
--- a/drivers/net/bonding/bond_debugfs.c
+++ b/drivers/net/bonding/bond_debugfs.c
@@ -31,8 +31,8 @@ static int bond_debug_rlb_hash_show(struct seq_file *m, void *v)
 
 	spin_lock_bh(&(BOND_ALB_INFO(bond).rx_hashtbl_lock));
 
-	hash_index = bond_info->rx_hashtbl_head;
-	for (; hash_index != RLB_NULL_INDEX; hash_index = client_info->next) {
+	hash_index = bond_info->rx_hashtbl_used_head;
+	for (; hash_index != RLB_NULL_INDEX; hash_index = client_info->used_next) {
 		client_info = &(bond_info->rx_hashtbl[hash_index]);
 		seq_printf(m, "%-15pI4 %-15pI4 %-17pM %s\n",
 			&client_info->ip_src,

-- 
Jiri Bohac <jbohac@suse.cz>
SUSE Labs, SUSE CZ

^ permalink raw reply related

* Re: [PATCH] 8139cp: set ring address after enabling C+ mode
From: Gilboa Davara @ 2012-11-23 12:37 UTC (permalink / raw)
  To: Francois Romieu
  Cc: Jeff Garzik, David Miller, dwmw2, jasowang, netdev, slacky,
	rggjan, Hayes Wang
In-Reply-To: <20121122213950.GA8873@electric-eye.fr.zoreil.com>

On Thu, Nov 22, 2012 at 11:39 PM, Francois Romieu <romieu@fr.zoreil.com> wrote:
> It would be nice if gilboad could give it a try (users Cced).
>

Applied it against 3.6.6.
Seems to be working just fine.

> --
> Ueimor

- Gilboa

^ permalink raw reply

* [PATCHv5] virtio-spec: virtio network device RFS support
From: Michael S. Tsirkin @ 2012-11-23 12:05 UTC (permalink / raw)
  To: Jason Wang; +Cc: rusty, virtualization, netdev, kvm

Add RFS support to virtio network device.
Add a new feature flag VIRTIO_NET_F_RFS for this feature, a new
configuration field max_virtqueue_pairs to detect supported number of
virtqueues as well as a new command VIRTIO_NET_CTRL_RFS to program
packet steering for unidirectional protocols.

---

Changes from v4:
- address Jason's comments
- have configuration specify the number of VQ pairs and not pairs - 1

Changes from v3:
- rename multiqueue -> rfs this is what we support
- Be more explicit about what driver should do.
- Simplify layout making VQs functionality depend on feature.
- Remove unused commands, only leave in programming # of queues

Changes from v2:
Address Jason's comments on v2:
- Changed STEERING_HOST to STEERING_RX_FOLLOWS_TX:
   this is both clearer and easier to support.
   It does not look like we need a separate steering command
   since host can just watch tx packets as they go.
- Moved RX and TX steering sections near each other.
- Add motivation for other changes in v2

Changes from Jason's rfc:
- reserved vq 3: this makes all rx vqs even and tx vqs odd, which
   looks nicer to me.
- documented packet steering, added a generalized steering programming
   command. Current modes are single queue and host driven multiqueue,
   but I envision support for guest driven multiqueue in the future.
- make default vqs unused when in mq mode - this wastes some memory
   but makes it more efficient to switch between modes as
   we can avoid this causing packet reordering.

Rusty, could you please take a look and comment soon?
If this looks OK to everyone, we can proceed with finalizing the
implementation. Would be nice to try and put it in 3.8.

diff --git a/virtio-spec.lyx b/virtio-spec.lyx
index d2f0da9..53ddeec 100644
--- a/virtio-spec.lyx
+++ b/virtio-spec.lyx
@@ -59,6 +59,7 @@
 \author -608949062 "Rusty Russell,,," 
 \author -385801441 "Cornelia Huck" cornelia.huck@de.ibm.com
 \author 1531152142 "Paolo Bonzini,,," 
+\author 1986246365 "Michael S. Tsirkin"
 \end_header
 
 \begin_body
@@ -4170,9 +4171,42 @@ ID 1
 \end_layout
 
 \begin_layout Description
-Virtqueues 0:receiveq.
- 1:transmitq.
- 2:controlq
+Virtqueues 0:receiveq
+\change_inserted 1986246365 1352742829
+0
+\change_unchanged
+.
+ 1:transmitq
+\change_inserted 1986246365 1352742832
+0
+\change_deleted 1986246365 1352742947
+.
+
+\change_inserted 1986246365 1352742952
+.
+ ....
+ 2N
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+
+\change_inserted 1986246365 1352743187
+N=0 if VIRTIO_NET_F_RFS is not negotiated, otherwise N is indicated by max_
+\emph on
+virtqueue_pairs control
+\emph default
+ field.
+
+\end_layout
+
+\end_inset
+
+: receiveqN.
+ 2N+1: transmitqN.
+ 2N+
+\change_unchanged
+2:controlq
 \begin_inset Foot
 status open
 
@@ -4343,6 +4377,16 @@ VIRTIO_NET_F_CTRL_VLAN
 
 \begin_layout Description
 VIRTIO_NET_F_GUEST_ANNOUNCE(21) Guest can send gratuitous packets.
+\change_inserted 1986246365 1352742767
+
+\end_layout
+
+\begin_layout Description
+
+\change_inserted 1986246365 1352742808
+VIRTIO_NET_F_RFS(22) Device supports Receive Flow Steering.
+\change_unchanged
+
 \end_layout
 
 \end_deeper
@@ -4355,11 +4399,45 @@ configuration
 \begin_inset space ~
 \end_inset
 
-layout Two configuration fields are currently defined.
+layout
+\change_deleted 1986246365 1352743300
+Two
+\change_inserted 1986246365 1352743301
+Four
+\change_unchanged
+ configuration fields are currently defined.
  The mac address field always exists (though is only valid if VIRTIO_NET_F_MAC
  is set), and the status field only exists if VIRTIO_NET_F_STATUS is set.
  Two read-only bits are currently defined for the status field: VIRTIO_NET_S_LIN
 K_UP and VIRTIO_NET_S_ANNOUNCE.
+
+\change_inserted 1986246365 1353595219
+ The following read-only field,
+\emph on
+max_virtqueue_pairs
+\emph default
+ only exists if VIRTIO_NET_F_RFS is set.
+ This field specifies the maximum number of each of transmit and receive
+ virtqueues (receiveq0..receiveq
+\emph on
+N
+\emph default
+ and transmitq0..transmitq
+\emph on
+N
+\emph default
+ respectively;
+\emph on
+N
+\emph default
+=
+\emph on
+max_virtqueue_pairs - 1
+\emph default
+) that can be configured once VIRTIO_NET_F_RFS is negotiated.
+Legal values for this field are 1 to 8000h.
+
+\change_unchanged
  
 \begin_inset listings
 inline false
@@ -4410,7 +4488,24 @@ Device Initialization
 
 \begin_layout Enumerate
 The initialization routine should identify the receive and transmission
- virtqueues.
+ virtqueues
+\change_inserted 1986246365 1352744077
+, up to N+1 of each kind
+\change_unchanged
+.
+
+\change_inserted 1986246365 1352743942
+ If VIRTIO_NET_F_RFS feature bit is negotiated, 
+\emph on
+N=max_virtqueue_pairs-1
+\emph default
+, otherwise identify 
+\emph on
+N=0
+\emph default
+.
+\change_unchanged
+
 \end_layout
 
 \begin_layout Enumerate
@@ -4455,7 +4550,11 @@ status
 \end_layout
 
 \begin_layout Enumerate
-The receive virtqueue should be filled with receive buffers.
+The receive virtqueue
+\change_inserted 1986246365 1352743953
+s
+\change_unchanged
+ should be filled with receive buffers.
  This is described in detail below in 
 \begin_inset Quotes eld
 \end_inset
@@ -4550,8 +4649,15 @@ Device Operation
 \end_layout
 
 \begin_layout Standard
-Packets are transmitted by placing them in the transmitq, and buffers for
- incoming packets are placed in the receiveq.
+Packets are transmitted by placing them in the transmitq
+\change_inserted 1986246365 1353593685
+0..transmitqN
+\change_unchanged
+, and buffers for incoming packets are placed in the receiveq
+\change_inserted 1986246365 1353593692
+0..receiveqN
+\change_unchanged
+.
  In each case, the packet itself is preceeded by a header:
 \end_layout
 
@@ -4861,6 +4967,17 @@ If VIRTIO_NET_F_MRG_RXBUF is negotiated, each buffer must be at least the
 struct virtio_net_hdr
 \family default
 .
+\change_inserted 1986246365 1353594518
+
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 1986246365 1353594638
+If VIRTIO_NET_F_RFS is negotiated, each of the receiveq0...receiveqN that will
+ be used should be populated with receive buffers.
+\change_unchanged
+
 \end_layout
 
 \begin_layout Subsection*
@@ -5293,8 +5410,143 @@ Sending VIRTIO_NET_CTRL_ANNOUNCE_ACK command through control vq.
  
 \end_layout
 
-\begin_layout Enumerate
+\begin_layout Subsection*
+
+\change_inserted 1986246365 1353593879
+Packet Receive Flow Steering
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 1986246365 1353594403
+If the driver negotiates the VIRTIO_NET_F_RFS (depends on VIRTIO_NET_F_CTRL_VQ),
+ it can transmit outgoing packets on one of the multiple transmitq0..transmitqN
+ and ask the device to queue incoming packets into one of the multiple receiveq0..rec
+eiveqN depending on the packet flow.
+\change_unchanged
+
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 1986246365 1353594292
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+\change_inserted 1986246365 1353594178
+
+struct virtio_net_ctrl_rfs {
+\end_layout
+
+\begin_layout Plain Layout
+
+\change_inserted 1986246365 1353594212
+
+	u16 virtqueue_pairs;
+\end_layout
+
+\begin_layout Plain Layout
+
+\change_inserted 1986246365 1353594172
+
+};
+\end_layout
+
+\begin_layout Plain Layout
+
+\change_inserted 1986246365 1353594172
+
+\end_layout
+
+\begin_layout Plain Layout
+
+\change_inserted 1986246365 1353594263
+
+#define VIRTIO_NET_CTRL_RFS    1
+\end_layout
+
+\begin_layout Plain Layout
+
+\change_inserted 1986246365 1353594273
+
+ #define VIRTIO_NET_CTRL_RFS_VQ_PAIRS_SET        0 
+\end_layout
+
+\begin_layout Plain Layout
+
+\change_inserted 1986246365 1353594273
+
+ #define VIRTIO_NET_CTRL_RFS_VQ_PAIRS_MIN        1 
+\end_layout
+
+\begin_layout Plain Layout
+
+\change_inserted 1986246365 1353594273
+
+ #define VIRTIO_NET_CTRL_RFS_VQ_PAIRS_MAX        0x8000
+\end_layout
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 1986246365 1353594884
+RFS acceleration is disabled by default.
+ Driver enables RFS by executing the VIRTIO_NET_CTRL_RFS_VQ_PAIRS_SET command,
+ specifying the number of the transmit and receive queues that is going
+ to be used; thus out of transmitq0..transmitqn and receiveq0..receiveqn where
+ 
+\emph on
+n=virtqueue_pairs-1
+\emph default
+ will be used.
+ All these virtqueues must have been pre-configured in advance.
+ The range of legal values for the
+\emph on
+ virtqueue_pairs
+\emph off
+ field is between 1 and 8000h.
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 1986246365 1353595328
+Programming of the receive flow classificator is implicit.
+ Transmitting a packet of a specific flow on transmitqX will cause incoming
+ packets for this flow to be steered to receiveqX.
+ For uni-directional protocols, or where no packets have been transmitted
+ yet, device will steer a packet to a random queue out of the specified
+ receiveq0..receiveqn.
+\change_unchanged
+
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 1986246365 1353595040
+RFS acceleration is disabled by setting 
+\emph on
+virtqueue_pairs = 1
+\emph default
+ (this is the default).
+ Following this, driver should not transmit new packets on virtqueues other
+ than transmitq0 and device will not steer new packets on virtqueues other
+ than receiveq0.
+\change_unchanged
+
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 1986246365 1353593873
 .
+
+\change_unchanged
  
 \end_layout
 

^ permalink raw reply related

* Re: [Qemu-devel] tap devices not receiving packets from a bridge
From: Peter Lieven @ 2012-11-23 11:02 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Stefan Hajnoczi, qemu-devel, netdev
In-Reply-To: <20121123110146.GC7051@redhat.com>


Am 23.11.2012 um 12:01 schrieb Michael S. Tsirkin:

> On Fri, Nov 23, 2012 at 10:41:21AM +0100, Peter Lieven wrote:
>> 
>> Am 23.11.2012 um 08:02 schrieb Stefan Hajnoczi:
>> 
>>> On Thu, Nov 22, 2012 at 03:29:52PM +0100, Peter Lieven wrote:
>>>> is anyone aware of a problem with the linux network bridge that in very rare circumstances stops
>>>> a bridge from sending pakets to a tap device?
>>>> 
>>>> My problem occurs in conjunction with vanilla qemu-kvm-1.2.0 and Ubuntu Kernel 3.2.0-34.53
>>>> which is based on Linux 3.2.33.
>>>> 
>>>> I was not yet able to reproduce the issue, it happens in really rare cases. The symptom is that
>>>> the tap does not have any TX packets. RX is working fine. I see the packets coming in at
>>>> the physical interface on the host, but they are not forwarded to the tap interface.
>>>> The bridge itself has learnt the mac address of the vServer that is connected to the tap interface.
>>>> It does not help to toggle the bridge link status,  the tap interface status or the interface in the vServer.
>>>> It seems that problem occurs if a tap interface that has previously been used, but set to nonpersistent
>>>> is set persistent again and then is by chance assigned to the same vServer (=same mac address on same
>>>> bridge) again. Unfortunately it seems not to be reproducible.
>>> 
>>> Not sure but this patch from Michael Tsirkin may help - it solves an
>>> issue with persistent tap devices:
>>> 
>>> http://patchwork.ozlabs.org/patch/198598/
>> 
>> Hi Stefan,
>> 
>> thanks for the pointer. I have seen this patch, but I have neglected it because it was dealing
>> with persistent taps. But maybe the taps in the kernel are not deleted directly. 
>> Can you remember what the syptomps of the above issue have been? Sorry for
>> being vague, but I currently have no clue whats going on.
>> 
>> Can someone who has more internal knowledge of the bridging/tap code say if qemu can
>> be responsible at all if the tap device is not receiving packets from the bridge.
>> 
>> If I have the following config. Lets say packets coming in via physical interface eth1.123,
>> and a bridge called br123.I further have a virtual machine with tap0. Both eth1.123
>> and tap0 are member of br123. 
>> 
>> If the issue occurs the vServer has no network connectivity inbound. If I sent a ping
>> from the vServer I see it on tap0 and leaving on eth1.123. I see further the arp reply coming
>> in via eth1.123, but the reply can't be seen on tap0.
>> 
>> Peter
> 
> If guest is not consuming packets, a TX queue in tap device
> will with time overrun (there's space for 1000 packets there).
> This is code from tun:

From what I remember there were zero TX packets and no TX errors
on the device.

Might it be that this queue is somehow not cleared correctly when
the device is reassigned (although it was nonpersistent in between)?

Thank you,
Peter

> 
>        if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
>                          >= dev->tx_queue_len / tun->numqueues){
>                if (!(tun->flags & TUN_ONE_QUEUE)) {
>                        /* Normal queueing mode. */
>                        /* Packet scheduler handles dropping of further
> * packets. */
>                        netif_stop_subqueue(dev, txq);
> 
>                        /* We won't see all dropped packets
> * individually, so overrun
>                         * error is more appropriate. */
>                        dev->stats.tx_fifo_errors++;
> 
> 
> So you can detect that this triggered by looking at fifo errors counter in device.
> 
> Once this happens TX queue is stopped, then you hit this path:
> 
>                        if (!netif_xmit_stopped(txq)) {
>                                __this_cpu_inc(xmit_recursion);
>                                rc = dev_hard_start_xmit(skb, dev, txq);
>                                __this_cpu_dec(xmit_recursion);
>                                if (dev_xmit_complete(rc)) {
>                                        HARD_TX_UNLOCK(dev, txq);
>                                        goto out;
>                                }
>                        }
> 
> so packets are not passed to device anymore.
> It will stay this way until guest consumes some packets and
> queue is restarted.
> 
>>> 
>>> Stefan

^ permalink raw reply

* Re: [Qemu-devel] tap devices not receiving packets from a bridge
From: Michael S. Tsirkin @ 2012-11-23 11:01 UTC (permalink / raw)
  To: Peter Lieven; +Cc: Stefan Hajnoczi, qemu-devel, netdev
In-Reply-To: <E85C6011-548D-4507-A776-1028DD3E3515@dlhnet.de>

On Fri, Nov 23, 2012 at 10:41:21AM +0100, Peter Lieven wrote:
> 
> Am 23.11.2012 um 08:02 schrieb Stefan Hajnoczi:
> 
> > On Thu, Nov 22, 2012 at 03:29:52PM +0100, Peter Lieven wrote:
> >> is anyone aware of a problem with the linux network bridge that in very rare circumstances stops
> >> a bridge from sending pakets to a tap device?
> >> 
> >> My problem occurs in conjunction with vanilla qemu-kvm-1.2.0 and Ubuntu Kernel 3.2.0-34.53
> >> which is based on Linux 3.2.33.
> >> 
> >> I was not yet able to reproduce the issue, it happens in really rare cases. The symptom is that
> >> the tap does not have any TX packets. RX is working fine. I see the packets coming in at
> >> the physical interface on the host, but they are not forwarded to the tap interface.
> >> The bridge itself has learnt the mac address of the vServer that is connected to the tap interface.
> >> It does not help to toggle the bridge link status,  the tap interface status or the interface in the vServer.
> >> It seems that problem occurs if a tap interface that has previously been used, but set to nonpersistent
> >> is set persistent again and then is by chance assigned to the same vServer (=same mac address on same
> >> bridge) again. Unfortunately it seems not to be reproducible.
> > 
> > Not sure but this patch from Michael Tsirkin may help - it solves an
> > issue with persistent tap devices:
> > 
> > http://patchwork.ozlabs.org/patch/198598/
> 
> Hi Stefan,
> 
> thanks for the pointer. I have seen this patch, but I have neglected it because it was dealing
> with persistent taps. But maybe the taps in the kernel are not deleted directly. 
> Can you remember what the syptomps of the above issue have been? Sorry for
> being vague, but I currently have no clue whats going on.
> 
> Can someone who has more internal knowledge of the bridging/tap code say if qemu can
> be responsible at all if the tap device is not receiving packets from the bridge.
> 
> If I have the following config. Lets say packets coming in via physical interface eth1.123,
> and a bridge called br123.I further have a virtual machine with tap0. Both eth1.123
> and tap0 are member of br123. 
> 
> If the issue occurs the vServer has no network connectivity inbound. If I sent a ping
> from the vServer I see it on tap0 and leaving on eth1.123. I see further the arp reply coming
> in via eth1.123, but the reply can't be seen on tap0.
> 
> Peter

If guest is not consuming packets, a TX queue in tap device
will with time overrun (there's space for 1000 packets there).
This is code from tun:

        if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
                          >= dev->tx_queue_len / tun->numqueues){
                if (!(tun->flags & TUN_ONE_QUEUE)) {
                        /* Normal queueing mode. */
                        /* Packet scheduler handles dropping of further
 * packets. */
                        netif_stop_subqueue(dev, txq);

                        /* We won't see all dropped packets
 * individually, so overrun
                         * error is more appropriate. */
                        dev->stats.tx_fifo_errors++;


So you can detect that this triggered by looking at fifo errors counter in device.

Once this happens TX queue is stopped, then you hit this path:

                        if (!netif_xmit_stopped(txq)) {
                                __this_cpu_inc(xmit_recursion);
                                rc = dev_hard_start_xmit(skb, dev, txq);
                                __this_cpu_dec(xmit_recursion);
                                if (dev_xmit_complete(rc)) {
                                        HARD_TX_UNLOCK(dev, txq);
                                        goto out;
                                }
                        }

so packets are not passed to device anymore.
It will stay this way until guest consumes some packets and
queue is restarted.

> > 
> > Stefan

^ permalink raw reply

* [PATCH] net: sched: enable CAN Identifier to be build into kernel
From: Marc Kleine-Budde @ 2012-11-23 10:44 UTC (permalink / raw)
  To: netdev; +Cc: linux-can, Marc Kleine-Budde

This patch makes it possible to build the CAN Identifier into the kernel, even
if the CAN support is built as a module.

Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
Hello,

is there a nicer solution to this problem? Or remove the "&& CAN" at all?

Marc

 net/sched/Kconfig |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 62fb51f..235e01a 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -509,7 +509,7 @@ config NET_EMATCH_TEXT
 
 config NET_EMATCH_CANID
 	tristate "CAN Identifier"
-	depends on NET_EMATCH && CAN
+	depends on NET_EMATCH && (CAN=y || CAN=m)
 	---help---
 	  Say Y here if you want to be able to classify CAN frames based
 	  on CAN Identifier.
-- 
1.7.10.4


^ permalink raw reply related

* [PATCH net-next] be2net: fix a possible events_get() race on BE2
From: Sathya Perla @ 2012-11-23 10:27 UTC (permalink / raw)
  To: netdev; +Cc: Sathya Perla

On BE2 chip, an interrupt being raised even when EQ is in un-armed state has
been observed a few times.  This is not expected and has never been
observed on BE3/Lancer chips.

As a consequence, be_msix()::events_get() and be_poll()::events_get()
can race and notify an EQ wrongly causing a CEV UE. The other possible
side-effect would be traffic stalling because after notifying EQ,
napi_schedule() is ignored as NAPI is already running.

This patch fixes this issue by counting events only in be_poll().

Signed-off-by: Sathya Perla <sathya.perla@emulex.com>
---
 drivers/net/ethernet/emulex/benet/be_main.c |   11 +++++++----
 1 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index c365722..adef536 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -2029,7 +2029,8 @@ static irqreturn_t be_msix(int irq, void *dev)
 {
 	struct be_eq_obj *eqo = dev;
 
-	event_handle(eqo);
+	be_eq_notify(eqo->adapter, eqo->q.id, false, true, 0);
+	napi_schedule(&eqo->napi);
 	return IRQ_HANDLED;
 }
 
@@ -2125,9 +2126,11 @@ int be_poll(struct napi_struct *napi, int budget)
 {
 	struct be_eq_obj *eqo = container_of(napi, struct be_eq_obj, napi);
 	struct be_adapter *adapter = eqo->adapter;
-	int max_work = 0, work, i;
+	int max_work = 0, work, i, num_evts;
 	bool tx_done;
 
+	num_evts = events_get(eqo);
+
 	/* Process all TXQs serviced by this EQ */
 	for (i = eqo->idx; i < adapter->num_tx_qs; i += adapter->num_evt_qs) {
 		tx_done = be_process_tx(adapter, &adapter->tx_obj[i],
@@ -2150,10 +2153,10 @@ int be_poll(struct napi_struct *napi, int budget)
 
 	if (max_work < budget) {
 		napi_complete(napi);
-		be_eq_notify(adapter, eqo->q.id, true, false, 0);
+		be_eq_notify(adapter, eqo->q.id, true, false, num_evts);
 	} else {
 		/* As we'll continue in polling mode, count and clear events */
-		be_eq_notify(adapter, eqo->q.id, false, false, events_get(eqo));
+		be_eq_notify(adapter, eqo->q.id, false, false, num_evts);
 	}
 	return max_work;
 }
-- 
1.7.1

^ permalink raw reply related

* Re: [PATCHv4] virtio-spec: virtio network device RFS support
From: Michael S. Tsirkin @ 2012-11-23 10:10 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, kvm, virtualization
In-Reply-To: <50AF06F0.1030604@redhat.com>

On Fri, Nov 23, 2012 at 01:17:36PM +0800, Jason Wang wrote:
...

> "specifying the number of the last transmit and receive queue that
> is going to be used; thus out of transmitq0..transmitqn and
> receiveq0..receiveqn where n=virtqueue_pairs will be used."
> 
> In this description, looks like n+1 virtqueue pairs (include
> receiveq0 and transmitq0) could be used in RFS mode.

The intent was not to reserve any virt queue pairs.
I hope I clarified this below.



Thanks for the comments. Here's an incremental patch to address
them.


diff --git a/virtio-spec.lyx b/virtio-spec.lyx
index e562335..53ddeec 100644
--- a/virtio-spec.lyx
+++ b/virtio-spec.lyx
@@ -4384,7 +4384,7 @@ VIRTIO_NET_F_GUEST_ANNOUNCE(21) Guest can send gratuitous packets.
 \begin_layout Description
 
 \change_inserted 1986246365 1352742808
-VIRTIO_NET_F_RFS(2) Device supports Receive Flow Steering.
+VIRTIO_NET_F_RFS(22) Device supports Receive Flow Steering.
 \change_unchanged
 
 \end_layout
@@ -4432,9 +4432,10 @@ N
 \emph default
 =
 \emph on
-max_virtqueue_pairs
+max_virtqueue_pairs - 1
 \emph default
 ) that can be configured once VIRTIO_NET_F_RFS is negotiated.
+Legal values for this field are 1 to 8000h.
 
 \change_unchanged
  
@@ -4496,7 +4497,7 @@ The initialization routine should identify the receive and transmission
 \change_inserted 1986246365 1352743942
  If VIRTIO_NET_F_RFS feature bit is negotiated, 
 \emph on
-N=max_virtqueue_pairs
+N=max_virtqueue_pairs-1
 \emph default
 , otherwise identify 
 \emph on
@@ -5464,7 +5465,7 @@ struct virtio_net_ctrl_rfs {
 
 \change_inserted 1986246365 1353594263
 
-#define VIRTIO_NET_CTRL_RFC    1
+#define VIRTIO_NET_CTRL_RFS    1
 \end_layout
 
 \begin_layout Plain Layout
@@ -5474,6 +5475,19 @@ struct virtio_net_ctrl_rfs {
  #define VIRTIO_NET_CTRL_RFS_VQ_PAIRS_SET        0 
 \end_layout
 
+\begin_layout Plain Layout
+
+\change_inserted 1986246365 1353594273
+
+ #define VIRTIO_NET_CTRL_RFS_VQ_PAIRS_MIN        1 
+\end_layout
+
+\begin_layout Plain Layout
+
+\change_inserted 1986246365 1353594273
+
+ #define VIRTIO_NET_CTRL_RFS_VQ_PAIRS_MAX        0x8000
+\end_layout
 \end_inset
 
 
@@ -5484,14 +5498,19 @@ struct virtio_net_ctrl_rfs {
 \change_inserted 1986246365 1353594884
 RFS acceleration is disabled by default.
  Driver enables RFS by executing the VIRTIO_NET_CTRL_RFS_VQ_PAIRS_SET command,
- specifying the number of the last transmit and receive queue that is going
+ specifying the number of the transmit and receive queues that is going
  to be used; thus out of transmitq0..transmitqn and receiveq0..receiveqn where
  
 \emph on
-n=virtqueue
+n=virtqueue_pairs-1
 \emph default
-_pairs will be used.
+ will be used.
  All these virtqueues must have been pre-configured in advance.
+ The range of legal values for the
+\emph on
+ virtqueue_pairs
+\emph off
+ field is between 1 and 8000h.
 \end_layout
 
 \begin_layout Standard
@@ -5512,7 +5531,7 @@ Programming of the receive flow classificator is implicit.
 \change_inserted 1986246365 1353595040
 RFS acceleration is disabled by setting 
 \emph on
-virtqueue_pairs = 0
+virtqueue_pairs = 1
 \emph default
  (this is the default).
  Following this, driver should not transmit new packets on virtqueues other

^ permalink raw reply related

* [TCP] Flags returned by poll on connection request to closed peer?
From: Yi Li @ 2012-11-23 10:07 UTC (permalink / raw)
  To: netdev

Hi List,
I issue a non-blocking connection request to a closed peer and then
call select() to get the status of the socket. When I run this from
many threads, I get the following statistics:

  POLLIN_SET POLLOUT_SET	9980000
  !POLLIN_SET !POLLOUT_SET	0
  POLLIN_SET !POLLOUT_SET	0
  !POLLIN_SET POLLOUT_SET	20000

POLLIN_SET && POLLOUT_SET means a connection error (as expected, since we are attempting to connect
to a closed peer). But what is the meaning of !POLLIN_SET POLLOUT_SET?

Here is my test program, and my test command is :
./client -d $SERVERS -s $max_range_start -e $max_range_end -t 20000

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/select.h>
#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <netdb.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Size of the scratch buffer declared in parse_options(). */
#define BUFSIZE 255
/* Capacity of port_pool[]; main() clamps the requested port range to it. */
#define POOL_SIZE 1000

/* First and last server port to probe, set from the -s and -e options. */
unsigned short port_start, port_end;
/* Server IPv4 address, stored exactly as taken from the resolver result and
 * copied unchanged into sin_addr.s_addr by the worker threads. */
uint32_t server_ip;
/* Number of connection attempts each thread makes, set from the -t option. */
int total_count;
/* Ports to probe, filled by main(); each thread reads the entry at its index. */
unsigned short port_pool[POOL_SIZE];

/*
 * Print the command-line synopsis and a one-line description of every option
 * to stdout.
 *
 * name: program name shown at the start of the synopsis (callers pass argv[0]).
 */
void usage(const char *name){
     printf("%s: -d SERVER_IP -s SERVER_PORT_S -e SERVER_PORT_E  -t TOTAL_COUNT -h\n", name);
     printf("-d SERVER_IP: server ip is SERVER_IP\n");
     printf("-s SERVER_PORT_S: closed server port range start, one thread per port\n");
     /* The range end is selected with -e (see the getopt string "d:s:e:t:h"). */
     printf("-e SERVER_PORT_E: closed server port range end, one thread per port\n");
     printf("-t TOTAL_COUNT: per thread try TOTAL_COUNT times connection requests\n");
     printf("-h: print this help message\n");
}

/*
 * Thread body: repeatedly start a non-blocking TCP connect to one server port
 * and report which of the read/write sets select() marks ready.
 *
 * arg: the thread's index into port_pool[], carried by value inside the
 *      pointer (main() casts the integer index to void *).
 *
 * Makes up to total_count attempts, printing one result line per attempt to
 * stdout and errors to stderr.  A socket() failure ends the thread early;
 * any other per-attempt failure closes the socket and moves on to the next
 * attempt.  The thread always terminates via pthread_exit(NULL).
 */
void* talk_to_server(void* arg){
     int sockfd, flags, ret, i = 0;
     struct timeval timeout;
     fd_set rset, wset;
     struct sockaddr_in server_addr;
     /* Recover the index by value.  Reading it through an int * aimed at the
      * pointer object breaks aliasing rules and picks the wrong bytes on
      * big-endian hosts where pointers are wider than int. */
     int index = (int)(intptr_t)arg;

     /* Zero the whole address first so the padding bytes are not stack garbage. */
     memset(&server_addr, 0, sizeof(server_addr));
     server_addr.sin_family = AF_INET;
     server_addr.sin_addr.s_addr = server_ip;
     server_addr.sin_port = htons(port_pool[index]);

     while(i++ < total_count){
	if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) < 0){
	    perror("client: socket create error");
	    goto exit;
	}

	/* An fd_set only has room for descriptors below FD_SETSIZE. */
	if(sockfd >= FD_SETSIZE){
	    fprintf(stderr, "client: sockfd=%d does not fit in an fd_set\n", sockfd);
	    goto sockfd_exit;
	}

	FD_ZERO(&rset);
	FD_SET(sockfd, &rset);
	wset = rset;
	timeout.tv_sec = 10;
	timeout.tv_usec = 0;

	/* Without O_NONBLOCK the connect() below would block, so treat a
	 * failure to set it as a failed attempt. */
	flags = fcntl(sockfd, F_GETFL, 0);
	if(flags < 0 || fcntl(sockfd, F_SETFL, flags | O_NONBLOCK) < 0){
	    perror("client: fcntl failed.");
	    goto sockfd_exit;
	}

	ret = connect(sockfd, (struct sockaddr *)&server_addr, sizeof(server_addr));
	if(ret == 0){
	    fprintf(stderr, "client: connect established.\n");
	    goto sockfd_exit;
	}
	/* EINPROGRESS is the expected result for a non-blocking connect. */
	if(errno != EINPROGRESS){
	    perror("client: connect failed.");
	    goto sockfd_exit;
	}

	/* On timeout select() returns 0 with both sets cleared, which is
	 * reported below as "!POLLIN_SET !POLLOUT_SET". */
	if( (ret = select(sockfd+1, &rset, &wset, NULL, &timeout)) < 0){
	    perror("client: select failed.");
	    goto sockfd_exit;
	}

	if(FD_ISSET(sockfd, &rset) && FD_ISSET(sockfd, &wset)){
	    fprintf(stdout, "client: sockfd=%d, with POLLIN_SET POLLOUT_SET\n", sockfd);
	}else if(FD_ISSET(sockfd, &rset) && !FD_ISSET(sockfd, &wset)){
	    fprintf(stdout, "client: sockfd=%d, with POLLIN_SET !POLLOUT_SET\n", sockfd);
	}else if(!FD_ISSET(sockfd, &rset) && FD_ISSET(sockfd, &wset)){
	    fprintf(stdout, "client: sockfd=%d, with !POLLIN_SET POLLOUT_SET\n", sockfd);
	}else{
	    fprintf(stdout, "client: sockfd=%d, with !POLLIN_SET !POLLOUT_SET\n", sockfd);
	}
	
     sockfd_exit:
	close(sockfd);
     }
  exit:
     pthread_exit(NULL);
}

/* Largest value that fits in the 16-bit port globals. */
enum { PORT_MAX = 65535 };

/*
 * Parse text as a base-10 integer within [min, max].
 *
 * Returns 0 and stores the value in *out on success; returns -1 (leaving *out
 * untouched) if text is empty, has trailing characters, or is out of range.
 */
static int parse_long_in_range(const char *text, long min, long max, long *out){
     char *end;
     long value;

     errno = 0;
     value = strtol(text, &end, 10);
     if(end == text || *end != '\0' || errno == ERANGE)
	return -1;
     if(value < min || value > max)
	return -1;
     *out = value;
     return 0;
}

/*
 * Parse the command line into the globals server_ip, port_start, port_end
 * and total_count.
 *
 * Returns 0 on success.  Returns -1 if too few arguments were given, the host
 * cannot be resolved to an IPv4 address, a numeric argument is malformed or
 * out of range, an unknown option is seen, or -h was given (usage is printed
 * in the too-few-arguments and -h cases).
 */
int parse_options(int argc, char *argv[]){
     int ret;
     long value;
     struct hostent *hptr;

     if(argc < 6){
	usage(argv[0]);
	return -1;
     }
     
     while((ret = getopt(argc, argv, "d:s:e:t:h")) != -1){
	switch(ret){
	case 'd':
	    if( (hptr = gethostbyname(optarg)) == NULL){
		fprintf(stderr, "client: gethostbyname error: %s\n", hstrerror(h_errno));
		return -1;
	    }
	    switch(hptr->h_addrtype){
	    case AF_INET:
		/* h_addr_list[0] is the first resolved address; the h_addr
		 * shorthand for it is not available on every libc. */
		server_ip =((struct in_addr*)hptr->h_addr_list[0])->s_addr;
		break;
	    default:
		fprintf(stderr, "client: unknow address type\n");
		return -1;
	    }
	    break;
	case 's':
	    /* Reject values that would be silently truncated when stored
	     * in an unsigned short. */
	    if(parse_long_in_range(optarg, 0, PORT_MAX, &value) < 0){
		fprintf(stderr, "client: invalid start port '%s'\n", optarg);
		return -1;
	    }
	    port_start = (unsigned short)value;
	    break;
	case 'e':
	    if(parse_long_in_range(optarg, 0, PORT_MAX, &value) < 0){
		fprintf(stderr, "client: invalid end port '%s'\n", optarg);
		return -1;
	    }
	    port_end = (unsigned short)value;
	    break;
	case 't':
	    if(parse_long_in_range(optarg, 0, INT_MAX, &value) < 0){
		fprintf(stderr, "client: invalid total count '%s'\n", optarg);
		return -1;
	    }
	    total_count = (int)value;
	    break;
	case 'h':
	    usage(argv[0]);
	    return -1;
	case '?':
	default:
	    fprintf(stderr, "unknow option %c\n", optopt);
	    return -1;
	}
     }
     return 0;
}

/*
 * Entry point: parse the options, fill port_pool[] and start one detached
 * thread per port in [port_start, port_end], capped at POOL_SIZE ports.
 *
 * Returns 0 when option parsing fails (this includes -h).  Otherwise the
 * initial thread leaves through pthread_exit() rather than returning, so that
 * leaving main() does not end the process while the workers are running.
 */
int main(int argc, char *argv[]){
     int i;
     pthread_t tid;
     pthread_attr_t child_thread_attr;
     
     if(parse_options(argc, argv) < 0)
	return 0;

     /* Never use more ports than port_pool[] can hold. */
     if( port_end - port_start+1 > POOL_SIZE)
	port_end = port_start + POOL_SIZE -1;
     
     /*initialize port pool*/
     for(i = 0; port_start + i <= port_end; i++){
	port_pool[i] = port_start + i;
     }
     
     /*create threads, one thread per server port*/
     pthread_attr_init(&child_thread_attr);
     pthread_attr_setdetachstate(&child_thread_attr, PTHREAD_CREATE_DETACHED);
     for( i = 0; port_start + i <= port_end ; i++){
	    /* The index travels by value inside the pointer; going through
	     * intptr_t makes the int-to-pointer conversion well defined. */
	    if( pthread_create(&tid, &child_thread_attr, talk_to_server, (void *)(intptr_t)i) != 0 )
		fprintf(stderr, "client: pthread create failed thread %d port %d\n",
			i, port_start+i);
     }
     /* The attribute object is no longer needed once every thread is created. */
     pthread_attr_destroy(&child_thread_attr);
     pthread_exit(NULL);
}

^ permalink raw reply

* Re: [Qemu-devel] tap devices not receiving packets from a bridge
From: Peter Lieven @ 2012-11-23  9:41 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: qemu-devel, netdev, mst
In-Reply-To: <20121123070211.GC22787@stefanha-thinkpad.hitronhub.home>

Am 23.11.2012 um 08:02 schrieb Stefan Hajnoczi:

> On Thu, Nov 22, 2012 at 03:29:52PM +0100, Peter Lieven wrote:
>> is anyone aware of a problem with the linux network bridge that in very rare circumstances stops
>> a bridge from sending pakets to a tap device?
>> 
>> My problem occurs in conjunction with vanilla qemu-kvm-1.2.0 and Ubuntu Kernel 3.2.0-34.53
>> which is based on Linux 3.2.33.
>> 
>> I was not yet able to reproduce the issue, it happens in really rare cases. The symptom is that
>> the tap does not have any TX packets. RX is working fine. I see the packets coming in at
>> the physical interface on the host, but they are not forwarded to the tap interface.
>> The bridge itself has learnt the mac address of the vServer that is connected to the tap interface.
>> It does not help to toggle the bridge link status,  the tap interface status or the interface in the vServer.
>> It seems that problem occurs if a tap interface that has previously been used, but set to nonpersistent
>> is set persistent again and then is by chance assigned to the same vServer (=same mac address on same
>> bridge) again. Unfortunately it seems not to be reproducible.
> 
> Not sure but this patch from Michael Tsirkin may help - it solves an
> issue with persistent tap devices:
> 
> http://patchwork.ozlabs.org/patch/198598/

Hi Stefan,

thanks for the pointer. I have seen this patch, but I have neglected it because it was dealing
with persistent taps. But maybe the taps in the kernel are not deleted directly. 
Can you remember what the syptomps of the above issue have been? Sorry for
being vague, but I currently have no clue whats going on.

Can someone who has more internal knowledge of the bridging/tap code say if qemu can
be responsible at all if the tap device is not receiving packets from the bridge.

If I have the following config. Lets say packets coming in via physical interface eth1.123,
and a bridge called br123.I further have a virtual machine with tap0. Both eth1.123
and tap0 are member of br123. 

If the issue occurs the vServer has no network connectivity inbound. If I sent a ping
from the vServer I see it on tap0 and leaving on eth1.123. I see further the arp reply coming
in via eth1.123, but the reply can't be seen on tap0.

Peter

> 
> Stefan

^ permalink raw reply

* Re: [PATCH 1/3] net: stmmac: change GMAC control register for SGMII
From: Giuseppe CAVALLARO @ 2012-11-23  9:31 UTC (permalink / raw)
  To: Byungho An; +Cc: davem, jeffrey.t.kirsher, netdev, kgene.kim, linux-kernel
In-Reply-To: <004b01cdc959$80af8030$820e8090$%an@samsung.com>

Hello An

On 11/23/2012 10:04 AM, Byungho An wrote:
>
> This patch changes GMAC control register (TC(Transmit
> Configuration) and PS(Port Selection) bit for SGMII.
> In case of SGMII, TC bit is '1' and PS bit is 0.

I was looking at this too. In particular, I was working on the rgmii 
interrupt so I guess we could improve this part together.

My first note is that I would like this kind of code never to be
placed in stmmac_main; it should stay in the core part.
Also, I'd like to avoid the Kconfig option where possible.

At any rate, I'll come back with further details soon.

BR,
Peppe

>
> Signed-off-by: Byungho An <bh74.an@samsung.com>
> ---
>   drivers/net/ethernet/stmicro/stmmac/stmmac_main.c |   10 ++++++++++
>   1 file changed, 10 insertions(+)
>
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> index c6cdbc4..a719c87 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> @@ -1037,6 +1037,7 @@ static int stmmac_open(struct net_device *dev)
>   {
>   	struct stmmac_priv *priv = netdev_priv(dev);
>   	int ret;
> +	u32 value;
>
>   #ifdef CONFIG_STMMAC_TIMER
>   	priv->tm = kzalloc(sizeof(struct stmmac_timer *), GFP_KERNEL);
> @@ -1088,6 +1089,15 @@ static int stmmac_open(struct net_device *dev)
>   	/* Initialize the MAC Core */
>   	priv->hw->mac->core_init(priv->ioaddr);
>
> +	if (priv->phydev->interface == PHY_INTERFACE_MODE_SGMII) {
> +		value = readl(priv->ioaddr);
> +		/* GMAC_CONTROL_TC : transmit config in RGMII/SGMII */
> +		value |= 0x1000000;
> +		/* GMAC_CONTROL_PS : Port Selection for GMII */
> +		value &= ~(0x8000);
> +		writel(value, priv->ioaddr);
> +	}
> +
>   	/* Request the IRQ lines */
>   	ret = request_irq(dev->irq, stmmac_interrupt,
>   			 IRQF_SHARED, dev->name, dev);
>

^ permalink raw reply

* [PATCH 2/3] net: stmmac: add SGMII RAL control bit
From: Byungho An @ 2012-11-23  9:04 UTC (permalink / raw)
  To: davem, peppe.cavallaro, jeffrey.t.kirsher; +Cc: netdev, kgene.kim, linux-kernel


This patch sets SGMRAL bit in AN control register.
This bit forces the SGMII RAL block to operate in the
speed configured in the Speed and Port Select bits of
the GMAC Configuration register.

Signed-off-by: Byungho An <bh74.an@samsung.com>
---
 drivers/net/ethernet/stmicro/stmmac/Kconfig       |    7 +++++++
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c |   11 +++++++++++
 2 files changed, 18 insertions(+)

diff --git a/drivers/net/ethernet/stmicro/stmmac/Kconfig
b/drivers/net/ethernet/stmicro/stmmac/Kconfig
index 9f44827..d65d63b 100644
--- a/drivers/net/ethernet/stmicro/stmmac/Kconfig
+++ b/drivers/net/ethernet/stmicro/stmmac/Kconfig
@@ -54,6 +54,13 @@ config STMMAC_DA
 	  By default, the DMA arbitration scheme is based on Round-robin
 	  (rx:tx priority is 1:1).
 
+config STMMAC_SGMRAL
+	bool "STMMAC SGMII RAL Control"
+	default n
+	---help---
+	  SGMII RAL block to operate in the speed configured in the speed
+	  and port select bits of the MAC Configuration register.
+
 config STMMAC_TIMER
 	bool "STMMAC Timer optimisation"
 	default n
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index a719c87..670e585 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1096,6 +1096,17 @@ static int stmmac_open(struct net_device *dev)
 		/* GMAC_CONTROL_PS : Port Selection for GMII */
 		value &= ~(0x8000);
 		writel(value, priv->ioaddr);
+
+#ifdef CONFIG_STMMAC_SGMRAL
+		value = readl(priv->ioaddr + 0xc0);
+		/*
+		 * forces RAL block to operate in speed configured
+		 * in the speed and port select bits of GMAC
+		 * configuration register
+		 */
+		value |= 0x40000;
+		writel(value, priv->ioaddr + 0xc0);
+#endif
 	}
 
 	/* Request the IRQ lines */
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH 1/3] net: stmmac: change GMAC control register for SGMII
From: Byungho An @ 2012-11-23  9:04 UTC (permalink / raw)
  To: davem, peppe.cavallaro, jeffrey.t.kirsher; +Cc: netdev, kgene.kim, linux-kernel


This patch changes GMAC control register (TC(Transmit
Configuration) and PS(Port Selection) bit for SGMII.
In case of SGMII, TC bit is '1' and PS bit is 0.

Signed-off-by: Byungho An <bh74.an@samsung.com>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c |   10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index c6cdbc4..a719c87 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1037,6 +1037,7 @@ static int stmmac_open(struct net_device *dev)
 {
 	struct stmmac_priv *priv = netdev_priv(dev);
 	int ret;
+	u32 value;
 
 #ifdef CONFIG_STMMAC_TIMER
 	priv->tm = kzalloc(sizeof(struct stmmac_timer *), GFP_KERNEL);
@@ -1088,6 +1089,15 @@ static int stmmac_open(struct net_device *dev)
 	/* Initialize the MAC Core */
 	priv->hw->mac->core_init(priv->ioaddr);
 
+	if (priv->phydev->interface == PHY_INTERFACE_MODE_SGMII) {
+		value = readl(priv->ioaddr);
+		/* GMAC_CONTROL_TC : transmit config in RGMII/SGMII */
+		value |= 0x1000000;
+		/* GMAC_CONTROL_PS : Port Selection for GMII */
+		value &= ~(0x8000);
+		writel(value, priv->ioaddr);
+	}
+
 	/* Request the IRQ lines */
 	ret = request_irq(dev->irq, stmmac_interrupt,
 			 IRQF_SHARED, dev->name, dev);
-- 
1.7.9.5

^ permalink raw reply related

* Re: [PATCH v6] can: kvaser_usb: Add support for Kvaser CAN/USB devices
From: Marc Kleine-Budde @ 2012-11-23  8:48 UTC (permalink / raw)
  To: Greg KH
  Cc: Olivier Sobrie, Wolfgang Grandegger, linux-can, netdev, linux-usb,
	Daniel Berglund
In-Reply-To: <20121122213022.GB1461@kroah.com>

[-- Attachment #1: Type: text/plain, Size: 1594 bytes --]

On 11/22/2012 10:30 PM, Greg KH wrote:
> On Thu, Nov 22, 2012 at 04:01:49PM +0100, Olivier Sobrie wrote:
>> Hi linux-usb folks,
>>
>> Is there someone who can help me to fix the following errors?
>>
>> smatch warnings:
>>
>> + drivers/net/can/usb/kvaser_usb.c:431 kvaser_usb_send_simple_msg() error: doing
>> +dma on the stack ((null))
>> + drivers/net/can/usb/kvaser_usb.c:1073 kvaser_usb_set_opt_mode() error: doing
>> +dma on the stack ((null))
>> + drivers/net/can/usb/kvaser_usb.c:1174 kvaser_usb_flush_queue() error: doing
>> +dma on the stack ((null))
>> + drivers/net/can/usb/kvaser_usb.c:1384 kvaser_usb_set_bittiming() error: doing
>> +dma on the stack ((null))
>>
>> I assume it's due to the buffer I pass to the function usb_bulk_msg()
>> which is on the stack and can't be.
>> Do I just have to kmalloc a buffer and give it to the usb_bulk_msg()
>> function? That's what I understood by reading
>> "Documentation/DMA-API-HOWTO.txt" section "What memory is DMA'able?"...
>> and from commit
>> http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git;a=commitdiff;h=32ec4576c3fb37316b1d11a04b220527822f3f0d
> 
> Yes, that is all that is needed.

Thanks Greg. Olivier, you can post an incremental patch, I'll squash it
before sending the patches upstream.

regards,
Marc

-- 
Pengutronix e.K.                  | Marc Kleine-Budde           |
Industrial Linux Solutions        | Phone: +49-231-2826-924     |
Vertretung West/Dortmund          | Fax:   +49-5121-206917-5555 |
Amtsgericht Hildesheim, HRA 2686  | http://www.pengutronix.de   |


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 261 bytes --]

^ permalink raw reply

* [PATCH 1/1] ARM: dts: am335x-evmsk: Add cpsw phy_id
From: Mugunthan V N @ 2012-11-23  8:32 UTC (permalink / raw)
  To: b-cousson
  Cc: netdev, devicetree-discuss, linux-arm-kernel, linux-omap, paul,
	Mugunthan V N

Add phy id for CPSW

Signed-off-by: Mugunthan V N <mugunthanvnm@ti.com>
---
The patch is verified with CPSW patches present in the following git repo
git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git

 arch/arm/boot/dts/am335x-evmsk.dts |    8 ++++++++
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/arch/arm/boot/dts/am335x-evmsk.dts b/arch/arm/boot/dts/am335x-evmsk.dts
index 6f53879..c629086 100644
--- a/arch/arm/boot/dts/am335x-evmsk.dts
+++ b/arch/arm/boot/dts/am335x-evmsk.dts
@@ -164,3 +164,11 @@
 		};
 	};
 };
+
+&cpsw_emac0 {
+	phy_id = <&davinci_mdio>, <0>;
+};
+
+&cpsw_emac1 {
+	phy_id = <&davinci_mdio>, <1>;
+};
-- 
1.7.0.4


^ permalink raw reply related

* Re: [BUG] Kernel recieves DNS reply, but doesn't deliver it to a waiting application
From: Andrew Savchenko @ 2012-11-23  7:45 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <20121021032543.09d1844f.bircoph@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 2113 bytes --]

Hello,

On Sun, 21 Oct 2012 03:25:43 +0400 Andrew Savchenko wrote:
> > On Sat, 13 Oct 2012 15:44:20 +0200 Eric Dumazet wrote:
[...]
> > > You should investigate and check where the incoming packet is lost
> > > 
> > > Tools :
> > > 
> > > netstat -s
> > > 
> > > drop_monitor module and dropwatch command
> > > 
> > > cat /proc/net/udp
> > 
> > Thank you for you reply; I updated my kernel to 3.4.14, enabled
> > CONFIG_NET_DROP_MONITOR, and installed dropwatch utility.
> > 
> > I will report back when the bug will struck again.
> > This may take a week or two, however.
> 
> This bug is back again on kernel 3.4.14, but this time I was able to
> get debug data and to recover running kernel without reboot.
> 
> Dropwatch showed that DNS UDP replies are always dropped here:
> 1 drops at __udp_queue_rcv_skb+61 (0xffffffff813bd670)
> 
> Other observations:
> - only UDP replies are lost, TCP works fine;
> - if network load is reduced dramatically (ip_forward disabled, most
> network daemons stopped), UDP DNS queries work again; but with a
> gradual load increase, replies first become slow and then cease entirely.
> - CPU load is very low (uptime is below 0.05), so this shouldn't be
> an insufficient computing power issue.
> 
> I found __udp_queue_rcv_skb function in net/ipv4/udp.c. From the code
> and observations above it follows that this is likely to be an ENOMEM
> condition leading to a packet loss.
[...]
> net.ipv4.udp_mem = 100000       150000  200000
> 
> This solved my issue, at least for a while: DNS queries are working
> fine now.

And this solved the problem only temporarily: after 40 days of uptime the
same problem struck again with the same symptoms. I "solved" this
by increasing udp memory again:

net.ipv4.udp_mem = 200000  300000  400000

Of course, this solution is only a temporary workaround. Such
behaviour strengthens my suspicion of some kind of memory leak.

This host is still on 3.4.14, however: can't reboot now due to
workload. Will try 3.7 branch as soon as this will be possible.

Best regards,
Andrew Savchenko

[-- Attachment #2: Type: application/pgp-signature, Size: 198 bytes --]

^ permalink raw reply

* Re: [Qemu-devel] tap devices not receiving packets from a bridge
From: Stefan Hajnoczi @ 2012-11-23  7:02 UTC (permalink / raw)
  To: Peter Lieven; +Cc: qemu-devel@nongnu.org, netdev
In-Reply-To: <50AE36E0.8000307@dlhnet.de>

On Thu, Nov 22, 2012 at 03:29:52PM +0100, Peter Lieven wrote:
> is anyone aware of a problem with the linux network bridge that in very rare circumstances stops
> a bridge from sending packets to a tap device?
> 
> My problem occurs in conjunction with vanilla qemu-kvm-1.2.0 and Ubuntu Kernel 3.2.0-34.53
> which is based on Linux 3.2.33.
> 
> I was not yet able to reproduce the issue, it happens in really rare cases. The symptom is that
> the tap does not have any TX packets. RX is working fine. I see the packets coming in at
> the physical interface on the host, but they are not forwarded to the tap interface.
> The bridge itself has learnt the mac address of the vServer that is connected to the tap interface.
> It does not help to toggle the bridge link status,  the tap interface status or the interface in the vServer.
> It seems that problem occurs if a tap interface that has previously been used, but set to nonpersistent
> is set persistent again and then is by chance assigned to the same vServer (=same mac address on same
> bridge) again. Unfortunately it seems not to be reproducible.

Not sure but this patch from Michael Tsirkin may help - it solves an
issue with persistent tap devices:

http://patchwork.ozlabs.org/patch/198598/

Stefan

^ permalink raw reply

* KINGDOM NELSON
From: KINGDOM NELSON @ 2012-11-23  6:48 UTC (permalink / raw)


[-- Attachment #1: Type: text/plain, Size: 0 bytes --]



[-- Attachment #2: KINGDOM NELSON.rtf --]
[-- Type: application/msword, Size: 3465 bytes --]

^ permalink raw reply

* Re: [PATCHv4] virtio-spec: virtio network device RFS support
From: Jason Wang @ 2012-11-23  5:17 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: rusty, virtualization, netdev, kvm
In-Reply-To: <20121122144645.GA28284@redhat.com>

On 11/22/2012 10:46 PM, Michael S. Tsirkin wrote:
> Add RFS support to virtio network device.
> Add a new feature flag VIRTIO_NET_F_RFS for this feature, a new
> configuration field max_virtqueue_pairs to detect supported number of
> virtqueues as well as a new command VIRTIO_NET_CTRL_RFS to program
> packet steering for unidirectional protocols.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
>
> --
>
> Changes from v3:
> - rename multiqueue -> rfs this is what we support
> - Be more explicit about what driver should do.
> - Simplify layout making VQs functionality depend on feature.
> - Remove unused commands, only leave in programming # of queues
>
> Changes from v2:
> Address Jason's comments on v2:
> - Changed STEERING_HOST to STEERING_RX_FOLLOWS_TX:
>    this is both clearer and easier to support.
>    It does not look like we need a separate steering command
>    since host can just watch tx packets as they go.
> - Moved RX and TX steering sections near each other.
> - Add motivation for other changes in v2
>
> Changes from Jason's rfc:
> - reserved vq 3: this makes all rx vqs even and tx vqs odd, which
>    looks nicer to me.
> - documented packet steering, added a generalized steering programming
>    command. Current modes are single queue and host driven multiqueue,
>    but I envision support for guest driven multiqueue in the future.
> - make default vqs unused when in mq mode - this wastes some memory
>    but makes it more efficient to switch between modes as
>    we can avoid this causing packet reordering.
>
> Rusty, could you please take a look and comment soon?
> If this looks OK to everyone, we can proceed with finalizing the
> implementation. Would be nice to try and put it in 3.8.
>
> ---
>
> diff --git a/virtio-spec.lyx b/virtio-spec.lyx
> index d2f0da9..c1fa3e4 100644
> --- a/virtio-spec.lyx
> +++ b/virtio-spec.lyx
> @@ -59,6 +59,7 @@
>   \author -608949062 "Rusty Russell,,,"
>   \author -385801441 "Cornelia Huck" cornelia.huck@de.ibm.com
>   \author 1531152142 "Paolo Bonzini,,,"
> +\author 1986246365 "Michael S. Tsirkin"
>   \end_header
>   
>   \begin_body
> @@ -4170,9 +4171,42 @@ ID 1
>   \end_layout
>   
>   \begin_layout Description
> -Virtqueues 0:receiveq.
> - 1:transmitq.
> - 2:controlq
> +Virtqueues 0:receiveq
> +\change_inserted 1986246365 1352742829
> +0
> +\change_unchanged
> +.
> + 1:transmitq
> +\change_inserted 1986246365 1352742832
> +0
> +\change_deleted 1986246365 1352742947
> +.
> +
> +\change_inserted 1986246365 1352742952
> +.
> + ....
> + 2N
> +\begin_inset Foot
> +status open
> +
> +\begin_layout Plain Layout
> +
> +\change_inserted 1986246365 1352743187
> +N=0 if VIRTIO_NET_F_RFS is not negotiated, otherwise N is indicated by max_
> +\emph on
> +virtqueue_pairs control
> +\emph default
> + field.
> +
> +\end_layout
> +
> +\end_inset
> +
> +: receivqN.
> + 2N+1: transmitqN.
> + 2N+
> +\change_unchanged
> +2:controlq
>   \begin_inset Foot
>   status open
>   
> @@ -4343,6 +4377,16 @@ VIRTIO_NET_F_CTRL_VLAN
>   
>   \begin_layout Description
>   VIRTIO_NET_F_GUEST_ANNOUNCE(21) Guest can send gratuitous packets.
> +\change_inserted 1986246365 1352742767
> +
> +\end_layout
> +
> +\begin_layout Description
> +
> +\change_inserted 1986246365 1352742808
> +VIRTIO_NET_F_RFS(2) Device supports Receive Flow Steering.
> +\change_unchanged

should be 22
> +
>   \end_layout
>   
>   \end_deeper
> @@ -4355,11 +4399,44 @@ configuration
>   \begin_inset space ~
>   \end_inset
>   
> -layout Two configuration fields are currently defined.
> +layout
> +\change_deleted 1986246365 1352743300
> +Two
> +\change_inserted 1986246365 1352743301
> +Four
> +\change_unchanged
> + configuration fields are currently defined.
>    The mac address field always exists (though is only valid if VIRTIO_NET_F_MAC
>    is set), and the status field only exists if VIRTIO_NET_F_STATUS is set.
>    Two read-only bits are currently defined for the status field: VIRTIO_NET_S_LIN
>   K_UP and VIRTIO_NET_S_ANNOUNCE.
> +
> +\change_inserted 1986246365 1353595219
> + The following read-only field,
> +\emph on
> +max_virtqueue_pairs
> +\emph default
> + only exists if VIRTIO_NET_F_RFS is set.
> + This field specifies the maximum number of each of transmit and receive
> + virtqueues (receiveq0..receiveq
> +\emph on
> +N
> +\emph default
> + and transmitq0..transmitq
> +\emph on
> +N
> +\emph default
> + respectively;
> +\emph on
> +N
> +\emph default
> +=
> +\emph on
> +max_virtqueue_pairs
> +\emph default
> +) that can be configured once VIRTIO_NET_F_RFS is negotiated.
> +
> +\change_unchanged

So are the virtqueues used in single queue mode still reserved in 
multiqueue mode, since when max_virtqueue_pairs is N, we end up with N+1 
virtqueue pairs? This also seems to conflict with the description in 
"Packet Receive Flow Steering":

"specifying the number of the last transmit and receive queue that is 
going to be used; thus out of transmitq0..transmitqn and 
receiveq0..receiveqn where n=virtqueue_pairs will be used."

According to this description, it looks like n+1 virtqueue pairs (including 
receiveq0 and transmitq0) could be used in RFS mode.
>    
>   \begin_inset listings
>   inline false
> @@ -4410,7 +4487,24 @@ Device Initialization
>   
>   \begin_layout Enumerate
>   The initialization routine should identify the receive and transmission
> - virtqueues.
> + virtqueues
> +\change_inserted 1986246365 1352744077
> +, up to N+1 of each kind
> +\change_unchanged
> +.
> +
> +\change_inserted 1986246365 1352743942
> + If VIRTIO_NET_F_RFS feature bit is negotiated,
> +\emph on
> +N=max_virtqueue_pairs
> +\emph default
> +, otherwise identify
> +\emph on
> +N=0
> +\emph default
> +.
> +\change_unchanged
> +
>   \end_layout
>   
>   \begin_layout Enumerate
> @@ -4455,7 +4549,11 @@ status
>   \end_layout
>   
>   \begin_layout Enumerate
> -The receive virtqueue should be filled with receive buffers.
> +The receive virtqueue
> +\change_inserted 1986246365 1352743953
> +s
> +\change_unchanged
> + should be filled with receive buffers.
>    This is described in detail below in
>   \begin_inset Quotes eld
>   \end_inset
> @@ -4550,8 +4648,15 @@ Device Operation
>   \end_layout
>   
>   \begin_layout Standard
> -Packets are transmitted by placing them in the transmitq, and buffers for
> - incoming packets are placed in the receiveq.
> +Packets are transmitted by placing them in the transmitq
> +\change_inserted 1986246365 1353593685
> +0..transmitqN
> +\change_unchanged
> +, and buffers for incoming packets are placed in the receiveq
> +\change_inserted 1986246365 1353593692
> +0..receiveqN
> +\change_unchanged
> +.
>    In each case, the packet itself is preceeded by a header:
>   \end_layout
>   
> @@ -4861,6 +4966,17 @@ If VIRTIO_NET_F_MRG_RXBUF is negotiated, each buffer must be at least the
>   struct virtio_net_hdr
>   \family default
>   .
> +\change_inserted 1986246365 1353594518
> +
> +\end_layout
> +
> +\begin_layout Standard
> +
> +\change_inserted 1986246365 1353594638
> +If VIRTIO_NET_F_RFS is negotiated, each of the receiveq0...receiveqN that will
> + be used should be populated with receive buffers.
> +\change_unchanged
> +
>   \end_layout
>   
>   \begin_layout Subsection*
> @@ -5293,8 +5409,125 @@ Sending VIRTIO_NET_CTRL_ANNOUNCE_ACK command through control vq.
>    
>   \end_layout
>   
> -\begin_layout Enumerate
> +\begin_layout Subsection*
> +
> +\change_inserted 1986246365 1353593879
> +Packet Receive Flow Steering
> +\end_layout
> +
> +\begin_layout Standard
> +
> +\change_inserted 1986246365 1353594403
> +If the driver negotiates the VIRTIO_NET_F_RFS (depends on VIRTIO_NET_F_CTRL_VQ),
> + it can transmit outgoing packets on one of the multiple transmitq0..transmitqN
> + and ask the device to queue incoming packets into one the multiple receiveq0..rec
> +eiveqN depending on the packet flow.
> +\change_unchanged
> +
> +\end_layout
> +
> +\begin_layout Standard
> +
> +\change_inserted 1986246365 1353594292
> +\begin_inset listings
> +inline false
> +status open
> +
> +\begin_layout Plain Layout
> +
> +\change_inserted 1986246365 1353594178
> +
> +struct virtio_net_ctrl_rfs {
> +\end_layout
> +
> +\begin_layout Plain Layout
> +
> +\change_inserted 1986246365 1353594212
> +
> +	u16 virtqueue_pairs;
> +\end_layout
> +
> +\begin_layout Plain Layout
> +
> +\change_inserted 1986246365 1353594172
> +
> +};
> +\end_layout
> +
> +\begin_layout Plain Layout
> +
> +\change_inserted 1986246365 1353594172
> +
> +\end_layout
> +
> +\begin_layout Plain Layout
> +
> +\change_inserted 1986246365 1353594263
> +
> +#define VIRTIO_NET_CTRL_RFC    1

RFS
> +\end_layout
> +
> +\begin_layout Plain Layout
> +
> +\change_inserted 1986246365 1353594273
> +
> + #define VIRTIO_NET_CTRL_RFS_VQ_PAIRS_SET        0
> +\end_layout
> +
> +\end_inset
> +
> +
> +\end_layout
> +
> +\begin_layout Standard
> +
> +\change_inserted 1986246365 1353594884
> +RFS acceleration is disabled by default.
> + Driver enables RFS by executing the VIRTIO_NET_CTRL_RFS_VQ_PAIRS_SET command,
> + specifying the number of the last transmit and receive queue that is going
> + to be used; thus out of transmitq0..transmitqn and receiveq0..receiveqn where
> +
> +\emph on
> +n=virtqueue
> +\emph default
> +_pairs will be used.
> + All these virtqueues must have been pre-configured in advance.
> +\end_layout
> +
> +\begin_layout Standard
> +
> +\change_inserted 1986246365 1353595328
> +Programming of the receive flow classificator is implicit.
> + Transmitting a packet of a specific flow on transmitqX will cause incoming
> + packets for this flow to be steered to receiveqX.
> + For uni-directional protocols, or where no packets have been transmitted
> + yet, device will steer a packet to a random queue out of the specified
> + receiveq0..receiveqn.
> +\change_unchanged
> +
> +\end_layout
> +
> +\begin_layout Standard
> +
> +\change_inserted 1986246365 1353595040
> +RFS acceleration is disabled by setting
> +\emph on
> +virtqueue_pairs = 0

Zero looks a little bit misleading; using 1 here would be clearer since we 
would still use 1 queue pair.
> +\emph default
> + (this is the default).
> + Following this, driver should not transmit new packets on virtqueues other
> + than transmitq0 and device will not steer new packets on virtqueues other
> + than receiveq0.
> +\change_unchanged
> +
> +\end_layout
> +
> +\begin_layout Standard
> +
> +\change_deleted 1986246365 1353593873
>   .
> +
> +\change_unchanged
>    
>   \end_layout
>   
> @@ -6152,13 +6385,7 @@ Virtqueues 0:receiveq(port0).
>   status open
>   
>   \begin_layout Plain Layout
> -Ports
> -\change_inserted 1986246365 1347188327
> -1
> -\change_deleted 1986246365 1347188327
> -2
> -\change_unchanged
> - onwards only if VIRTIO_CONSOLE_F_MULTIPORT is set
> +Ports 12 onwards only if VIRTIO_CONSOLE_F_MULTIPORT is set

The changes here and in the following hunks look unrelated.
>   \end_layout
>   
>   \end_inset
> @@ -6185,13 +6412,8 @@ VIRTIO_CONSOLE_F_SIZE
>   
>   \begin_layout Description
>   VIRTIO_CONSOLE_F_MULTIPORT(1) Device has support for multiple ports; configurati
> -on fields nr_ports and max_nr_ports are valid
> -\change_inserted 1986246365 1347188404
> -; if this bit is negotiated,
> -\change_deleted 1986246365 1347188406
> - and
> -\change_unchanged
> - control virtqueues will be used.
> +on fields nr_ports and max_nr_ports are valid; if this bit is negotiated,
> + and control virtqueues will be used.
>   \end_layout
>   
>   \end_deeper
> @@ -6260,8 +6482,7 @@ If the VIRTIO_CONSOLE_F_MULTIPORT feature is negotiated, the driver can
>    spawn multiple ports, not all of which may be attached to a console.
>    Some could be generic ports.
>    In this case, the control virtqueues are enabled and according to the max_nr_po
> -rts configuration-space value, an appropriate number of virtqueues are
> - created.
> +rts configuration-space value, an appropriate number of virtqueues are created.
>    A control message indicating the driver is ready is sent to the host.
>    The host can then send control messages for adding new ports to the device.
>    After creating and initializing each port, a VIRTIO_CONSOLE_PORT_READY
> @@ -6699,14 +6920,9 @@ The driver constructs an array of addresses of memory pages it has previously
>   \end_layout
>   
>   \begin_layout Enumerate
> -If the VIRTIO_BALLOON_F_MUST_TELL_HOST feature is
> -\change_inserted 1986246365 1347188540
> -negotiated
> -\change_deleted 1986246365 1347188542
> -set
> -\change_unchanged
> -, the guest may not use these requested pages until that descriptor in the
> - deflateq has been used by the device.
> +If the VIRTIO_BALLOON_F_MUST_TELL_HOST feature is negotiatedset, the guest
> + may not use these requested pages until that descriptor in the deflateq
> + has been used by the device.
>   \end_layout
>   
>   \begin_layout Enumerate

^ permalink raw reply

* Re: [PATCH] 8139cp: set ring address after enabling C+ mode
From: Jason Wang @ 2012-11-23  3:53 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: David Miller, dwmw2, netdev
In-Reply-To: <50ADAFB7.7070704@pobox.com>

On 11/22/2012 12:53 PM, Jeff Garzik wrote:
> On 11/21/2012 11:39 PM, David Miller wrote:
>> From: Jeff Garzik <jgarzik@pobox.com>
>> Date: Wed, 21 Nov 2012 22:47:39 -0500
>>
>>> State A:  pre-b01af457, known working
>>> State B:  b01af457, known broken
>>
>> State A is also known buggy on the largest consumer of this driver,
>> the emulated hardware.
>>
>> Please evaluate this realistically.
>
> If the simulator fails to match the hardware, that is a simulator bug.
Resending this mail because it failed to post to the list yesterday.

CC'ing the Realtek Linux driver maintainer (nic_swsd@realtek.com)

The problem is that the behaviour of the hardware is subtle, and we cannot 
just infer it from the datasheet. Another issue is that in some situations 
the datasheet conflicts with what the real hardware does; one example is 
the cfg9364 issue mentioned by David (I also ran into it during qemu 
development).

If the hardware always puts garbage into the TxRingAddr register when 
"plus mode" is enabled, it may unexpectedly send something from memory to 
the wire, which looks really strange. If it does not change the 
RxRingAddr when enabling C+, another method is to keep setting the rx 
address before enabling C+ but set the tx address after.
>
> It is disappointing to work around someone else's software bug in the 
> kernel.
>

Qemu also has some workarounds for buggy kernels, even in this 
case: it initializes RxRingAddr to 0 and checks it during receive; if 
the address is still zero (which may mean the rx ring address 
was set after C+ is enabled), it won't do the receive, to prevent 
corruption. So reverting is safe for rx now.
>     Jeff
>
>
>

^ permalink raw reply

* [PATCH net-next] tcp: remove dead prototype for tcp_v4_get_peer()
From: Neal Cardwell @ 2012-11-23  3:48 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Neal Cardwell

This function no longer exists.

Signed-off-by: Neal Cardwell <ncardwell@google.com>
---
 include/net/tcp.h |    1 -
 1 files changed, 0 insertions(+), 1 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6feeccd..3202bde 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -369,7 +369,6 @@ extern void tcp_shutdown (struct sock *sk, int how);
 extern void tcp_v4_early_demux(struct sk_buff *skb);
 extern int tcp_v4_rcv(struct sk_buff *skb);
 
-extern struct inet_peer *tcp_v4_get_peer(struct sock *sk);
 extern int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
 extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		       size_t size);
-- 
1.7.7.3

^ permalink raw reply related

* Fwd: Re: [PATCH] net: ipv6: change %8s to %s for rt->dst.dev->name in seq_printf of rt6_info_route
From: Chen Gang @ 2012-11-23  3:35 UTC (permalink / raw)
  To: Shan Wei, Eric Dumazet, David Miller; +Cc: netdev
In-Reply-To: <50ADE447.8030300@asianux.com>


1) about the proof:
 currently, sorry, I cannot find a device whose name length is more than 8.
 maybe they (Asianux users) use a system call in user mode to assign a new name to the device.
   please see: dev_ioctl -> dev_ifsioc -> dev_change_name  in net/core/dev.c.
   I do not know why they want to change the net device name (but they surely can do so).

2) about %*s:
 since the kernel is an open system, IFNAMSIZ belongs to the OS API level exposed to the outside
   it affects both individual kernel modules and user mode system calls
   we need to obey this rule, and %8s does not match it.
   so %8s is not suitable. (and now we have to choose %16s or %s).

 for the format of the information which seq_printf outputs:
   it does not belong to the OS API level exposed to the outside (at least, for the current case, this is true). 
   so we need not keep it 'compatible', so %16s is not necessary.

 for keeping the source code simple and clear:
   %s is better than %16s.

 so as a result, we should choose %s only (neither %16s nor %8s).

3) about my original mail:
 why did my original mail (first mail relative with this patch) say %16s ?
 my goals are:
   i)   to confirm whether suitable to communicate about RHEL* in *@vger.kernel.org.
   ii)  to confirm whether *@vger.kernel.org welcome such a minor patch (at least, it is not a spam).
   iii) to confirm whether *@vger.kernel.org are focused on coding. 
        (so I intended to use %16s and 'beautiful')
        (I have seen too many another various organizations to not be focused on coding)
 after get feed back from Eric Dumazet.
   i)   it is not suitable to communicate about RHEL* in *@vger.kernel.org.
   ii)  *@vger.kernel.org welcome such a minor patch.
   iii) *@vger.kernel.org are focused on coding.
        (so I am sure that can use "coding review" to provide contributes to *@vger.kernel.org)


 Regards

gchen.

-------- 原始消息 --------
主题: Re: [PATCH] net: ipv6: change %8s to %s for rt->dst.dev->name in seq_printf of rt6_info_route
日期: Thu, 22 Nov 2012 16:37:27 +0800
发件人: Chen Gang <gang.chen@asianux.com>
收件人: Shan Wei <shanwei88@gmail.com>
抄送: Eric Dumazet <eric.dumazet@gmail.com>,  David Miller <davem@davemloft.net>, netdev <netdev@vger.kernel.org>

于 2012年11月22日 13:28, Shan Wei 写道:
> Hi chen gang:
> 
> For device names shorter than 8 chars,
> your patch changes them from being printed right-aligned
> to left-aligned. But at least since 2005 (git age-time),
> we have kept this style.
> Maybe, since the birth of this code, it has just been right-aligned. :-)
> 

  originally, it is a solid output length, the length is "#define
RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)"
  and RHEL5 (kernel-2.6.18-308.20.el5) still use it.
  it assume that the length of rt->rt6i_dev->name (in RHEL5) is 8.

> Why we *should* change this style?
> just keep be consistent with the case which length of device
> name greater than 8 char?
> 

  as a solid length, 8 is not suitable, firstly I suggest to '%16s' (I
call it 'beautiful',  but for RHEL5, it is a correctness issue)
  and Eric Dumazet suggest use '%s' is better, since it is not solid
length any more (have already let seq_printf instead of arg->buffer)
  and I think: as a result, what he said is reasonable

> Not only old name rule i.e. eth0,eth1, but also new name rule
> base on pci address ,i.e. em1,p3p1. most of them are less than 8 char.
> Should not we take more attention on the case less than 8 char?
> 

  I have ever seen such a device name is more than 8 characters.
  I am not quite sure: maybe they are eth-route* or eth-usb* ...
  I will check it in these days, please wait for some days.


> By addition, if we want to add new field in the future,
> align right is a better choice.
> 

  maybe what you said is better (still keep it 'beautiful', but need use
'%16s' instead of '%8s')

  for this, Eric Dumazet maybe have his opinions.


 Regards

gchen.

> 
> Chen Gang said, at 2012/11/22 10:52:
>> Hi Shan Wei, Eric Dumazet
>>
>>   is this patch integrated into main branch ?
>>   if need me for additional completion (such as: merge another 2 trivial patches into this patch, too)
>>   please tell me, I will do. 
>>
>>   I understand you are working overtime, maybe no time for any minor and trivial patches.
>>   if surely it is, I think:
>>     you can modify these code manually, and obsolete these minor and trivial patches which I provided.
>>     I do not mind whether mention me in another new patches (you can mention me or not mention me, both are OK).
>>     since our goal is to provide contributes to outside, efficiently.
>>
>>  regards
>>
>> gchen
>>
>>
>> 于 2012年11月05日 11:02, Chen Gang 写道:
>>>
>>> 1. not to send same patch triple times. 
>>
>>   thanks, I shall notice, next time.
>>   (I shall 'believe' another members).
>>
>>> 2. config your email client,because tab is changed to space.
>>>    you can read Documentation/email-clients.txt.
>>
>>   1) thanks. I shall notice, next time.
>>   2) now, I use gvim as the extension editor for thunderbird
>>   3) the patch is generated by `git format-patch -s --summary --stat`
>>      it use "' '\t" as head, I do not touch it, maybe it is correct.
>>
>> welcome any members to giving additional suggestions and completions.
>>
>> thanks
>>
>> the modified contents are below,
>> -----------------------------------------------------------------------------------
>>
>>   the length of rt->dst.dev->name is 16 (IFNAMSIZ)
>>   in seq_printf, it is not suitable to use %8s for rt->dst.dev->name.
>>   so change it to %s, since each line has not been solid any more.
>>
>>   additional information:
>>
>>     %8s  limit the width, not for the original string output length
>>          if name length is more than 8, it still can be fully displayed.
>>          if name length is less than 8, the ' ' will be filled before name.
>>
>>     %.8s truly limit the original string output length (precision)
>>
>> Signed-off-by: Chen Gang <gang.chen@asianux.com>
>> ---
>>  net/ipv6/route.c |    2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
>> index c42650c..b60bc52 100644
>> --- a/net/ipv6/route.c
>> +++ b/net/ipv6/route.c
>> @@ -2835,7 +2835,7 @@ static int rt6_info_route(struct rt6_info *rt, void *p_arg)
>>  	} else {
>>  		seq_puts(m, "00000000000000000000000000000000");
>>  	}
>> -	seq_printf(m, " %08x %08x %08x %08x %8s\n",
>> +	seq_printf(m, " %08x %08x %08x %08x %s\n",
>>  		   rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
>>  		   rt->dst.__use, rt->rt6i_flags,
>>  		   rt->dst.dev ? rt->dst.dev->name : "");
>>
>>
>>
> 
> 
> 


-- 
Chen Gang

Asianux Corporation

^ permalink raw reply

* FW:
From: balu.balini @ 2012-11-23  2:41 UTC (permalink / raw)
  To: mdoug68

http://bitly.com/Tewapi I just found out your mother has a rap sheet!

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox