Netdev List
 help / color / mirror / Atom feed
* [PATCH 08/12] sfc: Support only two rx buffers per page
From: Ben Hutchings @ 2010-06-01 21:33 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers
In-Reply-To: <1275426967.2114.25.camel@achroite.uk.solarflarecom.com>

From: Steve Hodgson <shodgson@solarflare.com>

- Pull the loop handling into efx_init_rx_buffers_(skb|page)
- Remove rx_queue->buf_page, and associated clean up code
- Remove unmap_addr, since unmap_addr is trivially calculable

This will allow us to recycle discarded buffers directly
from efx_rx_packet(), since will never be in the middle of
splitting a page.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/sfc/net_driver.h |   10 --
 drivers/net/sfc/rx.c         |  228 ++++++++++++++++++------------------------
 2 files changed, 96 insertions(+), 142 deletions(-)

diff --git a/drivers/net/sfc/net_driver.h b/drivers/net/sfc/net_driver.h
index 4539803..59c8ecc 100644
--- a/drivers/net/sfc/net_driver.h
+++ b/drivers/net/sfc/net_driver.h
@@ -222,7 +222,6 @@ struct efx_tx_queue {
  *	If both this and skb are %NULL, the buffer slot is currently free.
  * @data: Pointer to ethernet header
  * @len: Buffer length, in bytes.
- * @unmap_addr: DMA address to unmap
  */
 struct efx_rx_buffer {
 	dma_addr_t dma_addr;
@@ -230,7 +229,6 @@ struct efx_rx_buffer {
 	struct page *page;
 	char *data;
 	unsigned int len;
-	dma_addr_t unmap_addr;
 };
 
 /**
@@ -257,11 +255,6 @@ struct efx_rx_buffer {
  * @alloc_page_count: RX allocation strategy counter.
  * @alloc_skb_count: RX allocation strategy counter.
  * @slow_fill: Timer used to defer efx_nic_generate_fill_event().
- * @buf_page: Page for next RX buffer.
- *	We can use a single page for multiple RX buffers. This tracks
- *	the remaining space in the allocation.
- * @buf_dma_addr: Page's DMA address.
- * @buf_data: Page's host address.
  * @flushed: Use when handling queue flushing
  */
 struct efx_rx_queue {
@@ -284,9 +277,6 @@ struct efx_rx_queue {
 	struct timer_list slow_fill;
 	unsigned int slow_fill_count;
 
-	struct page *buf_page;
-	dma_addr_t buf_dma_addr;
-	char *buf_data;
 	enum efx_flush_state flushed;
 };
 
diff --git a/drivers/net/sfc/rx.c b/drivers/net/sfc/rx.c
index bf1e55e..615a1fc 100644
--- a/drivers/net/sfc/rx.c
+++ b/drivers/net/sfc/rx.c
@@ -98,155 +98,132 @@ static inline unsigned int efx_rx_buf_size(struct efx_nic *efx)
 	return PAGE_SIZE << efx->rx_buffer_order;
 }
 
-
 /**
- * efx_init_rx_buffer_skb - create new RX buffer using skb-based allocation
+ * efx_init_rx_buffers_skb - create EFX_RX_BATCH skb-based RX buffers
  *
  * @rx_queue:		Efx RX queue
- * @rx_buf:		RX buffer structure to populate
  *
- * This allocates memory for a new receive buffer, maps it for DMA,
- * and populates a struct efx_rx_buffer with the relevant
- * information.  Return a negative error code or 0 on success.
+ * This allocates EFX_RX_BATCH skbs, maps them for DMA, and populates a
+ * struct efx_rx_buffer for each one. Return a negative error code or 0
+ * on success. May fail having only inserted fewer than EFX_RX_BATCH
+ * buffers.
  */
-static int efx_init_rx_buffer_skb(struct efx_rx_queue *rx_queue,
-				  struct efx_rx_buffer *rx_buf)
+static int efx_init_rx_buffers_skb(struct efx_rx_queue *rx_queue)
 {
 	struct efx_nic *efx = rx_queue->efx;
 	struct net_device *net_dev = efx->net_dev;
+	struct efx_rx_buffer *rx_buf;
 	int skb_len = efx->rx_buffer_len;
+	unsigned index, count;
 
-	rx_buf->skb = netdev_alloc_skb(net_dev, skb_len);
-	if (unlikely(!rx_buf->skb))
-		return -ENOMEM;
+	for (count = 0; count < EFX_RX_BATCH; ++count) {
+		index = rx_queue->added_count & EFX_RXQ_MASK;
+		rx_buf = efx_rx_buffer(rx_queue, index);
 
-	/* Adjust the SKB for padding and checksum */
-	skb_reserve(rx_buf->skb, NET_IP_ALIGN);
-	rx_buf->len = skb_len - NET_IP_ALIGN;
-	rx_buf->data = (char *)rx_buf->skb->data;
-	rx_buf->skb->ip_summed = CHECKSUM_UNNECESSARY;
+		rx_buf->skb = netdev_alloc_skb(net_dev, skb_len);
+		if (unlikely(!rx_buf->skb))
+			return -ENOMEM;
+		rx_buf->page = NULL;
 
-	rx_buf->dma_addr = pci_map_single(efx->pci_dev,
-					  rx_buf->data, rx_buf->len,
-					  PCI_DMA_FROMDEVICE);
+		/* Adjust the SKB for padding and checksum */
+		skb_reserve(rx_buf->skb, NET_IP_ALIGN);
+		rx_buf->len = skb_len - NET_IP_ALIGN;
+		rx_buf->data = (char *)rx_buf->skb->data;
+		rx_buf->skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+		rx_buf->dma_addr = pci_map_single(efx->pci_dev,
+						  rx_buf->data, rx_buf->len,
+						  PCI_DMA_FROMDEVICE);
+		if (unlikely(pci_dma_mapping_error(efx->pci_dev,
+						   rx_buf->dma_addr))) {
+			dev_kfree_skb_any(rx_buf->skb);
+			rx_buf->skb = NULL;
+			return -EIO;
+		}
 
-	if (unlikely(pci_dma_mapping_error(efx->pci_dev, rx_buf->dma_addr))) {
-		dev_kfree_skb_any(rx_buf->skb);
-		rx_buf->skb = NULL;
-		return -EIO;
+		++rx_queue->added_count;
+		++rx_queue->alloc_skb_count;
 	}
 
 	return 0;
 }
 
 /**
- * efx_init_rx_buffer_page - create new RX buffer using page-based allocation
+ * efx_init_rx_buffers_page - create EFX_RX_BATCH page-based RX buffers
  *
  * @rx_queue:		Efx RX queue
- * @rx_buf:		RX buffer structure to populate
  *
- * This allocates memory for a new receive buffer, maps it for DMA,
- * and populates a struct efx_rx_buffer with the relevant
- * information.  Return a negative error code or 0 on success.
+ * This allocates memory for EFX_RX_BATCH receive buffers, maps them for DMA,
+ * and populates struct efx_rx_buffers for each one. Return a negative error
+ * code or 0 on success. If a single page can be split between two buffers,
+ * then the page will either be inserted fully, or not at at all.
  */
-static int efx_init_rx_buffer_page(struct efx_rx_queue *rx_queue,
-				   struct efx_rx_buffer *rx_buf)
+static int efx_init_rx_buffers_page(struct efx_rx_queue *rx_queue)
 {
 	struct efx_nic *efx = rx_queue->efx;
-	int bytes, space, offset;
-
-	bytes = efx->rx_buffer_len - EFX_PAGE_IP_ALIGN;
-
-	/* If there is space left in the previously allocated page,
-	 * then use it. Otherwise allocate a new one */
-	rx_buf->page = rx_queue->buf_page;
-	if (rx_buf->page == NULL) {
-		dma_addr_t dma_addr;
-
-		rx_buf->page = alloc_pages(__GFP_COLD | __GFP_COMP | GFP_ATOMIC,
-					   efx->rx_buffer_order);
-		if (unlikely(rx_buf->page == NULL))
+	struct efx_rx_buffer *rx_buf;
+	struct page *page;
+	char *page_addr;
+	dma_addr_t dma_addr;
+	unsigned index, count;
+
+	/* We can split a page between two buffers */
+	BUILD_BUG_ON(EFX_RX_BATCH & 1);
+
+	for (count = 0; count < EFX_RX_BATCH; ++count) {
+		page = alloc_pages(__GFP_COLD | __GFP_COMP | GFP_ATOMIC,
+				   efx->rx_buffer_order);
+		if (unlikely(page == NULL))
 			return -ENOMEM;
-
-		dma_addr = pci_map_page(efx->pci_dev, rx_buf->page,
-					0, efx_rx_buf_size(efx),
+		dma_addr = pci_map_page(efx->pci_dev, page, 0,
+					efx_rx_buf_size(efx),
 					PCI_DMA_FROMDEVICE);
-
 		if (unlikely(pci_dma_mapping_error(efx->pci_dev, dma_addr))) {
-			__free_pages(rx_buf->page, efx->rx_buffer_order);
-			rx_buf->page = NULL;
+			__free_pages(page, efx->rx_buffer_order);
 			return -EIO;
 		}
-
-		rx_queue->buf_page = rx_buf->page;
-		rx_queue->buf_dma_addr = dma_addr;
-		rx_queue->buf_data = (page_address(rx_buf->page) +
-				      EFX_PAGE_IP_ALIGN);
-	}
-
-	rx_buf->len = bytes;
-	rx_buf->data = rx_queue->buf_data;
-	offset = efx_rx_buf_offset(rx_buf);
-	rx_buf->dma_addr = rx_queue->buf_dma_addr + offset;
-
-	/* Try to pack multiple buffers per page */
-	if (efx->rx_buffer_order == 0) {
-		/* The next buffer starts on the next 512 byte boundary */
-		rx_queue->buf_data += ((bytes + 0x1ff) & ~0x1ff);
-		offset += ((bytes + 0x1ff) & ~0x1ff);
-
-		space = efx_rx_buf_size(efx) - offset;
-		if (space >= bytes) {
-			/* Refs dropped on kernel releasing each skb */
-			get_page(rx_queue->buf_page);
-			goto out;
+		EFX_BUG_ON_PARANOID(dma_addr & (PAGE_SIZE - 1));
+		page_addr = page_address(page) + EFX_PAGE_IP_ALIGN;
+		dma_addr += EFX_PAGE_IP_ALIGN;
+
+	split:
+		index = rx_queue->added_count & EFX_RXQ_MASK;
+		rx_buf = efx_rx_buffer(rx_queue, index);
+		rx_buf->dma_addr = dma_addr;
+		rx_buf->skb = NULL;
+		rx_buf->page = page;
+		rx_buf->data = page_addr;
+		rx_buf->len = efx->rx_buffer_len - EFX_PAGE_IP_ALIGN;
+		++rx_queue->added_count;
+		++rx_queue->alloc_page_count;
+
+		if ((~count & 1) && (efx->rx_buffer_len < (PAGE_SIZE >> 1))) {
+			/* Use the second half of the page */
+			get_page(page);
+			dma_addr += (PAGE_SIZE >> 1);
+			page_addr += (PAGE_SIZE >> 1);
+			++count;
+			goto split;
 		}
 	}
 
-	/* This is the final RX buffer for this page, so mark it for
-	 * unmapping */
-	rx_queue->buf_page = NULL;
-	rx_buf->unmap_addr = rx_queue->buf_dma_addr;
-
- out:
 	return 0;
 }
 
-/* This allocates memory for a new receive buffer, maps it for DMA,
- * and populates a struct efx_rx_buffer with the relevant
- * information.
- */
-static int efx_init_rx_buffer(struct efx_rx_queue *rx_queue,
-			      struct efx_rx_buffer *new_rx_buf)
-{
-	int rc = 0;
-
-	if (rx_queue->channel->rx_alloc_push_pages) {
-		new_rx_buf->skb = NULL;
-		rc = efx_init_rx_buffer_page(rx_queue, new_rx_buf);
-		rx_queue->alloc_page_count++;
-	} else {
-		new_rx_buf->page = NULL;
-		rc = efx_init_rx_buffer_skb(rx_queue, new_rx_buf);
-		rx_queue->alloc_skb_count++;
-	}
-
-	if (unlikely(rc < 0))
-		EFX_LOG_RL(rx_queue->efx, "%s RXQ[%d] =%d\n", __func__,
-			   rx_queue->queue, rc);
-	return rc;
-}
-
 static void efx_unmap_rx_buffer(struct efx_nic *efx,
 				struct efx_rx_buffer *rx_buf)
 {
 	if (rx_buf->page) {
 		EFX_BUG_ON_PARANOID(rx_buf->skb);
-		if (rx_buf->unmap_addr) {
-			pci_unmap_page(efx->pci_dev, rx_buf->unmap_addr,
+
+		/* Unmap the buffer if there's only one buffer per page(s),
+		 * or this is the second half of a two buffer page. */
+		if (efx->rx_buffer_order != 0 ||
+		    (efx_rx_buf_offset(rx_buf) & (PAGE_SIZE >> 1)) != 0) {
+			pci_unmap_page(efx->pci_dev,
+				       rx_buf->dma_addr & ~(PAGE_SIZE - 1),
 				       efx_rx_buf_size(efx),
 				       PCI_DMA_FROMDEVICE);
-			rx_buf->unmap_addr = 0;
 		}
 	} else if (likely(rx_buf->skb)) {
 		pci_unmap_single(efx->pci_dev, rx_buf->dma_addr,
@@ -286,9 +263,9 @@ static void efx_fini_rx_buffer(struct efx_rx_queue *rx_queue,
  */
 void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue)
 {
-	struct efx_rx_buffer *rx_buf;
-	unsigned fill_level, index;
-	int i, space, rc = 0;
+	struct efx_channel *channel = rx_queue->channel;
+	unsigned fill_level;
+	int space, rc = 0;
 
 	/* Calculate current fill level, and exit if we don't need to fill */
 	fill_level = (rx_queue->added_count - rx_queue->removed_count);
@@ -309,21 +286,18 @@ void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue)
 	EFX_TRACE(rx_queue->efx, "RX queue %d fast-filling descriptor ring from"
 		  " level %d to level %d using %s allocation\n",
 		  rx_queue->queue, fill_level, rx_queue->fast_fill_limit,
-		  rx_queue->channel->rx_alloc_push_pages ? "page" : "skb");
+		  channel->rx_alloc_push_pages ? "page" : "skb");
 
 	do {
-		for (i = 0; i < EFX_RX_BATCH; ++i) {
-			index = rx_queue->added_count & EFX_RXQ_MASK;
-			rx_buf = efx_rx_buffer(rx_queue, index);
-			rc = efx_init_rx_buffer(rx_queue, rx_buf);
-			if (unlikely(rc)) {
-				/* Ensure that we don't leave the rx queue
-				 * empty */
-				if (rx_queue->added_count == rx_queue->removed_count)
-					efx_schedule_slow_fill(rx_queue);
-				goto out;
-			}
-			++rx_queue->added_count;
+		if (channel->rx_alloc_push_pages)
+			rc = efx_init_rx_buffers_page(rx_queue);
+		else
+			rc = efx_init_rx_buffers_skb(rx_queue);
+		if (unlikely(rc)) {
+			/* Ensure that we don't leave the rx queue empty */
+			if (rx_queue->added_count == rx_queue->removed_count)
+				efx_schedule_slow_fill(rx_queue);
+			goto out;
 		}
 	} while ((space -= EFX_RX_BATCH) >= EFX_RX_BATCH);
 
@@ -638,16 +612,6 @@ void efx_fini_rx_queue(struct efx_rx_queue *rx_queue)
 			efx_fini_rx_buffer(rx_queue, rx_buf);
 		}
 	}
-
-	/* For a page that is part-way through splitting into RX buffers */
-	if (rx_queue->buf_page != NULL) {
-		pci_unmap_page(rx_queue->efx->pci_dev, rx_queue->buf_dma_addr,
-			       efx_rx_buf_size(rx_queue->efx),
-			       PCI_DMA_FROMDEVICE);
-		__free_pages(rx_queue->buf_page,
-			     rx_queue->efx->rx_buffer_order);
-		rx_queue->buf_page = NULL;
-	}
 }
 
 void efx_remove_rx_queue(struct efx_rx_queue *rx_queue)
-- 
1.6.2.5


-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply related

* Re: [PATCH] use unfair spinlock when running on hypervisor.
From: Eric Dumazet @ 2010-06-01 21:39 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Andi Kleen, Gleb Natapov, linux-kernel, kvm, hpa, mingo, npiggin,
	tglx, mtosatti, netdev
In-Reply-To: <4C053ACC.5020708@redhat.com>

Le mardi 01 juin 2010 à 19:52 +0300, Avi Kivity a écrit :

> What I'd like to see eventually is a short-term-unfair, long-term-fair 
> spinlock.  Might make sense for bare metal as well.  But it won't be 
> easy to write.
> 

This thread rings a bell here :)

Yes, ticket spinlocks are sometime slower, especially in workloads where
a spinlock needs to be taken several times to handle one unit of work,
and many cpus competing.

We currently have kind of a similar problem in network stack, and we
have a patch to speedup xmit path by an order of magnitude, letting one
cpu (the consumer cpu) to get unfair access to the (ticket) spinlock.
(It can compete with no more than one other cpu)

Boost from ~50.000 to ~600.000 pps on a dual quad core machine (E5450
@3.00GHz) on a particular workload (many cpus want to xmit their
packets)

( patch : http://patchwork.ozlabs.org/patch/53163/ )


It could be possible to write such a generic beast, with a cascade or
regular ticket spinlocks ?

One ticket spinlock at first stage (only if some conditions are met, aka
slow path), then an 'primary' spinlock at second stage.


// generic implementation
// (x86 could use 16bit fields for users_in & user_out)
struct cascade_lock {
	atomic_t 	users_in;
	int		users_out;
	spinlock_t	primlock;
	spinlock_t	slowpathlock; // could be outside of this structure, shared by many 'cascade_locks'
};

/*
 * In kvm case, you might call hypervisor when slowpathlock is about to be taken ?
 * When a cascade lock is unlocked, and relocked right after, this cpu has unfair
 * priority and could get the lock before cpus blocked in slowpathlock (especially if
 * an hypervisor call was done)
 *
 * In network xmit path, the dequeue thread would use highprio_user=true mode
 * In network xmit path, the 'contended' enqueueing thread would set a negative threshold,
 *  to force a 'lowprio_user' mode.
 */
void cascade_lock(struct cascade_lock *l, bool highprio_user, int threshold)
{
	bool slowpath = false;

	atomic_inc(&l->users_in); // no real need for atomic_inc_return()
	if (atomic_read(&l->users_in) - l->users_out > threshold && !highprio_user)) {
		spin_lock(&l->slowpathlock);
		slowpath = true;
	}
	spin_lock(&l->primlock);
	if (slowpath)
		spin_unlock(&l->slowpathlock);
}

void cascade_unlock(struct cascade_lock *l)
{
	l->users_out++;
	spin_unlock(&l->primlock);
}

^ permalink raw reply

* 2.6.32 and Multicast group membership
From: Mr. Berkley Shands @ 2010-06-01 21:39 UTC (permalink / raw)
  To: Net Dev

starting in 2.6.32, my multicast connections stop getting data after 
60-100 seconds.
The identical user code works fine under 2.6.22 through 2.6.31.

The NIC (an intel 82586) has two ports on the same subnet
(eth0 at 172.16.21.55/24 and eth1 at 172.16.21.56/24)

      if (setsockopt(sock, IPPROTO_IP, IP_ADD_MEMBERSHIP, (char*)&req, 
sizeof(req)))
      {
         perror("setsockopt IP_ADD_MEMBERSHIP failed");
         ::exit(-1);
      }

If I do ADD_MEMBERSHIP on just one of these interfaces, the sockets 
still get data.
But if I join on both interfaces, one or both will stop getting packets
after 60-100 seconds. Sniffing with tcpdump shows the Cisco layer 3 switch
is not getting its responses back to keep the multicast group open.
The HP layer 2 switch does not seem to care, it keeps the data flowing 
regardless
of which physical port the join is executed on.

Changing IP_MULTICAST_ALL has no effect :-(
Did I miss something? New code that I have to specify to keep the Cisco 
happy?

tia

Berkley


-- 

// E. F. Berkley Shands, MSc//

** Exegy Inc.**

349 Marshall Road, Suite 100

St. Louis , MO  63119

Direct:  (314) 218-3600 X450

Cell:  (314) 303-2546

Office:  (314) 218-3600

Fax:  (314) 218-3601

 

The Usual Disclaimer follows...

 



^ permalink raw reply

* Re: [PATCHv2] Refactor update of IPv6 flow destination address for srcrt (RH) option
From: Arnaud Ebalard @ 2010-06-01 21:58 UTC (permalink / raw)
  To: Joe Perches
  Cc: David S. Miller,
	YOSHIFUJI Hideaki / 吉藤英明, netdev
In-Reply-To: <1275422925.19372.114.camel@Joe-Laptop.home>

Hi,

Joe Perches <joe@perches.com> writes:

>> +static inline struct in6_addr *srcrt_dst_flow_update(struct in6_addr *final,
>> +						     struct in6_addr *fl6dst,
>> +						     const struct ipv6_txoptions *opt)
>> +{
>> +	if (opt && opt->srcrt) {
>> +		const struct rt0_hdr *rt0 = (struct rt0_hdr *)opt->srcrt;
>> +		ipv6_addr_copy(final, fl6dst);
>> +		ipv6_addr_copy(fl6dst, rt0->addr);
>> +		return final;
>> +	}
>> +	return NULL;
>> +}
>> +
>
> Should this be inline?  Maybe it'd be better moved to exthdrs.c

I thought it was small enough to keep it inline in ipv6.h but removing
the inline and moving it to exthdrs.c makes sense. 

> Perhaps this is clearer as something like:
>
> /**
>  * fl6_update_dst - update flow destination address
>  *
>  * @fl: flowlabel fl_dst to update
>  * @opt: struct ipv6_txoptions
>  * @orig: original fl_dst if modified
>  *
>  * Returns NULL if no txoptions or no srcrt, otherwise
>  * returns orig and initial value of fl->fl6_dst set in orig
>  */
> struct in6_addr *fl6_update_dst(struct ip6_flowlabel *fl,
> 				const struct ipv6_txoptions *opt,
> 				struct in6_addr *orig)
> {
> 	if (!opt || !opt->srcrt)
> 		return NULL;
>
> 	ipv6_addr_copy(orig, &fl->fl6_dst);
> 	ipv6_addr_copy(&fl->fl6_dst, ((struct rt0_hdr *)opt->srcrt)->addr);
> 	return orig;
> }

I'll send an updated new version tomorrow, i.e. in a few hours (with
s/ip6_flowlabel/flowi/).

Thanks for your feedback.

Cheers,

a+

^ permalink raw reply

* Re: [PATCH] IP: Increment INADDRERRORS if routing for a packet is not successful
From: Eric Dumazet @ 2010-06-01 22:07 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: netdev, Stephen Hemminger, David Miller
In-Reply-To: <alpine.DEB.2.00.1006011609280.9438@router.home>

Le mardi 01 juin 2010 à 16:13 -0500, Christoph Lameter a écrit :
> Something like this would have been very helpful during recent debugging
> of multicast issues. Silent discards are bad.
> 

> 
> If the kernel perceives that something is wrong with an incoming packet then the
> IP stack currently silently discards packets. This makes it difficult to diagnose
> problems with the network configurations (such as a misbehaving kernel
> subsystem discarding multicast packets because the reverse path filter
> does not like multicast subscriptions on the second NIC with rp_filter=1).
> 
> It is also necessary to know how many inbound packets are discarded to
> assess networking issues in general with a NIC.
> 
> Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
> Acked-by: Stephen Hemminger <shemminger@vyatta.com>
> 

I disagree with this patch.

IPSTATS_MIB_INADDRERRORS has a strong meaning, part of RFCS.

In this path, we simulate the routing of a virtual packet, not its
delivery.

This should not affect IPSTATS SNMP entries.

You should use another MIB entry, say LINUX_MIB_INROUTEERRORS ?

Dont inet_rtm_getroute() caller gets an error status anyway ?

> ---
>  net/ipv4/route.c |    3 +++
>  1 file changed, 3 insertions(+)
> 
> Index: linux-2.6/net/ipv4/route.c
> ===================================================================
> --- linux-2.6.orig/net/ipv4/route.c	2010-06-01 11:46:10.000000000 -0500
> +++ linux-2.6/net/ipv4/route.c	2010-06-01 11:52:55.000000000 -0500
> @@ -2981,6 +2981,9 @@ static int inet_rtm_getroute(struct sk_b
>  		rt = skb_rtable(skb);
>  		if (err == 0 && rt->u.dst.error)
>  			err = -rt->u.dst.error;
> +		if (err)
> +			IP_INC_STATS_BH(dev_net(skb->dev),
> +					IPSTATS_MIB_INADDRERRORS);
>  	} else {
>  		struct flowi fl = {
>  			.nl_u = {
> 
> 



^ permalink raw reply

* Re: [Patch] fix packet loss and massive ping spikes with PPP multi-link
From: David Miller @ 2010-06-01 22:15 UTC (permalink / raw)
  To: richih.mailinglist
  Cc: ben, paulus, netdev, linux-ppp, alan, patrakov, linux-kernel
In-Reply-To: <AANLkTinpsIs7qMSpQ78fWCTHh03iLBIwRlsU7sqJau08@mail.gmail.com>

From: Richard Hartmann <richih.mailinglist@gmail.com>
Date: Tue, 1 Jun 2010 13:28:59 +0200

> Maybe not a bug in the Linux kernel itself, but certainly in the real world
> that exists around Linux. Similar to how a change to a device driver that
> is needed to work around broken hardware is a bug fix, imo.

It's not the same situation at all.

It is easier to fix misconfigured products that exist because of
software and configurations than it is to fix a physical piece of
hardware.

So you could work around it if you wanted to.

I definitely don't see this as -stable material, as a result.  We will
push it to net-next-2.6 and it will thus hit 2.6.36 as previously
mentioned.

^ permalink raw reply

* Re: [PATCH] IP: Increment INADDRERRORS if routing for a packet is not successful
From: David Miller @ 2010-06-01 22:23 UTC (permalink / raw)
  To: eric.dumazet; +Cc: cl, netdev, shemminger
In-Reply-To: <1275430054.2638.115.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 02 Jun 2010 00:07:34 +0200

> IPSTATS_MIB_INADDRERRORS has a strong meaning, part of RFCS.
> 
> In this path, we simulate the routing of a virtual packet, not its
> delivery.
> 
> This should not affect IPSTATS SNMP entries.
> 
> You should use another MIB entry, say LINUX_MIB_INROUTEERRORS ?

Agreed.

^ permalink raw reply

* Re: [2.6.35-rc1 regression] tg3 vpd r/w failed
From: David Miller @ 2010-06-01 22:32 UTC (permalink / raw)
  To: mikpe; +Cc: netdev, mcarlson, linux-kernel, sparclinux
In-Reply-To: <19461.26529.591960.912204@pilspetsen.it.uu.se>


Mikael I'm getting tired of continuing to add the proper
mailing list CC: for your sparc bug reports.  Please make
sure to add sparclinux@vger.kernel.org to your reports
in the future.

Thanks.

> Booting 2.6.35-rc1 on a Sun Blade 2500 Red with builtin tg3 I get:
> 
> tg3.c:v3.110 (April 9, 2010)
> PCI: Enabling device: (0000:00:03.0), cmd 2
> ...
> tg3 0000:00:03.0: vpd r/w failed.  This is likely a firmware bug on this device.  Contact the card vendor for a firmware update.
> tg3 0000:00:03.0: vpd r/w failed.  This is likely a firmware bug on this device.  Contact the card vendor for a firmware update.
> tg3 0000:00:03.0: vpd r/w failed.  This is likely a firmware bug on this device.  Contact the card vendor for a firmware update.
> tg3 0000:00:03.0: eth0: Tigon3 [partno(none) rev 1002] (PCI:66MHz:64-bit) MAC address ...
> tg3 0000:00:03.0: eth0: attached PHY is 5703 (10/100/1000Base-T Ethernet) (WireSpeed[1])
> tg3 0000:00:03.0: eth0: RXcsums[1] LinkChgREG[0] MIirq[0] ASF[0] TSOcap[1]
> tg3 0000:00:03.0: eth0: dma_rwctrl[763f0000] dma_mask[32-bit]
> ...
> tg3 0000:00:03.0: eth0: No firmware running
> tg3 0000:00:03.0: eth0: Link is up at 100 Mbps, full duplex
> tg3 0000:00:03.0: eth0: Flow control is on for TX and on for RX
> 
> The three 'vpd r/w failed' messages are new and did not occur with 2.6.34
> or earlier kernels.
> 
> /Mikael

^ permalink raw reply

* Re: [PATCH] phylib: Add support for the LXT973 phy.
From: Andy Fleming @ 2010-06-01 22:39 UTC (permalink / raw)
  To: Richard Cochran; +Cc: netdev
In-Reply-To: <20100531130932.GA15845@riccoc20.at.omicron.at>

On Mon, May 31, 2010 at 8:09 AM, Richard Cochran
<richardcochran@gmail.com> wrote:
> This patch implements a work around for Erratum 5, "3.3 V Fiber Speed
> Selection." If the hardware wiring does not respect this erratum, then
> fiber optic mode will not work properly.
>
> Signed-off-by: Richard Cochran <richard.cochran@omicron.at>


>
> +static int lxt973_probe(struct phy_device *phydev)
> +{
> +       int val = phy_read(phydev, MII_LXT973_PCR);
> +
> +       if (val & PCR_FIBER_SELECT) {
> +               /*
> +                * If fiber is selected, then the only correct setting
> +                * is 100Mbps, full duplex, and auto negotiation off.
> +                */
> +               val = phy_read(phydev, MII_BMCR);
> +               val |= (BMCR_SPEED100 | BMCR_FULLDPLX);
> +               val &= ~BMCR_ANENABLE;
> +               phy_write(phydev, MII_BMCR, val);
> +               /* Remember that the port is in fiber mode. */
> +               phydev->priv = lxt973_probe;


That's a bit hacky.  There is a dev_flags field, which could be used
for this.  Probably, we should add a more general way of saying what
sort of port this is.  But don't use the presence and absence of
"priv", as it could one day get used for a different purpose, and this
seems like it would leave us open to strange bugs.

Also, is this erratum true for all lxt973 models, or is it fixed in
some revisions?


Andy

^ permalink raw reply

* Re: [PATCH] [ath5k][leds] Ability to disable leds support. If leds support enabled do not force mac802.11 leds layer selection.
From: Dmytro Milinevskyy @ 2010-06-01 23:01 UTC (permalink / raw)
  To: Bob Copeland
  Cc: ath5k-devel, Jiri Slaby, Nick Kossifidis, Luis R. Rodriguez,
	John W. Linville, GeunSik Lim, Greg Kroah-Hartman, Lukas Turek,
	Mark Hindley, Johannes Berg, Jiri Kosina, Kalle Valo, Keng-Yu Lin,
	Luca Verdesca, Shahar Or, linux-wireless, netdev, linux-kernel
In-Reply-To: <AANLkTinFCuqgvnEGFmssL5R1J9-Yx7JB8mO9yKdXuYUj@mail.gmail.com>

Bob, thanks for the response.

I will rework the patch.

>> -/* GPIO-controlled software LED */
>> -#define AR5K_SOFTLED_PIN       0
>> -#define AR5K_SOFTLED_ON                0
>> -#define AR5K_SOFTLED_OFF       1
>> -
>
> Please drop this hunk, no problem keeping it around.

I suppose this should go away with another patch to keep current
clean. These dfinitions are not related to the subject.

Regards,

-- Dima

On Tue, Jun 1, 2010 at 11:34 PM, Bob Copeland <me@bobcopeland.com> wrote:
> On Wed, Apr 7, 2010 at 2:58 PM, Dmytro Milinevskyy
> <milinevskyy@gmail.com> wrote:
>> Hello!
>
> Thanks, comments inline.
>
>
>> +config ATH5K_LEDS
>> +       tristate "Atheros 5xxx wireless cards LEDs support"
>> +       depends on ATH5K
>> +       ---help---
>> +         Atheros 5xxx LED support.
>> +
>
> This can select the proper LED classes?  Then you can get rid of another
> ifdef check later.
>
>> -/* GPIO-controlled software LED */
>> -#define AR5K_SOFTLED_PIN       0
>> -#define AR5K_SOFTLED_ON                0
>> -#define AR5K_SOFTLED_OFF       1
>> -
>
> Please drop this hunk, no problem keeping it around.
>
>> +#ifdef CONFIG_ATH5K_LEDS
>>  /* LED functions */
>>  int ath5k_init_leds(struct ath5k_softc *sc);
>>  void ath5k_led_enable(struct ath5k_softc *sc);
>>  void ath5k_led_off(struct ath5k_softc *sc);
>>  void ath5k_unregister_leds(struct ath5k_softc *sc);
>> +#else
>> +#define ath5k_init_leds(sc) do {} while (0)
>> +#define ath5k_led_enable(sc) do {} while (0)
>> +#define ath5k_led_off(sc) do {} while (0)
>> +#define ath5k_unregister_leds(sc) do {} while (0)
>> +#endif
>
> I prefer:
>
> #ifdef
> ...
> #else
> static inline int ath5k_init_leds(struct ath5k_softc *sc)
> {
>    return 0;
> }
> ...
> #endif
>
> so you get type-checking.  Also this doesn't quite work in your version:
>
>    int foo = ath5k_init_leds(sc);
>
>> +#ifdef CONFIG_ATH5K_LEDS
>>  /*
>>  * State for LED triggers
>>  */
>>  struct ath5k_led
>>  {
>> -       char name[ATH5K_LED_MAX_NAME_LEN + 1];  /* name of the LED in sysfs */
>>        struct ath5k_softc *sc;                 /* driver state */
>> +#ifdef CONFIG_LEDS_CLASS
>> +       char name[ATH5K_LED_MAX_NAME_LEN + 1];  /* name of the LED in sysfs */
>>        struct led_classdev led_dev;            /* led classdev */
>> +#endif
>>  };
>> +#endif
>
> Why move name?
>
>>  /* Rfkill */
>>  struct ath5k_rfkill {
>> @@ -186,9 +190,6 @@ struct ath5k_softc {
>>
>>        u8                      bssidmask[ETH_ALEN];
>>
>> -       unsigned int            led_pin,        /* GPIO pin for driving LED */
>> -                               led_on;         /* pin setting for LED on */
>> -
>>        struct tasklet_struct   restq;          /* reset tasklet */
>>
>>        unsigned int            rxbufsize;      /* rx size based on mtu */
>> @@ -196,7 +197,6 @@ struct ath5k_softc {
>>        spinlock_t              rxbuflock;
>>        u32                     *rxlink;        /* link ptr in last RX desc */
>>        struct tasklet_struct   rxtq;           /* rx intr tasklet */
>> -       struct ath5k_led        rx_led;         /* rx led */
>>
>>        struct list_head        txbuf;          /* transmit buffer */
>>        spinlock_t              txbuflock;
>> @@ -204,7 +204,14 @@ struct ath5k_softc {
>>        struct ath5k_txq        txqs[AR5K_NUM_TX_QUEUES];       /* tx queues */
>>        struct ath5k_txq        *txq;           /* main tx queue */
>>        struct tasklet_struct   txtq;           /* tx intr tasklet */
>> +
>> +
>> +#ifdef CONFIG_ATH5K_LEDS
>> +       unsigned int            led_pin,        /* GPIO pin for driving LED */
>> +                               led_on;         /* pin setting for LED on */
>> +       struct ath5k_led        rx_led;         /* rx led */
>>        struct ath5k_led        tx_led;         /* tx led */
>> +#endif
>
> You might want to do this in two stages: move the led-dependent things
> together in the structure (or into a separate structure) and then only
> have one #ifdef section.
>
> Still too many ifdefs in general.
>
> --
> Bob Copeland %% www.bobcopeland.com
>

^ permalink raw reply

* Re: [2.6.35-rc1 regression] tg3 vpd r/w failed
From: Matt Carlson @ 2010-06-01 23:28 UTC (permalink / raw)
  To: David Miller
  Cc: mikpe@it.uu.se, netdev@vger.kernel.org, Matthew Carlson,
	linux-kernel@vger.kernel.org, sparclinux@vger.kernel.org
In-Reply-To: <20100601.153209.135521213.davem@davemloft.net>

On Tue, Jun 01, 2010 at 03:32:09PM -0700, David Miller wrote:
> 
> Mikael I'm getting tired of continuing to add the proper
> mailing list CC: for your sparc bug reports.  Please make
> sure to add sparclinux@vger.kernel.org to your reports
> in the future.
> 
> Thanks.
> 
> > Booting 2.6.35-rc1 on a Sun Blade 2500 Red with builtin tg3 I get:
> > 
> > tg3.c:v3.110 (April 9, 2010)
> > PCI: Enabling device: (0000:00:03.0), cmd 2
> > ...
> > tg3 0000:00:03.0: vpd r/w failed.  This is likely a firmware bug on this device.  Contact the card vendor for a firmware update.
> > tg3 0000:00:03.0: vpd r/w failed.  This is likely a firmware bug on this device.  Contact the card vendor for a firmware update.
> > tg3 0000:00:03.0: vpd r/w failed.  This is likely a firmware bug on this device.  Contact the card vendor for a firmware update.
> > tg3 0000:00:03.0: eth0: Tigon3 [partno(none) rev 1002] (PCI:66MHz:64-bit) MAC address ...
> > tg3 0000:00:03.0: eth0: attached PHY is 5703 (10/100/1000Base-T Ethernet) (WireSpeed[1])
> > tg3 0000:00:03.0: eth0: RXcsums[1] LinkChgREG[0] MIirq[0] ASF[0] TSOcap[1]
> > tg3 0000:00:03.0: eth0: dma_rwctrl[763f0000] dma_mask[32-bit]
> > ...
> > tg3 0000:00:03.0: eth0: No firmware running
> > tg3 0000:00:03.0: eth0: Link is up at 100 Mbps, full duplex
> > tg3 0000:00:03.0: eth0: Flow control is on for TX and on for RX
> > 
> > The three 'vpd r/w failed' messages are new and did not occur with 2.6.34
> > or earlier kernels.

These messages are originating from the PCI layer.  It looks like your
device does not have NVRAM, which is a known issue.  I have an idea on
how to reduce the noise, but let me investigate it a little more before
I commit to it.

^ permalink raw reply

* RE: IAMT broken by commit 82776a4bcd7aa5fbcd2e6339b3ce88b727dd40ab
From: Allan, Bruce W @ 2010-06-01 23:33 UTC (permalink / raw)
  To: Aurelien Jarno, Kirsher, Jeffrey T; +Cc: netdev@vger.kernel.org
In-Reply-To: <20100530010219.GA20368@volta.aurel32.net>

I will look into this in the next couple days.

-----Original Message-----
From: Aurelien Jarno [mailto:aurelien@aurel32.net] 
Sent: Saturday, May 29, 2010 6:02 PM
To: Allan, Bruce W; Kirsher, Jeffrey T
Cc: netdev@vger.kernel.org
Subject: IAMT broken by commit 82776a4bcd7aa5fbcd2e6339b3ce88b727dd40ab

Hi,

I have recently upgrade my kernel, and found that Intel AMT support is
not working anymore as expected. I have configured IAMT so that is 
always available, even when the machine is off ("Desktop: ON in S0, S3,
S4-5").

On recent kernels, IAMT support does not work after the machine has 
been powered-off. Even worse, it also goes into this state when I try
to reboot it.

I have done a bisect and got this commit:

| commit 82776a4bcd7aa5fbcd2e6339b3ce88b727dd40ab
| Author: Bruce Allan <bruce.w.allan@intel.com>
| Date:   Fri Aug 14 14:35:33 2009 +0000
| 
|     e1000e: WoL does not work on 82577/82578 with manageability enabled
|     
|     With manageability (Intel AMT) enabled via BIOS, PHY wakeup does not get
|     configured on newer parts which use PHY wakeup vs. MAC wakeup which causes
|     WoL to not work.  The driver should configure PHY wakeup whether or not
|     manageability is enabled.
|     
|     Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
|     Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
|     Signed-off-by: David S. Miller <davem@davemloft.net>

I have tried to revert it on recent kernels (2.6.34), and IAMT is then
working as expected. My machine is using a Gigabyte EQ45M-S2 motherboard
with an 82567LM-3 ethernet chip (8086:10de), that is a different model
than the one of the original problem.

I do wonder if the changes in the patch should not only be done on some 
chip models, and I will appreciate any help in fixing this issue.

Thanks,
Aurelien


-- 
Aurelien Jarno                          GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply

* Re: IAMT broken by commit 82776a4bcd7aa5fbcd2e6339b3ce88b727dd40ab
From: Kirsher, Jeffrey T @ 2010-06-01 23:35 UTC (permalink / raw)
  To: Allan, Bruce W, 'aurelien@aurel32.net'
  Cc: 'netdev@vger.kernel.org'

Thanks.

--
Cheers,
Jeff


-----Original Message-----
From: Allan, Bruce W <bruce.w.allan@intel.com>
To: Aurelien Jarno <aurelien@aurel32.net>; Kirsher, Jeffrey T <jeffrey.t.kirsher@intel.com>
CC: netdev@vger.kernel.org <netdev@vger.kernel.org>
Sent: Tue Jun 01 16:33:27 2010
Subject: RE: IAMT broken by commit 82776a4bcd7aa5fbcd2e6339b3ce88b727dd40ab

I will look into this in the next couple days.

-----Original Message-----
From: Aurelien Jarno [mailto:aurelien@aurel32.net] 
Sent: Saturday, May 29, 2010 6:02 PM
To: Allan, Bruce W; Kirsher, Jeffrey T
Cc: netdev@vger.kernel.org
Subject: IAMT broken by commit 82776a4bcd7aa5fbcd2e6339b3ce88b727dd40ab

Hi,

I have recently upgrade my kernel, and found that Intel AMT support is
not working anymore as expected. I have configured IAMT so that is 
always available, even when the machine is off ("Desktop: ON in S0, S3,
S4-5").

On recent kernels, IAMT support does not work after the machine has 
been powered-off. Even worse, it also goes into this state when I try
to reboot it.

I have done a bisect and got this commit:

| commit 82776a4bcd7aa5fbcd2e6339b3ce88b727dd40ab
| Author: Bruce Allan <bruce.w.allan@intel.com>
| Date:   Fri Aug 14 14:35:33 2009 +0000
| 
|     e1000e: WoL does not work on 82577/82578 with manageability enabled
|     
|     With manageability (Intel AMT) enabled via BIOS, PHY wakeup does not get
|     configured on newer parts which use PHY wakeup vs. MAC wakeup which causes
|     WoL to not work.  The driver should configure PHY wakeup whether or not
|     manageability is enabled.
|     
|     Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
|     Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
|     Signed-off-by: David S. Miller <davem@davemloft.net>

I have tried to revert it on recent kernels (2.6.34), and IAMT is then
working as expected. My machine is using a Gigabyte EQ45M-S2 motherboard
with an 82567LM-3 ethernet chip (8086:10de), that is a different model
than the one of the original problem.

I do wonder if the changes in the patch should not only be done on some 
chip models, and I will appreciate any help in fixing this issue.

Thanks,
Aurelien


-- 
Aurelien Jarno                          GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply

* Re: [PATCH 3/3] vhost: apply cpumask and cgroup to vhost workers
From: Tejun Heo @ 2010-06-01 23:59 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: Michael S. Tsirkin, Oleg Nesterov, netdev, lkml,
	kvm@vger.kernel.org, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Ingo Molnar, Andi Kleen
In-Reply-To: <1275412754.989.10.camel@w-sridhar.beaverton.ibm.com>

On 06/01/2010 07:19 PM, Sridhar Samudrala wrote:
>> -	int i;
>> +	cpumask_var_t mask;
>> +	int i, ret = -ENOMEM;
>> +
>> +	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
>> +		goto out_free_mask;
> 
> I think this is another bug in the error path. You should simply
> do a return instead of a goto here when aloc_cpu_mask fails.

Oh... it's always safe to call free_cpumask_var() after failed
alloc_cpumask_var(), so that part isn't broken.

Thanks.

-- 
tejun

^ permalink raw reply

* [PATCH net-2.6] bnx2: Fix hang during rmmod bnx2.
From: Michael Chan @ 2010-06-02  1:05 UTC (permalink / raw)
  To: davem; +Cc: netdev

The regression is caused by:

commit 4327ba435a56ada13eedf3eb332e583c7a0586a9
    bnx2: Fix netpoll crash.

If ->open() and ->close() are called multiple times, the same napi structs
will be added to dev->napi_list multiple times, corrupting the dev->napi_list.
This causes free_netdev() to hang during rmmod.

We fix this by calling netif_napi_del() during ->close().

Also, bnx2_init_napi() must not be in the __devinit section since it is
called by ->open().

Signed-off-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: Benjamin Li <benli@broadcom.com>
---
 drivers/net/bnx2.c |   14 +++++++++++++-
 1 files changed, 13 insertions(+), 1 deletions(-)

diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index 188e356..949d7a9 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -247,6 +247,7 @@ static const struct flash_spec flash_5709 = {
 MODULE_DEVICE_TABLE(pci, bnx2_pci_tbl);
 
 static void bnx2_init_napi(struct bnx2 *bp);
+static void bnx2_del_napi(struct bnx2 *bp);
 
 static inline u32 bnx2_tx_avail(struct bnx2 *bp, struct bnx2_tx_ring_info *txr)
 {
@@ -6270,6 +6271,7 @@ open_err:
 	bnx2_free_skbs(bp);
 	bnx2_free_irq(bp);
 	bnx2_free_mem(bp);
+	bnx2_del_napi(bp);
 	return rc;
 }
 
@@ -6537,6 +6539,7 @@ bnx2_close(struct net_device *dev)
 	bnx2_free_irq(bp);
 	bnx2_free_skbs(bp);
 	bnx2_free_mem(bp);
+	bnx2_del_napi(bp);
 	bp->link_up = 0;
 	netif_carrier_off(bp->dev);
 	bnx2_set_power_state(bp, PCI_D3hot);
@@ -8227,7 +8230,16 @@ bnx2_bus_string(struct bnx2 *bp, char *str)
 	return str;
 }
 
-static void __devinit
+static void
+bnx2_del_napi(struct bnx2 *bp)
+{
+	int i;
+
+	for (i = 0; i < bp->irq_nvecs; i++)
+		netif_napi_del(&bp->bnx2_napi[i].napi);
+}
+
+static void
 bnx2_init_napi(struct bnx2 *bp)
 {
 	int i;
-- 
1.6.4.GIT



^ permalink raw reply related

* Re: [PATCH]: vxge: Fix checkstack warning in vxge_probe()
From: Jon Mason @ 2010-06-02  3:03 UTC (permalink / raw)
  To: Prarit Bhargava; +Cc: netdev, mschmidt
In-Reply-To: <20100528175735.30903.21134.sendpatchset@prarit.bos.redhat.com>

On Fri, 2010-05-28 at 14:01 -0400, Prarit Bhargava wrote:
> Linux 2.6.33 reports this checkstack warning:
> 
> drivers/net/vxge/vxge-main.c: In function 'vxge_probe':
> drivers/net/vxge/vxge-main.c:4409: warning: the frame size of 1028 bytes is larger than 1024 bytes
> 
> This warning does not occur in the latest linux-2.6 or linux-next, however,
> when I do a 'make -j32 CONFIG_FRAME_WARN=512' instead of 1024 I see
> 
> drivers/net/vxge/vxge-main.c: In function ‘vxge_probe’:
> drivers/net/vxge/vxge-main.c:4423: warning: the frame size of 1024 bytes is larger than 512 bytes
> 
> This patch moves the large vxge_config struct off the stack.

The patch is sufficient, but I think it doesn't go far enough.  The
struct vxge_config in struct vxgedev should be malloc'd as well.  struct
vxgedev is entirely too big and needs to be smaller, for cache perf and
other reasons.

Thanks,
Jon

> 
> Signed-off-by: Prarit Bhargava <prarit@redhat.com>
> 
> diff --git a/drivers/net/vxge/vxge-main.c b/drivers/net/vxge/vxge-main.c
> index 2bab364..b955802 100644
> --- a/drivers/net/vxge/vxge-main.c
> +++ b/drivers/net/vxge/vxge-main.c
> @@ -4016,7 +4016,7 @@ vxge_probe(struct pci_dev *pdev, const struct pci_device_id *pre)
>  	int high_dma = 0;
>  	u64 vpath_mask = 0;
>  	struct vxgedev *vdev;
> -	struct vxge_config ll_config;
> +	struct vxge_config *ll_config = NULL;
>  	struct vxge_hw_device_config *device_config = NULL;
>  	struct vxge_hw_device_attr attr;
>  	int i, j, no_of_vpath = 0, max_vpath_supported = 0;
> @@ -4075,17 +4075,24 @@ vxge_probe(struct pci_dev *pdev, const struct pci_device_id *pre)
>  		goto _exit0;
>  	}
>  
> -	memset(&ll_config, 0, sizeof(struct vxge_config));
> -	ll_config.tx_steering_type = TX_MULTIQ_STEERING;
> -	ll_config.intr_type = MSI_X;
> -	ll_config.napi_weight = NEW_NAPI_WEIGHT;
> -	ll_config.rth_steering = RTH_STEERING;
> +	ll_config = kzalloc(sizeof(*ll_config), GFP_KERNEL);
> +	if (!ll_config) {
> +		ret = -ENOMEM;
> +		vxge_debug_init(VXGE_ERR,
> +			"ll_config : malloc failed %s %d",
> +			__FILE__, __LINE__);
> +		goto _exit0;
> +	}
> +	ll_config->tx_steering_type = TX_MULTIQ_STEERING;
> +	ll_config->intr_type = MSI_X;
> +	ll_config->napi_weight = NEW_NAPI_WEIGHT;
> +	ll_config->rth_steering = RTH_STEERING;
>  
>  	/* get the default configuration parameters */
>  	vxge_hw_device_config_default_get(device_config);
>  
>  	/* initialize configuration parameters */
> -	vxge_device_config_init(device_config, &ll_config.intr_type);
> +	vxge_device_config_init(device_config, &ll_config->intr_type);
>  
>  	ret = pci_enable_device(pdev);
>  	if (ret) {
> @@ -4138,7 +4145,7 @@ vxge_probe(struct pci_dev *pdev, const struct pci_device_id *pre)
>  		(unsigned long long)pci_resource_start(pdev, 0));
>  
>  	status = vxge_hw_device_hw_info_get(attr.bar0,
> -			&ll_config.device_hw_info);
> +			&ll_config->device_hw_info);
>  	if (status != VXGE_HW_OK) {
>  		vxge_debug_init(VXGE_ERR,
>  			"%s: Reading of hardware info failed."
> @@ -4147,7 +4154,7 @@ vxge_probe(struct pci_dev *pdev, const struct pci_device_id *pre)
>  		goto _exit3;
>  	}
>  
> -	if (ll_config.device_hw_info.fw_version.major !=
> +	if (ll_config->device_hw_info.fw_version.major !=
>  		VXGE_DRIVER_FW_VERSION_MAJOR) {
>  		vxge_debug_init(VXGE_ERR,
>  			"%s: Incorrect firmware version."
> @@ -4157,7 +4164,7 @@ vxge_probe(struct pci_dev *pdev, const struct pci_device_id *pre)
>  		goto _exit3;
>  	}
>  
> -	vpath_mask = ll_config.device_hw_info.vpath_mask;
> +	vpath_mask = ll_config->device_hw_info.vpath_mask;
>  	if (vpath_mask == 0) {
>  		vxge_debug_ll_config(VXGE_TRACE,
>  			"%s: No vpaths available in device", VXGE_DRIVER_NAME);
> @@ -4169,10 +4176,10 @@ vxge_probe(struct pci_dev *pdev, const struct pci_device_id *pre)
>  		"%s:%d  Vpath mask = %llx", __func__, __LINE__,
>  		(unsigned long long)vpath_mask);
>  
> -	function_mode = ll_config.device_hw_info.function_mode;
> -	host_type = ll_config.device_hw_info.host_type;
> +	function_mode = ll_config->device_hw_info.function_mode;
> +	host_type = ll_config->device_hw_info.host_type;
>  	is_privileged = __vxge_hw_device_is_privilaged(host_type,
> -		ll_config.device_hw_info.func_id);
> +		ll_config->device_hw_info.func_id);
>  
>  	/* Check how many vpaths are available */
>  	for (i = 0; i < VXGE_HW_MAX_VIRTUAL_PATHS; i++) {
> @@ -4186,7 +4193,7 @@ vxge_probe(struct pci_dev *pdev, const struct pci_device_id *pre)
>  
>  	/* Enable SRIOV mode, if firmware has SRIOV support and if it is a PF */
>  	if (is_sriov(function_mode) && (max_config_dev > 1) &&
> -		(ll_config.intr_type != INTA) &&
> +		(ll_config->intr_type != INTA) &&
>  		(is_privileged == VXGE_HW_OK)) {
>  		ret = pci_enable_sriov(pdev, ((max_config_dev - 1) < num_vfs)
>  			? (max_config_dev - 1) : num_vfs);
> @@ -4199,7 +4206,7 @@ vxge_probe(struct pci_dev *pdev, const struct pci_device_id *pre)
>  	 * Configure vpaths and get driver configured number of vpaths
>  	 * which is less than or equal to the maximum vpaths per function.
>  	 */
> -	no_of_vpath = vxge_config_vpaths(device_config, vpath_mask, &ll_config);
> +	no_of_vpath = vxge_config_vpaths(device_config, vpath_mask, ll_config);
>  	if (!no_of_vpath) {
>  		vxge_debug_ll_config(VXGE_ERR,
>  			"%s: No more vpaths to configure", VXGE_DRIVER_NAME);
> @@ -4234,21 +4241,21 @@ vxge_probe(struct pci_dev *pdev, const struct pci_device_id *pre)
>  	/* set private device info */
>  	pci_set_drvdata(pdev, hldev);
>  
> -	ll_config.gro_enable = VXGE_GRO_ALWAYS_AGGREGATE;
> -	ll_config.fifo_indicate_max_pkts = VXGE_FIFO_INDICATE_MAX_PKTS;
> -	ll_config.addr_learn_en = addr_learn_en;
> -	ll_config.rth_algorithm = RTH_ALG_JENKINS;
> -	ll_config.rth_hash_type_tcpipv4 = VXGE_HW_RING_HASH_TYPE_TCP_IPV4;
> -	ll_config.rth_hash_type_ipv4 = VXGE_HW_RING_HASH_TYPE_NONE;
> -	ll_config.rth_hash_type_tcpipv6 = VXGE_HW_RING_HASH_TYPE_NONE;
> -	ll_config.rth_hash_type_ipv6 = VXGE_HW_RING_HASH_TYPE_NONE;
> -	ll_config.rth_hash_type_tcpipv6ex = VXGE_HW_RING_HASH_TYPE_NONE;
> -	ll_config.rth_hash_type_ipv6ex = VXGE_HW_RING_HASH_TYPE_NONE;
> -	ll_config.rth_bkt_sz = RTH_BUCKET_SIZE;
> -	ll_config.tx_pause_enable = VXGE_PAUSE_CTRL_ENABLE;
> -	ll_config.rx_pause_enable = VXGE_PAUSE_CTRL_ENABLE;
> -
> -	if (vxge_device_register(hldev, &ll_config, high_dma, no_of_vpath,
> +	ll_config->gro_enable = VXGE_GRO_ALWAYS_AGGREGATE;
> +	ll_config->fifo_indicate_max_pkts = VXGE_FIFO_INDICATE_MAX_PKTS;
> +	ll_config->addr_learn_en = addr_learn_en;
> +	ll_config->rth_algorithm = RTH_ALG_JENKINS;
> +	ll_config->rth_hash_type_tcpipv4 = VXGE_HW_RING_HASH_TYPE_TCP_IPV4;
> +	ll_config->rth_hash_type_ipv4 = VXGE_HW_RING_HASH_TYPE_NONE;
> +	ll_config->rth_hash_type_tcpipv6 = VXGE_HW_RING_HASH_TYPE_NONE;
> +	ll_config->rth_hash_type_ipv6 = VXGE_HW_RING_HASH_TYPE_NONE;
> +	ll_config->rth_hash_type_tcpipv6ex = VXGE_HW_RING_HASH_TYPE_NONE;
> +	ll_config->rth_hash_type_ipv6ex = VXGE_HW_RING_HASH_TYPE_NONE;
> +	ll_config->rth_bkt_sz = RTH_BUCKET_SIZE;
> +	ll_config->tx_pause_enable = VXGE_PAUSE_CTRL_ENABLE;
> +	ll_config->rx_pause_enable = VXGE_PAUSE_CTRL_ENABLE;
> +
> +	if (vxge_device_register(hldev, ll_config, high_dma, no_of_vpath,
>  		&vdev)) {
>  		ret = -EINVAL;
>  		goto _exit4;
> @@ -4279,7 +4286,7 @@ vxge_probe(struct pci_dev *pdev, const struct pci_device_id *pre)
>  		vdev->vpaths[j].vdev = vdev;
>  		vdev->vpaths[j].max_mac_addr_cnt = max_mac_vpath;
>  		memcpy((u8 *)vdev->vpaths[j].macaddr,
> -				(u8 *)ll_config.device_hw_info.mac_addrs[i],
> +				ll_config->device_hw_info.mac_addrs[i],
>  				ETH_ALEN);
>  
>  		/* Initialize the mac address list header */
> @@ -4300,18 +4307,18 @@ vxge_probe(struct pci_dev *pdev, const struct pci_device_id *pre)
>  
>  	macaddr = (u8 *)vdev->vpaths[0].macaddr;
>  
> -	ll_config.device_hw_info.serial_number[VXGE_HW_INFO_LEN - 1] = '\0';
> -	ll_config.device_hw_info.product_desc[VXGE_HW_INFO_LEN - 1] = '\0';
> -	ll_config.device_hw_info.part_number[VXGE_HW_INFO_LEN - 1] = '\0';
> +	ll_config->device_hw_info.serial_number[VXGE_HW_INFO_LEN - 1] = '\0';
> +	ll_config->device_hw_info.product_desc[VXGE_HW_INFO_LEN - 1] = '\0';
> +	ll_config->device_hw_info.part_number[VXGE_HW_INFO_LEN - 1] = '\0';
>  
>  	vxge_debug_init(VXGE_TRACE, "%s: SERIAL NUMBER: %s",
> -		vdev->ndev->name, ll_config.device_hw_info.serial_number);
> +		vdev->ndev->name, ll_config->device_hw_info.serial_number);
>  
>  	vxge_debug_init(VXGE_TRACE, "%s: PART NUMBER: %s",
> -		vdev->ndev->name, ll_config.device_hw_info.part_number);
> +		vdev->ndev->name, ll_config->device_hw_info.part_number);
>  
>  	vxge_debug_init(VXGE_TRACE, "%s: Neterion %s Server Adapter",
> -		vdev->ndev->name, ll_config.device_hw_info.product_desc);
> +		vdev->ndev->name, ll_config->device_hw_info.product_desc);
>  
>  	vxge_debug_init(VXGE_TRACE, "%s: MAC ADDR: %pM",
>  		vdev->ndev->name, macaddr);
> @@ -4321,11 +4328,11 @@ vxge_probe(struct pci_dev *pdev, const struct pci_device_id *pre)
>  
>  	vxge_debug_init(VXGE_TRACE,
>  		"%s: Firmware version : %s Date : %s", vdev->ndev->name,
> -		ll_config.device_hw_info.fw_version.version,
> -		ll_config.device_hw_info.fw_date.date);
> +		ll_config->device_hw_info.fw_version.version,
> +		ll_config->device_hw_info.fw_date.date);
>  
>  	if (new_device) {
> -		switch (ll_config.device_hw_info.function_mode) {
> +		switch (ll_config->device_hw_info.function_mode) {
>  		case VXGE_HW_FUNCTION_MODE_SINGLE_FUNCTION:
>  			vxge_debug_init(VXGE_TRACE,
>  			"%s: Single Function Mode Enabled", vdev->ndev->name);
> @@ -4348,7 +4355,7 @@ vxge_probe(struct pci_dev *pdev, const struct pci_device_id *pre)
>  	vxge_print_parm(vdev, vpath_mask);
>  
>  	/* Store the fw version for ethttool option */
> -	strcpy(vdev->fw_version, ll_config.device_hw_info.fw_version.version);
> +	strcpy(vdev->fw_version, ll_config->device_hw_info.fw_version.version);
>  	memcpy(vdev->ndev->dev_addr, (u8 *)vdev->vpaths[0].macaddr, ETH_ALEN);
>  	memcpy(vdev->ndev->perm_addr, vdev->ndev->dev_addr, ETH_ALEN);
>  
> @@ -4387,7 +4394,7 @@ vxge_probe(struct pci_dev *pdev, const struct pci_device_id *pre)
>  	 * present to prevent such a failure.
>  	 */
>  
> -	if (ll_config.device_hw_info.function_mode ==
> +	if (ll_config->device_hw_info.function_mode ==
>  		VXGE_HW_FUNCTION_MODE_MULTI_FUNCTION)
>  		if (vdev->config.intr_type == INTA)
>  			vxge_hw_device_unmask_all(hldev);
> @@ -4399,6 +4406,7 @@ vxge_probe(struct pci_dev *pdev, const struct pci_device_id *pre)
>  	VXGE_COPY_DEBUG_INFO_TO_LL(vdev, vxge_hw_device_error_level_get(hldev),
>  		vxge_hw_device_trace_level_get(hldev));
>  
> +	kfree(ll_config);
>  	return 0;
>  
>  _exit5:
> @@ -4416,6 +4424,7 @@ _exit2:
>  _exit1:
>  	pci_disable_device(pdev);
>  _exit0:
> +	kfree(ll_config);
>  	kfree(device_config);
>  	driver_config->config_dev_cnt--;
>  	pci_set_drvdata(pdev, NULL);
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply

* [PATCH nf-next-2.6] netfilter: vmalloc_node cleanup
From: Eric Dumazet @ 2010-06-02  5:02 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: netdev, Netfilter Development Mailinglist

Using vmalloc_node(size, numa_node_id()) for temporary storage is not
needed. vmalloc(size) is more respectful of user NUMA policy.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/ipv4/netfilter/arp_tables.c |    7 +++----
 net/ipv4/netfilter/ip_tables.c  |    4 ++--
 net/ipv6/netfilter/ip6_tables.c |    7 +++----
 3 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 1ac01b1..16c0ba0 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -758,7 +758,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 	 * about).
 	 */
 	countersize = sizeof(struct xt_counters) * private->number;
-	counters = vmalloc_node(countersize, numa_node_id());
+	counters = vmalloc(countersize);
 
 	if (counters == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -1005,8 +1005,7 @@ static int __do_replace(struct net *net, const char *name,
 	struct arpt_entry *iter;
 
 	ret = 0;
-	counters = vmalloc_node(num_counters * sizeof(struct xt_counters),
-				numa_node_id());
+	counters = vmalloc(num_counters * sizeof(struct xt_counters));
 	if (!counters) {
 		ret = -ENOMEM;
 		goto out;
@@ -1159,7 +1158,7 @@ static int do_add_counters(struct net *net, const void __user *user,
 	if (len != size + num_counters * sizeof(struct xt_counters))
 		return -EINVAL;
 
-	paddc = vmalloc_node(len - size, numa_node_id());
+	paddc = vmalloc(len - size);
 	if (!paddc)
 		return -ENOMEM;
 
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 63958f3..7c0b8ad 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -928,7 +928,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 	   (other than comefrom, which userspace doesn't care
 	   about). */
 	countersize = sizeof(struct xt_counters) * private->number;
-	counters = vmalloc_node(countersize, numa_node_id());
+	counters = vmalloc(countersize);
 
 	if (counters == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -1352,7 +1352,7 @@ do_add_counters(struct net *net, const void __user *user,
 	if (len != size + num_counters * sizeof(struct xt_counters))
 		return -EINVAL;
 
-	paddc = vmalloc_node(len - size, numa_node_id());
+	paddc = vmalloc(len - size);
 	if (!paddc)
 		return -ENOMEM;
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 6f517bd..82945ef 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -943,7 +943,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 	   (other than comefrom, which userspace doesn't care
 	   about). */
 	countersize = sizeof(struct xt_counters) * private->number;
-	counters = vmalloc_node(countersize, numa_node_id());
+	counters = vmalloc(countersize);
 
 	if (counters == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -1213,8 +1213,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	struct ip6t_entry *iter;
 
 	ret = 0;
-	counters = vmalloc_node(num_counters * sizeof(struct xt_counters),
-				numa_node_id());
+	counters = vmalloc(num_counters * sizeof(struct xt_counters));
 	if (!counters) {
 		ret = -ENOMEM;
 		goto out;
@@ -1368,7 +1367,7 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
 	if (len != size + num_counters * sizeof(struct xt_counters))
 		return -EINVAL;
 
-	paddc = vmalloc_node(len - size, numa_node_id());
+	paddc = vmalloc(len - size);
 	if (!paddc)
 		return -ENOMEM;
 



^ permalink raw reply related

* Re: replace hooks in __netif_receive_skb (v4)
From: Jiri Pirko @ 2010-06-02  6:50 UTC (permalink / raw)
  To: Fischer, Anna
  Cc: Stephen Hemminger, davem@davemloft.net, netdev@vger.kernel.org,
	kaber@trash.net, eric.dumazet@gmail.com
In-Reply-To: <0199E0D51A61344794750DC57738F58E70B17D377A@GVW1118EXC.americas.hpqcorp.net>

Tue, Jun 01, 2010 at 06:13:51PM CEST, anna.fischer@hp.com wrote:
>> Subject: net: replace hooks in __netif_receive_skb (v4)
>> 
>> 
>> From: Jiri Pirko <jpirko@redhat.com>
>> 
>> What this patch does is it removes two receive frame hooks (for bridge
>> and for
>> macvlan) from __netif_receive_skb. These are replaced them with a
>> single
>> hook for both. It only supports one hook per device because it makes no
>> sense to do bridging and macvlan on the same device.
>> 
>> Then a network driver (of virtual netdev like macvlan or bridge) can
>> register
>> an rx_handler for needed net device.
>
>
>I think the idea of this is really good, and it has been long required to get rid of the bridging hook and the "hack" for macvlan to get into the network stack. 
>
>However, I wonder, if this is to be used as a generic interface, then why the restriction of only having a single hook per device? Yes, it makes sense to do this for the bridge/macvlan combination, but in general there could be other cases where you would want to allow multiple receivers per device. 

Right, I agree.

>
>
>> @@ -2792,6 +2778,7 @@ EXPORT_SYMBOL(__skb_bond_should_drop);
>>  static int __netif_receive_skb(struct sk_buff *skb)
>>  {
>>  	struct packet_type *ptype, *pt_prev;
>> +	rx_callback_func_t *rh;
>>  	struct net_device *orig_dev;
>>  	struct net_device *master;
>>  	struct net_device *null_or_orig;
>> @@ -2855,12 +2842,16 @@ static int __netif_receive_skb(struct sk
>>  ncls:
>>  #endif
>> 
>> -	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
>> -	if (!skb)
>> -		goto out;
>> -	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
>> -	if (!skb)
>> -		goto out;
>> +	/* Handle special case of bridge or macvlan */
>> +	if ((rh = rcu_dereference(skb->dev->rx_handler)) != NULL) {
>> +		if (pt_prev) {
>> +			ret = deliver_skb(skb, pt_prev, orig_dev);
>
>What happens with 'ret' here? It is completely ignored, isn't it? Because you assume that there is only a single receiver per device? I think it would be much better to have return codes that indicate whether a packet has been consumed by a receive handler, or whether it is supposed to be processed further. The same actually applies for the packet handlers that are processed a bit further down in netif_receive_skb() - the return code is ignored and so a handler has no control over further processing of the packet.

It's not ignored. it's returned at the end of the function __netif_receive_skb.

^ permalink raw reply

* Re: net: replace hooks in __netif_receive_skb (v4)
From: Jiri Pirko @ 2010-06-02  7:02 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: davem, netdev, kaber, eric.dumazet
In-Reply-To: <20100601091004.7885cd8c@nehalam>

Tue, Jun 01, 2010 at 06:10:04PM CEST, shemminger@vyatta.com wrote:
>On Tue, 1 Jun 2010 17:41:07 +0200
>Jiri Pirko <jpirko@redhat.com> wrote:
>
>> Actually, I'm not happy about this (reducing to only one hook) and for two
>> reasons:
>> 
>> 1) I think it's a good behaviour to "mask" one handler by another in case device
>> is for example used for macvlan and then added to bridge. Because when it's
>> again removed from the bridge, the original functionality is restored. And also,
>> this would be consistent with the current behaviour.
>> 
>> 2) I would imagine a situation, when multiple handers are needed in cascade.
>> Actually I'm working on a virtual device draft which uses two handlers, although
>> in corner situation.
>> 
>> Regards,
>> 	Jirka
>
>I don't like it because:
>
>1) Adding macvlan to bridge makes no sense. The bridge is already in promicious mode.

Right, I'm not saying it has sense to have it together in the same moment. But
you can to this:

# ip link add link eth0 type macvlan

You can use eth0 + macvlan0. Now you do:

# brctl addif br0 eth0

Direct use of eth0 and macvlan0 is not not possible now.

# brctl delif br0 eth0

Now the original functionality ot eth0 and macvlan0 is restored.

>
>2) I don't like to see functionality added when it is not needed.
>
>3) The extra overhead of traversing list causes more cache activity in extreme
>   hot path.
>

But ok, I hear your arguments, I thought about them myself before I did the
patch and I thought that added overhead (which is not too big, I see one more
dereference, rx_handler pointer) would be acceptible.

Maybe someone other then us has opinion too :)

>Wait and add the multiple handlers when your code is included.
>
>-- 
>http://www.extremeprogramming.org/rules/early.html

^ permalink raw reply

* Re: net: replace hooks in __netif_receive_skb (v4)
From: Jiri Pirko @ 2010-06-02  7:24 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: davem, netdev, kaber, eric.dumazet
In-Reply-To: <20100601082805.1c84b16d@nehalam>

Few nitpicks, I'm going to send patch reflecting this later.

Tue, Jun 01, 2010 at 05:28:05PM CEST, shemminger@vyatta.com wrote:
>
>From: Jiri Pirko <jpirko@redhat.com>
>
>What this patch does is it removes two receive frame hooks (for bridge and for
>macvlan) from __netif_receive_skb. These are replaced them with a single
>hook for both. It only supports one hook per device because it makes no
>sense to do bridging and macvlan on the same device.
>
>Then a network driver (of virtual netdev like macvlan or bridge) can register
>an rx_handler for needed net device.
>
>Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>---
>v3->v4 
>       - only one hook per device.
>v2->v3
>	- used GPL-ed exports
>v1->v2
>	- writers are locked by rtnl_lock (removed unnecessary spinlock)
>	- using call_rcu in unregister
>	- nicer macvlan_port_create cleanup
>	- struct rx_hanler is made const in funtion parameters
>
> drivers/net/macvlan.c      |   19 ++++---
> include/linux/if_bridge.h  |    2 
> include/linux/if_macvlan.h |    4 -
> include/linux/netdevice.h  |    8 +++
> net/bridge/br.c            |    2 
> net/bridge/br_if.c         |    8 +++
> net/bridge/br_input.c      |   12 +++-
> net/bridge/br_private.h    |    3 -
> net/core/dev.c             |  119 ++++++++++++++++++++-------------------------
> 9 files changed, 94 insertions(+), 83 deletions(-)
>
>--- a/drivers/net/macvlan.c	2010-05-28 08:41:38.248169422 -0700
>+++ b/drivers/net/macvlan.c	2010-06-01 08:16:52.307206412 -0700
>@@ -145,15 +145,16 @@ static void macvlan_broadcast(struct sk_
> }
> 
> /* called under rcu_read_lock() from netif_receive_skb */
>-static struct sk_buff *macvlan_handle_frame(struct macvlan_port *port,
>-					    struct sk_buff *skb)
>+static struct sk_buff *macvlan_handle_frame(struct sk_buff *skb)
> {
>+	struct macvlan_port *port;
> 	const struct ethhdr *eth = eth_hdr(skb);
> 	const struct macvlan_dev *vlan;
> 	const struct macvlan_dev *src;
> 	struct net_device *dev;
> 	unsigned int len;
> 
>+	port = rcu_dereference(skb->dev->macvlan_port);
> 	if (is_multicast_ether_addr(eth->h_dest)) {
> 		src = macvlan_hash_lookup(port, eth->h_source);
> 		if (!src)
>@@ -515,6 +516,7 @@ static int macvlan_port_create(struct ne
> {
> 	struct macvlan_port *port;
> 	unsigned int i;
>+	int err;
> 
> 	if (dev->type != ARPHRD_ETHER || dev->flags & IFF_LOOPBACK)
> 		return -EINVAL;
>@@ -528,13 +530,21 @@ static int macvlan_port_create(struct ne
> 	for (i = 0; i < MACVLAN_HASH_SIZE; i++)
> 		INIT_HLIST_HEAD(&port->vlan_hash[i]);
> 	rcu_assign_pointer(dev->macvlan_port, port);
>-	return 0;
>+
>+	err = netdev_rx_handler_register(dev, macvlan_handle_frame);
>+	if (err) {
>+		dev->macvlan_port = NULL;

Why not to use rcu_assign_pointer here? since this is not hot path, that would be
nicer. It should be done for rcu-protected pointers (see rcu_assign_poiter
comment).

>+		kfree(port);
>+	}
>+
>+	return err;
> }
> 
> static void macvlan_port_destroy(struct net_device *dev)
> {
> 	struct macvlan_port *port = dev->macvlan_port;
> 
>+	netdev_rx_handler_unregister(dev);
> 	rcu_assign_pointer(dev->macvlan_port, NULL);
> 	synchronize_rcu();
> 	kfree(port);
>@@ -767,14 +777,12 @@ static int __init macvlan_init_module(vo
> 	int err;
> 
> 	register_netdevice_notifier(&macvlan_notifier_block);
>-	macvlan_handle_frame_hook = macvlan_handle_frame;
> 
> 	err = macvlan_link_register(&macvlan_link_ops);
> 	if (err < 0)
> 		goto err1;
> 	return 0;
> err1:
>-	macvlan_handle_frame_hook = NULL;
> 	unregister_netdevice_notifier(&macvlan_notifier_block);
> 	return err;
> }
>@@ -782,7 +790,6 @@ err1:
> static void __exit macvlan_cleanup_module(void)
> {
> 	rtnl_link_unregister(&macvlan_link_ops);
>-	macvlan_handle_frame_hook = NULL;
> 	unregister_netdevice_notifier(&macvlan_notifier_block);
> }
> 
>--- a/include/linux/if_bridge.h	2010-05-28 08:35:11.628190230 -0700
>+++ b/include/linux/if_bridge.h	2010-06-01 08:02:39.859736930 -0700
>@@ -102,8 +102,6 @@ struct __fdb_entry {
> #include <linux/netdevice.h>
> 
> extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *));
>-extern struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
>-					       struct sk_buff *skb);
> extern int (*br_should_route_hook)(struct sk_buff *skb);
> 
> #endif
>--- a/include/linux/if_macvlan.h	2010-05-28 08:35:11.628190230 -0700
>+++ b/include/linux/if_macvlan.h	2010-06-01 08:02:39.869738723 -0700
>@@ -84,8 +84,4 @@ extern int macvlan_link_register(struct 
> extern netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
> 				      struct net_device *dev);
> 
>-
>-extern struct sk_buff *(*macvlan_handle_frame_hook)(struct macvlan_port *,
>-						    struct sk_buff *);
>-
> #endif /* _LINUX_IF_MACVLAN_H */
>--- a/include/linux/netdevice.h	2010-05-28 08:41:38.628169375 -0700
>+++ b/include/linux/netdevice.h	2010-06-01 08:12:59.680035275 -0700
>@@ -254,6 +254,7 @@ struct netdev_hw_addr_list {
> #define netdev_for_each_mc_addr(ha, dev) \
> 	netdev_hw_addr_list_for_each(ha, &(dev)->mc)
> 
>+
Forgotten free line.

> struct hh_cache {
> 	struct hh_cache *hh_next;	/* Next entry			     */
> 	atomic_t	hh_refcnt;	/* number of users                   */
>@@ -381,6 +382,8 @@ enum gro_result {
> };
> typedef enum gro_result gro_result_t;
> 
>+typedef struct sk_buff *rx_callback_func_t(struct sk_buff *skb);
>+
> extern void __napi_schedule(struct napi_struct *n);
> 
> static inline int napi_disable_pending(struct napi_struct *n)
>@@ -957,6 +960,7 @@ struct net_device {
> #endif
> 
> 	struct netdev_queue	rx_queue;
>+	rx_callback_func_t	*rx_handler;
> 
> 	struct netdev_queue	*_tx ____cacheline_aligned_in_smp;
> 
>@@ -1693,6 +1697,10 @@ static inline void napi_free_frags(struc
> 	napi->skb = NULL;
> }
> 
>+extern int netdev_rx_handler_register(struct net_device *dev,
>+				      rx_callback_func_t *hook);
>+extern void netdev_rx_handler_unregister(struct net_device *dev);
>+
> extern void		netif_nit_deliver(struct sk_buff *skb);
> extern int		dev_valid_name(const char *name);
> extern int		dev_ioctl(struct net *net, unsigned int cmd, void __user *);
>--- a/net/bridge/br.c	2010-05-28 08:35:11.858189945 -0700
>+++ b/net/bridge/br.c	2010-06-01 08:02:39.889741050 -0700
>@@ -63,7 +63,6 @@ static int __init br_init(void)
> 		goto err_out4;
> 
> 	brioctl_set(br_ioctl_deviceless_stub);
>-	br_handle_frame_hook = br_handle_frame;
> 
> #if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
> 	br_fdb_test_addr_hook = br_fdb_test_addr;
>@@ -100,7 +99,6 @@ static void __exit br_deinit(void)
> 	br_fdb_test_addr_hook = NULL;
> #endif
> 
>-	br_handle_frame_hook = NULL;
> 	br_fdb_fini();
> }
> 
>--- a/net/bridge/br_if.c	2010-05-28 08:35:11.858189945 -0700
>+++ b/net/bridge/br_if.c	2010-06-01 08:14:30.521680943 -0700
>@@ -147,6 +147,7 @@ static void del_nbp(struct net_bridge_po
> 
> 	list_del_rcu(&p->list);
> 
>+	netdev_rx_handler_unregister(dev);
> 	rcu_assign_pointer(dev->br_port, NULL);

> 
> 	br_multicast_del_port(p);
>@@ -429,6 +430,11 @@ int br_add_if(struct net_bridge *br, str
> 		goto err2;
> 
> 	rcu_assign_pointer(dev->br_port, p);
>+
>+	err = netdev_rx_handler_register(dev, br_handle_frame);
>+	if (err)
>+		goto err3;
>+
> 	dev_disable_lro(dev);
> 
> 	list_add_rcu(&p->list, &br->port_list);
>@@ -451,6 +457,8 @@ int br_add_if(struct net_bridge *br, str
> 	br_netpoll_enable(br, dev);
> 
> 	return 0;
>+err3:
>+	dev->br_port = NULL;

I would like to see rcu_assing_pointer here too

> err2:
> 	br_fdb_delete_by_port(br, p, 1);
> err1:
>--- a/net/bridge/br_input.c	2010-05-28 08:35:11.858189945 -0700
>+++ b/net/bridge/br_input.c	2010-06-01 08:02:39.909740447 -0700
>@@ -131,15 +131,19 @@ static inline int is_link_local(const un
> }
> 
> /*
>- * Called via br_handle_frame_hook.
>  * Return NULL if skb is handled
>- * note: already called with rcu_read_lock (preempt_disabled)
>+ * note: already called with rcu_read_lock (preempt_disabled) from
>+ * netif_receive_skb
>  */
>-struct sk_buff *br_handle_frame(struct net_bridge_port *p, struct sk_buff *skb)
>+struct sk_buff *br_handle_frame(struct sk_buff *skb)
> {
>+	struct net_bridge_port *p;
> 	const unsigned char *dest = eth_hdr(skb)->h_dest;
> 	int (*rhook)(struct sk_buff *skb);
> 
>+	if (skb->pkt_type == PACKET_LOOPBACK)
>+		return skb;
>+
> 	if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
> 		goto drop;
> 
>@@ -147,6 +151,8 @@ struct sk_buff *br_handle_frame(struct n
> 	if (!skb)
> 		return NULL;
> 
>+	p = rcu_dereference(skb->dev->br_port);
>+
> 	if (unlikely(is_link_local(dest))) {
> 		/* Pause frames shouldn't be passed up by driver anyway */
> 		if (skb->protocol == htons(ETH_P_PAUSE))
>--- a/net/bridge/br_private.h	2010-05-28 08:35:11.858189945 -0700
>+++ b/net/bridge/br_private.h	2010-06-01 08:02:39.919725220 -0700
>@@ -331,8 +331,7 @@ extern void br_features_recompute(struct
> 
> /* br_input.c */
> extern int br_handle_frame_finish(struct sk_buff *skb);
>-extern struct sk_buff *br_handle_frame(struct net_bridge_port *p,
>-				       struct sk_buff *skb);
>+extern struct sk_buff *br_handle_frame(struct sk_buff *skb);
> 
> /* br_ioctl.c */
> extern int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
>--- a/net/core/dev.c	2010-05-28 08:41:38.678169590 -0700
>+++ b/net/core/dev.c	2010-06-01 08:16:38.563051750 -0700
>@@ -2581,70 +2581,14 @@ static inline int deliver_skb(struct sk_
> 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
> }
> 
>-#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
>-
>-#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
>+#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
>+    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
> /* This hook is defined here for ATM LANE */
> int (*br_fdb_test_addr_hook)(struct net_device *dev,
> 			     unsigned char *addr) __read_mostly;
> EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
> #endif
> 
>-/*
>- * If bridge module is loaded call bridging hook.
>- *  returns NULL if packet was consumed.
>- */
>-struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
>-					struct sk_buff *skb) __read_mostly;
>-EXPORT_SYMBOL_GPL(br_handle_frame_hook);
>-
>-static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
>-					    struct packet_type **pt_prev, int *ret,
>-					    struct net_device *orig_dev)
>-{
>-	struct net_bridge_port *port;
>-
>-	if (skb->pkt_type == PACKET_LOOPBACK ||
>-	    (port = rcu_dereference(skb->dev->br_port)) == NULL)
>-		return skb;
>-
>-	if (*pt_prev) {
>-		*ret = deliver_skb(skb, *pt_prev, orig_dev);
>-		*pt_prev = NULL;
>-	}
>-
>-	return br_handle_frame_hook(port, skb);
>-}
>-#else
>-#define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
>-#endif
>-
>-#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
>-struct sk_buff *(*macvlan_handle_frame_hook)(struct macvlan_port *p,
>-					     struct sk_buff *skb) __read_mostly;
>-EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
>-
>-static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
>-					     struct packet_type **pt_prev,
>-					     int *ret,
>-					     struct net_device *orig_dev)
>-{
>-	struct macvlan_port *port;
>-
>-	port = rcu_dereference(skb->dev->macvlan_port);
>-	if (!port)
>-		return skb;
>-
>-	if (*pt_prev) {
>-		*ret = deliver_skb(skb, *pt_prev, orig_dev);
>-		*pt_prev = NULL;
>-	}
>-	return macvlan_handle_frame_hook(port, skb);
>-}
>-#else
>-#define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
>-#endif
>-
> #ifdef CONFIG_NET_CLS_ACT
> /* TODO: Maybe we should just force sch_ingress to be compiled in
>  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
>@@ -2740,6 +2684,48 @@ void netif_nit_deliver(struct sk_buff *s
> 	rcu_read_unlock();
> }
> 
>+/**
>+ *	netdev_rx_handler_register - register receive handler
>+ *	@dev: device to register a handler for
>+ *	@rh: receive handler to register

Old comment

>+ *
>+ *	Register a receive hander for a device. This handler will then be
>+ *	called from __netif_receive_skb. A negative errno code is returned
>+ *	on a failure.
>+ *
>+ *	The caller must hold the rtnl_mutex.
>+ */
>+int netdev_rx_handler_register(struct net_device *dev,
>+			       rx_callback_func_t *hook)
>+{
>+	ASSERT_RTNL();
>+
>+	if (dev->rx_handler)
>+		return -EBUSY;
>+
>+	rcu_assign_pointer(dev->rx_handler, hook);
>+
>+	return 0;
>+}
>+EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
>+
>+/**
>+ *	netdev_rx_handler_unregister - unregister receive handler
>+ *	@dev: device to unregister a handler from
>+ *	@rh: receive handler to unregister
old comment

>+ *
>+ *	Unregister a receive hander from a device.
>+ *
>+ *	The caller must hold the rtnl_mutex.
>+ */
>+void netdev_rx_handler_unregister(struct net_device *dev)
>+{
>+
>+	ASSERT_RTNL();
>+	dev->rx_handler = NULL;

Also rcu_assign pointer would be nicer here.
>+}
>+EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
>+
> static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
> 					      struct net_device *master)
> {
>@@ -2792,6 +2778,7 @@ EXPORT_SYMBOL(__skb_bond_should_drop);
> static int __netif_receive_skb(struct sk_buff *skb)
> {
> 	struct packet_type *ptype, *pt_prev;
>+	rx_callback_func_t *rh;
> 	struct net_device *orig_dev;
> 	struct net_device *master;
> 	struct net_device *null_or_orig;
>@@ -2855,12 +2842,16 @@ static int __netif_receive_skb(struct sk
> ncls:
> #endif
> 
>-	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
>-	if (!skb)
>-		goto out;
>-	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
>-	if (!skb)
>-		goto out;
>+	/* Handle special case of bridge or macvlan */
>+	if ((rh = rcu_dereference(skb->dev->rx_handler)) != NULL) {
>+		if (pt_prev) {
>+			ret = deliver_skb(skb, pt_prev, orig_dev);
>+			pt_prev = NULL;
>+		}
>+		skb = rh(skb);
>+		if (!skb)
>+			goto out;
>+	}
> 
> 	/*
> 	 * Make sure frames received on VLAN interfaces stacked on

^ permalink raw reply

* Re: [PATCHv3] Refactor update of IPv6 flowi destination address for srcrt (RH) option
From: Arnaud Ebalard @ 2010-06-02  7:35 UTC (permalink / raw)
  To: David S. Miller
  Cc: Joe Perches, YOSHIFUJI Hideaki / 吉藤英明,
	netdev
In-Reply-To: <1275422925.19372.114.camel@Joe-Laptop.home>

Hi,

Here is a v3 based on the comments I received from Joe. The helper is
now in exthdrs.c (no more inline) and has a proper name.

Applies on net-2.6. Tested on 2.6.34.

Cheers,

a+

>From ec9c9695345f4821060a78bd3243cdfeaa852897 Mon Sep 17 00:00:00 2001
From: Arnaud Ebalard <arno@natisbad.org>
Date: Wed, 2 Jun 2010 09:25:27 +0200
Subject: [PATCH] Refactor update of IPv6 flowi destination address for srcrt (RH) option

There are more than a dozen occurrences of following code in the
IPv6 stack:

    if (opt && opt->srcrt) {
            struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
            ipv6_addr_copy(&final, &fl.fl6_dst);
            ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
            final_p = &final;
    }

Replace those with a helper. Note that the helper overrides final_p
in all cases. This is ok as final_p was previously initialized to
NULL when declared.

Signed-off-by: Arnaud Ebalard <arno@natisbad.org>
---
 include/net/ipv6.h               |    4 ++++
 net/dccp/ipv6.c                  |   30 ++++++------------------------
 net/ipv6/af_inet6.c              |    9 ++-------
 net/ipv6/datagram.c              |   18 ++++--------------
 net/ipv6/exthdrs.c               |   24 ++++++++++++++++++++++++
 net/ipv6/inet6_connection_sock.c |    9 ++-------
 net/ipv6/raw.c                   |   10 ++--------
 net/ipv6/syncookies.c            |    9 ++-------
 net/ipv6/tcp_ipv6.c              |   27 ++++++---------------------
 net/ipv6/udp.c                   |   11 +++--------
 10 files changed, 55 insertions(+), 96 deletions(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 2600b69..f5808d5 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -551,6 +551,10 @@ extern int 			ipv6_ext_hdr(u8 nexthdr);
 
 extern int ipv6_find_tlv(struct sk_buff *skb, int offset, int type);
 
+extern struct in6_addr *fl6_update_dst(struct flowi *fl,
+				       const struct ipv6_txoptions *opt,
+				       struct in6_addr *orig);
+
 /*
  *	socket options (ipv6_sockglue.c)
  */
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 0916988..6e3f325 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -248,7 +248,7 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req,
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct sk_buff *skb;
 	struct ipv6_txoptions *opt = NULL;
-	struct in6_addr *final_p = NULL, final;
+	struct in6_addr *final_p, final;
 	struct flowi fl;
 	int err = -1;
 	struct dst_entry *dst;
@@ -265,13 +265,7 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req,
 
 	opt = np->opt;
 
-	if (opt != NULL && opt->srcrt != NULL) {
-		const struct rt0_hdr *rt0 = (struct rt0_hdr *)opt->srcrt;
-
-		ipv6_addr_copy(&final, &fl.fl6_dst);
-		ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-		final_p = &final;
-	}
+	final_p = fl6_update_dst(&fl, opt, &final);
 
 	err = ip6_dst_lookup(sk, &dst, &fl);
 	if (err)
@@ -545,19 +539,13 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 		goto out_overflow;
 
 	if (dst == NULL) {
-		struct in6_addr *final_p = NULL, final;
+		struct in6_addr *final_p, final;
 		struct flowi fl;
 
 		memset(&fl, 0, sizeof(fl));
 		fl.proto = IPPROTO_DCCP;
 		ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
-		if (opt != NULL && opt->srcrt != NULL) {
-			const struct rt0_hdr *rt0 = (struct rt0_hdr *)opt->srcrt;
-
-			ipv6_addr_copy(&final, &fl.fl6_dst);
-			ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-			final_p = &final;
-		}
+		final_p = fl6_update_dst(&fl, opt, &final);
 		ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
 		fl.oif = sk->sk_bound_dev_if;
 		fl.fl_ip_dport = inet_rsk(req)->rmt_port;
@@ -885,7 +873,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	struct inet_sock *inet = inet_sk(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct dccp_sock *dp = dccp_sk(sk);
-	struct in6_addr *saddr = NULL, *final_p = NULL, final;
+	struct in6_addr *saddr = NULL, *final_p, final;
 	struct flowi fl;
 	struct dst_entry *dst;
 	int addr_type;
@@ -988,13 +976,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	fl.fl_ip_sport = inet->inet_sport;
 	security_sk_classify_flow(sk, &fl);
 
-	if (np->opt != NULL && np->opt->srcrt != NULL) {
-		const struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
-
-		ipv6_addr_copy(&final, &fl.fl6_dst);
-		ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-		final_p = &final;
-	}
+	final_p = fl6_update_dst(&fl, np->opt, &final);
 
 	err = ip6_dst_lookup(sk, &dst, &fl);
 	if (err)
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index e733942..94b1b9c 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -651,7 +651,7 @@ int inet6_sk_rebuild_header(struct sock *sk)
 
 	if (dst == NULL) {
 		struct inet_sock *inet = inet_sk(sk);
-		struct in6_addr *final_p = NULL, final;
+		struct in6_addr *final_p, final;
 		struct flowi fl;
 
 		memset(&fl, 0, sizeof(fl));
@@ -665,12 +665,7 @@ int inet6_sk_rebuild_header(struct sock *sk)
 		fl.fl_ip_sport = inet->inet_sport;
 		security_sk_classify_flow(sk, &fl);
 
-		if (np->opt && np->opt->srcrt) {
-			struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
-			ipv6_addr_copy(&final, &fl.fl6_dst);
-			ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-			final_p = &final;
-		}
+		final_p = fl6_update_dst(&fl, np->opt, &final);
 
 		err = ip6_dst_lookup(sk, &dst, &fl);
 		if (err) {
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 7126846..7d929a2 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -38,10 +38,11 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	struct sockaddr_in6	*usin = (struct sockaddr_in6 *) uaddr;
 	struct inet_sock      	*inet = inet_sk(sk);
 	struct ipv6_pinfo      	*np = inet6_sk(sk);
-	struct in6_addr		*daddr, *final_p = NULL, final;
+	struct in6_addr		*daddr, *final_p, final;
 	struct dst_entry	*dst;
 	struct flowi		fl;
 	struct ip6_flowlabel	*flowlabel = NULL;
+	struct ipv6_txoptions   *opt;
 	int			addr_type;
 	int			err;
 
@@ -155,19 +156,8 @@ ipv4_connected:
 
 	security_sk_classify_flow(sk, &fl);
 
-	if (flowlabel) {
-		if (flowlabel->opt && flowlabel->opt->srcrt) {
-			struct rt0_hdr *rt0 = (struct rt0_hdr *) flowlabel->opt->srcrt;
-			ipv6_addr_copy(&final, &fl.fl6_dst);
-			ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-			final_p = &final;
-		}
-	} else if (np->opt && np->opt->srcrt) {
-		struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
-		ipv6_addr_copy(&final, &fl.fl6_dst);
-		ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-		final_p = &final;
-	}
+	opt = flowlabel ? flowlabel->opt : np->opt;
+	final_p = fl6_update_dst(&fl, opt, &final);
 
 	err = ip6_dst_lookup(sk, &dst, &fl);
 	if (err)
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 8a659f9..853a633 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -874,3 +874,27 @@ struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
 	return opt;
 }
 
+/**
+ * fl6_update_dst - update flowi destination address with info given
+ *                  by srcrt option, if any.
+ *
+ * @fl: flowi for which fl6_dst is to be updated
+ * @opt: struct ipv6_txoptions in which to look for srcrt opt
+ * @orig: copy of original fl6_dst address if modified
+ *
+ * Returns NULL if no txoptions or no srcrt, otherwise returns orig
+ * and initial value of fl->fl6_dst set in orig
+ */
+struct in6_addr *fl6_update_dst(struct flowi *fl,
+				const struct ipv6_txoptions *opt,
+				struct in6_addr *orig)
+{
+	if (!opt || !opt->srcrt)
+		return NULL;
+
+	ipv6_addr_copy(orig, &fl->fl6_dst);
+	ipv6_addr_copy(&fl->fl6_dst, ((struct rt0_hdr *)opt->srcrt)->addr);
+	return orig;
+}
+
+EXPORT_SYMBOL_GPL(fl6_update_dst);
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 0c5e3c3..8a16280 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -185,7 +185,7 @@ int inet6_csk_xmit(struct sk_buff *skb)
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct flowi fl;
 	struct dst_entry *dst;
-	struct in6_addr *final_p = NULL, final;
+	struct in6_addr *final_p, final;
 
 	memset(&fl, 0, sizeof(fl));
 	fl.proto = sk->sk_protocol;
@@ -199,12 +199,7 @@ int inet6_csk_xmit(struct sk_buff *skb)
 	fl.fl_ip_dport = inet->inet_dport;
 	security_sk_classify_flow(sk, &fl);
 
-	if (np->opt && np->opt->srcrt) {
-		struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
-		ipv6_addr_copy(&final, &fl.fl6_dst);
-		ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-		final_p = &final;
-	}
+	final_p = fl6_update_dst(&fl, np->opt, &final);
 
 	dst = __inet6_csk_dst_check(sk, np->dst_cookie);
 
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 4a4dcbe..864eb8e 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -725,7 +725,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
 {
 	struct ipv6_txoptions opt_space;
 	struct sockaddr_in6 * sin6 = (struct sockaddr_in6 *) msg->msg_name;
-	struct in6_addr *daddr, *final_p = NULL, final;
+	struct in6_addr *daddr, *final_p, final;
 	struct inet_sock *inet = inet_sk(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct raw6_sock *rp = raw6_sk(sk);
@@ -847,13 +847,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
 	if (ipv6_addr_any(&fl.fl6_src) && !ipv6_addr_any(&np->saddr))
 		ipv6_addr_copy(&fl.fl6_src, &np->saddr);
 
-	/* merge ip6_build_xmit from ip6_output */
-	if (opt && opt->srcrt) {
-		struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
-		ipv6_addr_copy(&final, &fl.fl6_dst);
-		ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-		final_p = &final;
-	}
+	final_p = fl6_update_dst(&fl, opt, &final);
 
 	if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst))
 		fl.oif = np->mcast_oif;
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 34d1f06..1238370 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -240,17 +240,12 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	 * me if there is a preferred way.
 	 */
 	{
-		struct in6_addr *final_p = NULL, final;
+		struct in6_addr *final_p, final;
 		struct flowi fl;
 		memset(&fl, 0, sizeof(fl));
 		fl.proto = IPPROTO_TCP;
 		ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
-		if (np->opt && np->opt->srcrt) {
-			struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
-			ipv6_addr_copy(&final, &fl.fl6_dst);
-			ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-			final_p = &final;
-		}
+		final_p = fl6_update_dst(&fl, np->opt, &final);
 		ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
 		fl.oif = sk->sk_bound_dev_if;
 		fl.mark = sk->sk_mark;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2b7c3a1..e487080 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -129,7 +129,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct in6_addr *saddr = NULL, *final_p = NULL, final;
+	struct in6_addr *saddr = NULL, *final_p, final;
 	struct flowi fl;
 	struct dst_entry *dst;
 	int addr_type;
@@ -250,12 +250,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	fl.fl_ip_dport = usin->sin6_port;
 	fl.fl_ip_sport = inet->inet_sport;
 
-	if (np->opt && np->opt->srcrt) {
-		struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
-		ipv6_addr_copy(&final, &fl.fl6_dst);
-		ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-		final_p = &final;
-	}
+	final_p = fl6_update_dst(&fl, np->opt, &final);
 
 	security_sk_classify_flow(sk, &fl);
 
@@ -477,7 +472,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct sk_buff * skb;
 	struct ipv6_txoptions *opt = NULL;
-	struct in6_addr * final_p = NULL, final;
+	struct in6_addr * final_p, final;
 	struct flowi fl;
 	struct dst_entry *dst;
 	int err = -1;
@@ -494,12 +489,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
 	security_req_classify_flow(req, &fl);
 
 	opt = np->opt;
-	if (opt && opt->srcrt) {
-		struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
-		ipv6_addr_copy(&final, &fl.fl6_dst);
-		ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-		final_p = &final;
-	}
+	final_p = fl6_update_dst(&fl, opt, &final);
 
 	err = ip6_dst_lookup(sk, &dst, &fl);
 	if (err)
@@ -1392,18 +1382,13 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		goto out_overflow;
 
 	if (dst == NULL) {
-		struct in6_addr *final_p = NULL, final;
+		struct in6_addr *final_p, final;
 		struct flowi fl;
 
 		memset(&fl, 0, sizeof(fl));
 		fl.proto = IPPROTO_TCP;
 		ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr);
-		if (opt && opt->srcrt) {
-			struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
-			ipv6_addr_copy(&final, &fl.fl6_dst);
-			ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-			final_p = &final;
-		}
+		final_p = fl6_update_dst(&fl, opt, &final);
 		ipv6_addr_copy(&fl.fl6_src, &treq->loc_addr);
 		fl.oif = sk->sk_bound_dev_if;
 		fl.mark = sk->sk_mark;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 87be586..1dd1aff 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -927,7 +927,7 @@ int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk,
 	struct inet_sock *inet = inet_sk(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) msg->msg_name;
-	struct in6_addr *daddr, *final_p = NULL, final;
+	struct in6_addr *daddr, *final_p, final;
 	struct ipv6_txoptions *opt = NULL;
 	struct ip6_flowlabel *flowlabel = NULL;
 	struct flowi fl;
@@ -1097,14 +1097,9 @@ do_udp_sendmsg:
 		ipv6_addr_copy(&fl.fl6_src, &np->saddr);
 	fl.fl_ip_sport = inet->inet_sport;
 
-	/* merge ip6_build_xmit from ip6_output */
-	if (opt && opt->srcrt) {
-		struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
-		ipv6_addr_copy(&final, &fl.fl6_dst);
-		ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-		final_p = &final;
+	final_p = fl6_update_dst(&fl, opt, &final);
+	if (final_p)
 		connected = 0;
-	}
 
 	if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) {
 		fl.oif = np->mcast_oif;
-- 
1.7.1


^ permalink raw reply related

* [PATCH net-next-2.6] net: replace hooks in __netif_receive_skb V5
From: Jiri Pirko @ 2010-06-02  7:52 UTC (permalink / raw)
  To: netdev; +Cc: davem, kaber, eric.dumazet, shemminger
In-Reply-To: <20100601082805.1c84b16d@nehalam>

What this patch does is it removes two receive frame hooks (for bridge and for
macvlan) from __netif_receive_skb. These are replaced them with a single
hook for both. It only supports one hook per device because it makes no
sense to do bridging and macvlan on the same device.

Then a network driver (of virtual netdev like macvlan or bridge) can register
an rx_handler for needed net device.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
---
v4->v5
	- do rcu_assign_pointer even for assigning null for rcu-protected
	  pointers 
	- cosmetics (renames, comments and __netif_receive_skb if part)

 drivers/net/macvlan.c      |   19 +++++--
 include/linux/if_bridge.h  |    2 -
 include/linux/if_macvlan.h |    4 --
 include/linux/netdevice.h  |    7 +++
 net/bridge/br.c            |    2 -
 net/bridge/br_if.c         |    8 +++
 net/bridge/br_input.c      |   12 +++-
 net/bridge/br_private.h    |    3 +-
 net/core/dev.c             |  119 ++++++++++++++++++++-----------------------
 9 files changed, 93 insertions(+), 83 deletions(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 87e8d4c..53422ce 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -145,15 +145,16 @@ static void macvlan_broadcast(struct sk_buff *skb,
 }
 
 /* called under rcu_read_lock() from netif_receive_skb */
-static struct sk_buff *macvlan_handle_frame(struct macvlan_port *port,
-					    struct sk_buff *skb)
+static struct sk_buff *macvlan_handle_frame(struct sk_buff *skb)
 {
+	struct macvlan_port *port;
 	const struct ethhdr *eth = eth_hdr(skb);
 	const struct macvlan_dev *vlan;
 	const struct macvlan_dev *src;
 	struct net_device *dev;
 	unsigned int len;
 
+	port = rcu_dereference(skb->dev->macvlan_port);
 	if (is_multicast_ether_addr(eth->h_dest)) {
 		src = macvlan_hash_lookup(port, eth->h_source);
 		if (!src)
@@ -515,6 +516,7 @@ static int macvlan_port_create(struct net_device *dev)
 {
 	struct macvlan_port *port;
 	unsigned int i;
+	int err;
 
 	if (dev->type != ARPHRD_ETHER || dev->flags & IFF_LOOPBACK)
 		return -EINVAL;
@@ -528,13 +530,21 @@ static int macvlan_port_create(struct net_device *dev)
 	for (i = 0; i < MACVLAN_HASH_SIZE; i++)
 		INIT_HLIST_HEAD(&port->vlan_hash[i]);
 	rcu_assign_pointer(dev->macvlan_port, port);
-	return 0;
+
+	err = netdev_rx_handler_register(dev, macvlan_handle_frame);
+	if (err) {
+		rcu_assign_pointer(dev->macvlan_port, NULL);
+		kfree(port);
+	}
+
+	return err;
 }
 
 static void macvlan_port_destroy(struct net_device *dev)
 {
 	struct macvlan_port *port = dev->macvlan_port;
 
+	netdev_rx_handler_unregister(dev);
 	rcu_assign_pointer(dev->macvlan_port, NULL);
 	synchronize_rcu();
 	kfree(port);
@@ -767,14 +777,12 @@ static int __init macvlan_init_module(void)
 	int err;
 
 	register_netdevice_notifier(&macvlan_notifier_block);
-	macvlan_handle_frame_hook = macvlan_handle_frame;
 
 	err = macvlan_link_register(&macvlan_link_ops);
 	if (err < 0)
 		goto err1;
 	return 0;
 err1:
-	macvlan_handle_frame_hook = NULL;
 	unregister_netdevice_notifier(&macvlan_notifier_block);
 	return err;
 }
@@ -782,7 +790,6 @@ err1:
 static void __exit macvlan_cleanup_module(void)
 {
 	rtnl_link_unregister(&macvlan_link_ops);
-	macvlan_handle_frame_hook = NULL;
 	unregister_netdevice_notifier(&macvlan_notifier_block);
 }
 
diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 938b7e8..0d241a5 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -102,8 +102,6 @@ struct __fdb_entry {
 #include <linux/netdevice.h>
 
 extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *));
-extern struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
-					       struct sk_buff *skb);
 extern int (*br_should_route_hook)(struct sk_buff *skb);
 
 #endif
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index 9ea047a..c26a0e4 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -84,8 +84,4 @@ extern int macvlan_link_register(struct rtnl_link_ops *ops);
 extern netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
 				      struct net_device *dev);
 
-
-extern struct sk_buff *(*macvlan_handle_frame_hook)(struct macvlan_port *,
-						    struct sk_buff *);
-
 #endif /* _LINUX_IF_MACVLAN_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a249161..244494f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -381,6 +381,8 @@ enum gro_result {
 };
 typedef enum gro_result gro_result_t;
 
+typedef struct sk_buff *rx_handler_func_t(struct sk_buff *skb);
+
 extern void __napi_schedule(struct napi_struct *n);
 
 static inline int napi_disable_pending(struct napi_struct *n)
@@ -957,6 +959,7 @@ struct net_device {
 #endif
 
 	struct netdev_queue	rx_queue;
+	rx_handler_func_t	*rx_handler;
 
 	struct netdev_queue	*_tx ____cacheline_aligned_in_smp;
 
@@ -1693,6 +1696,10 @@ static inline void napi_free_frags(struct napi_struct *napi)
 	napi->skb = NULL;
 }
 
+extern int netdev_rx_handler_register(struct net_device *dev,
+				      rx_handler_func_t *rx_handler);
+extern void netdev_rx_handler_unregister(struct net_device *dev);
+
 extern void		netif_nit_deliver(struct sk_buff *skb);
 extern int		dev_valid_name(const char *name);
 extern int		dev_ioctl(struct net *net, unsigned int cmd, void __user *);
diff --git a/net/bridge/br.c b/net/bridge/br.c
index 76357b5..c8436fa 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -63,7 +63,6 @@ static int __init br_init(void)
 		goto err_out4;
 
 	brioctl_set(br_ioctl_deviceless_stub);
-	br_handle_frame_hook = br_handle_frame;
 
 #if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
 	br_fdb_test_addr_hook = br_fdb_test_addr;
@@ -100,7 +99,6 @@ static void __exit br_deinit(void)
 	br_fdb_test_addr_hook = NULL;
 #endif
 
-	br_handle_frame_hook = NULL;
 	br_fdb_fini();
 }
 
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 18b245e..d924234 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -147,6 +147,7 @@ static void del_nbp(struct net_bridge_port *p)
 
 	list_del_rcu(&p->list);
 
+	netdev_rx_handler_unregister(dev);
 	rcu_assign_pointer(dev->br_port, NULL);
 
 	br_multicast_del_port(p);
@@ -429,6 +430,11 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 		goto err2;
 
 	rcu_assign_pointer(dev->br_port, p);
+
+	err = netdev_rx_handler_register(dev, br_handle_frame);
+	if (err)
+		goto err3;
+
 	dev_disable_lro(dev);
 
 	list_add_rcu(&p->list, &br->port_list);
@@ -451,6 +457,8 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 	br_netpoll_enable(br, dev);
 
 	return 0;
+err3:
+	rcu_assign_pointer(dev->br_port, NULL);
 err2:
 	br_fdb_delete_by_port(br, p, 1);
 err1:
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index d36e700..99647d8 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -131,15 +131,19 @@ static inline int is_link_local(const unsigned char *dest)
 }
 
 /*
- * Called via br_handle_frame_hook.
  * Return NULL if skb is handled
- * note: already called with rcu_read_lock (preempt_disabled)
+ * note: already called with rcu_read_lock (preempt_disabled) from
+ * netif_receive_skb
  */
-struct sk_buff *br_handle_frame(struct net_bridge_port *p, struct sk_buff *skb)
+struct sk_buff *br_handle_frame(struct sk_buff *skb)
 {
+	struct net_bridge_port *p;
 	const unsigned char *dest = eth_hdr(skb)->h_dest;
 	int (*rhook)(struct sk_buff *skb);
 
+	if (skb->pkt_type == PACKET_LOOPBACK)
+		return skb;
+
 	if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
 		goto drop;
 
@@ -147,6 +151,8 @@ struct sk_buff *br_handle_frame(struct net_bridge_port *p, struct sk_buff *skb)
 	if (!skb)
 		return NULL;
 
+	p = rcu_dereference(skb->dev->br_port);
+
 	if (unlikely(is_link_local(dest))) {
 		/* Pause frames shouldn't be passed up by driver anyway */
 		if (skb->protocol == htons(ETH_P_PAUSE))
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 0f4a74b..c83519b 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -331,8 +331,7 @@ extern void br_features_recompute(struct net_bridge *br);
 
 /* br_input.c */
 extern int br_handle_frame_finish(struct sk_buff *skb);
-extern struct sk_buff *br_handle_frame(struct net_bridge_port *p,
-				       struct sk_buff *skb);
+extern struct sk_buff *br_handle_frame(struct sk_buff *skb);
 
 /* br_ioctl.c */
 extern int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
diff --git a/net/core/dev.c b/net/core/dev.c
index 983a3c1..4a02b58 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2583,70 +2583,14 @@ static inline int deliver_skb(struct sk_buff *skb,
 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 }
 
-#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
-
-#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
+#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
+    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
 /* This hook is defined here for ATM LANE */
 int (*br_fdb_test_addr_hook)(struct net_device *dev,
 			     unsigned char *addr) __read_mostly;
 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
 #endif
 
-/*
- * If bridge module is loaded call bridging hook.
- *  returns NULL if packet was consumed.
- */
-struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
-					struct sk_buff *skb) __read_mostly;
-EXPORT_SYMBOL_GPL(br_handle_frame_hook);
-
-static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
-					    struct packet_type **pt_prev, int *ret,
-					    struct net_device *orig_dev)
-{
-	struct net_bridge_port *port;
-
-	if (skb->pkt_type == PACKET_LOOPBACK ||
-	    (port = rcu_dereference(skb->dev->br_port)) == NULL)
-		return skb;
-
-	if (*pt_prev) {
-		*ret = deliver_skb(skb, *pt_prev, orig_dev);
-		*pt_prev = NULL;
-	}
-
-	return br_handle_frame_hook(port, skb);
-}
-#else
-#define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
-#endif
-
-#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
-struct sk_buff *(*macvlan_handle_frame_hook)(struct macvlan_port *p,
-					     struct sk_buff *skb) __read_mostly;
-EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
-
-static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
-					     struct packet_type **pt_prev,
-					     int *ret,
-					     struct net_device *orig_dev)
-{
-	struct macvlan_port *port;
-
-	port = rcu_dereference(skb->dev->macvlan_port);
-	if (!port)
-		return skb;
-
-	if (*pt_prev) {
-		*ret = deliver_skb(skb, *pt_prev, orig_dev);
-		*pt_prev = NULL;
-	}
-	return macvlan_handle_frame_hook(port, skb);
-}
-#else
-#define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
-#endif
-
 #ifdef CONFIG_NET_CLS_ACT
 /* TODO: Maybe we should just force sch_ingress to be compiled in
  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
@@ -2742,6 +2686,47 @@ void netif_nit_deliver(struct sk_buff *skb)
 	rcu_read_unlock();
 }
 
+/**
+ *	netdev_rx_handler_register - register receive handler
+ *	@dev: device to register a handler for
+ *	@rx_handler: receive handler to register
+ *
+ *	Register a receive hander for a device. This handler will then be
+ *	called from __netif_receive_skb. A negative errno code is returned
+ *	on a failure.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+int netdev_rx_handler_register(struct net_device *dev,
+			       rx_handler_func_t *rx_handler)
+{
+	ASSERT_RTNL();
+
+	if (dev->rx_handler)
+		return -EBUSY;
+
+	rcu_assign_pointer(dev->rx_handler, rx_handler);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
+
+/**
+ *	netdev_rx_handler_unregister - unregister receive handler
+ *	@dev: device to unregister a handler from
+ *
+ *	Unregister a receive hander from a device.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+void netdev_rx_handler_unregister(struct net_device *dev)
+{
+
+	ASSERT_RTNL();
+	rcu_assign_pointer(dev->rx_handler, NULL);
+}
+EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
+
 static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
 					      struct net_device *master)
 {
@@ -2794,6 +2779,7 @@ EXPORT_SYMBOL(__skb_bond_should_drop);
 static int __netif_receive_skb(struct sk_buff *skb)
 {
 	struct packet_type *ptype, *pt_prev;
+	rx_handler_func_t *rx_handler;
 	struct net_device *orig_dev;
 	struct net_device *master;
 	struct net_device *null_or_orig;
@@ -2856,12 +2842,17 @@ static int __netif_receive_skb(struct sk_buff *skb)
 ncls:
 #endif
 
-	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
-	if (!skb)
-		goto out;
-	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
-	if (!skb)
-		goto out;
+	/* Handle special case of bridge or macvlan */
+	rx_handler = rcu_dereference(skb->dev->rx_handler);
+	if (rx_handler) {
+		if (pt_prev) {
+			ret = deliver_skb(skb, pt_prev, orig_dev);
+			pt_prev = NULL;
+		}
+		skb = rx_handler(skb);
+		if (!skb)
+			goto out;
+	}
 
 	/*
 	 * Make sure frames received on VLAN interfaces stacked on
-- 
1.7.0.1


^ permalink raw reply related

* Re: pull request: wireless-2.6 2010-05-28
From: Sedat Dilek @ 2010-06-02  8:42 UTC (permalink / raw)
  To: reinette chatre
  Cc: John W. Linville, davem@davemloft.net,
	linux-wireless@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, Zheng, Jiajia, Kolekar, Abhijeet,
	Berg, Johannes
In-Reply-To: <1275409361.2091.27208.camel@rchatre-DESK>

On Tue, Jun 1, 2010 at 6:22 PM, reinette chatre
<reinette.chatre@intel.com> wrote:
> Hi Sedat,
>
> On Sat, 2010-05-29 at 06:34 -0700, Sedat Dilek wrote:
>> Two people confirmed the patch in [2] fixes:
>> 1. iwlwifi-2.6 GIT master (commit f10a237c95abd6d64a3a24553bd1d3bcddd9108b)
>> 2. compat-wireless (2010-05-21)
>>
>> And it fixes also the above mentionned combination.
>
> We are working on getting this patch upstream. The version tested
> contains a significant amount of duplicated code as well as some cleanup
> code mixed in. It is thus not appropriate for an upstream submission at
> this time.
>

What do you mean by "upstream"?
upstream for me is Linus-tree (linux-2.6 GIT master).
Patches against iwlwifi-2.6 GIT trees are not very helpful in my case.

>> As a suggestion:
>> What about "copying" bug-reports (incl. its history) from IWL-BTS into
>> linux-wireless ML?
>> For example (dri-devel related) bug-reports from
>> bugzilla.freedesktop.org are "copied" into dri-devel ML.
>
> Pointing people to the bug report seems to work ok. What benefit does
> the above have?
>

[+] People - not subscribed to the BR - can follow "on change".
[-] Blow up linux-wireless ML?

- Sedat -

^ permalink raw reply

* Re: [PATCH 00/12] sfc changes for 2.6.36
From: David Miller @ 2010-06-02  9:21 UTC (permalink / raw)
  To: bhutchings; +Cc: netdev, linux-net-drivers
In-Reply-To: <1275426967.2114.25.camel@achroite.uk.solarflarecom.com>

From: Ben Hutchings <bhutchings@solarflare.com>
Date: Tue, 01 Jun 2010 22:16:07 +0100

> The major feature here is RX buffer recycling, which improves
> performance on networks with heavy multicast traffic.  Other than that,
> there are various bug fixes and cleanup.
> 
> Ben.
> 
> Ben Hutchings (3):
>   sfc: Rename struct efx_mcdi_phy_cfg to efx_mcdi_phy_data
>   sfc: Only count bad packets in rx_errors
>   sfc: Get port number from CS_PORT_NUM, not PCI function number
> 
> Steve Hodgson (9):
>   sfc: Reschedule any resets scheduled inside efx_pm_freeze()
>   sfc: Workaround flush failures on Falcon B0
>   sfc: Synchronise link_advertising and wanted_fc on Siena
>   sfc: Wait for the link to stay up before running loopback selftest
>   sfc: Allow DRV_GEN events to be used outside of selftests
>   sfc: Remove efx_rx_queue::add_lock
>   sfc: Support only two rx buffers per page
>   sfc: Recycle discarded rx buffers back onto the queue
>   sfc: Allow shared pages to be recycled

ALl applied, thanks Ben.

^ permalink raw reply

* Re: [PATCH net-next-2.6 1/2] qlcnic: NIC Partitioning - Add basic infrastructure support
From: David Miller @ 2010-06-02  9:23 UTC (permalink / raw)
  To: anirban.chakraborty; +Cc: netdev, amit.salecha, ameen.rahman
In-Reply-To: <alpine.OSX.2.00.1006011422230.45836@macintosh-2.qlogic.org>

From: Anirban Chakraborty <anirban.chakraborty@qlogic.com>
Date: Tue, 1 Jun 2010 14:28:51 -0700

> Following changes have been added to enable the adapter to work in
> NIC partitioning mode where multiple PCI functions of an adapter port can
> be configured to work as NIC functions. The first function that is enumerated on
> the PCI bus assumes the role of management function which, besides being able
> to do all the NIC functionality, can configure other NIC partitions. Other NIC
> functions can be configured as privileged or non privileged functions.
> Privileged function can not configure other NIC functions but can do all the
> NIC functionality including any firmware initialization, chip reset etc. Non
> privileged functions can do only basic IO. For chip reset etc, it depends on the
> privilege or management function.
> 
> 1. Added code to determine PCI function number independent of kernel API.
> 2. Added Driver - FW version 2.0 support.
> 3. Changed producer and consumer register offset calculation.
> 4. Added management and privileged operation modes for npar functions. A module
>  parameter has been added to control it.
> 5. Added support for configuring the eswitch in the adapter.
> 
> Signed-off-by: Anirban Chakraborty <anirban.chakraborty@qlogic.com>

Applied.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox