Netdev List
 help / color / mirror / Atom feed
* [PATCH v3 1/6] net: sh_eth: remove unnecessary function
From: Shimoda, Yoshihiro @ 2012-05-14  6:46 UTC (permalink / raw)
  To: netdev; +Cc: SH-Linux

sh_eth_timer() does nothing except call mod_timer() to re-arm itself.
So, this patch removes the function and the timer that drives it.

Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
---
 about v3:
  - this patch is the same as v2

 drivers/net/ethernet/renesas/sh_eth.c |   22 ----------------------
 drivers/net/ethernet/renesas/sh_eth.h |    1 -
 2 files changed, 0 insertions(+), 23 deletions(-)

diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index be3c221..c35084a 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -1289,14 +1289,6 @@ other_irq:
 	return ret;
 }

-static void sh_eth_timer(unsigned long data)
-{
-	struct net_device *ndev = (struct net_device *)data;
-	struct sh_eth_private *mdp = netdev_priv(ndev);
-
-	mod_timer(&mdp->timer, jiffies + (10 * HZ));
-}
-
 /* PHY state control function */
 static void sh_eth_adjust_link(struct net_device *ndev)
 {
@@ -1544,11 +1536,6 @@ static int sh_eth_open(struct net_device *ndev)
 	if (ret)
 		goto out_free_irq;

-	/* Set the timer to check for link beat. */
-	init_timer(&mdp->timer);
-	mdp->timer.expires = (jiffies + (24 * HZ)) / 10;/* 2.4 sec. */
-	setup_timer(&mdp->timer, sh_eth_timer, (unsigned long)ndev);
-
 	return ret;

 out_free_irq:
@@ -1573,9 +1560,6 @@ static void sh_eth_tx_timeout(struct net_device *ndev)
 	/* tx_errors count up */
 	ndev->stats.tx_errors++;

-	/* timer off */
-	del_timer_sync(&mdp->timer);
-
 	/* Free all the skbuffs in the Rx queue. */
 	for (i = 0; i < RX_RING_SIZE; i++) {
 		rxdesc = &mdp->rx_ring[i];
@@ -1593,10 +1577,6 @@ static void sh_eth_tx_timeout(struct net_device *ndev)

 	/* device init */
 	sh_eth_dev_init(ndev);
-
-	/* timer on */
-	mdp->timer.expires = (jiffies + (24 * HZ)) / 10;/* 2.4 sec. */
-	add_timer(&mdp->timer);
 }

 /* Packet transmit function */
@@ -1669,8 +1649,6 @@ static int sh_eth_close(struct net_device *ndev)

 	free_irq(ndev->irq, ndev);

-	del_timer_sync(&mdp->timer);
-
 	/* Free all the skbuffs in the Rx queue. */
 	sh_eth_ring_free(ndev);

diff --git a/drivers/net/ethernet/renesas/sh_eth.h b/drivers/net/ethernet/renesas/sh_eth.h
index 57b8e1f..9e8e4d5 100644
--- a/drivers/net/ethernet/renesas/sh_eth.h
+++ b/drivers/net/ethernet/renesas/sh_eth.h
@@ -771,7 +771,6 @@ struct sh_eth_private {
 	struct sh_eth_txdesc *tx_ring;
 	struct sk_buff **rx_skbuff;
 	struct sk_buff **tx_skbuff;
-	struct timer_list timer;
 	spinlock_t lock;
 	u32 cur_rx, dirty_rx;	/* Producer/consumer ring indices */
 	u32 cur_tx, dirty_tx;
-- 
1.7.1

^ permalink raw reply related

* Re: [PATCH v2 5/5] net: sh_eth: use NAPI
From: Shimoda, Yoshihiro @ 2012-05-14  6:13 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: netdev, SH-Linux
In-Reply-To: <1336750529.2874.69.camel@bwh-desktop.uk.solarflarecom.com>

2012/05/12 0:35, Ben Hutchings wrote:
> On Fri, 2012-05-11 at 17:38 +0900, Shimoda, Yoshihiro wrote:
>> This patch modifies the driver to use NAPI.
> [...]
>> +static int sh_eth_poll(struct napi_struct *napi, int budget)
>> +{
< snip >
>> +	/* check whether the controller doesn't have any events */
>> +	if (!txfree_num && !(intr_status & cd->eesr_err_check) &&
>> +	    work_done < budget) {
>> +		napi_complete(napi);
> 
> If and only if you return a value less than the budget then you *must*
> call napi_complete().  You can't add these extra conditions.

Thank you for the point. I will fix it.

< snip >
> 
> You will also need to call napi_disable() and napi_enable() in the
> set_ringparam implementation.

I will add the code in sh_eth_set_ringparam().

> Ben.
> 

Best regards,
Yoshihiro Shimoda

^ permalink raw reply

* Re: [PATCH v2 4/5] net: sh_eth: add support for set_ringparam/get_ringparam
From: Shimoda, Yoshihiro @ 2012-05-14  6:13 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: netdev, SH-Linux
In-Reply-To: <1336750030.2874.65.camel@bwh-desktop.uk.solarflarecom.com>

2012/05/12 0:27, Ben Hutchings wrote:
> On Fri, 2012-05-11 at 17:38 +0900, Shimoda, Yoshihiro wrote:
>> This patch supports the ethtool's set_ringparam() and get_ringparam().
> [...]
>> +static int sh_eth_set_ringparam(struct net_device *ndev,
>> +				struct ethtool_ringparam *ring)
>> +{
< snip >
>> +	if (netif_running(ndev)) {
>> +		/* Disable interrupts by clearing the interrupt mask. */
>> +		sh_eth_write(ndev, 0x0000, EESIPR);
>> +		/* Stop the chip's Tx and Rx processes. */
>> +		sh_eth_write(ndev, 0, EDTRR);
>> +		sh_eth_write(ndev, 0, EDRRR);
> 
> You'll also need to call netif_tx_disable() and synchronize_irq().

Thank you for the point. I will fix it.

>> +	sh_eth_ring_init(ndev);
>> +	sh_eth_dev_init(ndev);
> 
> And what if either of these fails?

Oh, I should have added error handling.
I checked this driver, and I found that the driver should change
the value of the pointer to NULL. So, I will submit such a patch first.

>> +	if (netif_running(ndev)) {
>> +		sh_eth_write(ndev, mdp->cd->eesipr_value, EESIPR);
>> +		/* Setting the Rx mode will start the Rx process. */
>> +		sh_eth_write(ndev, EDRRR_R, EDRRR);
> 
> You'll need to call netif_wake_queue().

I will fix it.

>> +	}
>> +
>> +	return 0;
>> +}
> [...]
> 
> Ben.
> 

Best regards,
Yoshihiro Shimoda

^ permalink raw reply

* Dedicated server rental
From: eric @ 2012-05-14  4:14 UTC (permalink / raw)


Dear Sir/Madam,

Please see below our Server Rental Package,

HKD1980/month + one-time setup fee HKD2000

Dell(TM) PowerEdge(TM) Enterprise Rack Mount Server
-          Intel(R) Xeon(R) E3-1220 Processor (3.10GHz, 8M Cache, Turbo, 4C/4T, 80W)
-          4GB RAM, 1x4GB, 1333MHz, DDR-3, Single Ranked UDIMMs
-          500GB, 3.5", SATA II x 1 Enterprise Hard Disk
-          Remote KVM (iDRAC6 Enterprise)
-          8x SATA slim DVD-ROM Drive
-          Dell OpenManage
-          Broadcom 5716 dual-port Gigabit Ethernet without TOE enabled
-          Dell BMC Info Mod

Software Specification
-     CentOS 5/6 Linux Operating System


Data Center Facilities
-          1Gbps Share Internet Connectivity with 10/BASE-T
-          One IP Address Allocation
-          Un-interruptible Power Supply (UPS) backed up by private diesel generator
-          FM200 gas-based fire suppression system
-          24x7 CRAC Air Conditioning and Humidity Control
-          24x7 Security Control

Should you have any query, please feel free to contact me. Thanks.

Best,
Eric Chan
Senior Sales Executive

If you do not wish to receive further messages like this one, email subscriber@dedicatedserver.com.hk to unsubscribe.

^ permalink raw reply

* PACKET_LOSS socket option for packet_mmap
From: nn6eumtr @ 2012-05-14  4:10 UTC (permalink / raw)
  To: netdev

What does the PACKET_LOSS socket option for packet_mmap do?

I've seen some sample code setting the option, but couldn't find an 
explanation of what it does.

^ permalink raw reply

* [GIT] Networking
From: David Miller @ 2012-05-14  4:05 UTC (permalink / raw)
  To: torvalds; +Cc: akpm, netdev, linux-kernel


The main purpose of this pull request is to fix up the erroneous
bonding patch I applied last round.  I meant to apply v4 of the patch
from Jiri but I applied v3 by accident.  Mea culpa.

Also, eagle eyed Dan Carpenter noticed that openvswitch has one of
those "X = alloc(); if (!Y)" mistakes, test the proper pointer
instead.

Please pull, thanks a lot!

The following changes since commit cf00c55e3d30b242d6f6530e61a7bc828124f0a3:

  Merge tag 'scsi-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi (2012-05-12 13:02:31 -0700)

are available in the git repository at:


  git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git master

for you to fetch changes up to 8aa51d64c1f526e43b1e7f89fb8b98c2fd583f4b:

  openvswitch: checking wrong variable in queue_userspace_packet() (2012-05-13 15:47:34 -0400)

----------------------------------------------------------------
Dan Carpenter (1):
      openvswitch: checking wrong variable in queue_userspace_packet()

David S. Miller (1):
      bonding: Fix LACPDU rx_dropped commit.

 drivers/net/bonding/bond_alb.c |   12 +++++++-----
 drivers/net/bonding/bonding.h  |    2 +-
 net/openvswitch/datapath.c     |    2 +-
 3 files changed, 9 insertions(+), 7 deletions(-)

^ permalink raw reply

* [PATCH] ipv6: fix incorrect ipsec transport mode fragment
From: Gao feng @ 2012-05-14  3:21 UTC (permalink / raw)
  To: netdev; +Cc: steffen.klassert, davem, lw, Gao feng
In-Reply-To: <20120215104021.GC31660@secunet.com>

Since commit 299b0767 (ipv6: Fix IPsec slowpath fragmentation problem),
IPsec transport mode packets are fragmented incorrectly. Tunnel mode
needs the IPsec headers and trailer on every fragment, whereas in
transport mode it is sufficient to add the headers to the first
fragment and the trailer to the last.

So adjust mtu and maxfraglen based on the IPsec mode and on whether the
fragment is the first or the last.

In my testing it works well and does not trigger the slow fragment path.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
---
 net/ipv6/ip6_output.c |   80 +++++++++++++++++++++++++++++++++++++-----------
 1 files changed, 61 insertions(+), 19 deletions(-)

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index b7ca461..9416887 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1191,19 +1191,23 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct inet_cork *cork;
 	struct sk_buff *skb;
-	unsigned int maxfraglen, fragheaderlen;
+	unsigned int maxfraglen, maxfraglen_prev, fragheaderlen;
 	int exthdrlen;
 	int dst_exthdrlen;
 	int hh_len;
-	int mtu;
+	int mtu, mtu_prev;
 	int copy;
 	int err;
 	int offset = 0;
 	int csummode = CHECKSUM_NONE;
 	__u8 tx_flags = 0;
-
+	bool transport_mode = false;
+	struct xfrm_state *x = rt->dst.xfrm;
 	if (flags&MSG_PROBE)
 		return 0;
+	if (x && x->props.mode == XFRM_MODE_TRANSPORT)
+		transport_mode = true;
+
 	cork = &inet->cork.base;
 	if (skb_queue_empty(&sk->sk_write_queue)) {
 		/*
@@ -1248,13 +1252,17 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
 		inet->cork.fl.u.ip6 = *fl6;
 		np->cork.hop_limit = hlimit;
 		np->cork.tclass = tclass;
-		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
-		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
+		if (transport_mode)
+			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
+			      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
+		else
+			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
+			      rt->dst.dev->mtu : dst_mtu(&rt->dst);
 		if (np->frag_size < mtu) {
 			if (np->frag_size)
 				mtu = np->frag_size;
 		}
-		cork->fragsize = mtu;
+		mtu_prev = cork->fragsize = mtu;
 		if (dst_allfrag(rt->dst.path))
 			cork->flags |= IPCORK_ALLFRAG;
 		cork->length = 0;
@@ -1271,14 +1279,15 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
 		transhdrlen = 0;
 		exthdrlen = 0;
 		dst_exthdrlen = 0;
-		mtu = cork->fragsize;
+		mtu_prev = mtu = cork->fragsize;
 	}
 
 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
 
 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
 			(opt ? opt->opt_nflen : 0);
-	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
+	maxfraglen_prev = maxfraglen = ((mtu - fragheaderlen) & ~7)
+				       + fragheaderlen - sizeof(struct frag_hdr);
 
 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
 		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
@@ -1329,15 +1338,27 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
 			return 0;
 		}
 	}
-
-	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
+	skb = skb_peek_tail(&sk->sk_write_queue);
+	if (skb == NULL) {
+		if (transport_mode) {
+			/*
+			 * transport mode the first ipsec fragment should contain
+			 * ipsec header, so decrease dst_exthdrlen from mtu.
+			 */
+			mtu -= dst_exthdrlen;
+			mtu_prev = mtu;
+			maxfraglen = ((mtu - fragheaderlen) & ~7)
+				     + fragheaderlen - sizeof(struct frag_hdr);
+			maxfraglen_prev = maxfraglen;
+		}
 		goto alloc_new_skb;
+	}
 
 	while (length > 0) {
 		/* Check if the remaining data fits into current packet. */
-		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
+		copy = (cork->length <= mtu_prev && !(cork->flags & IPCORK_ALLFRAG) ? mtu_prev : maxfraglen_prev) - skb->len;
 		if (copy < length)
-			copy = maxfraglen - skb->len;
+			copy = maxfraglen_prev - skb->len;
 
 		if (copy <= 0) {
 			char *data;
@@ -1351,7 +1372,7 @@ alloc_new_skb:
 
 			/* There's no room in the current skb */
 			if (skb_prev)
-				fraggap = skb_prev->len - maxfraglen;
+				fraggap = skb_prev->len - maxfraglen_prev;
 			else
 				fraggap = 0;
 
@@ -1360,10 +1381,14 @@ alloc_new_skb:
 			 * we know we need more fragment(s).
 			 */
 			datalen = length + fraggap;
-			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
-				datalen = maxfraglen - fragheaderlen;
+			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen) {
+				/*
+				 * decrease ipsec trailer here,
+				 * if it's not the last fragment, add trailer later
+				 */
+				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
+			}
 
-			fraglen = datalen + fragheaderlen;
 			if ((flags & MSG_MORE) &&
 			    !(rt->dst.dev->features&NETIF_F_SG))
 				alloclen = mtu;
@@ -1377,9 +1402,15 @@ alloc_new_skb:
 			 * Note: we overallocate on fragments with MSG_MODE
 			 * because we have no idea if we're the last one.
 			 */
-			if (datalen == length + fraggap)
-				alloclen += rt->dst.trailer_len;
-
+			alloclen += rt->dst.trailer_len;
+			if (datalen != length + fraggap) {
+				/*
+				 * this fragment is not the last fragment,
+				 * add trailer to datalen
+				 */
+				datalen += rt->dst.trailer_len;
+			}
+			fraglen = datalen + fragheaderlen;
 			/*
 			 * We just reserve space for fragment header.
 			 * Note: this may be overallocation if the message
@@ -1452,6 +1483,17 @@ alloc_new_skb:
 
 			offset += copy;
 			length -= datalen - fraggap;
+
+			mtu_prev = mtu;
+			maxfraglen_prev = maxfraglen;
+			if (skb_prev == NULL && transport_mode) {
+				/*
+				 * transport mode the middle skb should not have
+				 * ipsec header and trailer, so update the mtu.
+				 */
+				mtu = dst_mtu(rt->dst.path);
+				maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
+			}
 			transhdrlen = 0;
 			exthdrlen = 0;
 			dst_exthdrlen = 0;
-- 
1.7.7.6

^ permalink raw reply related

* Re: linux-next: build warning after merge of the origin tree
From: David Miller @ 2012-05-14  3:00 UTC (permalink / raw)
  To: sfr; +Cc: netdev, linux-next, linux-kernel, torvalds, jbohac
In-Reply-To: <20120514104452.2411b761e8f98076d1ae88ee@canb.auug.org.au>

From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Mon, 14 May 2012 10:44:52 +1000

> Introduced by commit 13a8e0c8cdb4 ("bonding: don't increase rx_dropped
> after processing LACPDUs"). recv_probe in struct bonding is a function
> returning void, but the local variable it assigned to/from has been
> changed to a function returning int ...

http://marc.info/?l=linux-netdev&m=133693883404392&w=2

^ permalink raw reply

* Re: [PATCH v5 2/2] decrement static keys on real destroy time
From: Li Zefan @ 2012-05-14  1:38 UTC (permalink / raw)
  To: Glauber Costa
  Cc: cgroups-u79uwXL29TY76Z2rM5mHXA, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	devel-GEFAQzZX7r8dnm+yROfE0A,
	kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A,
	netdev-u79uwXL29TY76Z2rM5mHXA, Tejun Heo, Johannes Weiner,
	Michal Hocko
In-Reply-To: <1336767077-25351-3-git-send-email-glommer-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>

> +static void disarm_static_keys(struct mem_cgroup *memcg)

> +{
> +#ifdef CONFIG_INET
> +	if (memcg->tcp_mem.cg_proto.activated)
> +		static_key_slow_dec(&memcg_socket_limit_enabled);
> +#endif
> +}


Move this inside the ifdef/endif below ?

Otherwise I think you'll get compile error if !CONFIG_INET...

> +
>  #ifdef CONFIG_INET
>  struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
>  {
> @@ -452,6 +462,11 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
>  }
>  EXPORT_SYMBOL(tcp_proto_cgroup);
>  #endif /* CONFIG_INET */
> +#else
> +static inline void disarm_static_keys(struct mem_cgroup *memcg)
> +{
> +}
> +
>  #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */

^ permalink raw reply

* Re: [PATCH v5 2/2] decrement static keys on real destroy time
From: KAMEZAWA Hiroyuki @ 2012-05-14  0:59 UTC (permalink / raw)
  To: Glauber Costa
  Cc: cgroups-u79uwXL29TY76Z2rM5mHXA, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	devel-GEFAQzZX7r8dnm+yROfE0A, netdev-u79uwXL29TY76Z2rM5mHXA,
	Tejun Heo, Li Zefan, Johannes Weiner, Michal Hocko
In-Reply-To: <1336767077-25351-3-git-send-email-glommer-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>

(2012/05/12 5:11), Glauber Costa wrote:

> We call the destroy function when a cgroup starts to be removed,
> such as by a rmdir event.
> 
> However, because of our reference counters, some objects are still
> inflight. Right now, we are decrementing the static_keys at destroy()
> time, meaning that if we get rid of the last static_key reference,
> some objects will still have charges, but the code to properly
> uncharge them won't be run.
> 
> This becomes a problem specially if it is ever enabled again, because
> now new charges will be added to the staled charges making keeping
> it pretty much impossible.
> 
> We just need to be careful with the static branch activation:
> since there is no particular preferred order of their activation,
> we need to make sure that we only start using it after all
> call sites are active. This is achieved by having a per-memcg
> flag that is only updated after static_key_slow_inc() returns.
> At this time, we are sure all sites are active.
> 
> This is made per-memcg, not global, for a reason:
> it also has the effect of making socket accounting more
> consistent. The first memcg to be limited will trigger static_key()
> activation, therefore, accounting. But all the others will then be
> accounted no matter what. After this patch, only limited memcgs
> will have its sockets accounted.
> 
> [v2: changed a tcp limited flag for a generic proto limited flag ]
> [v3: update the current active flag only after the static_key update ]
> [v4: disarm_static_keys() inside free_work ]
> [v5: got rid of tcp_limit_mutex, now in the static_key interface ]
> 
> Signed-off-by: Glauber Costa <glommer-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>
> CC: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
> CC: Li Zefan <lizefan-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
> CC: Kamezawa Hiroyuki <kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A@public.gmane.org>
> CC: Johannes Weiner <hannes-druUgvl0LCNAfugRpC6u6w@public.gmane.org>
> CC: Michal Hocko <mhocko-AlSwsSmVLrQ@public.gmane.org>


Thank you for your patient work.

Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A@public.gmane.org>

BTW, what is the relationship between 1/2 and 2/2  ?

Thanks,
-Kame

^ permalink raw reply

* Re: [PATCH v5 1/2] Always free struct memcg through schedule_work()
From: KAMEZAWA Hiroyuki @ 2012-05-14  0:56 UTC (permalink / raw)
  To: Glauber Costa
  Cc: cgroups-u79uwXL29TY76Z2rM5mHXA, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	devel-GEFAQzZX7r8dnm+yROfE0A, netdev-u79uwXL29TY76Z2rM5mHXA,
	Tejun Heo, Li Zefan, Johannes Weiner, Michal Hocko
In-Reply-To: <1336767077-25351-2-git-send-email-glommer-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>

(2012/05/12 5:11), Glauber Costa wrote:

> Right now we free struct memcg with kfree right after a
> rcu grace period, but defer it if we need to use vfree() to get
> rid of that memory area. We do that by need, because we need vfree
> to be called in a process context.
> 
> This patch unifies this behavior, by ensuring that even kfree will
> happen in a separate thread. The goal is to have a stable place to
> call the upcoming jump label destruction function outside the realm
> of the complicated and quite far-reaching cgroup lock (that can't be
> held when calling neither the cpu_hotplug.lock nor the jump_label_mutex)
> 
> Signed-off-by: Glauber Costa <glommer-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>
> CC: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
> CC: Li Zefan <lizefan-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
> CC: Kamezawa Hiroyuki <kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A@public.gmane.org>
> CC: Johannes Weiner <hannes-druUgvl0LCNAfugRpC6u6w@public.gmane.org>
> CC: Michal Hocko <mhocko-AlSwsSmVLrQ@public.gmane.org>


I think we'll need to revisit this, again.
for now,
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A@public.gmane.org>

^ permalink raw reply

* linux-next: build warning after merge of the origin tree
From: Stephen Rothwell @ 2012-05-14  0:44 UTC (permalink / raw)
  To: David Miller, netdev; +Cc: linux-next, linux-kernel, Linus Torvalds, Jiri Bohac

[-- Attachment #1: Type: text/plain, Size: 912 bytes --]

Hi Dave,

Building Linus' tree, today's linux-next build (powerpc ppc64_defconfig)
produced these warnings:

drivers/net/bonding/bond_main.c: In function 'bond_handle_frame':
drivers/net/bonding/bond_main.c:1463:13: warning: assignment from incompatible pointer type [enabled by default]
drivers/net/bonding/bond_main.c: In function 'bond_open':
drivers/net/bonding/bond_main.c:3441:21: warning: assignment from incompatible pointer type [enabled by default]
drivers/net/bonding/bond_main.c:3448:20: warning: assignment from incompatible pointer type [enabled by default]

Introduced by commit 13a8e0c8cdb4 ("bonding: don't increase rx_dropped
after processing LACPDUs"). recv_probe in struct bonding is a function
returning void, but the local variable it is assigned to/from has been
changed to a function returning int ...

-- 
Cheers,
Stephen Rothwell                    sfr@canb.auug.org.au

[-- Attachment #2: Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* [PATCH] CS89x0 : Use ioread16/iowrite16 on all platforms
From: Jaccon Bastiaansen @ 2012-05-13 22:14 UTC (permalink / raw)
  To: arnd
  Cc: s.hauer, gfm, davem, festevam, linux-arm-kernel, netdev,
	Jaccon Bastiaansen

The use of the inw/outw functions by the cs89x0 platform driver
results in NULL pointer references.

Signed-off-by: Jaccon Bastiaansen <jaccon.bastiaansen@gmail.com>
---
 drivers/net/ethernet/cirrus/cs89x0.c |   91 +++++++++++++++++++++++-----------
 1 files changed, 62 insertions(+), 29 deletions(-)

diff --git a/drivers/net/ethernet/cirrus/cs89x0.c b/drivers/net/ethernet/cirrus/cs89x0.c
index b9406cb..5a1e5d7 100644
--- a/drivers/net/ethernet/cirrus/cs89x0.c
+++ b/drivers/net/ethernet/cirrus/cs89x0.c
@@ -222,6 +222,7 @@ struct net_local {
 	int send_underrun;	/* keep track of how many underruns in a row we get */
 	int force;		/* force various values; see FORCE* above. */
 	spinlock_t lock;
+	unsigned long phys_addr;/* CS89x0 I/O port or physical address. */
 #if ALLOW_DMA
 	int use_dma;		/* Flag: we're using dma */
 	int dma;		/* DMA channel */
@@ -231,8 +232,6 @@ struct net_local {
 	unsigned char *rx_dma_ptr;	/* points to the next packet  */
 #endif
 #ifdef CONFIG_CS89x0_PLATFORM
-	void __iomem *virt_addr;/* Virtual address for accessing the CS89x0. */
-	unsigned long phys_addr;/* Physical address for accessing the CS89x0. */
 	unsigned long size;	/* Length of CS89x0 memory region. */
 #endif
 };
@@ -279,6 +278,47 @@ static int __init dma_fn(char *str)
 __setup("cs89x0_dma=", dma_fn);
 #endif	/* !defined(MODULE) && (ALLOW_DMA != 0) */
 
+#ifndef CONFIG_CS89x0_PLATFORM
+/*
+ * This function converts the I/O port address used by the cs89x0_probe() and
+ * init_module() functions to the I/O memory address used by the
+ * cs89x0_probe1() function.
+ */
+static int __init
+cs89x0_ioport_probe(struct net_device *dev, unsigned long ioport, int modular)
+{
+	struct net_local *lp = netdev_priv(dev);
+	int ret = 0;
+	void __iomem *io_mem;
+
+	if (!lp)
+		return -ENOMEM;
+
+	lp->phys_addr = ioport;
+
+	if (!request_region(ioport, NETCARD_IO_EXTENT, DRV_NAME)) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	io_mem = ioport_map(ioport, NETCARD_IO_EXTENT);
+	if (!io_mem) {
+		ret = -ENOMEM;
+		goto release;
+	}
+
+	ret = cs89x0_probe1(dev, ret, modular);
+	if (!ret)
+		goto out;
+
+	ioport_unmap(io_mem);
+release:
+	release_region(ioport, NETCARD_IO_EXTENT);
+out:
+	return ret;
+}
+#endif
+
 #ifndef MODULE
 static int g_cs89x0_media__force;
 
@@ -292,7 +332,6 @@ static int __init media_fn(char *str)
 
 __setup("cs89x0_media=", media_fn);
 
-
 #ifndef CONFIG_CS89x0_PLATFORM
 /* Check for a network adaptor of this type, and return '0' iff one exists.
    If dev->base_addr == 0, probe all likely locations.
@@ -322,12 +361,12 @@ struct net_device * __init cs89x0_probe(int unit)
 		printk("cs89x0:cs89x0_probe(0x%x)\n", io);
 
 	if (io > 0x1ff)	{	/* Check a single specified location. */
-		err = cs89x0_probe1(dev, io, 0);
+		err = cs89x0_ioport_probe(dev, io, 0);
 	} else if (io != 0) {	/* Don't probe at all. */
 		err = -ENXIO;
 	} else {
 		for (port = netcard_portlist; *port; port++) {
-			if (cs89x0_probe1(dev, *port, 0) == 0)
+			if (cs89x0_ioport_probe(dev, *port, 0) == 0)
 				break;
 			dev->irq = irq;
 		}
@@ -373,13 +412,13 @@ writeword(unsigned long base_addr, int portno, u16 value)
 static u16
 readword(unsigned long base_addr, int portno)
 {
-	return inw(base_addr + portno);
+	return ioread16((void __iomem *)base_addr + portno);
 }
 
 static void
 writeword(unsigned long base_addr, int portno, u16 value)
 {
-	outw(value, base_addr + portno);
+	iowrite16(value, (void __iomem *)(base_addr + portno));
 }
 #endif
 
@@ -532,15 +571,6 @@ cs89x0_probe1(struct net_device *dev, unsigned long ioaddr, int modular)
 
         }
 
-	/* Grab the region so we can find another board if autoIRQ fails. */
-	/* WTF is going on here? */
-	if (!request_region(ioaddr & ~3, NETCARD_IO_EXTENT, DRV_NAME)) {
-		printk(KERN_ERR "%s: request_region(0x%lx, 0x%x) failed\n",
-				DRV_NAME, ioaddr, NETCARD_IO_EXTENT);
-		retval = -EBUSY;
-		goto out1;
-	}
-
 	/* if they give us an odd I/O address, then do ONE write to
            the address port, to get it back to address zero, where we
            expect to find the EISA signature word. An IO with a base of 0x3
@@ -553,7 +583,7 @@ cs89x0_probe1(struct net_device *dev, unsigned long ioaddr, int modular)
 				printk(KERN_ERR "%s: bad signature 0x%x\n",
 					dev->name, readword(ioaddr & ~3, ADD_PORT));
 		        	retval = -ENODEV;
-				goto out2;
+				goto out1;
 			}
 	}
 
@@ -568,7 +598,7 @@ cs89x0_probe1(struct net_device *dev, unsigned long ioaddr, int modular)
 			CHIP_EISA_ID_SIG_STR "\n",
 			dev->name, ioaddr, DATA_PORT, tmp);
   		retval = -ENODEV;
-  		goto out2;
+		goto out1;
 	}
 
 	/* Fill in the 'dev' fields. */
@@ -787,13 +817,12 @@ cs89x0_probe1(struct net_device *dev, unsigned long ioaddr, int modular)
 
 	retval = register_netdev(dev);
 	if (retval)
-		goto out3;
+		goto out2;
 	return 0;
-out3:
-	writeword(dev->base_addr, ADD_PORT, PP_ChipID);
 out2:
-	release_region(ioaddr & ~3, NETCARD_IO_EXTENT);
+	writeword(dev->base_addr, ADD_PORT, PP_ChipID);
 out1:
+	release_region(ioaddr & ~3, NETCARD_IO_EXTENT);
 	return retval;
 }
 
@@ -1886,7 +1915,7 @@ int __init init_module(void)
 		goto out;
 	}
 #endif
-	ret = cs89x0_probe1(dev, io, 1);
+	ret = cs89x0_ioport_probe(dev, io, 1);
 	if (ret)
 		goto out;
 
@@ -1900,9 +1929,12 @@ out:
 void __exit
 cleanup_module(void)
 {
+	struct net_local *lp = netdev_priv(dev_cs89x0);
+
 	unregister_netdev(dev_cs89x0);
 	writeword(dev_cs89x0->base_addr, ADD_PORT, PP_ChipID);
-	release_region(dev_cs89x0->base_addr, NETCARD_IO_EXTENT);
+	ioport_unmap((void __iomem *)dev_cs89x0->base_addr);
+	release_region(lp->phys_addr, NETCARD_IO_EXTENT);
 	free_netdev(dev_cs89x0);
 }
 #endif /* MODULE && !CONFIG_CS89x0_PLATFORM */
@@ -1913,6 +1945,7 @@ static int __init cs89x0_platform_probe(struct platform_device *pdev)
 	struct net_device *dev = alloc_etherdev(sizeof(struct net_local));
 	struct net_local *lp;
 	struct resource *mem_res;
+	void __iomem *virt_addr;
 	int err;
 
 	if (!dev)
@@ -1936,14 +1969,14 @@ static int __init cs89x0_platform_probe(struct platform_device *pdev)
 		goto free;
 	}
 
-	lp->virt_addr = ioremap(lp->phys_addr, lp->size);
-	if (!lp->virt_addr) {
+	virt_addr = ioremap(lp->phys_addr, lp->size);
+	if (!virt_addr) {
 		dev_warn(&dev->dev, "ioremap() failed.\n");
 		err = -ENOMEM;
 		goto release;
 	}
 
-	err = cs89x0_probe1(dev, (unsigned long)lp->virt_addr, 0);
+	err = cs89x0_probe1(dev, (unsigned long)virt_addr, 0);
 	if (err) {
 		dev_warn(&dev->dev, "no cs8900 or cs8920 detected.\n");
 		goto unmap;
@@ -1953,7 +1986,7 @@ static int __init cs89x0_platform_probe(struct platform_device *pdev)
 	return 0;
 
 unmap:
-	iounmap(lp->virt_addr);
+	iounmap(virt_addr);
 release:
 	release_mem_region(lp->phys_addr, lp->size);
 free:
@@ -1967,7 +2000,7 @@ static int cs89x0_platform_remove(struct platform_device *pdev)
 	struct net_local *lp = netdev_priv(dev);
 
 	unregister_netdev(dev);
-	iounmap(lp->virt_addr);
+	iounmap((void __iomem *)dev->base_addr);
 	release_mem_region(lp->phys_addr, lp->size);
 	free_netdev(dev);
 	return 0;
-- 
1.7.1

^ permalink raw reply related

* bonding: Fix LACPDU rx_dropped commit.
From: David Miller @ 2012-05-13 19:49 UTC (permalink / raw)
  To: netdev; +Cc: jbohac


I applied the wrong version of Jiri's bonding fix in commit
13a8e0c8cdb43982372bd6c65fb26839c8fd8ce9 ("bonding: don't increase
rx_dropped after processing LACPDUs")

I applied v3, which introduces warnings I asked him to fix,
instead of v4 which properly takes care of those issues.

This inter-diffs such that the warnings are now gone.

Signed-off-by: David S. Miller <davem@davemloft.net>
---

I'm still trying to figure out how I mistakenly put v3 into my tree
instead of v4, must be some kind of sorcery or curse that I'm not
familiar with.

 drivers/net/bonding/bond_alb.c |   12 +++++++-----
 drivers/net/bonding/bonding.h  |    2 +-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 9abfde4..2e1f806 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -342,26 +342,26 @@ static void rlb_update_entry_from_arp(struct bonding *bond, struct arp_pkt *arp)
 	_unlock_rx_hashtbl_bh(bond);
 }
 
-static void rlb_arp_recv(struct sk_buff *skb, struct bonding *bond,
+static int rlb_arp_recv(struct sk_buff *skb, struct bonding *bond,
 			 struct slave *slave)
 {
 	struct arp_pkt *arp;
 
 	if (skb->protocol != cpu_to_be16(ETH_P_ARP))
-		return;
+		goto out;
 
 	arp = (struct arp_pkt *) skb->data;
 	if (!arp) {
 		pr_debug("Packet has no ARP data\n");
-		return;
+		goto out;
 	}
 
 	if (!pskb_may_pull(skb, arp_hdr_len(bond->dev)))
-		return;
+		goto out;
 
 	if (skb->len < sizeof(struct arp_pkt)) {
 		pr_debug("Packet is too small to be an ARP\n");
-		return;
+		goto out;
 	}
 
 	if (arp->op_code == htons(ARPOP_REPLY)) {
@@ -369,6 +369,8 @@ static void rlb_arp_recv(struct sk_buff *skb, struct bonding *bond,
 		rlb_update_entry_from_arp(bond, arp);
 		pr_debug("Server received an ARP Reply from client\n");
 	}
+out:
+	return RX_HANDLER_ANOTHER;
 }
 
 /* Caller must hold bond lock for read */
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index 9f2bae66..4581aa5 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -218,7 +218,7 @@ struct bonding {
 	struct   slave *primary_slave;
 	bool     force_primary;
 	s32      slave_cnt; /* never change this value outside the attach/detach wrappers */
-	void     (*recv_probe)(struct sk_buff *, struct bonding *,
+	int     (*recv_probe)(struct sk_buff *, struct bonding *,
 			       struct slave *);
 	rwlock_t lock;
 	rwlock_t curr_slave_lock;
-- 
1.7.10.1

^ permalink raw reply related

* Re: [patch] openvswitch: checking wrong variable in queue_userspace_packet()
From: David Miller @ 2012-05-13 19:47 UTC (permalink / raw)
  To: jesse-l0M0P4e3n4LQT0dZR+AlfA
  Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA,
	kernel-janitors-u79uwXL29TY76Z2rM5mHXA,
	dan.carpenter-QHcLZuEGTsvQT0dZR+AlfA
In-Reply-To: <CAEP_g=_LojOio0E5K_vYBkO3BKdW9F=dDam4zwffyZPbSZJd2Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

From: Jesse Gross <jesse-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>
Date: Sun, 13 May 2012 12:22:29 -0700

> On Sun, May 13, 2012 at 11:44 AM, Dan Carpenter
> <dan.carpenter-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org> wrote:
>> "skb" is non-NULL here, for example we dereference it in skb_clone().
>> The intent was to test "nskb" which was just set.
>>
>> Signed-off-by: Dan Carpenter <dan.carpenter-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
> 
> Thanks Dan.
> 
> Acked-by: Jesse Gross <jesse-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>

Applied.

^ permalink raw reply

* Re: [patch] openvswitch: checking wrong variable in queue_userspace_packet()
From: Jesse Gross @ 2012-05-13 19:22 UTC (permalink / raw)
  To: Dan Carpenter
  Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA,
	kernel-janitors-u79uwXL29TY76Z2rM5mHXA, David S. Miller
In-Reply-To: <20120513184418.GB16541-mgFCXtclrQlZLf2FXnZxJA@public.gmane.org>

On Sun, May 13, 2012 at 11:44 AM, Dan Carpenter
<dan.carpenter-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org> wrote:
> "skb" is non-NULL here, for example we dereference it in skb_clone().
> The intent was to test "nskb" which was just set.
>
> Signed-off-by: Dan Carpenter <dan.carpenter-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>

Thanks Dan.

Acked-by: Jesse Gross <jesse-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>

^ permalink raw reply

* [patch] openvswitch: checking wrong variable in queue_userspace_packet()
From: Dan Carpenter @ 2012-05-13 18:44 UTC (permalink / raw)
  To: Jesse Gross; +Cc: David S. Miller, dev, netdev, kernel-janitors

"skb" is non-NULL here, for example we dereference it in skb_clone().
The intent was to test "nskb" which was just set.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index fe28562..2c74daa 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -321,7 +321,7 @@ static int queue_userspace_packet(int dp_ifindex, struct sk_buff *skb,
 			return -ENOMEM;
 
 		nskb = __vlan_put_tag(nskb, vlan_tx_tag_get(nskb));
-		if (!skb)
+		if (!nskb)
 			return -ENOMEM;
 
 		nskb->vlan_tci = 0;

^ permalink raw reply related

* [PATCH resend] sunrpc: fix kernel-doc warnings
From: Randy Dunlap @ 2012-05-13 17:35 UTC (permalink / raw)
  To: linux-nfs-u79uwXL29TY76Z2rM5mHXA
  Cc: Trond Myklebust, J. Bruce Fields, Andrew Morton,
	netdev-u79uwXL29TY76Z2rM5mHXA

From: Randy Dunlap <rdunlap-/UHa2rfvQTnk1uMJSBkQmQ@public.gmane.org>

Fix kernel-doc warnings in sunrpc/rpc_pipe.c and
sunrpc/rpcb_clnt.c:

Warning(net/sunrpc/rpcb_clnt.c:428): No description found for parameter 'net'
Warning(net/sunrpc/rpcb_clnt.c:567): No description found for parameter 'net'

Warning(net/sunrpc/rpc_pipe.c:133): No description found for parameter 'pipe'
Warning(net/sunrpc/rpc_pipe.c:133): Excess function parameter 'inode' description in 'rpc_queue_upcall'
Warning(net/sunrpc/rpc_pipe.c:839): No description found for parameter 'pipe'
Warning(net/sunrpc/rpc_pipe.c:839): Excess function parameter 'ops' description in 'rpc_mkpipe_dentry'
Warning(net/sunrpc/rpc_pipe.c:839): Excess function parameter 'flags' description in 'rpc_mkpipe_dentry'
Warning(net/sunrpc/rpc_pipe.c:949): No description found for parameter 'dentry'
Warning(net/sunrpc/rpc_pipe.c:949): Excess function parameter 'clnt' description in 'rpc_remove_client_dir'

Signed-off-by: Randy Dunlap <rdunlap-/UHa2rfvQTnk1uMJSBkQmQ@public.gmane.org>
---
 net/sunrpc/rpc_pipe.c |    8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

--- lnx-34-rc3.orig/net/sunrpc/rpcb_clnt.c
+++ lnx-34-rc3/net/sunrpc/rpcb_clnt.c
@@ -394,6 +394,7 @@ static int rpcb_register_call(struct rpc
 
 /**
  * rpcb_register - set or unset a port registration with the local rpcbind svc
+ * @net: target network namespace
  * @prog: RPC program number to bind
  * @vers: RPC version number to bind
  * @prot: transport protocol to register
@@ -521,6 +522,7 @@ static int rpcb_unregister_all_protofami
 
 /**
  * rpcb_v4_register - set or unset a port registration with the local rpcbind
+ * @net: target network namespace
  * @program: RPC program number of service to (un)register
  * @version: RPC version number of service to (un)register
  * @address: address family, IP address, and port to (un)register
--- lnx-34-rc3.orig/net/sunrpc/rpc_pipe.c
+++ lnx-34-rc3/net/sunrpc/rpc_pipe.c
@@ -120,7 +120,7 @@ EXPORT_SYMBOL_GPL(rpc_pipe_generic_upcal
 
 /**
  * rpc_queue_upcall - queue an upcall message to userspace
- * @inode: inode of upcall pipe on which to queue given message
+ * @pipe: upcall pipe on which to queue given message
  * @msg: message to queue
  *
  * Call with an @inode created by rpc_mkpipe() to queue an upcall.
@@ -819,9 +819,7 @@ static int rpc_rmdir_depopulate(struct d
  * @parent: dentry of directory to create new "pipe" in
  * @name: name of pipe
  * @private: private data to associate with the pipe, for the caller's use
- * @ops: operations defining the behavior of the pipe: upcall, downcall,
- *	release_pipe, open_pipe, and destroy_msg.
- * @flags: rpc_pipe flags
+ * @pipe: &rpc_pipe containing input parameters
  *
  * Data is made available for userspace to read by calls to
  * rpc_queue_upcall().  The actual reads will result in calls to
@@ -943,7 +941,7 @@ struct dentry *rpc_create_client_dir(str
 
 /**
  * rpc_remove_client_dir - Remove a directory created with rpc_create_client_dir()
- * @clnt: rpc client
+ * @dentry: dentry for the pipe
  */
 int rpc_remove_client_dir(struct dentry *dentry)
 {
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH RFC] tun: experimental zero copy tx support
From: Michael S. Tsirkin @ 2012-05-13 15:52 UTC (permalink / raw)
  To: David S. Miller
  Cc: Stephen Hemminger, Joe Perches, Jason Wang, netdev, linux-kernel,
	Ian.Campbell, kvm

Let vhost-net utilize zero copy tx when the experimental
zero copy mode is enabled and when used with tun.  This works on
top of the patchset 'copy aside frags with destructors' that I posted
previously. This is not using tcp so doesn't have the
issue with early skb cloning noticed by Ian.

For those that wish to test this with kvm, I intend to post a patchset +
git tree with just the necessary bits from the destructor patch
a bit later.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/net/tun.c |  125 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 120 insertions(+), 5 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index fe5cd2f3..eb10ee7 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -100,6 +100,8 @@ do {								\
 } while (0)
 #endif
 
+#define GOODCOPY_LEN 128
+
 #define FLT_EXACT_COUNT 8
 struct tap_filter {
 	unsigned int    count;    /* Number of addrs. Zero means disabled */
@@ -602,8 +604,80 @@ static struct sk_buff *tun_alloc_skb(struct tun_struct *tun,
 	return skb;
 }
 
+/* set skb frags from iovec, this can move to core network code for reuse */
+static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
+				  int offset, size_t count)
+{
+	int len = iov_length(from, count) - offset;
+	int copy = skb_headlen(skb);
+	int size, offset1 = 0;
+	int i = 0;
+
+	/* Skip over from offset */
+	while (count && (offset >= from->iov_len)) {
+		offset -= from->iov_len;
+		++from;
+		--count;
+	}
+
+	/* copy up to skb headlen */
+	while (count && (copy > 0)) {
+		size = min_t(unsigned int, copy, from->iov_len - offset);
+		if (copy_from_user(skb->data + offset1, from->iov_base + offset,
+				   size))
+			return -EFAULT;
+		if (copy > size) {
+			++from;
+			--count;
+		}
+		copy -= size;
+		offset1 += size;
+		offset = 0;
+	}
+
+	if (len == offset1)
+		return 0;
+
+	while (count--) {
+		struct page *page[MAX_SKB_FRAGS];
+		int num_pages;
+		unsigned long base;
+
+		len = from->iov_len - offset1;
+		if (!len) {
+			offset1 = 0;
+			++from;
+			continue;
+		}
+		base = (unsigned long)from->iov_base + offset1;
+		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+		num_pages = get_user_pages_fast(base, size, 0, &page[i]);
+		if ((num_pages != size) ||
+		    (num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags))
+			/* put_page is in skb free */
+			return -EFAULT;
+		skb->data_len += len;
+		skb->len += len;
+		skb->truesize += len;
+		atomic_add(len, &skb->sk->sk_wmem_alloc);
+		while (len) {
+			int off = base & ~PAGE_MASK;
+			int size = min_t(int, len, PAGE_SIZE - off);
+			__skb_fill_page_desc(skb, i, page[i], off, size);
+			skb_shinfo(skb)->nr_frags++;
+			/* increase sk_wmem_alloc */
+			base += size;
+			len -= size;
+			i++;
+		}
+		offset1 = 0;
+		++from;
+	}
+	return 0;
+}
+
 /* Get packet from user space buffer */
-static ssize_t tun_get_user(struct tun_struct *tun,
+static ssize_t tun_get_user(struct tun_struct *tun, void *msg_control,
 			    const struct iovec *iv, size_t count,
 			    int noblock)
 {
@@ -612,6 +686,9 @@ static ssize_t tun_get_user(struct tun_struct *tun,
 	size_t len = count, align = NET_SKB_PAD;
 	struct virtio_net_hdr gso = { 0 };
 	int offset = 0;
+	int copylen;
+	bool zerocopy = false;
+	int err;
 
 	if (!(tun->flags & TUN_NO_PI)) {
 		if ((len -= sizeof(pi)) > count)
@@ -645,14 +722,47 @@ static ssize_t tun_get_user(struct tun_struct *tun,
 			return -EINVAL;
 	}
 
-	skb = tun_alloc_skb(tun, align, len, gso.hdr_len, noblock);
+	if (msg_control)
+		zerocopy = true;
+
+	if (zerocopy) {
+		/* Userspace may produce vectors with count greater than
+		 * MAX_SKB_FRAGS, so we need to linearize parts of the skb
+		 * to let the rest of data to be fit in the frags.
+		 */
+		if (count > MAX_SKB_FRAGS) {
+			copylen = iov_length(iv, count - MAX_SKB_FRAGS);
+			if (copylen < offset)
+				copylen = 0;
+			else
+				copylen -= offset;
+		} else
+				copylen = 0;
+		/* There are 256 bytes to be copied in skb, so there is enough
+		 * room for skb expand head in case it is used.
+		 * The rest of the buffer is mapped from userspace.
+		 */
+		if (copylen < gso.hdr_len)
+			copylen = gso.hdr_len;
+		if (!copylen)
+			copylen = GOODCOPY_LEN;
+	} else
+		copylen = len;
+
+	skb = tun_alloc_skb(tun, align, copylen, gso.hdr_len, noblock);
 	if (IS_ERR(skb)) {
 		if (PTR_ERR(skb) != -EAGAIN)
 			tun->dev->stats.rx_dropped++;
 		return PTR_ERR(skb);
 	}
 
-	if (skb_copy_datagram_from_iovec(skb, 0, iv, offset, len)) {
+	if (zerocopy) {
+		err = zerocopy_sg_from_iovec(skb, iv, offset, count);
+		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+	} else
+		err = skb_copy_datagram_from_iovec(skb, 0, iv, offset, len);
+
+	if (err) {
 		tun->dev->stats.rx_dropped++;
 		kfree_skb(skb);
 		return -EFAULT;
@@ -726,6 +836,10 @@ static ssize_t tun_get_user(struct tun_struct *tun,
 		skb_shinfo(skb)->gso_segs = 0;
 	}
 
+	/* copy skb_ubuf_info for callback when skb has no error */
+	if (zerocopy)
+		skb_shinfo(skb)->destructor_arg = msg_control;
+
 	netif_rx_ni(skb);
 
 	tun->dev->stats.rx_packets++;
@@ -746,7 +860,7 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
 
 	tun_debug(KERN_INFO, tun, "tun_chr_write %ld\n", count);
 
-	result = tun_get_user(tun, iv, iov_length(iv, count),
+	result = tun_get_user(tun, NULL, iv, iov_length(iv, count),
 			      file->f_flags & O_NONBLOCK);
 
 	tun_put(tun);
@@ -960,7 +1074,7 @@ static int tun_sendmsg(struct kiocb *iocb, struct socket *sock,
 		       struct msghdr *m, size_t total_len)
 {
 	struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
-	return tun_get_user(tun, m->msg_iov, total_len,
+	return tun_get_user(tun, m->msg_control, m->msg_iov, total_len,
 			    m->msg_flags & MSG_DONTWAIT);
 }
 
@@ -1130,6 +1244,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 		sock_init_data(&tun->socket, sk);
 		sk->sk_write_space = tun_sock_write_space;
 		sk->sk_sndbuf = INT_MAX;
+		sock_set_flag(sk, SOCK_ZEROCOPY);
 
 		tun_sk(sk)->tun = tun;
 
-- 
MST

^ permalink raw reply related

* Re: [patch] Re: qlge driver corrupting kernel memory
From: Mike Galbraith @ 2012-05-13 10:16 UTC (permalink / raw)
  To: Thadeu Lima de Souza Cascardo; +Cc: netdev
In-Reply-To: <1336903839.7390.13.camel@marge.simpson.net>

Erm, with a quilt refresh you get the compiling version :)

qlge: Fix double pci_free_consistent() upon tx_ring->q allocation failure

Let ql_free_tx_resources() do its job.  You are not helping.

Signed-off-by: Mike Galbraith <mgalbraith@suse.de>
---
 drivers/net/qlge/qlge_main.c |   10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

--- a/drivers/net/qlge/qlge_main.c
+++ b/drivers/net/qlge/qlge_main.c
@@ -2664,11 +2664,8 @@ static int ql_alloc_tx_resources(struct
 	    pci_alloc_consistent(qdev->pdev, tx_ring->wq_size,
 				 &tx_ring->wq_base_dma);
 
-	if ((tx_ring->wq_base == NULL) ||
-	    tx_ring->wq_base_dma & WQ_ADDR_ALIGN) {
-		QPRINTK(qdev, IFUP, ERR, "tx_ring alloc failed.\n");
-		return -ENOMEM;
-	}
+	if ((tx_ring->wq_base == NULL) || tx_ring->wq_base_dma & WQ_ADDR_ALIGN)
+		goto err;
 	tx_ring->q =
 	    kmalloc(tx_ring->wq_len * sizeof(struct tx_ring_desc), GFP_KERNEL);
 	if (tx_ring->q == NULL)
@@ -2676,8 +2673,7 @@ static int ql_alloc_tx_resources(struct
 
 	return 0;
 err:
-	pci_free_consistent(qdev->pdev, tx_ring->wq_size,
-			    tx_ring->wq_base, tx_ring->wq_base_dma);
+	QPRINTK(qdev, IFUP, ERR, "tx_ring alloc failed.\n");
 	return -ENOMEM;
 }
 

^ permalink raw reply

* [patch] Re: qlge driver corrupting kernel memory
From: Mike Galbraith @ 2012-05-13 10:10 UTC (permalink / raw)
  To: Thadeu Lima de Souza Cascardo; +Cc: netdev
In-Reply-To: <1336736301.7361.144.camel@marge.simpson.net>

On Fri, 2012-05-11 at 13:38 +0200, Mike Galbraith wrote: 
> On Tue, 2012-05-08 at 09:07 -0300, Thadeu Lima de Souza Cascardo wrote: 
> > On Tue, May 08, 2012 at 01:00:18PM +0200, Mike Galbraith wrote:
> > > Greetings network wizards,
> > > 
> > > $subject is happening in an 2.6.32 enterprise kernel with the driver
> > > updated to what looks to me to be 2.6.38 or so.
> > > 
> > > Allegedly, IFF boxen are running dual CNAs with storage and LAN sharing
> > > a port, $subject happens fairly regularly.  Rummaging in crashdumps
> > > seems to show corruption happens because we somehow end up stuffing
> > > loads of frags into skb_shared_info, scribbling all over the place.
> > > 
> > > Before I proceed, what I know about skbs can be found here..
> > > 
> > >     http://vger.kernel.org/~davem/skb_data.html
> > > 
> > > ..and that's the sum and total ;-)
> > > 
> > > I guess the first thing I should ask is whether anyone has seen such
> > > scribbling with this driver.  Known issue would be a case of happiness,
> > > but I doubt that will be the case from searching, so onward.
> > > 
> > 
> > Hi, Mike.
> > 
> > From what you describe, I suspect this is related to this fix:
> > 
> > http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=782428535e0819b5b7c9825cd3faa2ad37032a70
> > 
> > Please, apply and report if that works for you.
> 
> Nope, box exploded.  I haven't seen a dump yet, but expect it'll be more
> of the same scribbling.

Something else popped up meanwhile.  Shortly after tx_ring->q order 5
allocation failure and ql_release_adapter_resources(), BUG: Bad page
state has now arrived twice to muddy the water.

[ 3537.150327] Node 0 DMA: 2*4kB 2*8kB 1*16kB 2*32kB 2*64kB 1*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 360kB
[ 3537.150345] Node 0 DMA32: 318*4kB 144*8kB 89*16kB 17*32kB 3*64kB 1*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 4712kB
[ 3537.150364] 5248 total pagecache pages
[ 3537.150367] 211 pages in swap cache
[ 3537.150372] Swap cache stats: add 1437, delete 1226, find 1641/1752
[ 3537.150377] Free swap  = 67109880kB
[ 3537.150381] Total swap = 67111528kB
[ 3537.152314] 73723 pages RAM
[ 3537.152319] 13128 pages reserved
[ 3537.152322] 4910 pages shared
[ 3537.152326] 22795 pages non-shared
[ 3537.152333] qlge 0000:04:00.0: ql_alloc_mem_resources: TX resource allocation failed.
[ 3537.152343] qlge 0000:04:00.0: ql_get_adapter_resources: Unable to  allocate memory.
[ 3537.152499] qlge 0000:04:00.0: ql_set_mac_addr_reg: Adding UNICAST address 00:c0:dd:1a:46:ac at index 0 in the CAM.
[ 3537.440237] BUG: Bad page state in process ifdown-dhcp  pfn:10940
[ 3537.440244] page:ffffea00003a0600 flags:0020000000000000 count:-1 mapcount:0 mapping:(null) index:0
[ 3537.440249] Pid: 4317, comm: ifdown-dhcp Tainted: G           X 2.6.32.54-0.3.1.4242.0.TEST-default #1
[ 3537.440253] Call Trace:
[ 3537.440265]  [<ffffffff810061dc>] dump_trace+0x6c/0x2d0
[ 3537.440271]  [<ffffffff8139b366>] dump_stack+0x69/0x73
[ 3537.440279]  [<ffffffff810badb3>] bad_page+0xe3/0x170
[ 3537.440284]  [<ffffffff810bbedb>] prep_new_page+0xab/0x1b0
[ 3537.440289]  [<ffffffff810bc2e4>] get_page_from_freelist+0x304/0x720
[ 3537.440295]  [<ffffffff810bc9ba>] __alloc_pages_slowpath+0x11a/0x5f0
[ 3537.440300]  [<ffffffff810bcfca>] __alloc_pages_nodemask+0x13a/0x140
[ 3537.440305]  [<ffffffff810bbdd9>] __get_free_pages+0x9/0x50
[ 3537.440314]  [<ffffffff8104ba62>] dup_task_struct+0x42/0x150
[ 3537.440320]  [<ffffffff8104cc54>] copy_process+0xb4/0xe50
[ 3537.440324]  [<ffffffff8104da7c>] do_fork+0x8c/0x3c0
[ 3537.440331]  [<ffffffff81003263>] stub_clone+0x13/0x20
[ 3537.441094] DWARF2 unwinder stuck at stub_clone+0x13/0x20
[ 3537.441097]
[ 3537.441098] Leftover inexact backtrace:
[ 3537.441099]
[ 3537.441103]  [<ffffffff81002f7b>] ? system_call_fastpath+0x16/0x1b
[ 3537.441107] Disabling lock debugging due to kernel taint
[ 3537.899545] bonding: bond0 is being deleted..

qlge: Fix double pci_free_consistent() upon tx_ring->q allocation failure

Let ql_free_tx_resources() do its job.  You are not helping.

Signed-off-by: Mike Galbraith <mgalbraith@suse.de>
---
 drivers/net/qlge/qlge_main.c |   10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

--- a/drivers/net/qlge/qlge_main.c
+++ b/drivers/net/qlge/qlge_main.c
@@ -2664,11 +2664,8 @@ static int ql_alloc_tx_resources(struct
 	    pci_alloc_consistent(qdev->pdev, tx_ring->wq_size,
 				 &tx_ring->wq_base_dma);
 
-	if ((tx_ring->wq_base == NULL) ||
-	    tx_ring->wq_base_dma & WQ_ADDR_ALIGN) {
-		QPRINTK(qdev, IFUP, ERR, "tx_ring alloc failed.\n");
-		return -ENOMEM;
-	}
+	if ((tx_ring->wq_base == NULL) tx_ring->wq_base_dma & WQ_ADDR_ALIGN)
+		goto err;
 	tx_ring->q =
 	    kmalloc(tx_ring->wq_len * sizeof(struct tx_ring_desc), GFP_KERNEL);
 	if (tx_ring->q == NULL)
@@ -2676,8 +2673,7 @@ static int ql_alloc_tx_resources(struct
 
 	return 0;
 err:
-	pci_free_consistent(qdev->pdev, tx_ring->wq_size,
-			    tx_ring->wq_base, tx_ring->wq_base_dma);
+	QPRINTK(qdev, IFUP, ERR, "tx_ring alloc failed.\n");
 	return -ENOMEM;
 }
 

^ permalink raw reply

* Re: [PATCH RFC 1/6] skbuff: support per-page destructors in copy_ubufs
From: Michael S. Tsirkin @ 2012-05-13 10:10 UTC (permalink / raw)
  To: Ian Campbell; +Cc: David Miller, netdev@vger.kernel.org, eric.dumazet@gmail.com
In-Reply-To: <1336802484.3891.24.camel@dagon.hellion.org.uk>

On Sat, May 12, 2012 at 07:01:24AM +0100, Ian Campbell wrote:
> On Fri, 2012-05-11 at 17:30 +0100, Michael S. Tsirkin wrote:
> > On Fri, May 11, 2012 at 03:08:36PM +0300, Michael S. Tsirkin wrote:
> > > On Fri, May 11, 2012 at 11:58:12AM +0100, Ian Campbell wrote:
> > > > On Fri, 2012-05-11 at 10:00 +0100, Ian Campbell wrote:
> > > > > I'm seeing copy_ubufs called in my remote NFS test, which I don't
> > > > > think I expected -- I'll investigate why this is happening today. 
> > > > 
> > > > It's tcp_transmit_skb which can (conditionally) call skb_clone
> > > > (backtrace below)
> > > 
> > > Interesting. I didn't realise we clone skbs on data path:
> > > tcp_write_xmit calls tcp_transmit_skb with clone_it flag.
> > > Could someone comment on why we need to clone on good path
> > > like this?
> > 
> > Hmm, it's in case we need to retransmit it later.
> 
> I wonder if we could avoid the copy_ubuf in this particular clone path
> and have any subsequent calls to copy_ubufs use skb->fclone to determine
> if it can safely replace the frags?
> 
> If it cannot then could it do a full copy of the skb (including new
> shinfo, new frag pages etc) as a fallback?
> 
> Ian.
> 

Yes I think we should call a variant of clone that avoids copy_ubuf on
the first transmit.  But need to be careful we don't access the frag
list while it is being modified.

For example very roughly, maybe we could have copy_ubuf detect
packet clone is queued and take some lock?

On retransmit we could check and if we are not the only clone left
(which should be uncommon) trigger copy ubuf then.

Thoughts?

-- 
MST

^ permalink raw reply

* Re: [PATCH net-next] codel: use Newton method instead of sqrt() and divides
From: Eric Dumazet @ 2012-05-13  7:23 UTC (permalink / raw)
  To: David Miller
  Cc: dave.taht, netdev, nichols, van, codel, ycheng, mattmathis,
	therbert, shemminger, nanditad
In-Reply-To: <20120512.175217.1632102067268101115.davem@davemloft.net>

From: Eric Dumazet <edumazet@google.com>

On Sat, 2012-05-12 at 17:52 -0400, David Miller wrote:

> Ok, fair enough.

Oh well, I sent my mail too late. The error made no sense after a good
night. Also, when Van says something, you can be fairly sure it's right,
and if it's not, then you didn't understand what Van said ;)

16bit precision is OK, once the maths are correctly done in the userland
program I wrote yesterday...

count=16525, precision=16 bits, sqrt(scaled_count)=4113, reciprocal(sq)=1fde240, Newton=1fd0000
  interval/sqrt(16525) = 
	777909 (float compute)  // (u32)(interval/sqrt(count))
	778020 (integer approx) // reciprocal_divide(interval, rec)
	777926 (int_sqrt_div)   // int_sqrt_div(interval, count)
	776672 (Newton approx)  // reciprocal_divide(interval, previnv << shift)

count=9889134, precision=16 bits, sqrt(scaled_count)=50315,
reciprocal(sq)=14d720, Newton=140000
  interval/sqrt(9889134) = 
	31799 (float compute)
	31799 (integer approx)
	31799 (int_sqrt_div)
	30517 (Newton approx)


And kernel code using u16 :

 6a1:	0f b7 72 0a          	movzwl 0xa(%rdx),%esi
 6a5:	8b 3a                	mov    (%rdx),%edi
 6a7:	83 c7 01             	add    $0x1,%edi
 6aa:	c1 e6 10             	shl    $0x10,%esi
 6ad:	89 3a                	mov    %edi,(%rdx)  vars->count++
 6af:	89 ff                	mov    %edi,%edi
 6b1:	89 f6                	mov    %esi,%esi
 6b3:	48 89 f1             	mov    %rsi,%rcx
 6b6:	48 0f af ce          	imul   %rsi,%rcx
 6ba:	48 c1 e9 20          	shr    $0x20,%rcx
 6be:	48 0f af cf          	imul   %rdi,%rcx
 6c2:	48 bf 00 00 00 00 03 	mov    $0x300000000,%rdi
 6c9:	00 00 00 
 6cc:	48 29 cf             	sub    %rcx,%rdi
 6cf:	48 89 f9             	mov    %rdi,%rcx
 6d2:	48 c1 e9 02          	shr    $0x2,%rcx
 6d6:	48 0f af ce          	imul   %rsi,%rcx
 6da:	48 c1 e9 2f          	shr    $0x2f,%rcx
 6de:	66 89 4a 0a          	mov    %cx,0xa(%rdx)


Feel free to add the following cleanup patch, if you like it ;)

Thanks

[PATCH net-next] codel: use u16 field instead of 31bits for rec_inv_sqrt

David pointed out gcc might generate poor code with 31bit fields.

Using u16 is more than enough and permits a better code output.

Also make the code intent more readable using constants, fixed point arithmetic
not being trivial for everybody.

Suggested-by: David Miller <davem@davemloft.net>
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/net/codel.h |   25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/include/net/codel.h b/include/net/codel.h
index bd8747c..7546517 100644
--- a/include/net/codel.h
+++ b/include/net/codel.h
@@ -133,13 +133,17 @@ struct codel_params {
 struct codel_vars {
 	u32		count;
 	u32		lastcount;
-	bool		dropping:1;
-	u32		rec_inv_sqrt:31;
+	bool		dropping;
+	u16		rec_inv_sqrt;
 	codel_time_t	first_above_time;
 	codel_time_t	drop_next;
 	codel_time_t	ldelay;
 };
 
+#define REC_INV_SQRT_BITS (8 * sizeof(u16)) /* or sizeof_in_bits(rec_inv_sqrt) */
+/* needed shift to get a Q0.32 number from rec_inv_sqrt */
+#define REC_INV_SQRT_SHIFT (32 - REC_INV_SQRT_BITS)
+
 /**
  * struct codel_stats - contains codel shared variables and stats
  * @maxpacket:	largest packet we've seen so far
@@ -173,17 +177,18 @@ static void codel_stats_init(struct codel_stats *stats)
  * http://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Iterative_methods_for_reciprocal_square_roots
  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
  *
- * Here, invsqrt is a fixed point number (< 1.0), 31bit mantissa)
+ * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
  */
 static void codel_Newton_step(struct codel_vars *vars)
 {
-	u32 invsqrt = vars->rec_inv_sqrt;
-	u32 invsqrt2 = ((u64)invsqrt * invsqrt) >> 31;
-	u64 val = (3LL << 31) - ((u64)vars->count * invsqrt2);
+	u32 invsqrt = ((u32)vars->rec_inv_sqrt) << REC_INV_SQRT_SHIFT;
+	u32 invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
+	u64 val = (3LL << 32) - ((u64)vars->count * invsqrt2);
 
-	val = (val * invsqrt) >> 32;
+	val >>= 2; /* avoid overflow in following multiply */
+	val = (val * invsqrt) >> (32 - 2 + 1);
 
-	vars->rec_inv_sqrt = val;
+	vars->rec_inv_sqrt = val >> REC_INV_SQRT_SHIFT;
 }
 
 /*
@@ -195,7 +200,7 @@ static codel_time_t codel_control_law(codel_time_t t,
 				      codel_time_t interval,
 				      u32 rec_inv_sqrt)
 {
-	return t + reciprocal_divide(interval, rec_inv_sqrt << 1);
+	return t + reciprocal_divide(interval, rec_inv_sqrt << REC_INV_SQRT_SHIFT);
 }
 
 
@@ -326,7 +331,7 @@ static struct sk_buff *codel_dequeue(struct Qdisc *sch,
 			codel_Newton_step(vars);
 		} else {
 			vars->count = 1;
-			vars->rec_inv_sqrt = 0x7fffffff;
+			vars->rec_inv_sqrt = ~0U >> REC_INV_SQRT_SHIFT;
 		}
 		vars->lastcount = vars->count;
 		vars->drop_next = codel_control_law(now, params->interval,

^ permalink raw reply related

* Re: [PATCH 1/2] Bluetooth: notify userspace of security level change
From: Gustavo Padovan @ 2012-05-13  6:22 UTC (permalink / raw)
  To: linville; +Cc: davem, linux-wireless, linux-bluetooth, linux-kernel, netdev
In-Reply-To: <1336849910-29064-1-git-send-email-gustavo@padovan.org>

Hi,

* Gustavo Padovan <gustavo@padovan.org> [2012-05-12 16:11:49 -0300]:

> When the userspace request a security level change it needs to be notified
> of when the change is complete.
> This patch make the socket non writable while the security request is
> ongoing. If it succeeds POLL_OUT is emitted, otherwise the channel is
> disconnected.

I just sent a second version of these patches that includes comments Johan made
in the last e-mail for better understanding of what the problem really is.

	Gustavo

^ permalink raw reply

* [PATCH 1/2] Bluetooth: notify userspace of security level change
From: Gustavo Padovan @ 2012-05-13  6:20 UTC (permalink / raw)
  To: linville-2XuSBdqkA4R54TAoqtyWWQ
  Cc: davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	linux-bluetooth-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1336849910-29064-2-git-send-email-gustavo-THi1TnShQwVAfugRpC6u6w@public.gmane.org>

It fixes L2CAP socket based security level elevation during a
connection. The HID profile needs this (for keyboards) and it is the only
way to achieve the security level elevation when using the management
interface to talk to the kernel (hence the management enabling patch
being the one that exposes this issue).

It lets userspace request a security level change when the socket is
already connected and creates a way to notify the socket of the result of
the request. At the moment of the request the socket is made non-writable;
if the request fails the connection is closed, otherwise the socket is made
writable again and POLL_OUT is emitted.

Signed-off-by: Gustavo Padovan <gustavo-THi1TnShQwVAfugRpC6u6w@public.gmane.org>
Acked-by: Marcel Holtmann <marcel-kz+m5ild9QBg9hUCZPvPmw@public.gmane.org>
Signed-off-by: Johan Hedberg <johan.hedberg-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
---
 include/net/bluetooth/bluetooth.h |    1 +
 net/bluetooth/af_bluetooth.c      |    2 +-
 net/bluetooth/hci_event.c         |    7 +++++++
 net/bluetooth/l2cap_core.c        |    5 +++++
 net/bluetooth/l2cap_sock.c        |   12 ++++++++----
 5 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index 262ebd1..a65910b 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -191,6 +191,7 @@ struct bt_sock {
 	struct list_head accept_q;
 	struct sock *parent;
 	u32 defer_setup;
+	bool suspended;
 };
 
 struct bt_sock_list {
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 72eb187..6fb68a9 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -450,7 +450,7 @@ unsigned int bt_sock_poll(struct file *file, struct socket *sock, poll_table *wa
 			sk->sk_state == BT_CONFIG)
 		return mask;
 
-	if (sock_writeable(sk))
+	if (!bt_sk(sk)->suspended && sock_writeable(sk))
 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
 	else
 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 7f87a70..ff38cc6 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -2040,6 +2040,12 @@ static inline void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *
 
 		clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags);
 
+		if (ev->status && conn->state == BT_CONNECTED) {
+			hci_acl_disconn(conn, 0x13);
+			hci_conn_put(conn);
+			goto unlock;
+		}
+
 		if (conn->state == BT_CONFIG) {
 			if (!ev->status)
 				conn->state = BT_CONNECTED;
@@ -2050,6 +2056,7 @@ static inline void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *
 			hci_encrypt_cfm(conn, ev->status, ev->encrypt);
 	}
 
+unlock:
 	hci_dev_unlock(hdev);
 }
 
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 38d934a..c073533 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -4590,6 +4590,11 @@ int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
 
 		if (!status && (chan->state == BT_CONNECTED ||
 						chan->state == BT_CONFIG)) {
+			struct sock *sk = chan->sk;
+
+			bt_sk(sk)->suspended = false;
+			sk->sk_state_change(sk);
+
 			l2cap_check_encryption(chan, encrypt);
 			l2cap_chan_unlock(chan);
 			continue;
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index 29122ed..04e7c17 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -592,10 +592,14 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, ch
 			sk->sk_state = BT_CONFIG;
 			chan->state = BT_CONFIG;
 
-		/* or for ACL link, under defer_setup time */
-		} else if (sk->sk_state == BT_CONNECT2 &&
-					bt_sk(sk)->defer_setup) {
-			err = l2cap_chan_check_security(chan);
+		/* or for ACL link */
+		} else if ((sk->sk_state == BT_CONNECT2 &&
+			   bt_sk(sk)->defer_setup) ||
+			   sk->sk_state == BT_CONNECTED) {
+			if (!l2cap_chan_check_security(chan))
+				bt_sk(sk)->suspended = true;
+			else
+				sk->sk_state_change(sk);
 		} else {
 			err = -EINVAL;
 		}
-- 
1.7.10.1

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox