Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [omega-g1:11110] Re: [PATCH] net: configurable sysctl parameter "net.core.tcp_lowat" for sk_stream_min_wspace()
From: David Miller @ 2011-09-09  2:17 UTC (permalink / raw)
  To: jun.kondo
  Cc: linux-kernel, omega-g1, notsuki, motokazu.kozaki, htaira, netdev,
	tomohiko.takahashi, kotaro.sakai, ken.sugawara
In-Reply-To: <4E696D06.3000003@ctc-g.co.jp>

From: "Jun.Kondo" <jun.kondo@ctc-g.co.jp>
Date: Fri, 09 Sep 2011 10:33:58 +0900

> - In normal situation, acquire large default transmission
>   buffer value, and ensure high throughput from the
>   beginning of tcp connection

You should never do this.  You should use the default buffer sizes and
as a result the kernel's TCP stack automatically adjusts the send and
receive buffers in response to the link characteristics.

When you set explicit buffer sizes, this turns off the TCP stack's
auto-tuning mechanism.

Every argument made in support of your proposed feature is based upon
a false premise of one kind of another, and this is yet another example
of this.

^ permalink raw reply

* Re: [PATCH v2 1/9] per-netns ipv4 sysctl_tcp_mem
From: KAMEZAWA Hiroyuki @ 2011-09-09  2:47 UTC (permalink / raw)
  To: Glauber Costa
  Cc: linux-kernel, linux-mm, containers, netdev, xemul,
	David S. Miller, Eric W. Biederman
In-Reply-To: <1315369399-3073-2-git-send-email-glommer@parallels.com>

On Wed,  7 Sep 2011 01:23:11 -0300
Glauber Costa <glommer@parallels.com> wrote:

> This patch allows each namespace to independently set up
> its levels for tcp memory pressure thresholds. This patch
> alone does not buy much: we need to make this values
> per group of process somehow. This is achieved in the
> patches that follows in this patchset.
> 
> Signed-off-by: Glauber Costa <glommer@parallels.com>
> CC: David S. Miller <davem@davemloft.net>
> CC: Hiroyouki Kamezawa <kamezawa.hiroyu@jp.fujitsu.com>
> CC: Eric W. Biederman <ebiederm@xmission.com>


Hmm, it may be better to post this patch as independent one.

I'm not familiar with this area...but try review ;)

> ---
>  include/net/netns/ipv4.h   |    1 +
>  include/net/tcp.h          |    1 -
>  net/ipv4/sysctl_net_ipv4.c |   51 +++++++++++++++++++++++++++++++++++++------
>  net/ipv4/tcp.c             |   13 +++-------
>  4 files changed, 49 insertions(+), 17 deletions(-)
> 
> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> index d786b4f..bbd023a 100644
> --- a/include/net/netns/ipv4.h
> +++ b/include/net/netns/ipv4.h
> @@ -55,6 +55,7 @@ struct netns_ipv4 {
>  	int current_rt_cache_rebuild_count;
>  
>  	unsigned int sysctl_ping_group_range[2];
> +	long sysctl_tcp_mem[3];
>  
>  	atomic_t rt_genid;
>  	atomic_t dev_addr_genid;

Hmm, in original placement, sysctl_tcp_mem[] was on __read_mostly
area. Doesn't this placement cause many cache invalidations ?



> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 149a415..6bfdd9b 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -230,7 +230,6 @@ extern int sysctl_tcp_fack;
>  extern int sysctl_tcp_reordering;
>  extern int sysctl_tcp_ecn;
>  extern int sysctl_tcp_dsack;
> -extern long sysctl_tcp_mem[3];
>  extern int sysctl_tcp_wmem[3];
>  extern int sysctl_tcp_rmem[3];
>  extern int sysctl_tcp_app_win;
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index 69fd720..0d74b9d 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -14,6 +14,7 @@
>  #include <linux/init.h>
>  #include <linux/slab.h>
>  #include <linux/nsproxy.h>
> +#include <linux/swap.h>
>  #include <net/snmp.h>
>  #include <net/icmp.h>
>  #include <net/ip.h>
> @@ -174,6 +175,36 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
>  	return ret;
>  }
>  
> +static int ipv4_tcp_mem(ctl_table *ctl, int write,
> +			   void __user *buffer, size_t *lenp,
> +			   loff_t *ppos)
> +{
> +	int ret;
> +	unsigned long vec[3];
> +	struct net *net = current->nsproxy->net_ns;
> +	int i;
> +
> +	ctl_table tmp = {
> +		.data = &vec,
> +		.maxlen = sizeof(vec),
> +		.mode = ctl->mode,
> +	};
> +
> +	if (!write) {
> +		ctl->data = &net->ipv4.sysctl_tcp_mem;
> +		return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos);
> +	}
> +
> +	ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
> +	if (ret)
> +		return ret;
> +
> +	for (i = 0; i < 3; i++)
> +		net->ipv4.sysctl_tcp_mem[i] = vec[i];
> +
> +	return 0;
> +}
> +
>  static struct ctl_table ipv4_table[] = {
>  	{
>  		.procname	= "tcp_timestamps",
> @@ -433,13 +464,6 @@ static struct ctl_table ipv4_table[] = {
>  		.proc_handler	= proc_dointvec
>  	},
>  	{
> -		.procname	= "tcp_mem",
> -		.data		= &sysctl_tcp_mem,
> -		.maxlen		= sizeof(sysctl_tcp_mem),
> -		.mode		= 0644,
> -		.proc_handler	= proc_doulongvec_minmax
> -	},
> -	{
>  		.procname	= "tcp_wmem",
>  		.data		= &sysctl_tcp_wmem,
>  		.maxlen		= sizeof(sysctl_tcp_wmem),
> @@ -721,6 +745,12 @@ static struct ctl_table ipv4_net_table[] = {
>  		.mode		= 0644,
>  		.proc_handler	= ipv4_ping_group_range,
>  	},
> +	{
> +		.procname	= "tcp_mem",
> +		.maxlen		= sizeof(init_net.ipv4.sysctl_tcp_mem),
> +		.mode		= 0644,
> +		.proc_handler	= ipv4_tcp_mem,
> +	},
>  	{ }
>  };
>  




> @@ -734,6 +764,7 @@ EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
>  static __net_init int ipv4_sysctl_init_net(struct net *net)
>  {
>  	struct ctl_table *table;
> +	unsigned long limit;
>  
>  	table = ipv4_net_table;
>  	if (!net_eq(net, &init_net)) {
> @@ -769,6 +800,12 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
>  
>  	net->ipv4.sysctl_rt_cache_rebuild_count = 4;
>  
> +	limit = nr_free_buffer_pages() / 8;
> +	limit = max(limit, 128UL);
> +	net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
> +	net->ipv4.sysctl_tcp_mem[1] = limit;
> +	net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
> +
>  	net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
>  			net_ipv4_ctl_path, table);
>  	if (net->ipv4.ipv4_hdr == NULL)
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 46febca..f06df24 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -266,6 +266,7 @@
>  #include <linux/crypto.h>
>  #include <linux/time.h>
>  #include <linux/slab.h>
> +#include <linux/nsproxy.h>
>  
>  #include <net/icmp.h>
>  #include <net/tcp.h>
> @@ -282,11 +283,9 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
>  struct percpu_counter tcp_orphan_count;
>  EXPORT_SYMBOL_GPL(tcp_orphan_count);
>  
> -long sysctl_tcp_mem[3] __read_mostly;
>  int sysctl_tcp_wmem[3] __read_mostly;
>  int sysctl_tcp_rmem[3] __read_mostly;
>  
> -EXPORT_SYMBOL(sysctl_tcp_mem);
>  EXPORT_SYMBOL(sysctl_tcp_rmem);
>  EXPORT_SYMBOL(sysctl_tcp_wmem);
>  
> @@ -3277,14 +3276,10 @@ void __init tcp_init(void)
>  	sysctl_tcp_max_orphans = cnt / 2;
>  	sysctl_max_syn_backlog = max(128, cnt / 256);
>  
> -	limit = nr_free_buffer_pages() / 8;
> -	limit = max(limit, 128UL);
> -	sysctl_tcp_mem[0] = limit / 4 * 3;
> -	sysctl_tcp_mem[1] = limit;
> -	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
> -
>  	/* Set per-socket limits to no more than 1/128 the pressure threshold */
> -	limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
> +	limit = (unsigned long)init_net.ipv4.sysctl_tcp_mem[1];
> +	limit <<= (PAGE_SHIFT - 7);
> +

I'm not sure but...why defined as 'long'  ?



BTW, when I grep,

tcp_input.c:        atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
tcp_input.c:    if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])

Don't you need to change this ?



Thanks,
-Kame







--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [net-next-2.6 PATCH 0/3 RFC] macvlan: MAC Address filtering support for passthru mode
From: Roopa Prabhu @ 2011-09-09  2:53 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: netdev, dragos.tatulea, arnd, dwang2, benve, kaber, sri, davem,
	eric.dumazet, mchan, kvm
In-Reply-To: <20110908182509.GB469@redhat.com>




On 9/8/11 12:11 PM, "Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Thu, Sep 08, 2011 at 09:19:32AM -0700, Roopa Prabhu wrote:
>>>>> There are more features we'll want down the road though,
>>>>> so let's see whether the interface will be able to
>>>>> satisfy them in a backwards compatible way before we
>>>>> set it in stone. Here's what I came up with:
>>>>> 
>>>>> How will the filtering table be partitioned within guests?
>>>> 
>>>> Since this patch supports macvlan PASSTHRU mode only, in which the lower
>>>> device has 1-1 mapping to the guest nic, it does not require any
>>>> partitioning of filtering table within guests. Unless I missed
>>>> understanding
>>>> something. 
>>>> If the lower device were being shared by multiple guest network interfaces
>>>> (non PASSTHRU mode), only then we will need to maintain separate filter
>>>> tables for each guest network interface in macvlan and forward the pkt to
>>>> respective guest interface after a filter lookup. This could affect
>>>> performance too I think.
>>> 
>>> Not with hardware filtering support. Which is where we'd need to
>>> partition the host nic mac table between guests.
>>> 
>> I need to understand this more. In non passthru case when a VF or physical
>> nic is shared between guests,
> 
> For example, consider a VF given to each guest. Hardware supports a fixed
> total number of filters, which can be partitioned between VFs.
> 
O ok. But hw maintains VF filters separately for every VF as far as I know.
Filters received on a VF are programmed for that VF only. Am assuming all
hardware do this. Atleast our hardware does this.
What I was referring to was a single VF shared between guests using macvtap
(could be bridge mode for example). All guests sharing the VF will register
 filters with the VF via macvlan. Hw makes sure what ever the VF asked for
is received at the VF. VF in hw does not know that it is shared by guests.
Only at macvlan we might need to re-filter the pkts received on the VF and
steer pkts to the individual guests based on what they asked for.


>> the nic does not really know about the guests,
>> so I was thinking we do the same thing as we do for the passthru case (ie
>> send all the address filters from macvlan to the physical nic). So at the
>> hardware, filtering is done for all guests sharing the nic. But if we want
>> each virtio-net nic or guest to get exactly what it asked for
>> macvlan/macvtap needs to maintain a copy of each guest filter and do a
>> lookup and send only the requested traffic to the guest. Here is the
>> performance hit that I was seeing. Please see my next comment for further
>> details. 
> 
> It won't be any slower than attaching a non-passthrough macvlan
> to a device, will it?
> 
Am not sure. The filter lookup in macvlan is the one I am concerned about.
Will need to try it out.

>> 
>>>> I chose to support PASSTHRU Mode only at first because its simpler and all
>>>> code additions are in control path only.
>>> 
>>> I agree. It would be a bit silly to have a dedicated interface
>>> for passthough and a completely separate one for
>>> non passthrough.
>>> 
>> Agree. The reason I did not focus on non-passthru case in the initial
>> version was because I was thinking things to do in the non-passthru case
>> will be just add-ons to the passthru case. But true Better to flush out the
>> non-pasthru case details.
>> 
>> After dwelling on this a bit more how about the below:
>> 
>> Phase 1: Goal: Enable hardware filtering for all macvlan modes
>>     - In macvlan passthru mode the single guest virtio-nic connected will
>>       receive traffic that he requested for
>>     - In macvlan non-passthru mode all guest virtio-nics sharing the
>>       physical nic will see all other guest traffic
>>       but the filtering at guest virtio-nic
> 
> I don't think guests currently filter anything.
> 
I was referring to Qemu-kvm virtio-net in
virtion_net_receive->receive_filter. I think It only passes pkts that the
guest OS is interested. It uses the filter table that I am passing to
macvtap in this patch.


>>       will make sure each guest
>>       eventually sees traffic he asked for. This is still better than
>>       putting the physical nic in promiscuous mode.
>> 
>> (This is mainly what my patch does...but will need to remove the passthru
>> check and see if there are any thing else needed for non-passthru case)
> 
> I'm fine with sticking with passthrough, make non passthrough
> a separate phase.
> 
Ok.

>> 
>> Phase 2: Goal: Enable filtering at macvlan so that each guest virtio-nic
>> receives only what he requested for.
>>     - In this case, in addition to pushing the filters down to the physical
>>       nic we will have to maintain the same filter in macvlan and do a filter
>>       lookup before forwarding the traffic to a virtio-nic.
>> 
>> But I am thinking phase 2 might be redundant given virtio-nic already does
>> filtering for the guest.
> 
> It does? Do you mean the filter that qemu does in userspace?
> 
Yes I meant the filter that qemu does in userspace qemu-kvm/hw/virtio-net.c
receive_filter(). 

>> In which case we might not need phase 2 at all. I
>> might have been over complicating things.
>> 
>> Please comment. And please correct if I missed something.
>>  
>>  
>>>>> 
>>>>> A way to limit what the guest can do would also be useful.
>>>>> How can this be done? selinux?
>>>> 
>>>> I vaguely remember a thread on the same context.. had a suggestion to
>>>> maintain pre-approved address lists and allow guest filter registration of
>>>> only those addresses for security. This seemed reasonable. Plus the ability
>>>> to support additional address registration from guest could be made
>>>> configurable (One of your ideas again from prior work).
>>>> 
>>>> I am not an selinux expert, but I am thinking we can use it to only allow
>>>> or
>>>> disallow access or operations to the macvtap device. (?). I will check more
>>>> on this.
>>> 
>>> We'd have to have a way to revoke that as well.
>>> 
>> Yes true.
>> 
>> 
>>>>> 
>>>>> Any thoughts on spoofing filtering?
>>>> 
>>>> I can only think of checking addresses against an allowed address list.
>>>> Don't know of any other ways. Any hints ?
>>> 
>>> Hardware (esp SRIOV) often has ways to do this check, too.
>>> 
>> Yes correct. Hw sriov and even switch in 802.1Qbh has anti-spoofing feature.
>> In which case I am thinking having It at the macvtap layer is not an
>> absolute must (?).
> 
> Exactly. But let's figure out *how* it will be programmed.
> If anti-spoofing is programmed with netlink, maybe that's
> a better interface for rx filter too, for consistency.
> 
I will check sriov vfs. But I know our hw does not have an interface exposed
from the driver. We do it through the management s/w at the switch port
(this is 802.1Qbh). I will check other sriov nics. I don't think intel has a
netlink interface either. But I will double check.


>>>> 
>>>> In any case I am assuming all the protection/security measures should be
>>>> taken at the layer calling the TUNSETTXFILTER ie..In macvtap virtualization
>>>> use case its libvirt or qemu-kvm. No ?
>>> 
>>> Ideally we'd have a way to separate these capabilities, so that libvirt
>>> can override qemu.
>>> 
>>>>> 
>>>>> Would it be possible to make the filtering programmable
>>>>> using netlink, e.g. ethtool, ip, or some such?
>>>> 
>>>> Should be possible via ethtool or ip calling ioctl TUNSETTXFILTER. Are you
>>>> thinking of macvlan having a netlink interface to set filter and not ioctl
>>>> ?. Sure.
>>> 
>>> Yes.
>>> 
>>>> But I was thinking the point of implementing TUNSETTXFILTER was to
>>>> maintain compatibility with the generic tap interface that does the same
>>>> thing. 
>>> 
>>> Yes. OTOH I don't think anyone uses that ATM so it might not
>>> be important if it's not a good fit.
>>> E.g. we could notify libvirt and have it use netlink for us
>>> if we like that better.
>>> 
>> Ok thanks for clarifying that. One more reason to use TUNSETTXFILTER
>> interface was for qemu-kvm who uses the same tap interface for macvtap and
>> regular tap. So if we use netlink we have to do different things for macvtap
>> and tap filters in qemu. And qemu-kvm does not distinguish between macvtap
>> and tap as far as I know. No ?
> 
> It's not a question of simplifying qemu as much as trying to
> make the kernel interface abstract device differences
> away from users. Using same interface for tun and macvtap
> gave us some confidence that the interface is a good one.
> 
> But this does not seem to have worked with TUNSETTXFILTER -
> at least qemu doesn't use it yet, and it's been upstream
> a while. So there's no proof it's a good interface.

All qemu-kvm patches (not upstream yet) that have been floating around and
which I have been testing with use TUNSETTXFILTER. And tun kernel driver
does support filtering using TUNSETTXFILTER. It has code that does filtering
based on filter received using TUNSETTXFILTER.


But true we don't have to stick with TUNSETTXFILTER. I will explore the
netlink interface and research pros and cons and sketch the interface and
get back. 

> 
> So if we decide netlink is a better interface we can add it for tun too.
> We need to be backwards compatible and figure out what happens if someone
> tries to use both methods: probably apply both or ignore TUNSETTXFILTER ...
> 
>> 
>> Thanks you for your review and comments.
>> 
>> 
>>>> And having both the netlink op and ioctl interface might not be clean ?.
>>> 
>>> No idea.
>>> 
>>>> Sorry if I misunderstood your question.
>>>> 
>>>>> That would make this useful for bridged setups besides
>>>>> macvtap/virtualization.
>>>>> 
>>>> 
>>>> Thanks for the comments.
> 
> Overall good progress, and don't let the interface discussions
> block you. You want to push in two directions - stabilize code in one
> branch, and play with interfaces in another one. By the time there's a
> concensus on the interfaces you have the main logic all ready,
> then you merge. 

ok thanks michael! 

- Roopa



^ permalink raw reply

* Re: [net-next-2.6 PATCH 0/3 RFC] macvlan: MAC Address filtering support for passthru mode
From: Roopa Prabhu @ 2011-09-09  3:00 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Sridhar Samudrala, netdev, dragos.tatulea, arnd, dwang2, benve,
	kaber, davem, eric.dumazet, mchan, kvm
In-Reply-To: <20110908193332.GA2476@redhat.com>

On 9/8/11 12:33 PM, "Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Thu, Sep 08, 2011 at 12:23:56PM -0700, Roopa Prabhu wrote:
>>> 
>>> I think the main usecase for passthru mode is to assign a SR-IOV VF to
>>> a single guest.
>>> 
>> Yes and for the passthru usecase this patch should be enough to enable
>> filtering in hw (eventually like I indicated before I need to fix vlan
>> filtering too).
> 
> So with filtering in hw, and in sriov VF case, VFs
> actually share a filtering table. How will that
> be partitioned?

AFAIK, though it might maintain a single filter table space in hw, hw does
know which filter belongs to which VF. And the OS driver does not need to do
anything special. The VF driver exposes a VF netdev. And any uc/mc addresses
registered with a VF netdev are registered with the hw by the driver. And hw
will filter and send only pkts that the VF has expressed interest in.

No special filter partitioning in hw is required.

Thanks,
Roopa

^ permalink raw reply

* Re: [PATCH v2 6/9] per-cgroup tcp buffers control
From: KAMEZAWA Hiroyuki @ 2011-09-09  3:12 UTC (permalink / raw)
  To: Glauber Costa
  Cc: linux-kernel, linux-mm, containers, netdev, xemul,
	David S. Miller, Eric W. Biederman
In-Reply-To: <1315369399-3073-7-git-send-email-glommer@parallels.com>

On Wed,  7 Sep 2011 01:23:16 -0300
Glauber Costa <glommer@parallels.com> wrote:

> With all the infrastructure in place, this patch implements
> per-cgroup control for tcp memory pressure handling.
> 
> Signed-off-by: Glauber Costa <glommer@parallels.com>
> CC: David S. Miller <davem@davemloft.net>
> CC: Hiroyouki Kamezawa <kamezawa.hiroyu@jp.fujitsu.com>
> CC: Eric W. Biederman <ebiederm@xmission.com>

Hmm, then, kmem_cgroup.c is just a caller of plugins implemented
by other components ?

> ---
>  include/linux/kmem_cgroup.h |    7 ++++
>  include/net/sock.h          |   10 ++++++-
>  mm/kmem_cgroup.c            |   10 ++++++-
>  net/core/sock.c             |   18 +++++++++++
>  net/ipv4/tcp.c              |   67 +++++++++++++++++++++++++++++++++++++-----
>  5 files changed, 102 insertions(+), 10 deletions(-)
> 
> diff --git a/include/linux/kmem_cgroup.h b/include/linux/kmem_cgroup.h
> index d983ba8..89ad0a1 100644
> --- a/include/linux/kmem_cgroup.h
> +++ b/include/linux/kmem_cgroup.h
> @@ -23,6 +23,13 @@
>  struct kmem_cgroup {
>  	struct cgroup_subsys_state css;
>  	struct kmem_cgroup *parent;
> +
> +#ifdef CONFIG_INET
> +	int tcp_memory_pressure;
> +	atomic_long_t tcp_memory_allocated;
> +	struct percpu_counter tcp_sockets_allocated;
> +	long tcp_prot_mem[3];
> +#endif
>  };

I think you should place 'read-mostly' values carefully.


>  
> diff --git a/include/net/sock.h b/include/net/sock.h
> index ab65640..91424e3 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -64,6 +64,7 @@
>  #include <net/dst.h>
>  #include <net/checksum.h>
>  
> +int sockets_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
>  /*
>   * This structure really needs to be cleaned up.
>   * Most of it is for TCP, and not used by any of
> @@ -814,7 +815,14 @@ struct proto {
>  	int			*(*memory_pressure)(struct kmem_cgroup *sg);
>  	/* Pointer to the per-cgroup version of the the sysctl_mem field */
>  	long			*(*prot_mem)(struct kmem_cgroup *sg);
> -
> +	/*
> +	 * cgroup specific initialization function. Called once for all
> +	 * protocols that implement it, from cgroups populate function.
> +	 * This function has to setup any files the protocol want to
> +	 * appear in the kmem cgroup filesystem.
> +	 */
> +	int			(*init_cgroup)(struct cgroup *cgrp,
> +					       struct cgroup_subsys *ss);
>  	int			*sysctl_wmem;
>  	int			*sysctl_rmem;
>  	int			max_header;
> diff --git a/mm/kmem_cgroup.c b/mm/kmem_cgroup.c
> index 7950e69..5e53d66 100644
> --- a/mm/kmem_cgroup.c
> +++ b/mm/kmem_cgroup.c
> @@ -17,16 +17,24 @@
>  #include <linux/cgroup.h>
>  #include <linux/slab.h>
>  #include <linux/kmem_cgroup.h>
> +#include <net/sock.h>
>  
>  static int kmem_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
>  {
> -	return 0;
> +	int ret = 0;
> +#ifdef CONFIG_NET
> +	ret = sockets_populate(ss, cgrp);
> +#endif

CONFIG_INET ?

> +	return ret;
>  }
>  
>  static void
>  kmem_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
>  {
>  	struct kmem_cgroup *cg = kcg_from_cgroup(cgrp);
> +#ifdef CONFIG_INET
> +	percpu_counter_destroy(&cg->tcp_sockets_allocated);
> +#endif
>  	kfree(cg);
>  }
>  
> diff --git a/net/core/sock.c b/net/core/sock.c
> index ead9c02..9d833cf 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -134,6 +134,24 @@
>  #include <net/tcp.h>
>  #endif
>  
> +static DEFINE_RWLOCK(proto_list_lock);
> +static LIST_HEAD(proto_list);
> +
> +int sockets_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
> +{
> +	struct proto *proto;
> +	int ret = 0;
> +
> +	read_lock(&proto_list_lock);
> +	list_for_each_entry(proto, &proto_list, node) {
> +		if (proto->init_cgroup)
> +			ret |= proto->init_cgroup(cgrp, ss);
> +	}
> +	read_unlock(&proto_list_lock);
> +
> +	return ret;
> +}

Hmm, I don't understand this part but...

	ret |= ...

and no 'undo'  ? If no 'undo', ->init_cgroup() should success
always and no return value is required. 





> +
>  /*
>   * Each address family might have different locking rules, so we have
>   * one slock key per address family:
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 76f03ed..0725dc4 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -289,13 +289,6 @@ int sysctl_tcp_rmem[3] __read_mostly;
>  EXPORT_SYMBOL(sysctl_tcp_rmem);
>  EXPORT_SYMBOL(sysctl_tcp_wmem);
>  
> -atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
> -
> -/*
> - * Current number of TCP sockets.
> - */
> -struct percpu_counter tcp_sockets_allocated;
> -
>  /*
>   * TCP splice context
>   */
> @@ -305,13 +298,68 @@ struct tcp_splice_state {
>  	unsigned int flags;
>  };
>  
> +#ifdef CONFIG_CGROUP_KMEM
>  /*
>   * Pressure flag: try to collapse.
>   * Technical note: it is used by multiple contexts non atomically.
>   * All the __sk_mem_schedule() is of this nature: accounting
>   * is strict, actions are advisory and have some latency.
>   */
> -int tcp_memory_pressure __read_mostly;
> +void tcp_enter_memory_pressure(struct sock *sk)
> +{
> +	struct kmem_cgroup *sg = sk->sk_cgrp;
> +	if (!sg->tcp_memory_pressure) {
> +		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
> +		sg->tcp_memory_pressure = 1;
> +	}
> +}
> +
> +long *tcp_sysctl_mem(struct kmem_cgroup *sg)
> +{
> +	return sg->tcp_prot_mem;
> +}
> +
> +atomic_long_t *memory_allocated_tcp(struct kmem_cgroup *sg)
> +{
> +	return &(sg->tcp_memory_allocated);
> +}
> +
> +int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
> +{
> +	struct kmem_cgroup *sg = kcg_from_cgroup(cgrp);
> +	unsigned long limit;
> +	struct net *net = current->nsproxy->net_ns;
> +
> +	sg->tcp_memory_pressure = 0;
> +	atomic_long_set(&sg->tcp_memory_allocated, 0);
> +	percpu_counter_init(&sg->tcp_sockets_allocated, 0);
> +
> +	limit = nr_free_buffer_pages() / 8;
> +	limit = max(limit, 128UL);
> +
> +	sg->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0];
> +	sg->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1];
> +	sg->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2];
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL(tcp_init_cgroup);
> +
> +int *memory_pressure_tcp(struct kmem_cgroup *sg)
> +{
> +	return &sg->tcp_memory_pressure;
> +}
> +
> +struct percpu_counter *sockets_allocated_tcp(struct kmem_cgroup *sg)
> +{
> +	return &sg->tcp_sockets_allocated;
> +}
> +#else
> +
> +/* Current number of TCP sockets. */
> +struct percpu_counter tcp_sockets_allocated;
> +atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
> +int tcp_memory_pressure;
>  

you dropped __read_mostly.

Thanks,
-kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH] per-cgroup tcp buffer limitation
From: Glauber Costa @ 2011-09-09  4:17 UTC (permalink / raw)
  To: Greg Thelen
  Cc: linux-kernel, linux-mm, containers, netdev, xemul,
	David S. Miller, Hiroyouki Kamezawa, Eric W. Biederman,
	Suleiman Souhlal
In-Reply-To: <CAHH2K0YcXMUfd1Zr=f5a4=X9cPPp8NZiuichFXaOo=kVp5rRJA@mail.gmail.com>

On 09/08/2011 06:53 PM, Greg Thelen wrote:
> On Wed, Sep 7, 2011 at 9:44 PM, Glauber Costa<glommer@parallels.com>  wrote:
>
> Thanks for your ideas and patience.

Likewise. It is turning out to be a very fruitful
discussion.

>> Well, it is a way to see this. The other way to see this, is that you're
>> proposing to move to the kernel, something that really belongs in userspace.
>> That's because:
>>
>> With the information you provided me, I have no reason to believe that the
>> kernel has more condition to do this work. Do the kernel have access to any
>> information that userspace do not, and can't be exported? If not, userspace
>> is traditionally where this sort of stuff has been done.
>
> I think direct reclaim is a pain if user space is required to participate in
> memory balancing decisions.
It depends on the decision.

> One thing a single memory limit solution has is the
> ability to reclaim user memory to satisfy growing kernel memory needs (and vise
> versa).

this works for a strict definition of the word "needs". If I *need*
more kernel memory, I'd be happy to have more. But if I just want to
screw other containers, they will be happy if I don't get what I
"need" - it can be unreclaimable. Since those are limits, they are
expected to be, in any real setups, greater than any use people
should be doing, and yet, prevent bad usage scenarios.

> If a container must fit within 100M, then a single limit solution
> would set the limit to 100M and never change it.  In a split limit solution a
> user daemon (e.g. uswapd) would need to monitor the usage and the amount of
> active memory vs inactive user memory and unreferenced kernel memory to
> determine where to apply pressure.

Or it can just define some parameters and let the kernel do the
rest. Like for instance, a maximum proportion allowed, a maximum
proportion desired, etc.

> With some more knobs such a uswapd could
> attempt to keep ahead of demand.  But eventually direct reclaim would
> be needed to satisfy rapid growth spikes.  Example: If the 100M container
> starts with limits of 20M kmem and 80M user memory but later its kernel
> memory needs grow to 70M.  With separate user and kernel memory
> limits the kernel memory allocation could fail despite there being
> reclaimable user pages available.

No no, this is a ratio, not a *limit*. A limit is something you
should not be allowed to go over. A good limit of kernel memory for
a 100 Mb container could be something like... 100 mb. But there is
more to that.

Risking being a bit polemic here, I think that when we do
containers, we have to view the kernel a little bit like a shared 
resource that is not accounted to anybody. It is easy to account things 
like tcp buffer, but who do you account page tables for shared pages to? 
Or pinned dentries for shared filesystems ?

Being the shared table under everybody, the kernel is more or less
like buffers inside a physical hdd, or cache lines. You know it is
there, you know it has a size, but provided you have some sane
protections, you don't really care - because in most cases you can't
- who is using it.

> The job should have a way to
> transition to memory limits to 70M+ kernel and 30M- of user.

Yes, and I don't see how what I propose prevents that.

> I suppose a GFP_WAIT slab kernel page allocation could wakeup user space to
> perform user-assisted direct reclaim.  User space would then lower the user
> limit thereby causing the kernel to direct reclaim user pages, then
> the user daemon would raise the kernel limit allowing the slab allocation to
> succeed.  My hunch is that this would be prone to deadlocks (what prevents
> uswapd from needing more even more kmem?)  I'll defer to more
> experienced minds to know if user assisted direct memory reclaim has
> other pitfalls.  It scares me.

Good that it scares you, it should. OTOH, userspace being
able to set parameters to it, has nothing scary at all. A daemon in
userspace can detect that you need more kernel space memory , and
then - according to a policy you abide to - write to a file allowing
it more, or maybe not - according to that same policy. It is very
far away from "userspace driven reclaim".

> Fundamentally I have no problem putting an upper bound on a cgroup's resource
> usage.  This serves to contain the damage a job can do to the system and other
> jobs.  My concern is about limiting the kernel's ability to trade one type of
> memory for another by using different cgroups for different types of memory.
Yes, but limits have nothing to do with it.

>
> If kmem expands to include reclaimable kernel memory (e.g. dentry) then I
> presume the kernel would have no way to exchange unused user pages for dentry
> pages even if the user memory in the container is well below its limit.  This is
> motivation for the above user assisted direct reclaim.

Dentry is not always reclaimable. If it is pinned, it is non
reclaimable. Speaking of it, Would you take a look at
https://lkml.org/lkml/2011/8/14/110 ?

I am targetting dentry as well. But since it is hard to assign a
dentry to a process all the time, going through a different path. I
however, haven't entirely given up of doing it cgroups based, so any
ideas are welcome =)

> Do you feel the need to segregate user and kernel memory into different cgroups
> with independent limits?  Or is this this just a way to create a new clean
> cgroup with a simple purpose?

No, I don't necessarily feel that need. I just thought it was
cleaner to have entities with different purposes in different
cgroups. If moving it to the memory controller would help you in any
way, I can just do it. 80 % of this work is independent of where a
cgroup file lives.

> In some resource sharing shops customers purchase a certain amount of memory,
> cpu, network, etc.  Such customers don't define how the memory is used and the
> user/kernel mixture may change over time.  Can a user space reclaim daemon stay
> ahead of the workloads needs?

If you think solely about limits, you don't need to. The most
sane policy is actually "I don't care what is the kernel/user ratio,
as long as the kernel never grows over X Mb".

>> Using userspace CPU is no different from using kernel cpu in this particular
>> case. It is all overhead, regardless where it comes from. Moreover, you end
>> up setting up a policy, instead of a mechanism. What should be this
>> proportion?  Do we reclaim everything with the same frequency? Should we be
>> more tolerant with a specific container?
>
> I assume that this implies that a generic kmem cgroup usage is inferior to
> separate limits for each kernel memory type to allow user space the flexibility
> to choose between kernel types (udp vs tcp vs ext4 vs page_tables vs ...)?  Do
> you foresee a way to provide a limit on the total amount of kmem usage by all
> such types?  If a container wants to dedicate 4M for all network protocol
> buffers (tcp, udp, etc.) would that require a user space daemon to balance
> memory limits b/w the protocols?

Well, I am giving this an extra thought... Having separate knobs
adds flexibility, but - as usual - also complexity. For the goals I
have in mind, "kernel memory" would work just as fine.

If you look carefully at the other patches in the series besides
this one, you'll see that it is just a matter of billing from kernel
memory instead of tcp-memory, and then all the rest is the same.

Do you think that a single kernel-memory knob would be better for
your needs? I am willing to give it a try.
>> Also, If you want to allow any flexibility in this scheme, like: "Should
>> this network container be able to stress the network more, pinning more
>> memory, but not other subsystems?", you end up having to touch all
>> individual files anyway - probably with a userspace daemon.
>>
>> Also, as you noticed yourself, kernel memory is fundamentally different from
>> userspace memory. You can't just set reclaim limits, since you have no
>> guarantees it will work. User memory is not a scarce resource.
>> Kernel memory is.
>
> I agree that kernel memory is somewhat different.  In some (I argue most)
> situations containers want the ability to exchange job kmem and job umem.
> Either split or combined accounting protects the system and isolates other
> containers from kmem allocations of a bad job.  To me it seems natural to
> indicate that job X gets Y MB of memory.  I have more trouble dividing the
> Y MB of memory into dedicated slices for different types of memory.

I understand. And I don't think anyone doing containers
should be mandated to define a division. But a limit...
>>> While there are people (like me) who want a combined memory usage
>>> limit there are also people (like you) who want separate user and
>>> kernel limiting.
>>
>> Combined excludes separate. Separate does not exclude combined.
>
> I agree.  I have no problem with separate accounting and separate
> user-accessible pressure knobs to allow for complex policies.  My concern is
> about limiting the kernel's ability to reclaim one type of memory to
> fulfill the needs of another memory type (e.g. I think reclaiming clean file
> pages should be possible to make room for user slab needs).

I agree with your concern. It is definitely something we should not
do.

> I think
> memcg aware slab accounting does a good job of limiting a job's
> memory allocations.
> Would such slab accounting meet your needs?

Well, the slab alone, no. There are other objects - like tcp buffers
- that aren't covered by the slab. Others are usually shared among
many cgroups, and others don't really belong to anybody in
particular.igh

How do you think then, about turning this into 2 files inside memcg:

  - kernel_memory_hard_limit.
  - kernel_memory_soft_limit.

tcp memory would be the one defined in /proc, except if it is
greater than any of the limits. Instead of testing for memory
allocation against kmem.tcp_allocated_memory, we'd test it against
memcg.kmem_pinned_memory.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH v2 1/9] per-netns ipv4 sysctl_tcp_mem
From: Glauber Costa @ 2011-09-09  4:19 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-kernel, linux-mm, containers, netdev, xemul,
	David S. Miller, Eric W. Biederman
In-Reply-To: <20110909114720.f050c520.kamezawa.hiroyu@jp.fujitsu.com>

On 09/08/2011 11:47 PM, KAMEZAWA Hiroyuki wrote:
> On Wed,  7 Sep 2011 01:23:11 -0300
> Glauber Costa<glommer@parallels.com>  wrote:
>
>> This patch allows each namespace to independently set up
>> its levels for tcp memory pressure thresholds. This patch
>> alone does not buy much: we need to make this values
>> per group of process somehow. This is achieved in the
>> patches that follows in this patchset.
>>
>> Signed-off-by: Glauber Costa<glommer@parallels.com>
>> CC: David S. Miller<davem@davemloft.net>
>> CC: Hiroyouki Kamezawa<kamezawa.hiroyu@jp.fujitsu.com>
>> CC: Eric W. Biederman<ebiederm@xmission.com>
>
>
> Hmm, it may be better to post this patch as independent one.


Maybe we can search acks from eric about this one
specifically prior to merging, but I'd still like it to be part of
the whole. It will put us in a weird state if this is merged, and
the rest is not.

> I'm not familiar with this area...but try review ;)
Thank you!

>
>> ---
>>   include/net/netns/ipv4.h   |    1 +
>>   include/net/tcp.h          |    1 -
>>   net/ipv4/sysctl_net_ipv4.c |   51 +++++++++++++++++++++++++++++++++++++------
>>   net/ipv4/tcp.c             |   13 +++-------
>>   4 files changed, 49 insertions(+), 17 deletions(-)
>>
>> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
>> index d786b4f..bbd023a 100644
>> --- a/include/net/netns/ipv4.h
>> +++ b/include/net/netns/ipv4.h
>> @@ -55,6 +55,7 @@ struct netns_ipv4 {
>>   	int current_rt_cache_rebuild_count;
>>
>>   	unsigned int sysctl_ping_group_range[2];
>> +	long sysctl_tcp_mem[3];
>>
>>   	atomic_t rt_genid;
>>   	atomic_t dev_addr_genid;
>
> Hmm, in original placement, sysctl_tcp_mem[] was on __read_mostly
> area. Doesn't this placement cause many cache invalidations ?
>
Yes, you are right. I will move back to the old way of doing it.

>
>> diff --git a/include/net/tcp.h b/include/net/tcp.h
>> index 149a415..6bfdd9b 100644
>> --- a/include/net/tcp.h
>> +++ b/include/net/tcp.h
>> @@ -230,7 +230,6 @@ extern int sysctl_tcp_fack;
>>   extern int sysctl_tcp_reordering;
>>   extern int sysctl_tcp_ecn;
>>   extern int sysctl_tcp_dsack;
>> -extern long sysctl_tcp_mem[3];
>>   extern int sysctl_tcp_wmem[3];
>>   extern int sysctl_tcp_rmem[3];
>>   extern int sysctl_tcp_app_win;
>> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
>> index 69fd720..0d74b9d 100644
>> --- a/net/ipv4/sysctl_net_ipv4.c
>> +++ b/net/ipv4/sysctl_net_ipv4.c
>> @@ -14,6 +14,7 @@
>>   #include<linux/init.h>
>>   #include<linux/slab.h>
>>   #include<linux/nsproxy.h>
>> +#include<linux/swap.h>
>>   #include<net/snmp.h>
>>   #include<net/icmp.h>
>>   #include<net/ip.h>
>> @@ -174,6 +175,36 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
>>   	return ret;
>>   }
>>
>> +static int ipv4_tcp_mem(ctl_table *ctl, int write,
>> +			   void __user *buffer, size_t *lenp,
>> +			   loff_t *ppos)
>> +{
>> +	int ret;
>> +	unsigned long vec[3];
>> +	struct net *net = current->nsproxy->net_ns;
>> +	int i;
>> +
>> +	ctl_table tmp = {
>> +		.data =&vec,
>> +		.maxlen = sizeof(vec),
>> +		.mode = ctl->mode,
>> +	};
>> +
>> +	if (!write) {
>> +		ctl->data =&net->ipv4.sysctl_tcp_mem;
>> +		return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos);
>> +	}
>> +
>> +	ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
>> +	if (ret)
>> +		return ret;
>> +
>> +	for (i = 0; i<  3; i++)
>> +		net->ipv4.sysctl_tcp_mem[i] = vec[i];
>> +
>> +	return 0;
>> +}
>> +
>>   static struct ctl_table ipv4_table[] = {
>>   	{
>>   		.procname	= "tcp_timestamps",
>> @@ -433,13 +464,6 @@ static struct ctl_table ipv4_table[] = {
>>   		.proc_handler	= proc_dointvec
>>   	},
>>   	{
>> -		.procname	= "tcp_mem",
>> -		.data		=&sysctl_tcp_mem,
>> -		.maxlen		= sizeof(sysctl_tcp_mem),
>> -		.mode		= 0644,
>> -		.proc_handler	= proc_doulongvec_minmax
>> -	},
>> -	{
>>   		.procname	= "tcp_wmem",
>>   		.data		=&sysctl_tcp_wmem,
>>   		.maxlen		= sizeof(sysctl_tcp_wmem),
>> @@ -721,6 +745,12 @@ static struct ctl_table ipv4_net_table[] = {
>>   		.mode		= 0644,
>>   		.proc_handler	= ipv4_ping_group_range,
>>   	},
>> +	{
>> +		.procname	= "tcp_mem",
>> +		.maxlen		= sizeof(init_net.ipv4.sysctl_tcp_mem),
>> +		.mode		= 0644,
>> +		.proc_handler	= ipv4_tcp_mem,
>> +	},
>>   	{ }
>>   };
>>
>
>
>
>
>> @@ -734,6 +764,7 @@ EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
>>   static __net_init int ipv4_sysctl_init_net(struct net *net)
>>   {
>>   	struct ctl_table *table;
>> +	unsigned long limit;
>>
>>   	table = ipv4_net_table;
>>   	if (!net_eq(net,&init_net)) {
>> @@ -769,6 +800,12 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
>>
>>   	net->ipv4.sysctl_rt_cache_rebuild_count = 4;
>>
>> +	limit = nr_free_buffer_pages() / 8;
>> +	limit = max(limit, 128UL);
>> +	net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
>> +	net->ipv4.sysctl_tcp_mem[1] = limit;
>> +	net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
>> +
>>   	net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
>>   			net_ipv4_ctl_path, table);
>>   	if (net->ipv4.ipv4_hdr == NULL)
>> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
>> index 46febca..f06df24 100644
>> --- a/net/ipv4/tcp.c
>> +++ b/net/ipv4/tcp.c
>> @@ -266,6 +266,7 @@
>>   #include<linux/crypto.h>
>>   #include<linux/time.h>
>>   #include<linux/slab.h>
>> +#include<linux/nsproxy.h>
>>
>>   #include<net/icmp.h>
>>   #include<net/tcp.h>
>> @@ -282,11 +283,9 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
>>   struct percpu_counter tcp_orphan_count;
>>   EXPORT_SYMBOL_GPL(tcp_orphan_count);
>>
>> -long sysctl_tcp_mem[3] __read_mostly;
>>   int sysctl_tcp_wmem[3] __read_mostly;
>>   int sysctl_tcp_rmem[3] __read_mostly;
>>
>> -EXPORT_SYMBOL(sysctl_tcp_mem);
>>   EXPORT_SYMBOL(sysctl_tcp_rmem);
>>   EXPORT_SYMBOL(sysctl_tcp_wmem);
>>
>> @@ -3277,14 +3276,10 @@ void __init tcp_init(void)
>>   	sysctl_tcp_max_orphans = cnt / 2;
>>   	sysctl_max_syn_backlog = max(128, cnt / 256);
>>
>> -	limit = nr_free_buffer_pages() / 8;
>> -	limit = max(limit, 128UL);
>> -	sysctl_tcp_mem[0] = limit / 4 * 3;
>> -	sysctl_tcp_mem[1] = limit;
>> -	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
>> -
>>   	/* Set per-socket limits to no more than 1/128 the pressure threshold */
>> -	limit = ((unsigned long)sysctl_tcp_mem[1])<<  (PAGE_SHIFT - 7);
>> +	limit = (unsigned long)init_net.ipv4.sysctl_tcp_mem[1];
>> +	limit<<= (PAGE_SHIFT - 7);
>> +
>
> I'm not sure but...why defined as 'long'  ?
>

It is part of the "it was there
before" bundle.

It is defined as long not only for tcp, but for all of the
equivalents sysctl as well. So no reason to touch it, at least not
in this series =)

>
> BTW, when I grep,
>
> tcp_input.c:        atomic_long_read(&tcp_memory_allocated)<  sysctl_tcp_mem[0])
> tcp_input.c:    if (atomic_long_read(&tcp_memory_allocated)>= sysctl_tcp_mem[0])
>
> Don't you need to change this ?

It ended up being changed in another patch, and I missed the right
split.

Thank you, I will reorder it so it gets changed correctly.
>
>
> Thanks,
> -Kame
>
>
>
>
>
>
>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [net-next-2.6 PATCH 0/3 RFC] macvlan: MAC Address filtering support for passthru mode
From: Sridhar Samudrala @ 2011-09-09  4:25 UTC (permalink / raw)
  To: Roopa Prabhu
  Cc: Michael S. Tsirkin, netdev, dragos.tatulea, arnd, dwang2, benve,
	kaber, davem, eric.dumazet, mchan, kvm
In-Reply-To: <CA8ECF75.33BFF%roprabhu@cisco.com>

On 9/8/2011 8:00 PM, Roopa Prabhu wrote:
>
>
> On 9/8/11 12:33 PM, "Michael S. Tsirkin"<mst@redhat.com>  wrote:
>
>> On Thu, Sep 08, 2011 at 12:23:56PM -0700, Roopa Prabhu wrote:
>>>> I think the main usecase for passthru mode is to assign a SR-IOV VF to
>>>> a single guest.
>>>>
>>> Yes and for the passthru usecase this patch should be enough to enable
>>> filtering in hw (eventually like I indicated before I need to fix vlan
>>> filtering too).
>> So with filtering in hw, and in sriov VF case, VFs
>> actually share a filtering table. How will that
>> be partitioned?
> AFAIK, though it might maintain a single filter table space in hw, hw does
> know which filter belongs to which VF. And the OS driver does not need to do
> anything special. The VF driver exposes a VF netdev. And any uc/mc addresses
> registered with a VF netdev are registered with the hw by the driver. And hw
> will filter and send only pkts that the VF has expressed interest in.
Does your NIC & driver support adding multiple mac addresses to a VF?
I have tried a few other SR-IOV NICs sometime back and they didn't 
support this feature.

Currently, we don't have an interface to add multiple mac addresses to a 
netdev other than an
indirect way of creating a macvlan /if on top of it.

Thanks
Sridhar

>
> No special filter partitioning in hw is required.
>
> Thanks,
> Roopa
>



^ permalink raw reply

* Memory leak in ip_dst_cache
From: Kumar S @ 2011-09-09  5:04 UTC (permalink / raw)
  To: netdev@vger.kernel.org

Hi,
We are running Linux-2.6.24 kernel on MPC8360. Though forwarding is disabled, when connected to public network we see the system running out of memory and rebooting frequently. After doing some study we found out slowly "ip_dst-cache" is growing, and doesn't release entries. Interestingly route entries displayed with command "ip route ls cache" show fewer than the active objects listed under "cat /proc/slabinfo | grep ip_dst_cache". After doing some study, we could reproduce it in the lab by injecting packets withdifferent source IP addresses. Ideally ageing should happen, and old entries are supposed to be cleared out, but that doesn't happen. We do see one or two entries getting agedout but not all. 
After some time we did see that "rt_run_flush" kicks in and flushes out the ip_dst_cache. That's why the "ip route ls cache" fewer entries. But looks like __chache_free() doesn't get called, that's why these entries are not really released. This results in the leak.

Any idea what is going wrong here. Is it a known bug?

Regards
psk

^ permalink raw reply

* [PATCH net-next] af_unix: dont send SCM_CREDENTIALS by default
From: Eric Dumazet @ 2011-09-09  5:06 UTC (permalink / raw)
  To: Tim Chen
  Cc: Yan, Zheng, Yan, Zheng, netdev@vger.kernel.org,
	davem@davemloft.net, sfr@canb.auug.org.au, jirislaby@gmail.com,
	sedat.dilek@gmail.com, Shi, Alex, Valdis Kletnieks
In-Reply-To: <1315473888.2301.21.camel@schen9-mobl>

Le jeudi 08 septembre 2011 à 02:24 -0700, Tim Chen a écrit :

> Looking forward to the patch.  This should improve the scalability of
> af_unix.

Here it is, based on top on previous one [af_unix: Fix use-after-free
crashes]

Thanks

[PATCH net-next] af_unix: dont send SCM_CREDENTIALS by default

Since commit 7361c36c5224 (af_unix: Allow credentials to work across
user and pid namespaces) af_unix performance dropped a lot.

This is because we now take a reference on pid and cred in each write(),
and release them in read(), usually done from another process,
eventually from another cpu. This triggers false sharing.

# Events: 154K cycles
#
# Overhead  Command       Shared Object                               Symbol
# ........  .......  ..................  ...................................
#
    10.40%  hackbench  [kernel.kallsyms]   [k] put_pid
     8.60%  hackbench  [kernel.kallsyms]   [k] unix_stream_recvmsg
     7.87%  hackbench  [kernel.kallsyms]   [k] unix_stream_sendmsg
     6.11%  hackbench  [kernel.kallsyms]   [k] do_raw_spin_lock
     4.95%  hackbench  [kernel.kallsyms]   [k] unix_scm_to_skb
     4.87%  hackbench  [kernel.kallsyms]   [k] pid_nr_ns
     4.34%  hackbench  [kernel.kallsyms]   [k] cred_to_ucred
     2.39%  hackbench  [kernel.kallsyms]   [k] unix_destruct_scm
     2.24%  hackbench  [kernel.kallsyms]   [k] sub_preempt_count
     1.75%  hackbench  [kernel.kallsyms]   [k] fget_light
     1.51%  hackbench  [kernel.kallsyms]   [k] __mutex_lock_interruptible_slowpath
     1.42%  hackbench  [kernel.kallsyms]   [k] sock_alloc_send_pskb


This patch includes SCM_CREDENTIALS information in a af_unix message/skb
only if requested by the sender, [man 7 unix for details how to include
ancillary data using sendmsg() system call]

Note: This might break buggy applications that expected SCM_CREDENTIAL
from an unaware write() system call.

Performance boost in hackbench : more than 50% gain on a 16 thread
machine (2 quad-core cpus, 2 threads per core)

hackbench 20 thread 2000

4.224 sec instead of 9.102 sec


Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/net/scm.h        |   11 +----------
 net/core/scm.c           |   10 ++++++----
 net/netlink/af_netlink.c |    5 ++---
 net/unix/af_unix.c       |    9 +++++----
 4 files changed, 14 insertions(+), 21 deletions(-)

diff --git a/include/net/scm.h b/include/net/scm.h
index 2a5b42f..74c8fdc 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -45,14 +45,6 @@ static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_co
 { }
 #endif /* CONFIG_SECURITY_NETWORK */
 
-static __inline__ void scm_set_cred(struct scm_cookie *scm,
-				    struct pid *pid, const struct cred *cred)
-{
-	scm->pid  = get_pid(pid);
-	scm->cred = get_cred(cred);
-	cred_to_ucred(pid, cred, &scm->creds);
-}
-
 static __inline__ void scm_set_cred_noref(struct scm_cookie *scm,
 				    struct pid *pid, const struct cred *cred)
 {
@@ -81,8 +73,7 @@ static __inline__ void scm_destroy(struct scm_cookie *scm)
 static __inline__ int scm_send(struct socket *sock, struct msghdr *msg,
 			       struct scm_cookie *scm)
 {
-	scm_set_cred(scm, task_tgid(current), current_cred());
-	scm->fp = NULL;
+	memset(scm, 0, sizeof(*scm));
 	unix_get_peersec_dgram(sock, scm);
 	if (msg->msg_controllen <= 0)
 		return 0;
diff --git a/net/core/scm.c b/net/core/scm.c
index 811b53f..ff52ad0 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -173,7 +173,7 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
 			if (err)
 				goto error;
 
-			if (pid_vnr(p->pid) != p->creds.pid) {
+			if (!p->pid || pid_vnr(p->pid) != p->creds.pid) {
 				struct pid *pid;
 				err = -ESRCH;
 				pid = find_get_pid(p->creds.pid);
@@ -183,8 +183,9 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
 				p->pid = pid;
 			}
 
-			if ((p->cred->euid != p->creds.uid) ||
-				(p->cred->egid != p->creds.gid)) {
+			if (!p->cred ||
+			    (p->cred->euid != p->creds.uid) ||
+			    (p->cred->egid != p->creds.gid)) {
 				struct cred *cred;
 				err = -ENOMEM;
 				cred = prepare_creds();
@@ -193,7 +194,8 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
 
 				cred->uid = cred->euid = p->creds.uid;
 				cred->gid = cred->egid = p->creds.gid;
-				put_cred(p->cred);
+				if (p->cred)
+					put_cred(p->cred);
 				p->cred = cred;
 			}
 			break;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 4330db9..1201b6d 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1324,10 +1324,9 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	if (msg->msg_flags&MSG_OOB)
 		return -EOPNOTSUPP;
 
-	if (NULL == siocb->scm) {
+	if (NULL == siocb->scm)
 		siocb->scm = &scm;
-		memset(&scm, 0, sizeof(scm));
-	}
+
 	err = scm_send(sock, msg, siocb->scm);
 	if (err < 0)
 		return err;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index c8a08ba..4c77385 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1383,12 +1383,13 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb,
 {
 	int err = 0;
 
+	UNIXCB(skb).pid  = scm->pid;
+	UNIXCB(skb).cred = scm->cred;
 	if (!steal_refs) {
-		UNIXCB(skb).pid  = get_pid(scm->pid);
-		UNIXCB(skb).cred = get_cred(scm->cred);
+		get_pid(scm->pid);
+		if (scm->cred)
+			get_cred(scm->cred);
 	} else {
-		UNIXCB(skb).pid  = scm->pid;
-		UNIXCB(skb).cred = scm->cred;
 		scm->pid = NULL;
 		scm->cred = NULL;
 	}

^ permalink raw reply related

* Re: Memory leak in ip_dst_cache
From: Eric Dumazet @ 2011-09-09  5:30 UTC (permalink / raw)
  To: Kumar S; +Cc: netdev@vger.kernel.org
In-Reply-To: <1315544674.20226.YahooMailNeo@web113902.mail.gq1.yahoo.com>

Le jeudi 08 septembre 2011 à 22:04 -0700, Kumar S a écrit :
> Hi,
> We are running Linux-2.6.24 kernel on MPC8360. Though forwarding is
> disabled, when connected to public network we see the system running
> out of memory and rebooting frequently. After doing some study we
> found out slowly "ip_dst-cache" is growing, and doesn't release
> entries. Interestingly route entries displayed with command "ip route
> ls cache" show fewer than the active objects listed under
> "cat /proc/slabinfo | grep ip_dst_cache". After doing some study, we
> could reproduce it in the lab by injecting packets withdifferent
> source IP addresses. Ideally ageing should happen, and old entries are
> supposed to be cleared out, but that doesn't happen. We do see one or
> two entries getting agedout but not all. 
> After some time we did see that "rt_run_flush" kicks in and flushes
> out the ip_dst_cache. That's why the "ip route ls cache" fewer
> entries. But looks like __chache_free() doesn't get called, that's why
> these entries are not really released. This results in the leak.
>  
> Any idea what is going wrong here. Is it a known bug?

Please send :

grep . /proc/sys/net/ipv4/route/*
rtstat -c10 -i1

This a very well known problem. You need to upgrade your kernel in order
to avoid very complex tuning of your IP route cache.

Recent ones have smooth garbage collection.

In the meantime, you can tune a bit :

echo 1 >/proc/sys/net/ipv4/route/gc_interval
echo 4 >/proc/sys/net/ipv4/route/gc_elasticity

^ permalink raw reply

* Re: [net-next-2.6 PATCH 0/3 RFC] macvlan: MAC Address filtering support for passthru mode
From: Michael S. Tsirkin @ 2011-09-09  5:55 UTC (permalink / raw)
  To: Roopa Prabhu
  Cc: netdev, dragos.tatulea, arnd, dwang2, benve, kaber, sri, davem,
	eric.dumazet, mchan, kvm
In-Reply-To: <CA8ECDA7.33BFA%roprabhu@cisco.com>

On Thu, Sep 08, 2011 at 07:53:11PM -0700, Roopa Prabhu wrote:
> >> Phase 1: Goal: Enable hardware filtering for all macvlan modes
> >>     - In macvlan passthru mode the single guest virtio-nic connected will
> >>       receive traffic that he requested for
> >>     - In macvlan non-passthru mode all guest virtio-nics sharing the
> >>       physical nic will see all other guest traffic
> >>       but the filtering at guest virtio-nic
> > 
> > I don't think guests currently filter anything.
> > 
> I was referring to Qemu-kvm virtio-net in
> virtion_net_receive->receive_filter. I think It only passes pkts that the
> guest OS is interested. It uses the filter table that I am passing to
> macvtap in this patch.

This happens after userspace thread gets woken up and data
is copied there. So relying on filtering at that level is
going to be very inefficient on a system with
multiple active guests. Further, and for that reason, vhost-net
doesn't do filtering at all, relying on the backends
to pass it correct packets.

-- 
MST

^ permalink raw reply

* Re: [PATCH net-next v3] af_unix: Fix use-after-free crashes
From: Eric Dumazet @ 2011-09-09  6:51 UTC (permalink / raw)
  To: Tim Chen
  Cc: sedat.dilek, Yan, Zheng, Yan, Zheng, netdev@vger.kernel.org,
	davem@davemloft.net, sfr@canb.auug.org.au, jirislaby@gmail.com,
	Shi, Alex, Valdis Kletnieks
In-Reply-To: <1315471065.2301.1.camel@schen9-mobl>

Le jeudi 08 septembre 2011 à 01:37 -0700, Tim Chen a écrit :
> On Thu, 2011-09-08 at 15:21 +0200, Eric Dumazet wrote:
> > Le jeudi 08 septembre 2011 à 11:59 +0200, Sedat Dilek a écrit :
> > 
> > > I have tested this fixup patch on i386.
> > > Can we have a separate patch with corrected descriptive text?
> > > 
> > > Thanks to all involved people.
> > 
> > Here it is :
> > 
> > [PATCH net-next v3] af_unix: Fix use-after-free crashes
> > 
> > Commit 0856a30409 (Scm: Remove unnecessary pid & credential references
> > in Unix socket's send and receive path) introduced an use-after-free
> > bug.
> > 
> > We are allowed to steal the references to pid/cred only in the last skb
> > sent from unix_stream_sendmsg(), because first skbs might be consumed by
> > the receiver before we finish our sendmsg() call.
> > 
> > Remove scm_release() helper, since its cleaner to clear pid/cred fields
> > in unix_scm_to_skb() when we steal them.
> > 
> > Based on prior patches from Yan Zheng and Tim Chen
> > 
> > Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> > Reported-by: Jiri Slaby <jirislaby@gmail.com>
> > Tested-by: Sedat Dilek <sedat.dilek@googlemail.com>
> > Tested-by: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
> > ---
> 
> Thanks.
> 
> Acked-by: Tim Chen <tim.c.chen@linux.intel.com>
> 

Now we have to fix a bug in unix_stream_recvmsg() as well.

consume_skb() call actually releases pid/cred references, and we can use
them after their eventual freeing.

Keep also in mind that receiver can provides a too short user buffer,
and skb can be put back to head of sk_receive_queue

Tim, your 0856a304091b33 commit introduced a lot of bugs, I was right
asking a revert.

If we revert your patch, my litle patch (af_unix: dont send
SCM_CREDENTIALS by default) is enough to solve performance problems.

^ permalink raw reply

* Re: [BUG?] tcp: potential bug in tcp_is_sackblock_valid()
From: Ilpo Järvinen @ 2011-09-09  8:03 UTC (permalink / raw)
  To: Yan, Zheng
  Cc: Herbert Xu, netdev@vger.kernel.org, davem@davemloft.net,
	eric.dumazet@gmail.com, sfr@canb.auug.org.au
In-Reply-To: <4E6975B2.8000603@intel.com>

On Fri, 9 Sep 2011, Yan, Zheng wrote:

> On 09/09/2011 09:54 AM, Herbert Xu wrote:
> > On Fri, Sep 09, 2011 at 09:45:52AM +0800, Yan, Zheng wrote:
> >> I found a check in tcp_is_sackblock_valid() is suspicious. It against
> >> its comment and RFC. I think the correct check should be:
> >> ---
> >> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> >> index 385c470..a5d01b1 100644
> >> --- a/net/ipv4/tcp_input.c
> >> +++ b/net/ipv4/tcp_input.c
> >> @@ -1124,7 +1124,7 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack,
> >>                 return 0;
> >>  
> >>         /* ...Then it's D-SACK, and must reside below snd_una completely */
> >> -       if (!after(end_seq, tp->snd_una))
> >> +       if (after(end_seq, tp->snd_una))
> >>                 return 0;
> >>  
> >>         if (!before(start_seq, tp->undo_marker))
> >> ---
> >>
> >> I also checked /proc/net/netstat of my laptop, found TCPDSACKIgnoredOld
> >> field is not zero. Maybe it's caused by the bug.

Hmm, some Ignored I was expecting to see even when nothing was wrong but I 
think it might have been the other counter... too long time already passed 
to remember all the details I've been thinking.

> > Yes this looks like a typo.  Please resend your patch with a
> > description and signed-off-by line.
> 
> I think the bug has a big influence on tcp DSACKs. It's better to
> leave it to people who fully understand tcp code.

Indeed, it looks like a double negative error.

-- 
 i.

^ permalink raw reply

* [PATCH net-next] af_unix: fix use after free in unix_stream_recvmsg()
From: Eric Dumazet @ 2011-09-09  7:58 UTC (permalink / raw)
  To: Tim Chen
  Cc: sedat.dilek, Yan, Zheng, Yan, Zheng, netdev@vger.kernel.org,
	davem@davemloft.net, sfr@canb.auug.org.au, jirislaby@gmail.com,
	Shi, Alex, Valdis Kletnieks
In-Reply-To: <1315551100.5410.30.camel@edumazet-laptop>

Le vendredi 09 septembre 2011 à 08:51 +0200, Eric Dumazet a écrit :

> Now we have to fix a bug in unix_stream_recvmsg() as well.
> 
> consume_skb() call actually releases pid/cred references, and we can use
> them after their eventual freeing.
> 
> Keep also in mind that receiver can provides a too short user buffer,
> and skb can be put back to head of sk_receive_queue
> 

Here is the patch to address this point.

Apply it after (af_unix: Fix use-after-free crashes)

I can make a combo patch once everybody agrees.

[PATCH net-next] af_unix: fix use after free in unix_stream_recvmsg()

Commit 0856a30409 (Scm: Remove unnecessary pid & credential references
in Unix socket's send and receive path) introduced an use-after-free
bug in unix_stream_recvmsg().

We should call consume_skb(skb) only after our possible use of pid/cred.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/unix/af_unix.c |   13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index c8a08ba..1bd4ecf 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1873,6 +1873,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 	int target;
 	int err = 0;
 	long timeo;
+	struct sk_buff *skb;
 
 	err = -EINVAL;
 	if (sk->sk_state != TCP_ESTABLISHED)
@@ -1904,7 +1905,6 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 
 	do {
 		int chunk;
-		struct sk_buff *skb;
 
 		unix_state_lock(sk);
 		skb = skb_dequeue(&sk->sk_receive_queue);
@@ -1949,6 +1949,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 			if ((UNIXCB(skb).pid  != siocb->scm->pid) ||
 			    (UNIXCB(skb).cred != siocb->scm->cred)) {
 				skb_queue_head(&sk->sk_receive_queue, skb);
+				skb = NULL;
 				break;
 			}
 		} else {
@@ -1967,6 +1968,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 		chunk = min_t(unsigned int, skb->len, size);
 		if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
 			skb_queue_head(&sk->sk_receive_queue, skb);
+			skb = NULL;
 			if (copied == 0)
 				copied = -EFAULT;
 			break;
@@ -1984,13 +1986,14 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 			/* put the skb back if we didn't use it up.. */
 			if (skb->len) {
 				skb_queue_head(&sk->sk_receive_queue, skb);
+				skb = NULL;
 				break;
 			}
 
-			consume_skb(skb);
-
-			if (siocb->scm->fp)
+			if (UNIXCB(skb).pid || siocb->scm->fp)
 				break;
+			consume_skb(skb);
+			skb = NULL;
 		} else {
 			/* It is questionable, see note in unix_dgram_recvmsg.
 			 */
@@ -1999,12 +2002,14 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 
 			/* put message back and return */
 			skb_queue_head(&sk->sk_receive_queue, skb);
+			skb = NULL;
 			break;
 		}
 	} while (size);
 
 	mutex_unlock(&u->readlock);
 	scm_recv(sock, msg, siocb->scm, flags);
+	consume_skb(skb);
 out:
 	return copied ? : err;
 }

^ permalink raw reply related

* Re: [PATCH v3] net/smsc911x: add device tree probe support
From: Dave Martin @ 2011-09-09  8:50 UTC (permalink / raw)
  To: Grant Likely
  Cc: Shawn Guo, netdev, patches, devicetree-discuss, Steve Glendinning,
	David S. Miller, linux-arm-kernel
In-Reply-To: <20110908182920.GL2967@ponder.secretlab.ca>

On Thu, Sep 08, 2011 at 11:29:20AM -0700, Grant Likely wrote:
> On Thu, Sep 08, 2011 at 03:59:46PM +0100, Dave Martin wrote:
> > Hi Shawn,
> > 
> > On Sun, Jul 31, 2011 at 02:26:00AM +0800, Shawn Guo wrote:
> > > It adds device tree probe support for smsc911x driver.
> > > 
> > > Signed-off-by: Shawn Guo <shawn.guo@linaro.org>
> > > Cc: Grant Likely <grant.likely@secretlab.ca>
> > > Cc: Steve Glendinning <steve.glendinning@smsc.com>
> > > Cc: David S. Miller <davem@davemloft.net>
> > > Reviewed-by: Grant Likely <grant.likely@secretlab.ca>
> > > ---
> > > Changes since v2:
> > >  * Fix a typo in smsc911x.txt
> > > 
> > > Changes since v1:
> > >  * Instead of getting irq line from gpio number, it use irq domain
> > >    to keep platform_get_resource(IORESOURCE_IRQ) works for dt too.
> > >  * Use 'lan9115' the first model that smsc911x supports in the match
> > >    table
> > >  * Use reg-shift and reg-io-width which already used in of_serial for
> > >    shift and access size binding
> > 
> > When using this patch with vexpress, I found that 16-bit register access
> > mode doesn't seem to be getting set correctly.
> > 
> > Can you take a look at this additional patch and let me know if it looks
> > correct?
> > 
> > Cheers
> > ---Dave
> > 
> > From: Dave Martin <dave.martin@linaro.org>
> > Date: Wed, 7 Sep 2011 17:26:31 +0100
> > Subject: [PATCH] net/smsc911x: Correctly configure 16-bit register access from DT
> > 
> > The SMSC911X_USE_16BIT needs to be set when using 16-bit register
> > access.  However, currently no flag is set if the DT doesn't specify
> > 32-bit access.
> > 
> > This patch should set the SMSC911X_USE_16BIT flag in a manner consistent
> > with the documented DT bindings.
> > 
> > Signed-off-by: Dave Martin <dave.martin@linaro.org>
> > ---
> >  drivers/net/smsc911x.c |    2 ++
> >  1 files changed, 2 insertions(+), 0 deletions(-)
> > 
> > diff --git a/drivers/net/smsc911x.c b/drivers/net/smsc911x.c
> > index 75c08a5..1a35c25 100644
> > --- a/drivers/net/smsc911x.c
> > +++ b/drivers/net/smsc911x.c
> > @@ -2121,6 +2121,8 @@ static int __devinit smsc911x_probe_config_dt(
> >  	of_property_read_u32(np, "reg-io-width", &width);
> >  	if (width == 4)
> >  		config->flags |= SMSC911X_USE_32BIT;
> > +	else
> > +		config->flags |= SMSC911X_USE_16BIT;
> 
> Would it be better to do "else if (width == 2)"?  (completely
> uninformed comment.  I've not looked at what the non-DT probe path
> does on this driver.)

I wouldn't have a problem with that.  But currently the binding
documentation says that any value other than 4, or a missing property,
implies 16-bit register access.

So the binding documentation would need to change too in that case.

Personally I think this would be better, but it's just an opinion.

Cheers
---Dave

^ permalink raw reply

* [PATCH 00/14] Swap-over-NBD without deadlocking v6
From: Mel Gorman @ 2011-09-09 10:57 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Neil Brown,
	Peter Zijlstra, Mel Gorman

Changelog since V5
  o Rebase to 3.1-rc5

Changelog since V4
  o Update comment clarifying what protocols can be used		(Michal)
  o Rebase to 3.0-rc3

Changelog since V3
  o Propogate pfmemalloc from packet fragment pages to skb		(Neil)
  o Rebase to 3.0-rc2

Changelog since V2
  o Document that __GFP_NOMEMALLOC overrides __GFP_MEMALLOC		(Neil)
  o Use wait_event_interruptible					(Neil)
  o Use !! when casting to bool to avoid any possibilitity of type
    truncation								(Neil)
  o Nicer logic when using skb_pfmemalloc_protocol			(Neil)

Changelog since V1
  o Rebase on top of mmotm
  o Use atomic_t for memalloc_socks		(David Miller)
  o Remove use of sk_memalloc_socks in vmscan	(Neil Brown)
  o Check throttle within prepare_to_wait	(Neil Brown)
  o Add statistics on throttling instead of printk

When a user or administrator requires swap for their application, they
create a swap partition and file, format it with mkswap and activate
it with swapon. Swap over the network is considered as an option in
diskless systems. The two likely scenarios are when blade servers
are used as part of a cluster where the form factor or maintenance
costs do not allow the use of disks and thin clients.

The Linux Terminal Server Project recommends the use of the
Network Block Device (NBD) for swap according to the manual at
https://sourceforge.net/projects/ltsp/files/Docs-Admin-Guide/LTSPManual.pdf/download
.  There is also documentation and tutorials
on how to setup swap over NBD at places like
https://help.ubuntu.com/community/UbuntuLTSP/EnableNBDSWAP .  The
nbd-client also documents the use of NBD as swap. Despite this, the
fact is that a machine using NBD for swap can deadlock within minutes
if swap is used intensively. This patch series addresses the problem.

The core issue is that network block devices do not use mempools
like normal block devices do. As the host cannot control where they
receive packets from, they cannot reliably work out in advance how
much memory they might need.

Some years ago, Peter Ziljstra developed a series of patches that
supported swap over an NFS that some distributions are carrying in
their kernels. This patch series borrows very heavily from Peter's
work to support swapping over NBD as a pre-requisite to supporting
swap-over-NFS. The bulk of the complexity is concerned with preserving
memory that is allocated from the PFMEMALLOC reserves for use by the
network layer which is needed for both NBD and NFS.

Patch 1 serialises access to min_free_kbytes. It's not strictly needed
	by this series but as the series cares about watermarks in
	general, it's a harmless fix. It could be merged independently.

Patch 2 adds knowledge of the PFMEMALLOC reserves to SLAB and SLUB to
	preserve access to pages allocated under low memory situations
	to callers that are freeying memory.

Patch 3 introduces __GFP_MEMALLOC to allow access to the PFMEMALLOC
	reserves without setting PFMEMALLOC.

Patch 4 opens the possibility for softirqs to use PFMEMALLOC reserves
	for later use by network packet processing.

Patch 5 ignores memory policies when ALLOC_NO_WATERMARKS is set.

Patches 6-10 allows network processing to use PFMEMALLOC reserves when
	the socket has been marked as being used by the VM to clean
	pages. If packets are received and stored in pages that were
	allocated under low-memory situations and are unrelated to
	the VM, the packets are dropped.

Patch 11 is a micro-optimisation to avoid a function call in the
	common case.

Patch 12 tags NBD sockets as being SOCK_MEMALLOC so they can use
	PFMEMALLOC if necessary.

Patch 13 notes that it is still possible for the PFMEMALLOC reserve
	to be depleted. To prevent this, direct reclaimers get
	throttled on a waitqueue if 50% of the PFMEMALLOC reserves are
	depleted.  It is expected that kswapd and the direct reclaimers
	already running will clean enough pages for the low watermark
	to be reached and the throttled processes are woken up.

Patch 14 adds a statistic to track how often processes get throttled

Some basic performance testing was run using kernel builds, netperf
on loopback for UDP and TCP, hackbench (pipes and sockets), iozone
and sysbench. Each of them were expected to use the sl*b allocators
reasonably heavily but there did not appear to be significant
performance variances. Here is the results from netperf using
slab as an example

NETPERF UDP
      64   237.47 ( 0.00%)    237.34 (-0.05%) 
     128   472.69 ( 0.00%)    465.96 (-1.44%) 
     256   926.82 ( 0.00%)    948.40 ( 2.28%) 
    1024  3260.08 ( 0.00%)   3266.50 ( 0.20%) 
    2048  5535.11 ( 0.00%)   5453.55 (-1.50%) 
    3312  7496.60 ( 0.00%)*  7574.44 ( 1.03%) 
             1.12%             1.00%        
    4096  8266.35 ( 0.00%)*  8240.06 (-0.32%)*
             1.18%             1.49%        
    8192 11026.01 ( 0.00%)  11010.44 (-0.14%) 
   16384 14653.98 ( 0.00%)  14666.97 ( 0.09%) 
MMTests Statistics: duration
User/Sys Time Running Test (seconds)       2156.64   1873.27
Total Elapsed Time (seconds)               2570.09   2234.10

NETPERF TCP
                   netperf-tcp       tcp-swapnbd
                  vanilla-slab         v4r3-slab
      64  1250.76 ( 0.00%)   1256.52 ( 0.46%) 
     128  2290.70 ( 0.00%)   2336.43 ( 1.96%) 
     256  3668.42 ( 0.00%)   3751.17 ( 2.21%) 
    1024  7214.33 ( 0.00%)   7237.23 ( 0.32%) 
    2048  8230.01 ( 0.00%)   8280.02 ( 0.60%) 
    3312  8634.95 ( 0.00%)   8758.62 ( 1.41%) 
    4096  8851.18 ( 0.00%)   9045.88 ( 2.15%) 
    8192 10067.59 ( 0.00%)  10263.30 ( 1.91%) 
   16384 11523.26 ( 0.00%)  11654.78 ( 1.13%) 
MMTests Statistics: duration
User/Sys Time Running Test (seconds)       1450.23    1389.8
Total Elapsed Time (seconds)               1450.41   1390.35

Here is the equivalent test for SLUB

                   netperf-udp       udp-swapnbd
                  vanilla-slub         v4r3-slub
      64   235.33 ( 0.00%)    237.80 ( 1.04%) 
     128   465.92 ( 0.00%)    469.98 ( 0.86%) 
     256   907.16 ( 0.00%)    907.58 ( 0.05%) 
    1024  3240.25 ( 0.00%)   3255.56 ( 0.47%) 
    2048  5564.87 ( 0.00%)   5446.46 (-2.17%) 
    3312  7427.65 ( 0.00%)*  7650.00 ( 2.91%) 
             1.33%             1.00%        
    4096  8004.51 ( 0.00%)*  8132.79 ( 1.58%)*
             1.05%             1.21%        
    8192 11079.60 ( 0.00%)  10927.09 (-1.40%) 
   16384 14737.38 ( 0.00%)  15019.50 ( 1.88%) 
MMTests Statistics: duration
User/Sys Time Running Test (seconds)       2056.21   2160.38
Total Elapsed Time (seconds)               2426.09   2498.16

NETPERF TCP
                   netperf-tcp       tcp-swapnbd
                  vanilla-slub         v4r3-slub
      64  1251.64 ( 0.00%)   1262.89 ( 0.89%) 
     128  2289.88 ( 0.00%)   2332.94 ( 1.85%) 
     256  3654.34 ( 0.00%)   3736.48 ( 2.20%) 
    1024  7192.47 ( 0.00%)   7286.96 ( 1.30%) 
    2048  8243.55 ( 0.00%)   8291.50 ( 0.58%) 
    3312  8664.16 ( 0.00%)   8799.88 ( 1.54%) 
    4096  8869.13 ( 0.00%)   9018.12 ( 1.65%) 
    8192 10009.53 ( 0.00%)  10214.26 ( 2.00%) 
   16384 11470.78 ( 0.00%)  11685.20 ( 1.83%) 
MMTests Statistics: duration
User/Sys Time Running Test (seconds)       1368.28   1511.81
Total Elapsed Time (seconds)               1370.33   1510.42

Time to completion varied a lot but this can happen with netperf as
it tries to find results within a sufficiently high confidence. There
were some small gains and losses but they are close to the variances
seen between kernel releases.

For testing swap-over-NBD, a machine was booted with 2G of RAM with a
swapfile backed by NBD. 8*NUM_CPU processes were started that create
anonymous memory mappings and read them linearly in a loop. The total
size of the mappings were 4*PHYSICAL_MEMORY to use swap heavily under
memory pressure. Without the patches, the machine locks up within
minutes and runs to completion with them applied.

 drivers/block/nbd.c             |    7 +-
 include/linux/gfp.h             |   13 ++-
 include/linux/mm_types.h        |    9 ++
 include/linux/mmzone.h          |    1 +
 include/linux/sched.h           |    7 +
 include/linux/skbuff.h          |   21 +++-
 include/linux/slub_def.h        |    1 +
 include/linux/vm_event_item.h   |    1 +
 include/net/sock.h              |   19 +++
 include/trace/events/gfpflags.h |    1 +
 kernel/softirq.c                |    3 +
 mm/page_alloc.c                 |   57 +++++++--
 mm/slab.c                       |  240 +++++++++++++++++++++++++++++++++------
 mm/slub.c                       |   35 +++++-
 mm/vmscan.c                     |   55 +++++++++
 mm/vmstat.c                     |    1 +
 net/core/dev.c                  |   48 +++++++-
 net/core/filter.c               |    8 ++
 net/core/skbuff.c               |   95 +++++++++++++---
 net/core/sock.c                 |   42 +++++++
 net/ipv4/tcp.c                  |    3 +-
 net/ipv4/tcp_output.c           |   13 +-
 net/ipv6/tcp_ipv6.c             |   12 ++-
 23 files changed, 606 insertions(+), 86 deletions(-)

-- 
1.7.3.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH 01/14] mm: Serialize access to min_free_kbytes
From: Mel Gorman @ 2011-09-09 10:57 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Neil Brown,
	Peter Zijlstra, Mel Gorman
In-Reply-To: <1315565845-16857-1-git-send-email-mgorman@suse.de>

There is a race between the min_free_kbytes sysctl, memory hotplug
and transparent hugepage support enablement.  Memory hotplug uses a
zonelists_mutex to avoid a race when building zonelists. Reuse it to
serialise watermark updates.

[a.p.zijlstra@chello.nl: Older patch fixed the race with spinlock]
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 mm/page_alloc.c |   23 +++++++++++++++--------
 1 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6e8ecb6..9d8bd0e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5113,14 +5113,7 @@ static void setup_per_zone_lowmem_reserve(void)
 	calculate_totalreserve_pages();
 }
 
-/**
- * setup_per_zone_wmarks - called when min_free_kbytes changes
- * or when memory is hot-{added|removed}
- *
- * Ensures that the watermark[min,low,high] values for each zone are set
- * correctly with respect to min_free_kbytes.
- */
-void setup_per_zone_wmarks(void)
+static void __setup_per_zone_wmarks(void)
 {
 	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
 	unsigned long lowmem_pages = 0;
@@ -5175,6 +5168,20 @@ void setup_per_zone_wmarks(void)
 	calculate_totalreserve_pages();
 }
 
+/**
+ * setup_per_zone_wmarks - called when min_free_kbytes changes
+ * or when memory is hot-{added|removed}
+ *
+ * Ensures that the watermark[min,low,high] values for each zone are set
+ * correctly with respect to min_free_kbytes.
+ */
+void setup_per_zone_wmarks(void)
+{
+	mutex_lock(&zonelists_mutex);
+	__setup_per_zone_wmarks();
+	mutex_unlock(&zonelists_mutex);
+}
+
 /*
  * The inactive anon list should be small enough that the VM never has to
  * do too much work, but large enough that each inactive page has a chance
-- 
1.7.3.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH 02/14] mm: sl[au]b: Add knowledge of PFMEMALLOC reserve pages
From: Mel Gorman @ 2011-09-09 10:57 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Neil Brown,
	Peter Zijlstra, Mel Gorman
In-Reply-To: <1315565845-16857-1-git-send-email-mgorman@suse.de>

Allocations of pages below the min watermark run a risk of the
machine hanging due to a lack of memory.  To prevent this, only
callers who have PF_MEMALLOC or TIF_MEMDIE set and are not processing
an interrupt are allowed to allocate with ALLOC_NO_WATERMARKS. Once
they are allocated to a slab though, nothing prevents other callers
consuming free objects within those slabs. This patch limits access
to slab pages that were alloced from the PFMEMALLOC reserves.

Pages allocated from the reserve are returned with page->pfmemalloc
set and it is up to the caller to determine how the page should be
protected.  SLAB restricts access to any page with page->pfmemalloc set
to callers which are known to able to access the PFMEMALLOC reserve. If
one is not available, an attempt is made to allocate a new page rather
than use a reserve. SLUB is a bit more relaxed in that it only records
if the current per-CPU page was allocated from PFMEMALLOC reserve and
uses another partial slab if the caller does not have the necessary
GFP or process flags. This was found to be sufficient in tests to
avoid hangs due to SLUB generally maintaining smaller lists than SLAB.

In low-memory conditions it does mean that !PFMEMALLOC allocators
can fail a slab allocation even though free objects are available
because they are being preserved for callers that are freeing pages.

[a.p.zijlstra@chello.nl: Original implementation]
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/linux/mm_types.h |    9 ++
 include/linux/slub_def.h |    1 +
 mm/internal.h            |    3 +
 mm/page_alloc.c          |   27 +++++-
 mm/slab.c                |  216 +++++++++++++++++++++++++++++++++++++++-------
 mm/slub.c                |   35 +++++++-
 6 files changed, 248 insertions(+), 43 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 774b895..3716e9f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -53,6 +53,15 @@ struct page {
 		union {
 			pgoff_t index;		/* Our offset within mapping. */
 			void *freelist;		/* slub first free object */
+			bool pfmemalloc;	/* If set by the page allocator,
+						 * ALLOC_PFMEMALLOC was set
+						 * and the low watermark was not
+						 * met implying that the system
+						 * is under some pressure. The
+						 * caller should try ensure
+						 * this page is only used to
+						 * free other pages.
+						 */
 		};
 
 		union {
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index f58d641..d41a9a4 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -43,6 +43,7 @@ struct kmem_cache_cpu {
 	unsigned long tid;	/* Globally unique transaction id */
 	struct page *page;	/* The slab from which we are allocating */
 	int node;		/* The node of the page (or -1 for debug) */
+	bool pfmemalloc;	/* Slab page had pfmemalloc set */
 #ifdef CONFIG_SLUB_STATS
 	unsigned stat[NR_SLUB_STAT_ITEMS];
 #endif
diff --git a/mm/internal.h b/mm/internal.h
index d071d380..a520f3b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -193,6 +193,9 @@ static inline struct page *mem_map_next(struct page *iter,
 #define __paginginit __init
 #endif
 
+/* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
+
 /* Memory initialisation debug and verification */
 enum mminit_level {
 	MMINIT_WARNING,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9d8bd0e..561cb61 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -656,6 +656,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
 	trace_mm_page_free_direct(page, order);
 	kmemcheck_free_shadow(page, order);
 
+	page->pfmemalloc = false;
 	if (PageAnon(page))
 		page->mapping = NULL;
 	for (i = 0; i < (1 << order); i++)
@@ -1174,6 +1175,7 @@ void free_hot_cold_page(struct page *page, int cold)
 
 	migratetype = get_pageblock_migratetype(page);
 	set_page_private(page, migratetype);
+	page->pfmemalloc = false;
 	local_irq_save(flags);
 	if (unlikely(wasMlocked))
 		free_page_mlock(page);
@@ -1367,6 +1369,7 @@ failed:
 #define ALLOC_HARDER		0x10 /* try to alloc harder */
 #define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
 #define ALLOC_CPUSET		0x40 /* check for correct cpuset */
+#define ALLOC_PFMEMALLOC	0x80 /* Caller has PF_MEMALLOC set */
 
 #ifdef CONFIG_FAIL_PAGE_ALLOC
 
@@ -2055,16 +2058,22 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 	} else if (unlikely(rt_task(current)) && !in_interrupt())
 		alloc_flags |= ALLOC_HARDER;
 
-	if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
-		if (!in_interrupt() &&
-		    ((current->flags & PF_MEMALLOC) ||
-		     unlikely(test_thread_flag(TIF_MEMDIE))))
+	if ((current->flags & PF_MEMALLOC) ||
+			unlikely(test_thread_flag(TIF_MEMDIE))) {
+		alloc_flags |= ALLOC_PFMEMALLOC;
+
+		if (likely(!(gfp_mask & __GFP_NOMEMALLOC)) && !in_interrupt())
 			alloc_flags |= ALLOC_NO_WATERMARKS;
 	}
 
 	return alloc_flags;
 }
 
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+{
+	return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_PFMEMALLOC);
+}
+
 static inline struct page *
 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
@@ -2237,8 +2246,16 @@ nopage:
 got_pg:
 	if (kmemcheck_enabled)
 		kmemcheck_pagealloc_alloc(page, order, gfp_mask);
-	return page;
 
+	/*
+	 * page->pfmemalloc is set when the caller had PFMEMALLOC set or is
+	 * been OOM killed. The expectation is that the caller is taking
+	 * steps that will free more memory. The caller should avoid the
+	 * page being used for !PFMEMALLOC purposes.
+	 */
+	page->pfmemalloc = !!(alloc_flags & ALLOC_PFMEMALLOC);
+
+	return page;
 }
 
 /*
diff --git a/mm/slab.c b/mm/slab.c
index 6d90a09..1dd03e0 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -121,6 +121,8 @@
 #include	<asm/tlbflush.h>
 #include	<asm/page.h>
 
+#include	"internal.h"
+
 /*
  * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
  *		  0 for faster, smaller code (especially in the critical paths).
@@ -227,6 +229,7 @@ struct slab {
 			unsigned int inuse;	/* num of objs active in slab */
 			kmem_bufctl_t free;
 			unsigned short nodeid;
+			bool pfmemalloc;	/* Slab had pfmemalloc set */
 		};
 		struct slab_rcu __slab_cover_slab_rcu;
 	};
@@ -248,15 +251,37 @@ struct array_cache {
 	unsigned int avail;
 	unsigned int limit;
 	unsigned int batchcount;
-	unsigned int touched;
+	bool touched;
+	bool pfmemalloc;
 	spinlock_t lock;
 	void *entry[];	/*
 			 * Must have this definition in here for the proper
 			 * alignment of array_cache. Also simplifies accessing
 			 * the entries.
+			 *
+			 * Entries should not be directly dereferenced as
+			 * entries belonging to slabs marked pfmemalloc will
+			 * have the lower bits set SLAB_OBJ_PFMEMALLOC
 			 */
 };
 
+#define SLAB_OBJ_PFMEMALLOC	1
+static inline bool is_obj_pfmemalloc(void *objp)
+{
+	return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
+}
+
+static inline void set_obj_pfmemalloc(void **objp)
+{
+	*objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
+	return;
+}
+
+static inline void clear_obj_pfmemalloc(void **objp)
+{
+	*objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
+}
+
 /*
  * bootstrap: The caches do not work without cpuarrays anymore, but the
  * cpuarrays are allocated from the generic caches...
@@ -929,12 +954,100 @@ static struct array_cache *alloc_arraycache(int node, int entries,
 		nc->avail = 0;
 		nc->limit = entries;
 		nc->batchcount = batchcount;
-		nc->touched = 0;
+		nc->touched = false;
 		spin_lock_init(&nc->lock);
 	}
 	return nc;
 }
 
+/* Clears ac->pfmemalloc if no slabs have pfmalloc set */
+static void check_ac_pfmemalloc(struct kmem_cache *cachep,
+						struct array_cache *ac)
+{
+	struct kmem_list3 *l3 = cachep->nodelists[numa_mem_id()];
+	struct slab *slabp;
+
+	if (!ac->pfmemalloc)
+		return;
+
+	list_for_each_entry(slabp, &l3->slabs_full, list)
+		if (slabp->pfmemalloc)
+			return;
+
+	list_for_each_entry(slabp, &l3->slabs_partial, list)
+		if (slabp->pfmemalloc)
+			return;
+
+	list_for_each_entry(slabp, &l3->slabs_free, list)
+		if (slabp->pfmemalloc)
+			return;
+
+	ac->pfmemalloc = false;
+}
+
+static void *ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
+						gfp_t flags, bool force_refill)
+{
+	int i;
+	void *objp = ac->entry[--ac->avail];
+
+	/* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
+	if (unlikely(is_obj_pfmemalloc(objp))) {
+		struct kmem_list3 *l3;
+
+		if (gfp_pfmemalloc_allowed(flags)) {
+			clear_obj_pfmemalloc(&objp);
+			return objp;
+		}
+
+		/* The caller cannot use PFMEMALLOC objects, find another one */
+		for (i = 1; i < ac->avail; i++) {
+			/* If a !PFMEMALLOC object is found, swap them */
+			if (!is_obj_pfmemalloc(ac->entry[i])) {
+				objp = ac->entry[i];
+				ac->entry[i] = ac->entry[ac->avail];
+				ac->entry[ac->avail] = objp;
+				return objp;
+			}
+		}
+
+		/*
+		 * If there are empty slabs on the slabs_free list and we are
+		 * being forced to refill the cache, mark this one !pfmemalloc.
+		 */
+		l3 = cachep->nodelists[numa_mem_id()];
+		if (!list_empty(&l3->slabs_free) && force_refill) {
+			struct slab *slabp = virt_to_slab(objp);
+			slabp->pfmemalloc = false;
+			clear_obj_pfmemalloc(&objp);
+			check_ac_pfmemalloc(cachep, ac);
+			return objp;
+		}
+
+		/* No !PFMEMALLOC objects available */
+		ac->avail++;
+		objp = NULL;
+	}
+
+	return objp;
+}
+
+static void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
+								void *objp)
+{
+	struct slab *slabp;
+
+	/* If there are pfmemalloc slabs, check if the object is part of one */
+	if (unlikely(ac->pfmemalloc)) {
+		slabp = virt_to_slab(objp);
+
+		if (slabp->pfmemalloc)
+			set_obj_pfmemalloc(&objp);
+	}
+
+	ac->entry[ac->avail++] = objp;
+}
+
 /*
  * Transfer objects in one arraycache to another.
  * Locking must be handled by the caller.
@@ -1111,7 +1224,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
 			STATS_INC_ACOVERFLOW(cachep);
 			__drain_alien_cache(cachep, alien, nodeid);
 		}
-		alien->entry[alien->avail++] = objp;
+		ac_put_obj(cachep, alien, objp);
 		spin_unlock(&alien->lock);
 	} else {
 		spin_lock(&(cachep->nodelists[nodeid])->list_lock);
@@ -1719,7 +1832,8 @@ __initcall(cpucache_init);
  * did not request dmaable memory, we might get it, but that
  * would be relatively rare and ignorable.
  */
-static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid,
+		bool *pfmemalloc)
 {
 	struct page *page;
 	int nr_pages;
@@ -1740,6 +1854,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
 	if (!page)
 		return NULL;
+	*pfmemalloc = page->pfmemalloc;
 
 	nr_pages = (1 << cachep->gfporder);
 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
@@ -2172,7 +2287,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
 	cpu_cache_get(cachep)->avail = 0;
 	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
 	cpu_cache_get(cachep)->batchcount = 1;
-	cpu_cache_get(cachep)->touched = 0;
+	cpu_cache_get(cachep)->touched = false;
 	cachep->batchcount = 1;
 	cachep->limit = BOOT_CPUCACHE_ENTRIES;
 	return 0;
@@ -2730,6 +2845,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
 	slabp->s_mem = objp + colour_off;
 	slabp->nodeid = nodeid;
 	slabp->free = 0;
+	slabp->pfmemalloc = false;
 	return slabp;
 }
 
@@ -2861,7 +2977,7 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
  * kmem_cache_alloc() when there are no active objs left in a cache.
  */
 static int cache_grow(struct kmem_cache *cachep,
-		gfp_t flags, int nodeid, void *objp)
+		gfp_t flags, int nodeid, void *objp, bool pfmemalloc)
 {
 	struct slab *slabp;
 	size_t offset;
@@ -2905,7 +3021,7 @@ static int cache_grow(struct kmem_cache *cachep,
 	 * 'nodeid'.
 	 */
 	if (!objp)
-		objp = kmem_getpages(cachep, local_flags, nodeid);
+		objp = kmem_getpages(cachep, local_flags, nodeid, &pfmemalloc);
 	if (!objp)
 		goto failed;
 
@@ -2915,6 +3031,13 @@ static int cache_grow(struct kmem_cache *cachep,
 	if (!slabp)
 		goto opps1;
 
+	/* Record if ALLOC_PFMEMALLOC was set when allocating the slab */
+	if (pfmemalloc) {
+		struct array_cache *ac = cpu_cache_get(cachep);
+		slabp->pfmemalloc = true;
+		ac->pfmemalloc = true;
+	}
+
 	slab_map_pages(cachep, slabp, objp);
 
 	cache_init_objs(cachep, slabp);
@@ -3056,16 +3179,19 @@ bad:
 #define check_slabp(x,y) do { } while(0)
 #endif
 
-static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
+static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
+							bool force_refill)
 {
 	int batchcount;
 	struct kmem_list3 *l3;
 	struct array_cache *ac;
 	int node;
 
-retry:
 	check_irq_off();
 	node = numa_mem_id();
+	if (unlikely(force_refill))
+		goto force_grow;
+retry:
 	ac = cpu_cache_get(cachep);
 	batchcount = ac->batchcount;
 	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -3083,7 +3209,7 @@ retry:
 
 	/* See if we can refill from the shared array */
 	if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
-		l3->shared->touched = 1;
+		l3->shared->touched = true;
 		goto alloc_done;
 	}
 
@@ -3115,8 +3241,8 @@ retry:
 			STATS_INC_ACTIVE(cachep);
 			STATS_SET_HIGH(cachep);
 
-			ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
-							    node);
+			ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
+									node));
 		}
 		check_slabp(cachep, slabp);
 
@@ -3135,18 +3261,25 @@ alloc_done:
 
 	if (unlikely(!ac->avail)) {
 		int x;
-		x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
+force_grow:
+		x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL, false);
 
 		/* cache_grow can reenable interrupts, then ac could change. */
 		ac = cpu_cache_get(cachep);
-		if (!x && ac->avail == 0)	/* no objects in sight? abort */
+
+		/* no objects in sight? abort */
+		if (!x && (ac->avail == 0 || force_refill))
 			return NULL;
 
-		if (!ac->avail)		/* objects refilled by interrupt? */
+		/* objects refilled by interrupt? */
+		if (!ac->avail) {
+			node = numa_node_id();
 			goto retry;
+		}
 	}
-	ac->touched = 1;
-	return ac->entry[--ac->avail];
+	ac->touched = true;
+
+	return ac_get_obj(cachep, ac, flags, force_refill);
 }
 
 static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
@@ -3228,23 +3361,35 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 {
 	void *objp;
 	struct array_cache *ac;
+	bool force_refill = false;
 
 	check_irq_off();
 
 	ac = cpu_cache_get(cachep);
 	if (likely(ac->avail)) {
-		STATS_INC_ALLOCHIT(cachep);
-		ac->touched = 1;
-		objp = ac->entry[--ac->avail];
-	} else {
-		STATS_INC_ALLOCMISS(cachep);
-		objp = cache_alloc_refill(cachep, flags);
+		ac->touched = true;
+		objp = ac_get_obj(cachep, ac, flags, false);
+
 		/*
-		 * the 'ac' may be updated by cache_alloc_refill(),
-		 * and kmemleak_erase() requires its correct value.
+		 * Allow for the possibility all avail objects are not allowed
+		 * by the current flags
 		 */
-		ac = cpu_cache_get(cachep);
+		if (objp) {
+			STATS_INC_ALLOCHIT(cachep);
+			goto out;
+		}
+		force_refill = true;
 	}
+
+	STATS_INC_ALLOCMISS(cachep);
+	objp = cache_alloc_refill(cachep, flags, force_refill);
+	/*
+	 * the 'ac' may be updated by cache_alloc_refill(),
+	 * and kmemleak_erase() requires its correct value.
+	 */
+	ac = cpu_cache_get(cachep);
+
+out:
 	/*
 	 * To avoid a false negative, if an object that is in one of the
 	 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
@@ -3297,6 +3442,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 	enum zone_type high_zoneidx = gfp_zone(flags);
 	void *obj = NULL;
 	int nid;
+	bool pfmemalloc;
 
 	if (flags & __GFP_THISNODE)
 		return NULL;
@@ -3333,7 +3479,8 @@ retry:
 		if (local_flags & __GFP_WAIT)
 			local_irq_enable();
 		kmem_flagcheck(cache, flags);
-		obj = kmem_getpages(cache, local_flags, numa_mem_id());
+		obj = kmem_getpages(cache, local_flags, numa_mem_id(),
+							&pfmemalloc);
 		if (local_flags & __GFP_WAIT)
 			local_irq_disable();
 		if (obj) {
@@ -3341,7 +3488,7 @@ retry:
 			 * Insert into the appropriate per node queues
 			 */
 			nid = page_to_nid(virt_to_page(obj));
-			if (cache_grow(cache, flags, nid, obj)) {
+			if (cache_grow(cache, flags, nid, obj, pfmemalloc)) {
 				obj = ____cache_alloc_node(cache,
 					flags | GFP_THISNODE, nid);
 				if (!obj)
@@ -3413,7 +3560,7 @@ retry:
 
 must_grow:
 	spin_unlock(&l3->list_lock);
-	x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
+	x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL, false);
 	if (x)
 		goto retry;
 
@@ -3563,9 +3710,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
 	struct kmem_list3 *l3;
 
 	for (i = 0; i < nr_objects; i++) {
-		void *objp = objpp[i];
+		void *objp;
 		struct slab *slabp;
 
+		clear_obj_pfmemalloc(&objpp[i]);
+		objp = objpp[i];
+
 		slabp = virt_to_slab(objp);
 		l3 = cachep->nodelists[node];
 		list_del(&slabp->list);
@@ -3678,12 +3828,12 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
 
 	if (likely(ac->avail < ac->limit)) {
 		STATS_INC_FREEHIT(cachep);
-		ac->entry[ac->avail++] = objp;
+		ac_put_obj(cachep, ac, objp);
 		return;
 	} else {
 		STATS_INC_FREEMISS(cachep);
 		cache_flusharray(cachep, ac);
-		ac->entry[ac->avail++] = objp;
+		ac_put_obj(cachep, ac, objp);
 	}
 }
 
@@ -4110,7 +4260,7 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
 	if (!ac || !ac->avail)
 		return;
 	if (ac->touched && !force) {
-		ac->touched = 0;
+		ac->touched = false;
 	} else {
 		spin_lock_irq(&l3->list_lock);
 		if (ac->avail) {
diff --git a/mm/slub.c b/mm/slub.c
index 9f662d7..6945acf 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -32,6 +32,8 @@
 
 #include <trace/events/kmem.h>
 
+#include "internal.h"
+
 /*
  * Lock order:
  *   1. slub_lock (Global Semaphore)
@@ -1414,7 +1416,8 @@ static void setup_object(struct kmem_cache *s, struct page *page,
 		s->ctor(object);
 }
 
-static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
+static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node,
+							bool *pfmemalloc)
 {
 	struct page *page;
 	void *start;
@@ -1429,6 +1432,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 		goto out;
 
 	inc_slabs_node(s, page_to_nid(page), page->objects);
+	*pfmemalloc = page->pfmemalloc;
 	page->slab = s;
 	page->flags |= 1 << PG_slab;
 
@@ -2027,6 +2031,14 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
 	}
 }
 
+static inline bool pfmemalloc_match(struct kmem_cache_cpu *c, gfp_t gfpflags)
+{
+	if (unlikely(c->pfmemalloc))
+		return gfp_pfmemalloc_allowed(gfpflags);
+
+	return true;
+}
+
 /*
  * Slow path. The lockless freelist is empty or we need to perform
  * debugging duties.
@@ -2053,6 +2065,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	unsigned long flags;
 	struct page new;
 	unsigned long counters;
+	bool pfmemalloc = false;
 
 	local_irq_save(flags);
 #ifdef CONFIG_PREEMPT
@@ -2077,6 +2090,16 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 		goto new_slab;
 	}
 
+	/*
+	 * By rights, we should be searching for a slab page that was
+	 * PFMEMALLOC but right now, we are losing the pfmemalloc
+	 * information when the page leaves the per-cpu allocator
+	 */
+	if (unlikely(!pfmemalloc_match(c, gfpflags))) {
+		deactivate_slab(s, c);
+		goto new_slab;
+	}
+
 	stat(s, ALLOC_SLOWPATH);
 
 	do {
@@ -2129,7 +2152,7 @@ new_slab:
 		goto load_freelist;
 	}
 
-	page = new_slab(s, gfpflags, node);
+	page = new_slab(s, gfpflags, node, &pfmemalloc);
 
 	if (page) {
 		c = __this_cpu_ptr(s->cpu_slab);
@@ -2147,6 +2170,7 @@ new_slab:
 		stat(s, ALLOC_SLAB);
 		c->node = page_to_nid(page);
 		c->page = page;
+		c->pfmemalloc = pfmemalloc;
 
 		if (kmem_cache_debug(s))
 			goto debug;
@@ -2209,8 +2233,8 @@ redo:
 	barrier();
 
 	object = c->freelist;
-	if (unlikely(!object || !node_match(c, node)))
-
+	if (unlikely(!object || !node_match(c, node) ||
+					!pfmemalloc_match(c, gfpflags)))
 		object = __slab_alloc(s, gfpflags, node, addr, c);
 
 	else {
@@ -2669,10 +2693,11 @@ static void early_kmem_cache_node_alloc(int node)
 {
 	struct page *page;
 	struct kmem_cache_node *n;
+	bool pfmemalloc;	/* Ignore this early in boot */
 
 	BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
 
-	page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
+	page = new_slab(kmem_cache_node, GFP_NOWAIT, node, &pfmemalloc);
 
 	BUG_ON(!page);
 	if (page_to_nid(page) != node) {
-- 
1.7.3.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH 03/14] mm: Introduce __GFP_MEMALLOC to allow access to emergency reserves
From: Mel Gorman @ 2011-09-09 10:57 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Neil Brown,
	Peter Zijlstra, Mel Gorman
In-Reply-To: <1315565845-16857-1-git-send-email-mgorman@suse.de>

__GFP_MEMALLOC will allow the allocation to disregard the watermarks,
much like PF_MEMALLOC. It allows one to pass along the memalloc state
in object related allocation flags as opposed to task related flags,
such as sk->sk_allocation. This removes the need for ALLOC_PFMEMALLOC
as callers using __GFP_MEMALLOC can get the ALLOC_NO_WATERMARK flag
which is now enough to identify allocations related to page reclaim.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/linux/gfp.h             |   10 ++++++++--
 include/linux/mm_types.h        |    2 +-
 include/trace/events/gfpflags.h |    1 +
 mm/page_alloc.c                 |   14 ++++++--------
 mm/slab.c                       |    2 +-
 5 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 3a76faf..38acdc7 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -23,6 +23,7 @@ struct vm_area_struct;
 #define ___GFP_REPEAT		0x400u
 #define ___GFP_NOFAIL		0x800u
 #define ___GFP_NORETRY		0x1000u
+#define ___GFP_MEMALLOC		0x2000u
 #define ___GFP_COMP		0x4000u
 #define ___GFP_ZERO		0x8000u
 #define ___GFP_NOMEMALLOC	0x10000u
@@ -75,9 +76,14 @@ struct vm_area_struct;
 #define __GFP_REPEAT	((__force gfp_t)___GFP_REPEAT)	/* See above */
 #define __GFP_NOFAIL	((__force gfp_t)___GFP_NOFAIL)	/* See above */
 #define __GFP_NORETRY	((__force gfp_t)___GFP_NORETRY) /* See above */
+#define __GFP_MEMALLOC	((__force gfp_t)___GFP_MEMALLOC)/* Allow access to emergency reserves */
 #define __GFP_COMP	((__force gfp_t)___GFP_COMP)	/* Add compound page metadata */
 #define __GFP_ZERO	((__force gfp_t)___GFP_ZERO)	/* Return zeroed page on success */
-#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /* Don't use emergency reserves */
+#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /* Don't use emergency reserves.
+							 * This takes precedence over the
+							 * __GFP_MEMALLOC flag if both are
+							 * set
+							 */
 #define __GFP_HARDWALL   ((__force gfp_t)___GFP_HARDWALL) /* Enforce hardwall cpuset memory allocs */
 #define __GFP_THISNODE	((__force gfp_t)___GFP_THISNODE)/* No fallback, no policies */
 #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */
@@ -127,7 +133,7 @@ struct vm_area_struct;
 /* Control page allocator reclaim behavior */
 #define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\
 			__GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\
-			__GFP_NORETRY|__GFP_NOMEMALLOC)
+			__GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC)
 
 /* Control slab gfp mask during early boot */
 #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS))
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3716e9f..0be3d43 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -54,7 +54,7 @@ struct page {
 			pgoff_t index;		/* Our offset within mapping. */
 			void *freelist;		/* slub first free object */
 			bool pfmemalloc;	/* If set by the page allocator,
-						 * ALLOC_PFMEMALLOC was set
+						 * ALLOC_NO_WATERMARKS was set
 						 * and the low watermark was not
 						 * met implying that the system
 						 * is under some pressure. The
diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h
index 9fe3a366..d6fd8e5 100644
--- a/include/trace/events/gfpflags.h
+++ b/include/trace/events/gfpflags.h
@@ -30,6 +30,7 @@
 	{(unsigned long)__GFP_COMP,		"GFP_COMP"},		\
 	{(unsigned long)__GFP_ZERO,		"GFP_ZERO"},		\
 	{(unsigned long)__GFP_NOMEMALLOC,	"GFP_NOMEMALLOC"},	\
+	{(unsigned long)__GFP_MEMALLOC,		"GFP_MEMALLOC"},	\
 	{(unsigned long)__GFP_HARDWALL,		"GFP_HARDWALL"},	\
 	{(unsigned long)__GFP_THISNODE,		"GFP_THISNODE"},	\
 	{(unsigned long)__GFP_RECLAIMABLE,	"GFP_RECLAIMABLE"},	\
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 561cb61..03fd18c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1369,7 +1369,6 @@ failed:
 #define ALLOC_HARDER		0x10 /* try to alloc harder */
 #define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
 #define ALLOC_CPUSET		0x40 /* check for correct cpuset */
-#define ALLOC_PFMEMALLOC	0x80 /* Caller has PF_MEMALLOC set */
 
 #ifdef CONFIG_FAIL_PAGE_ALLOC
 
@@ -2058,11 +2057,10 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 	} else if (unlikely(rt_task(current)) && !in_interrupt())
 		alloc_flags |= ALLOC_HARDER;
 
-	if ((current->flags & PF_MEMALLOC) ||
-			unlikely(test_thread_flag(TIF_MEMDIE))) {
-		alloc_flags |= ALLOC_PFMEMALLOC;
-
-		if (likely(!(gfp_mask & __GFP_NOMEMALLOC)) && !in_interrupt())
+	if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
+		if (gfp_mask & __GFP_MEMALLOC)
+			alloc_flags |= ALLOC_NO_WATERMARKS;
+		else if (likely(!(gfp_mask & __GFP_NOMEMALLOC)) && !in_interrupt())
 			alloc_flags |= ALLOC_NO_WATERMARKS;
 	}
 
@@ -2071,7 +2069,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 
 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 {
-	return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_PFMEMALLOC);
+	return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
 }
 
 static inline struct page *
@@ -2253,7 +2251,7 @@ got_pg:
 	 * steps that will free more memory. The caller should avoid the
 	 * page being used for !PFMEMALLOC purposes.
 	 */
-	page->pfmemalloc = !!(alloc_flags & ALLOC_PFMEMALLOC);
+	page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
 
 	return page;
 }
diff --git a/mm/slab.c b/mm/slab.c
index 1dd03e0..25f69ec 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3031,7 +3031,7 @@ static int cache_grow(struct kmem_cache *cachep,
 	if (!slabp)
 		goto opps1;
 
-	/* Record if ALLOC_PFMEMALLOC was set when allocating the slab */
+	/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
 	if (pfmemalloc) {
 		struct array_cache *ac = cpu_cache_get(cachep);
 		slabp->pfmemalloc = true;
-- 
1.7.3.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH 04/14] mm: allow PF_MEMALLOC from softirq context
From: Mel Gorman @ 2011-09-09 10:57 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Neil Brown,
	Peter Zijlstra, Mel Gorman
In-Reply-To: <1315565845-16857-1-git-send-email-mgorman@suse.de>

This is needed to allow network softirq packet processing to make
use of PF_MEMALLOC.

Currently softirq context cannot use PF_MEMALLOC due to it not being
associated with a task, and therefore not having task flags to fiddle
with - thus the gfp to alloc flag mapping ignores the task flags when
in interrupts (hard or soft) context.

Allowing softirqs to make use of PF_MEMALLOC therefore requires some
trickery.  We basically borrow the task flags from whatever process
happens to be preempted by the softirq.

So we modify the gfp to alloc flags mapping to not exclude task flags
in softirq context, and modify the softirq code to save, clear and
restore the PF_MEMALLOC flag.

The save and clear, ensures the preempted task's PF_MEMALLOC flag
doesn't leak into the softirq. The restore ensures a softirq's
PF_MEMALLOC flag cannot leak back into the preempted process.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/linux/sched.h |    7 +++++++
 kernel/softirq.c      |    3 +++
 mm/page_alloc.c       |    5 ++++-
 3 files changed, 14 insertions(+), 1 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4ac2c05..791536c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1869,6 +1869,13 @@ static inline void rcu_copy_process(struct task_struct *p)
 
 #endif
 
+static inline void tsk_restore_flags(struct task_struct *p,
+				     unsigned long pflags, unsigned long mask)
+{
+	p->flags &= ~mask;
+	p->flags |= pflags & mask;
+}
+
 #ifdef CONFIG_SMP
 extern void do_set_cpus_allowed(struct task_struct *p,
 			       const struct cpumask *new_mask);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index fca82c3..f773afe 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -210,6 +210,8 @@ asmlinkage void __do_softirq(void)
 	__u32 pending;
 	int max_restart = MAX_SOFTIRQ_RESTART;
 	int cpu;
+	unsigned long pflags = current->flags;
+	current->flags &= ~PF_MEMALLOC;
 
 	pending = local_softirq_pending();
 	account_system_vtime(current);
@@ -265,6 +267,7 @@ restart:
 
 	account_system_vtime(current);
 	__local_bh_enable(SOFTIRQ_OFFSET);
+	tsk_restore_flags(current, pflags, PF_MEMALLOC);
 }
 
 #ifndef __ARCH_HAS_DO_SOFTIRQ
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 03fd18c..31e0eb2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2060,7 +2060,10 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 	if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
 		if (gfp_mask & __GFP_MEMALLOC)
 			alloc_flags |= ALLOC_NO_WATERMARKS;
-		else if (likely(!(gfp_mask & __GFP_NOMEMALLOC)) && !in_interrupt())
+		else if (!in_irq() && (current->flags & PF_MEMALLOC))
+			alloc_flags |= ALLOC_NO_WATERMARKS;
+		else if (!in_interrupt() &&
+				unlikely(test_thread_flag(TIF_MEMDIE)))
 			alloc_flags |= ALLOC_NO_WATERMARKS;
 	}
 
-- 
1.7.3.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH 05/14] mm: Ignore mempolicies when using ALLOC_NO_WATERMARK
From: Mel Gorman @ 2011-09-09 10:57 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Neil Brown,
	Peter Zijlstra, Mel Gorman
In-Reply-To: <1315565845-16857-1-git-send-email-mgorman@suse.de>

The reserve is proportionally distributed over all !highmem zones
in the system. So we need to allow an emergency allocation access to
all zones.  In order to do that we need to break out of any mempolicy
boundaries we might have.

In my opinion that does not break mempolicies as those are user
oriented and not system oriented. That is, system allocations are
not guaranteed to be within mempolicy boundaries. For instance IRQs
do not even have a mempolicy.

So breaking out of mempolicy boundaries for 'rare' emergency
allocations, which are always system allocations (as opposed to user)
is ok.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 mm/page_alloc.c |    7 +++++++
 1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 31e0eb2..17c8f93 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2140,6 +2140,13 @@ rebalance:
 
 	/* Allocate without watermarks if the context allows */
 	if (alloc_flags & ALLOC_NO_WATERMARKS) {
+		/*
+		 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
+		 * the allocation is high priority and these type of
+		 * allocations are system rather than user orientated
+		 */
+		zonelist = node_zonelist(numa_node_id(), gfp_mask);
+
 		page = __alloc_pages_high_priority(gfp_mask, order,
 				zonelist, high_zoneidx, nodemask,
 				preferred_zone, migratetype);
-- 
1.7.3.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH 06/14] net: Introduce sk_allocation() to allow addition of GFP flags depending on the individual socket
From: Mel Gorman @ 2011-09-09 10:57 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Neil Brown,
	Peter Zijlstra, Mel Gorman
In-Reply-To: <1315565845-16857-1-git-send-email-mgorman@suse.de>

Introduce sk_allocation(), this function allows to inject sock specific
flags to each sock related allocation. It is only used on allocation
paths that may be required for writing pages back to network storage.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/net/sock.h    |    5 +++++
 net/ipv4/tcp.c        |    3 ++-
 net/ipv4/tcp_output.c |   13 +++++++------
 net/ipv6/tcp_ipv6.c   |   12 +++++++++---
 4 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 8e4062f..a4d5e61 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -586,6 +586,11 @@ static inline int sock_flag(struct sock *sk, enum sock_flags flag)
 	return test_bit(flag, &sk->sk_flags);
 }
 
+static inline gfp_t sk_allocation(struct sock *sk, gfp_t gfp_mask)
+{
+	return gfp_mask;
+}
+
 static inline void sk_acceptq_removed(struct sock *sk)
 {
 	sk->sk_ack_backlog--;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 46febca..67f4a6d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -698,7 +698,8 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
 	/* The TCP header must be at least 32-bit aligned.  */
 	size = ALIGN(size, 4);
 
-	skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
+	skb = alloc_skb_fclone(size + sk->sk_prot->max_header,
+			       sk_allocation(sk, gfp));
 	if (skb) {
 		if (sk_wmem_schedule(sk, skb->truesize)) {
 			/*
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 882e0b0..87b98f6 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2324,7 +2324,7 @@ void tcp_send_fin(struct sock *sk)
 		/* Socket is locked, keep trying until memory is available. */
 		for (;;) {
 			skb = alloc_skb_fclone(MAX_TCP_HEADER,
-					       sk->sk_allocation);
+					       sk_allocation(sk, GFP_KERNEL));
 			if (skb)
 				break;
 			yield();
@@ -2350,7 +2350,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
 	struct sk_buff *skb;
 
 	/* NOTE: No TCP options attached and we never retransmit this. */
-	skb = alloc_skb(MAX_TCP_HEADER, priority);
+	skb = alloc_skb(MAX_TCP_HEADER, sk_allocation(sk, priority));
 	if (!skb) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
 		return;
@@ -2423,7 +2423,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 
 	if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
 		s_data_desired = cvp->s_data_desired;
-	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC);
+	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1,
+					sk_allocation(sk, GFP_ATOMIC));
 	if (skb == NULL)
 		return NULL;
 
@@ -2719,7 +2720,7 @@ void tcp_send_ack(struct sock *sk)
 	 * tcp_transmit_skb() will set the ownership to this
 	 * sock.
 	 */
-	buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+	buff = alloc_skb(MAX_TCP_HEADER, sk_allocation(sk, GFP_ATOMIC));
 	if (buff == NULL) {
 		inet_csk_schedule_ack(sk);
 		inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
@@ -2734,7 +2735,7 @@ void tcp_send_ack(struct sock *sk)
 
 	/* Send it off, this clears delayed acks for us. */
 	TCP_SKB_CB(buff)->when = tcp_time_stamp;
-	tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
+	tcp_transmit_skb(sk, buff, 0, sk_allocation(sk, GFP_ATOMIC));
 }
 
 /* This routine sends a packet with an out of date sequence
@@ -2754,7 +2755,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
 	struct sk_buff *skb;
 
 	/* We don't queue it, tcp_transmit_skb() sets ownership. */
-	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+	skb = alloc_skb(MAX_TCP_HEADER, sk_allocation(sk, GFP_ATOMIC));
 	if (skb == NULL)
 		return -1;
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d1fb63f..7ee93b2 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -598,7 +598,8 @@ static int tcp_v6_md5_do_add(struct sock *sk, const struct in6_addr *peer,
 	} else {
 		/* reallocate new list if current one is full. */
 		if (!tp->md5sig_info) {
-			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info), GFP_ATOMIC);
+			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
+					sk_allocation(sk, GFP_ATOMIC));
 			if (!tp->md5sig_info) {
 				kfree(newkey);
 				return -ENOMEM;
@@ -611,7 +612,8 @@ static int tcp_v6_md5_do_add(struct sock *sk, const struct in6_addr *peer,
 		}
 		if (tp->md5sig_info->alloced6 == tp->md5sig_info->entries6) {
 			keys = kmalloc((sizeof (tp->md5sig_info->keys6[0]) *
-				       (tp->md5sig_info->entries6 + 1)), GFP_ATOMIC);
+				       (tp->md5sig_info->entries6 + 1)),
+				       sk_allocation(sk, GFP_ATOMIC));
 
 			if (!keys) {
 				tcp_free_md5sig_pool();
@@ -735,7 +737,8 @@ static int tcp_v6_parse_md5_keys (struct sock *sk, char __user *optval,
 		struct tcp_sock *tp = tcp_sk(sk);
 		struct tcp_md5sig_info *p;
 
-		p = kzalloc(sizeof(struct tcp_md5sig_info), GFP_KERNEL);
+		p = kzalloc(sizeof(struct tcp_md5sig_info),
+				   sk_allocation(sk, GFP_KERNEL));
 		if (!p)
 			return -ENOMEM;
 
@@ -1085,6 +1088,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
 	struct tcphdr *th = tcp_hdr(skb);
 	u32 seq = 0, ack_seq = 0;
 	struct tcp_md5sig_key *key = NULL;
+	gfp_t gfp_mask = GFP_ATOMIC;
 
 	if (th->rst)
 		return;
@@ -1096,6 +1100,8 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
 	if (sk)
 		key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr);
 #endif
+	if (sk)
+		gfp_mask = sk_allocation(sk, gfp_mask);
 
 	if (th->ack)
 		seq = ntohl(th->ack_seq);
-- 
1.7.3.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH 07/14] netvm: Allow the use of __GFP_MEMALLOC by specific sockets
From: Mel Gorman @ 2011-09-09 10:57 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Neil Brown,
	Peter Zijlstra, Mel Gorman
In-Reply-To: <1315565845-16857-1-git-send-email-mgorman@suse.de>

Allow specific sockets to be tagged SOCK_MEMALLOC and use
__GFP_MEMALLOC for their allocations. These sockets will be able to go
below watermarks and allocate from the emergency reserve. Such sockets
are to be used to service the VM (iow. to swap over). They must be
handled kernel side, exposing such a socket to user-space is a bug.

There is a risk that the reserves be depleted so for now, the
administrator is responsible for increasing min_free_kbytes as
necessary to prevent deadlock for their workloads.

[a.p.zijlstra@chello.nl: Original patches]
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/net/sock.h |    5 ++++-
 net/core/sock.c    |   22 ++++++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index a4d5e61..583df68 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -554,6 +554,7 @@ enum sock_flags {
 	SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */
 	SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
 	SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */
+	SOCK_MEMALLOC, /* VM depends on this socket for swapping */
 	SOCK_TIMESTAMPING_TX_HARDWARE,  /* %SOF_TIMESTAMPING_TX_HARDWARE */
 	SOCK_TIMESTAMPING_TX_SOFTWARE,  /* %SOF_TIMESTAMPING_TX_SOFTWARE */
 	SOCK_TIMESTAMPING_RX_HARDWARE,  /* %SOF_TIMESTAMPING_RX_HARDWARE */
@@ -588,7 +589,7 @@ static inline int sock_flag(struct sock *sk, enum sock_flags flag)
 
 static inline gfp_t sk_allocation(struct sock *sk, gfp_t gfp_mask)
 {
-	return gfp_mask;
+	return gfp_mask | (sk->sk_allocation & __GFP_MEMALLOC);
 }
 
 static inline void sk_acceptq_removed(struct sock *sk)
@@ -718,6 +719,8 @@ extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p);
 extern void sk_stream_wait_close(struct sock *sk, long timeo_p);
 extern int sk_stream_error(struct sock *sk, int flags, int err);
 extern void sk_stream_kill_queues(struct sock *sk);
+extern void sk_set_memalloc(struct sock *sk);
+extern void sk_clear_memalloc(struct sock *sk);
 
 extern int sk_wait_data(struct sock *sk, long *timeo);
 
diff --git a/net/core/sock.c b/net/core/sock.c
index bc745d0..2e3b69b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -221,6 +221,28 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 EXPORT_SYMBOL(sysctl_optmem_max);
 
+/**
+ * sk_set_memalloc - sets %SOCK_MEMALLOC
+ * @sk: socket to set it on
+ *
+ * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
+ * It's the responsibility of the admin to adjust min_free_kbytes
+ * to meet the requirements
+ */
+void sk_set_memalloc(struct sock *sk)
+{
+	sock_set_flag(sk, SOCK_MEMALLOC);
+	sk->sk_allocation |= __GFP_MEMALLOC;
+}
+EXPORT_SYMBOL_GPL(sk_set_memalloc);
+
+void sk_clear_memalloc(struct sock *sk)
+{
+	sock_reset_flag(sk, SOCK_MEMALLOC);
+	sk->sk_allocation &= ~__GFP_MEMALLOC;
+}
+EXPORT_SYMBOL_GPL(sk_clear_memalloc);
+
 #if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP)
 int net_cls_subsys_id = -1;
 EXPORT_SYMBOL_GPL(net_cls_subsys_id);
-- 
1.7.3.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH 08/14] netvm: Allow skb allocation to use PFMEMALLOC reserves
From: Mel Gorman @ 2011-09-09 10:57 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Neil Brown,
	Peter Zijlstra, Mel Gorman
In-Reply-To: <1315565845-16857-1-git-send-email-mgorman@suse.de>

Change the skb allocation API to indicate RX usage and use this to fall
back to the PFMEMALLOC reserve when needed. SKBs allocated from the
reserve are tagged in skb->pfmemalloc. If an SKB is allocated from
the reserve and the socket is later found to be unrelated to page
reclaim, the packet is dropped so that the memory remains available
for page reclaim. Network protocols are expected to recover from this
packet loss.

[a.p.zijlstra@chello.nl: Ideas taken from various patches]
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/linux/gfp.h    |    3 ++
 include/linux/skbuff.h |   19 ++++++++--
 include/net/sock.h     |    6 +++
 mm/internal.h          |    3 --
 net/core/filter.c      |    8 ++++
 net/core/skbuff.c      |   95 ++++++++++++++++++++++++++++++++++++++++--------
 net/core/sock.c        |    4 ++
 7 files changed, 116 insertions(+), 22 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 38acdc7..11588cdf 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -375,6 +375,9 @@ void drain_local_pages(void *dummy);
 
 extern gfp_t gfp_allowed_mask;
 
+/* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
+
 extern void pm_restrict_gfp_mask(void);
 extern void pm_restore_gfp_mask(void);
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7b996ed..c8cde02 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -413,6 +413,7 @@ struct sk_buff {
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
 	__u8			ndisc_nodetype:2;
 #endif
+	__u8			pfmemalloc:1;
 	__u8			ooo_okay:1;
 	kmemcheck_bitfield_end(flags2);
 
@@ -451,6 +452,15 @@ struct sk_buff {
 
 #include <asm/system.h>
 
+#define SKB_ALLOC_FCLONE	0x01
+#define SKB_ALLOC_RX		0x02
+
+/* Returns true if the skb was allocated from PFMEMALLOC reserves */
+static inline bool skb_pfmemalloc(struct sk_buff *skb)
+{
+	return unlikely(skb->pfmemalloc);
+}
+
 /*
  * skb might have a dst pointer attached, refcounted or not.
  * _skb_refdst low order bit is set if refcount was _not_ taken
@@ -508,7 +518,7 @@ extern void kfree_skb(struct sk_buff *skb);
 extern void consume_skb(struct sk_buff *skb);
 extern void	       __kfree_skb(struct sk_buff *skb);
 extern struct sk_buff *__alloc_skb(unsigned int size,
-				   gfp_t priority, int fclone, int node);
+				   gfp_t priority, int flags, int node);
 static inline struct sk_buff *alloc_skb(unsigned int size,
 					gfp_t priority)
 {
@@ -518,7 +528,7 @@ static inline struct sk_buff *alloc_skb(unsigned int size,
 static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
 					       gfp_t priority)
 {
-	return __alloc_skb(size, priority, 1, NUMA_NO_NODE);
+	return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE);
 }
 
 extern bool skb_recycle_check(struct sk_buff *skb, int skb_size);
@@ -1549,7 +1559,8 @@ static inline void __skb_queue_purge(struct sk_buff_head *list)
 static inline struct sk_buff *__dev_alloc_skb(unsigned int length,
 					      gfp_t gfp_mask)
 {
-	struct sk_buff *skb = alloc_skb(length + NET_SKB_PAD, gfp_mask);
+	struct sk_buff *skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask,
+						SKB_ALLOC_RX, NUMA_NO_NODE);
 	if (likely(skb))
 		skb_reserve(skb, NET_SKB_PAD);
 	return skb;
@@ -1606,7 +1617,7 @@ static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev,
  */
 static inline struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask)
 {
-	return alloc_pages_node(NUMA_NO_NODE, gfp_mask, 0);
+	return alloc_pages_node(NUMA_NO_NODE, gfp_mask | __GFP_MEMALLOC, 0);
 }
 
 /**
diff --git a/include/net/sock.h b/include/net/sock.h
index 583df68..cf3f102 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -587,6 +587,12 @@ static inline int sock_flag(struct sock *sk, enum sock_flags flag)
 	return test_bit(flag, &sk->sk_flags);
 }
 
+extern atomic_t memalloc_socks;
+static inline int sk_memalloc_socks(void)
+{
+	return atomic_read(&memalloc_socks);
+}
+
 static inline gfp_t sk_allocation(struct sock *sk, gfp_t gfp_mask)
 {
 	return gfp_mask | (sk->sk_allocation & __GFP_MEMALLOC);
diff --git a/mm/internal.h b/mm/internal.h
index a520f3b..d071d380 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -193,9 +193,6 @@ static inline struct page *mem_map_next(struct page *iter,
 #define __paginginit __init
 #endif
 
-/* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */
-bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
-
 /* Memory initialisation debug and verification */
 enum mminit_level {
 	MMINIT_WARNING,
diff --git a/net/core/filter.c b/net/core/filter.c
index 36f975f..4ccf6f4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -80,6 +80,14 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
 	int err;
 	struct sk_filter *filter;
 
+	/*
+	 * If the skb was allocated from pfmemalloc reserves, only
+	 * allow SOCK_MEMALLOC sockets to use it as this socket is
+	 * helping free memory
+	 */
+	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
+		return -ENOMEM;
+
 	err = security_sock_rcv_skb(sk, skb);
 	if (err)
 		return err;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 27002df..976cd90 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -147,6 +147,43 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
 	BUG();
 }
 
+
+/*
+ * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
+ * the caller if emergency pfmemalloc reserves are being used. If it is and
+ * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
+ * may be used. Otherwise, the packet data may be discarded until enough
+ * memory is free
+ */
+#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
+	 __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)
+void *__kmalloc_reserve(size_t size, gfp_t flags, int node, unsigned long ip,
+			 bool *pfmemalloc)
+{
+	void *obj;
+	bool ret_pfmemalloc = false;
+
+	/*
+	 * Try a regular allocation, when that fails and we're not entitled
+	 * to the reserves, fail.
+	 */
+	obj = kmalloc_node_track_caller(size,
+				flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
+				node);
+	if (obj || !(gfp_pfmemalloc_allowed(flags)))
+		goto out;
+
+	/* Try again but now we are using pfmemalloc reserves */
+	ret_pfmemalloc = true;
+	obj = kmalloc_node_track_caller(size, flags, node);
+
+out:
+	if (pfmemalloc)
+		*pfmemalloc = ret_pfmemalloc;
+
+	return obj;
+}
+
 /* 	Allocate a new skbuff. We do this ourselves so we can fill in a few
  *	'private' fields and also do memory statistics to find all the
  *	[BEEP] leaks.
@@ -157,8 +194,10 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
  *	__alloc_skb	-	allocate a network buffer
  *	@size: size to allocate
  *	@gfp_mask: allocation mask
- *	@fclone: allocate from fclone cache instead of head cache
- *		and allocate a cloned (child) skb
+ *	@flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
+ *		instead of head cache and allocate a cloned (child) skb.
+ *		If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
+ *		allocations in case the data is required for writeback
  *	@node: numa node to allocate memory on
  *
  *	Allocate a new &sk_buff. The returned buffer has no headroom and a
@@ -169,14 +208,19 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
  *	%GFP_ATOMIC.
  */
 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
-			    int fclone, int node)
+			    int flags, int node)
 {
 	struct kmem_cache *cache;
 	struct skb_shared_info *shinfo;
 	struct sk_buff *skb;
 	u8 *data;
+	bool pfmemalloc;
+
+	cache = (flags & SKB_ALLOC_FCLONE)
+		? skbuff_fclone_cache : skbuff_head_cache;
 
-	cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
+	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
+		gfp_mask |= __GFP_MEMALLOC;
 
 	/* Get the HEAD */
 	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
@@ -185,8 +229,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	prefetchw(skb);
 
 	size = SKB_DATA_ALIGN(size);
-	data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
-			gfp_mask, node);
+	data = kmalloc_reserve(size + sizeof(struct skb_shared_info),
+			gfp_mask, node, &pfmemalloc);
 	if (!data)
 		goto nodata;
 	prefetchw(data + size);
@@ -197,6 +241,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	 * the tail pointer in struct sk_buff!
 	 */
 	memset(skb, 0, offsetof(struct sk_buff, tail));
+	skb->pfmemalloc = pfmemalloc;
 	skb->truesize = size + sizeof(struct sk_buff);
 	atomic_set(&skb->users, 1);
 	skb->head = data;
@@ -213,7 +258,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	atomic_set(&shinfo->dataref, 1);
 	kmemcheck_annotate_variable(shinfo->destructor_arg);
 
-	if (fclone) {
+	if (flags & SKB_ALLOC_FCLONE) {
 		struct sk_buff *child = skb + 1;
 		atomic_t *fclone_ref = (atomic_t *) (child + 1);
 
@@ -223,6 +268,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 		atomic_set(fclone_ref, 1);
 
 		child->fclone = SKB_FCLONE_UNAVAILABLE;
+		child->pfmemalloc = pfmemalloc;
 	}
 out:
 	return skb;
@@ -251,7 +297,8 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
 {
 	struct sk_buff *skb;
 
-	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE);
+	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask,
+						SKB_ALLOC_RX, NUMA_NO_NODE);
 	if (likely(skb)) {
 		skb_reserve(skb, NET_SKB_PAD);
 		skb->dev = dev;
@@ -542,6 +589,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 	new->ipvs_property	= old->ipvs_property;
 #endif
+	new->pfmemalloc		= old->pfmemalloc;
 	new->protocol		= old->protocol;
 	new->mark		= old->mark;
 	new->skb_iif		= old->skb_iif;
@@ -687,6 +735,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 		n->fclone = SKB_FCLONE_CLONE;
 		atomic_inc(fclone_ref);
 	} else {
+		if (skb_pfmemalloc(skb))
+			gfp_mask |= __GFP_MEMALLOC;
+
 		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
 		if (!n)
 			return NULL;
@@ -723,6 +774,13 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
 }
 
+static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
+{
+	if (skb_pfmemalloc((struct sk_buff *)skb))
+		return SKB_ALLOC_RX;
+	return 0;
+}
+
 /**
  *	skb_copy	-	create private copy of an sk_buff
  *	@skb: buffer to copy
@@ -744,7 +802,8 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
 {
 	int headerlen = skb_headroom(skb);
 	unsigned int size = (skb_end_pointer(skb) - skb->head) + skb->data_len;
-	struct sk_buff *n = alloc_skb(size, gfp_mask);
+	struct sk_buff *n = __alloc_skb(size, gfp_mask,
+					skb_alloc_rx_flag(skb), NUMA_NO_NODE);
 
 	if (!n)
 		return NULL;
@@ -778,7 +837,8 @@ EXPORT_SYMBOL(skb_copy);
 struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
 {
 	unsigned int size = skb_end_pointer(skb) - skb->head;
-	struct sk_buff *n = alloc_skb(size, gfp_mask);
+	struct sk_buff *n = __alloc_skb(size, gfp_mask,
+					skb_alloc_rx_flag(skb), NUMA_NO_NODE);
 
 	if (!n)
 		goto out;
@@ -876,7 +936,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 		goto adjust_others;
 	}
 
-	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
+	if (skb_pfmemalloc(skb))
+		gfp_mask |= __GFP_MEMALLOC;
+	data = kmalloc_reserve(size + sizeof(struct skb_shared_info), gfp_mask,
+			NUMA_NO_NODE, NULL);
 	if (!data)
 		goto nodata;
 
@@ -985,8 +1048,9 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
 	/*
 	 *	Allocate the copy buffer
 	 */
-	struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom,
-				      gfp_mask);
+	struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
+				      gfp_mask, skb_alloc_rx_flag(skb),
+				      NUMA_NO_NODE);
 	int oldheadroom = skb_headroom(skb);
 	int head_copy_len, head_copy_off;
 	int off;
@@ -2647,8 +2711,9 @@ struct sk_buff *skb_segment(struct sk_buff *skb, u32 features)
 			skb_release_head_state(nskb);
 			__skb_push(nskb, doffset);
 		} else {
-			nskb = alloc_skb(hsize + doffset + headroom,
-					 GFP_ATOMIC);
+			nskb = __alloc_skb(hsize + doffset + headroom,
+					 GFP_ATOMIC, skb_alloc_rx_flag(skb),
+					 NUMA_NO_NODE);
 
 			if (unlikely(!nskb))
 				goto err;
diff --git a/net/core/sock.c b/net/core/sock.c
index 2e3b69b..07e1292 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -221,6 +221,8 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 EXPORT_SYMBOL(sysctl_optmem_max);
 
+atomic_t memalloc_socks __read_mostly;
+
 /**
  * sk_set_memalloc - sets %SOCK_MEMALLOC
  * @sk: socket to set it on
@@ -233,6 +235,7 @@ void sk_set_memalloc(struct sock *sk)
 {
 	sock_set_flag(sk, SOCK_MEMALLOC);
 	sk->sk_allocation |= __GFP_MEMALLOC;
+	atomic_inc(&memalloc_socks);
 }
 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 
@@ -240,6 +243,7 @@ void sk_clear_memalloc(struct sock *sk)
 {
 	sock_reset_flag(sk, SOCK_MEMALLOC);
 	sk->sk_allocation &= ~__GFP_MEMALLOC;
+	atomic_dec(&memalloc_socks);
 }
 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 
-- 
1.7.3.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox