Netdev List
 help / color / mirror / Atom feed
* Re: [ethtool PATCH 5/6] v4 Add RX packet classification interface
From: Ben Hutchings @ 2011-04-27 18:12 UTC (permalink / raw)
  To: Alexander Duyck; +Cc: davem, jeffrey.t.kirsher, netdev
In-Reply-To: <20110421204040.23054.87188.stgit@gitlad.jf.intel.com>

On Thu, 2011-04-21 at 13:40 -0700, Alexander Duyck wrote:
[...]
> diff --git a/ethtool.c b/ethtool.c
> index 15af86a..421fe20 100644
> --- a/ethtool.c
> +++ b/ethtool.c
[...]
> @@ -926,16 +862,45 @@ static void parse_cmdline(int argc, char **argp)
>  				i = argc;
>  				break;
>  			}
> -			if (mode == MODE_SNTUPLE) {
> +			if (mode == MODE_SCLSRULE) {
>  				if (!strcmp(argp[i], "flow-type")) {
>  					i += 1;
>  					if (i >= argc) {
>  						exit_bad_args();
>  						break;
>  					}
> -					parse_rxntupleopts(argc, argp, i);
> -					i = argc;
> -					break;
> +					if (rxclass_parse_ruleopts(&argp[i],
> +							argc - i,
> +							&rx_rule_fs) < 0) {
> +						exit_bad_args();
> +					} else {
> +						i = argc;
> +						rx_class_rule_added = 1;
> +					}
> +				} else if (!strcmp(argp[i], "class-rule-del")) {

A shorter keyword like 'delete' would do, since the -U option is only
used for flow classification.

> +					i += 1;
> +					if (i >= argc) {
> +						exit_bad_args();
> +						break;
> +					}
> +					rx_class_rule_del =
> +						get_uint_range(argp[i], 0,
> +							       INT_MAX);
> +				} else {
> +					exit_bad_args();
> +				}
> +				break;
> +			}
> +			if (mode == MODE_GCLSRULE) {
> +				if (!strcmp(argp[i], "class-rule")) {

This keyword seems redundant.

[...]
> @@ -3163,21 +3068,130 @@ static int do_permaddr(int fd, struct ifreq *ifr)
>  	return err;
>  }
>  
> +static int flow_spec_to_ntuple(struct ethtool_rx_flow_spec *fsp,
> +			       struct ethtool_rx_ntuple_flow_spec *ntuple)
> +{
> +	int i;

Should be size_t, since it's compared with a sizeof() expression.

> +	/* verify location is not specified */
> +	if (fsp->location != RX_CLS_LOC_UNSPEC)
> +		return -1;
> +
> +	/* verify ring cookie can transfer to action */
> +	if (fsp->ring_cookie > INT_MAX &&
> +	    ~fsp->ring_cookie > 1)
> +		return -1;

The second part of this condition would be more clearly expressed as:

	fsp->ring_cookie < (u64)(-2)

> +	/* verify only one field is setting data field */
> +	if ((fsp->flow_type & FLOW_EXT) &&
> +	    (fsp->m_ext.data[0] || fsp->m_ext.data[1]) &&
> +	    fsp->m_ext.vlan_etype)
> +		return -1;
> +
> +	/* initialize entire ntuple to all 0xFF */
> +	memset(ntuple, ~0, sizeof(*ntuple));

The comment needs to explain *why* the value is ~0 rather than 0.  I
assume the idea is to set the masks to ~0 if they are not initialised
below.

> +	/* set non-filter values */
> +	ntuple->flow_type = fsp->flow_type;
> +	ntuple->action = fsp->ring_cookie;
> +
> +	/* copy header portion over */
> +	memcpy(&ntuple->h_u, &fsp->h_u, sizeof(fsp->h_u));

This deserves a comment that the two h_u fields are different unions,
but are defined identically except for padding at the end.

> +	/* copy mask portion over and invert */
> +	memcpy(&ntuple->m_u, &fsp->m_u, sizeof(fsp->m_u));
> +	for (i = 0; i < sizeof(fsp->m_u); i++)
> +		ntuple->m_u.hdata[i] ^= 0xFF;
> +
> +	/* copy extended fields */
> +	if (fsp->flow_type & FLOW_EXT) {
> +		ntuple->vlan_tag =
> +			ntohs(fsp->h_ext.vlan_tci);
> +		ntuple->vlan_tag_mask =
> +			~ntohs(fsp->m_ext.vlan_tci);
> +		if (fsp->m_ext.vlan_etype) {
> +			ntuple->data =
> +				ntohl(fsp->h_ext.vlan_etype);
> +			ntuple->data_mask =
> +				~(u64)ntohl(fsp->m_ext.vlan_etype);
> +		} else {
> +			ntuple->data =
> +				(u64)ntohl(fsp->h_ext.data[0]);
> +			ntuple->data |=
> +				(u64)ntohl(fsp->h_ext.data[1]) << 32;
> +			ntuple->data_mask =
> +				(u64)ntohl(~fsp->m_ext.data[0]);
> +			ntuple->data_mask |=
> +				(u64)ntohl(~fsp->m_ext.data[1]) << 32;
> +		}
> +	}

I think it would be clearer to add:

	else {
		ntuple->vlan_tag_mask = ~(u16)0;
		ntuple->data_mask = ~(u64)0;
	}

rather than use memset() above.

> +	return 0;
> +}
> +
>  static int do_srxntuple(int fd, struct ifreq *ifr)
>  {
> +	struct ethtool_rx_ntuple ntuplecmd;
> +	struct ethtool_value eval;
>  	int err;
>  
> -	if (sntuple_changed) {
> -		struct ethtool_rx_ntuple ntuplecmd;
> +	/* verify if Ntuple is supported on the HW */

This comment is inaccurate.

> +	err = flow_spec_to_ntuple(&rx_rule_fs, &ntuplecmd.fs);
> +	if (err)
> +		return -1;
> +
> +	/*
> +	 * Check to see if the flag is set for N-tuple, this allows
> +	 * us to avoid the possible EINVAL response for the N-tuple
> +	 * flag not being set on the device
> +	 */
> +	eval.cmd = ETHTOOL_GFLAGS;
> +	ifr->ifr_data = (caddr_t)&eval;
> +	err = ioctl(fd, SIOCETHTOOL, ifr);
> +	if (err || !(eval.data & ETH_FLAG_NTUPLE))
> +		return -1;
>  
> -		ntuplecmd.cmd = ETHTOOL_SRXNTUPLE;
> -		memcpy(&ntuplecmd.fs, &ntuple_fs,
> -		       sizeof(struct ethtool_rx_ntuple_flow_spec));
> +	/* send rule via N-tuple */
> +	ntuplecmd.cmd = ETHTOOL_SRXNTUPLE;
> +	ifr->ifr_data = (caddr_t)&ntuplecmd;
> +	err = ioctl(fd, SIOCETHTOOL, ifr);
>  
> -		ifr->ifr_data = (caddr_t)&ntuplecmd;
> -		err = ioctl(fd, SIOCETHTOOL, ifr);
> -		if (err < 0)
> -			perror("Cannot add new RX n-tuple filter");
> +	/*
> +	 * Display error only if reponse is something other than op not
> +	 * supported.  It is possible that the interface uses the network
> +	 * flow classifier interface instead of N-tuple. 
> +	 */ 
> +	if (err && errno != EOPNOTSUPP)
> +		perror("Cannot add new rule via N-tuple");
> +
> +	return err;
> +}
> +
> +static int do_srxclsrule(int fd, struct ifreq *ifr)
> +{
> +	int err;
> +
> +	if (rx_class_rule_added) {
> +		/* attempt to add rule via N-tuple specifier */
> +		err = do_srxntuple(fd, ifr);
> +		if (!err)
> +			return 0;
> +
> +		/* attempt to add rule via network flow classifier */
> +		err = rxclass_rule_ins(fd, ifr, &rx_rule_fs);
> +		if (err < 0) {
> +			fprintf(stderr, "Cannot insert"
> +				" classification rule\n");
> +			return 1;
> +		}

Is this the right order to try them?  I'm not sure.

> +	} else if (rx_class_rule_del >= 0) {
> +		err = rxclass_rule_del(fd, ifr, rx_class_rule_del);
> +
> +		if (err < 0) {
> +			fprintf(stderr, "Cannot delete"
> +				" classification rule\n");
> +			return 1;
> +		}
>  	} else {
>  		exit_bad_args();
>  	}
[...]
> diff --git a/rxclass.c b/rxclass.c
> new file mode 100644
> index 0000000..5ad0639
> --- /dev/null
> +++ b/rxclass.c
> @@ -0,0 +1,1106 @@
> +/*
> + * Copyright (C) 2008 Sun Microsystems, Inc. All rights reserved.
> + */
> +#include <stdio.h>
> +#include <stdint.h>
> +#include <stddef.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <errno.h>
> +
> +#include <linux/sockios.h>
> +#include <arpa/inet.h>
> +#include "ethtool-util.h"
> +#include "ethtool-bitops.h"
> +
> +/*
> + * This is a rule manager implementation for ordering rx flow
> + * classification rules in a longest prefix first match order.
> + * The assumption is that this rule manager is the only one adding rules to
> + * the device's hardware classifier.
> + */
> +
> +struct rmgr_ctrl {
> +	/* slot contains a bitmap indicating which filters are valid */
> +	unsigned long		*slot;
> +	__u32			n_rules;
> +	__u32			size;
> +};
> +
> +static struct rmgr_ctrl rmgr;
> +static int rmgr_init_done = 0;
> +
> +static void invert_flow_mask(struct ethtool_rx_flow_spec *fsp)
> +{
> +	int i;

Should be size_t, since it's compared with a sizeof() expression.

> +	for (i = 0; i < sizeof(fsp->m_u); i++)
> +		fsp->m_u.hdata[i] ^= 0xFF;
> +}
> +
> +static void rmgr_print_nfc_spec_ext(struct ethtool_rx_flow_spec *fsp)
> +{
> +	u64 data, datam;
> +	__u16 etype, etypem, tci, tcim;
> +
> +	if (!(fsp->flow_type & FLOW_EXT))
> +		return;
> +
> +	etype = ntohs(fsp->h_ext.vlan_etype);
> +	etypem = ntohs(~fsp->m_ext.vlan_etype);
> +	tci = ntohs(fsp->h_ext.vlan_tci);
> +	tcim = ntohs(~fsp->m_ext.vlan_tci);
> +	data = (u64)ntohl(fsp->h_ext.data[0]) << 32;
> +	data = (u64)ntohl(fsp->h_ext.data[1]);
> +	datam = (u64)ntohl(~fsp->m_ext.data[0]) << 32;
> +	datam |= (u64)ntohl(~fsp->m_ext.data[1]);
> +
> +	fprintf(stdout,
> +		"\tVLAN EtherType: 0x%x mask: 0x%x\n"

Lower-case 'ethertype', please.

> +		"\tVLAN: 0x%x mask: 0x%x\n"
> +		"\tUser-defined: 0x%Lx mask: 0x%Lx\n",

'L' is not documented as an integer modifier for printf().  Use 'll'
instead.

> +		etype, etypem, tci, tcim, data, datam);
> +}
> +
> +static void rmgr_print_nfc_rule(struct ethtool_rx_flow_spec *fsp)
> +{
> +	unsigned char	*smac, *smacm, *dmac, *dmacm;
> +	__u32		sip, dip, sipm, dipm, flow_type;
> +	__u16		proto, protom;
> +
> +	if (fsp->location != RX_CLS_LOC_UNSPEC)
> +		fprintf(stdout,	"Filter: %d\n", fsp->location);
> +	else
> +		fprintf(stdout,	"Filter: Unspecified\n");
> +
> +	flow_type = fsp->flow_type & ~FLOW_EXT;
> +
> +	invert_flow_mask(fsp);
> +
> +	switch (flow_type) {
> +	case TCP_V4_FLOW:
> +	case UDP_V4_FLOW:
> +	case SCTP_V4_FLOW:
> +	case AH_V4_FLOW:
> +	case ESP_V4_FLOW:
> +	case IP_USER_FLOW:
> +		sip = ntohl(fsp->h_u.tcp_ip4_spec.ip4src);
> +		dip = ntohl(fsp->h_u.tcp_ip4_spec.ip4dst);
> +		sipm = ntohl(fsp->m_u.tcp_ip4_spec.ip4src);
> +		dipm = ntohl(fsp->m_u.tcp_ip4_spec.ip4dst);

I think this will work for AH_V4, ESP_V4 and IP_USER due to similar
structure layout, but it's kind of dodgy.  Please handle each structure
type separately.

[...]
> +static int rmgr_del(__u32 loc)
> +{
> +	/* verify rule exists before attempting to delete */
> +	int err = rmgr_find(loc);
> +	if (err)
> +		return err;
> +
> +	/* clear bit for the rule */
> +	clear_bit(loc, rmgr.slot);
> +
> +	return 0;
> +}
> +
> +static int rmgr_add(struct ethtool_rx_flow_spec *fsp)
> +{
> +	__u32 loc = fsp->location;
> +
> +	/* location provided, insert rule and update regions to match rule */
> +	if (loc != RX_CLS_LOC_UNSPEC)
> +		return rmgr_ins(loc);
> +
> +	/* start at the end of the list since it is lowest priority */
> +	loc = rmgr.size - 1;
> +
> +	/* only part of last word is set so fill in remaining bits and test */
> +	if (!~(rmgr.slot[loc / BITS_PER_LONG] |
> +	       (~1UL << (loc % BITS_PER_LONG))))
> +		loc -= 1 + (loc % BITS_PER_LONG);

I think this is meant to avoid the need to search for bits in a partial
word but it's a very non-obvious way to do that.

> +	/* find an open slot */
> +	while (loc != RX_CLS_LOC_UNSPEC && !~rmgr.slot[loc / BITS_PER_LONG])
> +		loc -= BITS_PER_LONG;

Just because RX_CLS_LOC_UNSPEC happens to be equal to -1 doesn't mean
it's a good idea to use the name here!

> +	/* find and use available location in slot */
> +	while (loc != RX_CLS_LOC_UNSPEC && test_bit(loc, rmgr.slot))
> +		loc--;
> +
> +       /* location found, insert rule */
> +       if (loc != RX_CLS_LOC_UNSPEC) {
> +               fsp->location = loc;
> +               return rmgr_ins(loc);
> +       }

I think it would be clearer to use a separate word index:

	__u32 loc;
	int i;

	/* location provided, insert rule and update regions to match rule */
	if (fs->location != RX_CLS_LOC_UNSPEC)
		return rmgr_ins(fs->location);

	/* start at the end of the list since it is lowest priority */
	for (i = rmgr.size / BITS_PER_LONG - 1; i >= 0; i--) {
		if (~rmgr.slot[i]) {
			loc = i * BITS_PER_LONG - 1;
			while (test_bit(loc, rmgr.slot))
				loc--;

			/* location found, insert rule */
			fsp->location = loc;
			return rmgr_ins(loc);
		}
	}

[...]
> +int rxclass_rule_getall(int fd, struct ifreq *ifr)
> +{
> +	struct ethtool_rxnfc nfccmd;
> +	int err, i, j;
> +
> +	/* init table of available rules */
> +	err = rmgr_init(fd, ifr);
> +	if (err < 0)
> +		return err;
> +
> +	fprintf(stdout, "Total %d rules\n\n", rmgr.n_rules);
> +
> +	/* fetch and display all available rules */
> +	for (i = 0; i < rmgr.size; i += BITS_PER_LONG) {
> +		if (!~rmgr.slot[i / BITS_PER_LONG])
> +			continue;

The '~' is wrong.  We want to skip words where all bits are clear, not
where all bits are set.

[...]
> +int rxclass_rule_get(int fd, struct ifreq *ifr, __u32 loc)
> +{
> +	struct ethtool_rxnfc nfccmd;
> +	int err;
> +
> +	/* init table of available rules */
> +	err = rmgr_init(fd, ifr);
> +	if (err < 0)
> +		return err;
> +
> +	/* verify rule exists before attempting to display */
> +	err = rmgr_find(loc);
> +	if (err < 0)
> +		return err;

Is this really necessary?  Shouldn't we let the driver check that for
us, and save the cost of initialising the rule manager?

[...]
> +int rxclass_rule_ins(int fd, struct ifreq *ifr,
> +		     struct ethtool_rx_flow_spec *fsp)
> +{
> +	struct ethtool_rxnfc nfccmd;
> +	int err;
> +
> +	/* init table of available rules */
> +	err = rmgr_init(fd, ifr);
> +	if (err < 0)
> +		return err;
> +
> +	/* verify rule location */
> +	err = rmgr_add(fsp);
> +	if (err < 0)
> +		return err;
> +
> +	/* notify netdev of new rule */
> +	nfccmd.cmd = ETHTOOL_SRXCLSRLINS;
> +	nfccmd.fs = *fsp;
> +	ifr->ifr_data = (caddr_t)&nfccmd;
> +	err = ioctl(fd, SIOCETHTOOL, ifr);
> +	if (err < 0) {
> +		perror("rmgr: Cannot insert RX class rule");
> +		return -1;
> +	}
> +	rmgr.n_rules++;

If we're about to destroy the rule manager immediately afterwards, why
bother maintaining n_rules?

> +	printf("Added rule with ID %d\n", fsp->location);
> +
> +	rmgr_cleanup();
> +
> +	return 0;
> +}
> +
> +int rxclass_rule_del(int fd, struct ifreq *ifr, __u32 loc)
> +{
> +	struct ethtool_rxnfc nfccmd;
> +	int err;
> +
> +	/* init table of available rules */
> +	err = rmgr_init(fd, ifr);
> +	if (err < 0)
> +		return err;
> +
> +	/* verify rule exists */
> +	err = rmgr_del(loc);
> +	if (err < 0)
> +		return err;

Again, I think that can be left to the driver.

[...]
> +typedef enum {
> +	OPT_NONE = 0,
> +	OPT_S32,
> +	OPT_U8,
> +	OPT_U16,
> +	OPT_U32,
> +	OPT_U64,
> +	OPT_BE16,
> +	OPT_BE32,
> +	OPT_BE64,
> +	OPT_IP4,
> +	OPT_MAC,
> +} rule_opt_type_t;
> +
> +#define NFC_FLAG_RING		0x001
> +#define NFC_FLAG_LOC		0x002
> +#define NFC_FLAG_SADDR		0x004
> +#define NFC_FLAG_DADDR		0x008
> +#define NFC_FLAG_SPORT		0x010
> +#define NFC_FLAG_DPORT		0x020
> +#define NFC_FLAG_SPI		0x030
> +#define NFC_FLAG_TOS		0x040
> +#define NFC_FLAG_PROTO		0x080
> +#define NTUPLE_FLAG_VLAN	0x100
> +#define NTUPLE_FLAG_UDEF	0x200
> +#define NTUPLE_FLAG_VETH	0x400
> +
> +struct rule_opts {
> +	const char	*name;
> +	rule_opt_type_t	type;
> +	u32		flag;
> +	int		offset;
> +	int		moffset;
> +};

I think that this ought to be merged with the argument parsing in
ethtool.c, but I won't insist that you do that immedaitely.  However:

[...]
> +static int rxclass_get_ipv4(char *str, __be32 *val)
> +{
> +	if (!strchr(str, '.')) {
> +		unsigned long long v;
> +		int err;
> +
> +		err = rxclass_get_ulong(str, &v, 32);
> +		if (err)
> +			return -1;
> +
> +		*val = htonl((u32)v);
> +
> +		return 0;
> +	}
> +
> +	if (!inet_pton(AF_INET, str, val))
> +		return -1;

There's no need to make a special case for integers.  inet_aton() should
handle all the historically supported IPv4 literal formats.

[...]
> +int rxclass_parse_ruleopts(char **argp, int argc,
> +			   struct ethtool_rx_flow_spec *fsp)
> +{
> +	const struct rule_opts *options;
> +	unsigned char *p = (unsigned char *)fsp;
> +	int i = 0, n_opts, err;
> +	u32 flags = 0;
> +	int flow_type;
> +
> +	if (*argp == NULL || **argp == '\0' || argc < 1)
> +		goto syntax_err;

argc < 1 is sufficient.

> +	if (!strcmp(argp[0], "tcp4"))
> +		flow_type = TCP_V4_FLOW;
> +	else if (!strcmp(argp[0], "udp4"))
> +		flow_type = UDP_V4_FLOW;
> +	else if (!strcmp(argp[0], "sctp4"))
> +		flow_type = SCTP_V4_FLOW;
> +	else if (!strcmp(argp[0], "ah4"))
> +		flow_type = AH_V4_FLOW;
> +	else if (!strcmp(argp[0], "esp4"))
> +		flow_type = ESP_V4_FLOW;
> +	else if (!strcmp(argp[0], "ip4"))
> +		flow_type = IP_USER_FLOW;
> +	else if (!strcmp(argp[0], "ether"))
> +		flow_type = ETHER_FLOW;
> +	else
> +		goto syntax_err;
> +
> +	switch (flow_type) {
> +	case TCP_V4_FLOW:
> +	case UDP_V4_FLOW:
> +	case SCTP_V4_FLOW:
> +		options = rule_nfc_tcp_ip4;
> +		n_opts = ARRAY_SIZE(rule_nfc_tcp_ip4);
> +		break;
> +	case AH_V4_FLOW:
> +	case ESP_V4_FLOW:
> +		options = rule_nfc_esp_ip4;
> +		n_opts = ARRAY_SIZE(rule_nfc_esp_ip4);
> +		break;
> +	case IP_USER_FLOW:
> +		options = rule_nfc_usr_ip4;
> +		n_opts = ARRAY_SIZE(rule_nfc_usr_ip4);
> +		break;
> +	case ETHER_FLOW:
> +		options = rule_nfc_ether;
> +		n_opts = ARRAY_SIZE(rule_nfc_ether);
> +		break;
> +	default:
> +		fprintf(stdout, "Add rule, invalid rule type[%s]\n", argp[0]);

stderr, not stdout

[...]
> +		if (idx == n_opts) {
> +			fprintf(stdout, "Add rule, unreconized option[%s]\n",

Typo: 'unreconized' should be 'unrecognized'.

> +				argp[i]);
> +			return -1;
> +		}
> +	}
> +
> +	if (flags & (NTUPLE_FLAG_VLAN | NTUPLE_FLAG_UDEF | NTUPLE_FLAG_VETH))
> +		fsp->flow_type |= FLOW_EXT;
> +
> +	return 0;
> +
> +syntax_err:
> +	fprintf(stdout, "Add rule, invalid syntax\n");

stderr

> +	return -1;
> +}
> 

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* RE: [RFC PATCH] netlink: Increase netlink dump skb message size
From: Rose, Gregory V @ 2011-04-27 18:08 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Steve Hodgson, David Miller, netdev@vger.kernel.org,
	bhutchings@solarflare.com
In-Reply-To: <1303927540.3166.129.camel@edumazet-laptop>

> -----Original Message-----
> From: Eric Dumazet [mailto:eric.dumazet@gmail.com]
> Sent: Wednesday, April 27, 2011 11:06 AM
> To: Rose, Gregory V
> Cc: Steve Hodgson; David Miller; netdev@vger.kernel.org;
> bhutchings@solarflare.com
> Subject: RE: [RFC PATCH] netlink: Increase netlink dump skb message size
> 
> Le mercredi 27 avril 2011 à 10:39 -0700, Rose, Gregory V a écrit :
> 
> > Right, but when I look in rtnetlink I see the routine to calculate the
> > amount of buffer needed for VF info dump is the number of device
> > parent (PF) VFs * the sizeof various IFLA_VF_INFO items.  The more the
> > VFs the bigger this gets, especially if you want to add more stuff to
> > IFLA_VF_INFO.  So when the kernel dumps this all out it can get bigger
> > than the NLMSG_GOODSIZE (or DUMPSIZE) pretty quickly.
> >
> > >
> > > BTW "ip" uses a 16384 bytes buffer, not a 8192 bytes one.
> >
> > I know, that's why I suffered some confusion about which size to use.
> > The ip command uses 16K but the NLMSG_GOODSIZE can be as small as 3712
> > bytes (depending on page size).  Despite the user buffer being 16k if
> > the size calculated by if_nlmsg_size() in rtnetlink.c is bigger than
> > NLMSG_GOODSIZE then you don't see the info for more than 40 or so VFs.
> > More VFs than that and nothing gets displayed.
> >
> 
> One solution is to change rtnl_dump_ifinfo() to call rtnl_fill_ifinfo()
> once time per device (RTM_NEWLINK like now but no more VFINFO inside),
> then call another function to provide vf/vlan informations (RTM_NEWVF),
> using cb->args[2] as an index into VF space, so that we can stop if
> current skb is filled, and next recvmsg() starts at previous saved
> index.
> 
> Or implement a new "ip vf ..." dumper and only dumps RTM_NEWVF messages.

I'm a netlink newbie so maybe that is a more obvious solution that I missed.  Let me have a look at the code and see.

Thanks,

- Greg


^ permalink raw reply

* RE: [RFC PATCH] netlink: Increase netlink dump skb message size
From: Eric Dumazet @ 2011-04-27 18:05 UTC (permalink / raw)
  To: Rose, Gregory V
  Cc: Steve Hodgson, David Miller, netdev@vger.kernel.org,
	bhutchings@solarflare.com
In-Reply-To: <43F901BD926A4E43B106BF17856F0755018DF59D81@orsmsx508.amr.corp.intel.com>

Le mercredi 27 avril 2011 à 10:39 -0700, Rose, Gregory V a écrit :

> Right, but when I look in rtnetlink I see the routine to calculate the
> amount of buffer needed for VF info dump is the number of device
> parent (PF) VFs * the sizeof various IFLA_VF_INFO items.  The more the
> VFs the bigger this gets, especially if you want to add more stuff to
> IFLA_VF_INFO.  So when the kernel dumps this all out it can get bigger
> than the NLMSG_GOODSIZE (or DUMPSIZE) pretty quickly.
> 
> > 
> > BTW "ip" uses a 16384 bytes buffer, not a 8192 bytes one.
> 
> I know, that's why I suffered some confusion about which size to use.
> The ip command uses 16K but the NLMSG_GOODSIZE can be as small as 3712
> bytes (depending on page size).  Despite the user buffer being 16k if
> the size calculated by if_nlmsg_size() in rtnetlink.c is bigger than
> NLMSG_GOODSIZE then you don't see the info for more than 40 or so VFs.
> More VFs than that and nothing gets displayed.
> 

One solution is to change rtnl_dump_ifinfo() to call rtnl_fill_ifinfo()
once time per device (RTM_NEWLINK like now but no more VFINFO inside),
then call another function to provide vf/vlan informations (RTM_NEWVF),
using cb->args[2] as an index into VF space, so that we can stop if
current skb is filled, and next recvmsg() starts at previous saved
index.

Or implement a new "ip vf ..." dumper and only dumps RTM_NEWVF messages.




^ permalink raw reply

* Re: [PATCH] tcp: disallow bind() to reuse addr/port
From: Eric Dumazet @ 2011-04-27 18:02 UTC (permalink / raw)
  To: George B.; +Cc: David Miller, daniel.baluta, gasparch, netdev
In-Reply-To: <BANLkTin-KhU0KqSuUpbCi2STnnxszO5x=Q@mail.gmail.com>

Le mercredi 27 avril 2011 à 10:54 -0700, George B. a écrit :
> >>
> >> OK, just saw this, so please disregard my earlier.
> >
> > Hmm... you'll discover this patch was reverted, because it broke some
> > applications.
> >
> > So your problem remains.
> 
> Just to clarify, both the previous patch from last year *AND* this
> patch in January were reverted?  I can't seem to find anything showing
> the new one being reverted and am now confused.

Yes, all patches were reverted.

Last revert was very recent : 3e8c806a08c7beecd972e7ce15c

http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=3e8c806a08c7beecd972e7ce15c570b9aba64baa

Revert "tcp: disallow bind() to reuse addr/port"

This reverts commit c191a836a908d1dd6b40c503741f91b914de3348.

It causes known regressions for programs that expect to be able to use
SO_REUSEADDR to shutdown a socket, then successfully rebind another
socket to the same ID.

Programs such as haproxy and amavisd expect this to work.

This should fix kernel bugzilla 32832.




^ permalink raw reply

* Re: [PATCH] tcp: disallow bind() to reuse addr/port
From: George B. @ 2011-04-27 17:54 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, daniel.baluta, gasparch, netdev
In-Reply-To: <1303926014.3166.101.camel@edumazet-laptop>

>>
>> OK, just saw this, so please disregard my earlier.
>
> Hmm... you'll discover this patch was reverted, because it broke some
> applications.
>
> So your problem remains.

Just to clarify, both the previous patch from last year *AND* this
patch in January were reverted?  I can't seem to find anything showing
the new one being reverted and am now confused.

^ permalink raw reply

* Re: Linux TCP's Robustness to Multipath Packet Reordering
From: Yuchung Cheng @ 2011-04-27 17:39 UTC (permalink / raw)
  To: Dominik Kaspar
  Cc: Carsten Wolff, John Heffner, Eric Dumazet, netdev,
	Zimmermann Alexander, Lennart Schulte, Arnd Hannemann
In-Reply-To: <BANLkTimNC9DgWtkTTtjC_v0FsxzvBMsQGw@mail.gmail.com>

Hi Dominik,

On Wed, Apr 27, 2011 at 9:22 AM, Dominik Kaspar <dokaspar.ietf@gmail.com> wrote:
>
> Hi Carsten,
>
> Thanks for your feedback. I made some new tests with the same setup of
> packet-based forwarding over two emulated paths (600 KB/s, 10 ms) +
> (400 KB/s, 100 ms). In the first experiments, which showed a step-wise
> adaptation to reordering, SACK, DSACK, and Timestamps were all
> enabled. In the experiments, I individually disabled these three
> mechanisms and saw the following:
>
> - Disabling timestamps causes TCP to never adjust to reordering at all.
> - Disabling SACK allows TCP to adapt very rapidly ("perfect" aggregation!).

Did you enable tcp_fack when sack is enabled? this may make a (big)
difference. FACK assumes little network reordering and mark packet
losses more aggressively.

> - Disabling DSACK has no obvious impact (still a step-wise throughput).
>
> Is there an explanation for why turning off SACK can be beneficial in
> the presence of packet reordering? That sounds pretty
> counter-intuitive to me... I thought SACK=1 always performs better
> than SACK=0. The results are also illustrated in the following plot.
> For each setting, there are three runs, which all exhibit a similar
> behavior:
>
> http://home.simula.no/~kaspar/static/mptcp-emu-wlan-hspa-02-sack.png
>
> Greetings,
> Dominik
>
> On Wed, Apr 27, 2011 at 11:57 AM, Carsten Wolff <carsten@wolffcarsten.de> wrote:
> > Hi all,
> >
> > On Tuesday 26 April 2011, John Heffner wrote:
> >> First, TCP is definitely not designed to work under such conditions.
> >> For example, assumptions behind RTO calculation and fast retransmit
> >> heuristics are violated.  However, in this particular case my first
> >> guess is that you are being limited by "cwnd moderation," which was
> >> the topic of recent discussion here.  Under persistent reordering,
> >> cwnd moderation can inhibit the ability of cwnd to grow.
> >
> > it's not just cwnd moderation (of which I'm still in favor, even though I lost
> > the argument by inactivity ;-)).
> >
> > Anyway, there are a lot of things in reordering handling that can be improved.
> > Our group (Alexander, Lennart, Arnd, myself and others) has worked on the
> > problem for a long time now. This work resulted in an algorithm that is in
> > large parts TCP-NCR (RFC4653), but also utilizes information gathered by
> > reordering detection for determination of a good DupThresh, fixes a few
> > problems in RFC4653 and improves on the reordering detection in Linux when the
> > connection has no timestamps option. We implemented "pure" TCP-NCR and our own
> > variant in Linux using a modular framework similar to the congestion control
> > modules. A lot of measurements and evaluation have gone into the comparison of
> > the three algorithms. We are now very close(TM) to a final patch, that is more
> > suited for publication on this list and integrates our algorithm into tcp*.
> > [hc] without introducing the overhead of that modular framework.
> >
> > Greetings,
> > Carsten
> >
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] tcp: disallow bind() to reuse addr/port
From: Eric Dumazet @ 2011-04-27 17:40 UTC (permalink / raw)
  To: George B.; +Cc: David Miller, daniel.baluta, gasparch, netdev
In-Reply-To: <BANLkTi=176iFKUHn351OWOZUSaUBkgKf2Q@mail.gmail.com>

Le mercredi 27 avril 2011 à 10:37 -0700, George B. a écrit :
> On Tue, Jan 11, 2011 at 2:03 PM, David Miller <davem@davemloft.net> wrote:
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Tue, 11 Jan 2011 12:14:22 +0100
> >
> >> [PATCH] tcp: disallow bind() to reuse addr/port
> >>
> >> inet_csk_bind_conflict() logic currently disallows a bind() if
> >> it finds a friend socket (a socket bound on same address/port)
> >> satisfying a set of conditions :
> >>
> >> 1) Current (to be bound) socket doesnt have sk_reuse set
> >> OR
> >> 2) other socket doesnt have sk_reuse set
> >> OR
> >> 3) other socket is in LISTEN state
> >>
> >> We should add the CLOSE state in the 3) condition, in order to avoid two
> >> REUSEADDR sockets in CLOSE state with same local address/port, since
> >> this can deny further operations.
> >>
> >> Note : a prior patch tried to address the problem in a different (and
> >> buggy) way. (commit fda48a0d7a8412ced tcp: bind() fix when many ports
> >> are bound).
> >>
> >> Reported-by: Gaspar Chilingarov <gasparch@gmail.com>
> >> Reported-by: Daniel Baluta <daniel.baluta@gmail.com>
> >> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> >
> > Applied, thanks.
> > --
> > To unsubscribe from this list: send the line "unsubscribe netdev" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >
> 
> OK, just saw this, so please disregard my earlier.

Hmm... you'll discover this patch was reverted, because it broke some
applications.

So your problem remains.




^ permalink raw reply

* RE: [RFC PATCH] netlink: Increase netlink dump skb message size
From: Rose, Gregory V @ 2011-04-27 17:39 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Steve Hodgson, David Miller, netdev@vger.kernel.org,
	bhutchings@solarflare.com
In-Reply-To: <1303925378.3166.99.camel@edumazet-laptop>

> -----Original Message-----
> From: Eric Dumazet [mailto:eric.dumazet@gmail.com]
> Sent: Wednesday, April 27, 2011 10:30 AM
> To: Rose, Gregory V
> Cc: Steve Hodgson; David Miller; netdev@vger.kernel.org;
> bhutchings@solarflare.com
> Subject: RE: [RFC PATCH] netlink: Increase netlink dump skb message size
> 
> Le mercredi 27 avril 2011 à 10:15 -0700, Rose, Gregory V a écrit :
> > > -----Original Message-----
> > > From: netdev-owner@vger.kernel.org [mailto:netdev-
> owner@vger.kernel.org]
> > > On Behalf Of Eric Dumazet
> > > Sent: Wednesday, April 27, 2011 9:30 AM
> > > To: Steve Hodgson
> > > Cc: Rose, Gregory V; David Miller; netdev@vger.kernel.org;
> > > bhutchings@solarflare.com
> > > Subject: Re: [RFC PATCH] netlink: Increase netlink dump skb message
> size
> > >
> > > Le mercredi 27 avril 2011 à 16:46 +0100, Steve Hodgson a écrit :
> > > > On 04/27/2011 04:24 PM, Eric Dumazet wrote:
> > > > > Le mardi 26 avril 2011 à 09:12 -0700, Rose, Gregory V a écrit :
> > > > >
> > > > >> I'm fine with however you folks want to approach this, just give
> me
> > > some direction.
> > > > >
> > > > > I would just try following patch :
> > > > >
> > > >
> > > > This allows the sfc driver to use 102 VFs, up from the current limit
> of
> > > > 45 VFs.
> > > >
> > > > It's unfortunate that this patch isn't sufficient to allow all 127
> VFs
> > > > to be used, but whilst we wait for a new netlink api this is an
> > > > improvement worth having.
> > > >
> > >
> > > netlink recvmsg() supports MSG_PEEK so user would get the needed size
> of
> > > its buffer before calling the real recvmsg()
> > >
> > > big blobs could be attached as skb fragments (up to 64Kbytes), but do
> we
> > > really want this...
> > [Greg Rose]
> >
> > I'm looking into an approach in which we make the get info dump for VFs
> orthogonal to the set VF info, i.e. like this:
> >
> > To set VF info we would follow the current convention:
> >
> > # ip link set eth(x) vf (n) mac xx:xx:xx:xx:xx:xx
> > # ip link set eth(x) vf (n) vlan (nnnn)
> >
> > To see VF info:
> >
> > # ip link show eth(x) vf (n)
> >
> > would show that VF's mac and vlan and could then be expanded in the
> future to display more information required for additional features that
> users are asking for.
> >
> > The IFLA_VF_INFO dump would be moved out of the info dump for the
> physical function interface and would no longer be nested which would get
> rid of the need for huge amounts of buffer for info dumps on VFs.  The ip
> link show command for the PF would need to report the number of VFs
> currently allocated to the PF so that could fed into a script that loops
> to show each VFs info.
> >
> > I think this approach would fix the problems we're looking at right now.
> >
> 
> Hmm, if you look at "ip link ..." you'll see it dumps everything from
> kernel and does the filter inside user command.

Right, but when I look in rtnetlink I see the routine to calculate the amount of buffer needed for VF info dump is the number of device parent (PF) VFs * the sizeof various IFLA_VF_INFO items.  The more the VFs the bigger this gets, especially if you want to add more stuff to IFLA_VF_INFO.  So when the kernel dumps this all out it can get bigger than the NLMSG_GOODSIZE (or DUMPSIZE) pretty quickly.

> 
> BTW "ip" uses a 16384 bytes buffer, not a 8192 bytes one.

I know, that's why I suffered some confusion about which size to use.  The ip command uses 16K but the NLMSG_GOODSIZE can be as small as 3712 bytes (depending on page size).  Despite the user buffer being 16k if the size calculated by if_nlmsg_size() in rtnetlink.c is bigger than NLMSG_GOODSIZE then you don't see the info for more than 40 or so VFs.  More VFs than that and nothing gets displayed.

- Greg


^ permalink raw reply

* Re: [PATCH] tcp: disallow bind() to reuse addr/port
From: George B. @ 2011-04-27 17:37 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, daniel.baluta, gasparch, netdev
In-Reply-To: <20110111.140336.128591591.davem@davemloft.net>

On Tue, Jan 11, 2011 at 2:03 PM, David Miller <davem@davemloft.net> wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Tue, 11 Jan 2011 12:14:22 +0100
>
>> [PATCH] tcp: disallow bind() to reuse addr/port
>>
>> inet_csk_bind_conflict() logic currently disallows a bind() if
>> it finds a friend socket (a socket bound on same address/port)
>> satisfying a set of conditions :
>>
>> 1) Current (to be bound) socket doesnt have sk_reuse set
>> OR
>> 2) other socket doesnt have sk_reuse set
>> OR
>> 3) other socket is in LISTEN state
>>
>> We should add the CLOSE state in the 3) condition, in order to avoid two
>> REUSEADDR sockets in CLOSE state with same local address/port, since
>> this can deny further operations.
>>
>> Note : a prior patch tried to address the problem in a different (and
>> buggy) way. (commit fda48a0d7a8412ced tcp: bind() fix when many ports
>> are bound).
>>
>> Reported-by: Gaspar Chilingarov <gasparch@gmail.com>
>> Reported-by: Daniel Baluta <daniel.baluta@gmail.com>
>> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
>
> Applied, thanks.
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

OK, just saw this, so please disregard my earlier.

George

^ permalink raw reply

* Re: 'tcp: bind() fix when many ports are bound' problem
From: George B. @ 2011-04-27 17:36 UTC (permalink / raw)
  To: Daniel Baluta; +Cc: Eric Dumazet, Gaspar Chilingarov, netdev
In-Reply-To: <AANLkTimuNM7vmeR89WKXtrUbJO1Wt1ivp9y=bQvtWg0j@mail.gmail.com>

On Wed, Jan 5, 2011 at 1:00 AM, Daniel Baluta <daniel.baluta@gmail.com> wrote:
>
> On Tue, Jan 4, 2011 at 1:22 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > Le mardi 04 janvier 2011 à 13:12 +0400, Gaspar Chilingarov a écrit :
> >> Hi there!
> >>
> >> Well, that looks strange.
> >>
> >> On my own side I've just put workaround (manually binding to all ports
> >> in sequence :)
> >> and moved production code to FreeBSD as it has better scalable network stack.
> >>
> >> I can see the potential problem with that bind() problem on highly
> >> loaded DNS servers/resolvers which establish tons of outgoing UDP
> >> connections.
> >>
> >> In some cases that connections could fail and as not receiving the
> >> answer it is normal condition for DNS this will go totally unnoticed.
> >>
> >> I don't think anyone will hit this bug in production environment
> >> except the very high load applications.
> >
> > Dont mix TCP and UDP, they are not the same.
> >
> > Problem with TCP is you can have TIME_WAIT sockets, disallowing a port
> > to be reused. Not with UDP.
>
> Isn't SO_REUSEADDR supposed to fix this problem?
>
> Anyhow, inet_csk_get_port the function used by bind(0) performs bad.
>
> Short reminder:
> 100 IP addr  - 70K sockets created => 700 sockets per IP.
> bind(0) for all sockets will results in a large number of
> (port, addr) duplicates although there are more than 30000 ports available.
>
> This wouldn't be so bad if you won't try to connect duplicates to the same
> remote (addr, port) which will result in a connect failure.
>
> Eric's patch introduced the restriction 'forbid two reuse enabled sockets
> to bind on same (addr, port) tuple with a (non ANY addr)'.
>
> Well, I think this will break rule #2 from inet_hastable.h:
> "
> If all sockets have sk->sk_reuse set, and none of them are in
> TCP_LISTEN state, the port may be shared.
> "
> and also caused problem ([1]), don't really know if they are the same.
>
> An attempt, to fix this was to "always allow a reuse listen if
> no other listen is already active on the same IP".
>
> The problem with this fix, is that at the moment of bind() we
> don't know what will be the usage of this socket. It can be,
> bind -> connect or bind -> listen.
> >
> > The connect() [without a previous bind()], or a sendto() [without a
> > previous bind()] problem is more an API problem.
>
> Can you share your thoughts on this?
>
> Going back to my first email, are there any follow ups on your
> "tcp: bind() fix when many ports are bound" patch. I've searched
> netdev archives but no luck. I might have missed something.
>
> I really appreciate your help.
>
> thanks,
> Daniel.
> --

This is also causing a problem for me in a very high load application
where more than 64K sockets are being sourced from multiple IP
addresses.

I, too, would like to know if this has been followed up on.  The old
patch that was reverted was actually working well for us, we never
actually hit the TIME_WAIT problem but we are hitting the problem of
not being able to source a connection from an IP when the global
number of connections is >64K or so.

^ permalink raw reply

* RE: [RFC PATCH] netlink: Increase netlink dump skb message size
From: Eric Dumazet @ 2011-04-27 17:29 UTC (permalink / raw)
  To: Rose, Gregory V
  Cc: Steve Hodgson, David Miller, netdev@vger.kernel.org,
	bhutchings@solarflare.com
In-Reply-To: <43F901BD926A4E43B106BF17856F0755018DF59D32@orsmsx508.amr.corp.intel.com>

Le mercredi 27 avril 2011 à 10:15 -0700, Rose, Gregory V a écrit :
> > -----Original Message-----
> > From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org]
> > On Behalf Of Eric Dumazet
> > Sent: Wednesday, April 27, 2011 9:30 AM
> > To: Steve Hodgson
> > Cc: Rose, Gregory V; David Miller; netdev@vger.kernel.org;
> > bhutchings@solarflare.com
> > Subject: Re: [RFC PATCH] netlink: Increase netlink dump skb message size
> > 
> > Le mercredi 27 avril 2011 à 16:46 +0100, Steve Hodgson a écrit :
> > > On 04/27/2011 04:24 PM, Eric Dumazet wrote:
> > > > Le mardi 26 avril 2011 à 09:12 -0700, Rose, Gregory V a écrit :
> > > >
> > > >> I'm fine with however you folks want to approach this, just give me
> > some direction.
> > > >
> > > > I would just try following patch :
> > > >
> > >
> > > This allows the sfc driver to use 102 VFs, up from the current limit of
> > > 45 VFs.
> > >
> > > It's unfortunate that this patch isn't sufficient to allow all 127 VFs
> > > to be used, but whilst we wait for a new netlink api this is an
> > > improvement worth having.
> > >
> > 
> > netlink recvmsg() supports MSG_PEEK so user would get the needed size of
> > its buffer before calling the real recvmsg()
> > 
> > big blobs could be attached as skb fragments (up to 64Kbytes), but do we
> > really want this...
> [Greg Rose] 
> 
> I'm looking into an approach in which we make the get info dump for VFs orthogonal to the set VF info, i.e. like this:
> 
> To set VF info we would follow the current convention:
> 
> # ip link set eth(x) vf (n) mac xx:xx:xx:xx:xx:xx
> # ip link set eth(x) vf (n) vlan (nnnn)
> 
> To see VF info:
> 
> # ip link show eth(x) vf (n)
> 
> would show that VF's mac and vlan and could then be expanded in the future to display more information required for additional features that users are asking for.
> 
> The IFLA_VF_INFO dump would be moved out of the info dump for the physical function interface and would no longer be nested which would get rid of the need for huge amounts of buffer for info dumps on VFs.  The ip link show command for the PF would need to report the number of VFs currently allocated to the PF so that could fed into a script that loops to show each VFs info.
> 
> I think this approach would fix the problems we're looking at right now.
> 

Hmm, if you look at "ip link ..." you'll see it dumps everything from
kernel and does the filter inside user command.

BTW "ip" uses a 16384 bytes buffer, not a 8192 bytes one.


$ strace -e trace=recvmsg ip link
recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000},
msg_iov(1)=[{"\340\3\0\0\20\0\2\0dR\270M\224s\0\0\0\0\4\3\1\0\0\0I\0\1\0
\0\0\0\0"..., 16384}], msg_controllen=0, msg_flags=0}, 0) = 3000
recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000},
msg_iov(1)=[{"\364\3\0\0\20\0\2\0dR\270M\224s\0\0\0\0\1\0\4\0\0\0C\24\1
\0\0\0\0\0"..., 16384}], msg_controllen=0, msg_flags=0}, 0) = 3024
recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000},
msg_iov(1)=[{"\34\4\0\0\20\0\2\0dR\270M\224s\0\0\0\0\1\0\7\0\0\0C\20\1\0
\0\0\0\0"..., 16384}], msg_controllen=0, msg_flags=0}, 0) = 2104
recvmsg(3, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000},
msg_iov(1)=[{"\24\0\0\0\3\0\2\0dR\270M\224s\0\0\0\0\0\0\7\0\0\0C\20\1\0
\0\0\0\0"..., 16384}], msg_controllen=0, msg_flags=0}, 0) = 20

...



^ permalink raw reply

* RE: [RFC PATCH] netlink: Increase netlink dump skb message size
From: Rose, Gregory V @ 2011-04-27 17:15 UTC (permalink / raw)
  To: Eric Dumazet, Steve Hodgson
  Cc: David Miller, netdev@vger.kernel.org, bhutchings@solarflare.com
In-Reply-To: <1303921809.3166.90.camel@edumazet-laptop>

> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org]
> On Behalf Of Eric Dumazet
> Sent: Wednesday, April 27, 2011 9:30 AM
> To: Steve Hodgson
> Cc: Rose, Gregory V; David Miller; netdev@vger.kernel.org;
> bhutchings@solarflare.com
> Subject: Re: [RFC PATCH] netlink: Increase netlink dump skb message size
> 
> Le mercredi 27 avril 2011 à 16:46 +0100, Steve Hodgson a écrit :
> > On 04/27/2011 04:24 PM, Eric Dumazet wrote:
> > > Le mardi 26 avril 2011 à 09:12 -0700, Rose, Gregory V a écrit :
> > >
> > >> I'm fine with however you folks want to approach this, just give me
> some direction.
> > >
> > > I would just try following patch :
> > >
> >
> > This allows the sfc driver to use 102 VFs, up from the current limit of
> > 45 VFs.
> >
> > It's unfortunate that this patch isn't sufficient to allow all 127 VFs
> > to be used, but whilst we wait for a new netlink api this is an
> > improvement worth having.
> >
> 
> netlink recvmsg() supports MSG_PEEK so user would get the needed size of
> its buffer before calling the real recvmsg()
> 
> big blobs could be attached as skb fragments (up to 64Kbytes), but do we
> really want this...
[Greg Rose] 

I'm looking into an approach in which we make the get info dump for VFs orthogonal to the set VF info, i.e. like this:

To set VF info we would follow the current convention:

# ip link set eth(x) vf (n) mac xx:xx:xx:xx:xx:xx
# ip link set eth(x) vf (n) vlan (nnnn)

To see VF info:

# ip link show eth(x) vf (n)

would show that VF's mac and vlan and could then be expanded in the future to display more information required for additional features that users are asking for.

The IFLA_VF_INFO dump would be moved out of the info dump for the physical function interface and would no longer be nested which would get rid of the need for huge amounts of buffer for info dumps on VFs.  The ip link show command for the PF would need to report the number of VFs currently allocated to the PF so that could fed into a script that loops to show each VFs info.

I think this approach would fix the problems we're looking at right now.

- Greg


^ permalink raw reply

* Re: [ethtool PATCH 4/6] Add support for __be64 and bitops to ethtool
From: Ben Hutchings @ 2011-04-27 17:09 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: davem@davemloft.net, Kirsher, Jeffrey T, netdev@vger.kernel.org
In-Reply-To: <4DB8484F.8030001@intel.com>

On Wed, 2011-04-27 at 09:46 -0700, Alexander Duyck wrote:
> On 4/27/2011 8:54 AM, Ben Hutchings wrote:
> > On Thu, 2011-04-21 at 13:40 -0700, Alexander Duyck wrote:
> >> This change is meant to add support for __be64 values and bitops to
> >> ethtool.  These changes will be needed in order to support network flow
> >> classifier rule configuration.
[...]
> > Where is __always_inline supposed to be defined?
> 
> Sorry that should have just been inline.  I forgot we have to take tools 
> other than gcc into account.

Oh, it's a gcc extension?  I read the code before trying to compile it.
I've never tested with anything other than gcc but I think it's worth
making a small effort to avoid gcc extensions.

[...]
> On a side note, is there a git tree somewhere I can re-base off of?  At 
> this point I know you have pulled in a number of patches and I figure it 
> would be helpful for me to clean up my tree so I am not guessing what is 
> there and what isn't.

git://git.kernel.org/pub/scm/network/ethtool/ethtool.git

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: Linux TCP's Robustness to Multipath Packet Reordering
From: Alexander Zimmermann @ 2011-04-27 16:36 UTC (permalink / raw)
  To: Dominik Kaspar
  Cc: Carsten Wolff, John Heffner, Eric Dumazet, netdev,
	Lennart Schulte, Arnd Hannemann
In-Reply-To: <BANLkTimNC9DgWtkTTtjC_v0FsxzvBMsQGw@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 1835 bytes --]

Hi,

Am 27.04.2011 um 18:22 schrieb Dominik Kaspar:

> Hi Carsten,
> 
> Thanks for your feedback. I made some new tests with the same setup of
> packet-based forwarding over two emulated paths (600 KB/s, 10 ms) +
> (400 KB/s, 100 ms). In the first experiments, which showed a step-wise
> adaptation to reordering, SACK, DSACK, and Timestamps were all
> enabled. In the experiments, I individually disabled these three
> mechanisms and saw the following:
> 
> - Disabling timestamps causes TCP to never adjust to reordering at all.

Reordering detection with DSACK is broken in Linux. We will fix that in
a couple of weeks...

> - Disabling SACK allows TCP to adapt very rapidly ("perfect" aggregation!).

If you disable SACK, you will use the NewReno detection

> - Disabling DSACK has no obvious impact (still a step-wise throughput).

If Timestamps are enabled, Linux use Timestamps for detection. Regardless
of DSACK. Timestamp detection is quicker. See RFC3522. (However, in case
of an spurious FRet it's not so dramatical. In case of an Spurious RTO,
you can avoid the go-back-n behavior)


> 
> Is there an explanation for why turning off SACK can be beneficial in
> the presence of packet reordering? That sounds pretty
> counter-intuitive to me... I thought SACK=1 always performs better
> than SACK=0. The results are also illustrated in the following plot.
> For each setting, there are three runs, which all exhibit a similar
> behavior:
> 
> http://home.simula.no/~kaspar/static/mptcp-emu-wlan-hspa-02-sack.png
> 
> Greetings,
> Dominik
> 

//
// Dipl.-Inform. Alexander Zimmermann
// Department of Computer Science, Informatik 4
// RWTH Aachen University
// Ahornstr. 55, 52056 Aachen, Germany
// phone: (49-241) 80-21422, fax: (49-241) 80-22222
// email: zimmermann@cs.rwth-aachen.de
// web: http://www.umic-mesh.net
//


[-- Attachment #2: Signierter Teil der Nachricht --]
[-- Type: application/pgp-signature, Size: 243 bytes --]

^ permalink raw reply

* Re: Linux TCP's Robustness to Multipath Packet Reordering
From: Eric Dumazet @ 2011-04-27 16:48 UTC (permalink / raw)
  To: Dominik Kaspar
  Cc: Carsten Wolff, John Heffner, netdev, Zimmermann Alexander,
	Lennart Schulte, Arnd Hannemann
In-Reply-To: <BANLkTimNC9DgWtkTTtjC_v0FsxzvBMsQGw@mail.gmail.com>

Le mercredi 27 avril 2011 à 18:22 +0200, Dominik Kaspar a écrit :
> Hi Carsten,
> 
> Thanks for your feedback. I made some new tests with the same setup of
> packet-based forwarding over two emulated paths (600 KB/s, 10 ms) +
> (400 KB/s, 100 ms). In the first experiments, which showed a step-wise
> adaptation to reordering, SACK, DSACK, and Timestamps were all
> enabled. In the experiments, I individually disabled these three
> mechanisms and saw the following:
> 
> - Disabling timestamps causes TCP to never adjust to reordering at all.
> - Disabling SACK allows TCP to adapt very rapidly ("perfect" aggregation!).
> - Disabling DSACK has no obvious impact (still a step-wise throughput).
> 
> Is there an explanation for why turning off SACK can be beneficial in
> the presence of packet reordering? That sounds pretty
> counter-intuitive to me... I thought SACK=1 always performs better
> than SACK=0. The results are also illustrated in the following plot.
> For each setting, there are three runs, which all exhibit a similar
> behavior:
> 
> http://home.simula.no/~kaspar/static/mptcp-emu-wlan-hspa-02-sack.png
> 

SACK is a win in a normal environnement, with few reorders, but some
percents of losses ;)

Given the limit of 3 blocks in SACK option, and your pretty asymetric
paths (10ms and 100ms), SACK is useless and consume 12 bytes per
frame...

You really should add traces to every tp->reordering changes done in our
TCP stack, its a 20 minutes patch, and would help you to understand
where/when its increased/decreased.





^ permalink raw reply

* Re: [ethtool PATCH 4/6] Add support for __be64 and bitops to ethtool
From: Alexander Duyck @ 2011-04-27 16:46 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: davem@davemloft.net, Kirsher, Jeffrey T, netdev@vger.kernel.org
In-Reply-To: <1303919692.2875.11.camel@bwh-desktop>

On 4/27/2011 8:54 AM, Ben Hutchings wrote:
> On Thu, 2011-04-21 at 13:40 -0700, Alexander Duyck wrote:
>> This change is meant to add support for __be64 values and bitops to
>> ethtool.  These changes will be needed in order to support network flow
>> classifier rule configuration.
>>
>> Signed-off-by: Alexander Duyck<alexander.h.duyck@intel.com>
>> ---
>>
>>   ethtool-bitops.h |   25 +++++++++++++++++++++++++
>>   ethtool-util.h   |   30 ++++++++++++++++++++++++++----
>>   ethtool.c        |    7 -------
>>   3 files changed, 51 insertions(+), 11 deletions(-)
>>   create mode 100644 ethtool-bitops.h
>>
>> diff --git a/ethtool-bitops.h b/ethtool-bitops.h
>> new file mode 100644
>> index 0000000..0ff14f1
>> --- /dev/null
>> +++ b/ethtool-bitops.h
>> @@ -0,0 +1,25 @@
>> +#ifndef ETHTOOL_BITOPS_H__
>> +#define ETHTOOL_BITOPS_H__
>> +
>> +#define BITS_PER_BYTE		8
>> +#define BITS_PER_LONG		(BITS_PER_BYTE * sizeof(long))
>> +#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
>> +#define BITS_TO_LONGS(nr)	DIV_ROUND_UP(nr, BITS_PER_LONG)
>> +
>> +static inline void set_bit(int nr, unsigned long *addr)
>> +{
>> +	addr[nr / BITS_PER_LONG] |= 1UL<<  (nr % BITS_PER_LONG);
>> +}
>> +
>> +static inline void clear_bit(int nr, unsigned long *addr)
>> +{
>> +	addr[nr / BITS_PER_LONG]&= ~(1UL<<  (nr % BITS_PER_LONG));
>> +}
>> +
>> +static __always_inline int test_bit(unsigned int nr, const unsigned long *addr)
>> +{
>> +	return !!((1UL<<  (nr % BITS_PER_LONG))&
>> +		  (((unsigned long *)addr)[nr / BITS_PER_LONG]));
>> +}
>
> Where is __always_inline supposed to be defined?

Sorry that should have just been inline.  I forgot we have to take tools 
other than gcc into account.

>> +#endif
>> diff --git a/ethtool-util.h b/ethtool-util.h
>> index f053028..3d46faf 100644
>> --- a/ethtool-util.h
>> +++ b/ethtool-util.h
>> @@ -5,15 +5,18 @@
>>
>>   #include<sys/types.h>
>>   #include<endian.h>
>> +#include<sys/ioctl.h>
>> +#include<net/if.h>
>> +#include "ethtool-config.h"
>> +#include "ethtool-copy.h"
>>
>>   /* ethtool.h expects these to be defined by<linux/types.h>  */
>>   #ifndef HAVE_BE_TYPES
>>   typedef __uint16_t __be16;
>>   typedef __uint32_t __be32;
>> +typedef unsigned long long __be64;
>>   #endif
>>
>> -#include "ethtool-copy.h"
>> -
>
> You can't move the inclusion of ethtool-copy.h; that defeats the whole
> purpose of the HAVE_BE_TYPES feature test.

You're correct. I will get that corrected so that it is located after 
the definitions of the types.  The key bit that mattered was getting 
ethtool-config.h in there before the type declarations.  I need it in 
place to address the test for HAVE_BE_TYPES.

> [...]
>> +#ifndef ARRAY_SIZE
>> +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
>> +#endif
>> +
>> +#ifndef SIOCETHTOOL
>> +#define SIOCETHTOOL     0x8946
>>   #endif
>>
>>   /* National Semiconductor DP83815, DP83816 */
>> diff --git a/ethtool.c b/ethtool.c
>> index 9ad7000..15af86a 100644
>> --- a/ethtool.c
>> +++ b/ethtool.c
>> @@ -45,16 +45,9 @@
>>   #include<linux/sockios.h>
>>   #include "ethtool-util.h"
>>
>> -
>> -#ifndef SIOCETHTOOL
>> -#define SIOCETHTOOL     0x8946
>> -#endif
>>   #ifndef MAX_ADDR_LEN
>>   #define MAX_ADDR_LEN	32
>>   #endif
>> -#ifndef ARRAY_SIZE
>> -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
>> -#endif
>>
>>   #ifndef HAVE_NETIF_MSG
>>   enum {
>>
>
> Presumably this is needed by the next patch, but it has nothing to do
> with what the commit message says.
>
> Ben.
>

Yes. These two moves and the addition of certain headers to the 
ethtool-util.h were to address the needs of both the rxclass.c file and 
the ethtool.c file in one central location.  I will probably break those 
off into a separate patch.

On a side note, is there a git tree somewhere I can re-base off of?  At 
this point I know you have pulled in a number of patches and I figure it 
would be helpful for me to clean up my tree so I am not guessing what is 
there and what isn't.

Thanks,

Alex




^ permalink raw reply

* Re: [RFC PATCH] netlink: Increase netlink dump skb message size
From: Eric Dumazet @ 2011-04-27 16:30 UTC (permalink / raw)
  To: Steve Hodgson
  Cc: Rose, Gregory V, David Miller, netdev@vger.kernel.org,
	bhutchings@solarflare.com
In-Reply-To: <4DB83A60.5040102@solarflare.com>

Le mercredi 27 avril 2011 à 16:46 +0100, Steve Hodgson a écrit :
> On 04/27/2011 04:24 PM, Eric Dumazet wrote:
> > Le mardi 26 avril 2011 à 09:12 -0700, Rose, Gregory V a écrit :
> >
> >> I'm fine with however you folks want to approach this, just give me some direction.
> >
> > I would just try following patch :
> >
> 
> This allows the sfc driver to use 102 VFs, up from the current limit of 
> 45 VFs.
> 
> It's unfortunate that this patch isn't sufficient to allow all 127 VFs 
> to be used, but whilst we wait for a new netlink api this is an 
> improvement worth having.
> 

netlink recvmsg() supports MSG_PEEK so user would get the needed size of
its buffer before calling the real recvmsg()

big blobs could be attached as skb fragments (up to 64Kbytes), but do we
really want this...




^ permalink raw reply

* Re: Linux TCP's Robustness to Multipath Packet Reordering
From: Dominik Kaspar @ 2011-04-27 16:22 UTC (permalink / raw)
  To: Carsten Wolff
  Cc: John Heffner, Eric Dumazet, netdev, Zimmermann Alexander,
	Lennart Schulte, Arnd Hannemann
In-Reply-To: <201104271157.49386.carsten@wolffcarsten.de>

Hi Carsten,

Thanks for your feedback. I made some new tests with the same setup of
packet-based forwarding over two emulated paths (600 KB/s, 10 ms) +
(400 KB/s, 100 ms). In the first experiments, which showed a step-wise
adaptation to reordering, SACK, DSACK, and Timestamps were all
enabled. In the experiments, I individually disabled these three
mechanisms and saw the following:

- Disabling timestamps causes TCP to never adjust to reordering at all.
- Disabling SACK allows TCP to adapt very rapidly ("perfect" aggregation!).
- Disabling DSACK has no obvious impact (still a step-wise throughput).

Is there an explanation for why turning off SACK can be beneficial in
the presence of packet reordering? That sounds pretty
counter-intuitive to me... I thought SACK=1 always performs better
than SACK=0. The results are also illustrated in the following plot.
For each setting, there are three runs, which all exhibit a similar
behavior:

http://home.simula.no/~kaspar/static/mptcp-emu-wlan-hspa-02-sack.png

Greetings,
Dominik

On Wed, Apr 27, 2011 at 11:57 AM, Carsten Wolff <carsten@wolffcarsten.de> wrote:
> Hi all,
>
> On Tuesday 26 April 2011, John Heffner wrote:
>> First, TCP is definitely not designed to work under such conditions.
>> For example, assumptions behind RTO calculation and fast retransmit
>> heuristics are violated.  However, in this particular case my first
>> guess is that you are being limited by "cwnd moderation," which was
>> the topic of recent discussion here.  Under persistent reordering,
>> cwnd moderation can inhibit the ability of cwnd to grow.
>
> it's not just cwnd moderation (of which I'm still in favor, even though I lost
> the argument by inactivity ;-)).
>
> Anyway, there are a lot of things in reordering handling that can be improved.
> Our group (Alexander, Lennart, Arnd, myself and others) has worked on the
> problem for a long time now. This work resulted in an algorithm that is in
> large parts TCP-NCR (RFC4653), but also utilizes information gathered by
> reordering detection for determination of a good DupThresh, fixes a few
> problems in RFC4653 and improves on the reordering detection in Linux when the
> connection has no timestamps option. We implemented "pure" TCP-NCR and our own
> variant in Linux using a modular framework similar to the congestion control
> modules. A lot of measurements and evaluation have gone into the comparison of
> the three algorithms. We are now very close(TM) to a final patch, that is more
> suited for publication on this list and integrates our algorithm into tcp*.
> [hc] without introducing the overhead of that modular framework.
>
> Greetings,
> Carsten
>

^ permalink raw reply

* Re: [PATCH] tg3: Convert u32 flag,flg2,flg3 uses to bitmap
From: Matt Carlson @ 2011-04-27 16:28 UTC (permalink / raw)
  To: Joe Perches
  Cc: Matthew Carlson, Michael Chan, Eric Dumazet,
	netdev@vger.kernel.org
In-Reply-To: <1303841530.24299.37.camel@Joe-Laptop>

On Tue, Apr 26, 2011 at 11:12:10AM -0700, Joe Perches wrote:
> Using a bitmap instead of separate u32 flags allows a consistent, simpler
> and more extensible mechanism to determine capabilities.
> 
> Convert bitmasks to enum.
> Add tg3_flag, tg3_flag_clear and tg3_flag_set.
> Convert the flag & bitmask tests.
> 
> Signed-off-by: Joe Perches <joe@perches.com>

I looked through it and didn't see any problems.  I've queued
Michael Chan's suggestion into my queue.

Acked-by: Matt Carlson <mcarlson@broadcom.com>


^ permalink raw reply

* [PATCH 13/13] mm: Account for the number of times direct reclaimers get throttled
From: Mel Gorman @ 2011-04-27 16:08 UTC (permalink / raw)
  To: Linux-MM, Linux-Netdev
  Cc: LKML, David Miller, Neil Brown, Peter Zijlstra, Mel Gorman
In-Reply-To: <1303920491-25302-1-git-send-email-mgorman@suse.de>

Under significant pressure when writing back to network-backed storage,
direct reclaimers may get throttled. This is expected to be a
short-lived event and the processes get woken up again but processes do
get stalled. This patch counts how many times such stalling occurs. It's
up to the administrator whether to reduce these stalls by increasing
min_free_kbytes.

Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/linux/vm_event_item.h |    1 +
 mm/vmscan.c                   |    1 +
 mm/vmstat.c                   |    1 +
 3 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 03b90cdc..652e5f3 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -29,6 +29,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		FOR_ALL_ZONES(PGSTEAL),
 		FOR_ALL_ZONES(PGSCAN_KSWAPD),
 		FOR_ALL_ZONES(PGSCAN_DIRECT),
+		PGSCAN_DIRECT_THROTTLE,
 #ifdef CONFIG_NUMA
 		PGSCAN_ZONE_RECLAIM_FAILED,
 #endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7f1099f..1b042d8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2152,6 +2152,7 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
 		return;
 
 	/* Throttle */
+	count_vm_event(PGSCAN_DIRECT_THROTTLE);
 	wait_event_interruptible(zone->zone_pgdat->pfmemalloc_wait,
 		pfmemalloc_watermark_ok(zone->zone_pgdat, high_zoneidx));
 }
diff --git a/mm/vmstat.c b/mm/vmstat.c
index a2b7344..5725387 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -911,6 +911,7 @@ const char * const vmstat_text[] = {
 	TEXTS_FOR_ZONES("pgsteal")
 	TEXTS_FOR_ZONES("pgscan_kswapd")
 	TEXTS_FOR_ZONES("pgscan_direct")
+	"pgscan_direct_throttle",
 
 #ifdef CONFIG_NUMA
 	"zone_reclaim_failed",
-- 
1.7.3.4


^ permalink raw reply related

* [PATCH 12/13] mm: Throttle direct reclaimers if PF_MEMALLOC reserves are low and swap is backed by network storage
From: Mel Gorman @ 2011-04-27 16:08 UTC (permalink / raw)
  To: Linux-MM, Linux-Netdev
  Cc: LKML, David Miller, Neil Brown, Peter Zijlstra, Mel Gorman
In-Reply-To: <1303920491-25302-1-git-send-email-mgorman@suse.de>

If swap is backed by network storage such as NBD, there is a risk that a
large number of reclaimers can hang the system by consuming all
PF_MEMALLOC reserves. To avoid these hangs, the administrator must tune
min_free_kbytes in advance. This patch will throttle direct reclaimers
if half the PF_MEMALLOC reserves are in use as the system is at risk of
hanging.

Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/linux/mmzone.h |    1 +
 mm/page_alloc.c        |    1 +
 mm/vmscan.c            |   54 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+), 0 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e56f835..8e5c627 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -638,6 +638,7 @@ typedef struct pglist_data {
 					     range, including holes */
 	int node_id;
 	wait_queue_head_t kswapd_wait;
+	wait_queue_head_t pfmemalloc_wait;
 	struct task_struct *kswapd;
 	int kswapd_max_order;
 	enum zone_type classzone_idx;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5ff1f71..eee10ec 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4225,6 +4225,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 	pgdat_resize_init(pgdat);
 	pgdat->nr_zones = 0;
 	init_waitqueue_head(&pgdat->kswapd_wait);
+	init_waitqueue_head(&pgdat->pfmemalloc_wait);
 	pgdat->kswapd_max_order = 0;
 	pgdat_page_cgroup_init(pgdat);
 	
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b3a569f..7f1099f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2117,6 +2117,45 @@ out:
 	return 0;
 }
 
+static bool pfmemalloc_watermark_ok(pg_data_t *pgdat, int high_zoneidx)
+{
+	struct zone *zone;
+	unsigned long pfmemalloc_reserve = 0;
+	unsigned long free_pages = 0;
+	int i;
+
+	for (i = 0; i <= high_zoneidx; i++) {
+		zone = &pgdat->node_zones[i];
+		pfmemalloc_reserve += min_wmark_pages(zone);
+		free_pages += zone_page_state(zone, NR_FREE_PAGES);
+	}
+
+	return (free_pages > pfmemalloc_reserve / 2) ? true : false;
+}
+
+/*
+ * Throttle direct reclaimers if backing storage is backed by the network
+ * and the PFMEMALLOC reserve for the preferred node is getting dangerously
+ * depleted. kswapd will continue to make progress and wake the processes
+ * when the low watermark is reached
+ */
+static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
+					nodemask_t *nodemask)
+{
+	struct zone *zone;
+	int high_zoneidx = gfp_zone(gfp_mask);
+	DEFINE_WAIT(wait);
+
+	/* Check if the pfmemalloc reserves are ok */
+	first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
+	if (pfmemalloc_watermark_ok(zone->zone_pgdat, high_zoneidx))
+		return;
+
+	/* Throttle */
+	wait_event_interruptible(zone->zone_pgdat->pfmemalloc_wait,
+		pfmemalloc_watermark_ok(zone->zone_pgdat, high_zoneidx));
+}
+
 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 				gfp_t gfp_mask, nodemask_t *nodemask)
 {
@@ -2133,6 +2172,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.nodemask = nodemask,
 	};
 
+	throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
+
+	/*
+	 * Do not enter reclaim if fatal signal is pending. 1 is returned so
+	 * that the page allocator does not consider triggering OOM
+	 */
+	if (fatal_signal_pending(current))
+		return 1;
+
 	trace_mm_vmscan_direct_reclaim_begin(order,
 				sc.may_writepage,
 				gfp_mask);
@@ -2488,6 +2536,12 @@ loop_again:
 			}
 
 		}
+
+		/* Wake throttled direct reclaimers if low watermark is met */
+		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
+				pfmemalloc_watermark_ok(pgdat, MAX_NR_ZONES - 1))
+			wake_up_interruptible(&pgdat->pfmemalloc_wait);
+
 		if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
 			break;		/* kswapd: all done */
 		/*
-- 
1.7.3.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH 11/13] nbd: Set SOCK_MEMALLOC for access to PFMEMALLOC reserves
From: Mel Gorman @ 2011-04-27 16:08 UTC (permalink / raw)
  To: Linux-MM, Linux-Netdev
  Cc: LKML, David Miller, Neil Brown, Peter Zijlstra, Mel Gorman
In-Reply-To: <1303920491-25302-1-git-send-email-mgorman@suse.de>

Set SOCK_MEMALLOC on the NBD socket to allow access to PFMEMALLOC
reserves so pages backed by NBD, particularly if swap related,
can be cleaned to prevent the machine being deadlocked. It is
still possible that the PFMEMALLOC reserves get depleted resulting
in deadlock but this can be resolved by the administrator by
increasing min_free_kbytes.

Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 drivers/block/nbd.c |    7 ++++++-
 1 files changed, 6 insertions(+), 1 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index e6fc716..322cef8 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -156,6 +156,7 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
 	struct msghdr msg;
 	struct kvec iov;
 	sigset_t blocked, oldset;
+	unsigned long pflags = current->flags;
 
 	if (unlikely(!sock)) {
 		printk(KERN_ERR "%s: Attempted %s on closed socket in sock_xmit\n",
@@ -168,8 +169,9 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
 	siginitsetinv(&blocked, sigmask(SIGKILL));
 	sigprocmask(SIG_SETMASK, &blocked, &oldset);
 
+	current->flags |= PF_MEMALLOC;
 	do {
-		sock->sk->sk_allocation = GFP_NOIO;
+		sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
 		iov.iov_base = buf;
 		iov.iov_len = size;
 		msg.msg_name = NULL;
@@ -214,6 +216,7 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
 	} while (size > 0);
 
 	sigprocmask(SIG_SETMASK, &oldset, NULL);
+	tsk_restore_flags(current, pflags, PF_MEMALLOC);
 
 	return result;
 }
@@ -404,6 +407,8 @@ static int nbd_do_it(struct nbd_device *lo)
 
 	BUG_ON(lo->magic != LO_MAGIC);
 
+	sk_set_memalloc(lo->sock->sk);
+
 	lo->pid = current->pid;
 	ret = sysfs_create_file(&disk_to_dev(lo->disk)->kobj, &pid_attr.attr);
 	if (ret) {
-- 
1.7.3.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH 10/13] mm: Micro-optimise slab to avoid a function call
From: Mel Gorman @ 2011-04-27 16:08 UTC (permalink / raw)
  To: Linux-MM, Linux-Netdev
  Cc: LKML, David Miller, Neil Brown, Peter Zijlstra, Mel Gorman
In-Reply-To: <1303920491-25302-1-git-send-email-mgorman@suse.de>

Getting and putting objects in SLAB currently requires a function call
but the bulk of the work is related to PFMEMALLOC reserves which are
only consumed when network-backed storage is critical. Use an inline
function to determine if the function call is required.

Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 mm/slab.c |   28 ++++++++++++++++++++++++++--
 1 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
index 342c7c7..1504096 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -116,6 +116,8 @@
 #include	<linux/kmemcheck.h>
 #include	<linux/memory.h>
 
+#include	<net/sock.h>
+
 #include	<asm/cacheflush.h>
 #include	<asm/tlbflush.h>
 #include	<asm/page.h>
@@ -944,7 +946,7 @@ static void check_ac_pfmemalloc(struct kmem_cache *cachep,
 	ac->pfmemalloc = false;
 }
 
-static void *ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
+static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
 						gfp_t flags, bool force_refill)
 {
 	int i;
@@ -991,7 +993,20 @@ static void *ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
 	return objp;
 }
 
-static void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
+static inline void *ac_get_obj(struct kmem_cache *cachep,
+			struct array_cache *ac, gfp_t flags, bool force_refill)
+{
+	void *objp;
+
+	if (unlikely(sk_memalloc_socks()))
+		objp = __ac_get_obj(cachep, ac, flags, force_refill);
+	else
+		objp = ac->entry[--ac->avail];
+
+	return objp;
+}
+
+static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
 								void *objp)
 {
 	struct slab *slabp;
@@ -1004,6 +1019,15 @@ static void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
 			set_obj_pfmemalloc(&objp);
 	}
 
+	return objp;
+}
+
+static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
+								void *objp)
+{
+	if (unlikely(sk_memalloc_socks()))
+		objp = __ac_put_obj(cachep, ac, objp);
+
 	ac->entry[ac->avail++] = objp;
 }
 
-- 
1.7.3.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH 09/13] netvm: Set PF_MEMALLOC as appropriate during SKB processing
From: Mel Gorman @ 2011-04-27 16:08 UTC (permalink / raw)
  To: Linux-MM, Linux-Netdev
  Cc: LKML, David Miller, Neil Brown, Peter Zijlstra, Mel Gorman
In-Reply-To: <1303920491-25302-1-git-send-email-mgorman@suse.de>

In order to make sure pfmemalloc packets receive all memory needed to proceed,
ensure processing of pfmemalloc SKBs happens under PF_MEMALLOC. This is
limited to a subset of protocols that are expected to be used for writing
to swap. Taps are not allowed to use PF_MEMALLOC as these are expected to
communicate with userspace processes which could be paged out.

[a.p.zijlstra@chello.nl: Ideas taken from various patches]
[jslaby@suse.cz: Lock imbalance fix]
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/net/sock.h |    5 +++++
 net/core/dev.c     |   48 ++++++++++++++++++++++++++++++++++++++++++++----
 net/core/sock.c    |   16 ++++++++++++++++
 3 files changed, 65 insertions(+), 4 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index e3aaa88..e928880 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -668,8 +668,13 @@ static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *s
 	return 0;
 }
 
+extern int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb);
+
 static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 {
+	if (skb_pfmemalloc(skb))
+		return __sk_backlog_rcv(sk, skb);
+
 	return sk->sk_backlog_rcv(sk, skb);
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 3871bf6..0b69efb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3095,6 +3095,23 @@ static void vlan_on_bond_hook(struct sk_buff *skb)
 	}
 }
 
+/*
+ * Limit which protocols can use the PFMEMALLOC reserves to those that are
+ * expected to be used for communication with swap.
+ */
+static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
+{
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_ARP):
+	case __constant_htons(ETH_P_IP):
+	case __constant_htons(ETH_P_IPV6):
+	case __constant_htons(ETH_P_8021Q):
+		return true;
+	default:
+		return false;
+	}
+}
+
 static int __netif_receive_skb(struct sk_buff *skb)
 {
 	struct packet_type *ptype, *pt_prev;
@@ -3104,15 +3121,28 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	bool deliver_exact = false;
 	int ret = NET_RX_DROP;
 	__be16 type;
+	unsigned long pflags = current->flags;
 
 	if (!netdev_tstamp_prequeue)
 		net_timestamp_check(skb);
 
 	trace_netif_receive_skb(skb);
 
+	/*
+	 * PFMEMALLOC skbs are special, they should
+	 * - be delivered to SOCK_MEMALLOC sockets only
+	 * - stay away from userspace
+	 * - have bounded memory usage
+	 *
+	 * Use PF_MEMALLOC as this saves us from propagating the allocation
+	 * context down to all allocation sites.
+	 */
+	if (skb_pfmemalloc(skb))
+		current->flags |= PF_MEMALLOC;
+
 	/* if we've gotten here through NAPI, check netpoll */
 	if (netpoll_receive_skb(skb))
-		return NET_RX_DROP;
+		goto out;
 
 	if (!skb->skb_iif)
 		skb->skb_iif = skb->dev->ifindex;
@@ -3143,6 +3173,9 @@ another_round:
 	}
 #endif
 
+	if (skb_pfmemalloc(skb))
+		goto skip_taps;
+
 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
 		if (!ptype->dev || ptype->dev == skb->dev) {
 			if (pt_prev)
@@ -3151,13 +3184,17 @@ another_round:
 		}
 	}
 
+skip_taps:
 #ifdef CONFIG_NET_CLS_ACT
 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
 	if (!skb)
-		goto out;
+		goto unlock;
 ncls:
 #endif
 
+	if (skb_pfmemalloc(skb) && !skb_pfmemalloc_protocol(skb))
+		goto drop;
+
 	rx_handler = rcu_dereference(skb->dev->rx_handler);
 	if (rx_handler) {
 		if (pt_prev) {
@@ -3166,7 +3203,7 @@ ncls:
 		}
 		switch (rx_handler(&skb)) {
 		case RX_HANDLER_CONSUMED:
-			goto out;
+			goto unlock;
 		case RX_HANDLER_ANOTHER:
 			goto another_round;
 		case RX_HANDLER_EXACT:
@@ -3210,6 +3247,7 @@ ncls:
 	if (pt_prev) {
 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 	} else {
+drop:
 		atomic_long_inc(&skb->dev->rx_dropped);
 		kfree_skb(skb);
 		/* Jamal, now you will not able to escape explaining
@@ -3218,8 +3256,10 @@ ncls:
 		ret = NET_RX_DROP;
 	}
 
-out:
+unlock:
 	rcu_read_unlock();
+out:
+	tsk_restore_flags(current, pflags, PF_MEMALLOC);
 	return ret;
 }
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 8308609..ac36807 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -245,6 +245,22 @@ void sk_clear_memalloc(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 
+int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	int ret;
+	unsigned long pflags = current->flags;
+
+	/* these should have been dropped before queueing */
+	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
+
+	current->flags |= PF_MEMALLOC;
+	ret = sk->sk_backlog_rcv(sk, skb);
+	tsk_restore_flags(current, pflags, PF_MEMALLOC);
+
+	return ret;
+}
+EXPORT_SYMBOL(__sk_backlog_rcv);
+
 #if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP)
 int net_cls_subsys_id = -1;
 EXPORT_SYMBOL_GPL(net_cls_subsys_id);
-- 
1.7.3.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH 08/13] netvm: Allow skb allocation to use PFMEMALLOC reserves
From: Mel Gorman @ 2011-04-27 16:08 UTC (permalink / raw)
  To: Linux-MM, Linux-Netdev
  Cc: LKML, David Miller, Neil Brown, Peter Zijlstra, Mel Gorman
In-Reply-To: <1303920491-25302-1-git-send-email-mgorman@suse.de>

Change the skb allocation API to indicate RX usage and use this to fall back
to the PFMEMALLOC reserve when needed. SKBs allocated from the reserve are
tagged in skb->pfmemalloc. If an SKB is allocated from the reserve and
the socket is later found to be unrelated to page reclaim, the packet is
dropped so that the memory remains available for page reclaim. Network
protocols are expected to recover from this packet loss.

[a.p.zijlstra@chello.nl: Ideas taken from various patches]
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/linux/gfp.h    |    3 ++
 include/linux/skbuff.h |   19 ++++++++--
 include/net/sock.h     |    6 +++
 mm/internal.h          |    3 --
 net/core/filter.c      |    8 ++++
 net/core/skbuff.c      |   95 ++++++++++++++++++++++++++++++++++++++++--------
 net/core/sock.c        |    4 ++
 7 files changed, 116 insertions(+), 22 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 7aa80f0..3e7ba18 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -380,6 +380,9 @@ void drain_local_pages(void *dummy);
 
 extern gfp_t gfp_allowed_mask;
 
+/* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
+
 extern void pm_restrict_gfp_mask(void);
 extern void pm_restore_gfp_mask(void);
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d0ae90a..e08e2ce 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -396,6 +396,7 @@ struct sk_buff {
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
 	__u8			ndisc_nodetype:2;
 #endif
+	__u8			pfmemalloc:1;
 	__u8			ooo_okay:1;
 	kmemcheck_bitfield_end(flags2);
 
@@ -434,6 +435,15 @@ struct sk_buff {
 
 #include <asm/system.h>
 
+#define SKB_ALLOC_FCLONE	0x01
+#define SKB_ALLOC_RX		0x02
+
+/* Returns true if the skb was allocated from PFMEMALLOC reserves */
+static inline bool skb_pfmemalloc(struct sk_buff *skb)
+{
+	return unlikely(skb->pfmemalloc);
+}
+
 /*
  * skb might have a dst pointer attached, refcounted or not.
  * _skb_refdst low order bit is set if refcount was _not_ taken
@@ -491,7 +501,7 @@ extern void kfree_skb(struct sk_buff *skb);
 extern void consume_skb(struct sk_buff *skb);
 extern void	       __kfree_skb(struct sk_buff *skb);
 extern struct sk_buff *__alloc_skb(unsigned int size,
-				   gfp_t priority, int fclone, int node);
+				   gfp_t priority, int flags, int node);
 static inline struct sk_buff *alloc_skb(unsigned int size,
 					gfp_t priority)
 {
@@ -501,7 +511,7 @@ static inline struct sk_buff *alloc_skb(unsigned int size,
 static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
 					       gfp_t priority)
 {
-	return __alloc_skb(size, priority, 1, NUMA_NO_NODE);
+	return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE);
 }
 
 extern bool skb_recycle_check(struct sk_buff *skb, int skb_size);
@@ -1527,7 +1537,8 @@ static inline void __skb_queue_purge(struct sk_buff_head *list)
 static inline struct sk_buff *__dev_alloc_skb(unsigned int length,
 					      gfp_t gfp_mask)
 {
-	struct sk_buff *skb = alloc_skb(length + NET_SKB_PAD, gfp_mask);
+	struct sk_buff *skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask,
+						SKB_ALLOC_RX, NUMA_NO_NODE);
 	if (likely(skb))
 		skb_reserve(skb, NET_SKB_PAD);
 	return skb;
@@ -1578,7 +1589,7 @@ static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev,
  */
 static inline struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask)
 {
-	return alloc_pages_node(NUMA_NO_NODE, gfp_mask, 0);
+	return alloc_pages_node(NUMA_NO_NODE, gfp_mask | __GFP_MEMALLOC, 0);
 }
 
 /**
diff --git a/include/net/sock.h b/include/net/sock.h
index 046bc97..e3aaa88 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -586,6 +586,12 @@ static inline int sock_flag(struct sock *sk, enum sock_flags flag)
 	return test_bit(flag, &sk->sk_flags);
 }
 
+extern atomic_t memalloc_socks;
+static inline int sk_memalloc_socks(void)
+{
+	return atomic_read(&memalloc_socks);
+}
+
 static inline gfp_t sk_allocation(struct sock *sk, gfp_t gfp_mask)
 {
 	return gfp_mask | (sk->sk_allocation & __GFP_MEMALLOC);
diff --git a/mm/internal.h b/mm/internal.h
index a520f3b..d071d380 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -193,9 +193,6 @@ static inline struct page *mem_map_next(struct page *iter,
 #define __paginginit __init
 #endif
 
-/* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */
-bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
-
 /* Memory initialisation debug and verification */
 enum mminit_level {
 	MMINIT_WARNING,
diff --git a/net/core/filter.c b/net/core/filter.c
index afb8afb..eac4cc5 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -138,6 +138,14 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
 	int err;
 	struct sk_filter *filter;
 
+	/*
+	 * If the skb was allocated from pfmemalloc reserves, only
+	 * allow SOCK_MEMALLOC sockets to use it as this socket is
+	 * helping free memory
+	 */
+	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
+		return -ENOMEM;
+
 	err = security_sock_rcv_skb(sk, skb);
 	if (err)
 		return err;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7ebeed0..c90074d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -146,6 +146,43 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
 	BUG();
 }
 
+
+/*
+ * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
+ * the caller if emergency pfmemalloc reserves are being used. If it is and
+ * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
+ * may be used. Otherwise, the packet data may be discarded until enough
+ * memory is free
+ */
+#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
+	 __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)
+void *__kmalloc_reserve(size_t size, gfp_t flags, int node, unsigned long ip,
+			 bool *pfmemalloc)
+{
+	void *obj;
+	bool ret_pfmemalloc = false;
+
+	/*
+	 * Try a regular allocation, when that fails and we're not entitled
+	 * to the reserves, fail.
+	 */
+	obj = kmalloc_node_track_caller(size,
+				flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
+				node);
+	if (obj || !(gfp_pfmemalloc_allowed(flags)))
+		goto out;
+
+	/* Try again but now we are using pfmemalloc reserves */
+	ret_pfmemalloc = true;
+	obj = kmalloc_node_track_caller(size, flags, node);
+
+out:
+	if (pfmemalloc)
+		*pfmemalloc = ret_pfmemalloc;
+
+	return obj;
+}
+
 /* 	Allocate a new skbuff. We do this ourselves so we can fill in a few
  *	'private' fields and also do memory statistics to find all the
  *	[BEEP] leaks.
@@ -156,8 +193,10 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
  *	__alloc_skb	-	allocate a network buffer
  *	@size: size to allocate
  *	@gfp_mask: allocation mask
- *	@fclone: allocate from fclone cache instead of head cache
- *		and allocate a cloned (child) skb
+ *	@flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
+ *		instead of head cache and allocate a cloned (child) skb.
+ *		If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
+ *		allocations in case the data is required for writeback
  *	@node: numa node to allocate memory on
  *
  *	Allocate a new &sk_buff. The returned buffer has no headroom and a
@@ -168,14 +207,19 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
  *	%GFP_ATOMIC.
  */
 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
-			    int fclone, int node)
+			    int flags, int node)
 {
 	struct kmem_cache *cache;
 	struct skb_shared_info *shinfo;
 	struct sk_buff *skb;
 	u8 *data;
+	bool pfmemalloc;
+
+	cache = (flags & SKB_ALLOC_FCLONE)
+		? skbuff_fclone_cache : skbuff_head_cache;
 
-	cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
+	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
+		gfp_mask |= __GFP_MEMALLOC;
 
 	/* Get the HEAD */
 	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
@@ -184,8 +228,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	prefetchw(skb);
 
 	size = SKB_DATA_ALIGN(size);
-	data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
-			gfp_mask, node);
+	data = kmalloc_reserve(size + sizeof(struct skb_shared_info),
+			gfp_mask, node, &pfmemalloc);
 	if (!data)
 		goto nodata;
 	prefetchw(data + size);
@@ -196,6 +240,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	 * the tail pointer in struct sk_buff!
 	 */
 	memset(skb, 0, offsetof(struct sk_buff, tail));
+	skb->pfmemalloc = pfmemalloc;
 	skb->truesize = size + sizeof(struct sk_buff);
 	atomic_set(&skb->users, 1);
 	skb->head = data;
@@ -212,7 +257,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	atomic_set(&shinfo->dataref, 1);
 	kmemcheck_annotate_variable(shinfo->destructor_arg);
 
-	if (fclone) {
+	if (flags & SKB_ALLOC_FCLONE) {
 		struct sk_buff *child = skb + 1;
 		atomic_t *fclone_ref = (atomic_t *) (child + 1);
 
@@ -222,6 +267,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 		atomic_set(fclone_ref, 1);
 
 		child->fclone = SKB_FCLONE_UNAVAILABLE;
+		child->pfmemalloc = pfmemalloc;
 	}
 out:
 	return skb;
@@ -250,7 +296,8 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
 {
 	struct sk_buff *skb;
 
-	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE);
+	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask,
+						SKB_ALLOC_RX, NUMA_NO_NODE);
 	if (likely(skb)) {
 		skb_reserve(skb, NET_SKB_PAD);
 		skb->dev = dev;
@@ -526,6 +573,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 	new->ipvs_property	= old->ipvs_property;
 #endif
+	new->pfmemalloc		= old->pfmemalloc;
 	new->protocol		= old->protocol;
 	new->mark		= old->mark;
 	new->skb_iif		= old->skb_iif;
@@ -620,6 +668,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 		n->fclone = SKB_FCLONE_CLONE;
 		atomic_inc(fclone_ref);
 	} else {
+		if (skb_pfmemalloc(skb))
+			gfp_mask |= __GFP_MEMALLOC;
+
 		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
 		if (!n)
 			return NULL;
@@ -656,6 +707,13 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
 }
 
+static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
+{
+	if (skb_pfmemalloc((struct sk_buff *)skb))
+		return SKB_ALLOC_RX;
+	return 0;
+}
+
 /**
  *	skb_copy	-	create private copy of an sk_buff
  *	@skb: buffer to copy
@@ -677,7 +735,8 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
 {
 	int headerlen = skb_headroom(skb);
 	unsigned int size = (skb_end_pointer(skb) - skb->head) + skb->data_len;
-	struct sk_buff *n = alloc_skb(size, gfp_mask);
+	struct sk_buff *n = __alloc_skb(size, gfp_mask,
+					skb_alloc_rx_flag(skb), NUMA_NO_NODE);
 
 	if (!n)
 		return NULL;
@@ -711,7 +770,8 @@ EXPORT_SYMBOL(skb_copy);
 struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
 {
 	unsigned int size = skb_end_pointer(skb) - skb->head;
-	struct sk_buff *n = alloc_skb(size, gfp_mask);
+	struct sk_buff *n = __alloc_skb(size, gfp_mask,
+					skb_alloc_rx_flag(skb), NUMA_NO_NODE);
 
 	if (!n)
 		goto out;
@@ -802,7 +862,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 		goto adjust_others;
 	}
 
-	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
+	if (skb_pfmemalloc(skb))
+		gfp_mask |= __GFP_MEMALLOC;
+	data = kmalloc_reserve(size + sizeof(struct skb_shared_info), gfp_mask,
+			NUMA_NO_NODE, NULL);
 	if (!data)
 		goto nodata;
 
@@ -903,8 +966,9 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
 	/*
 	 *	Allocate the copy buffer
 	 */
-	struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom,
-				      gfp_mask);
+	struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
+				      gfp_mask, skb_alloc_rx_flag(skb),
+				      NUMA_NO_NODE);
 	int oldheadroom = skb_headroom(skb);
 	int head_copy_len, head_copy_off;
 	int off;
@@ -2551,8 +2615,9 @@ struct sk_buff *skb_segment(struct sk_buff *skb, u32 features)
 			skb_release_head_state(nskb);
 			__skb_push(nskb, doffset);
 		} else {
-			nskb = alloc_skb(hsize + doffset + headroom,
-					 GFP_ATOMIC);
+			nskb = __alloc_skb(hsize + doffset + headroom,
+					 GFP_ATOMIC, skb_alloc_rx_flag(skb),
+					 NUMA_NO_NODE);
 
 			if (unlikely(!nskb))
 				goto err;
diff --git a/net/core/sock.c b/net/core/sock.c
index c685eda..8308609 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -219,6 +219,8 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 EXPORT_SYMBOL(sysctl_optmem_max);
 
+atomic_t memalloc_socks __read_mostly;
+
 /**
  * sk_set_memalloc - sets %SOCK_MEMALLOC
  * @sk: socket to set it on
@@ -231,6 +233,7 @@ void sk_set_memalloc(struct sock *sk)
 {
 	sock_set_flag(sk, SOCK_MEMALLOC);
 	sk->sk_allocation |= __GFP_MEMALLOC;
+	atomic_inc(&memalloc_socks);
 }
 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 
@@ -238,6 +241,7 @@ void sk_clear_memalloc(struct sock *sk)
 {
 	sock_reset_flag(sk, SOCK_MEMALLOC);
 	sk->sk_allocation &= ~__GFP_MEMALLOC;
+	atomic_dec(&memalloc_socks);
 }
 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 
-- 
1.7.3.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox