Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v4 4/5] net: introduce NETIF_F_RXCSUM
From: Michał Mirosław @ 2011-02-03 14:21 UTC (permalink / raw)
  To: netdev; +Cc: Ben Hutchings
In-Reply-To: <cover.1296741561.git.mirq-linux@rere.qmqm.pl>

Introduce NETIF_F_RXCSUM to replace device-private flags for RX checksum
offload. Integrate it with ndo_fix_features.

ethtool_op_get_rx_csum() is removed altogether as nothing in-tree uses it.

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Reviewed-by: Ben Hutchings <bhutchings@solarflare.com>                                                    

---
 include/linux/ethtool.h   |    1 -
 include/linux/netdevice.h |    5 ++++-
 net/core/ethtool.c        |   36 ++++++++++++------------------------
 3 files changed, 16 insertions(+), 26 deletions(-)

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 806e716..54d776c 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -625,7 +625,6 @@ struct net_device;
 
 /* Some generic methods drivers may use in their ethtool_ops */
 u32 ethtool_op_get_link(struct net_device *dev);
-u32 ethtool_op_get_rx_csum(struct net_device *dev);
 u32 ethtool_op_get_tx_csum(struct net_device *dev);
 int ethtool_op_set_tx_csum(struct net_device *dev, u32 data);
 int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4a3e554..45080bb 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -964,6 +964,7 @@ struct net_device {
 #define NETIF_F_FCOE_MTU	(1 << 26) /* Supports max FCoE MTU, 2158 bytes*/
 #define NETIF_F_NTUPLE		(1 << 27) /* N-tuple filters supported */
 #define NETIF_F_RXHASH		(1 << 28) /* Receive hashing offload */
+#define NETIF_F_RXCSUM		(1 << 29) /* Receive checksumming offload */
 
 	/* Segmentation offload features */
 #define NETIF_F_GSO_SHIFT	16
@@ -979,7 +980,7 @@ struct net_device {
 	/* = all defined minus driver/device-class-related */
 #define NETIF_F_NEVER_CHANGE	(NETIF_F_HIGHDMA | NETIF_F_VLAN_CHALLENGED | \
 				  NETIF_F_LLTX | NETIF_F_NETNS_LOCAL)
-#define NETIF_F_ETHTOOL_BITS	(0x1f3fffff & ~NETIF_F_NEVER_CHANGE)
+#define NETIF_F_ETHTOOL_BITS	(0x3f3fffff & ~NETIF_F_NEVER_CHANGE)
 
 	/* List of features with software fallbacks. */
 #define NETIF_F_GSO_SOFTWARE	(NETIF_F_TSO | NETIF_F_TSO_ECN | \
@@ -2501,6 +2502,8 @@ static inline int dev_ethtool_get_settings(struct net_device *dev,
 
 static inline u32 dev_ethtool_get_rx_csum(struct net_device *dev)
 {
+	if (dev->hw_features & NETIF_F_RXCSUM)
+		return !!(dev->features & NETIF_F_RXCSUM);
 	if (!dev->ethtool_ops || !dev->ethtool_ops->get_rx_csum)
 		return 0;
 	return dev->ethtool_ops->get_rx_csum(dev);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 6e7c6f2..52e4272 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -34,12 +34,6 @@ u32 ethtool_op_get_link(struct net_device *dev)
 }
 EXPORT_SYMBOL(ethtool_op_get_link);
 
-u32 ethtool_op_get_rx_csum(struct net_device *dev)
-{
-	return (dev->features & NETIF_F_ALL_CSUM) != 0;
-}
-EXPORT_SYMBOL(ethtool_op_get_rx_csum);
-
 u32 ethtool_op_get_tx_csum(struct net_device *dev)
 {
 	return (dev->features & NETIF_F_ALL_CSUM) != 0;
@@ -276,6 +270,9 @@ static u32 ethtool_get_feature_mask(u32 eth_cmd)
 	case ETHTOOL_GTXCSUM:
 	case ETHTOOL_STXCSUM:
 		return NETIF_F_ALL_CSUM | NETIF_F_SCTP_CSUM;
+	case ETHTOOL_GRXCSUM:
+	case ETHTOOL_SRXCSUM:
+		return NETIF_F_RXCSUM;
 	case ETHTOOL_GSG:
 	case ETHTOOL_SSG:
 		return NETIF_F_SG;
@@ -310,6 +307,7 @@ static int ethtool_get_one_feature(struct net_device *dev, char __user *useraddr
 }
 
 static int __ethtool_set_tx_csum(struct net_device *dev, u32 data);
+static int __ethtool_set_rx_csum(struct net_device *dev, u32 data);
 static int __ethtool_set_sg(struct net_device *dev, u32 data);
 static int __ethtool_set_tso(struct net_device *dev, u32 data);
 static int __ethtool_set_ufo(struct net_device *dev, u32 data);
@@ -347,6 +345,8 @@ static int ethtool_set_one_feature(struct net_device *dev,
 	switch (ethcmd) {
 	case ETHTOOL_STXCSUM:
 		return __ethtool_set_tx_csum(dev, edata.data);
+	case ETHTOOL_SRXCSUM:
+		return __ethtool_set_rx_csum(dev, edata.data);
 	case ETHTOOL_SSG:
 		return __ethtool_set_sg(dev, edata.data);
 	case ETHTOOL_STSO:
@@ -391,7 +391,7 @@ static const char netdev_features_strings[ETHTOOL_DEV_FEATURE_WORDS * 32][ETH_GS
 	/* NETIF_F_FCOE_MTU */        "fcoe-mtu",
 	/* NETIF_F_NTUPLE */          "rx-ntuple-filter",
 	/* NETIF_F_RXHASH */          "rx-hashing",
-	"",
+	/* NETIF_F_RXCSUM */          "rx-checksum",
 	"",
 	"",
 };
@@ -1369,20 +1369,15 @@ static int __ethtool_set_tx_csum(struct net_device *dev, u32 data)
 	return dev->ethtool_ops->set_tx_csum(dev, data);
 }
 
-static int ethtool_set_rx_csum(struct net_device *dev, char __user *useraddr)
+static int __ethtool_set_rx_csum(struct net_device *dev, u32 data)
 {
-	struct ethtool_value edata;
-
 	if (!dev->ethtool_ops->set_rx_csum)
 		return -EOPNOTSUPP;
 
-	if (copy_from_user(&edata, useraddr, sizeof(edata)))
-		return -EFAULT;
-
-	if (!edata.data && dev->ethtool_ops->set_sg)
+	if (!data)
 		dev->features &= ~NETIF_F_GRO;
 
-	return dev->ethtool_ops->set_rx_csum(dev, edata.data);
+	return dev->ethtool_ops->set_rx_csum(dev, data);
 }
 
 static int __ethtool_set_tso(struct net_device *dev, u32 data)
@@ -1730,15 +1725,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_SPAUSEPARAM:
 		rc = ethtool_set_pauseparam(dev, useraddr);
 		break;
-	case ETHTOOL_GRXCSUM:
-		rc = ethtool_get_value(dev, useraddr, ethcmd,
-				       (dev->ethtool_ops->get_rx_csum ?
-					dev->ethtool_ops->get_rx_csum :
-					ethtool_op_get_rx_csum));
-		break;
-	case ETHTOOL_SRXCSUM:
-		rc = ethtool_set_rx_csum(dev, useraddr);
-		break;
 	case ETHTOOL_TEST:
 		rc = ethtool_self_test(dev, useraddr);
 		break;
@@ -1811,6 +1797,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 		rc = ethtool_set_features(dev, useraddr);
 		break;
 	case ETHTOOL_GTXCSUM:
+	case ETHTOOL_GRXCSUM:
 	case ETHTOOL_GSG:
 	case ETHTOOL_GTSO:
 	case ETHTOOL_GUFO:
@@ -1819,6 +1806,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 		rc = ethtool_get_one_feature(dev, useraddr, ethcmd);
 		break;
 	case ETHTOOL_STXCSUM:
+	case ETHTOOL_SRXCSUM:
 	case ETHTOOL_SSG:
 	case ETHTOOL_STSO:
 	case ETHTOOL_SUFO:
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH v4 0/5] net: Unified offload configuration
From: Michał Mirosław @ 2011-02-03 14:21 UTC (permalink / raw)
  To: netdev; +Cc: Ben Hutchings

Here's a v4 of the ethtool unification patch series.

What's in it?
 1:
	the patch - implement unified ethtool setting ops
 2..3:
	implement interoperation between old and new ethtool ops
 4:
	include RX checksum in features and plug it into new framework
 5:
	convert loopback device to new framework

What is it good for?
 - unifies driver behaviour wrt hardware offloads
 - removes a lot of boilerplate code from drivers
 - allows better fine-grained control over used offloads

I'm testing this on ARM Gemini arch now. Patch to ethtool userspace tool
will follow this series. I'm not fond of the GFEATURES output I implemented -
please throw some suggestions on it if you can.

Driver conversions stay the same as in v2 - as for v3, I'll keep them
from resending until after the core code gets accepted.

Patches 2,4,5 are unchanged from v3.

Best Regards,
Michał Mirosław


v1: http://marc.info/?l=linux-netdev&m=129245188832643&w=3

Changes from v3:
 - fixed kernel-doc and other comments
 - added HIGHDMA to never-changeable features
 - changed GFEATURES .size interpretation
 - changed feature strings
 - change __ethtool_set_flags() to reject invalid changes

Changes from v2:
 - rebase to net-next after merging v2 leading patches
 - fix missing comma in feature name table
 - force NETIF_F_SOFT_FEATURES in hw_features for simpler code
   (fixes a bug that disallowed changing GSO and GRO state)

Changes from v1:
 - split structures for GFEATURES/SFEATURES
 - naming of feature bits using GSTRINGS ETH_SS_FEATURES
 - strict checking of bits used in SFEATURES call
 - more comments and kernel-doc
 - rebased to net-next after 2.6.37


---

Michał Mirosław (5):
  net: Introduce new feature setting ops
  net: ethtool: use ndo_fix_features for offload setting
  net: use ndo_fix_features for ethtool_ops->set_flags
  net: introduce NETIF_F_RXCSUM
  loopback: convert to hw_features

 drivers/net/loopback.c    |    9 +-
 include/linux/ethtool.h   |   86 ++++++++-
 include/linux/netdevice.h |   48 +++++-
 net/core/dev.c            |   49 ++++-
 net/core/ethtool.c        |  481 ++++++++++++++++++++++++++++-----------------
 5 files changed, 480 insertions(+), 193 deletions(-)

-- 
1.7.2.3


^ permalink raw reply

* [PATCH v4 3/5] net: use ndo_fix_features for ethtool_ops->set_flags
From: Michał Mirosław @ 2011-02-03 14:21 UTC (permalink / raw)
  To: netdev; +Cc: Ben Hutchings
In-Reply-To: <cover.1296741561.git.mirq-linux@rere.qmqm.pl>

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
---
 net/core/ethtool.c |   31 +++++++++++++++++++++++++++++--
 1 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 555accf..6e7c6f2 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -240,6 +240,34 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
 	return ret;
 }
 
+static int __ethtool_set_flags(struct net_device *dev, u32 data)
+{
+	u32 changed;
+
+	if (data & ~flags_dup_features)
+		return -EINVAL;
+
+	/* legacy set_flags() op */
+	if (dev->ethtool_ops->set_flags) {
+		if (unlikely(dev->hw_features & flags_dup_features))
+			netdev_warn(dev,
+				"driver BUG: mixed hw_features and set_flags()\n");
+		return dev->ethtool_ops->set_flags(dev, data);
+	}
+
+	/* allow changing only bits set in hw_features */
+	changed = (data ^ dev->wanted_features) & flags_dup_features;
+	if (changed & ~dev->hw_features)
+		return -EOPNOTSUPP;
+
+	dev->wanted_features =
+		(dev->wanted_features & ~changed) | data;
+
+	netdev_update_features(dev);
+
+	return 0;
+}
+
 static u32 ethtool_get_feature_mask(u32 eth_cmd)
 {
 	/* feature masks of legacy discrete ethtool ops */
@@ -1733,8 +1761,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 					ethtool_op_get_flags));
 		break;
 	case ETHTOOL_SFLAGS:
-		rc = ethtool_set_value(dev, useraddr,
-				       dev->ethtool_ops->set_flags);
+		rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags);
 		break;
 	case ETHTOOL_GPFLAGS:
 		rc = ethtool_get_value(dev, useraddr, ethcmd,
-- 
1.7.2.3


^ permalink raw reply related

* Re: [PATCH] tcp: Increase the initial congestion window to 10.
From: John Heffner @ 2011-02-03 14:17 UTC (permalink / raw)
  To: Hagen Paul Pfeifer; +Cc: David Miller, netdev, dccp, therbert
In-Reply-To: <47859e9273bb0c1b2c32fc1adfa450ee@localhost>

On Thu, Feb 3, 2011 at 9:00 AM, Hagen Paul Pfeifer <hagen@jauu.net> wrote:
>
> On Wed, 02 Feb 2011 17:07:50 -0800 (PST), David Miller wrote:
>
>> +/* TCP initial congestion window */
>> +#define TCP_INIT_CWND                10
>> +
>
> Davem, there is _no_ research how this huge IW will behave in environments
> with a small BDP. Belief it or not, but there are networks out there with a
> BDP similar to ~1980. Why for heaven's sake should this work in these
> environments? There are only _two_ extensive analysis one from Cherry and
> and one from Ilpo. Both analysis focus on current mainstream BDP. I started
> to setup a ns-2 environment but due to lack of time I am a little bit
> behind schedule. Anyway, why not make this a tunable per route knob so that
> it is easier to fix things, e.g. set IW back to 3.


There's already a per-route tunable, right (RTAX_INITCWND)?

  -John

^ permalink raw reply

* Re: [PATCH] tcp: Increase the initial congestion window to 10.
From: Hagen Paul Pfeifer @ 2011-02-03 14:00 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, dccp, therbert
In-Reply-To: <20110202.170750.229739784.davem@davemloft.net>

On Wed, 02 Feb 2011 17:07:50 -0800 (PST), David Miller wrote:

> +/* TCP initial congestion window */

> +#define TCP_INIT_CWND		10

> +

Davem, there is _no_ research how this huge IW will behave in environments

with a small BDP. Belief it or not, but there are networks out there with a

BDP similar to ~1980. Why for heaven's sake should this work in these

environments? There are only _two_ extensive analysis one from Cherry and

and one from Ilpo. Both analysis focus on current mainstream BDP. I started

to setup a ns-2 environment but due to lack of time I am a little bit

behind schedule. Anyway, why not make this a tunable per route knob so that

it is easier to fix things, e.g. set IW back to 3.

HGN

^ permalink raw reply

* Re: [PATCH] NETFILTER module xt_hmark new target for HASH MARK
From: Pablo Neira Ayuso @ 2011-02-03 13:51 UTC (permalink / raw)
  To: Hans Schillstrom; +Cc: kaber, jengelh, netfilter-devel, netdev, hans
In-Reply-To: <1296740050-6311-2-git-send-email-hans.schillstrom@ericsson.com>

On 03/02/11 14:34, Hans Schillstrom wrote:
> +/*
> + * Calc hash value, special casre is taken on icmp and fragmented messages
> + * i.e. fragmented messages don't use ports.
> + */
> +static __u32 get_hash(struct sk_buff *skb, struct xt_hmark_info *info)
> +{
[...]
> +	ip_proto &= info->prmask;
> +	/* get a consistent hash (same value on both flow directions) */
> +	if (addr2 < addr1)
> +		swap(addr1, addr2);

this assumption is not valid in NAT handlings.

If you want consistent hashing with NAT handlings you'll have to make
this stateful and use the conntrack source and reply directions of the
original tuples (thus making it stateful). That may be a problem because
some people may want to use this without enabling connection tracking.

Are you using this for (uplink) load balancing?

Could you also include one realistic example in the patch description on
how this is used?

If this is accepted, I think this has to be merge with the (already
overloaded) MARK target.

^ permalink raw reply

* Re: [PATCH v3 3/5] net: use ndo_fix_features for ethtool_ops->set_flags
From: Michał Mirosław @ 2011-02-03 13:45 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: netdev
In-Reply-To: <1296341043.3477.64.camel@localhost>

On Sun, Jan 30, 2011 at 08:44:03AM +1000, Ben Hutchings wrote:
> On Sat, 2011-01-29 at 19:39 +0100, Michał Mirosław wrote:
> > Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
> > ---
> >  net/core/ethtool.c |   22 ++++++++++++++++++++--
> >  1 files changed, 20 insertions(+), 2 deletions(-)
> > 
> > diff --git a/net/core/ethtool.c b/net/core/ethtool.c
> > index 409aebb..17a689f4 100644
> > --- a/net/core/ethtool.c
> > +++ b/net/core/ethtool.c
> > @@ -240,6 +240,25 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
> >  	return ret;
> >  }
> >  
> > +static int __ethtool_set_flags(struct net_device *dev, u32 data)
> > +{
> > +	if (data & ~flags_dup_features)
> > +		return -EINVAL;
> > +
> > +	if (!(dev->hw_features & flags_dup_features)) {
> > +		if (!dev->ethtool_ops->set_flags)
> > +			return -EOPNOTSUPP;
> > +		return dev->ethtool_ops->set_flags(dev, data);
> > +	}
> > +
> > +	dev->wanted_features =
> > +		(dev->wanted_features & ~flags_dup_features) | data;
> > +
> > +	netdev_update_features(dev);
> > +
> > +	return 0;
> [...]
> 
> I think it would be clearer to write this as:
[cut]

I changed it to check for more invalid states and to reject changing
of bits not in hw_features but present in flags_dup_features.

Best Regards,
Michał Mirosław

^ permalink raw reply

* Re: [PATCH v3 1/5] net: Introduce new feature setting ops
From: Michał Mirosław @ 2011-02-03 13:42 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: netdev
In-Reply-To: <1296339864.3477.52.camel@localhost>

On Sun, Jan 30, 2011 at 08:24:24AM +1000, Ben Hutchings wrote:
> On Sat, 2011-01-29 at 19:39 +0100, Michał Mirosław wrote:
> > This introduces a new framework to handle device features setting.
> > It consists of:
> >   - new fields in struct net_device:
> > 	+ hw_features - features that hw/driver supports toggling
> > 	+ wanted_features - features that user wants enabled, when possible
> >   - new netdev_ops:
> > 	+ feat = ndo_fix_features(dev, feat) - API checking constraints for
> > 		enabling features or their combinations
> > 	+ ndo_set_features(dev) - API updating hardware state to match
> > 		changed dev->features
> >   - new ethtool commands:
> > 	+ ETHTOOL_GFEATURES/ETHTOOL_SFEATURES: get/set dev->wanted_features
> > 		and trigger device reconfiguration if resulting dev->features
> > 		changed
> > 	+ ETHTOOL_GSTRINGS(ETH_SS_FEATURES): get feature bits names (meaning)
> > 
> > Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
> > ---
> >  include/linux/ethtool.h   |   86 +++++++++++++++++++++++++
> >  include/linux/netdevice.h |   44 ++++++++++++-
> >  net/core/dev.c            |   47 ++++++++++++--
> >  net/core/ethtool.c        |  154 +++++++++++++++++++++++++++++++++++++++++----
> >  4 files changed, 312 insertions(+), 19 deletions(-)
> > 
> > diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
> > index 1908929..b832083 100644
> > --- a/include/linux/ethtool.h
> > +++ b/include/linux/ethtool.h
> > @@ -251,6 +251,7 @@ enum ethtool_stringset {
> >  	ETH_SS_STATS,
> >  	ETH_SS_PRIV_FLAGS,
> >  	ETH_SS_NTUPLE_FILTERS,
> > +	ETH_SS_FEATURES,
> >  };
> >  
> >  /* for passing string sets for data tagging */
> > @@ -523,6 +524,88 @@ struct ethtool_flash {
> >  	char	data[ETHTOOL_FLASH_MAX_FILENAME];
> >  };
> >  
> > +/* for returning and changing feature sets */
> > +
> > +/**
> > + * struct ethtool_get_features_block - block with state of 32 features
> > + * @avaliable: mask of changeable features
> Typo, should be @available.

Fixed.

> > + * @requested: mask of features requested to be enabled if possible
> > + * @active: mask of currently enabled features
> > + * @never_changed: mask of never-changeable features
> 
> I don't think the description of never_changed is clear enough.  It
> should be described as something like:
>     Mask of feature flags that are not changeable for any device.

Fixed. I shortened the text a bit.

> > + */
> > +struct ethtool_get_features_block {
> > +	__u32	available;	/* features togglable */
> > +	__u32	requested;	/* features requested to be enabled */
> > +	__u32	active;		/* features currently enabled */
> > +	__u32	never_changed;	/* never-changeable features */
> 
> Don't comment the fields inline as well as in the kernel-doc.

Removed this and other occurrences.

> > +};
> > +
> > +/**
> > + * struct ethtool_gfeatures - command to get state of device's features
> > + * @cmd: command number = %ETHTOOL_GFEATURES
> > + * @size: in: array size of the features[] array
> > + *       out: count of features[] elements filled
> 
> The value on output should be the size required to read all features, so
> that the caller can discover it easily.
> 
> The two lines describing 'size' will be wrapped together when converted
> to another format (manual page, HTML...).  You need to use at least a
> full stop (period) to separate them.

Fixed. Anyway, userspace should use GSSET_INFO/SS_FEATURES for this.

> [...]
> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > index c7d7074..6490860 100644
> > --- a/include/linux/netdevice.h
> > +++ b/include/linux/netdevice.h
> > @@ -783,6 +783,16 @@ struct netdev_tc_txq {
> >   *	Set hardware filter for RFS.  rxq_index is the target queue index;
> >   *	flow_id is a flow ID to be passed to rps_may_expire_flow() later.
> >   *	Return the filter ID on success, or a negative error code.
> > + *
> > + * u32 (*ndo_fix_features)(struct net_device *dev, u32 features);
> > + *	Modifies features supported by device depending on device-specific
> > + *	constraints. Should not modify hardware state.
> 
> I don't think this wording is clear enough.  How about:
> 
>     Adjusts the requested feature flags according to device-specific
>     constraints, and returns the resulting flags.  Must not modify
>     the device state.

I like your version better, too. I also updated ndo_set_features() description.

> [...]
> > @@ -954,6 +974,12 @@ struct net_device {
> >  #define NETIF_F_TSO6		(SKB_GSO_TCPV6 << NETIF_F_GSO_SHIFT)
> >  #define NETIF_F_FSO		(SKB_GSO_FCOE << NETIF_F_GSO_SHIFT)
> >  
> > +	/* Features valid for ethtool to change */
> > +	/* = all defined minus driver/device-class-related */
> > +#define NETIF_F_NEVER_CHANGE	(NETIF_F_VLAN_CHALLENGED | \
> > +				  NETIF_F_LLTX | NETIF_F_NETNS_LOCAL)
> 
> Shouldn't NETIF_F_NO_CSUM and NETIF_F_HIGHDMA be included in this?  (And
> excluded from NETIF_F_ALL_TX_OFFLOADS.)

NETIF_F_HIGHDMA added. NO_CSUM can be changed for some software devices
like bridge or bond.

> > +#define NETIF_F_ETHTOOL_BITS	(0x1f3fffff & ~NETIF_F_NEVER_CHANGE)
> > +
> >  	/* List of features with software fallbacks. */
> >  #define NETIF_F_GSO_SOFTWARE	(NETIF_F_TSO | NETIF_F_TSO_ECN | \
> >  				 NETIF_F_TSO6 | NETIF_F_UFO)
> > @@ -964,6 +990,12 @@ struct net_device {
> >  #define NETIF_F_V6_CSUM		(NETIF_F_GEN_CSUM | NETIF_F_IPV6_CSUM)
> >  #define NETIF_F_ALL_CSUM	(NETIF_F_V4_CSUM | NETIF_F_V6_CSUM)
> >  
> > +#define NETIF_F_ALL_TSO 	(NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_TSO_ECN)
> > +
> > +#define NETIF_F_ALL_TX_OFFLOADS	(NETIF_F_ALL_CSUM | NETIF_F_SG | \
> > +				 NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \
> > +				 NETIF_F_SCTP_CSUM | NETIF_F_FCOE_CRC)
> > +
> >  	/*
> >  	 * If one device supports one of these features, then enable them
> >  	 * for all in netdev_increment_features.
> [...]
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index ddd5df2..0ccbee6 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> [...]
> > @@ -5267,6 +5273,35 @@ u32 netdev_fix_features(struct net_device *dev, u32 features)
> >  }
> >  EXPORT_SYMBOL(netdev_fix_features);
> >  
> > +void netdev_update_features(struct net_device *dev)
> > +{
> > +	u32 features;
> > +	int err = 0;
> > +
> > +	features = netdev_get_wanted_features(dev);
> > +
> > +	if (dev->netdev_ops->ndo_fix_features)
> > +		features = dev->netdev_ops->ndo_fix_features(dev, features);
> > +
> > +	/* driver might be less strict about feature dependencies */
> > +	features = netdev_fix_features(dev, features);
> > +
> > +	if (dev->features == features)
> > +		return;
> > +
> > +	netdev_info(dev, "Features changed: 0x%08x -> 0x%08x\n",
> > +		dev->features, features);
> > +
> > +	if (dev->netdev_ops->ndo_set_features)
> > +		err = dev->netdev_ops->ndo_set_features(dev, features);
> > +
> > +	if (!err)
> > +		dev->features = features;
> > +	else if (err < 0)
> > +		netdev_err(dev, "set_features() failed (%d)\n", err);
> 
> The error message should include the feature flags passed, since the
> previous informational message may be filtered out.

Fixed. This will make checkpatch.pl complain about long line, but I don't
want to split the message's text as it'd be hard to grep for it then.

> > +}
> > +EXPORT_SYMBOL(netdev_update_features);
> > +
> >  /**
> >   *	netif_stacked_transfer_operstate -	transfer operstate
> >   *	@rootdev: the root or lower level device to transfer state from
> [...]
> > diff --git a/net/core/ethtool.c b/net/core/ethtool.c
> > index 5984ee0..1420edd 100644
> > --- a/net/core/ethtool.c
> > +++ b/net/core/ethtool.c
> > @@ -55,6 +55,7 @@ int ethtool_op_set_tx_csum(struct net_device *dev, u32 data)
> >  
> >  	return 0;
> >  }
> > +EXPORT_SYMBOL(ethtool_op_set_tx_csum);
> >  
> >  int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data)
> >  {
> > @@ -171,6 +172,136 @@ EXPORT_SYMBOL(ethtool_ntuple_flush);
> >  
> >  /* Handlers for each ethtool command */
> >  
> > +#define ETHTOOL_DEV_FEATURE_WORDS	1
> > +
> > +static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
> > +{
> > +	struct ethtool_gfeatures cmd = {
> > +		.cmd = ETHTOOL_GFEATURES,
> > +		.size = ETHTOOL_DEV_FEATURE_WORDS,
> > +	};
> > +	struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS] = {
> > +		{
> > +			.available = dev->hw_features,
> > +			.requested = dev->wanted_features,
> > +			.active = dev->features,
> > +			.never_changed = NETIF_F_NEVER_CHANGE,
> > +		},
> > +	};
> > +	u32 __user *sizeaddr;
> > +	u32 in_size;
> > +
> > +	sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size);
> > +	if (get_user(in_size, sizeaddr))
> > +		return -EFAULT;
> > +
> > +	if (in_size < ETHTOOL_DEV_FEATURE_WORDS)
> > +		return -EINVAL;
> I don't think this should be considered invalid.  Instead:
> 
> 	u32 copy_size;
> 	...
> 	copy_size = min_t(u32, in_size, ETHTOOL_DEV_FEATURE_WORDS);

> > +	if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
> > +		return -EFAULT;
> > +	useraddr += sizeof(cmd);
> > +	if (copy_to_user(useraddr, features, sizeof(features)))
> 
> and:
> 	if (copy_to_user(useraddr, features, copy_size * sizeof(features[0]))

Fixed to equivalent code but without additional variable.

> [...]
> > +static const char netdev_features_strings[ETHTOOL_DEV_FEATURE_WORDS * 32][ETH_GSTRING_LEN] = {
> > +	/* NETIF_F_SG */              "scatter-gather",
> SG really means TX DMA gather only, as the driver is responsible for
> allocating its own RX buffers.

I changed this and other feature strings to prefix them with "tx-" or "rx-"
as apropriate.

> > +	/* NETIF_F_IP_CSUM */         "tx-checksum-hw-ipv4",
> > +	/* NETIF_F_NO_CSUM */         "tx-checksum-local",
> > +	/* NETIF_F_HW_CSUM */         "tx-checksum-hw-ip-generic",
> > +	/* NETIF_F_IPV6_CSUM */       "tx_checksum-hw-ipv6",
> > +	/* NETIF_F_HIGHDMA */         "highdma",
> > +	/* NETIF_F_FRAGLIST */        "scatter-gather-fraglist",
> > +	/* NETIF_F_HW_VLAN_TX */      "tx-vlan-hw",
> > +
> > +	/* NETIF_F_HW_VLAN_RX */      "rx-vlan-hw",
> > +	/* NETIF_F_HW_VLAN_FILTER */  "rx-vlan-filter",
> > +	/* NETIF_F_VLAN_CHALLENGED */ "*vlan-challenged",
> Don't mark the unchangeable features specially here; that can be done by
> userland.  Actually, I wonder whether they really need descriptions at
> all.

Removed the markings. I think it's still good to have them for debugging
purposes - ethtool can show them then. At least vlan-challenged state is
useful information.

> > +	/* NETIF_F_GSO */             "generic-segmentation-offload",
> > +	/* NETIF_F_LLTX */            "*lockless-tx",
> > +	/* NETIF_F_NETNS_LOCAL */     "*netns-local",
> > +	/* NETIF_F_GRO */             "generic-receive-offload",
> > +	/* NETIF_F_LRO */             "large-receive-offload",
> > +
> > +	/* NETIF_F_TSO */             "tcp-segmentation-offload",
> > +	/* NETIF_F_UFO */             "udp-fragmentation-offload",
> > +	/* NETIF_F_GSO_ROBUST */      "gso-robust",
> > +	/* NETIF_F_TSO_ECN */         "tcp-ecn-segmentation-offload",
> > +	/* NETIF_F_TSO6 */            "ipv6-tcp-segmentation-offload",
> > +	/* NETIF_F_FSO */             "fcoe-segmentation-offload",
> > +	"",
> > +	"",
> > +
> > +	/* NETIF_F_FCOE_CRC */        "tx-checksum-fcoe-crc",
> > +	/* NETIF_F_SCTP_CSUM */       "tx-checksum-sctp",
> > +	/* NETIF_F_FCOE_MTU */        "fcoe-mtu",
> > +	/* NETIF_F_NTUPLE */          "ntuple-filter",
> [...]
> 
> I think this should be named 'rx-ntuple-filter'.  TX filtering may be
> controlled for related devices (VFs) and is completely separate from
> this.

Changed, as above.

Thanks for the review!

Best Regards,
Michał Mirosław

^ permalink raw reply

* [PATCH] NETFILTER userspace part for target HMARK
From: Hans Schillstrom @ 2011-02-03 13:34 UTC (permalink / raw)
  To: kaber, jengelh, netfilter-devel, netdev; +Cc: hans, Hans Schillstrom
In-Reply-To: <1296740050-6311-1-git-send-email-hans.schillstrom@ericsson.com>

The target allows you to create rules in the "raw" and "mangle" tables
which alter the netfilter mark (nfmark) field within a given range.
First a 32 bit hash value is generated then modulus by <limit> and
finally an offset is added before it's written to nfmark.
Prior to routing, the nfmark can influence the routing method (see
"Use netfilter MARK value as routing key") and can also be used by
other subsystems to change their behaviour.

The mark match can also be used to match nfmark produced by this module.

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
---
 extensions/libxt_HMARK.c           |  366 ++++++++++++++++++++++++++++++++++++
 extensions/libxt_HMARK.man         |   60 ++++++
 include/linux/netfilter/xt_hmark.h |   28 +++
 3 files changed, 454 insertions(+), 0 deletions(-)
 create mode 100644 extensions/libxt_HMARK.c
 create mode 100644 extensions/libxt_HMARK.man
 create mode 100644 include/linux/netfilter/xt_hmark.h

diff --git a/extensions/libxt_HMARK.c b/extensions/libxt_HMARK.c
new file mode 100644
index 0000000..04ee516
--- /dev/null
+++ b/extensions/libxt_HMARK.c
@@ -0,0 +1,366 @@
+/*
+ * Shared library add-on to iptables to add HMARK target support.
+ *
+ * The kernel module calculates a hash value that can be modified by modulus
+ * and an offset. The hash value is based on a direction independent
+ * five tuple: src & dst addr src & dst ports and protocol.
+ * However src & dst port can be masked and are not used for fragmented
+ * packets, ESP and AH don't have ports so SPI will be used instead.
+ * For ICMP error messages the hash mark values will be calculated on
+ * the source packet i.e. the packet caused the error (If sufficient
+ * amount of data exists).
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <getopt.h>
+
+#include <xtables.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_hmark.h>
+
+/*
+ * Flags must not start at 0, since it's used as none.
+ */
+enum {
+	XT_HMARK_SADR_AND = 1,
+	XT_HMARK_DADR_AND,
+	XT_HMARK_SPI_AND,
+	XT_HMARK_SPI_OR,
+	XT_HMARK_SPORT_AND,
+	XT_HMARK_DPORT_AND,
+	XT_HMARK_SPORT_OR,
+	XT_HMARK_DPORT_OR,
+	XT_HMARK_PROTO_AND,
+	XT_HMARK_RND,
+	XT_HMARK_MODULUS,
+	XT_HMARK_OFFSET,
+};
+
+#define DEF_HRAND 0xc175a3b8	/* Default "random" value to jhash */
+
+static void HMARK_help(void)
+{
+	printf(
+"HMARK target options, i.e. modify hash calculation by:\n"
+"  --hmark-smask value                Mask source address with value\n"
+"  --hmark-dmask value                Mask Dest. address with value\n"
+"  --hmark-sp-mask value              Mask src port with value\n"
+"  --hmark-dp-mask value              Mask dst port with value\n"
+"  --hmark-spi-mask value             For esp and ah AND spi with value\n"
+"  --hmark-sp-set value               OR src port with value\n"
+"  --hmark-dp-set value               OR dst port with value\n"
+"  --hmark-spi-set value              For esp and ah OR spi with value\n"
+"  --hmark-proto-mask value           Mask Protocol with value\n"
+"  --hmark-rnd                        Random value to hash cacl.\n"
+"  Limit/modify the calculated hash mark by:\n"
+"  --hmark-mod value                  nfmark modulus value\n"
+"  --hmark-offs value                 Last action add value to nfmark\n"
+" In many cases hmark can be omitted i.e. --smask can be used\n");
+}
+
+static const struct option HMARK_opts[] = {
+	{ "hmark-smask", 1, NULL, XT_HMARK_SADR_AND },
+	{ "hmark-dmask", 1, NULL, XT_HMARK_DADR_AND },
+	{ "hmark-sp-mask", 1, NULL, XT_HMARK_SPORT_AND },
+	{ "hmark-dp-mask", 1, NULL, XT_HMARK_DPORT_AND },
+	{ "hmark-spi-mask", 1, NULL, XT_HMARK_SPI_AND },
+	{ "hmark-sp-set", 1, NULL, XT_HMARK_SPORT_OR },
+	{ "hmark-dp-set", 1, NULL, XT_HMARK_DPORT_OR },
+	{ "hmark-spi-set", 1, NULL, XT_HMARK_SPI_OR },
+	{ "hmark-proto-mask", 1, NULL, XT_HMARK_PROTO_AND },
+	{ "hmark-rnd", 1, NULL, XT_HMARK_RND },
+	{ "hmark-mod", 1, NULL, XT_HMARK_MODULUS },
+	{ "hmark-offs", 1, NULL, XT_HMARK_OFFSET },
+	{ "smask", 1, NULL, XT_HMARK_SADR_AND },
+	{ "dmask", 1, NULL, XT_HMARK_DADR_AND },
+	{ "sp-mask", 1, NULL, XT_HMARK_SPORT_AND },
+	{ "dp-mask", 1, NULL, XT_HMARK_DPORT_AND },
+	{ "spi-mask", 1, NULL, XT_HMARK_SPI_AND },
+	{ "sp-set", 1, NULL, XT_HMARK_SPORT_OR },
+	{ "dp-set", 1, NULL, XT_HMARK_DPORT_OR },
+	{ "spi-set", 1, NULL, XT_HMARK_SPI_OR },
+	{ "proto-mask", 1, NULL, XT_HMARK_PROTO_AND },
+	{ "rnd", 1, NULL, XT_HMARK_RND },
+	{ "mod", 1, NULL, XT_HMARK_MODULUS },
+	{ "offs", 1, NULL, XT_HMARK_OFFSET },
+	{ .name = NULL }
+};
+
+static int
+HMARK_parse(int c, char **argv, int invert, unsigned int *flags,
+	    const void *entry, struct xt_entry_target **target)
+{
+	struct xt_hmark_info *hmarkinfo
+		= (struct xt_hmark_info *)(*target)->data;
+	unsigned int value = 0xffffffff;
+	unsigned int maxint = UINT32_MAX;
+
+	if ((c < XT_HMARK_SADR_AND) || (c > XT_HMARK_OFFSET)) {
+		xtables_error(PARAMETER_PROBLEM, "Bad HMARK option \"%s\"",
+			      optarg);
+		return 0;
+	}
+
+	if (c >= XT_HMARK_SPORT_AND && c <= XT_HMARK_DPORT_OR)
+		maxint = UINT16_MAX;
+	else if (c == XT_HMARK_PROTO_AND)
+		maxint = UINT8_MAX;
+
+	if (!xtables_strtoui(optarg, NULL, &value, 0, maxint))
+		xtables_error(PARAMETER_PROBLEM, "Bad HMARK value \"%s\"",
+			      optarg);
+
+	if (*flags == 0) {
+		memset(hmarkinfo, 0xff, sizeof(struct xt_hmark_info));
+		hmarkinfo->pset.v32 = 0;
+		hmarkinfo->flags = 0;
+		hmarkinfo->spiset = 0;
+		hmarkinfo->hoffs = 0;
+		hmarkinfo->hashrnd = DEF_HRAND;
+	}
+	switch (c) {
+	case XT_HMARK_SADR_AND:
+		if (*flags & (1 << c)) {
+			xtables_error(PARAMETER_PROBLEM,
+				      "Can only specify "
+				      "`--hmark-smask' once");
+		}
+		hmarkinfo->smask = htonl(value);
+		if (value == maxint)
+			c = 0;
+		break;
+
+	case XT_HMARK_DADR_AND:
+		if (*flags & (1 << c)) {
+			xtables_error(PARAMETER_PROBLEM,
+				      "Can only specify "
+				      "`--hmark-dmask' once");
+		}
+		hmarkinfo->dmask = htonl(value);
+		if (value == maxint)
+			c = 0;
+		break;
+
+	case XT_HMARK_MODULUS:
+		if (*flags & (1 << c)) {
+			xtables_error(PARAMETER_PROBLEM,
+				      "Can only specify "
+				      "`--hmark-mod' once");
+		}
+		if (value == 0) {
+			xtables_error(PARAMETER_PROBLEM,
+				      "xxx modulus 0 ? "
+				      "thats a div by 0");
+			value = 0xffffffff;
+		}
+		hmarkinfo->hmod = value;
+		if (value == maxint)
+			c = 0;
+		break;
+
+	case XT_HMARK_OFFSET:
+		if (*flags & (1 << c)) {
+			xtables_error(PARAMETER_PROBLEM,
+				      "Can only specify "
+				      "`--hmark-offs' once");
+		}
+		hmarkinfo->hoffs = value;
+		if (value == 0)
+			c = 0;
+		break;
+
+	case XT_HMARK_SPORT_AND:
+		if (*flags & (1 << c)) {
+			xtables_error(PARAMETER_PROBLEM,
+				      "Can only specify "
+				      "`--hmark-sp-mask' once");
+		}
+		hmarkinfo->pmask.p16.src = htons(value);
+		if (value == maxint)
+			c = 0;
+		break;
+
+	case XT_HMARK_DPORT_AND:
+		if (*flags & (1 << c)) {
+			xtables_error(PARAMETER_PROBLEM,
+				      "Can only specify "
+				      "`--hmark-dp-mask' once");
+		}
+		hmarkinfo->pmask.p16.dst = htons(value);
+		if (value == maxint)
+			c = 0;
+		break;
+
+	case XT_HMARK_SPI_AND:
+		if (*flags & (1 << c)) {
+			xtables_error(PARAMETER_PROBLEM,
+				      "Can only specify "
+				      "`--hmark-spi-mask' once");
+		}
+		hmarkinfo->spimask = htonl(value);
+		if (value == maxint)
+			c = 0;
+		break;
+
+	case XT_HMARK_SPORT_OR:
+		if (*flags & (1 << c)) {
+			xtables_error(PARAMETER_PROBLEM,
+				      "Can only specify "
+				      "`--hmark-sp-set' once");
+		}
+		hmarkinfo->pset.p16.src = htons(value);
+		if (!value)
+			c = 0;
+		break;
+
+	case XT_HMARK_DPORT_OR:
+		if (*flags & (1 << c)) {
+			xtables_error(PARAMETER_PROBLEM,
+				      "Can only specify "
+				      "`--hmark-dp-set' once");
+		}
+		hmarkinfo->pset.p16.dst = htons(value);
+		if (!value)
+			c = 0;
+		break;
+
+	case XT_HMARK_SPI_OR:
+		if (*flags & (1 << c)) {
+			xtables_error(PARAMETER_PROBLEM,
+				      "Can only specify "
+				      "`--hmark-spi-set' once");
+		}
+		hmarkinfo->spiset = htonl(value);
+		if (!value)
+			c = 0;
+		break;
+
+	case XT_HMARK_PROTO_AND:
+		if (*flags & (1 << c)) {
+			xtables_error(PARAMETER_PROBLEM,
+				      "Can only specify "
+				      "`--hmark-proto-mask' once");
+		}
+		hmarkinfo->prmask = value;
+		if (value == maxint)
+			c = 0;
+		break;
+
+	case XT_HMARK_RND:
+		if (*flags & (1 << c)) {
+			xtables_error(PARAMETER_PROBLEM,
+				      "Can only specify `--hmark-rnd' once");
+		}
+		hmarkinfo->hashrnd = value;
+		if (value == DEF_HRAND)
+			c = 0;
+		break;
+
+	default:
+		return 0;
+	}
+	*flags |= 1 << c;
+	hmarkinfo->flags = *flags;
+
+	return 1;
+}
+
+static void HMARK_check(unsigned int flags)
+{
+	if (!(flags & (1 << XT_HMARK_MODULUS)))
+		xtables_error(PARAMETER_PROBLEM, "HMARK: the --hmark-mod, "
+			   "is not set, that means the nfmark will be in range"
+			   " 0 - 0xffffffff");
+}
+
+static void HMARK_print(const void *ip, const struct xt_entry_target *target,
+			int numeric)
+{
+	const struct xt_hmark_info *info =
+			(const struct xt_hmark_info *)target->data;
+
+	printf("HMARK ");
+	if (info->flags & (1 << XT_HMARK_SADR_AND))
+		printf("smask 0x%x ", htonl(info->smask));
+	if (info->flags & (1 << XT_HMARK_DADR_AND))
+		printf("dmask 0x%x ", htonl(info->dmask));
+	if (info->flags & (1 << XT_HMARK_SPORT_AND))
+		printf("sp-mask 0x%x ", htons(info->pmask.p16.src));
+	if (info->flags & (1 << XT_HMARK_DPORT_AND))
+		printf("dp-mask 0x%x ", htons(info->pmask.p16.dst));
+	if (info->flags & (1 << XT_HMARK_SPI_AND))
+		printf("spi-mask 0x%x ", htonl(info->spimask));
+	if (info->flags & (1 << XT_HMARK_SPORT_OR))
+		printf("sp-set 0x%x ", htons(info->pset.p16.src));
+	if (info->flags & (1 << XT_HMARK_DPORT_OR))
+		printf("dp-set 0x%x ", htons(info->pset.p16.dst));
+	if (info->flags & (1 << XT_HMARK_SPI_OR))
+		printf("spi-set 0x%x ", htonl(info->spiset));
+	if (info->flags & (1 << XT_HMARK_PROTO_AND))
+		printf("proto-mask 0x%x ", info->prmask);
+	if (info->flags & (1 << XT_HMARK_RND))
+		printf("rnd 0x%x ", info->hashrnd);
+	if (info->flags & (1 << XT_HMARK_MODULUS))
+		printf("mark=hv %% 0x%x ", info->hmod);
+	if (info->flags & (1 << XT_HMARK_OFFSET))
+		printf("+ 0x%x ", info->hoffs);
+}
+
+static void HMARK_save(const void *ip, const struct xt_entry_target *target)
+{
+	const struct xt_hmark_info *info =
+		(const struct xt_hmark_info *)target->data;
+
+	if (info->flags & (1 << XT_HMARK_SADR_AND))
+		printf("--hmark-smask 0x%x ", htonl(info->smask));
+	if (info->flags & (1 << XT_HMARK_DADR_AND))
+		printf("--hmark-dmask 0x%x ", htonl(info->dmask));
+	if (info->flags & (1 << XT_HMARK_SPORT_AND))
+		printf("--hmark-sp-mask 0x%x ", htons(info->pmask.p16.src));
+	if (info->flags & (1 << XT_HMARK_DPORT_AND))
+		printf("--hmark-dp-mask 0x%x ", htons(info->pmask.p16.dst));
+	if (info->flags & (1 << XT_HMARK_SPI_AND))
+		printf("--hmark-spi-mask 0x%x ", htonl(info->spimask));
+	if (info->flags & (1 << XT_HMARK_SPORT_OR))
+		printf("--hmark-sp-set 0x%x ", htons(info->pset.p16.src));
+	if (info->flags & (1 << XT_HMARK_DPORT_OR))
+		printf("--hmark-dp-set 0x%x ", htons(info->pset.p16.dst));
+	if (info->flags & (1 << XT_HMARK_SPI_OR))
+		printf("--hmark-spi-set 0x%x ", htonl(info->spiset));
+	if (info->flags & (1 << XT_HMARK_PROTO_AND))
+		printf("--hmark-proto-mask 0x%x ", info->prmask);
+	if (info->flags & (1 << XT_HMARK_RND))
+		printf("--hmark-rnd 0x%x ", info->hashrnd);
+	if (info->flags & (1 << XT_HMARK_MODULUS))
+		printf("--hmark-mod 0x%x ", info->hmod);
+	if (info->flags & (1 << XT_HMARK_OFFSET))
+		printf("--hmark-offs 0x%x ", info->hoffs);
+}
+
+static struct xtables_target mark_tg_reg[] = {
+	{
+		.family        = NFPROTO_UNSPEC,
+		.name          = "HMARK",
+		.version       = XTABLES_VERSION,
+		.revision      = 0,
+		.size          = XT_ALIGN(sizeof(struct xt_hmark_info)),
+		.userspacesize = XT_ALIGN(sizeof(struct xt_hmark_info)),
+		.help          = HMARK_help,
+		.parse         = HMARK_parse,
+		.final_check   = HMARK_check,
+		.print         = HMARK_print,
+		.save          = HMARK_save,
+		.extra_opts    = HMARK_opts,
+	},
+};
+
+void _init(void)
+{
+	xtables_register_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg));
+}
+
diff --git a/extensions/libxt_HMARK.man b/extensions/libxt_HMARK.man
new file mode 100644
index 0000000..f24ac9b
--- /dev/null
+++ b/extensions/libxt_HMARK.man
@@ -0,0 +1,60 @@
+This module does the same as MARK, i.e. set an fwmark, but the mark is based on a hash value.
+The hash is based on saddr, daddr, sport, dport and proto. The same mark will be produced independet of direction if no masks is set or the same masks is used for src and dest.
+The hash mark could be adjusted by modulus and finally an offset could be added, i.e the final mark will be within a range. If state RELATED is used icmp will be handled also, i.e. hash will be calculated on the original message not the icmp it self.
+Note: None of the parameters effect the packet it self only the calculated hash value.
+.PP
+Parameters:
+For all masks default is all "1:s", to disable a field use mask 0
+For IPv6 it's just the last 32 bits that is included in the hash
+.TP
+\fB\-\-hmark\-smask\fP \fIvalue\fP
+The value to AND the source address with (saddr & value).
+.TP
+\fB\-\-hmark\-dmask\fP \fIvalue\fP
+The value to AND the dest. address with (daddr & value).
+.TP
+\fB\-\-hmark\-sp\-mask\fP \fIvalue\fP
+A 16 bit value to AND the src port with (sport & value).
+.TP
+\fB\-\-hmark\-dp\-mask\fP \fIvalue\fP
+A 16 bit value to AND the dest port with (dport & value).
+.TP
+\fB\-\-hmark\-sp\-set\fP \fIvalue\fP
+A 16 bit value to OR the src port with (sport | value).
+.TP
+\fB\-\-hmark\-dp\-set\fP \fIvalue\fP
+A 16 bit value to OR the dest port with (dport | value).
+.TP
+\fB\-\-hmark\-spi\-mask\fP \fIvalue\fP
+Value to AND the spi field with (spi & value) valid for proto esp or ah.
+.TP
+\fB\-\-hmark\-spi\-set\fP \fIvalue\fP
+Value to OR the spi field with (spi | value) valid for proto esp or ah.
+.TP
+\fB\-\-hmark\-proto\-mask\fP \fIvalue\fP
+An 8 bit value to AND the L4 proto field with (proto & value).
+.TP
+\fB\-\-hmark\-rnd\fP \fIvalue\fP
+A 32 bit initial value for hash calc, default is 0xc175a3b8.
+.PP
+Final processing of the mark in order of execution.
+.TP
+\fB\-\-hmark\-mod\fP \fvalue (must be > 0)\fP
+The easiest way to describe this is:  hash = hash mod <value>
+.TP
+\fB\-\-hmark\-offs\fP \fvalue\fP
+The easiest way to describe this is:  hash = hash + <value>
+.PP
+\fIExamples:\fP
+.PP
+Default rule handles all TCP, UDP, SCTP, ESP & AH
+.IP
+iptables \-t mangle \-A PREROUTING \-m state \-\-state NEW,ESTABLISHED,RELATED
+ \-j HMARK \-\-hmark-offs 10000 \-\-hmark-mod 10
+.PP
+Handle SCTP and hash dest port only and produce a nfmark between 100-119.
+.IP
+iptables \-t mangle \-A PREROUTING -p SCTP \-j HMARK \-\-smask 0 \-\-dmask 0
+ \-\-sp\-mask 0 \-\-offs 100 \-\-mod 20
+.PP
+
diff --git a/include/linux/netfilter/xt_hmark.h b/include/linux/netfilter/xt_hmark.h
new file mode 100644
index 0000000..3f7ecc6
--- /dev/null
+++ b/include/linux/netfilter/xt_hmark.h
@@ -0,0 +1,28 @@
+#ifndef XT_HMARK_H_
+#define XT_HMARK_H_
+
+#include <linux/types.h>
+
+union ports {
+	struct {
+		__u16	src;
+		__u16	dst;
+	} p16;
+	__u32	v32;
+};
+
+struct xt_hmark_info {
+	__u32		smask;		/* Source address mask */
+	__u32		dmask;		/* Dest address mask */
+	union ports	pmask;
+	union ports	pset;
+	__u32		spimask;
+	__u32		spiset;
+	__u16		flags;		/* Print out only */
+	__u16		prmask;		/* L4 Proto mask */
+	__u32		hashrnd;
+	__u32		hmod;		/* Modulus */
+	__u32		hoffs;		/* Offset */
+};
+
+#endif /* XT_HMARK_H_ */
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH] NETFILTER module xt_hmark new target for HASH MARK
From: Hans Schillstrom @ 2011-02-03 13:34 UTC (permalink / raw)
  To: kaber, jengelh, netfilter-devel, netdev; +Cc: hans, Hans Schillstrom
In-Reply-To: <1296740050-6311-1-git-send-email-hans.schillstrom@ericsson.com>

The target allows you to create rules in the "raw" and "mangle" tables
which alter the netfilter mark (nfmark) field within a given range.
First a 32 bit hash value is generated then modulus by <limit> and
finally an offset is added before it's written to nfmark.
Prior to routing, the nfmark can influence the routing method (see
"Use netfilter MARK value as routing key") and can also be used by
other subsystems to change their behavior.

man page
   HMARK
       This  module  does  the same as MARK, i.e. set an fwmark,
       but the mark is based on a hash value.  The hash is based on
       saddr, daddr, sport, dport and proto. The same mark will be produced
       independet of direction if no masks is set or the same masks is used for
       src and dest. The hash mark could be adjusted by modulus and finaly an
       offset could be added, i.e the final mark will be within a range.
       ICMP errors will have hash calc based on the original message.
       Note: None of the parameters effect the packet it self
       only the calculated hash value.

       Parameters: For all masks default is all "1:s", to disable a field
                   use mask 0. For IPv6 it's just the last 32 bits that
                   is included in the hash.

       --hmark-smask value
              The value to AND the source address with (saddr & value).

       --hmark-dmask value
              The value to AND the dest. address with (daddr & value).

       --hmark-sp-mask value
              A 16 bit value to AND the src port with (sport & value).

       --hmark-dp-mask value
              A 16 bit value to AND the dest port with (dport & value).

       --hmark-sp-set value
              A 16 bit value to OR the src port with (sport | value).

       --hmark-dp-set value
              A 16 bit value to OR the dest port with (dport | value).

       --hmark-spi-mask value
              Value to AND the spi field with (spi & value) valid for proto esp or ah.

       --hmark-spi-set value
              Value to OR the spi field with (spi | value) valid for proto esp or ah.

       --hmark-proto-mask value
              A 16 bit value to AND the L4 proto field with (proto & value).

       --hmark-rnd value
              A 32 bit intitial value for hash calc, default is 0xc175a3b8.

       Final processing of the mark in order of execution.

       --hmark-mod value (must be > 0)
              The easiest way to describe this is:  hash = hash mod <value>

       --hmark-offs alue (must be > 0)
              The easiest way to describe this is:  hash = hash + <value>

       Examples:

       Default rule handles all TCP, UDP, SCTP, ESP & AH

              iptables -t mangle -A PREROUTING \
               -j HMARK --hmark-offs 10000 --hmark-mod 10

       Handle SCTP and hash dest port only and produce a nfmark between 100-119.

              iptables -t mangle -A PREROUTING -p SCTP -j HMARK --smask 0 --dmask 0
               --sp-mask 0 --offs 100 --mod 20

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
---
 include/linux/netfilter/xt_hmark.h |   28 ++++
 net/netfilter/Kconfig              |   18 +++
 net/netfilter/Makefile             |    1 +
 net/netfilter/xt_hmark.c           |  245 ++++++++++++++++++++++++++++++++++++
 4 files changed, 292 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/netfilter/xt_hmark.h
 create mode 100644 net/netfilter/xt_hmark.c

diff --git a/include/linux/netfilter/xt_hmark.h b/include/linux/netfilter/xt_hmark.h
new file mode 100644
index 0000000..3f7ecc6
--- /dev/null
+++ b/include/linux/netfilter/xt_hmark.h
@@ -0,0 +1,28 @@
+#ifndef XT_HMARK_H_
+#define XT_HMARK_H_
+
+#include <linux/types.h>
+
+union ports {
+	struct {
+		__u16	src;
+		__u16	dst;
+	} p16;
+	__u32	v32;
+};
+
+struct xt_hmark_info {
+	__u32		smask;		/* Source address mask */
+	__u32		dmask;		/* Dest address mask */
+	union ports	pmask;
+	union ports	pset;
+	__u32		spimask;
+	__u32		spiset;
+	__u16		flags;		/* Print out only */
+	__u16		prmask;		/* L4 Proto mask */
+	__u32		hashrnd;
+	__u32		hmod;		/* Modulus */
+	__u32		hoffs;		/* Offset */
+};
+
+#endif /* XT_HMARK_H_ */
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 82a6e0d..2115079 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -471,6 +471,24 @@ config NETFILTER_XT_TARGET_HL
 	since you can easily create immortal packets that loop
 	forever on the network.
 
+config NETFILTER_XT_TARGET_HMARK
+	tristate '"HMARK" target support'
+	depends on NETFILTER_ADVANCED
+	select IP6_NF_IPTABLES if IPV6
+	---help---
+	This option adds the "HMARK" target.
+
+	The target allows you to create rules in the "raw" and "mangle" tables
+	which alter the netfilter mark (nfmark) field within a given range.
+	First a 32 bit hash value is generated then modulus by <limit> and
+	finally an offset is added before it's written to nfmark.
+
+	Prior to routing, the nfmark can influence the routing method (see
+	"Use netfilter MARK value as routing key") and can also be used by
+	other subsystems to change their behavior.
+
+	The mark match can also be used to match nfmark produced by this module.
+
 config NETFILTER_XT_TARGET_IDLETIMER
 	tristate  "IDLETIMER target support"
 	depends on NETFILTER_ADVANCED
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index d57a890..b24a5e6 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -56,6 +56,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_HMARK) += xt_hmark.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_NFLOG) += xt_NFLOG.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o
diff --git a/net/netfilter/xt_hmark.c b/net/netfilter/xt_hmark.c
new file mode 100644
index 0000000..d4b8257
--- /dev/null
+++ b/net/netfilter/xt_hmark.c
@@ -0,0 +1,245 @@
+/*
+ *	xt_hmark - Netfilter module to set mark as hash value
+ *
+ *	(C) 2010 Hans Schillstrom <hans.schillstrom@ericsson.com>
+ *
+ *	Description:
+ *	This module calculates a hash value that can be modified by modulus
+ *	and an offset. The hash value is based on a direction independent
+ *	five tuple: src & dst addr src & dst ports and protocol.
+ *	However src & dst port can be masked and are not used for fragmented
+ *	packets, ESP and AH don't have ports so SPI will be used instead.
+ *	For ICMP error messages the hash mark values will be calculated on
+ *	the source packet i.e. the packet caused the error (If sufficient
+ *	amount of data exists).
+ *
+ *	This program is free software; you can redistribute it and/or modify
+ *	it under the terms of the GNU General Public License version 2 as
+ *	published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+#include <linux/icmp.h>
+
+#include <linux/netfilter/xt_hmark.h>
+#include <linux/netfilter/x_tables.h>
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#	define WITH_IPV6 1
+#include <net/ipv6.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#endif
+
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Hans Schillstrom <hans.schillstrom@ericsson.com>");
+MODULE_DESCRIPTION("Xtables: packet range mark operations by hash value");
+MODULE_ALIAS("ipt_HMARK");
+MODULE_ALIAS("ip6t_HMARK");
+
+/*
+ * ICMP, get inner header so calc can be made on the source message
+ *       not the icmp header, i.e. same hash mark must be produced
+ *       on an icmp error message.
+ */
+static int get_inner_hdr(struct sk_buff *skb, int iphsz, int nhoff)
+{
+	const struct icmphdr *icmph;
+	struct icmphdr _ih;
+	struct iphdr *iph = NULL;
+
+	/* Not enough header? */
+	icmph = skb_header_pointer(skb, nhoff + iphsz, sizeof(_ih), &_ih);
+	if (icmph == NULL)
+		goto out;
+
+	if (icmph->type > NR_ICMP_TYPES)
+		goto out;
+
+
+	/* Error message? */
+	if (icmph->type != ICMP_DEST_UNREACH &&
+	    icmph->type != ICMP_SOURCE_QUENCH &&
+	    icmph->type != ICMP_TIME_EXCEEDED &&
+	    icmph->type != ICMP_PARAMETERPROB &&
+	    icmph->type != ICMP_REDIRECT)
+		goto out;
+	/* Checkin full IP header plus 8 bytes of protocol to
+	 * avoid additional coding at protocol handlers.
+	 */
+	if (!pskb_may_pull(skb, nhoff + iphsz + sizeof(_ih) + 8))
+		goto out;
+
+	iph = (struct iphdr *)(skb->data + nhoff + iphsz + sizeof(_ih));
+	return nhoff + iphsz + sizeof(_ih);
+out:
+	return nhoff;
+}
+/*
+ * ICMPv6
+ * Input nhoff Offset into network header
+ *       offset where ICMPv6 header starts
+ * Returns offset to icmp embedded msg or nhoff
+ */
+#ifdef WITH_IPV6
+static int get_inner6_hdr(struct sk_buff *skb, int nhoff, int offset)
+{
+	struct icmp6hdr *icmp6h;
+	struct icmp6hdr _ih6;
+
+	icmp6h = skb_header_pointer(skb, offset, sizeof(_ih6), &_ih6);
+	if (icmp6h == NULL)
+		goto out;
+
+	if (icmp6h->icmp6_type && icmp6h->icmp6_type < 128)
+		return offset + sizeof(_ih6);
+
+out:
+	return nhoff;
+}
+#endif
+/*
+ * Calc hash value, special casre is taken on icmp and fragmented messages
+ * i.e. fragmented messages don't use ports.
+ */
+static __u32 get_hash(struct sk_buff *skb, struct xt_hmark_info *info)
+{
+	int nhoff, hash = 0, poff, proto;
+	struct iphdr *ip;
+	u8 ip_proto;
+	u32 addr1, addr2, ihl;
+	union {
+		u32 v32;
+		u16 v16[2];
+	} ports;
+
+	nhoff = skb_network_offset(skb);
+	proto = skb->protocol;
+
+	if (!proto && skb->sk) {
+		if (skb->sk->sk_family == AF_INET)
+			proto = __constant_htons(ETH_P_IP);
+		else if (skb->sk->sk_family == AF_INET6)
+			proto = __constant_htons(ETH_P_IPV6);
+	}
+
+	switch (proto) {
+	case __constant_htons(ETH_P_IP):
+		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
+			goto done;
+
+		ip = (struct iphdr *) (skb->data + nhoff);
+		if (ip->protocol == IPPROTO_ICMP) {
+			/* Switch hash calc to inner header ? */
+			nhoff = get_inner_hdr(skb, ip->ihl * 4, nhoff);
+			ip = (struct iphdr *) (skb->data + nhoff);
+		}
+
+		if (ip->frag_off & htons(IP_MF | IP_OFFSET))
+			ip_proto = 0;
+		else
+			ip_proto = ip->protocol;
+
+		addr1 = (__force u32) ip->saddr & info->smask;
+		addr2 = (__force u32) ip->daddr & info->dmask;
+		ihl = ip->ihl;
+		break;
+#ifdef WITH_IPV6
+	case __constant_htons(ETH_P_IPV6):
+	{
+		struct ipv6hdr *ip6;
+		int ptr;
+
+		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
+			goto done;
+
+		ip6 = (struct ipv6hdr *) (skb->data + nhoff);
+
+		/* if (ip6->nexthdr == IPPROTO_ICMPV6) { */
+		if (ipv6_find_hdr(skb, &ptr, IPPROTO_ICMPV6, NULL) >= 0) {
+			nhoff = get_inner6_hdr(skb, nhoff, ptr);
+			ip6 = (struct ipv6hdr *) (skb->data + nhoff);
+		}
+		if (ipv6_find_hdr(skb, &ptr, NEXTHDR_FRAGMENT, NULL) < 0)
+			ip_proto = ip6->nexthdr;
+		else
+			ip_proto = 0;	/* It's a fragment */
+
+		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
+		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
+		ihl = (40 >> 2);
+		break;
+	}
+#endif
+	default:
+		goto done;
+	}
+
+	ports.v32 = 0;
+	poff = proto_ports_offset(ip_proto);
+	nhoff += ihl * 4 + poff;
+	if (poff >= 0 && pskb_may_pull(skb, nhoff + 4)) {
+		ports.v32 = * (__force u32 *) (skb->data + nhoff);
+		if (ip_proto == IPPROTO_ESP || ip_proto == IPPROTO_AH) {
+			ports.v32 = (ports.v32 & info->spimask) | info->spiset;
+		} else {
+			ports.v32 = (ports.v32 & info->pmask.v32) |
+				    info->pset.v32;
+			if (ports.v16[1] < ports.v16[0])
+				swap(ports.v16[0], ports.v16[1]);
+		}
+	}
+	ip_proto &= info->prmask;
+	/* get a consistent hash (same value on both flow directions) */
+	if (addr2 < addr1)
+		swap(addr1, addr2);
+
+	hash = jhash_3words(addr1, addr2, ports.v32, info->hashrnd) ^ ip_proto;
+	if (!hash)
+		hash = 1;
+
+	return hash;
+
+done:
+	return 0;
+}
+
+static unsigned int
+hmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	struct xt_hmark_info *info = (struct xt_hmark_info *)par->targinfo;
+	__u32 hash = get_hash(skb, info);
+
+	if (info->hmod && hash)
+		skb->mark = (hash % info->hmod) + info->hoffs;
+	return XT_CONTINUE;
+}
+
+static struct xt_target hmark_tg_reg __read_mostly = {
+	.name           = "HMARK",
+	.revision       = 0,
+	.family         = NFPROTO_UNSPEC,
+	.target         = hmark_tg,
+	.targetsize     = sizeof(struct xt_hmark_info),
+	.me             = THIS_MODULE,
+};
+
+static int __init hmark_mt_init(void)
+{
+	int ret;
+
+	ret = xt_register_target(&hmark_tg_reg);
+	if (ret < 0)
+		return ret;
+	return 0;
+}
+
+static void __exit hmark_mt_exit(void)
+{
+	xt_unregister_target(&hmark_tg_reg);
+}
+
+module_init(hmark_mt_init);
+module_exit(hmark_mt_exit);
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 0/2] NETFILTER new target module, HMARK
From: Hans Schillstrom @ 2011-02-03 13:34 UTC (permalink / raw)
  To: kaber, jengelh, netfilter-devel, netdev; +Cc: hans, Hans Schillstrom

The target allows you to create rules in the "raw" and "mangle" tables
which alter the netfilter mark (nfmark) field within a given range.
First a 32 bit hash value is generated then modulus by <limit> and
finally an offset is added before it's written to nfmark.
Prior to routing, the nfmark can influence the routing method (see
"Use netfilter MARK value as routing key") and can also be used by
other subsystems to change their behaviour.

The mark match can also be used to match nfmark produced by this module.
See the kernel module for more info.

REVISION

This is version 1

Regards
Hans Schillstrom <hans.schillstrom@ericsson.com>

^ permalink raw reply

* Re: [GIT PULL] IPVS SCTP lock fix
From: Patrick McHardy @ 2011-02-03 12:10 UTC (permalink / raw)
  To: Simon Horman; +Cc: netdev, linux-next, linux-kernel, lvs-devel, Randy Dunlap
In-Reply-To: <1296733925-23131-1-git-send-email-horms@verge.net.au>

Am 03.02.2011 12:52, schrieb Simon Horman:
> Hi Patrick,
> 
> please consider pulling
> git://git.kernel.org/pub/scm/linux/kernel/git/horms/lvs-test-2.6.git for-patrick
> which contains a single patch to the IPVS SCTP module so that
> it uses the correct lock.

Pulled, thanks Simon.

^ permalink raw reply

* [GIT PULL] IPVS SCTP lock fix
From: Simon Horman @ 2011-02-03 11:52 UTC (permalink / raw)
  To: netdev, linux-next, linux-kernel, lvs-devel; +Cc: Randy Dunlap

Hi Patrick,

please consider pulling
git://git.kernel.org/pub/scm/linux/kernel/git/horms/lvs-test-2.6.git for-patrick
which contains a single patch to the IPVS SCTP module so that
it uses the correct lock.

^ permalink raw reply

* [PATCH] IPVS: Use correct lock in SCTP module
From: Simon Horman @ 2011-02-03 11:52 UTC (permalink / raw)
  To: netdev, linux-next, linux-kernel, lvs-devel
  Cc: Randy Dunlap, Simon Horman, Hans Schillstrom
In-Reply-To: <1296733925-23131-1-git-send-email-horms@verge.net.au>

Use sctp_app_lock instead of tcp_app_lock in the SCTP protocol module.

This appears to be a typo introduced by the netns changes.

Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
---
 net/netfilter/ipvs/ip_vs_proto_sctp.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index fb2d04a..b027ccc 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -1101,7 +1101,7 @@ static void __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
 	ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE);
-	spin_lock_init(&ipvs->tcp_app_lock);
+	spin_lock_init(&ipvs->sctp_app_lock);
 	pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts,
 							sizeof(sctp_timeouts));
 }
-- 
1.7.2.3

^ permalink raw reply related

* Re: 2.6.38-rc3-git1: Reported regressions 2.6.36 -> 2.6.37
From: Carlos R. Mafra @ 2011-02-03 11:23 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Linux Kernel Mailing List, Maciej Rutecki, Florian Mickler,
	Andrew Morton, Linus Torvalds, Kernel Testers List,
	Network Development, Linux ACPI, Linux PM List, Linux SCSI List,
	Linux Wireless List, DRI
In-Reply-To: <e9vOd5LzdBK.A.dyD.h6eSNB@chimera>

On Thu  3.Feb'11 at  1:03:41 +0100, Rafael J. Wysocki wrote:
> 
> If you know of any other unresolved post-2.6.36 regressions, please let us know
> either and we'll add them to the list.  Also, please let us know if any
> of the entries below are invalid.

I'm sorry if I'm overlooking something, but as far as I can see the regression
reported here:

https://lkml.org/lkml/2011/1/24/457

is not in the list (update on that report: reverting that commit on top of 
2.6.37 fixes the issue).

After some time, I also ended up finding an earlier report in the kernel bugzilla
which I think is the same regression (it was bisected to the same commit):

https://bugzilla.kernel.org/show_bug.cgi?id=24982

but I do not see it in the list either, even though it's marked as a
regression in the bugzilla.

The issue was also present in 2.6.38-rc2 last time I tested.

^ permalink raw reply

* [PATCH] fixing hw timestamping in igb
From: Anders Berggren @ 2011-02-03 10:39 UTC (permalink / raw)
  To: John Ronciak; +Cc: e1000-devel, netdev, Patrick Ohly

Hardware timestamping for Intel 82580 didn't work in either 2.6.36 or 2.6.37. Comparing it to Intel's igb-2.4.12 I found that the timecounter_init clock/counter initialization was done too early. 

Anders Berggren
Halon Security

lab-slang-1:~# diff -u linux-2.6.37/drivers/net/igb/igb_main.c linux/drivers/net/igb/igb_main.c
--- linux-2.6.37/drivers/net/igb/igb_main.c	2011-02-03 10:02:53.000000000 +0100
+++ linux/drivers/net/igb/igb_main.c	2011-02-03 10:12:40.000000000 +0100
@@ -98,6 +98,7 @@
 static void igb_setup_mrqc(struct igb_adapter *);
 static int igb_probe(struct pci_dev *, const struct pci_device_id *);
 static void __devexit igb_remove(struct pci_dev *pdev);
+static void igb_init_hw_timer(struct igb_adapter *adapter);
 static int igb_sw_init(struct igb_adapter *);
 static int igb_open(struct net_device *);
 static int igb_close(struct net_device *);
@@ -1987,6 +1988,10 @@
 	}
 
 #endif
+
+	/* do hw tstamp init after resetting */ 
+	igb_init_hw_timer(adapter);
+
 	dev_info(&pdev->dev, "Intel(R) Gigabit Ethernet Network Connection\n");
 	/* print bus type/speed/width info */
 	dev_info(&pdev->dev, "%s: (PCIe:%s:%s) %pM\n",
@@ -2301,7 +2306,6 @@
 		return -ENOMEM;
 	}
 
-	igb_init_hw_timer(adapter);
 	igb_probe_vfs(adapter);
 
 	/* Explicitly disable IRQ since the NIC can be in any state. */


------------------------------------------------------------------------------
Special Offer-- Download ArcSight Logger for FREE (a $49 USD value)!
Finally, a world-class log management solution at an even better price-free!
Download using promo code Free_Logger_4_Dev2Dev. Offer expires 
February 28th, so secure your free ArcSight Logger TODAY! 
http://p.sf.net/sfu/arcsight-sfd2d
_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel&#174; Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply

* Re: [PATCH 2/2] ARM: QQ2440 networking support
From: Domenico Andreoli @ 2011-02-03 10:34 UTC (permalink / raw)
  To: Jamie Iles; +Cc: netdev, Kukjin Kim, Russell King, Ben Dooks, linux-arm-kernel
In-Reply-To: <20110203094123.GB3141@pulham.picochip.com>

On Thu, Feb 03, 2011 at 09:41:23AM +0000, Jamie Iles wrote:
> Hi Domenico,

Hi Jamie,

> This should probably also go to the netdev mailing list: 
> netdev@vger.kernel.org.  A couple of other minor comments inline.

CCed them

> On Wed, Feb 02, 2011 at 10:06:37PM +0000, Domenico Andreoli wrote:
> > From: Domenico Andreoli <cavokz@gmail.com>
> > 
> > Add networking support for QQ2440.
> > 
> > Signed-off-by: Domenico Andreoli <cavokz@gmail.com>
> > 
> > ---
> >  arch/arm/mach-s3c2440/include/mach/qq2440.h |   22 ++++++++++++
> >  arch/arm/mach-s3c2440/mach-qq2440.c         |   16 ++++++++
> >  drivers/net/Kconfig                         |    4 +-
> >  drivers/net/cs89x0.c                        |   33 ++++++++++++------
> >  4 files changed, 61 insertions(+), 14 deletions(-)
> > 
> > Index: arm-2.6.git/arch/arm/mach-s3c2440/include/mach/qq2440.h
> > ===================================================================
> > --- /dev/null	1970-01-01 00:00:00.000000000 +0000
> > +++ arm-2.6.git/arch/arm/mach-s3c2440/include/mach/qq2440.h	2011-02-02 18:32:38.000000000 +0000
> > @@ -0,0 +1,22 @@
> > +/*
> > + * arch/arm/mach-s3c2440/include/mach/qq2440.h
> > + *
> > + * Copyright (c) 2011 Domenico Andreoli <cavokz@gmail.com>
> > + *
> > + * QQ2440 - platform definitions
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License version 2 as
> > + * published by the Free Software Foundation.
> > +*/
> > +
> > +#ifndef __ASM_MACH_QQ2440_H
> > +#define __ASM_MACH_QQ2440_H
> > +
> > +#define QQ2440_CS8900_IRQ         IRQ_EINT9
> > +
> > +#define QQ2440_CS8900_VIRT_BASE   S3C_ADDR(0x00500000)
> > +#define QQ2440_CS8900_PA          (S3C2410_CS3 + 0x1000000)
> > +#define QQ2440_CS8900_SZ          SZ_1M
> > +
> > +#endif /* __ASM_MACH_QQ2440_H */
> > Index: arm-2.6.git/drivers/net/cs89x0.c
> > ===================================================================
> > --- arm-2.6.git.orig/drivers/net/cs89x0.c	2011-02-02 18:28:01.000000000 +0000
> > +++ arm-2.6.git/drivers/net/cs89x0.c	2011-02-02 18:32:38.000000000 +0000
> > @@ -95,6 +95,9 @@
> >    Dmitry Pervushin  : dpervushin@ru.mvista.com
> >                      : PNX010X platform support
> >  
> > +  Domenico Andreoli : cavokz@gmail.com
> > +                    : QQ2440 platform support
> > +
> >  */
> >  
> >  /* Always include 'config.h' first in case the user wants to turn on
> > @@ -117,7 +120,7 @@
> >   * Set this to zero to remove all the debug statements via
> >   * dead code elimination
> >   */
> > -#define DEBUGGING	1
> > +#define DEBUGGING	0
> 
> This is probably best split out as a separate patch.

I will

> >  /*
> >    Sources:
> > @@ -173,6 +176,10 @@
> >  #if defined(CONFIG_MACH_IXDP2351)
> >  static unsigned int netcard_portlist[] __used __initdata = {IXDP2351_VIRT_CS8900_BASE, 0};
> >  static unsigned int cs8900_irq_map[] = {IRQ_IXDP2351_CS8900, 0, 0, 0};
> > +#elif defined(CONFIG_MACH_QQ2440)
> > +#include <mach/qq2440.h>
> > +static unsigned int netcard_portlist[] __used __initdata = {QQ2440_CS8900_VIRT_BASE + 0x300, 0};
> > +static unsigned int cs8900_irq_map[] = {QQ2440_CS8900_IRQ, 0, 0, 0};
> >  #elif defined(CONFIG_ARCH_IXDP2X01)
> >  static unsigned int netcard_portlist[] __used __initdata = {IXDP2X01_CS8900_VIRT_BASE, 0};
> >  static unsigned int cs8900_irq_map[] = {IRQ_IXDP2X01_CS8900, 0, 0, 0};
> > @@ -521,6 +528,10 @@
> >  #endif
> >  		lp->force = g_cs89x0_media__force;
> >  #endif
> > +
> > +#if defined(CONFIG_MACH_QQ2440)
> > +		lp->force |= FORCE_RJ45 | FORCE_FULL;
> > +#endif
> >          }
> >  
> >  	/* Grab the region so we can find another board if autoIRQ fails. */
> > @@ -608,7 +619,7 @@
> >  		        dev->dev_addr[i*2+1] = Addr >> 8;
> >  		}
> >  
> > -	   	/* Load the Adapter Configuration.
> > +		/* Load the Adapter Configuration.
> >  		   Note:  Barring any more specific information from some
> >  		   other source (ie EEPROM+Schematics), we would not know
> >  		   how to operate a 10Base2 interface on the AUI port.
> > @@ -655,7 +666,7 @@
> >  	if ((readreg(dev, PP_SelfST) & EEPROM_PRESENT) == 0)
> >  		printk(KERN_WARNING "cs89x0: No EEPROM, relying on command line....\n");
> >  	else if (get_eeprom_data(dev, START_EEPROM_DATA,CHKSUM_LEN,eeprom_buff) < 0) {
> > -		printk(KERN_WARNING "\ncs89x0: EEPROM read failed, relying on command line.\n");
> > +		printk(KERN_WARNING "cs89x0: EEPROM read failed, relying on command line.\n");
> >          } else if (get_eeprom_cksum(START_EEPROM_DATA,CHKSUM_LEN,eeprom_buff) < 0) {
> >  		/* Check if the chip was able to read its own configuration starting
> >  		   at 0 in the EEPROM*/
> > @@ -709,7 +720,7 @@
> >          /* FIXME: we don't set the Ethernet address on the command line.  Use
> >             ifconfig IFACE hw ether AABBCCDDEEFF */
> >  
> > -	printk(KERN_INFO "cs89x0 media %s%s%s",
> > +	printk(KERN_INFO "cs89x0: media %s%s%s",
> >  	       (lp->adapter_cnf & A_CNF_10B_T)?"RJ-45,":"",
> >  	       (lp->adapter_cnf & A_CNF_AUI)?"AUI,":"",
> >  	       (lp->adapter_cnf & A_CNF_10B_2)?"BNC,":"");
> 
> Splitting these cleanups into a cleanup patch would be handly so you can 
> see the real changes easier.

generally speaking, this driver requires some care. it's not only
about formatting.

you can see the ifdefs I had to add for QQ2440. you can note also that
it's not possible to compile it for multiple platforms because there
would be multiple definitions of some symbols.

for instance, there is a design bug on the QQ2440. EEPROM is not present
but the CS8900 is not correctly wired and EEPROM is incorrectly supposed
to be present. one way to solve this is with some more ifdefs...

in practice I'm volunteering to switch it to some more modern model like
platform driver but i still need to understand if it has any sense. it
does not seem a popular driver and I doubt there will be so many new
boards requiring it that the work could be justified (but I would do
it anyway).

what do netdev people think?

> > @@ -943,7 +954,7 @@
> >  static void __init reset_chip(struct net_device *dev)
> >  {
> >  #if !defined(CONFIG_MACH_MX31ADS)
> > -#if !defined(CONFIG_MACH_IXDP2351) && !defined(CONFIG_ARCH_IXDP2X01)
> > +#if !defined(CS89x0_NONISA_IRQ)
> >  	struct net_local *lp = netdev_priv(dev);
> >  	int ioaddr = dev->base_addr;
> >  #endif
> > @@ -954,18 +965,18 @@
> >  	/* wait 30 ms */
> >  	msleep(30);
> >  
> > -#if !defined(CONFIG_MACH_IXDP2351) && !defined(CONFIG_ARCH_IXDP2X01)
> > +#if !defined(CS89x0_NONISA_IRQ)
> >  	if (lp->chip_type != CS8900) {
> >  		/* Hardware problem requires PNP registers to be reconfigured after a reset */
> >  		writeword(ioaddr, ADD_PORT, PP_CS8920_ISAINT);
> > -		outb(dev->irq, ioaddr + DATA_PORT);
> > -		outb(0,      ioaddr + DATA_PORT + 1);
> > +		writeword(ioaddr, DATA_PORT, dev->irq);
> > +		writeword(ioaddr, DATA_PORT + 1, 0);
> >  
> >  		writeword(ioaddr, ADD_PORT, PP_CS8920_ISAMemB);
> > -		outb((dev->mem_start >> 16) & 0xff, ioaddr + DATA_PORT);
> > -		outb((dev->mem_start >> 8) & 0xff,   ioaddr + DATA_PORT + 1);
> > +		writeword(ioaddr, DATA_PORT, (dev->mem_start >> 16) & 0xff);
> > +		writeword(ioaddr, DATA_PORT + 1, (dev->mem_start >> 8) & 0xff);
> >  	}
> > -#endif	/* IXDP2x01 */
> > +#endif
> >  
> >  	/* Wait until the chip is reset */
> >  	reset_start_time = jiffies;
> > Index: arm-2.6.git/drivers/net/Kconfig
> > ===================================================================
> > --- arm-2.6.git.orig/drivers/net/Kconfig	2011-02-02 18:28:01.000000000 +0000
> > +++ arm-2.6.git/drivers/net/Kconfig	2011-02-02 18:32:38.000000000 +0000
> > @@ -1498,7 +1498,7 @@
> >  config CS89x0
> >  	tristate "CS89x0 support"
> >  	depends on NET_ETHERNET && (ISA || EISA || MACH_IXDP2351 \
> > -		|| ARCH_IXDP2X01 || MACH_MX31ADS)
> > +		|| ARCH_IXDP2X01 || MACH_MX31ADS || MACH_QQ2440)
> >  	---help---
> >  	  Support for CS89x0 chipset based Ethernet cards. If you have a
> >  	  network (Ethernet) card of this type, say Y and read the
> > @@ -1512,7 +1512,7 @@
> >  config CS89x0_NONISA_IRQ
> >  	def_bool y
> >  	depends on CS89x0 != n
> > -	depends on MACH_IXDP2351 || ARCH_IXDP2X01 || MACH_MX31ADS
> > +	depends on MACH_IXDP2351 || ARCH_IXDP2X01 || MACH_MX31ADS || MACH_QQ2440
> >  
> >  config TC35815
> >  	tristate "TOSHIBA TC35815 Ethernet support"
> > Index: arm-2.6.git/arch/arm/mach-s3c2440/mach-qq2440.c
> > ===================================================================
> > --- arm-2.6.git.orig/arch/arm/mach-s3c2440/mach-qq2440.c	2011-02-02 18:29:48.000000000 +0000
> > +++ arm-2.6.git/arch/arm/mach-s3c2440/mach-qq2440.c	2011-02-02 18:32:38.000000000 +0000
> > @@ -36,6 +36,7 @@
> >  #include <mach/leds-gpio.h>
> >  #include <mach/regs-mem.h>
> >  #include <mach/irqs.h>
> > +#include <mach/qq2440.h>
> >  #include <plat/nand.h>
> >  #include <plat/iic.h>
> >  #include <plat/mci.h>
> > @@ -54,7 +55,12 @@
> >  #include <sound/s3c24xx_uda134x.h>
> >  
> >  static struct map_desc qq2440_iodesc[] __initdata = {
> > -	/* nothing to declare, move along */
> > +	{
> > +		.virtual  = QQ2440_CS8900_VIRT_BASE,
> > +		.pfn      = __phys_to_pfn(QQ2440_CS8900_PA),
> > +		.length   = QQ2440_CS8900_SZ,
> > +		.type     = MT_DEVICE
> > +	}
> >  };
> >  
> >  #define UCON S3C2410_UCON_DEFAULT
> > @@ -325,10 +331,18 @@
> >  	s3c24xx_init_uarts(qq2440_uartcfgs, ARRAY_SIZE(qq2440_uartcfgs));
> >  }
> >  
> > +#define QQ2440_CS8900_BANKCON  (S3C2410_BANKCON_Tacp6 | S3C2410_BANKCON_Tcah4 | S3C2410_BANKCON_Tcoh1 | \
> > +                                S3C2410_BANKCON_Tacc14 | S3C2410_BANKCON_Tcos4)
> > +
> >  static void __init qq2440_init(void)
> >  {
> >  	int i;
> >  
> > +	/* Ethernet */
> > +	__raw_writel(__raw_readl(S3C2410_BWSCON) | S3C2410_BWSCON_WS3 | S3C2410_BWSCON_ST3, S3C2410_BWSCON);
> > +	__raw_writel(QQ2440_CS8900_BANKCON, S3C2410_BANKCON3);
> > +	set_irq_type(QQ2440_CS8900_IRQ, IRQ_TYPE_EDGE_RISING);
> > +
> >  	/* Make sure the D+ pullup pin is output */
> >  	WARN_ON(gpio_request(S3C2410_GPG(12), "udc pup"));
> >  	gpio_direction_output(S3C2410_GPG(12), 0);
> 
> This arch stuff should really be a separate patch too.

thank you for the review

cheers,
Domenico

^ permalink raw reply

* Re: epoll broken [was: mmotm 2011-01-25-15-47 uploaded]
From: Jiri Slaby @ 2011-02-03  9:03 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: linux-kernel, akpm, mm-commits, ML netdev, davidel
In-Reply-To: <1296719601.3438.0.camel@edumazet-laptop>

On 02/03/2011 08:53 AM, Eric Dumazet wrote:
>> {0, {u32=0, u64=0}} .............. {0, {u32=0, u64=0}}, ?}
>> 0x7fb816996660, 8192, 0) = 379151968
>> 17836 --- SIGSEGV (Segmentation fault) @ 0 (0) ---
>> 17836 +++ killed by SIGSEGV +++
>>
>> The parameter, the same as the retval, seems to be bogus.
>>
>> Is it known (fixed in newer kernels)?
>>
>> thanks,
> 
> Yes, its known, and a fix is there : https://lkml.org/lkml/2011/1/26/121

Thanks, it works indeed.

-- 
js

^ permalink raw reply

* Re: non-symmetric Unix dgram sockets and poll
From: Rémi Denis-Courmont @ 2011-02-03  8:45 UTC (permalink / raw)
  To: netdev
In-Reply-To: <20110202173640.2af412f0@chocolatine.cbg.collabora.co.uk>

Le mercredi 2 février 2011 19:36:40 Alban Crequy, vous avez écrit :
> Hi,
> 
> I have 3 Unix dgram sockets (sockA, sockB, sockC):
> - sockA is connected to sockB.
> - sockB is connected to sockC.
> - sockC is not connected.
> 
> SockA cannot send any message to sockB because
> net/unix/af_unix.c::unix_may_send() prevents it. Is there any reason for
> that restriction?

Yes, absolutely. When you connect() a socket, you expect to only *receive* 
packets from the specified peer.

-- 
Rémi Denis-Courmont
http://www.remlab.net/
http://fi.linkedin.com/in/remidenis

^ permalink raw reply

* Re: epoll broken [was: mmotm 2011-01-25-15-47 uploaded]
From: Eric Dumazet @ 2011-02-03  7:53 UTC (permalink / raw)
  To: Jiri Slaby; +Cc: linux-kernel, akpm, mm-commits, ML netdev, davidel
In-Reply-To: <4D4A5BBF.20806@gmail.com>

Le jeudi 03 février 2011 à 08:39 +0100, Jiri Slaby a écrit :
> On 01/26/2011 12:48 AM, akpm@linux-foundation.org wrote:
> > The mm-of-the-moment snapshot 2011-01-25-15-47 has been uploaded to
> 
> Hi, the network daemons are broken here. cupsd and httpd children
> segfault too often without servicing requests. It's a regression against
> mmotm 2011-01-06-15-41.
> 
> It's epoll after it dies:
> 17836 epoll_create(8192)                = 3
> ...
> 17836 accept(7, {sa_family=AF_FILE, NULL}, [2]) = 11
> 17836 getsockname(11, {sa_family=AF_FILE,
> path="/var/run/cups/cups.sock"}, [26]) = 0
> 17836 setsockopt(11, SOL_TCP, TCP_NODELAY, [1], 4) = -1 EOPNOTSUPP
> (Operation not supported)
> 17836 fcntl(11, F_GETFD)                = 0
> 17836 fcntl(11, F_SETFD, FD_CLOEXEC)    = 0
> 17836 epoll_ctl(3, EPOLL_CTL_ADD, 11, {EPOLLIN, {u32=379708832,
> u64=140428630418848}}) = 0
> 17836 epoll_wait(3, {{EPOLLIN, {u32=379708832, u64=140428630418848}}},
> 8192, 1000) = 1
> 17836 recvfrom(11, "P", 1, MSG_PEEK, NULL, NULL) = 1
> 17836 poll([{fd=11, events=POLLIN}], 1, 10000) = 1 ([{fd=11,
> revents=POLLIN}])
> 17836 recvfrom(11, "POST / HTTP/1.1\r\nContent-Length:"..., 2048, 0,
> NULL, NULL) = 771
> 17836 sendto(11, "HTTP/1.1 100 Continue\r\n\r\n", 25, 0, NULL, 0) = 25
> 17836 epoll_wait(3, {{EPOLLIN, {u32=379708832, u64=140428630418848}},
> {0, {u32=0, u64=0}} .............. {0, {u32=0, u64=0}}, ?}
> 0x7fb816996660, 8192, 0) = 379151968
> 17836 --- SIGSEGV (Segmentation fault) @ 0 (0) ---
> 17836 +++ killed by SIGSEGV +++
> 
> The parameter, the same as the retval, seems to be bogus.
> 
> Is it known (fixed in newer kernels)?
> 
> thanks,

Yes, its known, and a fix is there : https://lkml.org/lkml/2011/1/26/121

^ permalink raw reply

* epoll broken [was: mmotm 2011-01-25-15-47 uploaded]
From: Jiri Slaby @ 2011-02-03  7:39 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, mm-commits, ML netdev, davidel
In-Reply-To: <201101260021.p0Q0LxsS016458@imap1.linux-foundation.org>

On 01/26/2011 12:48 AM, akpm@linux-foundation.org wrote:
> The mm-of-the-moment snapshot 2011-01-25-15-47 has been uploaded to

Hi, the network daemons are broken here. cupsd and httpd children
segfault too often without servicing requests. It's a regression against
mmotm 2011-01-06-15-41.

It's epoll after it dies:
17836 epoll_create(8192)                = 3
...
17836 accept(7, {sa_family=AF_FILE, NULL}, [2]) = 11
17836 getsockname(11, {sa_family=AF_FILE,
path="/var/run/cups/cups.sock"}, [26]) = 0
17836 setsockopt(11, SOL_TCP, TCP_NODELAY, [1], 4) = -1 EOPNOTSUPP
(Operation not supported)
17836 fcntl(11, F_GETFD)                = 0
17836 fcntl(11, F_SETFD, FD_CLOEXEC)    = 0
17836 epoll_ctl(3, EPOLL_CTL_ADD, 11, {EPOLLIN, {u32=379708832,
u64=140428630418848}}) = 0
17836 epoll_wait(3, {{EPOLLIN, {u32=379708832, u64=140428630418848}}},
8192, 1000) = 1
17836 recvfrom(11, "P", 1, MSG_PEEK, NULL, NULL) = 1
17836 poll([{fd=11, events=POLLIN}], 1, 10000) = 1 ([{fd=11,
revents=POLLIN}])
17836 recvfrom(11, "POST / HTTP/1.1\r\nContent-Length:"..., 2048, 0,
NULL, NULL) = 771
17836 sendto(11, "HTTP/1.1 100 Continue\r\n\r\n", 25, 0, NULL, 0) = 25
17836 epoll_wait(3, {{EPOLLIN, {u32=379708832, u64=140428630418848}},
{0, {u32=0, u64=0}} .............. {0, {u32=0, u64=0}}, ?}
0x7fb816996660, 8192, 0) = 379151968
17836 --- SIGSEGV (Segmentation fault) @ 0 (0) ---
17836 +++ killed by SIGSEGV +++

The parameter, the same as the retval, seems to be bogus.

Is it known (fixed in newer kernels)?

thanks,
-- 
js

^ permalink raw reply

* [PATCH] sch_choke: Need linux/vmalloc.h
From: David Miller @ 2011-02-03  7:08 UTC (permalink / raw)
  To: netdev; +Cc: shemminger, eric.dumazet


Signed-off-by: David S. Miller <davem@davemloft.net>
---

This fixes the build on sparc64.

 net/sched/sch_choke.c |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index a1cec18..ee1e209 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -15,6 +15,7 @@
 #include <linux/kernel.h>
 #include <linux/skbuff.h>
 #include <linux/reciprocal_div.h>
+#include <linux/vmalloc.h>
 #include <net/pkt_sched.h>
 #include <net/inet_ecn.h>
 #include <net/red.h>
-- 
1.7.4


^ permalink raw reply related

* Re: linux-next: Tree for February 1 (ip_vs)
From: Hans Schillstrom @ 2011-02-03  7:04 UTC (permalink / raw)
  To: Simon Horman
  Cc: Randy Dunlap, Stephen Rothwell, netdev,
	linux-next@vger.kernel.org, LKML
In-Reply-To: <20110202223159.GB2248@verge.net.au>

On Wed, 2011-02-02 at 23:31 +0100, Simon Horman wrote:
> On Tue, Feb 01, 2011 at 09:16:20AM -0800, Randy Dunlap wrote:
> > On Tue, 1 Feb 2011 15:34:03 +1100 Stephen Rothwell wrote:
> > 
> > > Hi all,
> > > 
> > > Changes since 20110131:
> > 
> > 
> > When CONFIG_IP_VS_PROTO_TCP is not set:
> > 
> > net/netfilter/ipvs/ip_vs_proto_sctp.c:1104: error: 'struct netns_ipvs' has no member named 'tcp_app_lock'
> > net/netfilter/ipvs/ip_vs_proto_sctp.c:1104: error: 'struct netns_ipvs' has no member named 'tcp_app_lock'
> 
> Thanks, the following patch should resolve this problem.
> Hans, could you verify this change?
> 
Ooops, 

> The change is available at
> git://git.kernel.org/pub/scm/linux/kernel/git/horms/lvs-test-2.6.git master
> 
> 
> From: Simon Horman <horms@verge.net.au>
> 
> IPVS: Use correct lock in SCTP module
> 
> Use sctp_app_lock instead of tcp_app_lock in the SCTP protocol module.
> 
> This appears to be a typo introduced by the netns changes.
> 
> Cc: Hans Schillstrom <hans@schillstrom.com>
> Signed-off-by: Simon Horman <horms@verge.net.au>

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>

> ---
>  net/netfilter/ipvs/ip_vs_proto_sctp.c |    2 +-
>  1 files changed, 1 insertions(+), 1 deletions(-)
> 
> diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
> index fb2d04a..b027ccc 100644
> --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
> +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
> @@ -1101,7 +1101,7 @@ static void __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
>  	struct netns_ipvs *ipvs = net_ipvs(net);
>  
>  	ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE);
> -	spin_lock_init(&ipvs->tcp_app_lock);
> +	spin_lock_init(&ipvs->sctp_app_lock);
>  	pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts,
>  							sizeof(sctp_timeouts));
>  }

Thanks
Hans

^ permalink raw reply

* Re: Network performance with small packets
From: Michael S. Tsirkin @ 2011-02-03  6:16 UTC (permalink / raw)
  To: Shirley Ma
  Cc: Krishna Kumar2, David Miller, kvm, mashirle, netdev, netdev-owner,
	Sridhar Samudrala, Steve Dobbelstein
In-Reply-To: <1296713354.25430.143.camel@localhost.localdomain>

On Wed, Feb 02, 2011 at 10:09:14PM -0800, Shirley Ma wrote:
> On Thu, 2011-02-03 at 07:59 +0200, Michael S. Tsirkin wrote:
> > > Let's look at the sequence here:
> > > 
> > > guest start_xmit()
> > >       xmit_skb()
> > >       if ring is full,
> > >               enable_cb()
> > > 
> > > guest skb_xmit_done()
> > >       disable_cb,
> > >         printk free_old_xmit_skbs <-- it was between more than 1/2
> > to
> > > full ring size
> > >       printk vq->num_free 
> > > 
> > > vhost handle_tx()
> > >       if (guest interrupt is enabled)
> > >               signal guest to free xmit buffers
> > > 
> > > So between guest queue full/stopped queue/enable call back to guest
> > > receives the callback from host to free_old_xmit_skbs, there were
> > about
> > > 1/2 to full ring size descriptors available. I thought there were
> > only a
> > > few. (I disabled your vhost patch for this test.)
> > 
> > 
> > The expected number is vq->num - max skb frags - 2. 
> 
> It was various (up to the ring size 256). This is using indirection
> buffers, it returned how many freed descriptors, not number of buffers.
> 
> Why do you think it is vq->num - max skb frags - 2 here?
> 
> Shirley

well queue is stopped which happens when

        if (capacity < 2+MAX_SKB_FRAGS) {
                netif_stop_queue(dev);
                if (unlikely(!virtqueue_enable_cb(vi->svq))) {
                        /* More just got used, free them then recheck.
 * */
                        capacity += free_old_xmit_skbs(vi);
                        if (capacity >= 2+MAX_SKB_FRAGS) {
                                netif_start_queue(dev);
                                virtqueue_disable_cb(vi->svq);
                        }
                }
        }

This should be the most common case.
I guess the case with += free_old_xmit_skbs is what can get us more.
But it should be rare. Can you count how common it is?

-- 
MST

^ permalink raw reply

* Re: Network performance with small packets
From: Michael S. Tsirkin @ 2011-02-03  6:13 UTC (permalink / raw)
  To: Shirley Ma
  Cc: Krishna Kumar2, David Miller, kvm, mashirle, netdev, netdev-owner,
	Sridhar Samudrala, Steve Dobbelstein
In-Reply-To: <1296709556.25430.140.camel@localhost.localdomain>

On Wed, Feb 02, 2011 at 09:05:56PM -0800, Shirley Ma wrote:
> On Wed, 2011-02-02 at 23:20 +0200, Michael S. Tsirkin wrote:
> > > I think I need to define the test matrix to collect data for TX xmit
> > > from guest to host here for different tests.
> > > 
> > > Data to be collected:
> > > ---------------------
> > > 1. kvm_stat for VM, I/O exits
> > > 2. cpu utilization for both guest and host
> > > 3. cat /proc/interrupts on guest
> > > 4. packets rate from vhost handle_tx per loop
> > > 5. guest netif queue stop rate
> > > 6. how many packets are waiting for free between vhost signaling and
> > > guest callback
> > > 7. performance results
> > > 
> > > Test
> > > ----
> > > 1. TCP_STREAM single stream test for 1K to 4K message size
> > > 2. TCP_RR (64 instance test): 128 - 1K request/response size
> > > 
> > > Different hacks
> > > ---------------
> > > 1. Base line data ( with the patch to fix capacity check first,
> > > free_old_xmit_skbs returns number of skbs)
> > > 
> > > 2. Drop packet data (will put some debugging in generic networking
> > code)
> 
> Since I found that the netif queue stop/wake up is so expensive, I
> created a dropping packets patch on guest side so I don't need to debug
> generic networking code.
> 
> guest start_xmit()
> 	capacity = free_old_xmit_skb() + virtqueue_get_num_freed()
> 	if (capacity == 0)
> 		drop this packet;
> 		return;
> 
> In the patch, both guest TX interrupts and callback have been omitted.
> Host vhost_signal in handle_tx can totally be removed as well. (A new
> virtio_ring API is needed for exporting total of num_free descriptors
> here -- virtioqueue_get_num_freed)
> 
> Initial TCP_STREAM performance results I got for guest to local host 
> 4.2Gb/s for 1K message size, (vs. 2.5Gb/s)
> 6.2Gb/s for 2K message size, and (vs. 3.8Gb/s)
> 9.8Gb/s for 4K message size. (vs.5.xGb/s)

What is the average packet size, # bytes per ack, and the # of interrupts
per packet? It could be that just slowing down trahsmission
makes GSO work better.

> Since large message size (64K) doesn't hit (capacity == 0) case, so the
> performance only has a little better. (from 13.xGb/s to 14.x Gb/s)
> 
> kvm_stat output shows significant exits reduction for both VM and I/O,
> no guest TX interrupts.
> 
> With dropping packets, TCP retrans has been increased here, so I can see
> performance numbers are various.
> 
> This might be not a good solution, but it gave us some ideas on
> expensive netif queue stop/wake up between guest and host notification.
> 
> I couldn't find a better solution on how to reduce netif queue stop/wake
> up rate for small message size. But I think once we can address this,
> the guest TX performance will burst for small message size.
> 
> I also compared this with return TX_BUSY approach when (capacity == 0),
> it is not as good as dropping packets.
> 
> > > 3. Delay guest netif queue wake up until certain descriptors (1/2
> > ring
> > > size, 1/4 ring size...) are available once the queue has stopped.
> > > 
> > > 4. Accumulate more packets per vhost signal in handle_tx?
> > > 
> > > 5. 3 & 4 combinations
> > > 
> > > 6. Accumulate more packets per guest kick() (TCP_RR) by adding a
> > timer? 
> > > 
> > > 7. Accumulate more packets per vhost handle_tx() by adding some
> > delay?
> > > 
> > > > Haven't noticed that part, how does your patch make it
> > > handle more packets?
> > > 
> > > Added a delay in handle_tx().
> > > 
> > > What else?
> > > 
> > > It would take sometimes to do this.
> > > 
> > > Shirley
> > 
> > 
> > Need to think about this.
> > 
> > 

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox