Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next v3] em_canid: Ematch rule to match CAN frames according to their identifiers
From: Rostislav Lisovy @ 2012-07-02 15:06 UTC (permalink / raw)
  To: netdev; +Cc: linux-can, lartc, pisa, sojkam1, Rostislav Lisovy

This ematch makes it possible to classify CAN frames (AF_CAN) according
to their identifiers. This functionality can not be easily achieved with
existing classifiers, such as u32, because CAN identifier is always stored
in native endianness, whereas u32 expects Network byte order.

Signed-off-by: Rostislav Lisovy <lisovy@gmail.com>
---
Results of simple benchmark are available at address:
  http://rtime.felk.cvut.cz/~lisovros/em_canid_benchmark.pdf
In short: there is no significant difference between em_canid and cls_can
(i.e. stand-alone AF_CAN classifier).

Changes in v3:
  * Added zero-length check for configuration data
  * Small fixes in patch format (patch created against net-next)

Changes in v2:
  * Integrated remarks received from the mailing list
  * Array containing rules (rules_raw) is dynamically allocated
  * When processing a rule during configuration, CAN_EFF_FLAG in mask
    determines if we care about the value of CAN_EFF_FLAG in an identifier
    -- i.e. when CAN_EFF_FLAG is set in the mask, the rule will be added
    as SFF or EFF only depending on CAN_EFF_FLAG value in the identifier.
    If CAN_EFF_FLAG is 0 in the mask, the rule will be added as both SFF
    and EFF.

---
 include/linux/can.h     |    3 +
 include/linux/pkt_cls.h |    5 +-
 net/sched/Kconfig       |   10 ++
 net/sched/Makefile      |    1 +
 net/sched/em_canid.c    |  250 +++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 267 insertions(+), 2 deletions(-)
 create mode 100644 net/sched/em_canid.c

diff --git a/include/linux/can.h b/include/linux/can.h
index 1a66cf61..018055e 100644
--- a/include/linux/can.h
+++ b/include/linux/can.h
@@ -38,6 +38,9 @@
  */
 typedef __u32 canid_t;
 
+#define CAN_SFF_ID_BITS		11
+#define CAN_EFF_ID_BITS		29
+
 /*
  * Controller Area Network Error Message Frame Mask structure
  *
diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h
index defbde2..7fbe6c2 100644
--- a/include/linux/pkt_cls.h
+++ b/include/linux/pkt_cls.h
@@ -451,8 +451,9 @@ enum {
 #define	TCF_EM_U32		3
 #define	TCF_EM_META		4
 #define	TCF_EM_TEXT		5
-#define        TCF_EM_VLAN		6
-#define	TCF_EM_MAX		6
+#define	TCF_EM_VLAN		6
+#define        TCF_EM_CANID		7
+#define	TCF_EM_MAX		7
 
 enum {
 	TCF_EM_PROG_TC
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index e7a8976..8f759b6 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -507,6 +507,16 @@ config NET_EMATCH_TEXT
 	  To compile this code as a module, choose M here: the
 	  module will be called em_text.
 
+config NET_EMATCH_CANID
+	tristate "CAN Identifier"
+	depends on NET_EMATCH && CAN
+	---help---
+          Say Y here if you want to be able to classify CAN frames based
+          on CAN Identifier.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called em_canid.
+
 config NET_CLS_ACT
 	bool "Actions"
 	---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 5940a19..bcada75 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -55,3 +55,4 @@ obj-$(CONFIG_NET_EMATCH_NBYTE)	+= em_nbyte.o
 obj-$(CONFIG_NET_EMATCH_U32)	+= em_u32.o
 obj-$(CONFIG_NET_EMATCH_META)	+= em_meta.o
 obj-$(CONFIG_NET_EMATCH_TEXT)	+= em_text.o
+obj-$(CONFIG_NET_EMATCH_CANID)	+= em_canid.o
diff --git a/net/sched/em_canid.c b/net/sched/em_canid.c
new file mode 100644
index 0000000..f12ce98
--- /dev/null
+++ b/net/sched/em_canid.c
@@ -0,0 +1,250 @@
+/*
+ * em_canid.c  Ematch rule to match CAN frames according to their CAN IDs
+ *
+ *              This program is free software; you can distribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Idea:       Oliver Hartkopp <oliver.hartkopp@volkswagen.de>
+ * Copyright:  (c) 2011 Czech Technical University in Prague
+ *             (c) 2011 Volkswagen Group Research
+ * Authors:    Michal Sojka <sojkam1@fel.cvut.cz>
+ *             Pavel Pisa <pisa@cmp.felk.cvut.cz>
+ *             Rostislav Lisovy <lisovy@gmail.cz>
+ * Funded by:  Volkswagen Group Research
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <net/pkt_cls.h>
+#include <linux/can.h>
+
+#define EM_CAN_RULES_MAX 500
+
+struct canid_match {
+	/* For each SFF CAN ID (11 bit) there is one record in this bitfield */
+	DECLARE_BITMAP(match_sff, (1 << CAN_SFF_ID_BITS));
+
+	int rules_count;
+	int sff_rules_count;
+	int eff_rules_count;
+
+	/*
+	 * Raw rules copied from netlink message; Used for sending
+	 * information to userspace (when 'tc filter show' is invoked)
+	 * AND when matching EFF frames
+	 */
+	struct can_filter rules_raw[];
+};
+
+/**
+ * em_canid_get_id() - Extracts Can ID out of the sk_buff structure.
+ */
+static canid_t em_canid_get_id(struct sk_buff *skb)
+{
+	/* CAN ID is stored within the data field */
+	struct can_frame *cf = (struct can_frame *)skb->data;
+
+	return cf->can_id;
+}
+
+static void em_canid_sff_match_add(struct canid_match *cm, u32 can_id,
+					u32 can_mask)
+{
+	int i;
+
+	/*
+	 * Limit can_mask and can_id to SFF range to
+	 * protect against write after end of array
+	 */
+	can_mask &= CAN_SFF_MASK;
+	can_id &= can_mask;
+
+	/* Single frame */
+	if (can_mask == CAN_SFF_MASK) {
+		set_bit(can_id, cm->match_sff);
+		return;
+	}
+
+	/* All frames */
+	if (can_mask == 0) {
+		bitmap_fill(cm->match_sff, (1 << CAN_SFF_ID_BITS));
+		return;
+	}
+
+	/*
+	 * Individual frame filter.
+	 * Add record (set bit to 1) for each ID that
+	 * conforms particular rule
+	 */
+	for (i = 0; i < (1 << CAN_SFF_ID_BITS); i++) {
+		if ((i & can_mask) == can_id)
+			set_bit(i, cm->match_sff);
+	}
+}
+
+static inline struct canid_match *em_canid_priv(struct tcf_ematch *m)
+{
+	return (struct canid_match *)m->data;
+}
+
+static int em_canid_match(struct sk_buff *skb, struct tcf_ematch *m,
+			 struct tcf_pkt_info *info)
+{
+	struct canid_match *cm = em_canid_priv(m);
+	canid_t can_id;
+	int match = 0;
+	int i;
+	const struct can_filter *lp;
+
+	can_id = em_canid_get_id(skb);
+
+	if (can_id & CAN_EFF_FLAG) {
+		for (i = 0, lp = cm->rules_raw;
+		     i < cm->eff_rules_count; i++, lp++) {
+			if (!(((lp->can_id ^ can_id) & lp->can_mask))) {
+				match = 1;
+				break;
+			}
+		}
+	} else { /* SFF */
+		can_id &= CAN_SFF_MASK;
+		match = (test_bit(can_id, cm->match_sff) ? 1 : 0);
+	}
+
+	return match;
+}
+
+static int em_canid_change(struct tcf_proto *tp, void *data, int len,
+			  struct tcf_ematch *m)
+{
+	struct can_filter *conf = data; /* Array with rules,
+					 * fixed size EM_CAN_RULES_SIZE
+					 */
+	struct canid_match *cm;
+	struct canid_match *cm_old = (struct canid_match *)m->data;
+	int i;
+	int rulescnt;
+
+	if (!len)
+		return -EINVAL;
+
+	if (len % sizeof(struct can_filter))
+		return -EINVAL;
+
+	if (len > sizeof(struct can_filter) * EM_CAN_RULES_MAX)
+		return -EINVAL;
+
+	rulescnt = len / sizeof(struct can_filter);
+
+	cm = kzalloc(sizeof(struct canid_match) + sizeof(struct can_filter) *
+		rulescnt, GFP_KERNEL);
+	if (!cm)
+		return -ENOMEM;
+
+	cm->sff_rules_count = 0;
+	cm->eff_rules_count = 0;
+	cm->rules_count = rulescnt;
+
+	/*
+	 * We need two for() loops for copying rules into
+	 * two contiguous areas in rules_raw
+	 */
+
+	/* Process EFF frame rules*/
+	for (i = 0; i < cm->rules_count; i++) {
+		if (((conf[i].can_id & CAN_EFF_FLAG) &&
+		    (conf[i].can_mask & CAN_EFF_FLAG)) ||
+		    !(conf[i].can_mask & CAN_EFF_FLAG)) {
+			memcpy(cm->rules_raw + cm->eff_rules_count,
+				&conf[i],
+				sizeof(struct can_filter));
+
+			cm->eff_rules_count++;
+		} else {
+			continue;
+		}
+	}
+
+	/* Process SFF frame rules */
+	for (i = 0; i < cm->rules_count; i++) {
+		if ((conf[i].can_id & CAN_EFF_FLAG) &&
+		    (conf[i].can_mask & CAN_EFF_FLAG)) {
+			continue;
+		} else {
+			memcpy(cm->rules_raw
+				+ cm->eff_rules_count
+				+ cm->sff_rules_count,
+				&conf[i], sizeof(struct can_filter));
+
+			cm->sff_rules_count++;
+
+			em_canid_sff_match_add(cm,
+				conf[i].can_id, conf[i].can_mask);
+		}
+	}
+
+	m->datalen = sizeof(*cm);
+	m->data = (unsigned long)cm;
+
+	if (cm_old != NULL) {
+		pr_err("canid: Configuring an existing ematch!\n");
+		kfree(cm_old);
+	}
+
+	return 0;
+}
+
+static void em_canid_destroy(struct tcf_proto *tp, struct tcf_ematch *m)
+{
+	struct canid_match *cm = em_canid_priv(m);
+
+	kfree(cm);
+}
+
+static int em_canid_dump(struct sk_buff *skb, struct tcf_ematch *m)
+{
+	struct canid_match *cm = em_canid_priv(m);
+
+	/*
+	 * When configuring this ematch 'rules_count' is set not to exceed
+	 * 'rules_raw' array size
+	 */
+	if (nla_put_nohdr(skb, sizeof(struct can_filter) * cm->rules_count,
+	    &cm->rules_raw) < 0)
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static struct tcf_ematch_ops em_canid_ops = {
+	.kind	  = TCF_EM_CANID,
+	.change	  = em_canid_change,
+	.match	  = em_canid_match,
+	.destroy  = em_canid_destroy,
+	.dump	  = em_canid_dump,
+	.owner	  = THIS_MODULE,
+	.link	  = LIST_HEAD_INIT(em_canid_ops.link)
+};
+
+static int __init init_em_canid(void)
+{
+	return tcf_em_register(&em_canid_ops);
+}
+
+static void __exit exit_em_canid(void)
+{
+	tcf_em_unregister(&em_canid_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_canid);
+module_exit(exit_em_canid);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_CANID);
-- 
1.7.10


^ permalink raw reply related

* Re: AF_BUS socket address family
From: Vincent Sanders @ 2012-07-02 14:51 UTC (permalink / raw)
  To: Alan Cox; +Cc: Benjamin LaHaise, David Miller, netdev, linux-kernel
In-Reply-To: <20120630135240.41dbaacf@pyramind.ukuu.org.uk>

On Sat, Jun 30, 2012 at 01:52:40PM +0100, Alan Cox wrote:
> On Fri, 29 Jun 2012 20:13:50 -0400
> Benjamin LaHaise <bcrl@kvack.org> wrote:
> 
> > On Sat, Jun 30, 2012 at 12:42:30AM +0100, Vincent Sanders wrote:
> > > The current users are suffering from the issues outlined in my
> > > introductory mail all the time. These issues are caused by emulating an
> > > IPC system over AF_UNIX in userspace.
> > 
> > Nothing in your introductory statements indicate how your requirements 
> > can't be met through a hybrid socket + shared memory solution.  The IPC 
> > facilities of the kernel are already quite rich, and sufficient for 
> > building many kinds of complex systems.  What's so different about DBus' 
> > requirements?
> 
> dbus wants to
> - multicast
> - pass file handles
> - never lose an event
> - be fast
> - have a security model
> 
> The security model makes a shared memory hack impractical, the file
> handle passing means at least some of it needs to be AF_UNIX. The event
> loss handling/speed argue for putting it in kernel.

Thankyou for making this point more eloquently than I had previously
been able to.

> 
> I'm not convinced AF_BUS entirely sorts this either. In particular the
> failure case dbus currently has to handle for not losing events allows it
> to identify who in a "group" has jammed the bus by not listening (eg by
> locking up). This information appears to be lost in the AF_BUS case and
> that's slightly catastrophic for error recovery.
> 

The strategy the existing AF_UNIX D-Bus daemon implements is simply to
have huge queues and thus rarely encounters the situation. When It
does the bus daemon crafts an error message as a reply to the sender.

The AF_BUS solution is more direct in that the sender gets either
EAGAIN for a direct send or EPOLLOUT from poll. Whatever the response
the sender can use this information to implement a userspace policy
decision.

Your feedback sparked a discussion and we have considered this in more
depth and propose implementing a userspace policy of:

 - sending a message to the bus master and let it "deal" with the
   blocking client.

 - The bus master might choose to isolate the offending client or
    perhaps even cause a service restart etc. 

   The bus master is a privileged client and has state information
    about the bus allowing an optimal decision. Though we intend to
    add a socket option to query the queue lengths so it can make a
    better decisions.

   Regardless this is all userspace policy for the D-Bus client
    library / bus master daemon which I believe addresses David Miller's
    concerns about such decisions being made in userspace.

--
Best Regards 
Vincent Sanders <vincent.sanders@collabora.co.uk>

^ permalink raw reply

* Re: [PATCH 00/12] Swap-over-NFS without deadlocking V8
From: Mel Gorman @ 2012-07-02 14:35 UTC (permalink / raw)
  To: Eric B Munson
  Cc: Andrew Morton, Linux-MM, Linux-Netdev, Linux-NFS, LKML,
	David Miller, Trond Myklebust, Neil Brown, Christoph Hellwig,
	Peter Zijlstra, Mike Christie, Sebastian Andrzej Siewior
In-Reply-To: <20120701172254.GB2470@mgebm.net>

On Sun, Jul 01, 2012 at 01:22:54PM -0400, Eric B Munson wrote:
> On Fri, 29 Jun 2012, Mel Gorman wrote:
> 
> > Changelog since V7
> >   o Rebase to linux-next 20120629
> >   o bi->page_dma instead of bi->page in intel driver
> >   o Build fix for !CONFIG_NET					(sebastian)
> >   o Restore PF_MEMALLOC flags correctly in all cases		(jlayton)
> > 
> > Changelog since V6
> >   o Rebase to linux-next 20120622
> > 
> > Changelog since V5
> >   o Rebase to v3.5-rc3
> > 
> > Changelog since V4
> >   o Catch if SOCK_MEMALLOC flag is cleared with rmem tokens	(davem)
> > 
> > Changelog since V3
> >   o Rebase to 3.4-rc5
> >   o kmap pages for writing to swap				(akpm)
> >   o Move forward declaration to reduce chance of duplication	(akpm)
> > 
> > Changelog since V2
> >   o Nothing significant, just rebases. A radix tree lookup is replaced with
> >     a linear search would be the biggest rebase artifact
> > 
> > This patch series is based on top of "Swap-over-NBD without deadlocking v14"
> > as it depends on the same reservation of PF_MEMALLOC reserves logic.
> > 
> > When a user or administrator requires swap for their application, they
> > create a swap partition and file, format it with mkswap and activate it with
> > swapon. In diskless systems this is not an option so if swap if required
> > then swapping over the network is considered.  The two likely scenarios
> > are when blade servers are used as part of a cluster where the form factor
> > or maintenance costs do not allow the use of disks and thin clients.
> > 
> > The Linux Terminal Server Project recommends the use of the Network
> > Block Device (NBD) for swap but this is not always an option.  There is
> > no guarantee that the network attached storage (NAS) device is running
> > Linux or supports NBD. However, it is likely that it supports NFS so there
> > are users that want support for swapping over NFS despite any performance
> > concern. Some distributions currently carry patches that support swapping
> > over NFS but it would be preferable to support it in the mainline kernel.
> > 
> > Patch 1 avoids a stream-specific deadlock that potentially affects TCP.
> > 
> > Patch 2 is a small modification to SELinux to avoid using PFMEMALLOC
> > 	reserves.
> > 
> > Patch 3 adds three helpers for filesystems to handle swap cache pages.
> > 	For example, page_file_mapping() returns page->mapping for
> > 	file-backed pages and the address_space of the underlying
> > 	swap file for swap cache pages.
> > 
> > Patch 4 adds two address_space_operations to allow a filesystem
> > 	to pin all metadata relevant to a swapfile in memory. Upon
> > 	successful activation, the swapfile is marked SWP_FILE and
> > 	the address space operation ->direct_IO is used for writing
> > 	and ->readpage for reading in swap pages.
> > 
> > Patch 5 notes that patch 3 is bolting
> > 	filesystem-specific-swapfile-support onto the side and that
> > 	the default handlers have different information to what
> > 	is available to the filesystem. This patch refactors the
> > 	code so that there are generic handlers for each of the new
> > 	address_space operations.
> > 
> > Patch 6 adds an API to allow a vector of kernel addresses to be
> > 	translated to struct pages and pinned for IO.
> > 
> > Patch 7 adds support for using highmem pages for swap by kmapping
> > 	the pages before calling the direct_IO handler.
> > 
> > Patch 8 updates NFS to use the helpers from patch 3 where necessary.
> > 
> > Patch 9 avoids setting PF_private on PG_swapcache pages within NFS.
> > 
> > Patch 10 implements the new swapfile-related address_space operations
> > 	for NFS and teaches the direct IO handler how to manage
> > 	kernel addresses.
> > 
> > Patch 11 prevents page allocator recursions in NFS by using GFP_NOIO
> > 	where appropriate.
> > 
> > Patch 12 fixes a NULL pointer dereference that occurs when using
> > 	swap-over-NFS.
> > 
> > With the patches applied, it is possible to mount a swapfile that is on an
> > NFS filesystem. Swap performance is not great with a swap stress test taking
> > roughly twice as long to complete than if the swap device was backed by NBD.
> 
> To test this set I am using memory cgroups to force swap usage.  I am seeing
> the cgroup controller killing my processes instead of using the nfs swapfile.
> 

How sure are you that this is not a cgroup bug? For dirty file data on some
kernels, cgroups can prematurely kill processes if pages are not being
cleaned fast enough. I would not expect the same problem for anonymous
pages but it's worth considering. Please also test with a normal swapfile.

If OOM is disabled and the process hangs, try capturing a sysrq+t and
see where the process is stuck.

-- 
Mel Gorman
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH net-next] em_canid: Ematch rule to match CAN frames according to their identifiers
From: Rostislav Lisovy @ 2012-07-02 14:12 UTC (permalink / raw)
  To: Oliver Hartkopp; +Cc: netdev, linux-can, lartc, pisa, sojkam1
In-Reply-To: <4FEDCD42.8010203@hartkopp.net>

Hello Oliver;

On Fri, 2012-06-29 at 17:44 +0200, Oliver Hartkopp wrote: 
> What about a zero length check here?
> 
> 	if (!len)
> 		return -EINVAL;
> 

> 
> The length could alternatively be checked here too
> 
> http://lxr.linux.no/#linux+v3.4.4/net/sched/ematch.c#L235
> 
> if em->ops->datalen is set.
> 
> But here's no
> 
> 	.datalen = sizeof(struct can_filter),
> 
> defined, right?
> 

The main reason I didn't define the tcf_ematch_ops.datalen field is
because the documentation says it is "length of expected configuration
data" (not "minimal").
For the sake of possible future changes of the built-in length checking,
I will do the check by myself -- I will add the zero-length check (at
least all checks will be at the same place).

Regards;
Rostislav

^ permalink raw reply

* Re: [RFC] [TCP 0/3] Receive from socket into bio without copying
From: Eric Dumazet @ 2012-07-02 13:54 UTC (permalink / raw)
  To: Andreas Gruenbacher; +Cc: netdev, linux-kernel, Herbert Xu, David S. Miller
In-Reply-To: <1341234177.29646.69.camel@gurkel.linbit>

On Mon, 2012-07-02 at 15:02 +0200, Andreas Gruenbacher wrote:

> bio_vec's have some alignment requirements that must be met, and
> anything that doesn't meet those requirements can't be passed to the
> block layer (without copying it first).  Additional layers between the
> network and block layers, like a pipe, won't make that problem go away.
> 

What are the "some alignment requirements" exactly, and how do you use
TCP exactly to meet them ? (MSS= multiple of 512 ?)

I believe you try to escape from the real problem.

If the NIC driver provides non aligned data, neither splice() or your
new stuff will magically align it. You _need_ a copy in either cases.

If NIC driver provides aligned data, splice(socket -> pipe) will keep
this alignment for you at 0 cost.

> It's not already there, it requires the alignment issue to be addresses
> first.

There is no guarantee TCP payload is aligned to a bio, ever, in linux
ethernet/ip/tcp stack.

Really, your patches work for you, by pure luck, because you use one
particular NIC driver that happens to prepare things for you (presumably
doing header split). Nothing guarantee this wont change even for the
same hardware in linux-3.8

So I will just say no to your patches, unless you demonstrate the
splice() problems, and how you can fix the alignment problem in a new
layer instead of in the existing zero copy standard one.

^ permalink raw reply

* зайчик мой
From: Миленка Мясникова @ 2012-07-02  8:37 UTC (permalink / raw)
  To: netdev

здаров.!)) 
если скучаешь и есть желание подружиться, www.newpol.by/show_photos.php через поиск ищи..!) 

^ permalink raw reply

* Re: [RFC] [TCP 0/3] Receive from socket into bio without copying
From: saeed bishara @ 2012-07-02 13:39 UTC (permalink / raw)
  To: Andreas Gruenbacher
  Cc: Eric Dumazet, netdev, linux-kernel, Herbert Xu, David S. Miller
In-Reply-To: <1341229532.29646.39.camel@gurkel.linbit>

On Mon, Jul 2, 2012 at 2:45 PM, Andreas Gruenbacher <agruen@linbit.com> wrote:

>
> We want to go directly to the block layer instead.  This requires that
> the network hardware receives the data into sector aligned buffers.
> Hence the proposed MSG_NEW_PACKET flag.
Andreas,
 I didn't read your patches, but what are you looking for can't be
achieved using "normal" NICs, for that you need to use RDMA protocol
with a hardware that supports RDMA.

saeed

^ permalink raw reply

* Re: [PATCH net-next 09/10] net/mlx4_en: Manage flow steering rules with ethtool
From: Or Gerlitz @ 2012-07-02 13:32 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: davem, roland, yevgenyp, oren, netdev, Hadar Hen Zion, Amir Vadai
In-Reply-To: <1341158452.4852.107.camel@deadeye.wl.decadent.org.uk>

On 7/1/2012 7:00 PM, Ben Hutchings wrote:
>> static int mlx4_en_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd,
>> >  			     u32 *rule_locs)
>> >  {
>> >  	struct mlx4_en_priv *priv = netdev_priv(dev);
>> >+	struct mlx4_en_dev *mdev = priv->mdev;
>> >  	int err = 0;
>> >+	int i = 0, priority = 0;
>> >+
>> >+	if (mdev->dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED)
>> >+		return -EOPNOTSUPP;
>> >  
>> >  	switch (cmd->cmd) {
>> >  	case ETHTOOL_GRXRINGS:
>> >  		cmd->data = priv->rx_ring_num;
>> >  		break;
>> >+	case ETHTOOL_GRXCLSRLCNT:
>> >+		cmd->rule_cnt = mlx4_en_get_num_flows(priv);
>> >+		break;
>> >+	case ETHTOOL_GRXCLSRULE:
>> >+		err = mlx4_en_get_flow(dev, cmd, cmd->fs.location);
>> >+		break;
>> >+	case ETHTOOL_GRXCLSRLALL:
>> >+		while (!err || err == -ENOENT) {
>> >+			err = mlx4_en_get_flow(dev, cmd, i);
>> >+			if (!err)
>> >+				((u32 *)(rule_locs))[priority++] = i;
> I don't see any range check against cmd->rule_cnt.

OK, will fix to make sure we don't cross cmd->rule_cnt

>
>
> Why are you casting rule_locs?

doesn't seem to be needed, will remove that casting

>
>
> Also, are the rules really prioritised by location, so that if rule 0
> and 1 match a packet then only rule 0 is applied?  If they are actually
> prioritised by the match type then you need to assign locations on the
> driver side that reflect the actual priorities.


Rules are prioritized by a scheme made of "domain" X location, see below 
and in patch #6
the MLX4_DOMAIN_yyy entries, higher domain value means lower priority, 
so for instance rules
placed by ethtool would have higher priority over rules added by the L2 
NIC  or by future RFS
patch. Within a domain, the location matters.

You can see that simple L2 rules (e.g contain dest-mac, possibly vlan) 
added by the driver
use the NIC domain, wheres those added to serve ethtool commands use the 
ETHTOOL one.

Within the ethtool domain, we maintain the priority as the location 
specified by the user, hope this explains
things.

> +enum {
> +	MLX4_DOMAIN_UVERBS	= 0x1000,
> +	MLX4_DOMAIN_ETHTOOL     = 0x2000,
> +	MLX4_DOMAIN_RFS         = 0x3000,
> +	MLX4_DOMAIN_NIC    = 0x5000,
> +};

>> >+			i++;
>> >+		}
>> >+		if (priority)
>> >+			err = 0;
> [...]
>
> But if there are no rules defined, this is an error?  That's not right.
> I think you should unconditionally set err = 0 here.

OK, will do.

Or.

^ permalink raw reply

* Re: [PATCH net-next 06/10] {NET,IB}/mlx4: Add device managed flow steering firmware API
From: Or Gerlitz @ 2012-07-02 13:00 UTC (permalink / raw)
  To: David Miller; +Cc: roland, yevgenyp, oren, netdev, hadarh, Amir Vadai
In-Reply-To: <20120702.013626.13785263551119133.davem@davemloft.net>

On 7/2/2012 11:36 AM, David Miller wrote:
> So you must create a real generic interface that other chipsets in similar situations can hook into as well.

OK, understood.

We will remove the module param from the patch set, such that at this point
of submission,  there's no run time setting for the flow steering hash 
function.

Or.

^ permalink raw reply

* Re: [RFC] [TCP 0/3] Receive from socket into bio without copying
From: Andreas Gruenbacher @ 2012-07-02 13:02 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, linux-kernel, Herbert Xu, David S. Miller
In-Reply-To: <1341232568.22621.8.camel@edumazet-glaptop>

On Mon, 2012-07-02 at 14:36 +0200, Eric Dumazet wrote:
> No files or page cache are needed for splice() usage, for example from
> socket to another socket.
> 
> It just works (check haproxy for an example), with 10Gb performance out
> of the box.

bio_vec's have some alignment requirements that must be met, and
anything that doesn't meet those requirements can't be passed to the
block layer (without copying it first).  Additional layers between the
network and block layers, like a pipe, won't make that problem go away.

> The pipe is only a container for buffers, in case the data fetched from
> producer cannot be fully sent to consumer. You don't want to lose this
> data.

Stuff that isn't pulled out of a socket receive buffer will stay there,
it won't magically be lost.

> > We want to go directly to the block layer instead.  This requires that
> > the network hardware receives the data into sector aligned buffers.
> > Hence the proposed MSG_NEW_PACKET flag.
> 
> This only is a hint something is wrong with the approach.

It just means that I'm trying to do something that isn't currently
supported.

> You only need proper splice() support (from pipe to bio), if not already
> there.

It's not already there, it requires the alignment issue to be addresses
first.

Andreas

^ permalink raw reply

* Re: [PATCH net-next 09/10] net/mlx4_en: Manage flow steering rules with ethtool
From: Or Gerlitz @ 2012-07-02 12:57 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: davem, roland, Yevgeny Petrilin, Oren Duer, netdev,
	Hadar Hen Zion, Amir Vadai
In-Reply-To: <1341158452.4852.107.camel@deadeye.wl.decadent.org.uk>

On 7/1/2012 7:00 PM, Ben Hutchings wrote:
[...]

Hi Ben,

Thanks for the detailed feedback, see below some responses


> +		l4_mask = &cmd->fs.m_u.tcp_ip4_spec;
> +		/* don't allow mask which isn't all 0 or 1 */
> +		if (not_all_zeros_or_all_ones(l4_mask->ip4src, __be32) ||
> +		    not_all_zeros_or_all_ones(l4_mask->ip4dst, __be32) ||
> +		    not_all_zeros_or_all_ones(l4_mask->psrc, __be16) ||
> +		    not_all_zeros_or_all_ones(l4_mask->pdst, __be16))
> +			return -EOPNOTSUPP;
>
> Again, here and in many further instances, the error code should be EINVAL.


OK, will fix to use EINVAL instead of EOPNOTSUPP here and else-where needed.


> +
> +static void add_ip_rule(struct mlx4_en_priv *priv,
> +			struct ethtool_rxnfc *cmd,
> +			struct list_head *list_h)
> +{
> +	struct mlx4_spec_list *spec_l3;
> +
> +	spec_l3 = kzalloc(sizeof *spec_l3, GFP_KERNEL);
> +	if (!spec_l3) {
> +		en_err(priv, "Fail to alloc ethtool rule.\n");
> +		return;
> +	}
>
> This should return an error code as well - logging is not a substitue.

OK, will do.

>
>
>> +	spec_l3->id = MLX4_NET_TRANS_RULE_ID_IPV4;
>> +	spec_l3->ipv4.src_ip = cmd->fs.h_u.usr_ip4_spec.ip4src;
>> +	if (spec_l3->ipv4.src_ip)
>> +		spec_l3->ipv4.src_ip_msk = EN_ETHTOOL_WORD_MASK;
>> +	spec_l3->ipv4.dst_ip = cmd->fs.h_u.usr_ip4_spec.ip4dst;
>> +	if (spec_l3->ipv4.dst_ip)
>> +		spec_l3->ipv4.dst_ip_msk = EN_ETHTOOL_WORD_MASK;
>
> The conditions should be using the mask (cmd->fs.m_u) not the value.

I'd like to clarify things here a bit - the way the code is written, is to

1st make sure that we can deal with the mask provided by the user in the 
ethtool
command, e.g its all zeroes or all ones (leaving a side for a minute the 
other
discussion on how would be best to impl. that check...) - this check is 
done
in mlx4_en_validate_flow

2nd initialize mlx4 flow spec which is all empty - see calls to kzalloc 
spec_l2/l3/l4

3rd import the non-zero values (not masks) from the ethtool command into 
the mlx4
flow specs, with FULL mask


Under this logic, we can use the values and not the masks, isn't that?



>
>
>> +static void add_tcp_udp_rule(struct mlx4_en_priv *priv,
>> +			     struct ethtool_rxnfc *cmd,
>> +			     struct list_head *list_h, int proto)
>> +{
>> +	struct mlx4_spec_list *spec_l3;
>> +	struct mlx4_spec_list *spec_l4;
>> +
>> +	spec_l3 = kzalloc(sizeof *spec_l3, GFP_KERNEL);
>> +	spec_l4 = kzalloc(sizeof *spec_l4, GFP_KERNEL);
>> +	if (!spec_l4 || !spec_l3) {
>> +		en_err(priv, "Fail to alloc ethtool rule.\n");
>
> If one of them was successfully allocated, it will now be leaked.

THANKS, will fix.

>> +static int mlx4_en_ethtool_to_net_trans_rule(struct net_device *dev,
>> +					     struct ethtool_rxnfc *cmd,
>> +					     struct list_head *rule_list_h)
>> +{
>> +	int err;
>> +	u64 mac;
>> +	__be64 be_mac;
>> +	struct ethhdr *eth_spec;
>> +	struct mlx4_en_priv *priv = netdev_priv(dev);
>> +	struct mlx4_spec_list *spec_l2;
>> +	__be64 mac_msk = cpu_to_be64(EN_ETHTOOL_MAC_MASK << 16);
>> +
>> +	err = mlx4_en_validate_flow(dev, cmd);
>> +	if (err)
>> +		return err;
>> +
>> +	spec_l2 = kzalloc(sizeof *spec_l2, GFP_KERNEL);
>> +	if (!spec_l2)
>> +		return -ENOMEM;
>> +
>> +	mac = priv->mac & EN_ETHTOOL_MAC_MASK;
>> +	be_mac = cpu_to_be64(mac << 16);
>> +
>> +	spec_l2->id = MLX4_NET_TRANS_RULE_ID_ETH;
>> +	memcpy(spec_l2->eth.dst_mac_msk, &mac_msk, ETH_ALEN);
>> +	if ((cmd->fs.flow_type & ~FLOW_EXT) != ETHER_FLOW)
>> +		memcpy(spec_l2->eth.dst_mac, &be_mac, ETH_ALEN);
>
> Does the hardware require filtering by MAC address and not only by layer 3/4 addresses?

YES

>
>
>> +	if ((cmd->fs.flow_type & FLOW_EXT) && cmd->fs.m_ext.vlan_tci) {
>> +		spec_l2->eth.vlan_id = cmd->fs.h_ext.vlan_tci;
>> +		spec_l2->eth.vlan_id_msk = cpu_to_be16(0xfff);
>
> This doesn't match mlx4_en_validate_flow(); you are replacing a mask of
> 0xffff with 0xfff.

I need to check how exactly this should be done here, vlan ID is only 12 
bits in size, so this is
probably the source for the 0xfff vs 0xffff


>
>
>> +	switch (cmd->fs.flow_type & ~FLOW_EXT) {
>> +	case ETHER_FLOW:
>> +		eth_spec = &cmd->fs.h_u.ether_spec;
>> +		memcpy(&spec_l2->eth.dst_mac, eth_spec->h_dest, ETH_ALEN);
>> +		spec_l2->eth.ether_type = eth_spec->h_proto;
>> +		if (eth_spec->h_proto)
>> +			spec_l2->eth.ether_type_enable = 1;
>> +		break;
>> +	case IP_USER_FLOW:
>> +		add_ip_rule(priv, cmd, rule_list_h);
>> +		break;
>> +	case TCP_V4_FLOW:
>> +		add_tcp_udp_rule(priv, cmd, rule_list_h, TCP_V4_FLOW);
>> +		break;
>> +	case UDP_V4_FLOW:
>> +		add_tcp_udp_rule(priv, cmd, rule_list_h, UDP_V4_FLOW);
>> +		break;
>
> All those functions need to be able to return errors.


OK

^ permalink raw reply

* Re: [RFC] [TCP 0/3] Receive from socket into bio without copying
From: Eric Dumazet @ 2012-07-02 12:36 UTC (permalink / raw)
  To: Andreas Gruenbacher; +Cc: netdev, linux-kernel, Herbert Xu, David S. Miller
In-Reply-To: <1341229532.29646.39.camel@gurkel.linbit>

On Mon, 2012-07-02 at 13:45 +0200, Andreas Gruenbacher wrote:
> On Fri, 2012-06-29 at 17:08 +0200, Eric Dumazet wrote:
> > This looks like yet another zero copy, needing another couple of hundred
> > of lines.
> 
> Kind of, yes.  We really want to make no copies at all though; the cpu
> just passes buffers from one device to the other.
> 
> > Why splice infrastructure doesnt fit your needs ?
> 
> The pipe api that splice is based on saves a copy between the kernel and
> user space, but it currently writes to files, going through the page
> cache.  For that, the alignment of data in the network receive buffers
> doesn't matter.
> 

No files or page cache are needed for splice() usage, for example from
socket to another socket.

It just works (check haproxy for an example), with 10Gb performance out
of the box.

The pipe is only a container for buffers, in case the data fetched from
producer cannot be fully sent to consumer. You don't want to lose this
data.

> We want to go directly to the block layer instead.  This requires that
> the network hardware receives the data into sector aligned buffers.
> Hence the proposed MSG_NEW_PACKET flag.
> 

This only is a hint something is wrong with the approach.

> With that, it might be possible to implement a pipe "sink" that goes to
> a bio instead of writing to a file.  Going through the pipe
> infrastructure doesn't actually help in this case though, it's just
> overhead.

There is no expensive overhead in splice() infrastructure, only some
small details that should be eventually solved instead of designing a
new zero copy mode.

You didnt actually tried splice() if you believe a regular file is
needed.

You only need proper splice() support (from pipe to bio), if not already
there.

^ permalink raw reply

* Re: [PATCH v6] sctp: be more restrictive in transport selection on bundled sacks
From: Neil Horman @ 2012-07-02 12:25 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, vyasevich, linux-sctp
In-Reply-To: <20120701234424.GA23935@neilslaptop.think-freely.org>

On Sun, Jul 01, 2012 at 07:44:25PM -0400, Neil Horman wrote:
> On Sun, Jul 01, 2012 at 02:43:19PM -0700, David Miller wrote:
> > From: Neil Horman <nhorman@tuxdriver.com>
> > Date: Sun, 1 Jul 2012 08:47:50 -0400
> > 
> > > Perhaps we could modify the SubmittingPatches document to indicate that an
> > > Acked-by from a subsystem maintainer implicitly confers authority on the
> > > upstream receiver to request reasonable stylistic changes that don't affect the
> > > functionality of the patch in the interests of maintaining coding conventions.
> > 
> > Yes, that would make sense.
> > 
> 
> 
> I'll propose it in a few days.
> Neil
> 
How does this language sound to you?


diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches
index c379a2a..1eaaeb1 100644
--- a/Documentation/SubmittingPatches
+++ b/Documentation/SubmittingPatches
@@ -414,6 +414,16 @@ the part which affects that maintainer's code.  Judgement should be used here.
 When in doubt people should refer to the original discussion in the mailing
 list archives.
 
+Note that an Acked-by: From a subsystem maintainer on a given patch confers
+upon the tree maintainer integrating the path the authority to carry those Acks
+forward through subsequent versions of a patch, as long as those versions do not
+significantly impact the functionality of the patch.  For example, say the isdn
+subsystem maintainer sends an Acked-by: on version 1 of a patch bound for the
+networking tree.  The networking maintainer then requests that some comments in
+the code be modified to comply with the CodingStyle document.  The networking
+tree maintanier may reapply the subsystem maintainers Acked-by: to the new
+version as no significant changes were made to the patch functionality.
+
 If a person has had the opportunity to comment on a patch, but has not
 provided such comments, you may optionally add a "Cc:" tag to the patch.
 This is the only tag which might be added without an explicit action by the

^ permalink raw reply related

* RE: [PATCH net-next 09/10] net/mlx4_en: Manage flow steering rules with ethtool
From: David Laight @ 2012-07-02 12:15 UTC (permalink / raw)
  To: Andreas Schwab
  Cc: Joe Perches, Ben Hutchings, Or Gerlitz, davem, roland, yevgenyp,
	oren, netdev, Hadar Hen Zion
In-Reply-To: <m28vf2o0oa.fsf@igel.home>

 
> "David Laight" <David.Laight@ACULAB.COM> writes:
> 
> >  
> >> > Or write it as (!field || !(typeof(field))~field) which more
closely
> >> > resembles what the macro name expresses.
> >> 
> >> Better still, or maybe:
> >> 
> >> 	field == 0 || field == (typeof field)~0
> >
> > Which doesn't work when sizeof(field) > sizeof(int).
> > Needs another cast.
> >
> > 	field == 0 || field == (typeof field)~(typeof field)0
> 
> You can avoid that by using (typeof field)-1.

Gah, I thought I knew the integral promotion rules!
-1 and ~0 are both 'integer' and get treated the same.

A quick test shows that gcc does sign extend when converting
32bit int to 64bit unsigned long long.
Which probably means that is required by the standard!

	David

^ permalink raw reply

* Re: [RFC] [TCP 0/3] Receive from socket into bio without copying
From: Andreas Gruenbacher @ 2012-07-02 11:45 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, linux-kernel, Herbert Xu, David S. Miller
In-Reply-To: <1340982523.21162.1.camel@edumazet-glaptop>

On Fri, 2012-06-29 at 17:08 +0200, Eric Dumazet wrote:
> This looks like yet another zero copy, needing another couple of hundred
> of lines.

Kind of, yes.  We really want to make no copies at all though; the cpu
just passes buffers from one device to the other.

> Why splice infrastructure doesnt fit your needs ?

The pipe api that splice is based on saves a copy between the kernel and
user space, but it currently writes to files, going through the page
cache.  For that, the alignment of data in the network receive buffers
doesn't matter.

We want to go directly to the block layer instead.  This requires that
the network hardware receives the data into sector aligned buffers.
Hence the proposed MSG_NEW_PACKET flag.

With that, it might be possible to implement a pipe "sink" that goes to
a bio instead of writing to a file.  Going through the pipe
infrastructure doesn't actually help in this case though, it's just
overhead.

Thanks,
Andreas

^ permalink raw reply

* Re: [PATCH net-next 09/10] net/mlx4_en: Manage flow steering rules with ethtool
From: Andreas Schwab @ 2012-07-02 11:35 UTC (permalink / raw)
  To: David Laight
  Cc: Joe Perches, Ben Hutchings, Or Gerlitz, davem, roland, yevgenyp,
	oren, netdev, Hadar Hen Zion
In-Reply-To: <AE90C24D6B3A694183C094C60CF0A2F6026B6F70@saturn3.aculab.com>

"David Laight" <David.Laight@ACULAB.COM> writes:

>  
>> > Or write it as (!field || !(typeof(field))~field) which more closely
>> > resembles what the macro name expresses.
>> 
>> Better still, or maybe:
>> 
>> 	field == 0 || field == (typeof field)~0
>
> Which doesn't work when sizeof(field) > sizeof(int).
> Needs another cast.
>
> 	field == 0 || field == (typeof field)~(typeof field)0

You can avoid that by using (typeof field)-1.

Andreas.

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 58CA 54C7 6D53 942B 1756  01D3 44D5 214B 8276 4ED5
"And now for something completely different."

^ permalink raw reply

* [PATCH] NFC: Don't warn when closing uninitialized socket
From: Sasha Levin @ 2012-07-02 11:31 UTC (permalink / raw)
  To: lauro.venancio-430g2QfJUUCGglJvpFV4uA,
	aloisio.almeida-430g2QfJUUCGglJvpFV4uA,
	sameo-VuQAYsv1563Yd54FQh9/CA, linville-2XuSBdqkA4R54TAoqtyWWQ
  Cc: linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Sasha Levin

It is possible to close a NFC socket without initializing it first,
there's no need to warn about it.

Signed-off-by: Sasha Levin <levinsasha928-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
---
 net/nfc/llcp/llcp.c |    2 --
 1 files changed, 0 insertions(+), 2 deletions(-)

diff --git a/net/nfc/llcp/llcp.c b/net/nfc/llcp/llcp.c
index 5d503ee..cf84544 100644
--- a/net/nfc/llcp/llcp.c
+++ b/net/nfc/llcp/llcp.c
@@ -118,8 +118,6 @@ static void local_release(struct kref *ref)
 
 int nfc_llcp_local_put(struct nfc_llcp_local *local)
 {
-	WARN_ON(local == NULL);

^ permalink raw reply related

* [PATCH v3] ieee802154: verify packet size before trying to allocate it
From: Sasha Levin @ 2012-07-02 11:29 UTC (permalink / raw)
  To: dbaryshkov-Re5JQEeQqe8AvxtiuMwx3w, slapin-9cOl001CZnBAfugRpC6u6w,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-zigbee-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Sasha Levin

Currently when sending data over datagram, the send function will attempt to
allocate any size passed on from the userspace.

We should make sure that this size is checked and limited. We'll limit it
to the MTU of the device, which is checked later anyway.

Signed-off-by: Sasha Levin <levinsasha928-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
---
 net/ieee802154/dgram.c |   12 ++++++------
 1 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/net/ieee802154/dgram.c b/net/ieee802154/dgram.c
index 6fbb2ad..1670561 100644
--- a/net/ieee802154/dgram.c
+++ b/net/ieee802154/dgram.c
@@ -230,6 +230,12 @@ static int dgram_sendmsg(struct kiocb *iocb, struct sock *sk,
 	mtu = dev->mtu;
 	pr_debug("name = %s, mtu = %u\n", dev->name, mtu);
 
+	if (size > mtu) {
+		pr_debug("size = %Zu, mtu = %u\n", size, mtu);
+		err = -EINVAL;
+		goto out_dev;
+	}
+
 	hlen = LL_RESERVED_SPACE(dev);
 	tlen = dev->needed_tailroom;
 	skb = sock_alloc_send_skb(sk, hlen + tlen + size,
@@ -258,12 +264,6 @@ static int dgram_sendmsg(struct kiocb *iocb, struct sock *sk,
 	if (err < 0)
 		goto out_skb;
 
-	if (size > mtu) {
-		pr_debug("size = %Zu, mtu = %u\n", size, mtu);
-		err = -EINVAL;
-		goto out_skb;
-	}
-
 	skb->dev = dev;
 	skb->sk  = sk;
 	skb->protocol = htons(ETH_P_IEEE802154);
-- 
1.7.8.6


------------------------------------------------------------------------------
Live Security Virtual Conference
Exclusive live event will cover all the ways today's security and 
threat landscape has changed and how IT managers can respond. Discussions 
will include endpoint security, mobile security and the latest in malware 
threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/

^ permalink raw reply related

* Re: one pci_id missing in sky2 driver
From: Mirko Lindner @ 2012-07-02 11:15 UTC (permalink / raw)
  To: Xose Vazquez Perez; +Cc: netdev@vger.kernel.org, shemminger@vyatta.com
In-Reply-To: <4FEEDE8D.1000403@gmail.com>

On Sat, 2012-06-30 at 04:10 -0700, Xose Vazquez Perez wrote:
> Mirko Lindner wrote:
> 
>  > On Sunday, April 01, 2012 12:22:35 PM Stephen Hemminger wrote:
> 
>  >> I would like to use this opportunity to have the developers at Marvell, test
>  >> and submit this. They have the hardware (I don't) and often new chips
>  >> require other special tweaks.  Marvell expressed interest in taking over
>  >> maintaining the sky2 driver, this would be a good first step.
> 
> > Thanks Stephen. The ID belongs to our newest device. We'll include the code
> > into the driver and send a patch to the list as soon the driver has passed our
> > internal tests.
> 
> Mirko, any news on this ?

Sorry for the late reply. The patch will be ready by the end of this
week.

Mirko

^ permalink raw reply

* Re: [RFC] [TCP 1/3] tcp: Add MSG_NEW_PACKET flag to indicate preferable packet boundaries
From: Andreas Gruenbacher @ 2012-07-02 11:00 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, linux-kernel, Herbert Xu, David S. Miller
In-Reply-To: <1340990157.21162.5.camel@edumazet-glaptop>

On Fri, 2012-06-29 at 19:15 +0200, Eric Dumazet wrote:
> On Fri, 2012-06-29 at 17:38 +0200, Andreas Gruenbacher wrote:
> 
> > The primary use case is fast Gigabit (10 or more) Ethernet connections
> > with jumbo frames and switches that support them.  There, frames will go
> > through unchanged and you can zero-copy receive all the time.
> > 
> > Not sure how well the approach scales to other kinds of connections; it
> > may work often enough to be worth it.  When things get distorted between
> > the sender and the receiver and tcp_recvbio() fails, the data can still
> > be copied out of the socket as before.
> 
> If you have a packet loss, receiver can and will coalesce frames.

That's alright as long as we'll get "back to normal" eventually; the
only effect will be that we'll copy data out of the socket receive
buffers for a while.  There will be extremely little packet loss on the
kinds of networks that we want to use this on.

Thanks,
Andreas

^ permalink raw reply

* Re: [PATCH 0/5] rtcache remove respin
From: David Miller @ 2012-07-02 10:59 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1341225841.5269.69.camel@edumazet-glaptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 02 Jul 2012 12:44:01 +0200

> On Sun, 2012-07-01 at 05:02 -0700, David Miller wrote:
>> It's been a while and there were of course a lot of merge hassles with
>> the most recent set I posted, so I respun these patches tonight
>> because I wanted to see the effects of the recent rpfilter hacks on an
>> rtcache-less system.
>> 
>> On a SPARC T3-1:
>> 
>> 1) Output route lookup: ~2800 cycles
>> 2) Input route lookups: ~3000 cycles (rpfilter=0)
>>                         ~4300 cycles (rpfilter=1)
>> 
>> Another nice part is how small struct rtable is after this patch set:
>> 
>> struct rtable {
>> 	struct dst_entry        dst;
>> 	int                     rt_genid;
>> 	unsigned int            rt_flags;
>> 	__u16                   rt_type;
>> 	__be32                  rt_dst;
>> 	int                     rt_route_iif;
>> 	int                     rt_iif;
>> 	int                     rt_oif;
>> 	__be32                  rt_gateway;
>> 	u32                     rt_peer_genid;
>> 	unsigned long           _peer;
>> 	struct fib_info         *fi;
>> };
>> 
>> which is about 208 bytes on sparc64.
>> 
>> Signed-off-by: David S. Miller <davem@davemloft.net>
> 
> Can be <= 192 actually
> 
> rcu_head not needed anymore in dst_entry
> 
> If we still want __refcnt being on cache line boundary, we might find a
> better way to accomplish this.

Once we can actually check something like this in, rt_dst is
guarenteed to be eliminated as well.

The dst neighbour pointer will also be gone.

So lots of shrinking still to go and yes we'll need to reposition
that __refcnt member carefully.

^ permalink raw reply

* Re: [patch] [SCSI] bnx2i: use strlcpy() instead of memcpy() for strings
From: Dan Carpenter @ 2012-07-02 10:48 UTC (permalink / raw)
  To: David Laight
  Cc: James E.J. Bottomley, Barak Witkowski, Eddie Wai, Michael Chan,
	linux-scsi, netdev, David S. Miller
In-Reply-To: <AE90C24D6B3A694183C094C60CF0A2F6026B6F6F@saturn3.aculab.com>

On Mon, Jul 02, 2012 at 11:09:19AM +0100, David Laight wrote:
> > Subject: [patch] [SCSI] bnx2i: use strlcpy() instead of memcpy() for
> strings
> > 
> > DRV_MODULE_VERSION here is "2.7.2.2" which is only 8 chars but we copy
> > 12 bytes from the stack so it's a small information leak.
> > 
> > Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
> > ---
> > This was just added to linux-next yesterday, but I'm not sure 
> > which tree it came from.
> > 
> > diff --git a/drivers/scsi/bnx2i/bnx2i_init.c 
> > b/drivers/scsi/bnx2i/bnx2i_init.c
> > index 7729a52..b17637a 100644
> > --- a/drivers/scsi/bnx2i/bnx2i_init.c
> > +++ b/drivers/scsi/bnx2i/bnx2i_init.c
> > @@ -400,7 +400,7 @@ int bnx2i_get_stats(void *handle)
> >  	if (!stats)
> >  		return -ENOMEM;
> >  
> > -	memcpy(stats->version, DRV_MODULE_VERSION,
> sizeof(stats->version));
> > +	strlcpy(stats->version, DRV_MODULE_VERSION,
> sizeof(stats->version));
> >  	memcpy(stats->mac_add1 + 2, hba->cnic->mac_addr, ETH_ALEN);
> 
> Doesn't that leak the original contents of the last bytes of
> stats->version instead?

I'm pretty sure we set those to zero in bnx2x_handle_drv_info_req().

regards,
dan carpenter


^ permalink raw reply

* Re: [PATCH 0/5] rtcache remove respin
From: Eric Dumazet @ 2012-07-02 10:44 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20120701.050243.908285695895815999.davem@davemloft.net>

On Sun, 2012-07-01 at 05:02 -0700, David Miller wrote:
> It's been a while and there were of course a lot of merge hassles with
> the most recent set I posted, so I respun these patches tonight
> because I wanted to see the effects of the recent rpfilter hacks on an
> rtcache-less system.
> 
> On a SPARC T3-1:
> 
> 1) Output route lookup: ~2800 cycles
> 2) Input route lookups: ~3000 cycles (rpfilter=0)
>                         ~4300 cycles (rpfilter=1)
> 
> Another nice part is how small struct rtable is after this patch set:
> 
> struct rtable {
> 	struct dst_entry        dst;
> 	int                     rt_genid;
> 	unsigned int            rt_flags;
> 	__u16                   rt_type;
> 	__be32                  rt_dst;
> 	int                     rt_route_iif;
> 	int                     rt_iif;
> 	int                     rt_oif;
> 	__be32                  rt_gateway;
> 	u32                     rt_peer_genid;
> 	unsigned long           _peer;
> 	struct fib_info         *fi;
> };
> 
> which is about 208 bytes on sparc64.
> 
> Signed-off-by: David S. Miller <davem@davemloft.net>

Can be <= 192 actually

rcu_head not needed anymore in dst_entry

If we still want __refcnt being on cache line boundary, we might find a
better way to accomplish this.

^ permalink raw reply

* Re: [RFC PATCH net-next] ipvs: add missing lock in ip_vs_ftp_init_conn()
From: Xiaotian Feng @ 2012-07-02 10:30 UTC (permalink / raw)
  To: Julian Anastasov
  Cc: netdev, lvs-devel, netfilter-devel, netfilter, coreteam,
	linux-kernel, Xiaotian Feng, Wensong Zhang, Simon Horman,
	Pablo Neira Ayuso, Patrick McHardy, David S. Miller
In-Reply-To: <alpine.LFD.2.00.1206291125270.1690@ja.ssi.bg>

On Fri, Jun 29, 2012 at 5:04 PM, Julian Anastasov <ja@ssi.bg> wrote:
>
>         Hello,
>
> On Fri, 29 Jun 2012, Xiaotian Feng wrote:
>
>> > On Thu, 28 Jun 2012, Xiaotian Feng wrote:
>> >
>> >> We met a kernel panic in 2.6.32.43 kernel:
>> >>
>> >> [2680191.848044] IPVS: ip_vs_conn_hash(): request for already hashed, called from run_timer_softirq+0x175/0x1d0
>> >> <snip>
>> >> [2680311.849009] general protection fault: 0000 [#1] SMP
>
>         What we see here is 120 seconds between 2680191 and
> 2680311. It can mean 2 things:
>
> - some state timeout, it depends on your forwarding method.
> What is it? NAT? DR?
>
> - 60 seconds for ip_vs_conn_expire retries
>
>> >> After code review, the only chance that kernel change connection flag without protection is
>> >> in ip_vs_ftp_init_conn().
>> >
>> >        Hm, ip_vs_ftp_init_conn is called before 1st hashing,
>> > from ip_vs_bind_app() in ip_vs_conn_new() before
>> > ip_vs_conn_hash(). It should be another problem with
>> > the flags. How different is IPVS in 2.6.32.43 compared to
>> > recent kernels? If commit aea9d711 is present, I'm not
>> > aware of other similar problems.
>>
>> ip_vs_bind_app() is also called by ip_vs_try_bind_dest(), which can be
>> traced to ip_vs_proc_conn().
>> I've checked the changes in upstream, but nothing helps since aea9d711
>> has been taken into 2.6.32.28 kernel.
>
>         OK, this fix should make it safe for master-backup
> sync and it should be applied but I suspect you are not
> using sync, right? And then this fix will not solve the oops.
>

We're using sync.

>         There are no many places that rehash conn:
>
> ip_vs_conn_fill_cport
>         - used for FTP
>
> ip_vs_check_template:
>         - do you have persistence configured?

No.

>
>         After you provide details for the used forwarding
> method, persistence and sync we should think how such races
> with rehashing can lead to double hlist_del. May be
> you can modify the debug message in ip_vs_conn_hash, so
> that we can see cp->flags and ntohs of cp->cport, cp->dport
> and cp->vport when oops happens again.

I just found 2.6.32.34 kernel differ from upstream kernel,  2.6.32
kernel doesn't have ip_vs_try_bind_dest(), but ip_vs_process_message()
kernel might change conn flags without lock protection. This is fixed in
commit f73181c, following changes:

@@ -834,6 +843,7 @@ static void ip_vs_proc_conn(struct net *net,
struct ip_vs_conn_param *param,
                kfree(param->pe_data);

                dest = cp->dest;
+               spin_lock(&cp->lock);
                if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE &&
                    !(flags & IP_VS_CONN_F_TEMPLATE) && dest) {
                        if (flags & IP_VS_CONN_F_INACTIVE) {
@@ -847,6 +857,7 @@ static void ip_vs_proc_conn(struct net *net,
struct ip_vs_conn_param *param,
                flags &= IP_VS_CONN_F_BACKUP_UPD_MASK;
                flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK;
                cp->flags = flags;
+               spin_unlock(&cp->lock);
                if (!dest) {
                        dest = ip_vs_try_bind_dest(cp);
                        if (dest)

So I took this part into 2.6.32 kernel. But I still think the patch I
posted is required for upstream kernel. Even though there are no many
places that rehash conn, this is potential race as cp->flags is not
protected.

Thanks.

>
> Regards
>
> --
> Julian Anastasov <ja@ssi.bg>

^ permalink raw reply

* RE: [PATCH net-next 09/10] net/mlx4_en: Manage flow steering rules with ethtool
From: David Laight @ 2012-07-02 10:19 UTC (permalink / raw)
  To: Joe Perches, Andreas Schwab
  Cc: Ben Hutchings, Or Gerlitz, davem, roland, yevgenyp, oren, netdev,
	Hadar Hen Zion
In-Reply-To: <1341172352.8675.5.camel@joe2Laptop>

 
> > Or write it as (!field || !(typeof(field))~field) which more closely
> > resembles what the macro name expresses.
> 
> Better still, or maybe:
> 
> 	field == 0 || field == (typeof field)~0

Which doesn't work when sizeof(field) > sizeof(int).
Needs another cast.

	field == 0 || field == (typeof field)~(typeof field)0

	David

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox