Netdev List
 help / color / mirror / Atom feed
* [PATCH 5/7] batman-adv: update README (date & ap isolation sysfs file)
From: Marek Lindner @ 2011-09-08 16:40 UTC (permalink / raw)
  To: davem-fT/PcQaiUtIeIZ0/mPfg9Q
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
	b.a.t.m.a.n-ZwoEplunGu2X36UT3dwllkB+6BGkLq7r, Marek Lindner,
	Simon Wunderlich
In-Reply-To: <1315500051-1122-1-git-send-email-lindner_marek-LWAfsSFWpa4@public.gmane.org>

From: Simon Wunderlich <siwu-MaAgPAbsBIVS8oHt8HbXEIQuADTiUCJX@public.gmane.org>

Signed-off-by: Simon Wunderlich <siwu-MaAgPAbsBIVS8oHt8HbXEIQuADTiUCJX@public.gmane.org>
Signed-off-by: Marek Lindner <lindner_marek-LWAfsSFWpa4@public.gmane.org>
---
 Documentation/networking/batman-adv.txt |    8 ++++----
 1 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Documentation/networking/batman-adv.txt b/Documentation/networking/batman-adv.txt
index 88d4afb..c86d03f 100644
--- a/Documentation/networking/batman-adv.txt
+++ b/Documentation/networking/batman-adv.txt
@@ -1,4 +1,4 @@
-[state: 17-04-2011]
+[state: 21-08-2011]
 
 BATMAN-ADV
 ----------
@@ -68,9 +68,9 @@ All  mesh  wide  settings  can be found in batman's own interface
 folder:
 
 #  ls  /sys/class/net/bat0/mesh/
-#  aggregated_ogms  gw_bandwidth  hop_penalty
-#  bonding          gw_mode       orig_interval
-#  fragmentation    gw_sel_class  vis_mode
+# aggregated_ogms   fragmentation gw_sel_class   vis_mode
+# ap_isolation      gw_bandwidth  hop_penalty
+# bonding           gw_mode       orig_interval
 
 
 There is a special folder for debugging information:
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 6/7] batman-adv: update internal version number
From: Marek Lindner @ 2011-09-08 16:40 UTC (permalink / raw)
  To: davem-fT/PcQaiUtIeIZ0/mPfg9Q
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
	b.a.t.m.a.n-ZwoEplunGu2X36UT3dwllkB+6BGkLq7r, Marek Lindner
In-Reply-To: <1315500051-1122-1-git-send-email-lindner_marek-LWAfsSFWpa4@public.gmane.org>

From: Sven Eckelmann <sven-KaDOiPu9UxWEi8DpZVb4nw@public.gmane.org>

Signed-off-by: Sven Eckelmann <sven-KaDOiPu9UxWEi8DpZVb4nw@public.gmane.org>
Signed-off-by: Marek Lindner <lindner_marek-LWAfsSFWpa4@public.gmane.org>
---
 net/batman-adv/main.h |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 60b3696..964ad4d 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -28,7 +28,7 @@
 #define DRIVER_DEVICE "batman-adv"
 
 #ifndef SOURCE_VERSION
-#define SOURCE_VERSION "2011.3.0"
+#define SOURCE_VERSION "2011.4.0"
 #endif
 
 /* B.A.T.M.A.N. parameters */
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 7/7] batman-adv: Remove unnecessary OOM logging messages
From: Marek Lindner @ 2011-09-08 16:40 UTC (permalink / raw)
  To: davem-fT/PcQaiUtIeIZ0/mPfg9Q
  Cc: Joe Perches, netdev-u79uwXL29TY76Z2rM5mHXA,
	b.a.t.m.a.n-ZwoEplunGu2X36UT3dwllkB+6BGkLq7r, Marek Lindner
In-Reply-To: <1315500051-1122-1-git-send-email-lindner_marek-LWAfsSFWpa4@public.gmane.org>

From: Joe Perches <joe-6d6DIl74uiNBDgjK7y7TUQ@public.gmane.org>

Removing unnecessary messages saves code and text.

Site specific OOM messages are duplications of a generic MM
out of memory message and aren't really useful, so just
delete them.

Signed-off-by: Joe Perches <joe-6d6DIl74uiNBDgjK7y7TUQ@public.gmane.org>
Signed-off-by: Marek Lindner <lindner_marek-LWAfsSFWpa4@public.gmane.org>
---
 net/batman-adv/hard-interface.c |    5 +----
 net/batman-adv/main.c           |    2 --
 net/batman-adv/originator.c     |   16 ++++------------
 net/batman-adv/soft-interface.c |    4 +---
 net/batman-adv/vis.c            |    4 +---
 5 files changed, 7 insertions(+), 24 deletions(-)

diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 0cc0f04..7704df4 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -434,11 +434,8 @@ static struct hard_iface *hardif_add_interface(struct net_device *net_dev)
 	dev_hold(net_dev);
 
 	hard_iface = kmalloc(sizeof(*hard_iface), GFP_ATOMIC);
-	if (!hard_iface) {
-		pr_err("Can't add interface (%s): out of memory\n",
-		       net_dev->name);
+	if (!hard_iface)
 		goto release_dev;
-	}
 
 	ret = sysfs_add_hardif(&hard_iface->hardif_obj, net_dev);
 	if (ret)
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 79b9ae5..fb87bdc 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -117,8 +117,6 @@ int mesh_init(struct net_device *soft_iface)
 	goto end;
 
 err:
-	pr_err("Unable to allocate memory for mesh information structures: "
-	       "out of mem ?\n");
 	mesh_free(soft_iface);
 	return -1;
 
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index cd7d256..0e5b772 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -492,10 +492,8 @@ static int orig_node_add_if(struct orig_node *orig_node, int max_if_num)
 
 	data_ptr = kmalloc(max_if_num * sizeof(unsigned long) * NUM_WORDS,
 			   GFP_ATOMIC);
-	if (!data_ptr) {
-		pr_err("Can't resize orig: out of memory\n");
+	if (!data_ptr)
 		return -1;
-	}
 
 	memcpy(data_ptr, orig_node->bcast_own,
 	       (max_if_num - 1) * sizeof(unsigned long) * NUM_WORDS);
@@ -503,10 +501,8 @@ static int orig_node_add_if(struct orig_node *orig_node, int max_if_num)
 	orig_node->bcast_own = data_ptr;
 
 	data_ptr = kmalloc(max_if_num * sizeof(uint8_t), GFP_ATOMIC);
-	if (!data_ptr) {
-		pr_err("Can't resize orig: out of memory\n");
+	if (!data_ptr)
 		return -1;
-	}
 
 	memcpy(data_ptr, orig_node->bcast_own_sum,
 	       (max_if_num - 1) * sizeof(uint8_t));
@@ -561,10 +557,8 @@ static int orig_node_del_if(struct orig_node *orig_node,
 
 	chunk_size = sizeof(unsigned long) * NUM_WORDS;
 	data_ptr = kmalloc(max_if_num * chunk_size, GFP_ATOMIC);
-	if (!data_ptr) {
-		pr_err("Can't resize orig: out of memory\n");
+	if (!data_ptr)
 		return -1;
-	}
 
 	/* copy first part */
 	memcpy(data_ptr, orig_node->bcast_own, del_if_num * chunk_size);
@@ -582,10 +576,8 @@ free_bcast_own:
 		goto free_own_sum;
 
 	data_ptr = kmalloc(max_if_num * sizeof(uint8_t), GFP_ATOMIC);
-	if (!data_ptr) {
-		pr_err("Can't resize orig: out of memory\n");
+	if (!data_ptr)
 		return -1;
-	}
 
 	memcpy(data_ptr, orig_node->bcast_own_sum,
 	       del_if_num * sizeof(uint8_t));
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 7d8332e..aceeabc 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -801,10 +801,8 @@ struct net_device *softif_create(const char *name)
 
 	soft_iface = alloc_netdev(sizeof(*bat_priv), name, interface_setup);
 
-	if (!soft_iface) {
-		pr_err("Unable to allocate the batman interface: %s\n", name);
+	if (!soft_iface)
 		goto out;
-	}
 
 	ret = register_netdevice(soft_iface);
 	if (ret < 0) {
diff --git a/net/batman-adv/vis.c b/net/batman-adv/vis.c
index fb9b19f..f81a6b6 100644
--- a/net/batman-adv/vis.c
+++ b/net/batman-adv/vis.c
@@ -887,10 +887,8 @@ int vis_init(struct bat_priv *bat_priv)
 	}
 
 	bat_priv->my_vis_info = kmalloc(MAX_VIS_PACKET_SIZE, GFP_ATOMIC);
-	if (!bat_priv->my_vis_info) {
-		pr_err("Can't initialize vis packet\n");
+	if (!bat_priv->my_vis_info)
 		goto err;
-	}
 
 	bat_priv->my_vis_info->skb_packet = dev_alloc_skb(sizeof(*packet) +
 							  MAX_VIS_PACKET_SIZE +
-- 
1.7.5.4

^ permalink raw reply related

* Re: pull request: batman-adv 2011-09-08
From: Marek Lindner @ 2011-09-08 17:31 UTC (permalink / raw)
  To: b.a.t.m.a.n-ZwoEplunGu2X36UT3dwllkB+6BGkLq7r
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, davem-fT/PcQaiUtIeIZ0/mPfg9Q
In-Reply-To: <1315500051-1122-1-git-send-email-lindner_marek-LWAfsSFWpa4@public.gmane.org>

On Thursday, September 08, 2011 18:40:44 Marek Lindner wrote:
> here comes the next batch I'd like to get pulled into
> net-next-2.6/3.2. These patches mainly focus on restructuring the
> routing code in order to allow the kernel module the handling
> of the current routing algorithm and the upcoming one. 
> 
> To ensure a smooth transition and efficient testing later, we 
> decided to offer a choice of routing algorithm but have not     
> come to a final decision on how to design this choice yet. It
> either will be a compile time option, a module parameter or a
> runtime switch. Any suggestions / best practice tips ?
> 
> Note: The new algorithm has been designed with backward 
> compatibility in mind (using TLVs). We are trying to address
> the incompatibility issue you and others have pointed out.

Forgot to mention: We are well aware of the difficulties after 
the kernel.org intrusion. If you can't pull our patches right
now, feel free to do it whenever kernel.org is back online.

Regards,
Marek

^ permalink raw reply

* Re: [net-next-2.6 PATCH 0/3 RFC] macvlan: MAC Address filtering support for passthru mode
From: Sridhar Samudrala @ 2011-09-08 17:42 UTC (permalink / raw)
  To: Roopa Prabhu
  Cc: Michael S. Tsirkin, netdev, dragos.tatulea, arnd, dwang2, benve,
	kaber, davem, eric.dumazet, mchan, kvm
In-Reply-To: <CA8E3924.33B60%roprabhu@cisco.com>

On Thu, 2011-09-08 at 09:19 -0700, Roopa Prabhu wrote:
> 
> 
> On 9/8/11 4:08 AM, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Wed, Sep 07, 2011 at 10:20:28PM -0700, Roopa Prabhu wrote:
> >> On 9/7/11 5:34 AM, "Michael S. Tsirkin" <mst@redhat.com> wrote:
> >> 
> >>> On Tue, Sep 06, 2011 at 03:35:40PM -0700, Roopa Prabhu wrote:
> >>>> This patch is an attempt at providing address filtering support for macvtap
> >>>> devices in PASSTHRU mode. Its still a work in progress.
> >>>> Briefly tested for basic functionality. Wanted to get some feedback on the
> >>>> direction before proceeding.
> >>>> 
> >>> 
> >>> Good work, thanks.
> >>> 
> >> 
> >> Thanks.
> >> 
> >>>> I have hopefully CC'ed all concerned people.
> >>> 
> >>> kvm crowd might also be interested.
> >>> Try using ./scripts/get_maintainer.pl as well.
> >>> 
> >> Thanks for the tip. Expanded CC list a bit more.
> >> 
> >>>> PASSTHRU mode today sets the lowerdev in promiscuous mode. In PASSTHRU mode
> >>>> there is a 1-1 mapping between macvtap device and physical nic or VF. And
> >>>> all
> >>>> filtering is done in lowerdev hw. The lowerdev does not need to be in
> >>>> promiscuous mode as long as the guest filters are passed down to the
> >>>> lowerdev.
> >>>> This patch tries to remove the need for putting the lowerdev in promiscuous
> >>>> mode. 
> >>>> I have also referred to the thread below where TUNSETTXFILTER was mentioned
> >>>> in 
> >>>> this context: 
> >>>>  http://patchwork.ozlabs.org/patch/69297/
> >>>> 
> >>>> This patch basically passes the addresses got by TUNSETTXFILTER to macvlan
> >>>> lowerdev.
> >>>> 
> >>>> I have looked at previous work and discussions on this for qemu-kvm
> >>>> by Michael Tsirkin, Alex Williamson and Dragos Tatulea
> >>>> http://patchwork.ozlabs.org/patch/78595/
> >>>> http://patchwork.ozlabs.org/patch/47160/
> >>>> https://patchwork.kernel.org/patch/474481/
> >>>> 
> >>>> Redhat bugzilla by Michael Tsirkin:
> >>>> https://bugzilla.redhat.com/show_bug.cgi?id=655013
> >>>> 
> >>>> I used Michael's qemu-kvm patch for testing the changes with KVM
> >>>> 
> >>>> I would like to cover both MAC and vlan filtering in this work.
> >>>> 
> >>>> Open Questions/Issues:
> >>>> - There is a need for vlan filtering to complete the patch. It will require
> >>>>   a new tap ioctl cmd for vlans.
> >>>>   Some ideas on this are:
> >>>> 
> >>>>   a) TUNSETVLANFILTER: This will entail we send the whole vlan bitmap
> >>>> filter
> >>>> (similar to tun_filter for addresses). Passing the vlan id's to lower
> >>>> device will mean going thru the whole list of vlans every time.
> >>>> 
> >>>>   OR
> >>>> 
> >>>>   b) TUNSETVLAN with vlan id and flag to set/unset
> >>>> 
> >>>>   Does option 'b' sound ok ?
> >>>> 
> >>>> - In this implementation we make the macvlan address list same as the
> >>>> address
> >>>>   list that came in the filter with TUNSETTXFILTER. This will not cover
> >>>> cases
> >>>>   where the macvlan device needs to have other addresses that are not
> >>>>   necessarily in the filter. Is this a problem ?
> >>> 
> >>> What cases do you have in mind?
> >>> 
> >> This patch targets only macvlan PASSTHRU mode and for PASSTHRU mode I don't
> >> see a problem with uc/mc address list being the same in all the stacked
> >> netdevs in the path. I called that out above to make sure I was not missing
> >> any case in PASSTHRU mode where this might be invalid. Otherwise I don't see
> >> a problem in the simple PASSTHRU use case this patch supports.
> >> 
> >>>> - The patch currently only supports passing of IFF_PROMISC and
> >>>> IFF_MULTICAST
> >>>> filter flags to lowerdev
> >>>> 
> >>>> This patch series implements the following
> >>>> 01/3 - macvlan: Add support for unicast filtering in macvlan
> >>>> 02/3 - macvlan: Add function to set addr filter on lower device in passthru
> >>>> mode
> >>>> 03/3 - macvtap: Add support for TUNSETTXFILTER
> >>>> 
> >>>> Please comment. Thanks.
> >>>> 
> >>>> Signed-off-by: Roopa Prabhu <roprabhu@cisco.com>
> >>>> Signed-off-by: Christian Benvenuti <benve@cisco.com>
> >>>> Signed-off-by: David Wang <dwang2@cisco.com>
> >>> 
> >>> The security isn't lower than with promisc, so I don't see
> >>> a problem with this as such.
> >>> 
> >>> There are more features we'll want down the road though,
> >>> so let's see whether the interface will be able to
> >>> satisfy them in a backwards compatible way before we
> >>> set it in stone. Here's what I came up with:
> >>> 
> >>> How will the filtering table be partitioned within guests?
> >> 
> >> Since this patch supports macvlan PASSTHRU mode only, in which the lower
> >> device has 1-1 mapping to the guest nic, it does not require any
> >> partitioning of filtering table within guests. Unless I missed understanding
> >> something. 
> >> If the lower device were being shared by multiple guest network interfaces
> >> (non PASSTHRU mode), only then we will need to maintain separate filter
> >> tables for each guest network interface in macvlan and forward the pkt to
> >> respective guest interface after a filter lookup. This could affect
> >> performance too I think.
> > 
> > Not with hardware filtering support. Which is where we'd need to
> > partition the host nic mac table between guests.
> > 
> I need to understand this more. In non passthru case when a VF or physical
> nic is shared between guests, the nic does not really know about the guests,
> so I was thinking we do the same thing as we do for the passthru case (ie
> send all the address filters from macvlan to the physical nic). So at the
> hardware, filtering is done for all guests sharing the nic. But if we want
> each virtio-net nic or guest to get exactly what it asked for
> macvlan/macvtap needs to maintain a copy of each guest filter and do a
> lookup and send only the requested traffic to the guest. Here is the
> performance hit that I was seeing. Please see my next comment for further
> details. 
> 
> 
> >> I chose to support PASSTHRU Mode only at first because its simpler and all
> >> code additions are in control path only.
> > 
> > I agree. It would be a bit silly to have a dedicated interface
> > for passthrough and a completely separate one for
> > non passthrough.
> >
> Agree. The reason I did not focus on non-passthru case in the initial
> version was because I was thinking things to do in the non-passthru case
> will be just add-ons to the passthru case. But true, better to flesh out the
> non-passthru case details.
> 
> After dwelling on this a bit more how about the below:
> 
> Phase 1: Goal: Enable hardware filtering for all macvlan modes
>     - In macvlan passthru mode the single guest virtio-nic connected will
> receive traffic that he requested for
Currently the guest receives all the packets seen on the interface as it
is put in promiscuous mode. With your patch it only sees the packets
that he requested for. Have you tried creating a macvlan interface on
top of the guest virtio-net interface? Is the new mac address propagated
all the way to the host nic?

I think the main usecase for passthru mode is to assign a SR-IOV VF to
a single guest.

>     - In macvlan non-passthru mode all guest virtio-nics sharing the
>       physical nic will see all other guest traffic
>       but the filtering at guest virtio-nic will make sure each guest
>       eventually sees traffic he asked for. This is still better than
> putting the physical nic in promiscuous mode.

With the default macvlan mode (vepa), I think each guest will only see
its own traffic. But currently adding a secondary mac address on a guest
will not work as it is not propagated all the way down to the host.


> (This is mainly what my patch does...but will need to remove the passthru
> check and see if there are any thing else needed for non-passthru case)
> 
> 
> Phase 2: Goal: Enable filtering at macvlan so that each guest virtio-nic
> receives only what he requested for.
>     - In this case, in addition to pushing the filters down to the physical
> nic we will have to maintain the same filter in macvlan and do a filter
> lookup before forwarding the traffic to a virtio-nic.
> 
> But I am thinking phase 2 might be redundant given virtio-nic already does
> filtering for the guest. In which case we might not need phase 2 at all. I
> might have been over complicating things.

I think filtering at macvlan will be more efficient than replicating all
the packets to all the guest virtio-nics.

Thanks
Sridhar


^ permalink raw reply

* Re: [PATCH v2 9/9] Add documentation about kmem_cgroup
From: Randy Dunlap @ 2011-09-08 17:46 UTC (permalink / raw)
  To: Glauber Costa
  Cc: linux-kernel, linux-mm, containers, netdev, xemul,
	David S. Miller, Hiroyouki Kamezawa, Eric W. Biederman
In-Reply-To: <1315369399-3073-10-git-send-email-glommer@parallels.com>

On 09/06/11 21:23, Glauber Costa wrote:
> Signed-off-by: Glauber Costa <glommer@parallels.com>
> CC: David S. Miller <davem@davemloft.net>
> CC: Hiroyouki Kamezawa <kamezawa.hiroyu@jp.fujitsu.com>
> CC: Eric W. Biederman <ebiederm@xmission.com>
> CC: Randy Dunlap <rdunlap@xenotime.net>
> ---
>  Documentation/cgroups/kmem_cgroups.txt |   27 +++++++++++++++++++++++++++
>  1 files changed, 27 insertions(+), 0 deletions(-)
>  create mode 100644 Documentation/cgroups/kmem_cgroups.txt
> 
> diff --git a/Documentation/cgroups/kmem_cgroups.txt b/Documentation/cgroups/kmem_cgroups.txt
> new file mode 100644
> index 0000000..930e069
> --- /dev/null
> +++ b/Documentation/cgroups/kmem_cgroups.txt
> @@ -0,0 +1,27 @@
> +Kernel Memory Cgroup
> +====================
> +
> +This document briefly describes the kernel memory cgroup, or "kmem cgroup".
> +Unlike user memory, kernel memory cannot be swapped. This effectively means
> +that rogue processes can start operations that pin kernel objects permanently
> +into memory, exhausting resources of all other processes in the system.
> +
> +kmem_cgroup main goal is to control the amount of memory a group of processes

   kmem_cgroup's main goal

> +can pin at any given point in time. Other uses of this infrastructure are
> +expected to come up with time. Right now, the only resource effectively limited

                                                      resources

> +are tcp send and receive buffers.

or:
                                             the only resource effectively limited
  is TCP network buffers.

> +
> +TCP network buffers
> +===================
> +
> +TCP network buffers, both on the send and receive sides, can be controlled
> +by the kmem cgroup. Once a socket is created, it is attached to the cgroup of
> +the controller process, where it stays until the end of its lifetime.
> +
> +Files
> +=====
> +	kmem.tcp_maxmem: control the maximum amount in bytes that can be used by

	                 controls the maximum amount of memory in bytes ...


> +	tcp sockets inside the cgroup. 
> +
> +	kmem.tcp_current_memory: current amount in bytes used by all sockets in

	                         current amount of memory in bytes ...

> +	this cgroup


-- 
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH 08/11] netlink: implement memory mapped sendmsg()
From: Michał Mirosław @ 2011-09-08 18:08 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: davem, netfilter-devel, netdev
In-Reply-To: <4E688BD5.5030909@trash.net>

On Thu, Sep 08, 2011 at 11:33:09AM +0200, Patrick McHardy wrote:
> Am 07.09.2011 22:03, schrieb Michał Mirosław:
> > On Wed, Sep 07, 2011 at 05:22:00PM +0200, Patrick McHardy wrote:
> >> On 04.09.2011 18:18, Michał Mirosław wrote:
> >>> On Sat, Sep 03, 2011 at 07:26:08PM +0200, kaber@trash.net wrote:
> >>>> From: Patrick McHardy <kaber@trash.net>
> >>>>
> >>>> Add support for memory mapped sendmsg() to netlink. Userspace queues
> >>>> to-be-processed frames into the TX ring and invokes sendmsg with
> >>>> msg.iov.iov_base = NULL to trigger processing of all pending messages.
> >>>>
> >>>> Since the kernel usually performs full message validation before beginning
> >>>> processing, userspace must be prevented from modifying the message
> >>>> contents while the kernel is processing them. In order to do so, the
> >>>> frames contents are copied to an allocated skb in case the ring is
> >>>> mapped more than once or the file descriptor is shared (f.i. through
> >>>> AF_UNIX file descriptor passing).
> >>>>
> >>>> Otherwise an skb without a data area is allocated, the data pointer set
> >>>> to point to the data area of the ring frame and the skb is processed.
> >>>> Once the skb is freed, the destructor releases the frame back to userspace
> >>>> by setting the status to NL_MMAP_STATUS_UNUSED.
> >>>
> >>> Is this protected from threads? Like: one thread waits on sendmsg() and
> >>> another (same process) changes the buffer.
> >> Yes, if the ring is mapped multiple times (or the file descriptor
> >> is changed), the contents are copied to an allocated skb.
> > 
> > I mean:
> > 
> > [1] mmap()
> > [1] fill buffers
> > [1] pthread_create() [creates: 2]
> > [1] sendmsg() starts
> > [2] modify buffers
> > [1] sendmsg() returns
> > 
> > So: no multiple mmaps, and no touching of the fd. I haven't dug into
> > filesystem layer to see if threads affect file->f_count, but there
> > sure are no multiple mappings here.
> If CLONE_VM is given to clone(), the mapping is visible in both
> threads and thus we have multiple mappings (vma_ops->open() is
> invoked through clone()). Without CLONE_VM, the second thread
> can't access the ring unless it mmap()s it itself, in which case we'd
> also have multiple mappings.

I made a quick look into kernel/fork.c, and it looks to me that if CLONE_VM
is set, then vma->open() is actually avoided --- it's called via dup_mm()
and dup_mmap() only if CLONE_VM is not there and the VMA needs to be copied.

Best Regards,
Michał Mirosław
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v3] net/smsc911x: add device tree probe support
From: Grant Likely @ 2011-09-08 18:29 UTC (permalink / raw)
  To: Dave Martin
  Cc: patches-QSEj5FYQhm4dnm+yROfE0A, netdev-u79uwXL29TY76Z2rM5mHXA,
	devicetree-discuss-uLR06cmDAlY/bJ5BZ2RsiQ, Steve Glendinning,
	David S. Miller,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r
In-Reply-To: <20110908145946.GE2070-5wv7dgnIgG8@public.gmane.org>

On Thu, Sep 08, 2011 at 03:59:46PM +0100, Dave Martin wrote:
> Hi Shawn,
> 
> On Sun, Jul 31, 2011 at 02:26:00AM +0800, Shawn Guo wrote:
> > It adds device tree probe support for smsc911x driver.
> > 
> > Signed-off-by: Shawn Guo <shawn.guo-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>
> > Cc: Grant Likely <grant.likely-s3s/WqlpOiPyB63q8FvJNQ@public.gmane.org>
> > Cc: Steve Glendinning <steve.glendinning-sdUf+H5yV5I@public.gmane.org>
> > Cc: David S. Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
> > Reviewed-by: Grant Likely <grant.likely-s3s/WqlpOiPyB63q8FvJNQ@public.gmane.org>
> > ---
> > Changes since v2:
> >  * Fix a typo in smsc911x.txt
> > 
> > Changes since v1:
> >  * Instead of getting irq line from gpio number, it uses irq domain
> >    to keep platform_get_resource(IORESOURCE_IRQ) working for dt too.
> >  * Use 'lan9115' the first model that smsc911x supports in the match
> >    table
> >  * Use reg-shift and reg-io-width which are already used in of_serial for
> >    shift and access size binding
> 
> When using this patch with vexpress, I found that 16-bit register access
> mode doesn't seem to be getting set correctly.
> 
> Can you take a look at this additional patch and let me know if it looks
> correct?
> 
> Cheers
> ---Dave
> 
> From: Dave Martin <dave.martin-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>
> Date: Wed, 7 Sep 2011 17:26:31 +0100
> Subject: [PATCH] net/smsc911x: Correctly configure 16-bit register access from DT
> 
> The SMSC911X_USE_16BIT needs to be set when using 16-bit register
> access.  However, currently no flag is set if the DT doesn't specify
> 32-bit access.
> 
> This patch should set the SMSC911X_USE_16BIT flag in a manner consistent
> with the documented DT bindings.
> 
> Signed-off-by: Dave Martin <dave.martin-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>
> ---
>  drivers/net/smsc911x.c |    2 ++
>  1 files changed, 2 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/net/smsc911x.c b/drivers/net/smsc911x.c
> index 75c08a5..1a35c25 100644
> --- a/drivers/net/smsc911x.c
> +++ b/drivers/net/smsc911x.c
> @@ -2121,6 +2121,8 @@ static int __devinit smsc911x_probe_config_dt(
>  	of_property_read_u32(np, "reg-io-width", &width);
>  	if (width == 4)
>  		config->flags |= SMSC911X_USE_32BIT;
> +	else
> +		config->flags |= SMSC911X_USE_16BIT;

Would it be better to do "else if (width == 2)"?  (completely
uninformed comment.  I've not looked at what the non-DT probe path
does on this driver.)

g.

^ permalink raw reply

* Re: [net-next-2.6 PATCH 0/3 RFC] macvlan: MAC Address filtering support for passthru mode
From: Roopa Prabhu @ 2011-09-08 19:23 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: Michael S. Tsirkin, netdev, dragos.tatulea, arnd, dwang2, benve,
	kaber, davem, eric.dumazet, mchan, kvm
In-Reply-To: <1315503765.11074.33.camel@w-sridhar.beaverton.ibm.com>




On 9/8/11 10:42 AM, "Sridhar Samudrala" <sri@us.ibm.com> wrote:

> On Thu, 2011-09-08 at 09:19 -0700, Roopa Prabhu wrote:
>> 
>> 
>> On 9/8/11 4:08 AM, "Michael S. Tsirkin" <mst@redhat.com> wrote:
>> 
>>> On Wed, Sep 07, 2011 at 10:20:28PM -0700, Roopa Prabhu wrote:
>>>> On 9/7/11 5:34 AM, "Michael S. Tsirkin" <mst@redhat.com> wrote:
>>>> 
>>>>> On Tue, Sep 06, 2011 at 03:35:40PM -0700, Roopa Prabhu wrote:
>>>>>> This patch is an attempt at providing address filtering support for
>>>>>> macvtap
>>>>>> devices in PASSTHRU mode. Its still a work in progress.
>>>>>> Briefly tested for basic functionality. Wanted to get some feedback on
>>>>>> the
>>>>>> direction before proceeding.
>>>>>> 
>>>>> 
>>>>> Good work, thanks.
>>>>> 
>>>> 
>>>> Thanks.
>>>> 
>>>>>> I have hopefully CC'ed all concerned people.
>>>>> 
>>>>> kvm crowd might also be interested.
>>>>> Try using ./scripts/get_maintainer.pl as well.
>>>>> 
>>>> Thanks for the tip. Expanded CC list a bit more.
>>>> 
>>>>>> PASSTHRU mode today sets the lowerdev in promiscuous mode. In PASSTHRU
>>>>>> mode
>>>>>> there is a 1-1 mapping between macvtap device and physical nic or VF. And
>>>>>> all
>>>>>> filtering is done in lowerdev hw. The lowerdev does not need to be in
>>>>>> promiscuous mode as long as the guest filters are passed down to the
>>>>>> lowerdev.
>>>>>> This patch tries to remove the need for putting the lowerdev in
>>>>>> promiscuous
>>>>>> mode. 
>>>>>> I have also referred to the thread below where TUNSETTXFILTER was
>>>>>> mentioned
>>>>>> in 
>>>>>> this context:
>>>>>>  http://patchwork.ozlabs.org/patch/69297/
>>>>>> 
>>>>>> This patch basically passes the addresses got by TUNSETTXFILTER to
>>>>>> macvlan
>>>>>> lowerdev.
>>>>>> 
>>>>>> I have looked at previous work and discussions on this for qemu-kvm
>>>>>> by Michael Tsirkin, Alex Williamson and Dragos Tatulea
>>>>>> http://patchwork.ozlabs.org/patch/78595/
>>>>>> http://patchwork.ozlabs.org/patch/47160/
>>>>>> https://patchwork.kernel.org/patch/474481/
>>>>>> 
>>>>>> Redhat bugzilla by Michael Tsirkin:
>>>>>> https://bugzilla.redhat.com/show_bug.cgi?id=655013
>>>>>> 
>>>>>> I used Michael's qemu-kvm patch for testing the changes with KVM
>>>>>> 
>>>>>> I would like to cover both MAC and vlan filtering in this work.
>>>>>> 
>>>>>> Open Questions/Issues:
>>>>>> - There is a need for vlan filtering to complete the patch. It will
>>>>>> require
>>>>>>   a new tap ioctl cmd for vlans.
>>>>>>   Some ideas on this are:
>>>>>> 
>>>>>>   a) TUNSETVLANFILTER: This will entail we send the whole vlan bitmap
>>>>>> filter
>>>>>> (similar to tun_filter for addresses). Passing the vlan id's to lower
>>>>>> device will mean going thru the whole list of vlans every time.
>>>>>> 
>>>>>>   OR
>>>>>> 
>>>>>>   b) TUNSETVLAN with vlan id and flag to set/unset
>>>>>> 
>>>>>>   Does option 'b' sound ok ?
>>>>>> 
>>>>>> - In this implementation we make the macvlan address list same as the
>>>>>> address
>>>>>>   list that came in the filter with TUNSETTXFILTER. This will not cover
>>>>>> cases
>>>>>>   where the macvlan device needs to have other addresses that are not
>>>>>>   necessarily in the filter. Is this a problem ?
>>>>> 
>>>>> What cases do you have in mind?
>>>>> 
>>>> This patch targets only macvlan PASSTHRU mode and for PASSTHRU mode I don't
>>>> see a problem with uc/mc address list being the same in all the stacked
>>>> netdevs in the path. I called that out above to make sure I was not missing
>>>> any case in PASSTHRU mode where this might be invalid. Otherwise I don't
>>>> see
>>>> a problem in the simple PASSTHRU use case this patch supports.
>>>> 
>>>>>> - The patch currently only supports passing of IFF_PROMISC and
>>>>>> IFF_MULTICAST
>>>>>> filter flags to lowerdev
>>>>>> 
>>>>>> This patch series implements the following
>>>>>> 01/3 - macvlan: Add support for unicast filtering in macvlan
>>>>>> 02/3 - macvlan: Add function to set addr filter on lower device in
>>>>>> passthru
>>>>>> mode
>>>>>> 03/3 - macvtap: Add support for TUNSETTXFILTER
>>>>>> 
>>>>>> Please comment. Thanks.
>>>>>> 
>>>>>> Signed-off-by: Roopa Prabhu <roprabhu@cisco.com>
>>>>>> Signed-off-by: Christian Benvenuti <benve@cisco.com>
>>>>>> Signed-off-by: David Wang <dwang2@cisco.com>
>>>>> 
>>>>> The security isn't lower than with promisc, so I don't see
>>>>> a problem with this as such.
>>>>> 
>>>>> There are more features we'll want down the road though,
>>>>> so let's see whether the interface will be able to
>>>>> satisfy them in a backwards compatible way before we
>>>>> set it in stone. Here's what I came up with:
>>>>> 
>>>>> How will the filtering table be partitioned within guests?
>>>> 
>>>> Since this patch supports macvlan PASSTHRU mode only, in which the lower
>>>> device has 1-1 mapping to the guest nic, it does not require any
>>>> partitioning of filtering table within guests. Unless I missed
>>>> understanding
>>>> something. 
>>>> If the lower device were being shared by multiple guest network interfaces
>>>> (non PASSTHRU mode), only then we will need to maintain separate filter
>>>> tables for each guest network interface in macvlan and forward the pkt to
>>>> respective guest interface after a filter lookup. This could affect
>>>> performance too I think.
>>> 
>>> Not with hardware filtering support. Which is where we'd need to
>>> partition the host nic mac table between guests.
>>> 
>> I need to understand this more. In non passthru case when a VF or physical
>> nic is shared between guests, the nic does not really know about the guests,
>> so I was thinking we do the same thing as we do for the passthru case (ie
>> send all the address filters from macvlan to the physical nic). So at the
>> hardware, filtering is done for all guests sharing the nic. But if we want
>> each virtio-net nic or guest to get exactly what it asked for
>> macvlan/macvtap needs to maintain a copy of each guest filter and do a
>> lookup and send only the requested traffic to the guest. Here is the
>> performance hit that I was seeing. Please see my next comment for further
>> details. 
>> 
>> 
>>>> I chose to support PASSTHRU Mode only at first because its simpler and all
>>>> code additions are in control path only.
>>> 
>>> I agree. It would be a bit silly to have a dedicated interface
>>> for passthough and a completely separate one for
>>> non passthrough.
>>> 
>> Agree. The reason I did not focus on non-passthru case in the initial
>> version was because I was thinking things to do in the non-passthru case
>> will be just add-ons to the passthru case. But true Better to flush out the
>> non-pasthru case details.
>> 
>> After dwelling on this a bit more how about the below:
>> 
>> Phase 1: Goal: Enable hardware filtering for all macvlan modes
>>     - In macvlan passthru mode the single guest virtio-nic connected will
>> receive traffic that he requested for
> Currently the guest receives all the packets seen on the interface as it
> is put in promiscuous mode. With your patch it only sees the packets
> that he requested for. Have you tried creating a macvlan interface on
> top of the guest virtio-net interface? Is the new mac address propagated
> all the way to the host nic?

Yes I have tried this and it works. The mac address gets propagated to the
physical nic.

> 
> I think the main usecase for passthru mode is to assign a SR-IOV VF to
> a single guest.
> 
Yes and for the passthru usecase this patch should be enough to enable
filtering in hw (eventually like I indicated before I need to fix vlan
filtering too).

>>     - In macvlan non-passthru mode all guest virtio-nics sharing the
>>       physical nic will see all other guest traffic
>>       but the filtering at guest virtio-nic will make sure each guest
>>       eventually sees traffic he asked for. This is still better than
>> putting the physical nic in promiscuous mode.
> 
> With the default macvlan mode (vepa), i think each guest will only see
> its own traffic. But currently adding a secondary mac address on a guest
> will not work as it is not propagated all the way down to the host.
> 
> 
>> (This is mainly what my patch does...but will need to remove the passthru
>> check and see if there are any thing else needed for non-passthru case)
>> 
>> 
>> Phase 2: Goal: Enable filtering at macvlan so that each guest virtio-nic
>> receives only what he requested for.
>>     - In this case, in addition to pushing the filters down to the physical
>> nic we will have to maintain the same filter in macvlan and do a filter
>> lookup before forwarding the traffic to a virtio-nic.
>> 
>> But I am thinking phase 2 might be redundant given virtio-nic already does
>> filtering for the guest. In which case we might not need phase 2 at all. I
>> might have been over complicating things.
> 
> I think filtering at macvlan will be more efficient than replicating all
> the packets to all the guest virtio-nics.

True. This usecase is for non-passthru and I think it will require some
performance testing for all modes too. Could be a phase II patch with the
current patch only enabling filtering in hw.

Thanks for the comments,
Roopa



^ permalink raw reply

* Re: [net-next-2.6 PATCH 0/3 RFC] macvlan: MAC Address filtering support for passthru mode
From: Michael S. Tsirkin @ 2011-09-08 19:33 UTC (permalink / raw)
  To: Roopa Prabhu
  Cc: Sridhar Samudrala, netdev, dragos.tatulea, arnd, dwang2, benve,
	kaber, davem, eric.dumazet, mchan, kvm
In-Reply-To: <CA8E645C.33B90%roprabhu@cisco.com>

On Thu, Sep 08, 2011 at 12:23:56PM -0700, Roopa Prabhu wrote:
> > 
> > I think the main usecase for passthru mode is to assign a SR-IOV VF to
> > a single guest.
> > 
> Yes and for the passthru usecase this patch should be enough to enable
> filtering in hw (eventually like I indicated before I need to fix vlan
> filtering too).

So with filtering in hw, and in sriov VF case, VFs
actually share a filtering table. How will that
be partitioned?

-- 
MST

^ permalink raw reply

* [PATCH net-next-2.6] can/sja1000: driver for PEAK PCAN PCI/PCIe cards
From: Wolfgang Grandegger @ 2011-09-08 20:07 UTC (permalink / raw)
  To: Netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: SocketCAN Core Mailing List, linux-g4cQ8AsIbFbL9ATBNaCtXw,
	Thomas Wiedemann

This patch adds the "peak_pci" driver for the PCAN PCI/PCIe cards (1, 2, 3
or 4 channels) from PEAK Systems (http://www.peak-system.com).

Signed-off-by: Wolfgang Grandegger <wg-5Yr1BZd7O62+XT7JhA+gdA@public.gmane.org>
---

I have tested this patch on my 2 channel PEAK PCI card. Thomas, or somebody
else with a PEAK PCAN PCI/PCIe card at hand, it would be nice if you could
test it on your 4 channel card and add your "Tested-by".

Thanks,

Wolfgang. 

 drivers/net/can/sja1000/Kconfig    |    7 +
 drivers/net/can/sja1000/Makefile   |    1 +
 drivers/net/can/sja1000/peak_pci.c |  293 ++++++++++++++++++++++++++++++++++++
 3 files changed, 301 insertions(+), 0 deletions(-)
 create mode 100644 drivers/net/can/sja1000/peak_pci.c

diff --git a/drivers/net/can/sja1000/Kconfig b/drivers/net/can/sja1000/Kconfig
index 6fdc031..72b637d 100644
--- a/drivers/net/can/sja1000/Kconfig
+++ b/drivers/net/can/sja1000/Kconfig
@@ -37,6 +37,13 @@ config CAN_EMS_PCI
 	  CPC-PCIe and CPC-104P cards from EMS Dr. Thomas Wuensche
 	  (http://www.ems-wuensche.de).
 
+config CAN_PEAK_PCI
+	tristate "PEAK PCAN PCI/PCIe Cards"
+	depends on PCI
+	---help---
+	  This driver is for the PCAN PCI/PCIe cards (1, 2, 3 or 4 channels)
+	  from PEAK Systems (http://www.peak-system.com).
+
 config CAN_KVASER_PCI
 	tristate "Kvaser PCIcanx and Kvaser PCIcan PCI Cards"
 	depends on PCI
diff --git a/drivers/net/can/sja1000/Makefile b/drivers/net/can/sja1000/Makefile
index 2c591eb..428f5cf 100644
--- a/drivers/net/can/sja1000/Makefile
+++ b/drivers/net/can/sja1000/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_CAN_SJA1000_PLATFORM) += sja1000_platform.o
 obj-$(CONFIG_CAN_SJA1000_OF_PLATFORM) += sja1000_of_platform.o
 obj-$(CONFIG_CAN_EMS_PCI) += ems_pci.o
 obj-$(CONFIG_CAN_KVASER_PCI) += kvaser_pci.o
+obj-$(CONFIG_CAN_PEAK_PCI) += peak_pci.o
 obj-$(CONFIG_CAN_PLX_PCI) += plx_pci.o
 obj-$(CONFIG_CAN_TSCAN1) += tscan1.o
 
diff --git a/drivers/net/can/sja1000/peak_pci.c b/drivers/net/can/sja1000/peak_pci.c
new file mode 100644
index 0000000..23d865d
--- /dev/null
+++ b/drivers/net/can/sja1000/peak_pci.c
@@ -0,0 +1,293 @@
+/*
+ * Copyright (C) 2007, 2011 Wolfgang Grandegger <wg-5Yr1BZd7O62+XT7JhA+gdA@public.gmane.org>
+ *
+ * Derived from the PCAN project file driver/src/pcan_pci.c:
+ *
+ * Copyright (C) 2001-2006  PEAK System-Technik GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the version 2 of the GNU General Public License
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/netdevice.h>
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/can.h>
+#include <linux/can/dev.h>
+
+#include "sja1000.h"
+
+MODULE_AUTHOR("Wolfgang Grandegger <wg-5Yr1BZd7O62+XT7JhA+gdA@public.gmane.org>");
+MODULE_DESCRIPTION("Socket-CAN driver for PEAK PCAN PCI/PCIe cards");
+MODULE_SUPPORTED_DEVICE("PEAK PCAN PCI/PCIe CAN card");
+MODULE_LICENSE("GPL v2");
+
+#define DRV_NAME  "peak_pci"
+
+struct peak_pci_chan {
+	void __iomem *cfg_base;	     /* PITA config mapping, same for all channels */
+	struct net_device *next_dev; /* Next device in the chain, NULL at the end */
+	u16 icr_mask;		     /* Channel's bit in PITA_ICR, for fast ack */
+};
+
+#define PEAK_PCI_CAN_CLOCK	(16000000 / 2)	/* Stored in can.clock.freq */
+
+#define PEAK_PCI_CDR		(CDR_CBP | CDR_CLKOUT_MASK)
+#define PEAK_PCI_OCR		OCR_TX0_PUSHPULL
+
+/*
+ * PITA register offsets, relative to the mapped config BAR (cfg_base)
+ */
+#define PITA_ICR		0x00	/* Interrupt control register */
+#define PITA_GPIOICR		0x18	/* GPIO interface control register */
+#define PITA_MISC		0x1C	/* Miscellaneous register */
+
+#define PEAK_PCI_CFG_SIZE	0x1000	/* Mapped size of BAR 0 (config) */
+#define PEAK_PCI_CHAN_SIZE	0x0400	/* Register window size per channel */
+
+#define PEAK_PCI_VENDOR_ID	0x001C	/* The PCI device and vendor IDs */
+#define PEAK_PCI_DEVICE_ID	0x0001	/* for PCI / PCIe slot cards */
+#define PEAK_PCIE_DEVICE_ID	0x0002	/* for PCIExpress cards */
+
+static const u16 peak_pci_icr_masks[] = {0x02, 0x01, 0x40, 0x80}; /* by channel */
+
+static DEFINE_PCI_DEVICE_TABLE(peak_pci_tbl) = { /* Cards handled here */
+	{PEAK_PCI_VENDOR_ID, PEAK_PCI_DEVICE_ID, PCI_ANY_ID, PCI_ANY_ID,},
+	{PEAK_PCI_VENDOR_ID, PEAK_PCIE_DEVICE_ID, PCI_ANY_ID, PCI_ANY_ID,},
+	{0,} /* Terminating entry */
+};
+
+MODULE_DEVICE_TABLE(pci, peak_pci_tbl);
+
+/*
+ * Read SJA1000 register @port of the channel described by @priv.
+ * Consecutive registers are four bytes apart in the channel's window,
+ * hence the shift by two.
+ */
+static u8 peak_pci_read_reg(const struct sja1000_priv *priv, int port)
+{
+	void __iomem *reg = priv->reg_base + (port << 2);
+
+	return readb(reg);
+}
+
+/*
+ * Write @val to SJA1000 register @port of the channel described by @priv.
+ * Uses the same four-byte register spacing as peak_pci_read_reg().
+ */
+static void peak_pci_write_reg(const struct sja1000_priv *priv,
+			       int port, u8 val)
+{
+	void __iomem *reg = priv->reg_base + (port << 2);
+
+	writeb(val, reg);
+}
+
+/*
+ * post_irq hook: if this channel's bit is set in PITA_ICR, write the
+ * channel mask back to that register to clear the stored interrupt.
+ */
+static void peak_pci_post_irq(const struct sja1000_priv *priv)
+{
+	struct peak_pci_chan *chan = priv->priv;
+	void __iomem *icr_reg = chan->cfg_base + PITA_ICR;
+
+	/* Nothing pending for this channel */
+	if (!(readw(icr_reg) & chan->icr_mask))
+		return;
+
+	/* Select and clear in PITA stored interrupt */
+	writew(chan->icr_mask, icr_reg);
+}
+
+/*
+ * Bring up one PCAN PCI/PCIe card.
+ *
+ * Maps BAR 0 (PITA configuration) and BAR 1 (SJA1000 register windows),
+ * derives the number of channels from the PCI subsystem ID, then allocates
+ * and registers one SJA1000 network device per channel. The devices are
+ * linked through peak_pci_chan.next_dev, starting at the first channel,
+ * which is stored as driver data for peak_pci_remove().
+ *
+ * Returns 0 on success or a non-zero error code; in the error case
+ * everything acquired so far has been released again.
+ */
+static int __devinit peak_pci_probe(struct pci_dev *pdev,
+				    const struct pci_device_id *ent)
+{
+	struct sja1000_priv *priv;
+	struct peak_pci_chan *chan, *prev_chan = NULL;
+	struct net_device *dev, *dev0 = NULL;
+	void __iomem *cfg_base, *reg_base;
+	u16 sub_sys_id, icr;
+	int i, err, channels;
+
+	err = pci_enable_device(pdev);
+	if (err)
+		return err;
+
+	err = pci_request_regions(pdev, DRV_NAME);
+	if (err)
+		goto failure_disable_pci;
+
+	err = pci_read_config_word(pdev, 0x2e, &sub_sys_id);
+	if (err)
+		goto failure_release_regions;
+
+	dev_dbg(&pdev->dev, "probing device %04x:%04x:%04x\n",
+		pdev->vendor, pdev->device, sub_sys_id);
+
+	err = pci_write_config_word(pdev, 0x44, 0);
+	if (err)
+		goto failure_release_regions;
+
+	/* The subsystem ID encodes how many channels are populated */
+	if (sub_sys_id >= 12)
+		channels = 4;
+	else if (sub_sys_id >= 10)
+		channels = 3;
+	else if (sub_sys_id >= 4)
+		channels = 2;
+	else
+		channels = 1;
+
+	cfg_base = pci_iomap(pdev, 0, PEAK_PCI_CFG_SIZE);
+	if (!cfg_base) {
+		dev_err(&pdev->dev, "failed to map PCI resource #0\n");
+		/* err is still 0 here; without this probe would "succeed" */
+		err = -ENOMEM;
+		goto failure_release_regions;
+	}
+
+	reg_base = pci_iomap(pdev, 1, PEAK_PCI_CHAN_SIZE * channels);
+	if (!reg_base) {
+		dev_err(&pdev->dev, "failed to map PCI resource #1\n");
+		err = -ENOMEM;
+		goto failure_unmap_cfg_base;
+	}
+
+	/* Set GPIO control register */
+	writew(0x0005, cfg_base + PITA_GPIOICR + 2);
+	/* Enable all channels of this card */
+	writeb(0x00, cfg_base + PITA_GPIOICR);
+	/* Toggle reset */
+	writeb(0x05, cfg_base + PITA_MISC + 3);
+	mdelay(5);
+	/* Leave parport mux mode */
+	writeb(0x04, cfg_base + PITA_MISC + 3);
+
+	/* Collect the per-channel enable bits; written back once all is up */
+	icr = readw(cfg_base + PITA_ICR + 2);
+
+	for (i = 0; i < channels; i++) {
+		dev = alloc_sja1000dev(sizeof(struct peak_pci_chan));
+		if (!dev) {
+			err = -ENOMEM;
+			goto failure_remove_channels;
+		}
+
+		priv = netdev_priv(dev);
+		chan = priv->priv;
+
+		chan->cfg_base = cfg_base;
+		chan->next_dev = NULL;
+		priv->reg_base = reg_base + i * PEAK_PCI_CHAN_SIZE;
+
+		priv->read_reg = peak_pci_read_reg;
+		priv->write_reg = peak_pci_write_reg;
+		priv->post_irq = peak_pci_post_irq;
+
+		priv->can.clock.freq = PEAK_PCI_CAN_CLOCK;
+		priv->ocr = PEAK_PCI_OCR;
+		priv->cdr = PEAK_PCI_CDR;
+		/* Neither a slave nor a single device distributes the clock */
+		if (channels == 1 || i > 0)
+			priv->cdr |= CDR_CLK_OFF;
+
+		/* Setup interrupt handling */
+		priv->irq_flags = IRQF_SHARED;
+		dev->irq = pdev->irq;
+
+		chan->icr_mask = peak_pci_icr_masks[i];
+		icr |= chan->icr_mask;
+
+		SET_NETDEV_DEV(dev, &pdev->dev);
+
+		err = register_sja1000dev(dev);
+		if (err) {
+			dev_err(&pdev->dev, "failed to register device\n");
+			/* Not linked into the chain yet, so free it here */
+			free_sja1000dev(dev);
+			goto failure_remove_channels;
+		}
+
+		/*
+		 * Create chain of SJA1000 devices: the previous channel
+		 * points to this one, the first one becomes the head.
+		 */
+		if (prev_chan)
+			prev_chan->next_dev = dev;
+		else
+			dev0 = dev;
+		prev_chan = chan;
+
+		dev_info(&pdev->dev,
+			 "%s at reg_base=0x%p cfg_base=0x%p irq=%d\n",
+			 dev->name, priv->reg_base, chan->cfg_base, dev->irq);
+	}
+
+	pci_set_drvdata(pdev, dev0);
+
+	/* Enable interrupts */
+	writew(icr, cfg_base + PITA_ICR + 2);
+
+	return 0;
+
+failure_remove_channels:
+	/* Disable interrupts */
+	writew(0x0, cfg_base + PITA_ICR + 2);
+
+	/* Tear down the channels registered so far, starting at the head */
+	while (dev0) {
+		priv = netdev_priv(dev0);
+		chan = priv->priv;
+		/*
+		 * chan belongs to dev0's private area (sized through
+		 * alloc_sja1000dev()), so fetch the link before freeing.
+		 */
+		dev = chan->next_dev;
+		unregister_sja1000dev(dev0);
+		free_sja1000dev(dev0);
+		dev0 = dev;
+	}
+
+	pci_iounmap(pdev, reg_base);
+
+failure_unmap_cfg_base:
+	pci_iounmap(pdev, cfg_base);
+
+failure_release_regions:
+	pci_release_regions(pdev);
+
+failure_disable_pci:
+	pci_disable_device(pdev);
+
+	return err;
+}
+
+/*
+ * Undo peak_pci_probe(): mask the card's interrupts, unregister and free
+ * every device reachable through the next_dev chain that starts at the
+ * driver data, then unmap both BARs and release the PCI device.
+ */
+static void __devexit peak_pci_remove(struct pci_dev *pdev)
+{
+	struct net_device *dev = pci_get_drvdata(pdev); /* First device */
+	struct net_device *next_dev;
+	struct sja1000_priv *priv = netdev_priv(dev);
+	struct peak_pci_chan *chan = priv->priv;
+	/* Taken from the first device: its reg_base is the start of BAR 1 */
+	void __iomem *cfg_base = chan->cfg_base;
+	void __iomem *reg_base = priv->reg_base;
+
+	/* Disable interrupts */
+	writew(0x0, cfg_base + PITA_ICR + 2);
+
+	/* Loop over all registered devices */
+	while (dev) {
+		priv = netdev_priv(dev);
+		chan = priv->priv;
+		/*
+		 * chan is part of dev's private area, so the link must be
+		 * read before the device is freed.
+		 */
+		next_dev = chan->next_dev;
+
+		dev_info(&pdev->dev, "removing device %s\n", dev->name);
+		unregister_sja1000dev(dev);
+		free_sja1000dev(dev);
+		dev = next_dev;
+	}
+
+	pci_iounmap(pdev, reg_base);
+	pci_iounmap(pdev, cfg_base);
+	pci_release_regions(pdev);
+	pci_disable_device(pdev);
+
+	pci_set_drvdata(pdev, NULL);
+}
+
+static struct pci_driver peak_pci_driver = { /* Passed to the PCI core below */
+	.name = DRV_NAME,
+	.id_table = peak_pci_tbl,
+	.probe = peak_pci_probe,
+	.remove = __devexit_p(peak_pci_remove),
+};
+
+static int __init peak_pci_init(void)
+{
+	return pci_register_driver(&peak_pci_driver); /* Result is the module load status */
+}
+module_init(peak_pci_init);
+
+static void __exit peak_pci_exit(void)
+{
+	pci_unregister_driver(&peak_pci_driver); /* Counterpart of peak_pci_init() */
+}
+module_exit(peak_pci_exit);
-- 
1.7.4.1

^ permalink raw reply related

* Re: [PATCH] per-cgroup tcp buffer limitation
From: Greg Thelen @ 2011-09-08 21:53 UTC (permalink / raw)
  To: Glauber Costa
  Cc: linux-kernel, linux-mm, containers, netdev, xemul,
	David S. Miller, Hiroyouki Kamezawa, Eric W. Biederman,
	Suleiman Souhlal
In-Reply-To: <4E68484A.4000201@parallels.com>

On Wed, Sep 7, 2011 at 9:44 PM, Glauber Costa <glommer@parallels.com> wrote:

Thanks for your ideas and patience.

> Well, it is a way to see this. The other way to see this, is that you're
> proposing to move to the kernel, something that really belongs in userspace.
> That's because:
>
> With the information you provided me, I have no reason to believe that the
> kernel has more condition to do this work. Do the kernel have access to any
> information that userspace do not, and can't be exported? If not, userspace
> is traditionally where this sort of stuff has been done.

I think direct reclaim is a pain if user space is required to participate in
memory balancing decisions.  One thing a single memory limit solution has is the
ability to reclaim user memory to satisfy growing kernel memory needs (and vice
versa).  If a container must fit within 100M, then a single limit solution
would set the limit to 100M and never change it.  In a split limit solution a
user daemon (e.g. uswapd) would need to monitor the usage and the amount of
active memory vs inactive user memory and unreferenced kernel memory to
determine where to apply pressure.  With some more knobs such a uswapd could
attempt to keep ahead of demand.  But eventually direct reclaim would
be needed to satisfy rapid growth spikes.  Example: If the 100M container
starts with limits of 20M kmem and 80M user memory but later its kernel
memory needs grow to 70M.  With separate user and kernel memory
limits the kernel memory allocation could fail despite there being
reclaimable user pages available.  The job should have a way to
transition its memory limits to 70M+ of kernel and 30M- of user memory.

I suppose a GFP_WAIT slab kernel page allocation could wakeup user space to
perform user-assisted direct reclaim.  User space would then lower the user
limit thereby causing the kernel to direct reclaim user pages, then
the user daemon would raise the kernel limit allowing the slab allocation to
succeed.  My hunch is that this would be prone to deadlocks (what prevents
uswapd from needing even more kmem?)  I'll defer to more
experienced minds to know if user assisted direct memory reclaim has
other pitfalls.  It scares me.

Fundamentally I have no problem putting an upper bound on a cgroup's resource
usage.  This serves to contain the damage a job can do to the system and other
jobs.  My concern is about limiting the kernel's ability to trade one type of
memory for another by using different cgroups for different types of memory.

If kmem expands to include reclaimable kernel memory (e.g. dentry) then I
presume the kernel would have no way to exchange unused user pages for dentry
pages even if the user memory in the container is well below its limit.  This is
motivation for the above user assisted direct reclaim.

Do you feel the need to segregate user and kernel memory into different cgroups
with independent limits?  Or is this just a way to create a new clean
cgroup with a simple purpose?

In some resource sharing shops customers purchase a certain amount of memory,
cpu, network, etc.  Such customers don't define how the memory is used and the
user/kernel mixture may change over time.  Can a user space reclaim daemon stay
ahead of the workload's needs?

> Using userspace CPU is no different from using kernel cpu in this particular
> case. It is all overhead, regardless where it comes from. Moreover, you end
> up setting up a policy, instead of a mechanism. What should be this
> proportion?  Do we reclaim everything with the same frequency? Should we be
> more tolerant with a specific container?

I assume that this implies that a generic kmem cgroup usage is inferior to
separate limits for each kernel memory type to allow user space the flexibility
to choose between kernel types (udp vs tcp vs ext4 vs page_tables vs ...)?  Do
you foresee a way to provide a limit on the total amount of kmem usage by all
such types?  If a container wants to dedicate 4M for all network protocol
buffers (tcp, udp, etc.) would that require a user space daemon to balance
memory limits b/w the protocols?

> Also, If you want to allow any flexibility in this scheme, like: "Should
> this network container be able to stress the network more, pinning more
> memory, but not other subsystems?", you end up having to touch all
> individual files anyway - probably with a userspace daemon.
>
> Also, as you noticed yourself, kernel memory is fundamentally different from
> userspace memory. You can't just set reclaim limits, since you have no
> guarantees it will work. User memory is not a scarce resource.
> Kernel memory is.

I agree that kernel memory is somewhat different.  In some (I argue most)
situations containers want the ability to exchange job kmem and job umem.
Either split or combined accounting protects the system and isolates other
containers from kmem allocations of a bad job.  To me it seems natural to
indicate that job X gets Y MB of memory.  I have more trouble dividing the
Y MB of memory into dedicated slices for different types of memory.

>> While there are people (like me) who want a combined memory usage
>> limit there are also people (like you) who want separate user and
>> kernel limiting.
>
> Combined excludes separate. Separate does not exclude combined.

I agree.  I have no problem with separate accounting and separate
user-accessible pressure knobs to allow for complex policies.  My concern is
about limiting the kernel's ability to reclaim one type of memory to
fulfill the needs of another memory type (e.g. I think reclaiming clean file
pages should be possible to make room for user slab needs).  I think
memcg aware slab accounting does a good job of limiting a job's
memory allocations.
Would such slab accounting meet your needs?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [net-next-2.6 PATCH 3/3 RFC] macvtap: Add support for TUNSETTXFILTER
From: Arnd Bergmann @ 2011-09-08 16:25 UTC (permalink / raw)
  To: Roopa Prabhu; +Cc: netdev, dragos.tatulea, mst, dwang2, benve, kaber, sri
In-Reply-To: <20110906223555.6552.50485.stgit@savbu-pc100.cisco.com>

On Wednesday 07 September 2011, Roopa Prabhu wrote:
> From: Roopa Prabhu <roprabhu@cisco.com>
> 
> This patch adds support for TUNSETTXFILTER. Calls macvlan set filter function
> with address list and flags received via TUNSETTXFILTER.
> 
> Signed-off-by: Roopa Prabhu <roprabhu@cisco.com>
> Signed-off-by: Christian Benvenuti <benve@cisco.com>
> Signed-off-by: David Wang <dwang2@cisco.com>

Looks ok to me in principle, but

> +               /* XXX: If broadcast address present, set IFF_BROADCAST */
> +               /* XXX: If multicast address present, set IFF_MULTICAST */
> +               flags |= (tf.flags & TUN_FLT_ALLMULTI ? IFF_ALLMULTI : 0) |
> +                        (!tf.count ? IFF_PROMISC : 0);
> +               ret = 0;
> +               if (tf.count > 0) {
> +                       alen = ETH_ALEN * tf.count;
> +                       addrs = kmalloc(alen, GFP_KERNEL);
> +                       if (!addrs) {
> +                               dev_put(vlan->dev);
> +                               return -ENOMEM;
> +                       }

I think you need to check tf.count for a maximum value. In theory, a user
could pass a rather large number (65536) which is not good.

Also the TUNSETTXFILTER code looks sufficiently large that it would be
better to put it into a separate function. Use "goto" statements in
order to do the error handling in there, instead of repeating
lots of kfree and dev_put calls in each error case.

	Arnd

^ permalink raw reply

* [PATCH 2/2] GRETH: avoid overwrite IP-stack's IP-frags checksum
From: Daniel Hellstrom @ 2011-09-08 13:14 UTC (permalink / raw)
  To: davem; +Cc: netdev, kristoffer
In-Reply-To: <1315487676-16733-1-git-send-email-daniel@gaisler.com>

The GRETH GBIT core does not do checksum offloading for IP
segmentation. This patch adds a check in the xmit function to
determine if the stack has calculated the checksum for us.

Signed-off-by: Daniel Hellstrom <daniel@gaisler.com>
---
 drivers/net/greth.c |    7 +++++--
 1 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/greth.c b/drivers/net/greth.c
index 9d39fb9..27ba855 100644
--- a/drivers/net/greth.c
+++ b/drivers/net/greth.c
@@ -489,7 +489,8 @@ greth_start_xmit_gbit(struct sk_buff *skb, struct net_device *dev)
 	if (nr_frags != 0)
 		status = GRETH_TXBD_MORE;
 
-	status |= GRETH_TXBD_CSALL;
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
+		status |= GRETH_TXBD_CSALL;
 	status |= skb_headlen(skb) & GRETH_BD_LEN;
 	if (greth->tx_next == GRETH_TXBD_NUM_MASK)
 		status |= GRETH_BD_WR;
@@ -512,7 +513,9 @@ greth_start_xmit_gbit(struct sk_buff *skb, struct net_device *dev)
 		greth->tx_skbuff[curr_tx] = NULL;
 		bdp = greth->tx_bd_base + curr_tx;
 
-		status = GRETH_TXBD_CSALL | GRETH_BD_EN;
+		status = GRETH_BD_EN;
+		if (skb->ip_summed == CHECKSUM_PARTIAL)
+			status |= GRETH_TXBD_CSALL;
 		status |= frag->size & GRETH_BD_LEN;
 
 		/* Wrap around descriptor ring */
-- 
1.5.4

^ permalink raw reply related

* [PATCH 1/2] GRETH: RX/TX bytes were never increased
From: Daniel Hellstrom @ 2011-09-08 13:14 UTC (permalink / raw)
  To: davem; +Cc: netdev, kristoffer

Signed-off-by: Daniel Hellstrom <daniel@gaisler.com>
---
 drivers/net/greth.c |    5 +++++
 drivers/net/greth.h |    1 +
 2 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/drivers/net/greth.c b/drivers/net/greth.c
index 672f096..9d39fb9 100644
--- a/drivers/net/greth.c
+++ b/drivers/net/greth.c
@@ -426,6 +426,7 @@ greth_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	dma_sync_single_for_device(greth->dev, dma_addr, skb->len, DMA_TO_DEVICE);
 
 	status = GRETH_BD_EN | GRETH_BD_IE | (skb->len & GRETH_BD_LEN);
+	greth->tx_bufs_length[greth->tx_next] = skb->len & GRETH_BD_LEN;
 
 	/* Wrap around descriptor ring */
 	if (greth->tx_next == GRETH_TXBD_NUM_MASK) {
@@ -639,6 +640,7 @@ static void greth_clean_tx(struct net_device *dev)
 				dev->stats.tx_fifo_errors++;
 		}
 		dev->stats.tx_packets++;
+		dev->stats.tx_bytes += greth->tx_bufs_length[greth->tx_last];
 		greth->tx_last = NEXT_TX(greth->tx_last);
 		greth->tx_free++;
 	}
@@ -693,6 +695,7 @@ static void greth_clean_tx_gbit(struct net_device *dev)
 		greth->tx_skbuff[greth->tx_last] = NULL;
 
 		greth_update_tx_stats(dev, stat);
+		dev->stats.tx_bytes += skb->len;
 
 		bdp = greth->tx_bd_base + greth->tx_last;
 
@@ -794,6 +797,7 @@ static int greth_rx(struct net_device *dev, int limit)
 				memcpy(skb_put(skb, pkt_len), phys_to_virt(dma_addr), pkt_len);
 
 				skb->protocol = eth_type_trans(skb, dev);
+				dev->stats.rx_bytes += pkt_len;
 				dev->stats.rx_packets++;
 				netif_receive_skb(skb);
 			}
@@ -908,6 +912,7 @@ static int greth_rx_gbit(struct net_device *dev, int limit)
 
 				skb->protocol = eth_type_trans(skb, dev);
 				dev->stats.rx_packets++;
+				dev->stats.rx_bytes += pkt_len;
 				netif_receive_skb(skb);
 
 				greth->rx_skbuff[greth->rx_cur] = newskb;
diff --git a/drivers/net/greth.h b/drivers/net/greth.h
index 9a0040d..232a622 100644
--- a/drivers/net/greth.h
+++ b/drivers/net/greth.h
@@ -103,6 +103,7 @@ struct greth_private {
 
 	unsigned char *tx_bufs[GRETH_TXBD_NUM];
 	unsigned char *rx_bufs[GRETH_RXBD_NUM];
+	u16 tx_bufs_length[GRETH_TXBD_NUM];
 
 	u16 tx_next;
 	u16 tx_last;
-- 
1.5.4

^ permalink raw reply related

* [PATCH] RDSRDMA: Fix cleanup of rds_iw_mr_pool
From: Jonathan Lallinger @ 2011-09-08 18:09 UTC (permalink / raw)
  To: venkat.x.venkatsubra; +Cc: netdev, rds-devel

In the rds_iw_mr_pool struct the free_pinned field keeps track of memory pinned
by free MRs. While this field is incremented properly upon allocation, it is never
decremented upon unmapping. This would cause the rds_rdma module to crash the
kernel upon unloading, by triggering the BUG_ON in the rds_iw_destroy_mr_pool
function.

This change keeps track of the MRs that become unpinned, so that free_pinned
can be decremented appropriately.

Signed-off-by: Jonathan Lallinger <jonathan@ogc.us>
Signed-off-by: Steve Wise <swise@ogc.us>
---

 net/rds/iw_rdma.c |   13 +++++++++----
 1 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
index 8b77edb..5f18928 100644
--- a/net/rds/iw_rdma.c
+++ b/net/rds/iw_rdma.c
@@ -84,7 +84,8 @@ static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
 static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
 static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
 			struct list_head *unmap_list,
-			struct list_head *kill_list);
+			struct list_head *kill_list,
+			int *unpinned);
 static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
 
 static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwdev, struct rdma_cm_id **cm_id)
@@ -499,7 +500,7 @@ static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
 	LIST_HEAD(unmap_list);
 	LIST_HEAD(kill_list);
 	unsigned long flags;
-	unsigned int nfreed = 0, ncleaned = 0, free_goal;
+	unsigned int nfreed = 0, ncleaned = 0, unpinned = 0, free_goal;
 	int ret = 0;
 
 	rds_iw_stats_inc(s_iw_rdma_mr_pool_flush);
@@ -524,7 +525,8 @@ static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
 	 * will be destroyed by the unmap function.
 	 */
 	if (!list_empty(&unmap_list)) {
-		ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list, &kill_list);
+		ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list,
+						&kill_list, &unpinned);
 		/* If we've been asked to destroy all MRs, move those
 		 * that were simply cleaned to the kill list */
 		if (free_all)
@@ -548,6 +550,7 @@ static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
 		spin_unlock_irqrestore(&pool->list_lock, flags);
 	}
 
+	atomic_sub(unpinned, &pool->free_pinned);
 	atomic_sub(ncleaned, &pool->dirty_count);
 	atomic_sub(nfreed, &pool->item_count);
 
@@ -828,7 +831,8 @@ static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool,
 
 static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
 				struct list_head *unmap_list,
-				struct list_head *kill_list)
+				struct list_head *kill_list,
+				int *unpinned)
 {
 	struct rds_iw_mapping *mapping, *next;
 	unsigned int ncleaned = 0;
@@ -855,6 +859,7 @@ static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
 
 		spin_lock_irqsave(&pool->list_lock, flags);
 		list_for_each_entry_safe(mapping, next, unmap_list, m_list) {
+			*unpinned += mapping->m_sg.len;
 			list_move(&mapping->m_list, &laundered);
 			ncleaned++;
 		}

^ permalink raw reply related

* Re: [PATCH] net: phy: Add config option to specify external switch port to be used if switch is used as PHY
From: Francois Romieu @ 2011-09-08 21:24 UTC (permalink / raw)
  To: Lambrecht Jürgen
  Cc: netdev@vger.kernel.org, linux-embedded@vger.kernel.org
In-Reply-To: <4E68AE26.5020707@televic.com>

Lambrecht Jürgen <J.Lambrecht@TELEVIC.com> :
> On 09/08/2011 12:13 PM, Francois Romieu wrote:
[...]
> > Which driver(s) do you use that you can not set phy_mask directly ?
> >
> The HW driver is 'FEC' for iMX Ethernet. For the PHY, just MII and PHYLIB.
> I am rather new to Linux and didn't know about phy_mask. I checked it now, and it is
> not set in fec.c.
> You mean then to patch drivers/net/fec.c in the same way (as my current 
> patch) to set the phy_mask instead (via menuconfig, or in the platform 
> init)?

It is not my area but I would have drivers/net/fec.c::fec_devtype.driver_data
point to a real struct where the relevant phy_mask and the current quirks
are stored, then add a new entry in fec_devtype and a reference to it
(Kconfig + platform init) somewhere below arch/arm/plat-mxc.

Freescale's application note suggests that the MX25 fec allows some freedom
for the implementation of the media interface. So it may not be overkill.

-- 
Ueimor

^ permalink raw reply

* Re: [PATCH -next v2] unix stream: Fix use-after-free crashes
From: Tim Chen @ 2011-09-08  9:24 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Yan, Zheng, Yan, Zheng, netdev@vger.kernel.org,
	davem@davemloft.net, sfr@canb.auug.org.au, jirislaby@gmail.com,
	sedat.dilek@gmail.com, Shi, Alex, Valdis Kletnieks
In-Reply-To: <1315488497.2456.21.camel@edumazet-HP-Compaq-6005-Pro-SFF-PC>

On Thu, 2011-09-08 at 15:28 +0200, Eric Dumazet wrote:
> Le mercredi 07 septembre 2011 à 23:26 +0200, Eric Dumazet a écrit :
> > Le mercredi 07 septembre 2011 à 05:01 -0700, Tim Chen a écrit :
> 
> > > Eric, are you planning to do a fast path patch that doesn't do pid ref
> > > for the case where CONFIG_PID_NS is not set?
> > > 
> > 
> > Yes, I'll try to cook a patch.
> 
> Thinking a bit more on this issue, I really believe we should not stick
> pid/cred in skbs sent from a write() system call.

I prefer this approach too.

> 
> That would break following use case :
> 
> An application uses a write(fd) and expects a receiver using recvmsg()
> to get process credentials (SCM_CREDENTIALS)
> 
> This is currently working, but not documented (man unix says ancillary
> data are sent with sendmsg())
> 
> If everybody agrees, I can send a patch for this : This would speedup
> write()/read() af_unix by an order of magnitude.
> 

Looking forward to the patch.  This should improve the scalability of
af_unix.

Tim

^ permalink raw reply

* Re: [net-next-2.6 PATCH 0/3 RFC] macvlan: MAC Address filtering support for passthru mode
From: Michael S. Tsirkin @ 2011-09-08 19:11 UTC (permalink / raw)
  To: Roopa Prabhu
  Cc: netdev, dragos.tatulea, arnd, dwang2, benve, kaber, sri, davem,
	eric.dumazet, mchan, kvm
In-Reply-To: <CA8E3924.33B60%roprabhu@cisco.com>

On Thu, Sep 08, 2011 at 09:19:32AM -0700, Roopa Prabhu wrote:
> >>> There are more features we'll want down the road though,
> >>> so let's see whether the interface will be able to
> >>> satisfy them in a backwards compatible way before we
> >>> set it in stone. Here's what I came up with:
> >>> 
> >>> How will the filtering table be partitioned within guests?
> >> 
> >> Since this patch supports macvlan PASSTHRU mode only, in which the lower
> >> device has 1-1 mapping to the guest nic, it does not require any
> >> partitioning of filtering table within guests. Unless I missed understanding
> >> something. 
> >> If the lower device were being shared by multiple guest network interfaces
> >> (non PASSTHRU mode), only then we will need to maintain separate filter
> >> tables for each guest network interface in macvlan and forward the pkt to
> >> respective guest interface after a filter lookup. This could affect
> >> performance too I think.
> > 
> > Not with hardware filtering support. Which is where we'd need to
> > partition the host nic mac table between guests.
> > 
> I need to understand this more. In non passthru case when a VF or physical
> nic is shared between guests,

For example, consider a VF given to each guest. Hardware supports a fixed
total number of filters, which can be partitioned between VFs.

> the nic does not really know about the guests,
> so I was thinking we do the same thing as we do for the passthru case (ie
> send all the address filters from macvlan to the physical nic). So at the
> hardware, filtering is done for all guests sharing the nic. But if we want
> each virtio-net nic or guest to get exactly what it asked for
> macvlan/macvtap needs to maintain a copy of each guest filter and do a
> lookup and send only the requested traffic to the guest. Here is the
> performance hit that I was seeing. Please see my next comment for further
> details. 

It won't be any slower than attaching a non-passthrough macvlan
to a device, will it?

> 
> >> I chose to support PASSTHRU Mode only at first because it's simpler and all
> >> code additions are in control path only.
> > 
> > I agree. It would be a bit silly to have a dedicated interface
> > for passthough and a completely separate one for
> > non passthrough.
> >
> Agree. The reason I did not focus on non-passthru case in the initial
> version was because I was thinking things to do in the non-passthru case
> will be just add-ons to the passthru case. But true, it is better to flesh out the
> non-passthru case details.
> 
> After dwelling on this a bit more how about the below:
> 
> Phase 1: Goal: Enable hardware filtering for all macvlan modes
>     - In macvlan passthru mode the single guest virtio-nic connected will
>       receive traffic that he requested for
>     - In macvlan non-passthru mode all guest virtio-nics sharing the
>       physical nic will see all other guest traffic
>       but the filtering at guest virtio-nic

I don't think guests currently filter anything.

>       will make sure each guest
>       eventually sees traffic he asked for. This is still better than
>       putting the physical nic in promiscuous mode.
> 
> (This is mainly what my patch does...but will need to remove the passthru
> check and see if there is anything else needed for the non-passthru case)

I'm fine with sticking with passthrough, make non passthrough
a separate phase.

> 
> Phase 2: Goal: Enable filtering at macvlan so that each guest virtio-nic
> receives only what he requested for.
>     - In this case, in addition to pushing the filters down to the physical
>       nic we will have to maintain the same filter in macvlan and do a filter
>       lookup before forwarding the traffic to a virtio-nic.
> 
> But I am thinking phase 2 might be redundant given virtio-nic already does
> filtering for the guest.

It does? Do you mean the filter that qemu does in userspace?

> In which case we might not need phase 2 at all. I
> might have been over complicating things.
> 
> Please comment. And please correct if I missed something.
>  
>  
> >>> 
> >>> A way to limit what the guest can do would also be useful.
> >>> How can this be done? selinux?
> >> 
> >> I vaguely remember a thread on the same context.. had a suggestion to
> >> maintain pre-approved address lists and allow guest filter registration of
> >> only those addresses for security. This seemed reasonable. Plus the ability
> >> to support additional address registration from guest could be made
> >> configurable (One of your ideas again from prior work).
> >> 
> >> I am not an selinux expert, but I am thinking we can use it to only allow or
> >> disallow access or operations to the macvtap device. (?). I will check more
> >> on this.
> > 
> > We'd have to have a way to revoke that as well.
> > 
> Yes true.
> 
> 
> >>> 
> >>> Any thoughts on spoofing filtering?
> >> 
> >> I can only think of checking addresses against an allowed address list.
> >> Don't know of any other ways. Any hints ?
> > 
> > Hardware (esp SRIOV) often has ways to do this check, too.
> > 
> Yes correct. Hw sriov and even switch in 802.1Qbh has anti-spoofing feature.
> In which case I am thinking having it at the macvtap layer is not an
> absolute must (?).

Exactly. But let's figure out *how* it will be programmed.
If anti-spoofing is programmed with netlink, maybe that's
a better interface for rx filter too, for consistency.

> >> 
> >> In any case I am assuming all the protection/security measures should be
> >> taken at the layer calling the TUNSETTXFILTER ie..In macvtap virtualization
> >> use case its libvirt or qemu-kvm. No ?
> > 
> > Ideally we'd have a way to separate these capabilities, so that libvirt
> > can override qemu.
> > 
> >>> 
> >>> Would it be possible to make the filtering programmable
> >>> using netlink, e.g. ethtool, ip, or some such?
> >> 
> >> Should be possible via ethtool or ip calling ioctl TUNSETTXFILTER. Are you
> >> thinking of macvlan having a netlink interface to set filter and not ioctl
> >> ?. Sure.
> > 
> > Yes.
> > 
> >> But I was thinking the point of implementing TUNSETTXFILTER was to
> >> maintain compatibility with the generic tap interface that does the same
> >> thing. 
> > 
> > Yes. OTOH I don't think anyone uses that ATM so it might not
> > be important if it's not a good fit.
> > E.g. we could notify libvirt and have it use netlink for us
> > if we like that better.
> > 
> Ok thanks for clarifying that. One more reason to use TUNSETTXFILTER
> interface was for qemu-kvm who uses the same tap interface for macvtap and
> regular tap. So if we use netlink we have to do different things for macvtap
> and tap filters in qemu. And qemu-kvm does not distinguish between macvtap
> and tap as far as I know. No ?

It's not a question of simplifying qemu as much as trying to
make the kernel interface abstract device differences
away from users. Using same interface for tun and macvtap
gave us some confidence that the interface is a good one.

But this does not seem to have worked with TUNSETTXFILTER -
at least qemu doesn't use it yet, and it's been upstream
a while. So there's no proof it's a good interface.

So if we decide netlink is a better interface we can add it for tun too.
We need to be backwards compatible and figure out what happens if someone
tries to use both methods: probably apply both or ignore TUNSETTXFILTER ...

> 
> Thank you for your review and comments.
> 
> 
> >> And having both the netlink op and ioctl interface might not be clean ?.
> > 
> > No idea.
> > 
> >> Sorry if I misunderstood your question.
> >> 
> >>> That would make this useful for bridged setups besides
> >>> macvtap/virtualization.
> >>> 
> >> 
> >> Thanks for the comments. 

Overall good progress, and don't let the interface discussions
block you. You want to push in two directions - stabilize code in one
branch, and play with interfaces in another one. By the time there's a
consensus on the interfaces you have the main logic all ready,
then you merge.

-- 
MST

^ permalink raw reply

* Re: [PATCH 1/2] iwlegacy: change IWL_WARN to IWL_DEBUG_HT in iwl4965_tx_agg_start
From: Stanislaw Gruszka @ 2011-09-08 16:11 UTC (permalink / raw)
  To: Greg Dietsche; +Cc: linville, linux-wireless, netdev, linux-kernel
In-Reply-To: <4E683BBE.3060105@cuw.edu>

Hi Greg

On Wed, Sep 07, 2011 at 10:51:26PM -0500, Greg Dietsche wrote:
> On 09/06/2011 10:01 AM, Stanislaw Gruszka wrote:
> >I put patches here:
> >http://people.redhat.com/sgruszka/iwlegacy_cleanup.tar.bz2
> >
> >They are on top of wireless-testing tree.
> <snip>
> >Series include your 2 patches. You can test this cleanup and
> >apply your new changes on top. I'll not do any further cleanup
> >for some time now, perhaps continue when I got public git tree.
> >
> Thanks! I've re-worked my patches and you can find them here:
> http://www.gregd.org/stuff/linux/iwlegacy_cleanup_greg.tar.bz2
> 
> I also decided to play with github a little bit:
> git://github.com/dietsche/linux.git and pushed two branches:
>   1) wireless-next-iwlegacy-stanislaw - your patch set
>   2) wireless-next-iwlegacy-stanislaw-greg - a branch that has my
> additional patches.
> `git format-patch wireless-next-iwlegacy-stanislaw..wireless-next-iwlegacy-stanislaw-greg`
> will generate the patches that are in the link i posted above.

Cool!

> The first two patches in my series are the ones that I think folks
> should take a closer look at. The rest are pretty safe.

The second patch is ok. I'm not sure about first one, but we can get
rid of "ctx = il_rxon_ctx_from_vif(vif)" at all, because we have
only one context. Removing il_rxon_context structure from
iwlegacy driver is my long term plan, you can look at that
if you wish.

Thanks
Stanislaw

^ permalink raw reply

* Re: [net-next-2.6 PATCH 0/3 RFC] macvlan: MAC Address filtering support for passthru mode
From: Roopa Prabhu @ 2011-09-08 16:19 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: netdev, dragos.tatulea, arnd, dwang2, benve, kaber, sri, davem,
	eric.dumazet, mchan, kvm
In-Reply-To: <20110908110835.GA25984@redhat.com>




On 9/8/11 4:08 AM, "Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Wed, Sep 07, 2011 at 10:20:28PM -0700, Roopa Prabhu wrote:
>> On 9/7/11 5:34 AM, "Michael S. Tsirkin" <mst@redhat.com> wrote:
>> 
>>> On Tue, Sep 06, 2011 at 03:35:40PM -0700, Roopa Prabhu wrote:
>>>> This patch is an attempt at providing address filtering support for macvtap
>>>> devices in PASSTHRU mode. It's still a work in progress.
>>>> Briefly tested for basic functionality. Wanted to get some feedback on the
>>>> direction before proceeding.
>>>> 
>>> 
>>> Good work, thanks.
>>> 
>> 
>> Thanks.
>> 
>>>> I have hopefully CC'ed all concerned people.
>>> 
>>> kvm crowd might also be interested.
>>> Try using ./scripts/get_maintainer.pl as well.
>>> 
>> Thanks for the tip. Expanded CC list a bit more.
>> 
>>>> PASSTHRU mode today sets the lowerdev in promiscuous mode. In PASSTHRU mode
>>>> there is a 1-1 mapping between macvtap device and physical nic or VF. And
>>>> all
>>>> filtering is done in lowerdev hw. The lowerdev does not need to be in
>>>> promiscuous mode as long as the guest filters are passed down to the
>>>> lowerdev.
>>>> This patch tries to remove the need for putting the lowerdev in promiscuous
>>>> mode.
>>>> I have also referred to the thread below where TUNSETTXFILTER was mentioned
>>>> in 
>>>> this context: 
>>>>  http://patchwork.ozlabs.org/patch/69297/
>>>> 
>>>> This patch basically passes the addresses got by TUNSETTXFILTER to macvlan
>>>> lowerdev.
>>>> 
>>>> I have looked at previous work and discussions on this for qemu-kvm
>>>> by Michael Tsirkin, Alex Williamson and Dragos Tatulea
>>>> http://patchwork.ozlabs.org/patch/78595/
>>>> http://patchwork.ozlabs.org/patch/47160/
>>>> https://patchwork.kernel.org/patch/474481/
>>>> 
>>>> Redhat bugzilla by Michael Tsirkin:
>>>> https://bugzilla.redhat.com/show_bug.cgi?id=655013
>>>> 
>>>> I used Michael's qemu-kvm patch for testing the changes with KVM
>>>> 
>>>> I would like to cover both MAC and vlan filtering in this work.
>>>> 
>>>> Open Questions/Issues:
>>>> - There is a need for vlan filtering to complete the patch. It will require
>>>>   a new tap ioctl cmd for vlans.
>>>>   Some ideas on this are:
>>>> 
>>>>   a) TUNSETVLANFILTER: This will entail we send the whole vlan bitmap
>>>> filter
>>>> (similar to tun_filter for addresses). Passing the vlan id's to lower
>>>> device will mean going thru the whole list of vlans every time.
>>>> 
>>>>   OR
>>>> 
>>>>   b) TUNSETVLAN with vlan id and flag to set/unset
>>>> 
>>>>   Does option 'b' sound ok ?
>>>> 
>>>> - In this implementation we make the macvlan address list same as the
>>>> address
>>>>   list that came in the filter with TUNSETTXFILTER. This will not cover
>>>> cases
>>>>   where the macvlan device needs to have other addresses that are not
>>>>   necessarily in the filter. Is this a problem ?
>>> 
>>> What cases do you have in mind?
>>> 
>> This patch targets only macvlan PASSTHRU mode and for PASSTHRU mode I don't
>> see a problem with uc/mc address list being the same in all the stacked
>> netdevs in the path. I called that out above to make sure I was not missing
>> any case in PASSTHRU mode where this might be invalid. Otherwise I don't see
>> a problem in the simple PASSTHRU use case this patch supports.
>> 
>>>> - The patch currently only supports passing of IFF_PROMISC and
>>>> IFF_MULTICAST
>>>> filter flags to lowerdev
>>>> 
>>>> This patch series implements the following
>>>> 01/3 - macvlan: Add support for unicast filtering in macvlan
>>>> 02/3 - macvlan: Add function to set addr filter on lower device in passthru
>>>> mode
>>>> 03/3 - macvtap: Add support for TUNSETTXFILTER
>>>> 
>>>> Please comment. Thanks.
>>>> 
>>>> Signed-off-by: Roopa Prabhu <roprabhu@cisco.com>
>>>> Signed-off-by: Christian Benvenuti <benve@cisco.com>
>>>> Signed-off-by: David Wang <dwang2@cisco.com>
>>> 
>>> The security isn't lower than with promisc, so I don't see
>>> a problem with this as such.
>>> 
>>> There are more features we'll want down the road though,
>>> so let's see whether the interface will be able to
>>> satisfy them in a backwards compatible way before we
>>> set it in stone. Here's what I came up with:
>>> 
>>> How will the filtering table be partitioned within guests?
>> 
>> Since this patch supports macvlan PASSTHRU mode only, in which the lower
>> device has 1-1 mapping to the guest nic, it does not require any
>> partitioning of filtering table within guests. Unless I missed understanding
>> something. 
>> If the lower device were being shared by multiple guest network interfaces
>> (non PASSTHRU mode), only then we will need to maintain separate filter
>> tables for each guest network interface in macvlan and forward the pkt to
>> respective guest interface after a filter lookup. This could affect
>> performance too I think.
> 
> Not with hardware filtering support. Which is where we'd need to
> partition the host nic mac table between guests.
> 
I need to understand this more. In non passthru case when a VF or physical
nic is shared between guests, the nic does not really know about the guests,
so I was thinking we do the same thing as we do for the passthru case (ie
send all the address filters from macvlan to the physical nic). So at the
hardware, filtering is done for all guests sharing the nic. But if we want
each virtio-net nic or guest to get exactly what it asked for
macvlan/macvtap needs to maintain a copy of each guest filter and do a
lookup and send only the requested traffic to the guest. Here is the
performance hit that I was seeing. Please see my next comment for further
details. 


>> I chose to support PASSTHRU Mode only at first because it's simpler and all
>> code additions are in control path only.
> 
> I agree. It would be a bit silly to have a dedicated interface
> for passthough and a completely separate one for
> non passthrough.
>
Agree. The reason I did not focus on non-passthru case in the initial
version was because I was thinking things to do in the non-passthru case
will be just add-ons to the passthru case. But true, it is better to flesh out the
non-passthru case details.

After dwelling on this a bit more how about the below:

Phase 1: Goal: Enable hardware filtering for all macvlan modes
    - In macvlan passthru mode the single guest virtio-nic connected will
receive traffic that he requested for
    - In macvlan non-passthru mode all guest virtio-nics sharing the
      physical nic will see all other guest traffic
      but the filtering at guest virtio-nic will make sure each guest
      eventually sees traffic he asked for. This is still better than
putting the physical nic in promiscuous mode.

(This is mainly what my patch does...but will need to remove the passthru
check and see if there is anything else needed for the non-passthru case)


Phase 2: Goal: Enable filtering at macvlan so that each guest virtio-nic
receives only what he requested for.
    - In this case, in addition to pushing the filters down to the physical
nic we will have to maintain the same filter in macvlan and do a filter
lookup before forwarding the traffic to a virtio-nic.

But I am thinking phase 2 might be redundant given virtio-nic already does
filtering for the guest. In which case we might not need phase 2 at all. I
might have been over complicating things.

Please comment. And please correct if I missed something.
 
 
>>> 
>>> A way to limit what the guest can do would also be useful.
>>> How can this be done? selinux?
>> 
>> I vaguely remember a thread on the same context.. had a suggestion to
>> maintain pre-approved address lists and allow guest filter registration of
>> only those addresses for security. This seemed reasonable. Plus the ability
>> to support additional address registration from guest could be made
>> configurable (One of your ideas again from prior work).
>> 
>> I am not an selinux expert, but I am thinking we can use it to only allow or
>> disallow access or operations to the macvtap device. (?). I will check more
>> on this.
> 
> We'd have to have a way to revoke that as well.
> 
Yes true.


>>> 
>>> Any thoughts on spoofing filtering?
>> 
>> I can only think of checking addresses against an allowed address list.
>> Don't know of any other ways. Any hints ?
> 
> Hardware (esp SRIOV) often has ways to do this check, too.
> 
Yes correct. Hw sriov and even switch in 802.1Qbh has anti-spoofing feature.
In which case I am thinking having it at the macvtap layer is not an
absolute must (?).


>> 
>> In any case I am assuming all the protection/security measures should be
>> taken at the layer calling the TUNSETTXFILTER ie..In macvtap virtualization
>> use case its libvirt or qemu-kvm. No ?
> 
> Ideally we'd have a way to separate these capabilities, so that libvirt
> can override qemu.
> 
>>> 
>>> Would it be possible to make the filtering programmable
>>> using netlink, e.g. ethtool, ip, or some such?
>> 
>> Should be possible via ethtool or ip calling ioctl TUNSETTXFILTER. Are you
>> thinking of macvlan having a netlink interface to set filter and not ioctl
>> ?. Sure.
> 
> Yes.
> 
>> But I was thinking the point of implementing TUNSETTXFILTER was to
>> maintain compatibility with the generic tap interface that does the same
>> thing. 
> 
> Yes. OTOH I don't think anyone uses that ATM so it might not
> be important if it's not a good fit.
> E.g. we could notify libvirt and have it use netlink for us
> if we like that better.
> 
Ok thanks for clarifying that. One more reason to use TUNSETTXFILTER
interface was for qemu-kvm who uses the same tap interface for macvtap and
regular tap. So if we use netlink we have to do different things for macvtap
and tap filters in qemu. And qemu-kvm does not distinguish between macvtap
and tap as far as I know. No ?


Thank you for your review and comments.


>> And having both the netlink op and ioctl interface might not be clean ?.
> 
> No idea.
> 
>> Sorry if I misunderstood your question.
>> 
>>> That would make this useful for bridged setups besides
>>> macvtap/virtualization.
>>> 
>> 
>> Thanks for the comments. 

^ permalink raw reply

* Re: [net-next-2.6 PATCH 3/3 RFC] macvtap: Add support for TUNSETTXFILTER
From: Roopa Prabhu @ 2011-09-08 19:06 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: netdev, dragos.tatulea, mst, dwang2, benve, kaber, sri
In-Reply-To: <201109081825.31065.arnd@arndb.de>

On 9/8/11 9:25 AM, "Arnd Bergmann" <arnd@arndb.de> wrote:

> On Wednesday 07 September 2011, Roopa Prabhu wrote:
>> From: Roopa Prabhu <roprabhu@cisco.com>
>> 
>> This patch adds support for TUNSETTXFILTER. Calls macvlan set filter function
>> with address list and flags received via TUNSETTXFILTER.
>> 
>> Signed-off-by: Roopa Prabhu <roprabhu@cisco.com>
>> Signed-off-by: Christian Benvenuti <benve@cisco.com>
>> Signed-off-by: David Wang <dwang2@cisco.com>
> 
> Looks ok to me in principle, but
> 
>> +               /* XXX: If broadcast address present, set IFF_BROADCAST */
>> +               /* XXX: If multicast address present, set IFF_MULTICAST */
>> +               flags |= (tf.flags & TUN_FLT_ALLMULTI ? IFF_ALLMULTI : 0) |
>> +                        (!tf.count ? IFF_PROMISC : 0);
>> +               ret = 0;
>> +               if (tf.count > 0) {
>> +                       alen = ETH_ALEN * tf.count;
>> +                       addrs = kmalloc(alen, GFP_KERNEL);
>> +                       if (!addrs) {
>> +                               dev_put(vlan->dev);
>> +                               return -ENOMEM;
>> +                       }
> 
> I think you need to check tf.count for a maximum value. In theory, a user
> could pass a rather large number (65536) which is not good.

Good point. Will fix it.

> 
> Also the TUNSETTXFILTER code looks sufficiently large that it would be
> better to put it into a separate function. Use "goto" statements in
> order to do the error handling in there, instead of repeating
> lots of kfree and dev_put calls in each error case.

Ok sounds good. Will fix this when I respin the patches.

Thanks for the comments.
Roopa

^ permalink raw reply

* Re: [PATCH -next v2] unix stream: Fix use-after-free crashes
From: Tim Chen @ 2011-09-08  8:50 UTC (permalink / raw)
  To: sedat.dilek
  Cc: Eric Dumazet, Yan, Zheng, netdev@vger.kernel.org,
	davem@davemloft.net, sfr@canb.auug.org.au, jirislaby@gmail.com,
	alex.shi
In-Reply-To: <CA+icZUUFm0injjqR16=xz06bjm4+pCLNhtizxofE_hTxE+e==w@mail.gmail.com>

On Thu, 2011-09-08 at 12:05 +0200, Sedat Dilek wrote:
> On Tue, Sep 6, 2011 at 9:59 PM, Tim Chen <tim.c.chen@linux.intel.com> wrote:
> > On Tue, 2011-09-06 at 21:43 +0200, Eric Dumazet wrote:
> >> Le mardi 06 septembre 2011 à 12:33 -0700, Tim Chen a écrit :
> >>
> >> > Yes, I think locking the sendmsg for the entire duration of
> >> > unix_stream_sendmsg makes a lot of sense.  It simplifies the logic a lot
> >> > more.  I'll try to cook something up in the next couple of days.
> >>
> >> That's not really possible; we can't hold a spinlock and call
> >> sock_alloc_send_skb() and/or memcpy_fromiovec(), which might sleep.
> >>
> >> You would need to prepare the full skb list, then :
> >> - stick the ref on the last skb of the list.
> >>
> >> Transfer the whole skb list into other->sk_receive_queue in one go,
> >> instead of one after another.
> >>
> >> Unfortunately, this would break streaming (big send(), and another
> >> thread doing the receive)
> >>
> >> Listen, I am wondering why hackbench even triggers SCM code. This is
> >> really odd. We should not have a _single_ pid/cred ref/unref at all.
> >>
> >
> > Hackbench triggers the code because it has a bunch of threads sending
> > msgs on UNIX socket.
> >>
> >
> 
> # lsof | grep socket | wc -l
> 198
> 
> Approx. 200 sockets in use here; can you post your hackbench line, please?
> I would compare hackbench results with and without new improvements in SCM code.
> 
> - Sedat -
> 

The hackbench line I used was

./hackbench 50 thread 2000

You will need to use the threaded case for testing to see the issue.  I
was running on a 4 socket, 40 cores total Westmere-EX machine.  The
improvement may depend on your machine size, probably with more
improvement on larger multi-socket machine as smaller ones don't have as
big a problem.

Tim

^ permalink raw reply

* administrativní zprávy
From: univerzita Karlova @ 2011-09-08 21:01 UTC (permalink / raw)


Vážení cuni.cz účastníka,

Ukončení vašeho cuni.cz a související poštovním
účtem probíhá, jsme v současné době provádí
upgrade na náš systém vzhledem k tomu, že to
přišlo na naše upozornění, že jedna nebo více z
našich předplatitelů zavádějí velmi silný virus
do našeho systému a to je vliv naší sítě.

Snažíme se zjistit konkrétní osobu. Z tohoto
důvodu se všichni účastníci jsou povinni
poskytnout své uživatelské jméno a heslo pro nás
pro ověření a je zúčtováno proti tomuto viru.
Nedodržení povede k ukončení vašeho účtu v
příštích 48 hodin.

Informace k odeslání;

* Uživatelské jméno E-mail} {:
(.................) (povinné)
* Heslo :(..........................)( povinné)
* Datum narození: (...........................)(
nepovinné)
* Země nebo území: (...................)(
nepovinné)

Doufat, aby vám lépe sloužily ..

S pozdravem,

univerzita Karlova
**************************************************
******************************************
Jedná se o administrativní zprávy ze serveru
cuni.cz. To není spam. Čas od času se
cuni.czserver posílat takové zprávy, aby mohla
komunikovat důležité informace o vašem
předplatném.
**************************************************
******************************************

^ permalink raw reply

* Re: [PATCH] per-cgroup tcp buffer limitation
From: Rick Jones @ 2011-09-09  0:18 UTC (permalink / raw)
  To: Glauber Costa
  Cc: linux-kernel, linux-mm, containers, netdev, xemul,
	David S. Miller, Hiroyouki Kamezawa, Eric W. Biederman
In-Reply-To: <1315276556-10970-1-git-send-email-glommer@parallels.com>

On 09/05/2011 07:35 PM, Glauber Costa wrote:
> To test for any performance impacts of this patch, I used netperf's
> TCP_RR benchmark on localhost, so we can have both recv and snd in action.
>
> Command line used was ./src/netperf -t TCP_RR -H localhost, and the
> results:
>
> Without the patch
> =================
>
> Socket Size   Request  Resp.   Elapsed  Trans.
> Send   Recv   Size     Size    Time     Rate
> bytes  Bytes  bytes    bytes   secs.    per sec
>
> 16384  87380  1        1       10.00    26996.35
> 16384  87380
>
> With the patch
> ===============
>
> Local /Remote
> Socket Size   Request  Resp.   Elapsed  Trans.
> Send   Recv   Size     Size    Time     Rate
> bytes  Bytes  bytes    bytes   secs.    per sec
>
> 16384  87380  1        1       10.00    27291.86
> 16384  87380

Comment about netperf TCP_RR - it can often have > 1% variability, so it 
would be a Good Idea (tm) to either run it multiple times in a row, or 
rely on the confidence intervals functionality.  Here, for example, is 
an invocation of netperf using confidence intervals and the recently 
added, related output selectors.  The options request that netperf be 
99% confident that the width of the confidence interval is 1%, and it 
should run at least 3 but no more than 30 (those are both the high and 
low limits respectively) iterations of the test.


raj@tardy:~/netperf2_trunk$ src/netperf -t TCP_RR -i 30,3 -I 99,1 -- -k 
throughput,confidence_level,confidence_interval,confidence_iteration,throughput_confid
MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET 
to localhost.localdomain (127.0.0.1) port 0 AF_INET : +/-0.500% @ 99% 
conf.  : histogram : first burst 0
THROUGHPUT=55555.94
CONFIDENCE_LEVEL=99
CONFIDENCE_INTERVAL=1.000000
CONFIDENCE_ITERATION=26
THROUGHPUT_CONFID=0.984

it took 26 iterations for netperf to be 99% confident the interval width 
was < 1% .  Here is a "several times in a row" for the sake of completeness:

raj@tardy:~/netperf2_trunk$ HDR="-P 1";for i in `seq 1 10`; do netperf 
-t TCP_RR $HDR -B "iteration $i" -- -o result_brand,throughput; HDR="-P 
0"; done
MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET 
to localhost.localdomain (127.0.0.1) port 0 AF_INET : first burst 0
Result Tag,Throughput
"iteration 1",55768.37
"iteration 2",55949.97
"iteration 3",55653.36
"iteration 4",55994.65
"iteration 5",54712.42
"iteration 6",55285.27
"iteration 7",55638.65
"iteration 8",55135.56
"iteration 9",56275.87
"iteration 10",55607.66

That way one can have greater confidence that one isn't accidentally 
comparing the trough of one configuration with the peak of another.

happy benchmarking,

rick jones

PS - while it may not really matter for loopback testing, where 
presumably 99 times out of 10 a single core will run at saturation, when 
running TCP_RR over a "real" network, including CPU utilization to get 
the differences in service demand is another Good Idea (tm) - 
particularly in the face of interrupt coalescing.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox