Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 2/2] GRETH: avoid overwrite IP-stack's IP-frags checksum
From: Daniel Hellstrom @ 2011-09-08 13:14 UTC (permalink / raw)
  To: davem; +Cc: netdev, kristoffer
In-Reply-To: <1315487676-16733-1-git-send-email-daniel@gaisler.com>

The GRETH GBIT core does not do checksum offloading for IP
segmentation. This patch adds a check in the xmit function to
determine if the stack has calculated the checksum for us.

Signed-off-by: Daniel Hellstrom <daniel@gaisler.com>
---
 drivers/net/greth.c |    7 +++++--
 1 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/greth.c b/drivers/net/greth.c
index 9d39fb9..27ba855 100644
--- a/drivers/net/greth.c
+++ b/drivers/net/greth.c
@@ -489,7 +489,8 @@ greth_start_xmit_gbit(struct sk_buff *skb, struct net_device *dev)
 	if (nr_frags != 0)
 		status = GRETH_TXBD_MORE;
 
-	status |= GRETH_TXBD_CSALL;
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
+		status |= GRETH_TXBD_CSALL;
 	status |= skb_headlen(skb) & GRETH_BD_LEN;
 	if (greth->tx_next == GRETH_TXBD_NUM_MASK)
 		status |= GRETH_BD_WR;
@@ -512,7 +513,9 @@ greth_start_xmit_gbit(struct sk_buff *skb, struct net_device *dev)
 		greth->tx_skbuff[curr_tx] = NULL;
 		bdp = greth->tx_bd_base + curr_tx;
 
-		status = GRETH_TXBD_CSALL | GRETH_BD_EN;
+		status = GRETH_BD_EN;
+		if (skb->ip_summed == CHECKSUM_PARTIAL)
+			status | GRETH_TXBD_CSALL;
 		status |= frag->size & GRETH_BD_LEN;
 
 		/* Wrap around descriptor ring */
-- 
1.5.4

^ permalink raw reply related

* [PATCH 1/2] GRETH: RX/TX bytes were never increased
From: Daniel Hellstrom @ 2011-09-08 13:14 UTC (permalink / raw)
  To: davem; +Cc: netdev, kristoffer

Signed-off-by: Daniel Hellstrom <daniel@gaisler.com>
---
 drivers/net/greth.c |    5 +++++
 drivers/net/greth.h |    1 +
 2 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/drivers/net/greth.c b/drivers/net/greth.c
index 672f096..9d39fb9 100644
--- a/drivers/net/greth.c
+++ b/drivers/net/greth.c
@@ -426,6 +426,7 @@ greth_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	dma_sync_single_for_device(greth->dev, dma_addr, skb->len, DMA_TO_DEVICE);
 
 	status = GRETH_BD_EN | GRETH_BD_IE | (skb->len & GRETH_BD_LEN);
+	greth->tx_bufs_length[greth->tx_next] = skb->len & GRETH_BD_LEN;
 
 	/* Wrap around descriptor ring */
 	if (greth->tx_next == GRETH_TXBD_NUM_MASK) {
@@ -639,6 +640,7 @@ static void greth_clean_tx(struct net_device *dev)
 				dev->stats.tx_fifo_errors++;
 		}
 		dev->stats.tx_packets++;
+		dev->stats.tx_bytes += greth->tx_bufs_length[greth->tx_last];
 		greth->tx_last = NEXT_TX(greth->tx_last);
 		greth->tx_free++;
 	}
@@ -693,6 +695,7 @@ static void greth_clean_tx_gbit(struct net_device *dev)
 		greth->tx_skbuff[greth->tx_last] = NULL;
 
 		greth_update_tx_stats(dev, stat);
+		dev->stats.tx_bytes += skb->len;
 
 		bdp = greth->tx_bd_base + greth->tx_last;
 
@@ -794,6 +797,7 @@ static int greth_rx(struct net_device *dev, int limit)
 				memcpy(skb_put(skb, pkt_len), phys_to_virt(dma_addr), pkt_len);
 
 				skb->protocol = eth_type_trans(skb, dev);
+				dev->stats.rx_bytes += pkt_len;
 				dev->stats.rx_packets++;
 				netif_receive_skb(skb);
 			}
@@ -908,6 +912,7 @@ static int greth_rx_gbit(struct net_device *dev, int limit)
 
 				skb->protocol = eth_type_trans(skb, dev);
 				dev->stats.rx_packets++;
+				dev->stats.rx_bytes += pkt_len;
 				netif_receive_skb(skb);
 
 				greth->rx_skbuff[greth->rx_cur] = newskb;
diff --git a/drivers/net/greth.h b/drivers/net/greth.h
index 9a0040d..232a622 100644
--- a/drivers/net/greth.h
+++ b/drivers/net/greth.h
@@ -103,6 +103,7 @@ struct greth_private {
 
 	unsigned char *tx_bufs[GRETH_TXBD_NUM];
 	unsigned char *rx_bufs[GRETH_RXBD_NUM];
+	u16 tx_bufs_length[GRETH_TXBD_NUM];
 
 	u16 tx_next;
 	u16 tx_last;
-- 
1.5.4

^ permalink raw reply related

* [PATCH] RDSRDMA: Fix cleanup of rds_iw_mr_pool
From: Jonathan Lallinger @ 2011-09-08 18:09 UTC (permalink / raw)
  To: venkat.x.venkatsubra; +Cc: netdev, rds-devel

In the rds_iw_mr_pool struct the free_pinned field keeps track of memory pinned
by free MRs. While this field is incremented properly upon allocation, it is never
decremented upon unmapping. This would cause the rds_rdma module to crash the
kernel upon unloading, by triggering the BUG_ON in the rds_iw_destroy_mr_pool
function.

This change keeps track of the MRs that become unpinned, so that free_pinned
can be decremented appropriately.

Signed-off-by: Jonathan Lallinger <jonathan@ogc.us>
Signed-off-by: Steve Wise <swise@ogc.us>
---

 net/rds/iw_rdma.c |   13 +++++++++----
 1 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
index 8b77edb..5f18928 100644
--- a/net/rds/iw_rdma.c
+++ b/net/rds/iw_rdma.c
@@ -84,7 +84,8 @@ static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
 static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
 static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
 			struct list_head *unmap_list,
-			struct list_head *kill_list);
+			struct list_head *kill_list,
+			int *unpinned);
 static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
 
 static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwdev, struct rdma_cm_id **cm_id)
@@ -499,7 +500,7 @@ static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
 	LIST_HEAD(unmap_list);
 	LIST_HEAD(kill_list);
 	unsigned long flags;
-	unsigned int nfreed = 0, ncleaned = 0, free_goal;
+	unsigned int nfreed = 0, ncleaned = 0, unpinned = 0, free_goal;
 	int ret = 0;
 
 	rds_iw_stats_inc(s_iw_rdma_mr_pool_flush);
@@ -524,7 +525,8 @@ static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
 	 * will be destroyed by the unmap function.
 	 */
 	if (!list_empty(&unmap_list)) {
-		ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list, &kill_list);
+		ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list,
+						&kill_list, &unpinned);
 		/* If we've been asked to destroy all MRs, move those
 		 * that were simply cleaned to the kill list */
 		if (free_all)
@@ -548,6 +550,7 @@ static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
 		spin_unlock_irqrestore(&pool->list_lock, flags);
 	}
 
+	atomic_sub(unpinned, &pool->free_pinned);
 	atomic_sub(ncleaned, &pool->dirty_count);
 	atomic_sub(nfreed, &pool->item_count);
 
@@ -828,7 +831,8 @@ static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool,
 
 static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
 				struct list_head *unmap_list,
-				struct list_head *kill_list)
+				struct list_head *kill_list
+				int *unpinned)
 {
 	struct rds_iw_mapping *mapping, *next;
 	unsigned int ncleaned = 0;
@@ -855,6 +859,7 @@ static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
 
 		spin_lock_irqsave(&pool->list_lock, flags);
 		list_for_each_entry_safe(mapping, next, unmap_list, m_list) {
+			*unpinned += mapping->m_sg.len;
 			list_move(&mapping->m_list, &laundered);
 			ncleaned++;
 		}

^ permalink raw reply related

* Re: [PATCH] net: phy: Add config option to specify external switch port to be used if switch is used as PHY
From: Francois Romieu @ 2011-09-08 21:24 UTC (permalink / raw)
  To: Lambrecht Jürgen
  Cc: netdev@vger.kernel.org, linux-embedded@vger.kernel.org
In-Reply-To: <4E68AE26.5020707@televic.com>

Lambrecht Jürgen <J.Lambrecht@TELEVIC.com> :
> On 09/08/2011 12:13 PM, Francois Romieu wrote:
[...]
> > Which driver(s) do you use that you can not set phy_mask directly ?
> >
> The HW driver is 'FEC' for iMX Ethernet. For the PHY, just MII and PHYLIB.
> I am rather new to linux, didn't knew phy_mask. Checked it now, and is 
> not set in fec.c.
> You mean then to patch drivers/net/fec.c in the same way (as my current 
> patch) to set the phy_mask instead (via menuconfig, or in the platform 
> init)?

It is not my area but I would have drivers/net/fec.c::fec_devtype.driver_data
point to a real struct where the relevant phy_mask and the current quirks
are stored, then add a new entry in fec_devtype and a reference to it
(Kconfig + platform init) somewhere below arch/arm/plat-mxc.

Freescale's application note suggests that the MX25 fec allows some freedom
for the implementation of the media interface. So it may not be overkill.

-- 
Ueimor

^ permalink raw reply

* Re: [PATCH -next v2] unix stream: Fix use-after-free crashes
From: Tim Chen @ 2011-09-08  9:24 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Yan, Zheng, Yan, Zheng, netdev@vger.kernel.org,
	davem@davemloft.net, sfr@canb.auug.org.au, jirislaby@gmail.com,
	sedat.dilek@gmail.com, Shi, Alex, Valdis Kletnieks
In-Reply-To: <1315488497.2456.21.camel@edumazet-HP-Compaq-6005-Pro-SFF-PC>

On Thu, 2011-09-08 at 15:28 +0200, Eric Dumazet wrote:
> Le mercredi 07 septembre 2011 à 23:26 +0200, Eric Dumazet a écrit :
> > Le mercredi 07 septembre 2011 à 05:01 -0700, Tim Chen a écrit :
> 
> > > Eric, are you planning to do a fast path patch that doesn't do pid ref
> > > for the case where CONFIG_PID_NS is not set?
> > > 
> > 
> > Yes, I'll try to cook a patch.
> 
> Thinking a bit more on this issue, I really believe we should not stick
> pid/cred in skbs sent from a write() system call.

I prefer this approach too.

> 
> That would break following use case :
> 
> An application uses a write(fd) and expects a receiver using recvmsg()
> to get process credentials (SCM_CREDENTIALS)
> 
> This is currently working, but not documented (man unix says ancillary
> data are sent with sendmsg())
> 
> If everybody agrees, I can send a patch for this : This would speedup
> write()/read() af_unix by an order of magnitude.
> 

Looking forward to the patch.  This should improve the scalability of
af_unix.

Tim

^ permalink raw reply

* Re: [net-next-2.6 PATCH 0/3 RFC] macvlan: MAC Address filtering support for passthru mode
From: Michael S. Tsirkin @ 2011-09-08 19:11 UTC (permalink / raw)
  To: Roopa Prabhu
  Cc: netdev, dragos.tatulea, arnd, dwang2, benve, kaber, sri, davem,
	eric.dumazet, mchan, kvm
In-Reply-To: <CA8E3924.33B60%roprabhu@cisco.com>

On Thu, Sep 08, 2011 at 09:19:32AM -0700, Roopa Prabhu wrote:
> >>> There are more features we'll want down the road though,
> >>> so let's see whether the interface will be able to
> >>> satisfy them in a backwards compatible way before we
> >>> set it in stone. Here's what I came up with:
> >>> 
> >>> How will the filtering table be partitioned within guests?
> >> 
> >> Since this patch supports macvlan PASSTHRU mode only, in which the lower
> >> device has 1-1 mapping to the guest nic, it does not require any
> >> partitioning of filtering table within guests. Unless I missed understanding
> >> something. 
> >> If the lower device were being shared by multiple guest network interfaces
> >> (non PASSTHRU mode), only then we will need to maintain separate filter
> >> tables for each guest network interface in macvlan and forward the pkt to
> >> respective guest interface after a filter lookup. This could affect
> >> performance too I think.
> > 
> > Not with hardware filtering support. Which is where we'd need to
> > partition the host nic mac table between guests.
> > 
> I need to understand this more. In non passthru case when a VF or physical
> nic is shared between guests,

For example, consider a VF given to each guest. Hardware supports a fixed
total number of filters, which can be partitioned between VFs.

> the nic does not really know about the guests,
> so I was thinking we do the same thing as we do for the passthru case (ie
> send all the address filters from macvlan to the physical nic). So at the
> hardware, filtering is done for all guests sharing the nic. But if we want
> each virtio-net nic or guest to get exactly what it asked for
> macvlan/macvtap needs to maintain a copy of each guest filter and do a
> lookup and send only the requested traffic to the guest. Here is the
> performance hit that I was seeing. Please see my next comment for further
> details. 

It won't be any slower than attaching a non-passthrough macvlan
to a device, will it?

> 
> >> I chose to support PASSTHRU Mode only at first because its simpler and all
> >> code additions are in control path only.
> > 
> > I agree. It would be a bit silly to have a dedicated interface
> > for passthough and a completely separate one for
> > non passthrough.
> >
> Agree. The reason I did not focus on non-passthru case in the initial
> version was because I was thinking things to do in the non-passthru case
> will be just add-ons to the passthru case. But true Better to flush out the
> non-pasthru case details.
> 
> After dwelling on this a bit more how about the below:
> 
> Phase 1: Goal: Enable hardware filtering for all macvlan modes
>     - In macvlan passthru mode the single guest virtio-nic connected will
>       receive traffic that he requested for
>     - In macvlan non-passthru mode all guest virtio-nics sharing the
>       physical nic will see all other guest traffic
>       but the filtering at guest virtio-nic

I don't think guests currently filter anything.

>       will make sure each guest
>       eventually sees traffic he asked for. This is still better than
>       putting the physical nic in promiscuous mode.
> 
> (This is mainly what my patch does...but will need to remove the passthru
> check and see if there are any thing else needed for non-passthru case)

I'm fine with sticking with passthrough, make non passthrough
a separate phase.

> 
> Phase 2: Goal: Enable filtering at macvlan so that each guest virtio-nic
> receives only what he requested for.
>     - In this case, in addition to pushing the filters down to the physical
>       nic we will have to maintain the same filter in macvlan and do a filter
>       lookup before forwarding the traffic to a virtio-nic.
> 
> But I am thinking phase 2 might be redundant given virtio-nic already does
> filtering for the guest.

It does? Do you mean the filter that qemu does in userspace?

> In which case we might not need phase 2 at all. I
> might have been over complicating things.
> 
> Please comment. And please correct if I missed something.
>  
>  
> >>> 
> >>> A way to limit what the guest can do would also be useful.
> >>> How can this be done? selinux?
> >> 
> >> I vaguely remember a thread on the same context.. had a suggestion to
> >> maintain pre-approved address lists and allow guest filter registration of
> >> only those addresses for security. This seemed reasonable. Plus the ability
> >> to support additional address registration from guest could be made
> >> configurable (One of your ideas again from prior work).
> >> 
> >> I am not an selinux expert, but I am thinking we can use it to only allow or
> >> disallow access or operations to the macvtap device. (?). I will check more
> >> on this.
> > 
> > We'd have to have a way to revoke that as well.
> > 
> Yes true.
> 
> 
> >>> 
> >>> Any thoughts on spoofing filtering?
> >> 
> >> I can only think of checking addresses against an allowed address list.
> >> Don't know of any other ways. Any hints ?
> > 
> > Hardware (esp SRIOV) often has ways to do this check, too.
> > 
> Yes correct. Hw sriov and even switch in 802.1Qbh has anti-spoofing feature.
> In which case I am thinking having It at the macvtap layer is not an
> absolute must (?).

Exactly. But let's figure out *how* it will be programmed.
If anti-spoofing is programmed with netlink, maybe that's
a better interface for rx filter too, for consistency.

> >> 
> >> In any case I am assuming all the protection/security measures should be
> >> taken at the layer calling the TUNSETTXFILTER ie..In macvtap virtualization
> >> use case its libvirt or qemu-kvm. No ?
> > 
> > Ideally we'd have a way to separate these capabilities, so that libvirt
> > can override qemu.
> > 
> >>> 
> >>> Would it be possible to make the filtering programmable
> >>> using netlink, e.g. ethtool, ip, or some such?
> >> 
> >> Should be possible via ethtool or ip calling ioctl TUNSETTXFILTER. Are you
> >> thinking of macvlan having a netlink interface to set filter and not ioctl
> >> ?. Sure.
> > 
> > Yes.
> > 
> >> But I was thinking the point of implementing TUNSETTXFILTER was to
> >> maintain compatibility with the generic tap interface that does the same
> >> thing. 
> > 
> > Yes. OTOH I don't think anyone uses that ATM so it might not
> > be important if it's not a good fit.
> > E.g. we could notify libvirt and have it use netlink for us
> > if we like that better.
> > 
> Ok thanks for clarifying that. One more reason to use TUNSETTXFILTER
> interface was for qemu-kvm who uses the same tap interface for macvtap and
> regular tap. So if we use netlink we have to do different things for macvtap
> and tap filters in qemu. And qemu-kvm does not distinguish between macvtap
> and tap as far as I know. No ?

It's not a question of simplifying qemu as much as trying to
make the kernel interface abstract device differences
away from users. Using same interface for tun and macvtap
gave us some confidence that the interface is a good one.

But this does not seem to have worked with TUNSETTXFILTER -
at least qemu doesn't use it yet, and it's been upstream
a while. So there's no proof it's a good interface.

So if we decide netlink is a better interface we can add it for tun too.
We need to be backwards compatible and figure out what happens if someone
tries to use both methods: probably apply both or ignore TUNSETTXFILTER ...

> 
> Thanks you for your review and comments.
> 
> 
> >> And having both the netlink op and ioctl interface might not be clean ?.
> > 
> > No idea.
> > 
> >> Sorry if I misunderstood your question.
> >> 
> >>> That would make this useful for bridged setups besides
> >>> macvtap/virtualization.
> >>> 
> >> 
> >> Thanks for the comments. 

Overall good progress, and don't let the interface discussions
block you. You want to push in two directions - stabilize code in one
branch, and play with interfaces in another one. By the time there's a
concensus on the interfaces you have the main logic all ready,
then you merge.

-- 
MST

^ permalink raw reply

* Re: [PATCH 1/2] iwlegacy: change IWL_WARN to IWL_DEBUG_HT in iwl4965_tx_agg_start
From: Stanislaw Gruszka @ 2011-09-08 16:11 UTC (permalink / raw)
  To: Greg Dietsche; +Cc: linville, linux-wireless, netdev, linux-kernel
In-Reply-To: <4E683BBE.3060105@cuw.edu>

Hi Greg

On Wed, Sep 07, 2011 at 10:51:26PM -0500, Greg Dietsche wrote:
> On 09/06/2011 10:01 AM, Stanislaw Gruszka wrote:
> >I put patches here:
> >http://people.redhat.com/sgruszka/iwlegacy_cleanup.tar.bz2
> >
> >They are on top of wireless-testing tree.
> <snip>
> >Series include your 2 patches. You can test this cleanup and
> >apply your new changes on top. I'll not do any further cleanup
> >for some time now, perhaps continue when I got public git tree.
> >
> Thanks! I've re-worked my patches and you can find them here:
> http://www.gregd.org/stuff/linux/iwlegacy_cleanup_greg.tar.bz2
> 
> I also decided to play with github a little bit:
> git://github.com/dietsche/linux.git and pushed two branches:
>   1) wireless-next-iwlegacy-stanislaw - your patch set
>   2) wireless-next-iwlegacy-stanislaw-greg - a branch that has my
> additional patches.
> `git format-patch wireless-next-iwlegacy-stanislaw..wireless-next-iwlegacy-stanislaw-greg`
> will generate the patches that are in the link i posted above.

Cool!

> The first two patches in my series are the ones that I think folks
> should take a closer look at. The rest are pretty safe.

The second patch is ok. I'm not sure about first one, but we can get
rid of "ctx = il_rxon_ctx_from_vif(vif)" at all, because we have
only one context. Removing il_rxon_context structure from
iwlegacy driver is my long term plan, you can look at that
if you wish.

Thanks
Stanislaw

^ permalink raw reply

* Re: [net-next-2.6 PATCH 0/3 RFC] macvlan: MAC Address filtering support for passthru mode
From: Roopa Prabhu @ 2011-09-08 16:19 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: netdev, dragos.tatulea, arnd, dwang2, benve, kaber, sri, davem,
	eric.dumazet, mchan, kvm
In-Reply-To: <20110908110835.GA25984@redhat.com>




On 9/8/11 4:08 AM, "Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Wed, Sep 07, 2011 at 10:20:28PM -0700, Roopa Prabhu wrote:
>> On 9/7/11 5:34 AM, "Michael S. Tsirkin" <mst@redhat.com> wrote:
>> 
>>> On Tue, Sep 06, 2011 at 03:35:40PM -0700, Roopa Prabhu wrote:
>>>> This patch is an attempt at providing address filtering support for macvtap
>>>> devices in PASSTHRU mode. Its still a work in progress.
>>>> Briefly tested for basic functionality. Wanted to get some feedback on the
>>>> direction before proceeding.
>>>> 
>>> 
>>> Good work, thanks.
>>> 
>> 
>> Thanks.
>> 
>>>> I have hopefully CC'ed all concerned people.
>>> 
>>> kvm crowd might also be interested.
>>> Try using ./scripts/get_maintainer.pl as well.
>>> 
>> Thanks for the tip. Expanded CC list a bit more.
>> 
>>>> PASSTHRU mode today sets the lowerdev in promiscous mode. In PASSTHRU mode
>>>> there is a 1-1 mapping between macvtap device and physical nic or VF. And
>>>> all
>>>> filtering is done in lowerdev hw. The lowerdev does not need to be in
>>>> promiscous mode as long as the guest filters are passed down to the
>>>> lowerdev.
>>>> This patch tries to remove the need for putting the lowerdev in promiscous
>>>> mode. 
>>>> I have also referred to the thread below where TUNSETTXFILTER was mentioned
>>>> in 
>>>> this context: 
>>>>  http://patchwork.ozlabs.org/patch/69297/
>>>> 
>>>> This patch basically passes the addresses got by TUNSETTXFILTER to macvlan
>>>> lowerdev.
>>>> 
>>>> I have looked at previous work and discussions on this for qemu-kvm
>>>> by Michael Tsirkin, Alex Williamson and Dragos Tatulea
>>>> http://patchwork.ozlabs.org/patch/78595/
>>>> http://patchwork.ozlabs.org/patch/47160/
>>>> https://patchwork.kernel.org/patch/474481/
>>>> 
>>>> Redhat bugzilla by Michael Tsirkin:
>>>> https://bugzilla.redhat.com/show_bug.cgi?id=655013
>>>> 
>>>> I used Michael's qemu-kvm patch for testing the changes with KVM
>>>> 
>>>> I would like to cover both MAC and vlan filtering in this work.
>>>> 
>>>> Open Questions/Issues:
>>>> - There is a need for vlan filtering to complete the patch. It will require
>>>>   a new tap ioctl cmd for vlans.
>>>>   Some ideas on this are:
>>>> 
>>>>   a) TUNSETVLANFILTER: This will entail we send the whole vlan bitmap
>>>> filter
>>>> (similar to tun_filter for addresses). Passing the vlan id's to lower
>>>> device will mean going thru the whole list of vlans every time.
>>>> 
>>>>   OR
>>>> 
>>>>   b) TUNSETVLAN with vlan id and flag to set/unset
>>>> 
>>>>   Does option 'b' sound ok ?
>>>> 
>>>> - In this implementation we make the macvlan address list same as the
>>>> address
>>>>   list that came in the filter with TUNSETTXFILTER. This will not cover
>>>> cases
>>>>   where the macvlan device needs to have other addresses that are not
>>>>   necessarily in the filter. Is this a problem ?
>>> 
>>> What cases do you have in mind?
>>> 
>> This patch targets only macvlan PASSTHRU mode and for PASSTHRU mode I don't
>> see a problem with uc/mc address list being the same in all the stacked
>> netdevs in the path. I called that out above to make sure I was not missing
>> any case in PASSTHRU mode where this might be invalid. Otherwise I don't see
>> a problem in the simple PASSTHRU use case this patch supports.
>> 
>>>> - The patch currently only supports passing of IFF_PROMISC and
>>>> IFF_MULTICAST
>>>> filter flags to lowerdev
>>>> 
>>>> This patch series implements the following
>>>> 01/3 - macvlan: Add support for unicast filtering in macvlan
>>>> 02/3 - macvlan: Add function to set addr filter on lower device in passthru
>>>> mode
>>>> 03/3 - macvtap: Add support for TUNSETTXFILTER
>>>> 
>>>> Please comment. Thanks.
>>>> 
>>>> Signed-off-by: Roopa Prabhu <roprabhu@cisco.com>
>>>> Signed-off-by: Christian Benvenuti <benve@cisco.com>
>>>> Signed-off-by: David Wang <dwang2@cisco.com>
>>> 
>>> The security isn't lower than with promisc, so I don't see
>>> a problem with this as such.
>>> 
>>> There are more features we'll want down the road though,
>>> so let's see whether the interface will be able to
>>> satisfy them in a backwards compatible way before we
>>> set it in stone. Here's what I came up with:
>>> 
>>> How will the filtering table be partitioned within guests?
>> 
>> Since this patch supports macvlan PASSTHRU mode only, in which the lower
>> device has 1-1 mapping to the guest nic, it does not require any
>> partitioning of filtering table within guests. Unless I missed understanding
>> something. 
>> If the lower device were being shared by multiple guest network interfaces
>> (non PASSTHRU mode), only then we will need to maintain separate filter
>> tables for each guest network interface in macvlan and forward the pkt to
>> respective guest interface after a filter lookup. This could affect
>> performance too I think.
> 
> Not with hardware filtering support. Which is where we'd need to
> partition the host nic mac table between guests.
> 
I need to understand this more. In non passthru case when a VF or physical
nic is shared between guests, the nic does not really know about the guests,
so I was thinking we do the same thing as we do for the passthru case (ie
send all the address filters from macvlan to the physical nic). So at the
hardware, filtering is done for all guests sharing the nic. But if we want
each virtio-net nic or guest to get exactly what it asked for
macvlan/macvtap needs to maintain a copy of each guest filter and do a
lookup and send only the requested traffic to the guest. Here is the
performance hit that I was seeing. Please see my next comment for further
details. 


>> I chose to support PASSTHRU Mode only at first because its simpler and all
>> code additions are in control path only.
> 
> I agree. It would be a bit silly to have a dedicated interface
> for passthough and a completely separate one for
> non passthrough.
>
Agree. The reason I did not focus on non-passthru case in the initial
version was because I was thinking things to do in the non-passthru case
will be just add-ons to the passthru case. But true Better to flush out the
non-pasthru case details.

After dwelling on this a bit more how about the below:

Phase 1: Goal: Enable hardware filtering for all macvlan modes
    - In macvlan passthru mode the single guest virtio-nic connected will
receive traffic that he requested for
    - In macvlan non-passthru mode all guest virtio-nics sharing the
      physical nic will see all other guest traffic
      but the filtering at guest virtio-nic will make sure each guest
      eventually sees traffic he asked for. This is still better than
putting the physical nic in promiscuous mode.

(This is mainly what my patch does...but will need to remove the passthru
check and see if there are any thing else needed for non-passthru case)


Phase 2: Goal: Enable filtering at macvlan so that each guest virtio-nic
receives only what he requested for.
    - In this case, in addition to pushing the filters down to the physical
nic we will have to maintain the same filter in macvlan and do a filter
lookup before forwarding the traffic to a virtio-nic.

But I am thinking phase 2 might be redundant given virtio-nic already does
filtering for the guest. In which case we might not need phase 2 at all. I
might have been over complicating things.

Please comment. And please correct if I missed something.
 
 
>>> 
>>> A way to limit what the guest can do would also be useful.
>>> How can this be done? selinux?
>> 
>> I vaguely remember a thread on the same context.. had a suggestion to
>> maintain pre-approved address lists and allow guest filter registration of
>> only those addresses for security. This seemed reasonable. Plus the ability
>> to support additional address registration from guest could be made
>> configurable (One of your ideas again from prior work).
>> 
>> I am not an selinux expert, but I am thinking we can use it to only allow or
>> disallow access or operations to the macvtap device. (?). I will check more
>> on this.
> 
> We'd have to have a way to revoke that as well.
> 
Yes true.


>>> 
>>> Any thoughts on spoofing filtering?
>> 
>> I can only think of checking addresses against an allowed address list.
>> Don't know of any other ways. Any hints ?
> 
> Hardware (esp SRIOV) often has ways to do this check, too.
> 
Yes correct. Hw sriov and even switch in 802.1Qbh has anti-spoofing feature.
In which case I am thinking having It at the macvtap layer is not an
absolute must (?).


>> 
>> In any case I am assuming all the protection/security measures should be
>> taken at the layer calling the TUNSETTXFILTER ie..In macvtap virtualization
>> use case its libvirt or qemu-kvm. No ?
> 
> Ideally we'd have a way to separate these capabilities, so that libvirt
> can override qemu.
> 
>>> 
>>> Would it be possible to make the filtering programmable
>>> using netlink, e.g. ethtool, ip, or some such?
>> 
>> Should be possible via ethtool or ip calling ioctl TUNSETTXFILTER. Are you
>> thinking of macvlan having a netlink interface to set filter and not ioctl
>> ?. Sure.
> 
> Yes.
> 
>> But I was thinking the point of implementing TUNSETTXFILTER was to
>> maintain compatibility with the generic tap interface that does the same
>> thing. 
> 
> Yes. OTOH I don't think anyone uses that ATM so it might not
> be important if it's not a good fit.
> E.g. we could notify libvirt and have it use netlink for us
> if we like that better.
> 
Ok thanks for clarifying that. One more reason to use TUNSETTXFILTER
interface was for qemu-kvm who uses the same tap interface for macvtap and
regular tap. So if we use netlink we have to do different things for macvtap
and tap filters in qemu. And qemu-kvm does not distinguish between macvtap
and tap as far as I know. No ?


Thanks you for your review and comments.


>> And having both the netlink op and ioctl interface might not be clean ?.
> 
> No idea.
> 
>> Sorry if I misunderstood your question.
>> 
>>> That would make this useful for bridged setups besides
>>> macvtap/virtualization.
>>> 
>> 
>> Thanks for the comments. 

^ permalink raw reply

* Re: [net-next-2.6 PATCH 3/3 RFC] macvtap: Add support for TUNSETTXFILTER
From: Roopa Prabhu @ 2011-09-08 19:06 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: netdev, dragos.tatulea, mst, dwang2, benve, kaber, sri
In-Reply-To: <201109081825.31065.arnd@arndb.de>

On 9/8/11 9:25 AM, "Arnd Bergmann" <arnd@arndb.de> wrote:

> On Wednesday 07 September 2011, Roopa Prabhu wrote:
>> From: Roopa Prabhu <roprabhu@cisco.com>
>> 
>> This patch adds support for TUNSETTXFILTER. Calls macvlan set filter function
>> with address list and flags received via TUNSETTXFILTER.
>> 
>> Signed-off-by: Roopa Prabhu <roprabhu@cisco.com>
>> Signed-off-by: Christian Benvenuti <benve@cisco.com>
>> Signed-off-by: David Wang <dwang2@cisco.com>
> 
> Looks ok to me in principle, but
> 
>> +               /* XXX: If broadcast address present, set IFF_BROADCAST */
>> +               /* XXX: If multicast address present, set IFF_MULTICAST */
>> +               flags |= (tf.flags & TUN_FLT_ALLMULTI ? IFF_ALLMULTI : 0) |
>> +                        (!tf.count ? IFF_PROMISC : 0);
>> +               ret = 0;
>> +               if (tf.count > 0) {
>> +                       alen = ETH_ALEN * tf.count;
>> +                       addrs = kmalloc(alen, GFP_KERNEL);
>> +                       if (!addrs) {
>> +                               dev_put(vlan->dev);
>> +                               return -ENOMEM;
>> +                       }
> 
> I think you need to check tf.count for a maximum value. In theory, a user
> could pass a rather large number (65536) which is not good.

Good point. Will fix it.

> 
> Also the TUNSETTXFILTER code looks sufficiently large that it would be
> better to put it into a separate function. Use "goto" statements in
> order to do the error handling in there, instead of repeating
> lots of kfree and dev_put calls in each error case.

Ok sounds good. Will fix this when I respin the patches.

Thanks for the comments.
Roopa

^ permalink raw reply

* Re: [PATCH -next v2] unix stream: Fix use-after-free crashes
From: Tim Chen @ 2011-09-08  8:50 UTC (permalink / raw)
  To: sedat.dilek
  Cc: Eric Dumazet, Yan, Zheng, netdev@vger.kernel.org,
	davem@davemloft.net, sfr@canb.auug.org.au, jirislaby@gmail.com,
	alex.shi
In-Reply-To: <CA+icZUUFm0injjqR16=xz06bjm4+pCLNhtizxofE_hTxE+e==w@mail.gmail.com>

On Thu, 2011-09-08 at 12:05 +0200, Sedat Dilek wrote:
> On Tue, Sep 6, 2011 at 9:59 PM, Tim Chen <tim.c.chen@linux.intel.com> wrote:
> > On Tue, 2011-09-06 at 21:43 +0200, Eric Dumazet wrote:
> >> Le mardi 06 septembre 2011 à 12:33 -0700, Tim Chen a écrit :
> >>
> >> > Yes, I think locking the sendmsg for the entire duration of
> >> > unix_stream_sendmsg makes a lot of sense.  It simplifies the logic a lot
> >> > more.  I'll try to cook something up in the next couple of days.
> >>
> >> Thats not really possible, we cant hold a spinlock and call
> >> sock_alloc_send_skb() and/or memcpy_fromiovec(), wich might sleep.
> >>
> >> You would need to prepare the full skb list, then :
> >> - stick the ref on the last skb of the list.
> >>
> >> Transfert the whole skb list in other->sk_receive_queue in one go,
> >> instead of one after another.
> >>
> >> Unfortunately, this would break streaming (big send(), and another
> >> thread doing the receive)
> >>
> >> Listen, I am wondering why hackbench even triggers SCM code. This is
> >> really odd. We should not have a _single_ pid/cred ref/unref at all.
> >>
> >
> > Hackbench triggers the code because it has a bunch of threads sending
> > msgs on UNIX socket.
> >>
> >
> 
> # lsof | grep socket | wc -l
> 198
> 
> Aprrox 200 sockets in usage here, can you post your hackbench line, please?
> I would compare hackbench results with and without new improvements in SCM code.
> 
> - Sedat -
> 

The hackbench line I used was

./hackbench 50 thread 2000

You will need to use the threaded case for testing to see the issue.  I
was running on a 4 socket, 40 cores total Westmere-EX machine.  The
improvement may depend on your machine size, probably with more
improvement on larger multi-socket machine as smaller ones don't have as
big a problem.

Tim

^ permalink raw reply

* administrativní zprávy
From: univerzita Karlova @ 2011-09-08 21:01 UTC (permalink / raw)


Vážení cuni.cz účastníka,

Ukončení vašeho cuni.cz a související poštovním
účtem probíhá, jsme v současné době provádí
upgrade na náš systém vzhledem k tomu, že to
přišlo na naše upozornění, že jedna nebo více z
našich předplatitelů zavádějí velmi silný virus
do našeho systému a to je vliv naší sítě.

Snažíme se zjistit konkrétní osobu. Z tohoto
důvodu se všichni účastníci jsou povinni
poskytnout své uživatelské jméno a heslo pro nás
pro ověření a je zúčtováno proti tomuto viru.
Nedodržení povede k ukončení vašeho účtu v
příštích 48 hodin.

Informace k odeslání;

* Uživatelské jméno E-mail} {:
(.................) (povinné)
* Heslo :(..........................)( povinné)
* Datum narození: (...........................)(
nepovinné)
* Země nebo území: (...................)(
nepovinné)

Doufat, aby vám lépe sloužily ..

S pozdravem,

univerzita Karlova
**************************************************
******************************************
Jedná se o administrativní zprávy ze serveru
cuni.cz. To není spam. Čas od času se
cuni.czserver posílat takové zprávy, aby mohla
komunikovat důležité informace o vašem
předplatném.
**************************************************
******************************************

^ permalink raw reply

* Re: [PATCH] per-cgroup tcp buffer limitation
From: Rick Jones @ 2011-09-09  0:18 UTC (permalink / raw)
  To: Glauber Costa
  Cc: linux-kernel, linux-mm, containers, netdev, xemul,
	David S. Miller, Hiroyouki Kamezawa, Eric W. Biederman
In-Reply-To: <1315276556-10970-1-git-send-email-glommer@parallels.com>

On 09/05/2011 07:35 PM, Glauber Costa wrote:
> To test for any performance impacts of this patch, I used netperf's
> TCP_RR benchmark on localhost, so we can have both recv and snd in action.
>
> Command line used was ./src/netperf -t TCP_RR -H localhost, and the
> results:
>
> Without the patch
> =================
>
> Socket Size   Request  Resp.   Elapsed  Trans.
> Send   Recv   Size     Size    Time     Rate
> bytes  Bytes  bytes    bytes   secs.    per sec
>
> 16384  87380  1        1       10.00    26996.35
> 16384  87380
>
> With the patch
> ===============
>
> Local /Remote
> Socket Size   Request  Resp.   Elapsed  Trans.
> Send   Recv   Size     Size    Time     Rate
> bytes  Bytes  bytes    bytes   secs.    per sec
>
> 16384  87380  1        1       10.00    27291.86
> 16384  87380

Comment about netperf TCP_RR - it can often have > 1% variability, so it 
would be a Good Idea (tm) to either run it multiple times in a row, or 
rely on the confidence intervals functionality.  Here, for example, is 
an invoking of netperf using confidence intervals and the recently 
added, related output selectors.  The options request that netperf be 
99% confident that the width of the confidence interval is 1%, and it 
should run at least 3 but no more than 30 (those are both the high and 
low limits respectively) iterations of the test.

raj@tardy:~/netperf2_trunk$ src/netperf -t TCP_RR -i 30,3 -I 99,1 -- -k 
throughput,confidence_level,confidence_interval,confidence_iteration,throughput_confid
MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET 
to localhost.localdomain (127.0.0.1) port 0 AF_INET : +/-0.500% @ 99% 
conf.  : histogram : first burst 0
THROUGHPUT=55555.94
CONFIDENCE_LEVEL=99
CONFIDENCE_INTERVAL=1.000000
CONFIDENCE_ITERATION=26
THROUGHPUT_CONFID=0.984

it took 26 iterations for netperf to be 99% confident the interval width 
was < 1% .  Here is a "several times in a row" for the sake of completeness:

raj@tardy:~/netperf2_trunk$ HDR="-P 1";for i in `seq 1 10`; do netperf 
-t TCP_RR $HDR -B "iteration $i" -- -o result_brand,throughput; HDR="-P 
0"; done
MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET 
to localhost.localdomain (127.0.0.1) port 0 AF_INET : first burst 0
Result Tag,Throughput
"iteration 1",55768.37
"iteration 2",55949.97
"iteration 3",55653.36
"iteration 4",55994.65
"iteration 5",54712.42
"iteration 6",55285.27
"iteration 7",55638.65
"iteration 8",55135.56
"iteration 9",56275.87
"iteration 10",55607.66

That way one can have greater confidence that one isn't accidentally 
comparing the trough of one configuration with the peak of another.

happy benchmarking,

rick jones

PS - while it may not really matter for loopback testing, where 
presumably 99 times out of 10 a single core will run at saturation, when 
running TCP_RR over a "real" network, including CPU utilization to get 
the differences in service demand is another Good Idea (tm) - 
particularly in the face of interrupt coalescing.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH v3] libertas: prioritize usb8388_olpc.bin firmware on OLPC machines
From: Andres Salomon @ 2011-09-09  0:35 UTC (permalink / raw)
  To: Dan Williams
  Cc: linville, libertas-dev, linux-wireless, netdev, linux-kernel, dsd,
	cjb


Normally, the v9 firmware will be loaded if it's available.  However, on
OLPC XO-1 machines, the olpc-specific firmware supports extra functionality.
This makes the libertas driver attempt to load the custom firmware first
if the machine is an OLPC machine; if that fails (or it's not an OLPC
machine), fall back to attempting to load the other firmwares.

usb8388_olpc.bin is currently found in the linux-firmware repository.

Signed-off-by: Andres Salomon <dilinger@queued.net>
---
v3: changed firmware path to include libertas/ subdirectory.

Since the last submission (v2), usb8388_olpc.bin has found its way into
wireless-firmware.  It'd be nice to get this into the next merge window.

 drivers/net/wireless/libertas/if_usb.c |   21 +++++++++++++++++++++
 1 files changed, 21 insertions(+), 0 deletions(-)

diff --git a/drivers/net/wireless/libertas/if_usb.c b/drivers/net/wireless/libertas/if_usb.c
index b5acc39..94f8f3e 100644
--- a/drivers/net/wireless/libertas/if_usb.c
+++ b/drivers/net/wireless/libertas/if_usb.c
@@ -973,6 +973,23 @@ static const struct {
 	{ MODEL_8682, "libertas/usb8682.bin" }
 };
 
+#ifdef CONFIG_OLPC
+
+static int try_olpc_fw(struct if_usb_card *cardp)
+{
+	int retval = -ENOENT;
+
+	/* try the OLPC firmware first; fall back to fw_table list */
+	if (machine_is_olpc() && cardp->model == MODEL_8388)
+		retval = request_firmware(&cardp->fw,
+				"libertas/usb8388_olpc.bin", &cardp->udev->dev);
+	return retval;
+}
+
+#else
+static int try_olpc_fw(struct if_usb_card *cardp) { return -ENOENT; }
+#endif /* !CONFIG_OLPC */
+
 static int get_fw(struct if_usb_card *cardp, const char *fwname)
 {
 	int i;
@@ -981,6 +998,10 @@ static int get_fw(struct if_usb_card *cardp, const char *fwname)
 	if (fwname)
 		return request_firmware(&cardp->fw, fwname, &cardp->udev->dev);
 
+	/* Handle OLPC firmware */
+	if (try_olpc_fw(cardp) == 0)
+		return 0;
+
 	/* Otherwise search for firmware to use */
 	for (i = 0; i < ARRAY_SIZE(fw_table); i++) {
 		if (fw_table[i].model != cardp->model)
-- 
1.7.2.5

^ permalink raw reply related

* Re: [PATCH v3] libertas: prioritize usb8388_olpc.bin firmware on OLPC machines
From: Dan Williams @ 2011-09-09  1:00 UTC (permalink / raw)
  To: Andres Salomon
  Cc: linville, libertas-dev, linux-wireless, netdev, linux-kernel, dsd,
	cjb
In-Reply-To: <20110908173517.5b8c90be@queued.net>

On Thu, 2011-09-08 at 17:35 -0700, Andres Salomon wrote:
> Normally, the v9 firmware will be loaded if it's available.  However, on
> OLPC XO-1 machines, the olpc-specific firmware supports extra functionality.
> This makes the libertas driver attempt to load the custom firmware first
> if the machine is an OLPC machine; if that fails (or it's not an OLPC
> machine), fall back to attempting to load the other firmwares.
> 
> usb8388_olpc.bin is currently found in the linux-firmware repository.
> 
> Signed-off-by: Andres Salomon <dilinger@queued.net>

Acked-by: Dan Williams <dcbw@redhat.com>

> ---
> v3: changed firmware path to include libertas/ subdirectory.
> 
> Since the last submission (v2), usb8388_olpc.bin has found its way into
> wireless-firmware.  It'd be nice to get this into the next merge window.
> 
>  drivers/net/wireless/libertas/if_usb.c |   21 +++++++++++++++++++++
>  1 files changed, 21 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/net/wireless/libertas/if_usb.c b/drivers/net/wireless/libertas/if_usb.c
> index b5acc39..94f8f3e 100644
> --- a/drivers/net/wireless/libertas/if_usb.c
> +++ b/drivers/net/wireless/libertas/if_usb.c
> @@ -973,6 +973,23 @@ static const struct {
>  	{ MODEL_8682, "libertas/usb8682.bin" }
>  };
>  
> +#ifdef CONFIG_OLPC
> +
> +static int try_olpc_fw(struct if_usb_card *cardp)
> +{
> +	int retval = -ENOENT;
> +
> +	/* try the OLPC firmware first; fall back to fw_table list */
> +	if (machine_is_olpc() && cardp->model == MODEL_8388)
> +		retval = request_firmware(&cardp->fw,
> +				"libertas/usb8388_olpc.bin", &cardp->udev->dev);
> +	return retval;
> +}
> +
> +#else
> +static int try_olpc_fw(struct if_usb_card *cardp) { return -ENOENT; }
> +#endif /* !CONFIG_OLPC */
> +
>  static int get_fw(struct if_usb_card *cardp, const char *fwname)
>  {
>  	int i;
> @@ -981,6 +998,10 @@ static int get_fw(struct if_usb_card *cardp, const char *fwname)
>  	if (fwname)
>  		return request_firmware(&cardp->fw, fwname, &cardp->udev->dev);
>  
> +	/* Handle OLPC firmware */
> +	if (try_olpc_fw(cardp) == 0)
> +		return 0;
> +
>  	/* Otherwise search for firmware to use */
>  	for (i = 0; i < ARRAY_SIZE(fw_table); i++) {
>  		if (fw_table[i].model != cardp->model)

^ permalink raw reply

* Re: [omega-g1:11110] Re: [PATCH] net: configurable sysctl parameter "net.core.tcp_lowat" for sk_stream_min_wspace()
From: Jun.Kondo @ 2011-09-09  1:33 UTC (permalink / raw)
  To: David Miller
  Cc: linux-kernel, omega-g1, notsuki, motokazu.kozaki, htaira, netdev,
	tomohiko.takahashi, kotaro.sakai, ken.sugawara
In-Reply-To: <20110824.220016.781758659534965980.davem@davemloft.net>

The client of this system is cellular phone, and the
status of the communication line with a client varies
widely according to its place or congestion situation.

In terms of the line speed, it can be around 9Mbps
when it is fast, but 8kbps when it is slow.

Requirement from customer is to provide stable service
in both situation.

- In normal situation, acquire large default transmission
   buffer value, and ensure high throughput from the
   beginning of tcp connection

- On the other hand, even when the connection has low
   throughput, such as low rate streaming, transmit data
   without timeout

However, when the throughput is low, it takes much time
for the transmission buffer to be freed, and timeout
will occur during that period.

Of course, the connection will not be disconnected when
the timeout of application is extended, but end user
would not wait patiently as long as 1 minute.
Therefore, we do not want to extend the timeout value.

By making the threshold, which makes write possible after
the buffer is blocked once, configurable, and set it to a
small value, it will be possible to return data to client
without making timeout occur.

So, we think the issue can be solved with this
modification.

^ permalink raw reply

* [BUG?] tcp: potential bug in tcp_is_sackblock_valid()
From: Yan, Zheng @ 2011-09-09  1:45 UTC (permalink / raw)
  To: netdev@vger.kernel.org, davem@davemloft.net,
	eric.dumazet@gmail.com, herbert, "sfr@c

Hi all,

I found a check in tcp_is_sackblock_valid() is suspicious. It against
its comment and RFC. I think the correct check should be:

---
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 385c470..a5d01b1 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1124,7 +1124,7 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack,
                return 0;
 
        /* ...Then it's D-SACK, and must reside below snd_una completely */
-       if (!after(end_seq, tp->snd_una))
+       if (after(end_seq, tp->snd_una))
                return 0;
 
        if (!before(start_seq, tp->undo_marker))
---

I also checked /proc/net/netstat of my laptop, found TCPDSACKIgnoredOld
field is not zero. Maybe it's caused by the bug.

Regards
Yan, Zheng

^ permalink raw reply related

* Re: [BUG?] tcp: potential bug in tcp_is_sackblock_valid()
From: Herbert Xu @ 2011-09-09  1:54 UTC (permalink / raw)
  To: Yan, Zheng
  Cc: netdev@vger.kernel.org, davem@davemloft.net,
	eric.dumazet@gmail.com, sfr@canb.auug.org.au
In-Reply-To: <4E696FD0.7060702@intel.com>

On Fri, Sep 09, 2011 at 09:45:52AM +0800, Yan, Zheng wrote:
> Hi all,
> 
> I found a check in tcp_is_sackblock_valid() is suspicious. It against
> its comment and RFC. I think the correct check should be:
> 
> ---
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 385c470..a5d01b1 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -1124,7 +1124,7 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack,
>                 return 0;
>  
>         /* ...Then it's D-SACK, and must reside below snd_una completely */
> -       if (!after(end_seq, tp->snd_una))
> +       if (after(end_seq, tp->snd_una))
>                 return 0;
>  
>         if (!before(start_seq, tp->undo_marker))
> ---
> 
> I also checked /proc/net/netstat of my laptop, found TCPDSACKIgnoredOld
> field is not zero. Maybe it's caused by the bug.

Yes this looks like a typo.  Please resend your patch with a
description and signed-off-by line.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [BUG?] tcp: potential bug in tcp_is_sackblock_valid()
From: Yan, Zheng @ 2011-09-09  2:10 UTC (permalink / raw)
  To: Herbert Xu
  Cc: netdev@vger.kernel.org, davem@davemloft.net,
	eric.dumazet@gmail.com, sfr@canb.auug.org.au
In-Reply-To: <20110909015429.GA9156@gondor.apana.org.au>

On 09/09/2011 09:54 AM, Herbert Xu wrote:
> On Fri, Sep 09, 2011 at 09:45:52AM +0800, Yan, Zheng wrote:
>> Hi all,
>>
>> I found a check in tcp_is_sackblock_valid() is suspicious. It against
>> its comment and RFC. I think the correct check should be:
>>
>> ---
>> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
>> index 385c470..a5d01b1 100644
>> --- a/net/ipv4/tcp_input.c
>> +++ b/net/ipv4/tcp_input.c
>> @@ -1124,7 +1124,7 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack,
>>                 return 0;
>>  
>>         /* ...Then it's D-SACK, and must reside below snd_una completely */
>> -       if (!after(end_seq, tp->snd_una))
>> +       if (after(end_seq, tp->snd_una))
>>                 return 0;
>>  
>>         if (!before(start_seq, tp->undo_marker))
>> ---
>>
>> I also checked /proc/net/netstat of my laptop, found TCPDSACKIgnoredOld
>> field is not zero. Maybe it's caused by the bug.
> 
> Yes this looks like a typo.  Please resend your patch with a
> description and signed-off-by line.
> 

I think the bug has a big influence on tcp DSACKs. It's better to
leave it to people who fully understand tcp code.

Thanks.

^ permalink raw reply

* Re: [omega-g1:11110] Re: [PATCH] net: configurable sysctl parameter "net.core.tcp_lowat" for sk_stream_min_wspace()
From: David Miller @ 2011-09-09  2:17 UTC (permalink / raw)
  To: jun.kondo
  Cc: linux-kernel, omega-g1, notsuki, motokazu.kozaki, htaira, netdev,
	tomohiko.takahashi, kotaro.sakai, ken.sugawara
In-Reply-To: <4E696D06.3000003@ctc-g.co.jp>

From: "Jun.Kondo" <jun.kondo@ctc-g.co.jp>
Date: Fri, 09 Sep 2011 10:33:58 +0900

> - In normal situation, acquire large default transmission
>   buffer value, and ensure high throughput from the
>   beginning of tcp connection

You should never do this.  You should use the default buffer sizes and
as a result the kernel's TCP stack automatically adjusts the send and
receive buffers in response to the link characteristics.

When you set explicit buffer sizes, this turns off the TCP stack's
auto-tuning mechanism.

Every argument made in support of your proposed feature is based upon
a false premise of one kind of another, and this is yet another example
of this.

^ permalink raw reply

* Re: [PATCH v2 1/9] per-netns ipv4 sysctl_tcp_mem
From: KAMEZAWA Hiroyuki @ 2011-09-09  2:47 UTC (permalink / raw)
  To: Glauber Costa
  Cc: linux-kernel, linux-mm, containers, netdev, xemul,
	David S. Miller, Eric W. Biederman
In-Reply-To: <1315369399-3073-2-git-send-email-glommer@parallels.com>

On Wed,  7 Sep 2011 01:23:11 -0300
Glauber Costa <glommer@parallels.com> wrote:

> This patch allows each namespace to independently set up
> its levels for tcp memory pressure thresholds. This patch
> alone does not buy much: we need to make this values
> per group of process somehow. This is achieved in the
> patches that follows in this patchset.
> 
> Signed-off-by: Glauber Costa <glommer@parallels.com>
> CC: David S. Miller <davem@davemloft.net>
> CC: Hiroyouki Kamezawa <kamezawa.hiroyu@jp.fujitsu.com>
> CC: Eric W. Biederman <ebiederm@xmission.com>


Hmm, it may be better to post this patch as independent one.

I'm not familiar with this area...but try review ;)

> ---
>  include/net/netns/ipv4.h   |    1 +
>  include/net/tcp.h          |    1 -
>  net/ipv4/sysctl_net_ipv4.c |   51 +++++++++++++++++++++++++++++++++++++------
>  net/ipv4/tcp.c             |   13 +++-------
>  4 files changed, 49 insertions(+), 17 deletions(-)
> 
> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> index d786b4f..bbd023a 100644
> --- a/include/net/netns/ipv4.h
> +++ b/include/net/netns/ipv4.h
> @@ -55,6 +55,7 @@ struct netns_ipv4 {
>  	int current_rt_cache_rebuild_count;
>  
>  	unsigned int sysctl_ping_group_range[2];
> +	long sysctl_tcp_mem[3];
>  
>  	atomic_t rt_genid;
>  	atomic_t dev_addr_genid;

Hmm, in original placement, sysctl_tcp_mem[] was on __read_mostly
area. Doesn't this placement cause many cache invalidations ?



> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 149a415..6bfdd9b 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -230,7 +230,6 @@ extern int sysctl_tcp_fack;
>  extern int sysctl_tcp_reordering;
>  extern int sysctl_tcp_ecn;
>  extern int sysctl_tcp_dsack;
> -extern long sysctl_tcp_mem[3];
>  extern int sysctl_tcp_wmem[3];
>  extern int sysctl_tcp_rmem[3];
>  extern int sysctl_tcp_app_win;
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index 69fd720..0d74b9d 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -14,6 +14,7 @@
>  #include <linux/init.h>
>  #include <linux/slab.h>
>  #include <linux/nsproxy.h>
> +#include <linux/swap.h>
>  #include <net/snmp.h>
>  #include <net/icmp.h>
>  #include <net/ip.h>
> @@ -174,6 +175,36 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
>  	return ret;
>  }
>  
> +static int ipv4_tcp_mem(ctl_table *ctl, int write,
> +			   void __user *buffer, size_t *lenp,
> +			   loff_t *ppos)
> +{
> +	int ret;
> +	unsigned long vec[3];
> +	struct net *net = current->nsproxy->net_ns;
> +	int i;
> +
> +	ctl_table tmp = {
> +		.data = &vec,
> +		.maxlen = sizeof(vec),
> +		.mode = ctl->mode,
> +	};
> +
> +	if (!write) {
> +		ctl->data = &net->ipv4.sysctl_tcp_mem;
> +		return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos);
> +	}
> +
> +	ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
> +	if (ret)
> +		return ret;
> +
> +	for (i = 0; i < 3; i++)
> +		net->ipv4.sysctl_tcp_mem[i] = vec[i];
> +
> +	return 0;
> +}
> +
>  static struct ctl_table ipv4_table[] = {
>  	{
>  		.procname	= "tcp_timestamps",
> @@ -433,13 +464,6 @@ static struct ctl_table ipv4_table[] = {
>  		.proc_handler	= proc_dointvec
>  	},
>  	{
> -		.procname	= "tcp_mem",
> -		.data		= &sysctl_tcp_mem,
> -		.maxlen		= sizeof(sysctl_tcp_mem),
> -		.mode		= 0644,
> -		.proc_handler	= proc_doulongvec_minmax
> -	},
> -	{
>  		.procname	= "tcp_wmem",
>  		.data		= &sysctl_tcp_wmem,
>  		.maxlen		= sizeof(sysctl_tcp_wmem),
> @@ -721,6 +745,12 @@ static struct ctl_table ipv4_net_table[] = {
>  		.mode		= 0644,
>  		.proc_handler	= ipv4_ping_group_range,
>  	},
> +	{
> +		.procname	= "tcp_mem",
> +		.maxlen		= sizeof(init_net.ipv4.sysctl_tcp_mem),
> +		.mode		= 0644,
> +		.proc_handler	= ipv4_tcp_mem,
> +	},
>  	{ }
>  };
>  




> @@ -734,6 +764,7 @@ EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
>  static __net_init int ipv4_sysctl_init_net(struct net *net)
>  {
>  	struct ctl_table *table;
> +	unsigned long limit;
>  
>  	table = ipv4_net_table;
>  	if (!net_eq(net, &init_net)) {
> @@ -769,6 +800,12 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
>  
>  	net->ipv4.sysctl_rt_cache_rebuild_count = 4;
>  
> +	limit = nr_free_buffer_pages() / 8;
> +	limit = max(limit, 128UL);
> +	net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
> +	net->ipv4.sysctl_tcp_mem[1] = limit;
> +	net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
> +
>  	net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
>  			net_ipv4_ctl_path, table);
>  	if (net->ipv4.ipv4_hdr == NULL)
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 46febca..f06df24 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -266,6 +266,7 @@
>  #include <linux/crypto.h>
>  #include <linux/time.h>
>  #include <linux/slab.h>
> +#include <linux/nsproxy.h>
>  
>  #include <net/icmp.h>
>  #include <net/tcp.h>
> @@ -282,11 +283,9 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
>  struct percpu_counter tcp_orphan_count;
>  EXPORT_SYMBOL_GPL(tcp_orphan_count);
>  
> -long sysctl_tcp_mem[3] __read_mostly;
>  int sysctl_tcp_wmem[3] __read_mostly;
>  int sysctl_tcp_rmem[3] __read_mostly;
>  
> -EXPORT_SYMBOL(sysctl_tcp_mem);
>  EXPORT_SYMBOL(sysctl_tcp_rmem);
>  EXPORT_SYMBOL(sysctl_tcp_wmem);
>  
> @@ -3277,14 +3276,10 @@ void __init tcp_init(void)
>  	sysctl_tcp_max_orphans = cnt / 2;
>  	sysctl_max_syn_backlog = max(128, cnt / 256);
>  
> -	limit = nr_free_buffer_pages() / 8;
> -	limit = max(limit, 128UL);
> -	sysctl_tcp_mem[0] = limit / 4 * 3;
> -	sysctl_tcp_mem[1] = limit;
> -	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
> -
>  	/* Set per-socket limits to no more than 1/128 the pressure threshold */
> -	limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
> +	limit = (unsigned long)init_net.ipv4.sysctl_tcp_mem[1];
> +	limit <<= (PAGE_SHIFT - 7);
> +

I'm not sure but...why defined as 'long'  ?



BTW, when I grep,

tcp_input.c:        atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
tcp_input.c:    if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])

Don't you need to change this ?



Thanks,
-Kame







--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [net-next-2.6 PATCH 0/3 RFC] macvlan: MAC Address filtering support for passthru mode
From: Roopa Prabhu @ 2011-09-09  2:53 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: netdev, dragos.tatulea, arnd, dwang2, benve, kaber, sri, davem,
	eric.dumazet, mchan, kvm
In-Reply-To: <20110908182509.GB469@redhat.com>




On 9/8/11 12:11 PM, "Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Thu, Sep 08, 2011 at 09:19:32AM -0700, Roopa Prabhu wrote:
>>>>> There are more features we'll want down the road though,
>>>>> so let's see whether the interface will be able to
>>>>> satisfy them in a backwards compatible way before we
>>>>> set it in stone. Here's what I came up with:
>>>>> 
>>>>> How will the filtering table be partitioned within guests?
>>>> 
>>>> Since this patch supports macvlan PASSTHRU mode only, in which the lower
>>>> device has 1-1 mapping to the guest nic, it does not require any
>>>> partitioning of filtering table within guests. Unless I missed
>>>> understanding
>>>> something. 
>>>> If the lower device were being shared by multiple guest network interfaces
>>>> (non PASSTHRU mode), only then we will need to maintain separate filter
>>>> tables for each guest network interface in macvlan and forward the pkt to
>>>> respective guest interface after a filter lookup. This could affect
>>>> performance too I think.
>>> 
>>> Not with hardware filtering support. Which is where we'd need to
>>> partition the host nic mac table between guests.
>>> 
>> I need to understand this more. In non passthru case when a VF or physical
>> nic is shared between guests,
> 
> For example, consider a VF given to each guest. Hardware supports a fixed
> total number of filters, which can be partitioned between VFs.
> 
O ok. But hw maintains VF filters separately for every VF as far as I know.
Filters received on a VF are programmed for that VF only. Am assuming all
hardware do this. Atleast our hardware does this.
What I was referring to was a single VF shared between guests using macvtap
(could be bridge mode for example). All guests sharing the VF will register
 filters with the VF via macvlan. Hw makes sure what ever the VF asked for
is received at the VF. VF in hw does not know that it is shared by guests.
Only at macvlan we might need to re-filter the pkts received on the VF and
steer pkts to the individual guests based on what they asked for.


>> the nic does not really know about the guests,
>> so I was thinking we do the same thing as we do for the passthru case (ie
>> send all the address filters from macvlan to the physical nic). So at the
>> hardware, filtering is done for all guests sharing the nic. But if we want
>> each virtio-net nic or guest to get exactly what it asked for
>> macvlan/macvtap needs to maintain a copy of each guest filter and do a
>> lookup and send only the requested traffic to the guest. Here is the
>> performance hit that I was seeing. Please see my next comment for further
>> details. 
> 
> It won't be any slower than attaching a non-passthrough macvlan
> to a device, will it?
> 
Am not sure. The filter lookup in macvlan is the one I am concerned about.
Will need to try it out.

>> 
>>>> I chose to support PASSTHRU Mode only at first because its simpler and all
>>>> code additions are in control path only.
>>> 
>>> I agree. It would be a bit silly to have a dedicated interface
>>> for passthough and a completely separate one for
>>> non passthrough.
>>> 
>> Agree. The reason I did not focus on non-passthru case in the initial
>> version was because I was thinking things to do in the non-passthru case
>> will be just add-ons to the passthru case. But true Better to flush out the
>> non-pasthru case details.
>> 
>> After dwelling on this a bit more how about the below:
>> 
>> Phase 1: Goal: Enable hardware filtering for all macvlan modes
>>     - In macvlan passthru mode the single guest virtio-nic connected will
>>       receive traffic that he requested for
>>     - In macvlan non-passthru mode all guest virtio-nics sharing the
>>       physical nic will see all other guest traffic
>>       but the filtering at guest virtio-nic
> 
> I don't think guests currently filter anything.
> 
I was referring to Qemu-kvm virtio-net in
virtion_net_receive->receive_filter. I think It only passes pkts that the
guest OS is interested. It uses the filter table that I am passing to
macvtap in this patch.


>>       will make sure each guest
>>       eventually sees traffic he asked for. This is still better than
>>       putting the physical nic in promiscuous mode.
>> 
>> (This is mainly what my patch does...but will need to remove the passthru
>> check and see if there are any thing else needed for non-passthru case)
> 
> I'm fine with sticking with passthrough, make non passthrough
> a separate phase.
> 
Ok.

>> 
>> Phase 2: Goal: Enable filtering at macvlan so that each guest virtio-nic
>> receives only what he requested for.
>>     - In this case, in addition to pushing the filters down to the physical
>>       nic we will have to maintain the same filter in macvlan and do a filter
>>       lookup before forwarding the traffic to a virtio-nic.
>> 
>> But I am thinking phase 2 might be redundant given virtio-nic already does
>> filtering for the guest.
> 
> It does? Do you mean the filter that qemu does in userspace?
> 
Yes I meant the filter that qemu does in userspace qemu-kvm/hw/virtio-net.c
receive_filter(). 

>> In which case we might not need phase 2 at all. I
>> might have been over complicating things.
>> 
>> Please comment. And please correct if I missed something.
>>  
>>  
>>>>> 
>>>>> A way to limit what the guest can do would also be useful.
>>>>> How can this be done? selinux?
>>>> 
>>>> I vaguely remember a thread on the same context.. had a suggestion to
>>>> maintain pre-approved address lists and allow guest filter registration of
>>>> only those addresses for security. This seemed reasonable. Plus the ability
>>>> to support additional address registration from guest could be made
>>>> configurable (One of your ideas again from prior work).
>>>> 
>>>> I am not an selinux expert, but I am thinking we can use it to only allow
>>>> or
>>>> disallow access or operations to the macvtap device. (?). I will check more
>>>> on this.
>>> 
>>> We'd have to have a way to revoke that as well.
>>> 
>> Yes true.
>> 
>> 
>>>>> 
>>>>> Any thoughts on spoofing filtering?
>>>> 
>>>> I can only think of checking addresses against an allowed address list.
>>>> Don't know of any other ways. Any hints ?
>>> 
>>> Hardware (esp SRIOV) often has ways to do this check, too.
>>> 
>> Yes correct. Hw sriov and even switch in 802.1Qbh has anti-spoofing feature.
>> In which case I am thinking having It at the macvtap layer is not an
>> absolute must (?).
> 
> Exactly. But let's figure out *how* it will be programmed.
> If anti-spoofing is programmed with netlink, maybe that's
> a better interface for rx filter too, for consistency.
> 
I will check sriov vfs. But I know our hw does not have an interface exposed
from the driver. We do it through the management s/w at the switch port
(this is 802.1Qbh). I will check other sriov nics. I don't think intel has a
netlink interface either. But I will double check.


>>>> 
>>>> In any case I am assuming all the protection/security measures should be
>>>> taken at the layer calling the TUNSETTXFILTER ie..In macvtap virtualization
>>>> use case its libvirt or qemu-kvm. No ?
>>> 
>>> Ideally we'd have a way to separate these capabilities, so that libvirt
>>> can override qemu.
>>> 
>>>>> 
>>>>> Would it be possible to make the filtering programmable
>>>>> using netlink, e.g. ethtool, ip, or some such?
>>>> 
>>>> Should be possible via ethtool or ip calling ioctl TUNSETTXFILTER. Are you
>>>> thinking of macvlan having a netlink interface to set filter and not ioctl
>>>> ?. Sure.
>>> 
>>> Yes.
>>> 
>>>> But I was thinking the point of implementing TUNSETTXFILTER was to
>>>> maintain compatibility with the generic tap interface that does the same
>>>> thing. 
>>> 
>>> Yes. OTOH I don't think anyone uses that ATM so it might not
>>> be important if it's not a good fit.
>>> E.g. we could notify libvirt and have it use netlink for us
>>> if we like that better.
>>> 
>> Ok thanks for clarifying that. One more reason to use TUNSETTXFILTER
>> interface was for qemu-kvm who uses the same tap interface for macvtap and
>> regular tap. So if we use netlink we have to do different things for macvtap
>> and tap filters in qemu. And qemu-kvm does not distinguish between macvtap
>> and tap as far as I know. No ?
> 
> It's not a question of simplifying qemu as much as trying to
> make the kernel interface abstract device differences
> away from users. Using same interface for tun and macvtap
> gave us some confidence that the interface is a good one.
> 
> But this does not seem to have worked with TUNSETTXFILTER -
> at least qemu doesn't use it yet, and it's been upstream
> a while. So there's no proof it's a good interface.

All qemu-kvm patches (not upstream yet) that have been floating around and
which I have been testing with use TUNSETTXFILTER. And tun kernel driver
does support filtering using TUNSETTXFILTER. It has code that does filtering
based on filter received using TUNSETTXFILTER.


But true we don't have to stick with TUNSETTXFILTER. I will explore the
netlink interface and research pros and cons and sketch the interface and
get back. 

> 
> So if we decide netlink is a better interface we can add it for tun too.
> We need to be backwards compatible and figure out what happens if someone
> tries to use both methods: probably apply both or ignore TUNSETTXFILTER ...
> 
>> 
>> Thanks you for your review and comments.
>> 
>> 
>>>> And having both the netlink op and ioctl interface might not be clean ?.
>>> 
>>> No idea.
>>> 
>>>> Sorry if I misunderstood your question.
>>>> 
>>>>> That would make this useful for bridged setups besides
>>>>> macvtap/virtualization.
>>>>> 
>>>> 
>>>> Thanks for the comments.
> 
> Overall good progress, and don't let the interface discussions
> block you. You want to push in two directions - stabilize code in one
> branch, and play with interfaces in another one. By the time there's a
> concensus on the interfaces you have the main logic all ready,
> then you merge. 

ok thanks michael! 

- Roopa



^ permalink raw reply

* Re: [net-next-2.6 PATCH 0/3 RFC] macvlan: MAC Address filtering support for passthru mode
From: Roopa Prabhu @ 2011-09-09  3:00 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Sridhar Samudrala, netdev, dragos.tatulea, arnd, dwang2, benve,
	kaber, davem, eric.dumazet, mchan, kvm
In-Reply-To: <20110908193332.GA2476@redhat.com>

On 9/8/11 12:33 PM, "Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Thu, Sep 08, 2011 at 12:23:56PM -0700, Roopa Prabhu wrote:
>>> 
>>> I think the main usecase for passthru mode is to assign a SR-IOV VF to
>>> a single guest.
>>> 
>> Yes and for the passthru usecase this patch should be enough to enable
>> filtering in hw (eventually like I indicated before I need to fix vlan
>> filtering too).
> 
> So with filtering in hw, and in sriov VF case, VFs
> actually share a filtering table. How will that
> be partitioned?

AFAIK, though it might maintain a single filter table space in hw, hw does
know which filter belongs to which VF. And the OS driver does not need to do
anything special. The VF driver exposes a VF netdev. And any uc/mc addresses
registered with a VF netdev are registered with the hw by the driver. And hw
will filter and send only pkts that the VF has expressed interest in.

No special filter partitioning in hw is required.

Thanks,
Roopa

^ permalink raw reply

* Re: [PATCH v2 6/9] per-cgroup tcp buffers control
From: KAMEZAWA Hiroyuki @ 2011-09-09  3:12 UTC (permalink / raw)
  To: Glauber Costa
  Cc: linux-kernel, linux-mm, containers, netdev, xemul,
	David S. Miller, Eric W. Biederman
In-Reply-To: <1315369399-3073-7-git-send-email-glommer@parallels.com>

On Wed,  7 Sep 2011 01:23:16 -0300
Glauber Costa <glommer@parallels.com> wrote:

> With all the infrastructure in place, this patch implements
> per-cgroup control for tcp memory pressure handling.
> 
> Signed-off-by: Glauber Costa <glommer@parallels.com>
> CC: David S. Miller <davem@davemloft.net>
> CC: Hiroyouki Kamezawa <kamezawa.hiroyu@jp.fujitsu.com>
> CC: Eric W. Biederman <ebiederm@xmission.com>

Hmm, then, kmem_cgroup.c is just a caller of plugins implemented
by other components ?

> ---
>  include/linux/kmem_cgroup.h |    7 ++++
>  include/net/sock.h          |   10 ++++++-
>  mm/kmem_cgroup.c            |   10 ++++++-
>  net/core/sock.c             |   18 +++++++++++
>  net/ipv4/tcp.c              |   67 +++++++++++++++++++++++++++++++++++++-----
>  5 files changed, 102 insertions(+), 10 deletions(-)
> 
> diff --git a/include/linux/kmem_cgroup.h b/include/linux/kmem_cgroup.h
> index d983ba8..89ad0a1 100644
> --- a/include/linux/kmem_cgroup.h
> +++ b/include/linux/kmem_cgroup.h
> @@ -23,6 +23,13 @@
>  struct kmem_cgroup {
>  	struct cgroup_subsys_state css;
>  	struct kmem_cgroup *parent;
> +
> +#ifdef CONFIG_INET
> +	int tcp_memory_pressure;
> +	atomic_long_t tcp_memory_allocated;
> +	struct percpu_counter tcp_sockets_allocated;
> +	long tcp_prot_mem[3];
> +#endif
>  };

I think you should place 'read-mostly' values carefully.


>  
> diff --git a/include/net/sock.h b/include/net/sock.h
> index ab65640..91424e3 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -64,6 +64,7 @@
>  #include <net/dst.h>
>  #include <net/checksum.h>
>  
> +int sockets_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
>  /*
>   * This structure really needs to be cleaned up.
>   * Most of it is for TCP, and not used by any of
> @@ -814,7 +815,14 @@ struct proto {
>  	int			*(*memory_pressure)(struct kmem_cgroup *sg);
>  	/* Pointer to the per-cgroup version of the the sysctl_mem field */
>  	long			*(*prot_mem)(struct kmem_cgroup *sg);
> -
> +	/*
> +	 * cgroup specific initialization function. Called once for all
> +	 * protocols that implement it, from cgroups populate function.
> +	 * This function has to setup any files the protocol want to
> +	 * appear in the kmem cgroup filesystem.
> +	 */
> +	int			(*init_cgroup)(struct cgroup *cgrp,
> +					       struct cgroup_subsys *ss);
>  	int			*sysctl_wmem;
>  	int			*sysctl_rmem;
>  	int			max_header;
> diff --git a/mm/kmem_cgroup.c b/mm/kmem_cgroup.c
> index 7950e69..5e53d66 100644
> --- a/mm/kmem_cgroup.c
> +++ b/mm/kmem_cgroup.c
> @@ -17,16 +17,24 @@
>  #include <linux/cgroup.h>
>  #include <linux/slab.h>
>  #include <linux/kmem_cgroup.h>
> +#include <net/sock.h>
>  
>  static int kmem_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
>  {
> -	return 0;
> +	int ret = 0;
> +#ifdef CONFIG_NET
> +	ret = sockets_populate(ss, cgrp);
> +#endif

CONFIG_INET ?

> +	return ret;
>  }
>  
>  static void
>  kmem_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
>  {
>  	struct kmem_cgroup *cg = kcg_from_cgroup(cgrp);
> +#ifdef CONFIG_INET
> +	percpu_counter_destroy(&cg->tcp_sockets_allocated);
> +#endif
>  	kfree(cg);
>  }
>  
> diff --git a/net/core/sock.c b/net/core/sock.c
> index ead9c02..9d833cf 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -134,6 +134,24 @@
>  #include <net/tcp.h>
>  #endif
>  
> +static DEFINE_RWLOCK(proto_list_lock);
> +static LIST_HEAD(proto_list);
> +
> +int sockets_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
> +{
> +	struct proto *proto;
> +	int ret = 0;
> +
> +	read_lock(&proto_list_lock);
> +	list_for_each_entry(proto, &proto_list, node) {
> +		if (proto->init_cgroup)
> +			ret |= proto->init_cgroup(cgrp, ss);
> +	}
> +	read_unlock(&proto_list_lock);
> +
> +	return ret;
> +}

Hmm, I don't understand this part but...

	ret |= ...

and no 'undo'  ? If no 'undo', ->init_cgroup() should success
always and no return value is required. 





> +
>  /*
>   * Each address family might have different locking rules, so we have
>   * one slock key per address family:
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 76f03ed..0725dc4 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -289,13 +289,6 @@ int sysctl_tcp_rmem[3] __read_mostly;
>  EXPORT_SYMBOL(sysctl_tcp_rmem);
>  EXPORT_SYMBOL(sysctl_tcp_wmem);
>  
> -atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
> -
> -/*
> - * Current number of TCP sockets.
> - */
> -struct percpu_counter tcp_sockets_allocated;
> -
>  /*
>   * TCP splice context
>   */
> @@ -305,13 +298,68 @@ struct tcp_splice_state {
>  	unsigned int flags;
>  };
>  
> +#ifdef CONFIG_CGROUP_KMEM
>  /*
>   * Pressure flag: try to collapse.
>   * Technical note: it is used by multiple contexts non atomically.
>   * All the __sk_mem_schedule() is of this nature: accounting
>   * is strict, actions are advisory and have some latency.
>   */
> -int tcp_memory_pressure __read_mostly;
> +void tcp_enter_memory_pressure(struct sock *sk)
> +{
> +	struct kmem_cgroup *sg = sk->sk_cgrp;
> +	if (!sg->tcp_memory_pressure) {
> +		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
> +		sg->tcp_memory_pressure = 1;
> +	}
> +}
> +
> +long *tcp_sysctl_mem(struct kmem_cgroup *sg)
> +{
> +	return sg->tcp_prot_mem;
> +}
> +
> +atomic_long_t *memory_allocated_tcp(struct kmem_cgroup *sg)
> +{
> +	return &(sg->tcp_memory_allocated);
> +}
> +
> +int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
> +{
> +	struct kmem_cgroup *sg = kcg_from_cgroup(cgrp);
> +	unsigned long limit;
> +	struct net *net = current->nsproxy->net_ns;
> +
> +	sg->tcp_memory_pressure = 0;
> +	atomic_long_set(&sg->tcp_memory_allocated, 0);
> +	percpu_counter_init(&sg->tcp_sockets_allocated, 0);
> +
> +	limit = nr_free_buffer_pages() / 8;
> +	limit = max(limit, 128UL);
> +
> +	sg->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0];
> +	sg->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1];
> +	sg->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2];
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL(tcp_init_cgroup);
> +
> +int *memory_pressure_tcp(struct kmem_cgroup *sg)
> +{
> +	return &sg->tcp_memory_pressure;
> +}
> +
> +struct percpu_counter *sockets_allocated_tcp(struct kmem_cgroup *sg)
> +{
> +	return &sg->tcp_sockets_allocated;
> +}
> +#else
> +
> +/* Current number of TCP sockets. */
> +struct percpu_counter tcp_sockets_allocated;
> +atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
> +int tcp_memory_pressure;
>  

you dropped __read_mostly.

Thanks,
-kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH] per-cgroup tcp buffer limitation
From: Glauber Costa @ 2011-09-09  4:17 UTC (permalink / raw)
  To: Greg Thelen
  Cc: linux-kernel, linux-mm, containers, netdev, xemul,
	David S. Miller, Hiroyouki Kamezawa, Eric W. Biederman,
	Suleiman Souhlal
In-Reply-To: <CAHH2K0YcXMUfd1Zr=f5a4=X9cPPp8NZiuichFXaOo=kVp5rRJA@mail.gmail.com>

On 09/08/2011 06:53 PM, Greg Thelen wrote:
> On Wed, Sep 7, 2011 at 9:44 PM, Glauber Costa<glommer@parallels.com>  wrote:
>
> Thanks for your ideas and patience.

Likewise. It is turning out to be a very fruitful
discussion.

>> Well, it is a way to see this. The other way to see this, is that you're
>> proposing to move to the kernel, something that really belongs in userspace.
>> That's because:
>>
>> With the information you provided me, I have no reason to believe that the
>> kernel has more condition to do this work. Do the kernel have access to any
>> information that userspace do not, and can't be exported? If not, userspace
>> is traditionally where this sort of stuff has been done.
>
> I think direct reclaim is a pain if user space is required to participate in
> memory balancing decisions.
It depends on the decision.

> One thing a single memory limit solution has is the
> ability to reclaim user memory to satisfy growing kernel memory needs (and vise
> versa).

this works for a strict definition of the word "needs". If I *need*
more kernel memory, I'd be happy to have more. But if I just want to
screw other containers, they will be happy if I don't get what I
"need" - it can be unreclaimable. Since those are limits, they are
expected to be, in any real setups, greater than any use people
should be doing, and yet, prevent bad usage scenarios.

> If a container must fit within 100M, then a single limit solution
> would set the limit to 100M and never change it.  In a split limit solution a
> user daemon (e.g. uswapd) would need to monitor the usage and the amount of
> active memory vs inactive user memory and unreferenced kernel memory to
> determine where to apply pressure.

Or it can just define some parameters and let the kernel do the
rest. Like for instance, a maximum proportion allowed, a maximum
proportion desired, etc.

> With some more knobs such a uswapd could
> attempt to keep ahead of demand.  But eventually direct reclaim would
> be needed to satisfy rapid growth spikes.  Example: If the 100M container
> starts with limits of 20M kmem and 80M user memory but later its kernel
> memory needs grow to 70M.  With separate user and kernel memory
> limits the kernel memory allocation could fail despite there being
> reclaimable user pages available.

No no, this is a ratio, not a *limit*. A limit is something you
should not be allowed to go over. A good limit of kernel memory for
a 100 Mb container could be something like... 100 mb. But there is
more to that.

Risking being a bit polemic here, I think that when we do
containers, we have to view the kernel a little bit like a shared 
resource that is not accounted to anybody. It is easy to account things 
like tcp buffer, but who do you account page tables for shared pages to? 
Or pinned dentries for shared filesystems ?

Being the shared table under everybody, the kernel is more or less
like buffers inside a physical hdd, or cache lines. You know it is
there, you know it has a size, but provided you have some sane
protections, you don't really care - because in most cases you can't
- who is using it.

> The job should have a way to
> transition to memory limits to 70M+ kernel and 30M- of user.

Yes, and I don't see how what I propose prevents that.

> I suppose a GFP_WAIT slab kernel page allocation could wakeup user space to
> perform user-assisted direct reclaim.  User space would then lower the user
> limit thereby causing the kernel to direct reclaim user pages, then
> the user daemon would raise the kernel limit allowing the slab allocation to
> succeed.  My hunch is that this would be prone to deadlocks (what prevents
> uswapd from needing more even more kmem?)  I'll defer to more
> experienced minds to know if user assisted direct memory reclaim has
> other pitfalls.  It scares me.

Good that it scares you, it should. OTOH, userspace being
able to set parameters to it, has nothing scary at all. A daemon in
userspace can detect that you need more kernel space memory , and
then - according to a policy you abide to - write to a file allowing
it more, or maybe not - according to that same policy. It is very
far away from "userspace driven reclaim".

> Fundamentally I have no problem putting an upper bound on a cgroup's resource
> usage.  This serves to contain the damage a job can do to the system and other
> jobs.  My concern is about limiting the kernel's ability to trade one type of
> memory for another by using different cgroups for different types of memory.
Yes, but limits have nothing to do with it.

>
> If kmem expands to include reclaimable kernel memory (e.g. dentry) then I
> presume the kernel would have no way to exchange unused user pages for dentry
> pages even if the user memory in the container is well below its limit.  This is
> motivation for the above user assisted direct reclaim.

Dentry is not always reclaimable. If it is pinned, it is non
reclaimable. Speaking of it, Would you take a look at
https://lkml.org/lkml/2011/8/14/110 ?

I am targetting dentry as well. But since it is hard to assign a
dentry to a process all the time, going through a different path. I
however, haven't entirely given up of doing it cgroups based, so any
ideas are welcome =)

> Do you feel the need to segregate user and kernel memory into different cgroups
> with independent limits?  Or is this this just a way to create a new clean
> cgroup with a simple purpose?

No, I don't necessarily feel that need. I just thought it was
cleaner to have entities with different purposes in different
cgroups. If moving it to the memory controller would help you in any
way, I can just do it. 80 % of this work is independent of where a
cgroup file lives.

> In some resource sharing shops customers purchase a certain amount of memory,
> cpu, network, etc.  Such customers don't define how the memory is used and the
> user/kernel mixture may change over time.  Can a user space reclaim daemon stay
> ahead of the workloads needs?

If you think solely about limits, you don't need to. The most
sane policy is actually "I don't care what is the kernel/user ratio,
as long as the kernel never grows over X Mb".

>> Using userspace CPU is no different from using kernel cpu in this particular
>> case. It is all overhead, regardless where it comes from. Moreover, you end
>> up setting up a policy, instead of a mechanism. What should be this
>> proportion?  Do we reclaim everything with the same frequency? Should we be
>> more tolerant with a specific container?
>
> I assume that this implies that a generic kmem cgroup usage is inferior to
> separate limits for each kernel memory type to allow user space the flexibility
> to choose between kernel types (udp vs tcp vs ext4 vs page_tables vs ...)?  Do
> you foresee a way to provide a limit on the total amount of kmem usage by all
> such types?  If a container wants to dedicate 4M for all network protocol
> buffers (tcp, udp, etc.) would that require a user space daemon to balance
> memory limits b/w the protocols?

Well, I am giving this an extra thought... Having separate knobs
adds flexibility, but - as usual - also complexity. For the goals I
have in mind, "kernel memory" would work just as fine.

If you look carefully at the other patches in the series besides
this one, you'll see that it is just a matter of billing from kernel
memory instead of tcp-memory, and then all the rest is the same.

Do you think that a single kernel-memory knob would be better for
your needs? I am willing to give it a try.
>> Also, If you want to allow any flexibility in this scheme, like: "Should
>> this network container be able to stress the network more, pinning more
>> memory, but not other subsystems?", you end up having to touch all
>> individual files anyway - probably with a userspace daemon.
>>
>> Also, as you noticed yourself, kernel memory is fundamentally different from
>> userspace memory. You can't just set reclaim limits, since you have no
>> guarantees it will work. User memory is not a scarce resource.
>> Kernel memory is.
>
> I agree that kernel memory is somewhat different.  In some (I argue most)
> situations containers want the ability to exchange job kmem and job umem.
> Either split or combined accounting protects the system and isolates other
> containers from kmem allocations of a bad job.  To me it seems natural to
> indicate that job X gets Y MB of memory.  I have more trouble dividing the
> Y MB of memory into dedicated slices for different types of memory.

I understand. And I don't think anyone doing containers
should be mandated to define a division. But a limit...
>>> While there are people (like me) who want a combined memory usage
>>> limit there are also people (like you) who want separate user and
>>> kernel limiting.
>>
>> Combined excludes separate. Separate does not exclude combined.
>
> I agree.  I have no problem with separate accounting and separate
> user-accessible pressure knobs to allow for complex policies.  My concern is
> about limiting the kernel's ability to reclaim one type of memory to
> fulfill the needs of another memory type (e.g. I think reclaiming clean file
> pages should be possible to make room for user slab needs).

I agree with your concern. It is definitely something we should not
do.

> I think
> memcg aware slab accounting does a good job of limiting a job's
> memory allocations.
> Would such slab accounting meet your needs?

Well, the slab alone, no. There are other objects - like tcp buffers
- that aren't covered by the slab. Others are usually shared among
many cgroups, and others don't really belong to anybody in
particular.igh

How do you think then, about turning this into 2 files inside memcg:

  - kernel_memory_hard_limit.
  - kernel_memory_soft_limit.

tcp memory would be the one defined in /proc, except if it is
greater than any of the limits. Instead of testing for memory
allocation against kmem.tcp_allocated_memory, we'd test it against
memcg.kmem_pinned_memory.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH v2 1/9] per-netns ipv4 sysctl_tcp_mem
From: Glauber Costa @ 2011-09-09  4:19 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-kernel, linux-mm, containers, netdev, xemul,
	David S. Miller, Eric W. Biederman
In-Reply-To: <20110909114720.f050c520.kamezawa.hiroyu@jp.fujitsu.com>

On 09/08/2011 11:47 PM, KAMEZAWA Hiroyuki wrote:
> On Wed,  7 Sep 2011 01:23:11 -0300
> Glauber Costa<glommer@parallels.com>  wrote:
>
>> This patch allows each namespace to independently set up
>> its levels for tcp memory pressure thresholds. This patch
>> alone does not buy much: we need to make this values
>> per group of process somehow. This is achieved in the
>> patches that follows in this patchset.
>>
>> Signed-off-by: Glauber Costa<glommer@parallels.com>
>> CC: David S. Miller<davem@davemloft.net>
>> CC: Hiroyouki Kamezawa<kamezawa.hiroyu@jp.fujitsu.com>
>> CC: Eric W. Biederman<ebiederm@xmission.com>
>
>
> Hmm, it may be better to post this patch as independent one.


Maybe we can search acks from eric about this one
specifically prior to merging, but I'd still like it to be part of
the whole. It will put us in a weird state if this is merged, and
the rest is not.

> I'm not familiar with this area...but try review ;)
Thank you!

>
>> ---
>>   include/net/netns/ipv4.h   |    1 +
>>   include/net/tcp.h          |    1 -
>>   net/ipv4/sysctl_net_ipv4.c |   51 +++++++++++++++++++++++++++++++++++++------
>>   net/ipv4/tcp.c             |   13 +++-------
>>   4 files changed, 49 insertions(+), 17 deletions(-)
>>
>> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
>> index d786b4f..bbd023a 100644
>> --- a/include/net/netns/ipv4.h
>> +++ b/include/net/netns/ipv4.h
>> @@ -55,6 +55,7 @@ struct netns_ipv4 {
>>   	int current_rt_cache_rebuild_count;
>>
>>   	unsigned int sysctl_ping_group_range[2];
>> +	long sysctl_tcp_mem[3];
>>
>>   	atomic_t rt_genid;
>>   	atomic_t dev_addr_genid;
>
> Hmm, in original placement, sysctl_tcp_mem[] was on __read_mostly
> area. Doesn't this placement cause many cache invalidations ?
>
Yes, you are right. I will move back to the old way of doing it.

>
>> diff --git a/include/net/tcp.h b/include/net/tcp.h
>> index 149a415..6bfdd9b 100644
>> --- a/include/net/tcp.h
>> +++ b/include/net/tcp.h
>> @@ -230,7 +230,6 @@ extern int sysctl_tcp_fack;
>>   extern int sysctl_tcp_reordering;
>>   extern int sysctl_tcp_ecn;
>>   extern int sysctl_tcp_dsack;
>> -extern long sysctl_tcp_mem[3];
>>   extern int sysctl_tcp_wmem[3];
>>   extern int sysctl_tcp_rmem[3];
>>   extern int sysctl_tcp_app_win;
>> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
>> index 69fd720..0d74b9d 100644
>> --- a/net/ipv4/sysctl_net_ipv4.c
>> +++ b/net/ipv4/sysctl_net_ipv4.c
>> @@ -14,6 +14,7 @@
>>   #include<linux/init.h>
>>   #include<linux/slab.h>
>>   #include<linux/nsproxy.h>
>> +#include<linux/swap.h>
>>   #include<net/snmp.h>
>>   #include<net/icmp.h>
>>   #include<net/ip.h>
>> @@ -174,6 +175,36 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
>>   	return ret;
>>   }
>>
>> +static int ipv4_tcp_mem(ctl_table *ctl, int write,
>> +			   void __user *buffer, size_t *lenp,
>> +			   loff_t *ppos)
>> +{
>> +	int ret;
>> +	unsigned long vec[3];
>> +	struct net *net = current->nsproxy->net_ns;
>> +	int i;
>> +
>> +	ctl_table tmp = {
>> +		.data =&vec,
>> +		.maxlen = sizeof(vec),
>> +		.mode = ctl->mode,
>> +	};
>> +
>> +	if (!write) {
>> +		ctl->data =&net->ipv4.sysctl_tcp_mem;
>> +		return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos);
>> +	}
>> +
>> +	ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
>> +	if (ret)
>> +		return ret;
>> +
>> +	for (i = 0; i<  3; i++)
>> +		net->ipv4.sysctl_tcp_mem[i] = vec[i];
>> +
>> +	return 0;
>> +}
>> +
>>   static struct ctl_table ipv4_table[] = {
>>   	{
>>   		.procname	= "tcp_timestamps",
>> @@ -433,13 +464,6 @@ static struct ctl_table ipv4_table[] = {
>>   		.proc_handler	= proc_dointvec
>>   	},
>>   	{
>> -		.procname	= "tcp_mem",
>> -		.data		=&sysctl_tcp_mem,
>> -		.maxlen		= sizeof(sysctl_tcp_mem),
>> -		.mode		= 0644,
>> -		.proc_handler	= proc_doulongvec_minmax
>> -	},
>> -	{
>>   		.procname	= "tcp_wmem",
>>   		.data		=&sysctl_tcp_wmem,
>>   		.maxlen		= sizeof(sysctl_tcp_wmem),
>> @@ -721,6 +745,12 @@ static struct ctl_table ipv4_net_table[] = {
>>   		.mode		= 0644,
>>   		.proc_handler	= ipv4_ping_group_range,
>>   	},
>> +	{
>> +		.procname	= "tcp_mem",
>> +		.maxlen		= sizeof(init_net.ipv4.sysctl_tcp_mem),
>> +		.mode		= 0644,
>> +		.proc_handler	= ipv4_tcp_mem,
>> +	},
>>   	{ }
>>   };
>>
>
>
>
>
>> @@ -734,6 +764,7 @@ EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
>>   static __net_init int ipv4_sysctl_init_net(struct net *net)
>>   {
>>   	struct ctl_table *table;
>> +	unsigned long limit;
>>
>>   	table = ipv4_net_table;
>>   	if (!net_eq(net,&init_net)) {
>> @@ -769,6 +800,12 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
>>
>>   	net->ipv4.sysctl_rt_cache_rebuild_count = 4;
>>
>> +	limit = nr_free_buffer_pages() / 8;
>> +	limit = max(limit, 128UL);
>> +	net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
>> +	net->ipv4.sysctl_tcp_mem[1] = limit;
>> +	net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
>> +
>>   	net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
>>   			net_ipv4_ctl_path, table);
>>   	if (net->ipv4.ipv4_hdr == NULL)
>> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
>> index 46febca..f06df24 100644
>> --- a/net/ipv4/tcp.c
>> +++ b/net/ipv4/tcp.c
>> @@ -266,6 +266,7 @@
>>   #include<linux/crypto.h>
>>   #include<linux/time.h>
>>   #include<linux/slab.h>
>> +#include<linux/nsproxy.h>
>>
>>   #include<net/icmp.h>
>>   #include<net/tcp.h>
>> @@ -282,11 +283,9 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
>>   struct percpu_counter tcp_orphan_count;
>>   EXPORT_SYMBOL_GPL(tcp_orphan_count);
>>
>> -long sysctl_tcp_mem[3] __read_mostly;
>>   int sysctl_tcp_wmem[3] __read_mostly;
>>   int sysctl_tcp_rmem[3] __read_mostly;
>>
>> -EXPORT_SYMBOL(sysctl_tcp_mem);
>>   EXPORT_SYMBOL(sysctl_tcp_rmem);
>>   EXPORT_SYMBOL(sysctl_tcp_wmem);
>>
>> @@ -3277,14 +3276,10 @@ void __init tcp_init(void)
>>   	sysctl_tcp_max_orphans = cnt / 2;
>>   	sysctl_max_syn_backlog = max(128, cnt / 256);
>>
>> -	limit = nr_free_buffer_pages() / 8;
>> -	limit = max(limit, 128UL);
>> -	sysctl_tcp_mem[0] = limit / 4 * 3;
>> -	sysctl_tcp_mem[1] = limit;
>> -	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
>> -
>>   	/* Set per-socket limits to no more than 1/128 the pressure threshold */
>> -	limit = ((unsigned long)sysctl_tcp_mem[1])<<  (PAGE_SHIFT - 7);
>> +	limit = (unsigned long)init_net.ipv4.sysctl_tcp_mem[1];
>> +	limit<<= (PAGE_SHIFT - 7);
>> +
>
> I'm not sure but...why defined as 'long'  ?
>

It is part of the "it was there
before" bundle.

It is defined as long not only for tcp, but for all of the
equivalents sysctl as well. So no reason to touch it, at least not
in this series =)

>
> BTW, when I grep,
>
> tcp_input.c:        atomic_long_read(&tcp_memory_allocated)<  sysctl_tcp_mem[0])
> tcp_input.c:    if (atomic_long_read(&tcp_memory_allocated)>= sysctl_tcp_mem[0])
>
> Don't you need to change this ?

It ended up being changed in another patch, and I missed the right
split.

Thank you, I will reorder it so it gets changed correctly.
>
>
> Thanks,
> -Kame
>
>
>
>
>
>
>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox