Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [RFC 1/2] PCI-Express Non-Transparent Bridge Support
From: Greg KH @ 2012-07-16 18:30 UTC (permalink / raw)
  To: Jon Mason; +Cc: linux-kernel, netdev, linux-pci, Dave Jiang
In-Reply-To: <20120716175505.GF9598@jonmason-lab>

On Mon, Jul 16, 2012 at 10:55:06AM -0700, Jon Mason wrote:
> On Sun, Jul 15, 2012 at 05:19:21PM -0700, Greg KH wrote:
> > On Sun, Jul 15, 2012 at 04:55:48PM -0700, Jon Mason wrote:
> > > On Sat, Jul 14, 2012 at 10:10:15AM -0700, Greg KH wrote:
> > > > On Fri, Jul 13, 2012 at 02:44:59PM -0700, Jon Mason wrote:
> > > > > +static int max_num_cbs = 2;
> > > > > +module_param(max_num_cbs, uint, 0644);
> > > > > +MODULE_PARM_DESC(max_num_cbs, "Maximum number of NTB transport connections");
> > > > > +
> > > > > +static bool no_msix;
> > > > > +module_param(no_msix, bool, 0644);
> > > > > +MODULE_PARM_DESC(no_msix, "Do not allow MSI-X interrupts to be selected");
> > > > 
> > > > How would a user, or a distro, know to set these options?  Why are they
> > > > even options at all?
> > > 
> > > Good question.  There is actually a potential benefit to disabling
> > > MSI-X.  The NTB device on one of our platforms only has 3 MSI-X
> > > vectors.  In the current driver design, that would limit them to 3
> > > client/virtual devices.  However, there are 15 bits in the ISR that can
> > > be used for the same purpose.  So, if you disable MSI-X, you can have
> > > 15 instead of 3.  
> > 
> > But again, how would a user, or a distro, know to set these?  Where is
> > the documentation describing it?  Why really have these options at all
> > and not just fix the platform issues (only 3 MSI-X vectors?  Really?)
> 
> I believe we'll want multiple clients (or have multiqueue Ethernet).
> I'm happy to add something to /Documentation to describe it and why it
> would be useful, or I can remove it and re-introduce it when I add
> multiqueue Ethernet.

I'd suggest waiting and adding it later if really needed (see previous
comment about not adding code/features before they are actually needed.)

thanks,

greg k-h

^ permalink raw reply

* Re: [RFC 1/2] PCI-Express Non-Transparent Bridge Support
From: chetan loke @ 2012-07-16 18:26 UTC (permalink / raw)
  To: Jon Mason; +Cc: linux-kernel, netdev, linux-pci, Dave Jiang
In-Reply-To: <1342215900-3358-1-git-send-email-jon.mason@intel.com>

Jon,

On Fri, Jul 13, 2012 at 5:44 PM, Jon Mason <jon.mason@intel.com> wrote:

..............

> +/**
> + * ntb_ring_sdb() - Set the doorbell on the secondary/external side
> + * @ndev: pointer to ntb_device instance
> + * @db: doorbell to ring
> + *
> + * This function allows triggering of a doorbell on the secondary/external
> + * side that will initiate an interrupt on the remote host
> + *
> + * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
> + */
> +int ntb_ring_sdb(struct ntb_device *ndev, unsigned int db)
> +{
> +       dev_dbg(&ndev->pdev->dev, "%s: ringing doorbell %d\n", __func__, db);
> +

> +       if (db >= ndev->max_cbs)
> +               return -EINVAL;

How about moving this max_cbs error check into the upper-level
callers (for example, ntb_process_tx)?
That way you won't have to defer handling some negative cases all the
way till the end.

So ntb_process_tx could now look like:

.....
error=0;
if (entry->len > transport_mtu) {
...
error=1;
}
else if (qp->qp_num >= qp->ndev->max_cbs) {
...
error=1;
}

if (unlikely(error)) {
      ntb_list_add_tail(&qp->txc_lock, &entry->entry, &qp->txc);
      if (qp->tx_handler)
           qp->tx_handler(qp);

      return 0;
}
.................

No further comments below

> +
> +static int ntb_process_tx(struct ntb_transport_qp *qp,
> +                         struct ntb_queue_entry *entry)
> +{
> +       struct ntb_payload_header *hdr;
> +       void *offset;
> +
> +       offset = qp->tx_offset;
> +       hdr = offset;
> +
> +       pr_debug("%lld - offset %p, tx %p, entry len %d flags %x buff %p\n",
> +                qp->tx_pkts, offset, qp->tx_offset, entry->len, entry->flags,
> +                entry->buf);
> +       if (hdr->flags) {
> +               ntb_list_add_head(&qp->txq_lock, &entry->entry, &qp->txq);
> +               qp->tx_ring_full++;
> +               return -EAGAIN;
> +       }
> +
> +       if (entry->len > transport_mtu) {
> +               pr_err("Trying to send pkt size of %d\n", entry->len);
> +               entry->flags = HW_ERROR_FLAG;
> +
> +               ntb_list_add_tail(&qp->txc_lock, &entry->entry, &qp->txc);
> +
> +               if (qp->tx_handler)
> +                       qp->tx_handler(qp);
> +
> +               return 0;
> +       }
> +
> +       ntb_tx_copy_task(qp, entry, offset);
> +
> +       qp->tx_offset =
> +           (qp->tx_offset +
> +            ((transport_mtu + sizeof(struct ntb_payload_header)) * 2) >=
> +            qp->tx_mw_end) ? qp->tx_mw_begin : qp->tx_offset + transport_mtu +
> +           sizeof(struct ntb_payload_header);
> +
> +       qp->tx_pkts++;
> +
> +       return 0;
> +}
> +


Chetan Loke

^ permalink raw reply

* Re: [RFC 1/2] PCI-Express Non-Transparent Bridge Support
From: Jon Mason @ 2012-07-16 17:55 UTC (permalink / raw)
  To: Greg KH; +Cc: linux-kernel, netdev, linux-pci, Dave Jiang
In-Reply-To: <20120716001921.GA19775@kroah.com>

On Sun, Jul 15, 2012 at 05:19:21PM -0700, Greg KH wrote:
> On Sun, Jul 15, 2012 at 04:55:48PM -0700, Jon Mason wrote:
> > On Sat, Jul 14, 2012 at 10:10:15AM -0700, Greg KH wrote:
> > > On Fri, Jul 13, 2012 at 02:44:59PM -0700, Jon Mason wrote:
> > > > +static int max_num_cbs = 2;
> > > > +module_param(max_num_cbs, uint, 0644);
> > > > +MODULE_PARM_DESC(max_num_cbs, "Maximum number of NTB transport connections");
> > > > +
> > > > +static bool no_msix;
> > > > +module_param(no_msix, bool, 0644);
> > > > +MODULE_PARM_DESC(no_msix, "Do not allow MSI-X interrupts to be selected");
> > > 
> > > How would a user, or a distro, know to set these options?  Why are they
> > > even options at all?
> > 
> > Good question.  There is actually a potential benefit to disabling
> > MSI-X.  The NTB device on one of our platforms only has 3 MSI-X
> > vectors.  In the current driver design, that would limit them to 3
> > client/virtual devices.  However, there are 15 bits in the ISR that can
> > be used for the same purpose.  So, if you disable MSI-X, you can have
> > 15 instead of 3.  
> 
> But again, how would a user, or a distro, know to set these?  Where is
> the documentation describing it?  Why really have these options at all
> and not just fix the platform issues (only 3 MSI-X vectors?  Really?)

I believe we'll want multiple clients (or have multiqueue Ethernet).  I'm happy to add something to /Documentation to describe it and why it would be useful, or I can remove it and re-introduce it when I add multiqueue Ethernet.

3 MSI-X vectors (plus one for PCI-E link up/down) on Xeon NTB, and 33 for Atom NTB.  Yeah, really.

> 
> thanks,
> 
> greg k-h

^ permalink raw reply

* Re: [PATCH v6 4/7] net, ethernet, davinci_emac: add OF support
From: Sekhar Nori @ 2012-07-16 17:27 UTC (permalink / raw)
  To: David Miller
  Cc: Heiko Schocher, davinci-linux-open-source, linux-arm-kernel,
	devicetree-discuss, netdev, Grant Likely, Wolfgang Denk,
	Anatoly Sivov
In-Reply-To: <1341823456-32297-1-git-send-email-hs@denx.de>

Hi Dave,

On 7/9/2012 2:14 PM, Heiko Schocher wrote:
> add OF support for the davinci_emac driver.
> 
> Signed-off-by: Heiko Schocher <hs@denx.de>
> Acked-by: Sekhar Nori <nsekhar@ti.com>
> Cc: davinci-linux-open-source@linux.davincidsp.com
> Cc: linux-arm-kernel@lists.infradead.org
> Cc: devicetree-discuss@lists.ozlabs.org
> Cc: netdev@vger.kernel.org
> Cc: Grant Likely <grant.likely@secretlab.ca>
> Cc: Sekhar Nori <nsekhar@ti.com>
> Cc: Wolfgang Denk <wd@denx.de>
> Cc: Anatoly Sivov <mm05@mail.ru>
> Cc: David Miller <davem@davemloft.net>

Can you please consider this patch for v3.6? I tested it on DaVinci
AM18x EVM with and without CONFIG_OF using NFS root.

This patch can be independently queued and does not have any dependencies.

Thanks,
Sekhar

^ permalink raw reply

* Re: [PATCH] mlx4_en: map entire pages to increase throughput
From: Rick Jones @ 2012-07-16 17:27 UTC (permalink / raw)
  To: Thadeu Lima de Souza Cascardo
  Cc: davem@davemloft.net, netdev@vger.kernel.org,
	yevgenyp@mellanox.co.il, ogerlitz@mellanox.com,
	amirv@mellanox.com, brking@linux.vnet.ibm.com,
	leitao@linux.vnet.ibm.com, klebers@linux.vnet.ibm.com
In-Reply-To: <1342458113-10384-1-git-send-email-cascardo@linux.vnet.ibm.com>

On 07/16/2012 10:01 AM, Thadeu Lima de Souza Cascardo wrote:
> In its receive path, mlx4_en driver maps each page chunk that it pushes
> to the hardware and unmaps it when pushing it up the stack. This limits
> throughput to about 3Gbps on a Power7 8-core machine.

That seems rather extraordinarily low - Power7 is supposed to be a 
rather high performance CPU.  The last time I noticed O(3Gbit/s) on 10G 
for bulk transfer was before the advent of LRO/GRO - that was in the x86 
space though.  Is mapping really that expensive with Power7?


> One solution is to map the entire allocated page at once. However, this
> requires that we keep track of every page fragment we give to a
> descriptor. We also need to work with the discipline that all fragments will
> be released (in the sense that it will not be reused by the driver
> anymore) in the order they are allocated to the driver.
>
> This requires that we don't reuse any fragments, every single one of
> them must be reallocated. We do that by releasing all the fragments that
> are processed and only after finished processing the descriptors, we
> start the refill.
>
> We also must somehow guarantee that we either refill all fragments in a
> descriptor or none at all, without resorting to giving up a page
> fragment that we would have already given. Otherwise, we would break the
> discipline of only releasing the fragments in the order they were
> allocated.
>
> This has passed page allocation fault injections (restricted to the
> driver by using required-start and required-end) and device hotplug
> while 16 TCP streams were able to deliver more than 9Gbps.

What is the effect on packet-per-second performance?  (eg aggregate, 
burst-mode netperf TCP_RR with TCP_NODELAY set or perhaps UDP_RR)

rick jones

^ permalink raw reply

* Re: [PATCH] net-next: make sock diag per-namespace
From: Jan Ceuleers @ 2012-07-16 17:26 UTC (permalink / raw)
  To: Andrew Vagin
  Cc: Eric Dumazet, Andrew Vagin, David S. Miller, Alexey Kuznetsov,
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy, Pavel Emelianov,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org
In-Reply-To: <20120716123815.GA1178@avaginn.sw.ru>

On 07/16/2012 02:38 PM, Andrew Vagin wrote:
> You are right. Sorry for this stupid fault. I will send a new patch.

Before doing so:

Could you put the "net-next" inside the square brackets (being the tree you are aiming your patch at, which should not end up in the commit log), and mention the subsystem in its place? Perhaps simply "net"?

Just a suggestion.

Thanks, Jan

^ permalink raw reply

* Re: 3.4.4/amd64 full interrupt hangs under big nfs copies
From: Marc MERLIN @ 2012-07-16 17:17 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, Larry.Finger, bhutchings, linux-wireless, netdev
In-Reply-To: <1342455717.2830.14.camel@edumazet-glaptop>

On Mon, Jul 16, 2012 at 06:21:57PM +0200, Eric Dumazet wrote:
> > No, it's actually when I'm 'uploading' from my laptop to my server.
> > One interesting thing is that my server is running lvm2 with snapshots,
> > which makes writes slower than my laptop can push data over the network, so
> > it's definitely causing buffers to fill up.
> > I just did a download test and got 4.5MB/s sustained without problems.
> 
> Hmm, nfs apparently is able to push a lot of data, try to reduce
> rsize/wsize to sane values, like 32K instead of 512K ?
> 
> gargamel:/mnt/dshelf2/ /net/gargamel/mnt/dshelf2 nfs4
> rw,nosuid,nodev,relatime,vers=4.0,rsize=524288,wsize=524288,namlen=255,hard,proto=tcp,port=0,timeo=600,retrans=2,sec=sys,clientaddr=192.168.205.7,local_lock=none,addr=192.168.205.3 0 0

Nice catch. That seems like an excessive default from autofs5 5.0.4-3.2+b1

So, it helped. I still got hangs, but this time they were VFS hangs. I
couldn't do anything filesystem related during the 'hangs', but the
interrupts weren't hung anymore, so I could move my mouse cursor.

Having NFS hang all of VFS and local disk is obviously still a problem, but
at this point it may not be a networking (or wireless) related problem.

I'll attach the relevant logs during that attempt. Does that help?

Thanks,
Marc

[76903.011101] SysRq : Show Blocked State
[76903.011110]   task                        PC stack   pid father
[76903.011306] mc              D ffff88021e2d3680     0  9383   9270 0x00000080
[76903.011314]  ffff880111094100 0000000000000082 000000000000000e ffff880213549140
[76903.011322]  0000000000013680 ffff8800140e3fd8 ffff8800140e3fd8 ffff880111094100
[76903.011328]  ffff88021e5c5258 0000000000000000 ffff880111094100 ffff8800140e3e40
[76903.011335] Call Trace:
[76903.011362]  [<ffffffffa06dcdf2>] ? nfs_find_actor+0x66/0x66 [nfs]
[76903.011376]  [<ffffffffa06dce4d>] ? nfs_wait_bit_killable+0x5b/0x6e [nfs]
[76903.011384]  [<ffffffff81360f55>] ? __wait_on_bit_lock+0x3c/0x85
[76903.011391]  [<ffffffff810bb793>] ? filemap_fdatawait_range+0x11b/0x139
[76903.011397]  [<ffffffff8136100d>] ? out_of_line_wait_on_bit_lock+0x6f/0x78
[76903.011410]  [<ffffffffa06dcdf2>] ? nfs_find_actor+0x66/0x66 [nfs]
[76903.011417]  [<ffffffff81052e69>] ? autoremove_wake_function+0x2a/0x2a
[76903.011435]  [<ffffffffa06e8ca2>] ? nfs_commit_inode+0x66/0x27a [nfs]
[76903.011448]  [<ffffffffa06db56e>] ? nfs_file_fsync+0x95/0xf3 [nfs]
[76903.011455]  [<ffffffff811015a9>] ? filp_close+0x3b/0x6a
[76903.011461]  [<ffffffff8110165e>] ? sys_close+0x86/0xc7
[76903.011467]  [<ffffffff8136723d>] ? system_call_fastpath+0x1a/0x1f
[76903.011482] kworker/0:0     D ffff88021e213680     0 13850      2 0x00000080
[76903.011489]  ffff8801fac7d850 0000000000000046 ffff8802117cb848 ffff880140773750
[76903.011495]  0000000000013680 ffff88004c4e7fd8 ffff88004c4e7fd8 ffff8801fac7d850
[76903.011502]  ffff88021e5df9a0 0000000000000000 ffff8801fac7d850 ffffffffa069be59
[76903.011508] Call Trace:
[76903.011524]  [<ffffffffa069be59>] ? rpc_make_runnable+0x6a/0x6a [sunrpc]
[76903.011535]  [<ffffffffa069beb2>] ? rpc_wait_bit_killable+0x59/0x6c [sunrpc]
[76903.011541]  [<ffffffff81361054>] ? __wait_on_bit+0x3e/0x71
[76903.011547]  [<ffffffff81362b73>] ? _raw_spin_unlock_irqrestore+0x30/0x3e
[76903.011553]  [<ffffffff813610f6>] ? out_of_line_wait_on_bit+0x6f/0x78
[76903.011565]  [<ffffffffa069be59>] ? rpc_make_runnable+0x6a/0x6a [sunrpc]
[76903.011570]  [<ffffffff81052e69>] ? autoremove_wake_function+0x2a/0x2a
[76903.011587]  [<ffffffffa06e7bdf>] ? nfs_initiate_commit+0xf4/0x105 [nfs]
[76903.011604]  [<ffffffffa06e8e30>] ? nfs_commit_inode+0x1f4/0x27a [nfs]
[76903.011617]  [<ffffffffa06db97c>] ? nfs_release_page+0x56/0x73 [nfs]
[76903.011626]  [<ffffffff810ca356>] ? shrink_page_list+0x556/0x739
[76903.011635]  [<ffffffff8105dd51>] ? get_parent_ip+0x9/0x1b
[76903.011640]  [<ffffffff8136583e>] ? sub_preempt_count+0x83/0x94
[76903.011646]  [<ffffffff810c91eb>] ? update_isolated_counts.isra.44+0x148/0x16e
[76903.011653]  [<ffffffff810ca9a3>] ? shrink_inactive_list+0x2b1/0x446
[76903.011661]  [<ffffffff810cb182>] ? shrink_mem_cgroup_zone+0x371/0x480
[76903.011668]  [<ffffffff810cb2f3>] ? shrink_zone+0x62/0x9b
[76903.011675]  [<ffffffff810cb73c>] ? do_try_to_free_pages+0x1e4/0x434
[76903.011682]  [<ffffffff810cbc11>] ? try_to_free_pages+0xb3/0xf9
[76903.011688]  [<ffffffff8105931b>] ? should_resched+0x5/0x23
[76903.011695]  [<ffffffff810c24a2>] ? __alloc_pages_nodemask+0x4ef/0x7df
[76903.011702]  [<ffffffff8105dd51>] ? get_parent_ip+0x9/0x1b
[76903.011711]  [<ffffffff810ecf10>] ? alloc_pages_current+0xc7/0xe4
[76903.011723]  [<ffffffffa04ca247>] ? iwlagn_rx_allocate+0x97/0x24d [iwlwifi]
[76903.011734]  [<ffffffffa04ca81e>] ? iwlagn_rx_replenish+0x3a/0x3a [iwlwifi]
[76903.011744]  [<ffffffffa04ca7fc>] ? iwlagn_rx_replenish+0x18/0x3a [iwlwifi]
[76903.011750]  [<ffffffff8104ea7d>] ? process_one_work+0x16d/0x298
[76903.011757]  [<ffffffff8104f4d9>] ? worker_thread+0xc2/0x145
[76903.011763]  [<ffffffff8104f417>] ? manage_workers.isra.23+0x15b/0x15b
[76903.011768]  [<ffffffff81052788>] ? kthread+0x7d/0x85
[76903.011774]  [<ffffffff813686a4>] ? kernel_thread_helper+0x4/0x10
[76903.011780]  [<ffffffff8105270b>] ? kthread_freezable_should_stop+0x37/0x37
[76903.011786]  [<ffffffff813686a0>] ? gs_change+0x13/0x13
[76903.011797] Sched Debug Version: v0.10, 3.4.4-amd64-preempt-noide-20120410 #1

and

[76843.153742] 
[76873.080978] SysRq : Show Blocked State
[76873.080987]   task                        PC stack   pid father
[76873.081200] mc              D ffff88021e293680     0  9383   9270 0x00000080
[76873.081208]  ffff880111094100 0000000000000082 0000000000000001 ffff8802135107d0
[76873.081216]  0000000000013680 ffff8800140e3fd8 ffff8800140e3fd8 ffff880111094100
[76873.081222]  ffff88010c9033d0 ffff88021e293680 ffff880111094100 ffffffff810bb429
[76873.081229] Call Trace:
[76873.081241]  [<ffffffff810bb429>] ? __lock_page+0x66/0x66
[76873.081249]  [<ffffffff81362059>] ? io_schedule+0x55/0x6b
[76873.081254]  [<ffffffff810bb42f>] ? sleep_on_page+0x6/0xa
[76873.081260]  [<ffffffff81361054>] ? __wait_on_bit+0x3e/0x71
[76873.081265]  [<ffffffff810bb577>] ? wait_on_page_bit+0x6e/0x73
[76873.081272]  [<ffffffff81052e69>] ? autoremove_wake_function+0x2a/0x2a
[76873.081278]  [<ffffffff810bb6ec>] ? filemap_fdatawait_range+0x74/0x139
[76873.081285]  [<ffffffff810bc2e8>] ? filemap_write_and_wait_range+0x3b/0x4d
[76873.081308]  [<ffffffffa06db536>] ? nfs_file_fsync+0x5d/0xf3 [nfs]
[76873.081317]  [<ffffffff811015a9>] ? filp_close+0x3b/0x6a
[76873.081323]  [<ffffffff8110165e>] ? sys_close+0x86/0xc7
[76873.081330]  [<ffffffff8136723d>] ? system_call_fastpath+0x1a/0x1f
[76873.081346] kworker/0:0     D ffff88021e213680     0 13850      2 0x00000080
[76873.081352]  ffff8801fac7d850 0000000000000046 ffff880186753ce8 ffff880126d7f040
[76873.081358]  0000000000013680 ffff88004c4e7fd8 ffff88004c4e7fd8 ffff8801fac7d850
[76873.081365]  ffff8801c5ae1d70 ffff88021e213680 ffff8801fac7d850 ffffffff810bb429
[76873.081371] Call Trace:
[76873.081376]  [<ffffffff810bb429>] ? __lock_page+0x66/0x66
[76873.081381]  [<ffffffff81362059>] ? io_schedule+0x55/0x6b
[76873.081386]  [<ffffffff810bb42f>] ? sleep_on_page+0x6/0xa
[76873.081391]  [<ffffffff81361054>] ? __wait_on_bit+0x3e/0x71
[76873.081396]  [<ffffffff810bb577>] ? wait_on_page_bit+0x6e/0x73
[76873.081402]  [<ffffffff81052e69>] ? autoremove_wake_function+0x2a/0x2a
[76873.081411]  [<ffffffff810c9f66>] ? shrink_page_list+0x166/0x739
[76873.081420]  [<ffffffff8105dd51>] ? get_parent_ip+0x9/0x1b
[76873.081425]  [<ffffffff8136583e>] ? sub_preempt_count+0x83/0x94
[76873.081431]  [<ffffffff810c91eb>] ? update_isolated_counts.isra.44+0x148/0x16e
[76873.081438]  [<ffffffff810ca9a3>] ? shrink_inactive_list+0x2b1/0x446
[76873.081446]  [<ffffffff810cb182>] ? shrink_mem_cgroup_zone+0x371/0x480
[76873.081454]  [<ffffffff810cb2f3>] ? shrink_zone+0x62/0x9b
[76873.081460]  [<ffffffff810cb73c>] ? do_try_to_free_pages+0x1e4/0x434
[76873.081467]  [<ffffffff810cbc11>] ? try_to_free_pages+0xb3/0xf9
[76873.081473]  [<ffffffff8105931b>] ? should_resched+0x5/0x23
[76873.081481]  [<ffffffff810c24a2>] ? __alloc_pages_nodemask+0x4ef/0x7df
[76873.081487]  [<ffffffff8105dd51>] ? get_parent_ip+0x9/0x1b
[76873.081497]  [<ffffffff810ecf10>] ? alloc_pages_current+0xc7/0xe4
[76873.081510]  [<ffffffffa04ca247>] ? iwlagn_rx_allocate+0x97/0x24d [iwlwifi]
[76873.081521]  [<ffffffffa04ca81e>] ? iwlagn_rx_replenish+0x3a/0x3a [iwlwifi]
[76873.081530]  [<ffffffffa04ca7fc>] ? iwlagn_rx_replenish+0x18/0x3a [iwlwifi]
[76873.081538]  [<ffffffff8104ea7d>] ? process_one_work+0x16d/0x298
[76873.081545]  [<ffffffff8104f4d9>] ? worker_thread+0xc2/0x145
[76873.081551]  [<ffffffff8104f417>] ? manage_workers.isra.23+0x15b/0x15b
[76873.081556]  [<ffffffff81052788>] ? kthread+0x7d/0x85
[76873.081562]  [<ffffffff813686a4>] ? kernel_thread_helper+0x4/0x10
[76873.081568]  [<ffffffff8105270b>] ? kthread_freezable_should_stop+0x37/0x37
[76873.081574]  [<ffffffff813686a0>] ? gs_change+0x13/0x13
[76873.081585] 192.168.205.3-m D ffff88021e293680     0 14532      2 0x00000080
[76873.081590]  ffff880206d600c0 0000000000000046 ffff880186733e60 ffff88004b4230c0
[76873.081597]  0000000000013680 ffff880022305fd8 ffff880022305fd8 ffff880206d600c0
[76873.081603]  ffff88021e5bb778 0000000000000000 ffff880206d600c0 ffffffffa069be59
[76873.081609] Call Trace:
[76873.081625]  [<ffffffffa069be59>] ? rpc_make_runnable+0x6a/0x6a [sunrpc]
[76873.081637]  [<ffffffffa069beb2>] ? rpc_wait_bit_killable+0x59/0x6c [sunrpc]
[76873.081642]  [<ffffffff81361054>] ? __wait_on_bit+0x3e/0x71
[76873.081648]  [<ffffffff81362b73>] ? _raw_spin_unlock_irqrestore+0x30/0x3e
[76873.081654]  [<ffffffff813610f6>] ? out_of_line_wait_on_bit+0x6f/0x78
[76873.081665]  [<ffffffffa069be59>] ? rpc_make_runnable+0x6a/0x6a [sunrpc]
[76873.081671]  [<ffffffff81052e69>] ? autoremove_wake_function+0x2a/0x2a
[76873.081690]  [<ffffffffa06efb13>] ? nfs4_run_open_task+0x101/0x12e [nfs]
[76873.081709]  [<ffffffffa06f12fb>] ? nfs4_open_recover_helper+0xbd/0x13f [nfs]
[76873.081724]  [<ffffffffa06f13e1>] ? nfs4_open_recover+0x64/0x113 [nfs]
[76873.081740]  [<ffffffffa06f36a2>] ? nfs4_open_expired+0x69/0xc4 [nfs]
[76873.081761]  [<ffffffffa06ff5b8>] ? nfs4_do_reclaim+0x109/0x4a0 [nfs]
[76873.081779]  [<ffffffffa06fe7cb>] ? nfs4_state_clear_reclaim_reboot.part.7+0xf6/0x10a [nfs]
[76873.081797]  [<ffffffffa06ffcb2>] ? nfs4_run_state_manager+0x363/0x52e [nfs]
[76873.081814]  [<ffffffffa06ff94f>] ? nfs4_do_reclaim+0x4a0/0x4a0 [nfs]
[76873.081819]  [<ffffffff81052788>] ? kthread+0x7d/0x85
[76873.081825]  [<ffffffff813686a4>] ? kernel_thread_helper+0x4/0x10
[76873.081830]  [<ffffffff8105270b>] ? kthread_freezable_should_stop+0x37/0x37
[76873.081836]  [<ffffffff813686a0>] ? gs_change+0x13/0x13
[76873.081842] Sched Debug Version: v0.10, 3.4.4-amd64-preempt-noide-20120410 #1
-- 
"A mouse is a device used to point at the xterm you want to type in" - A.S.R.
Microsoft is to operating systems ....
                                      .... what McDonalds is to gourmet cooking
Home page: http://marc.merlins.org/  

^ permalink raw reply

* [PATCH] mlx4_en: map entire pages to increase throughput
From: Thadeu Lima de Souza Cascardo @ 2012-07-16 17:01 UTC (permalink / raw)
  To: davem
  Cc: netdev, yevgenyp, ogerlitz, amirv, brking, leitao, klebers,
	Thadeu Lima de Souza Cascardo

In its receive path, mlx4_en driver maps each page chunk that it pushes
to the hardware and unmaps it when pushing it up the stack. This limits
throughput to about 3Gbps on a Power7 8-core machine.

One solution is to map the entire allocated page at once. However, this
requires that we keep track of every page fragment we give to a
descriptor. We also need to work with the discipline that all fragments will
be released (in the sense that it will not be reused by the driver
anymore) in the order they are allocated to the driver.

This requires that we don't reuse any fragments, every single one of
them must be reallocated. We do that by releasing all the fragments that
are processed and only after finished processing the descriptors, we
start the refill.

We also must somehow guarantee that we either refill all fragments in a
descriptor or none at all, without resorting to giving up a page
fragment that we would have already given. Otherwise, we would break the
discipline of only releasing the fragments in the order they were
allocated.

This has passed page allocation fault injections (restricted to the
driver by using required-start and required-end) and device hotplug
while 16 TCP streams were able to deliver more than 9Gbps.

Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@linux.vnet.ibm.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c   |  237 ++++++++++++++------------
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h |    3 +-
 2 files changed, 131 insertions(+), 109 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index a04cbf7..37ac073 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -41,41 +41,75 @@
 
 #include "mlx4_en.h"
 
-
-static int mlx4_en_alloc_frag(struct mlx4_en_priv *priv,
-			      struct mlx4_en_rx_desc *rx_desc,
-			      struct page_frag *skb_frags,
-			      struct mlx4_en_rx_alloc *ring_alloc,
-			      int i)
+static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
+			       struct mlx4_en_rx_desc *rx_desc,
+			       struct mlx4_en_rx_alloc *frags,
+			       struct mlx4_en_rx_alloc *ring_alloc)
 {
-	struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
-	struct mlx4_en_rx_alloc *page_alloc = &ring_alloc[i];
+	struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS];
+	struct mlx4_en_frag_info *frag_info;
 	struct page *page;
 	dma_addr_t dma;
+	int i;
 
-	if (page_alloc->offset == frag_info->last_offset) {
-		/* Allocate new page */
-		page = alloc_pages(GFP_ATOMIC | __GFP_COMP, MLX4_EN_ALLOC_ORDER);
-		if (!page)
-			return -ENOMEM;
-
-		skb_frags[i].page = page_alloc->page;
-		skb_frags[i].offset = page_alloc->offset;
-		page_alloc->page = page;
-		page_alloc->offset = frag_info->frag_align;
-	} else {
-		page = page_alloc->page;
-		get_page(page);
+	for (i = 0; i < priv->num_frags; i++) {
+		frag_info = &priv->frag_info[i];
+		if (ring_alloc[i].offset == frag_info->last_offset) {
+			page = alloc_pages(GFP_ATOMIC | __GFP_COMP,
+					MLX4_EN_ALLOC_ORDER);
+			if (!page)
+				goto out;
+			dma = dma_map_page(priv->ddev, page, 0,
+				MLX4_EN_ALLOC_SIZE, PCI_DMA_FROMDEVICE);
+			if (dma_mapping_error(priv->ddev, dma)) {
+				put_page(page);
+				goto out;
+			}
+			page_alloc[i].page = page;
+			page_alloc[i].dma = dma;
+			page_alloc[i].offset = frag_info->frag_align;
+		} else {
+			page_alloc[i].page = ring_alloc[i].page;
+			get_page(ring_alloc[i].page);
+			page_alloc[i].dma = ring_alloc[i].dma;
+			page_alloc[i].offset = ring_alloc[i].offset +
+						frag_info->frag_stride;
+		}
+	}
 
-		skb_frags[i].page = page;
-		skb_frags[i].offset = page_alloc->offset;
-		page_alloc->offset += frag_info->frag_stride;
+	for (i = 0; i < priv->num_frags; i++) {
+		frags[i] = ring_alloc[i];
+		dma = ring_alloc[i].dma + ring_alloc[i].offset;
+		ring_alloc[i] = page_alloc[i];
+		rx_desc->data[i].addr = cpu_to_be64(dma);
 	}
-	dma = dma_map_single(priv->ddev, page_address(skb_frags[i].page) +
-			     skb_frags[i].offset, frag_info->frag_size,
-			     PCI_DMA_FROMDEVICE);
-	rx_desc->data[i].addr = cpu_to_be64(dma);
+
 	return 0;
+
+
+out:
+	while (i--) {
+		frag_info = &priv->frag_info[i];
+		if (ring_alloc[i].offset == frag_info->last_offset)
+			dma_unmap_page(priv->ddev, page_alloc[i].dma,
+				MLX4_EN_ALLOC_SIZE, PCI_DMA_FROMDEVICE);
+		put_page(page_alloc[i].page);
+	}
+	return -ENOMEM;
+}
+
+static void mlx4_en_free_frag(struct mlx4_en_priv *priv,
+			      struct mlx4_en_rx_alloc *frags,
+			      int i)
+{
+	struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
+
+	if (frags[i].offset == frag_info->last_offset) {
+		dma_unmap_page(priv->ddev, frags[i].dma, MLX4_EN_ALLOC_SIZE,
+					 PCI_DMA_FROMDEVICE);
+	}
+	if (frags[i].page)
+		put_page(frags[i].page);
 }
 
 static int mlx4_en_init_allocator(struct mlx4_en_priv *priv,
@@ -91,6 +125,13 @@ static int mlx4_en_init_allocator(struct mlx4_en_priv *priv,
 		if (!page_alloc->page)
 			goto out;
 
+		page_alloc->dma = dma_map_page(priv->ddev, page_alloc->page, 0,
+					MLX4_EN_ALLOC_SIZE, PCI_DMA_FROMDEVICE);
+		if (dma_mapping_error(priv->ddev, page_alloc->dma)) {
+			put_page(page_alloc->page);
+			page_alloc->page = NULL;
+			goto out;
+		}
 		page_alloc->offset = priv->frag_info[i].frag_align;
 		en_dbg(DRV, priv, "Initialized allocator:%d with page:%p\n",
 		       i, page_alloc->page);
@@ -100,6 +141,8 @@ static int mlx4_en_init_allocator(struct mlx4_en_priv *priv,
 out:
 	while (i--) {
 		page_alloc = &ring->page_alloc[i];
+		dma_unmap_page(priv->ddev, page_alloc->dma,
+				MLX4_EN_ALLOC_SIZE, PCI_DMA_FROMDEVICE);
 		put_page(page_alloc->page);
 		page_alloc->page = NULL;
 	}
@@ -117,24 +160,22 @@ static void mlx4_en_destroy_allocator(struct mlx4_en_priv *priv,
 		en_dbg(DRV, priv, "Freeing allocator:%d count:%d\n",
 		       i, page_count(page_alloc->page));
 
+		dma_unmap_page(priv->ddev, page_alloc->dma,
+				MLX4_EN_ALLOC_SIZE, PCI_DMA_FROMDEVICE);
 		put_page(page_alloc->page);
 		page_alloc->page = NULL;
 	}
 }
 
-
 static void mlx4_en_init_rx_desc(struct mlx4_en_priv *priv,
 				 struct mlx4_en_rx_ring *ring, int index)
 {
 	struct mlx4_en_rx_desc *rx_desc = ring->buf + ring->stride * index;
-	struct skb_frag_struct *skb_frags = ring->rx_info +
-					    (index << priv->log_rx_info);
 	int possible_frags;
 	int i;
 
 	/* Set size and memtype fields */
 	for (i = 0; i < priv->num_frags; i++) {
-		skb_frag_size_set(&skb_frags[i], priv->frag_info[i].frag_size);
 		rx_desc->data[i].byte_count =
 			cpu_to_be32(priv->frag_info[i].frag_size);
 		rx_desc->data[i].lkey = cpu_to_be32(priv->mdev->mr.key);
@@ -151,29 +192,14 @@ static void mlx4_en_init_rx_desc(struct mlx4_en_priv *priv,
 	}
 }
 
-
 static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
 				   struct mlx4_en_rx_ring *ring, int index)
 {
 	struct mlx4_en_rx_desc *rx_desc = ring->buf + (index * ring->stride);
-	struct page_frag *skb_frags = ring->rx_info +
-				      (index << priv->log_rx_info);
-	int i;
+	struct mlx4_en_rx_alloc *frags = ring->rx_info +
+					(index << priv->log_rx_info);
 
-	for (i = 0; i < priv->num_frags; i++)
-		if (mlx4_en_alloc_frag(priv, rx_desc, skb_frags, ring->page_alloc, i))
-			goto err;
-
-	return 0;
-
-err:
-	while (i--) {
-		dma_addr_t dma = be64_to_cpu(rx_desc->data[i].addr);
-		pci_unmap_single(priv->mdev->pdev, dma, skb_frags[i].size,
-				 PCI_DMA_FROMDEVICE);
-		put_page(skb_frags[i].page);
-	}
-	return -ENOMEM;
+	return mlx4_en_alloc_frags(priv, rx_desc, frags, ring->page_alloc);
 }
 
 static inline void mlx4_en_update_rx_prod_db(struct mlx4_en_rx_ring *ring)
@@ -185,20 +211,13 @@ static void mlx4_en_free_rx_desc(struct mlx4_en_priv *priv,
 				 struct mlx4_en_rx_ring *ring,
 				 int index)
 {
-	struct page_frag *skb_frags;
-	struct mlx4_en_rx_desc *rx_desc = ring->buf + (index << ring->log_stride);
-	dma_addr_t dma;
+	struct mlx4_en_rx_alloc *frags;
 	int nr;
 
-	skb_frags = ring->rx_info + (index << priv->log_rx_info);
+	frags = ring->rx_info + (index << priv->log_rx_info);
 	for (nr = 0; nr < priv->num_frags; nr++) {
 		en_dbg(DRV, priv, "Freeing fragment:%d\n", nr);
-		dma = be64_to_cpu(rx_desc->data[nr].addr);
-
-		en_dbg(DRV, priv, "Unmapping buffer at dma:0x%llx\n", (u64) dma);
-		dma_unmap_single(priv->ddev, dma, skb_frags[nr].size,
-				 PCI_DMA_FROMDEVICE);
-		put_page(skb_frags[nr].page);
+		mlx4_en_free_frag(priv, frags, nr);
 	}
 }
 
@@ -268,10 +287,9 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
 			   struct mlx4_en_rx_ring *ring, u32 size, u16 stride)
 {
 	struct mlx4_en_dev *mdev = priv->mdev;
-	int err;
+	int err = -ENOMEM;
 	int tmp;
 
-
 	ring->prod = 0;
 	ring->cons = 0;
 	ring->size = size;
@@ -281,7 +299,7 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
 	ring->buf_size = ring->size * ring->stride + TXBB_SIZE;
 
 	tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS *
-					sizeof(struct skb_frag_struct));
+					sizeof(struct mlx4_en_rx_alloc));
 	ring->rx_info = vmalloc(tmp);
 	if (!ring->rx_info)
 		return -ENOMEM;
@@ -338,7 +356,7 @@ int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv)
 		memset(ring->buf, 0, ring->buf_size);
 		mlx4_en_update_rx_prod_db(ring);
 
-		/* Initailize all descriptors */
+		/* Initialize all descriptors */
 		for (i = 0; i < ring->size; i++)
 			mlx4_en_init_rx_desc(priv, ring, i);
 
@@ -401,12 +419,10 @@ void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
 }
 
 
-/* Unmap a completed descriptor and free unused pages */
 static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
 				    struct mlx4_en_rx_desc *rx_desc,
-				    struct page_frag *skb_frags,
+				    struct mlx4_en_rx_alloc *frags,
 				    struct sk_buff *skb,
-				    struct mlx4_en_rx_alloc *page_alloc,
 				    int length)
 {
 	struct skb_frag_struct *skb_frags_rx = skb_shinfo(skb)->frags;
@@ -414,26 +430,24 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
 	int nr;
 	dma_addr_t dma;
 
-	/* Collect used fragments while replacing them in the HW descirptors */
+	/* Collect used fragments while replacing them in the HW descriptors */
 	for (nr = 0; nr < priv->num_frags; nr++) {
 		frag_info = &priv->frag_info[nr];
 		if (length <= frag_info->frag_prefix_size)
 			break;
+		if (!frags[nr].page)
+			goto fail;
 
-		/* Save page reference in skb */
-		__skb_frag_set_page(&skb_frags_rx[nr], skb_frags[nr].page);
-		skb_frag_size_set(&skb_frags_rx[nr], skb_frags[nr].size);
-		skb_frags_rx[nr].page_offset = skb_frags[nr].offset;
-		skb->truesize += frag_info->frag_stride;
 		dma = be64_to_cpu(rx_desc->data[nr].addr);
+		dma_sync_single_for_cpu(priv->ddev, dma, frag_info->frag_size,
+					DMA_FROM_DEVICE);
 
-		/* Allocate a replacement page */
-		if (mlx4_en_alloc_frag(priv, rx_desc, skb_frags, page_alloc, nr))
-			goto fail;
-
-		/* Unmap buffer */
-		dma_unmap_single(priv->ddev, dma, skb_frag_size(&skb_frags_rx[nr]),
-				 PCI_DMA_FROMDEVICE);
+		/* Save page reference in skb */
+		get_page(frags[nr].page);
+		__skb_frag_set_page(&skb_frags_rx[nr], frags[nr].page);
+		skb_frag_size_set(&skb_frags_rx[nr], frag_info->frag_size);
+		skb_frags_rx[nr].page_offset = frags[nr].offset;
+		skb->truesize += frag_info->frag_stride;
 	}
 	/* Adjust size of last fragment to match actual length */
 	if (nr > 0)
@@ -442,8 +456,6 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
 	return nr;
 
 fail:
-	/* Drop all accumulated fragments (which have already been replaced in
-	 * the descriptor) of this packet; remaining fragments are reused... */
 	while (nr > 0) {
 		nr--;
 		__skb_frag_unref(&skb_frags_rx[nr]);
@@ -454,8 +466,7 @@ fail:
 
 static struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv,
 				      struct mlx4_en_rx_desc *rx_desc,
-				      struct page_frag *skb_frags,
-				      struct mlx4_en_rx_alloc *page_alloc,
+				      struct mlx4_en_rx_alloc *frags,
 				      unsigned int length)
 {
 	struct sk_buff *skb;
@@ -473,23 +484,20 @@ static struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv,
 
 	/* Get pointer to first fragment so we could copy the headers into the
 	 * (linear part of the) skb */
-	va = page_address(skb_frags[0].page) + skb_frags[0].offset;
+	va = page_address(frags[0].page) + frags[0].offset;
 
 	if (length <= SMALL_PACKET_SIZE) {
 		/* We are copying all relevant data to the skb - temporarily
-		 * synch buffers for the copy */
+		 * sync buffers for the copy */
 		dma = be64_to_cpu(rx_desc->data[0].addr);
 		dma_sync_single_for_cpu(priv->ddev, dma, length,
 					DMA_FROM_DEVICE);
 		skb_copy_to_linear_data(skb, va, length);
-		dma_sync_single_for_device(priv->ddev, dma, length,
-					   DMA_FROM_DEVICE);
 		skb->tail += length;
 	} else {
-
 		/* Move relevant fragments to skb */
-		used_frags = mlx4_en_complete_rx_desc(priv, rx_desc, skb_frags,
-						      skb, page_alloc, length);
+		used_frags = mlx4_en_complete_rx_desc(priv, rx_desc, frags,
+							skb, length);
 		if (unlikely(!used_frags)) {
 			kfree_skb(skb);
 			return NULL;
@@ -526,12 +534,25 @@ out_loopback:
 	dev_kfree_skb_any(skb);
 }
 
+static void mlx4_en_refill_rx_buffers(struct mlx4_en_priv *priv,
+				     struct mlx4_en_rx_ring *ring)
+{
+	int index = ring->prod & ring->size_mask;
+
+	while ((u32) (ring->prod - ring->cons) < ring->actual_size) {
+		if (mlx4_en_prepare_rx_desc(priv, ring, index))
+			break;
+		ring->prod++;
+		index = ring->prod & ring->size_mask;
+	}
+}
+
 int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int budget)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	struct mlx4_cqe *cqe;
 	struct mlx4_en_rx_ring *ring = &priv->rx_ring[cq->ring];
-	struct page_frag *skb_frags;
+	struct mlx4_en_rx_alloc *frags;
 	struct mlx4_en_rx_desc *rx_desc;
 	struct sk_buff *skb;
 	int index;
@@ -540,6 +561,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 	int polled = 0;
 	int ip_summed;
 	struct ethhdr *ethh;
+	dma_addr_t dma;
 	u64 s_mac;
 
 	if (!priv->port_up)
@@ -555,7 +577,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 	while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
 		    cq->mcq.cons_index & cq->size)) {
 
-		skb_frags = ring->rx_info + (index << priv->log_rx_info);
+		frags = ring->rx_info + (index << priv->log_rx_info);
 		rx_desc = ring->buf + (index << ring->log_stride);
 
 		/*
@@ -579,8 +601,11 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 
 		/* Get pointer to first fragment since we haven't skb yet and
 		 * cast it to ethhdr struct */
-		ethh = (struct ethhdr *)(page_address(skb_frags[0].page) +
-					 skb_frags[0].offset);
+		dma = be64_to_cpu(rx_desc->data[0].addr);
+		dma_sync_single_for_cpu(priv->ddev, dma, sizeof(*ethh),
+					DMA_FROM_DEVICE);
+		ethh = (struct ethhdr *)(page_address(frags[0].page) +
+					 frags[0].offset);
 		s_mac = mlx4_en_mac_to_u64(ethh->h_source);
 
 		/* If source MAC is equal to our own MAC and not performing
@@ -612,10 +637,9 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 					if (!gro_skb)
 						goto next;
 
-					nr = mlx4_en_complete_rx_desc(
-						priv, rx_desc,
-						skb_frags, gro_skb,
-						ring->page_alloc, length);
+					nr = mlx4_en_complete_rx_desc(priv,
+						rx_desc, frags, gro_skb,
+						length);
 					if (!nr)
 						goto next;
 
@@ -651,8 +675,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 			ring->csum_none++;
 		}
 
-		skb = mlx4_en_rx_skb(priv, rx_desc, skb_frags,
-				     ring->page_alloc, length);
+		skb = mlx4_en_rx_skb(priv, rx_desc, frags, length);
 		if (!skb) {
 			priv->stats.rx_dropped++;
 			goto next;
@@ -678,6 +701,9 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 		netif_receive_skb(skb);
 
 next:
+		for (nr = 0; nr < priv->num_frags; nr++)
+			mlx4_en_free_frag(priv, frags, nr);
+
 		++cq->mcq.cons_index;
 		index = (cq->mcq.cons_index) & ring->size_mask;
 		cqe = &cq->buf[index];
@@ -693,7 +719,7 @@ out:
 	mlx4_cq_set_ci(&cq->mcq);
 	wmb(); /* ensure HW sees CQ consumer before we post new buffers */
 	ring->cons = cq->mcq.cons_index;
-	ring->prod += polled; /* Polled descriptors were realocated in place */
+	mlx4_en_refill_rx_buffers(priv, ring);
 	mlx4_en_update_rx_prod_db(ring);
 	return polled;
 }
@@ -782,7 +808,7 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
 
 	priv->num_frags = i;
 	priv->rx_skb_size = eff_mtu;
-	priv->log_rx_info = ROUNDUP_LOG2(i * sizeof(struct skb_frag_struct));
+	priv->log_rx_info = ROUNDUP_LOG2(i * sizeof(struct mlx4_en_rx_alloc));
 
 	en_dbg(DRV, priv, "Rx buffer scatter-list (effective-mtu:%d "
 		  "num_frags:%d):\n", eff_mtu, priv->num_frags);
@@ -984,8 +1010,3 @@ void mlx4_en_release_rss_steer(struct mlx4_en_priv *priv)
 	}
 	mlx4_qp_release_range(mdev->dev, rss_map->base_qpn, priv->rx_ring_num);
 }
-
-
-
-
-
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index a126321..f2fc90d 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -107,7 +107,7 @@ enum {
 #define MLX4_EN_MAX_TX_SIZE	8192
 #define MLX4_EN_MAX_RX_SIZE	8192
 
-/* Minimum ring size for our page-allocation sceme to work */
+/* Minimum ring size for our page-allocation scheme to work */
 #define MLX4_EN_MIN_RX_SIZE	(MLX4_EN_ALLOC_SIZE / SMP_CACHE_BYTES)
 #define MLX4_EN_MIN_TX_SIZE	(4096 / TXBB_SIZE)
 
@@ -228,6 +228,7 @@ struct mlx4_en_tx_desc {
 
 struct mlx4_en_rx_alloc {
 	struct page *page;
+	dma_addr_t dma;
 	u16 offset;
 };
 
-- 
1.7.4.4

^ permalink raw reply related

* Re: [RFC 1/2] PCI-Express Non-Transparent Bridge Support
From: chetan loke @ 2012-07-16 16:49 UTC (permalink / raw)
  To: Jon Mason; +Cc: linux-kernel, netdev, linux-pci, Dave Jiang
In-Reply-To: <1342215900-3358-1-git-send-email-jon.mason@intel.com>

Hi Jon,

On Fri, Jul 13, 2012 at 5:44 PM, Jon Mason <jon.mason@intel.com> wrote:

Just a few minor comments/questions:

....

> +struct ntb_transport_qp {
> +       struct ntb_device *ndev;
> +
> +       bool client_ready;
> +       bool qp_link;
> +       u8 qp_num;      /* Only 64 QP's are allowed.  0-63 */
> +
> +       void (*tx_handler) (struct ntb_transport_qp *qp);
> +       struct tasklet_struct tx_work;

Is it ok to rename the following vars for convenience sake?

> +       struct list_head txq;
tx_pend_q - (pending_queue) or tx_out_q - (outstanding_queue) - or
pick any new string you like - other than a mono-syllable definition

> +       struct list_head txc;
tx_compl_q - completion queue

> +       struct list_head txe;
tx_avail_e - available entry queue


> +       spinlock_t txq_lock;
> +       spinlock_t txc_lock;
> +       spinlock_t txe_lock;

then match the variants accordingly.

> +       struct list_head rxq;
> +       struct list_head rxc;
> +       struct list_head rxe;
> +       spinlock_t rxq_lock;
> +       spinlock_t rxc_lock;
> +       spinlock_t rxe_lock;

similarly the rx-counterpart


..................

> +static void ntb_tx_copy_task(struct ntb_transport_qp *qp,
> +                            struct ntb_queue_entry *entry,
> +                            void *offset)
> +{
> +       struct ntb_payload_header *hdr = offset;
> +       int rc;
> +
> +       offset += sizeof(struct ntb_payload_header);
> +       memcpy_toio(offset, entry->buf, entry->len);
> +
> +       hdr->len = entry->len;
> +       hdr->ver = qp->tx_pkts;
> +
> +       /* Ensure that the data is fully copied out before setting the flag */
> +       wmb();
> +       hdr->flags = entry->flags | DESC_DONE_FLAG;
> +
> +       rc = ntb_ring_sdb(qp->ndev, qp->qp_num);
> +       if (rc)
> +               pr_err("%s: error ringing db %d\n", __func__, qp->qp_num);
> +
> +       if (entry->len > 0) {

how do you interpret this len variable and decide if it's a good/bad completion?

> +               qp->tx_bytes += entry->len;
> +
> +               /* Add fully transmitted data to completion queue */
> +               ntb_list_add_tail(&qp->txc_lock, &entry->entry, &qp->txc);
> +
> +               if (qp->tx_handler)
> +                       qp->tx_handler(qp);
> +       } else
> +               ntb_list_add_tail(&qp->txe_lock, &entry->entry, &qp->txe);

I could be wrong but how is the original skb handled if the code path
goes in the else clause?
Also, in the else clause, how about a ntb_list_add_head rather than a _tail.

> +
> +static int ntb_process_tx(struct ntb_transport_qp *qp,
> +                         struct ntb_queue_entry *entry)
> +{
> +       struct ntb_payload_header *hdr;
> +       void *offset;
> +
> +       offset = qp->tx_offset;
> +       hdr = offset;
> +
> +       pr_debug("%lld - offset %p, tx %p, entry len %d flags %x buff %p\n",
> +                qp->tx_pkts, offset, qp->tx_offset, entry->len, entry->flags,
> +                entry->buf);
> +       if (hdr->flags) {
> +               ntb_list_add_head(&qp->txq_lock, &entry->entry, &qp->txq);
> +               qp->tx_ring_full++;
> +               return -EAGAIN;
> +       }
> +
> +       if (entry->len > transport_mtu) {
> +               pr_err("Trying to send pkt size of %d\n", entry->len);
> +               entry->flags = HW_ERROR_FLAG;
> +
> +               ntb_list_add_tail(&qp->txc_lock, &entry->entry, &qp->txc);
> +
> +               if (qp->tx_handler)
> +                       qp->tx_handler(qp);
> +
> +               return 0;
> +       }
> +
> +       ntb_tx_copy_task(qp, entry, offset);

what happens when ntb_ring_sdb returns an error? would you still want
to increment tx_pkts below?

> +
> +       qp->tx_offset =
> +           (qp->tx_offset +
> +            ((transport_mtu + sizeof(struct ntb_payload_header)) * 2) >=
> +            qp->tx_mw_end) ? qp->tx_mw_begin : qp->tx_offset + transport_mtu +
> +           sizeof(struct ntb_payload_header);
> +
> +       qp->tx_pkts++;
> +
> +       return 0;
> +}
> +

........................


> +void *ntb_transport_tx_dequeue(struct ntb_transport_qp *qp, unsigned int *len)
> +{
> +       struct ntb_queue_entry *entry;
> +       void *buf;
> +
> +       if (!qp)
> +               return NULL;
> +
> +       entry = ntb_list_rm_head(&qp->txc_lock, &qp->txc);
> +       if (!entry)
> +               return NULL;
> +
> +       buf = entry->callback_data;
> +       if (entry->flags != HW_ERROR_FLAG)
> +               *len = entry->len;
> +       else
> +               *len = -EIO;
> +
> +       ntb_list_add_tail(&qp->txe_lock, &entry->entry, &qp->txe);

how about a ntb_list_add_head?


Chetan Loke

^ permalink raw reply

* Re: [PATCH] ipv6: fix incorrect route 'expires' value passed to userspace.
From: Stephen Hemminger @ 2012-07-16 16:41 UTC (permalink / raw)
  To: Li Wei; +Cc: David S. Miller, netdev
In-Reply-To: <5003CC41.9080204@cn.fujitsu.com>

On Mon, 16 Jul 2012 16:09:37 +0800
Li Wei <lw@cn.fujitsu.com> wrote:

> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index becb048..a7fec9d 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -2516,7 +2516,7 @@ static int rt6_fill_node(struct net *net,
>  		goto nla_put_failure;
>  	if (!(rt->rt6i_flags & RTF_EXPIRES))
>  		expires = 0;
> -	else if (rt->dst.expires - jiffies < INT_MAX)
> +	else if ((int)(rt->dst.expires - jiffies) < INT_MAX)
>  		expires = rt->dst.expires - jiffies;
>  	else
>  		expires = INT_MAX;

Why not use time_is_after_jiffies() macro?

^ permalink raw reply

* Re: 3.4.4/amd64 full interrupt hangs under big nfs copies
From: Eric Dumazet @ 2012-07-16 16:21 UTC (permalink / raw)
  To: Marc MERLIN
  Cc: David Miller, Larry.Finger, bhutchings, linux-wireless, netdev
In-Reply-To: <20120716151826.GA10586@merlins.org>

On Mon, 2012-07-16 at 08:18 -0700, Marc MERLIN wrote:
> On Mon, Jul 16, 2012 at 08:18:49AM +0200, Eric Dumazet wrote:
> > > My understanding is that user space calling drivers that shut off all
> > > interrupts for extended periods of time (as least I think so since my mouse
> > > cursor would not move), is still a kernel bug.
> > > 
> > > For what it's worth, copying 1GB of data in lots of small files does not
> > > cause problems, it seems that it's big files that cause a problem since they
> > > likely fill a buffer somewhere while interrupts are disabled.
> > > 
> > > Do you have an idea of how I can find out where my mc process is stuck in
> > > the kernel?
> > > Should I reproduce with specific sysrq output?
> > 
> > Just to clarify, you get this freeze when transferring a big file from a
> > remote NFS server to your PC, (aka a download), not the reverse way ?
>  
> No, it's actually when I'm 'uploading' from my laptop to my server.
> One interesting thing is that my server is running lvm2 with snapshots,
> which makes writes slower than my laptop can push data over the network, so
> it's definitely causing buffers to fill up.
> I just did a download test and got 4.5MB/s sustained without problems.

Hmm, nfs apparently is able to push a lot of data, try to reduce
rsize/wsize to sane values, like 32K instead of 512K ?

gargamel:/mnt/dshelf2/ /net/gargamel/mnt/dshelf2 nfs4
rw,nosuid,nodev,relatime,vers=4.0,rsize=524288,wsize=524288,namlen=255,hard,proto=tcp,port=0,timeo=600,retrans=2,sec=sys,clientaddr=192.168.205.7,local_lock=none,addr=192.168.205.3 0 0

You could trace svc_sock_setbufsize() and check how large sk_sndbuf
is set

(iwlwifi is unable to use sendpage anyway, since SG is not enabled)

^ permalink raw reply

* Re: 82571EB: Detected Hardware Unit Hang
From: Henrique de Moraes Holschuh @ 2012-07-16 16:08 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: netdev@vger.kernel.org, e1000-devel@lists.sf.net, Joe Jin,
	linux-kernel@vger.kernel.org
In-Reply-To: <1342453645.2523.17.camel@bwh-desktop.uk.solarflarecom.com>

On Mon, 16 Jul 2012, Ben Hutchings wrote:
> On Sun, 2012-07-15 at 10:35 -0300, Henrique de Moraes Holschuh wrote:
> > On Sun, 15 Jul 2012, Dave, Tushar N wrote:
> > > Somehow setting max payload to 256 from BIOS does not set this value for all devices. I believe this is a BIOS bug.
> > 
> > And preferably, Linux should complain about it.  Since we know it is going
> > to cause problems, and since we know it does happen, we should be raising a
> > ruckus about it in the kernel log (and probably fixing it to min(path) while
> > at it)...
> > 
> > Is this something that should be raised as a feature request with the
> > PCI/PCIe subsystem?
> 
> The feature is there, but we ended up with:
> 
> commit 5f39e6705faade2e89d119958a8c51b9b6e2c53c
> Author: Jon Mason <mason@myri.com>
> Date:   Mon Oct 3 09:50:20 2011 -0500
> 
>     PCI: Disable MPS configuration by default
> 
> But you are welcome to share use of the fixup_mpss_256() quirk.

Meh.  I'd be happy with a warning if MPSS decreases when walking up to
the tree root... i.e. a warning if any child has a MPSS larger than the
parent.

-- 
  "One disk to rule them all, One disk to find them. One disk to bring
  them all and in the darkness grind them. In the Land of Redmond
  where the shadows lie." -- The Silicon Valley Tarot
  Henrique Holschuh

------------------------------------------------------------------------------
Live Security Virtual Conference
Exclusive live event will cover all the ways today's security and 
threat landscape has changed and how IT managers can respond. Discussions 
will include endpoint security, mobile security and the latest in malware 
threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/
_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel® Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply

* Re: 82571EB: Detected Hardware Unit Hang
From: Ben Hutchings @ 2012-07-16 15:47 UTC (permalink / raw)
  To: Henrique de Moraes Holschuh
  Cc: Dave, Tushar N, Joe Jin, e1000-devel@lists.sf.net,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <20120715133510.GA5484@khazad-dum.debian.net>

On Sun, 2012-07-15 at 10:35 -0300, Henrique de Moraes Holschuh wrote:
> On Sun, 15 Jul 2012, Dave, Tushar N wrote:
> > Somehow setting max payload to 256 from BIOS does not set this value for all devices. I believe this is a BIOS bug.
> 
> And preferably, Linux should complain about it.  Since we know it is going
> to cause problems, and since we know it does happen, we should be raising a
> ruckus about it in the kernel log (and probably fixing it to min(path) while
> at it)...
> 
> Is this something that should be raised as a feature request with the
> PCI/PCIe subsystem?

The feature is there, but we ended up with:

commit 5f39e6705faade2e89d119958a8c51b9b6e2c53c
Author: Jon Mason <mason@myri.com>
Date:   Mon Oct 3 09:50:20 2011 -0500

    PCI: Disable MPS configuration by default

But you are welcome to share use of the fixup_mpss_256() quirk.

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* Re: resurrecting tcphealth
From: Christoph Paasch @ 2012-07-16 15:24 UTC (permalink / raw)
  To: Piotr Sawuk; +Cc: netdev, linux-kernel
In-Reply-To: <436a23ea8f8df4e8e3c71c369f1e3649.squirrel@webmail.univie.ac.at>

On Monday 16 July 2012 17:12:26 Piotr Sawuk wrote:
> +               seq_printf(seq, "%d: %-21pI4:%u %-21pI4:%u "
> +                               "%8u %8lu %8lu %8lu %8lu%n",
> +                               st->num,
> +                               &inet->inet_rcv_saddr,
> +                               ntohs(inet->inet_sport),
> +                               &inet->inet_daddr,
> +                               ntohs(inet->inet_dport),
> +                               jiffies_to_msecs(tp->srtt),

This still doesn't give you the correct RTT.
srtt is in jiffies * 8.

You should do jiffies_to_msecs(tp->srtt) >> 3.

The RTT is already exposed by tcp_info anyway... (see tcp_get_info() - where 
you also see the bitshift)


Christoph

-- 
IP Networking Lab --- http://inl.info.ucl.ac.be
MultiPath TCP in the Linux Kernel --- http://mptcp.info.ucl.ac.be
Université Catholique de Louvain
--

^ permalink raw reply

* Re: 3.4.4/amd64 full interrupt hangs under big nfs copies
From: Marc MERLIN @ 2012-07-16 15:18 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, Larry.Finger, bhutchings, linux-wireless, netdev
In-Reply-To: <1342419529.3265.12217.camel@edumazet-glaptop>

On Mon, Jul 16, 2012 at 08:18:49AM +0200, Eric Dumazet wrote:
> > My understanding is that user space calling drivers that shut off all
> > interrupts for extended periods of time (as least I think so since my mouse
> > cursor would not move), is still a kernel bug.
> > 
> > For what it's worth, copying 1GB of data in lots of small files does not
> > cause problems, it seems that it's big files that cause a problem since they
> > likely fill a buffer somewhere while interrupts are disabled.
> > 
> > Do you have an idea of how I can find out where my mc process is stuck in
> > the kernel?
> > Should I reproduce with specific sysrq output?
> 
> Just to clarify, you get this freeze when transferring a big file from a
> remote NFS server to your PC, (aka a download), not the reverse way ?
 
No, it's actually when I'm 'uploading' from my laptop to my server.
One interesting thing is that my server is running lvm2 with snapshots,
which makes writes slower than my laptop can push data over the network, so
it's definitely causing buffers to fill up.
I just did a download test and got 4.5MB/s sustained without problems.

> If so, you might hit OOM condition because iwlwifi uses big/fat RX
> buffers, I never understood why...
> 
> (amsdu_size_8K = 1)
> 
> Storing an MTU=1500 frame in 8KB of memory sounds really bad.
> 
> diff --git a/drivers/net/wireless/iwlwifi/iwl-drv.c b/drivers/net/wireless/iwlwifi/iwl-drv.c
> index cc41cfa..434b924 100644
> --- a/drivers/net/wireless/iwlwifi/iwl-drv.c
> +++ b/drivers/net/wireless/iwlwifi/iwl-drv.c
> @@ -1006,7 +1006,7 @@ void iwl_drv_stop(struct iwl_drv *drv)
>  
>  /* shared module parameters */
>  struct iwl_mod_params iwlwifi_mod_params = {
> -	.amsdu_size_8K = 1,
> +	.amsdu_size_8K = 0,

So, do you recommend I try this if my problem is with TX and not RX?

More generally, is there a tip you have for me to get WCHAN in ps on a
process that is causing this problem, or will sysrq-W/sysrq-D show this, or
do I need something like sysrq-T ?

I just reproduced this while having a loop that did 
while :; do date; echo w > /proc/sysrq-trigger; sleep 30; done

I got a nice 30mn (!) hang, including the lines below:
[68151.449220] kworker/0:0: page allocation failure: order:1, mode:0x4020
[68151.449225] Pid: 9004, comm: kworker/0:0 Tainted: G         C O 3.4.4-amd64-preempt-noide-20120410 #1
[68151.449227] Call Trace:
[68151.449228]  <IRQ>  [<ffffffff810bf8d8>] ? warn_alloc_failed+0x11f/0x132
[68151.449239]  [<ffffffff810453ff>] ? __mod_timer+0x13a/0x14c
[68151.449243]  [<ffffffff810c26e2>] ? __alloc_pages_nodemask+0x72f/0x7df
[68151.449256]  [<ffffffff810ecf10>] ? alloc_pages_current+0xc7/0xe4
[68151.449266]  [<ffffffffa04ca247>] ? iwlagn_rx_allocate+0x97/0x24d [iwlwifi]
[68151.449273]  [<ffffffffa04cb27b>] ? iwl_irq_tasklet+0x6e4/0x838 [iwlwifi]
[68151.449279]  [<ffffffff8103f6e9>] ? tasklet_action+0x79/0xc8
[68151.449283]  [<ffffffff8103f205>] ? __do_softirq+0xc0/0x188
[68151.449288]  [<ffffffff8136879c>] ? call_softirq+0x1c/0x30
[68151.449293]  [<ffffffff8100fcf9>] ? do_softirq+0x3c/0x7b
[68151.449297]  [<ffffffff8103f4f3>] ? irq_exit+0x3d/0xa5
[68151.449301]  [<ffffffff8100fa1e>] ? do_IRQ+0x81/0x97
[68151.449306]  [<ffffffff81362e2a>] ? common_interrupt+0x6a/0x6a
[68151.449308]  <EOI>  [<ffffffff811b7bca>] ? idr_get_next+0x7d/0x92
[68151.449318]  [<ffffffff81088d74>] ? css_get_next+0x59/0x97
[68151.449322]  [<ffffffff810fc115>] ? mem_cgroup_iter+0x109/0x1ab
[68151.449328]  [<ffffffff810cb31a>] ? shrink_zone+0x89/0x9b
[68151.449333]  [<ffffffff810cb73c>] ? do_try_to_free_pages+0x1e4/0x434
[68151.449338]  [<ffffffff810cbc11>] ? try_to_free_pages+0xb3/0xf9
[68151.449343]  [<ffffffff810c24a2>] ? __alloc_pages_nodemask+0x4ef/0x7df
[68151.449349]  [<ffffffff810ecf10>] ? alloc_pages_current+0xc7/0xe4
[68151.449356]  [<ffffffffa04ca247>] ? iwlagn_rx_allocate+0x97/0x24d [iwlwifi]
[68151.449363]  [<ffffffffa04ca81e>] ? iwlagn_rx_replenish+0x3a/0x3a [iwlwifi]
[68151.449369]  [<ffffffffa04ca7fc>] ? iwlagn_rx_replenish+0x18/0x3a [iwlwifi]
[68151.449373]  [<ffffffff8104ea7d>] ? process_one_work+0x16d/0x298
[68151.449379]  [<ffffffff8104f4d9>] ? worker_thread+0xc2/0x145
[68151.449383]  [<ffffffff8104f417>] ? manage_workers.isra.23+0x15b/0x15b
[68151.449386]  [<ffffffff81052788>] ? kthread+0x7d/0x85
[68151.449390]  [<ffffffff813686a4>] ? kernel_thread_helper+0x4/0x10
[68151.449395]  [<ffffffff8105270b>] ? kthread_freezable_should_stop+0x37/0x37
[68151.449399]  [<ffffffff813686a0>] ? gs_change+0x13/0x13
(...)
[68151.475337] iwlwifi 0:03:00.0: Failed to alloc_pages with GFP_ATOMIC.Only 5 free buffers remaining.


Full log below around the 30mn hang:
[68145.180343] SysRq : Show Blocked State
[68145.180348]   task                        PC stack   pid father
[68145.180469] mc              D ffff88021e213680     0  6526  21734 0x00000080
[68145.180473]  ffff8802134ff750 0000000000000086 0000000000000008 ffff88020d76a7d0
[68145.180477]  0000000000013680 ffff88001a98dfd8 ffff88001a98dfd8 ffff8802134ff750
[68145.180480]  ffff8801a299ede0 ffff88021e213680 ffff8802134ff750 ffffffff810bb429
[68145.180483] Call Trace:
[68145.180490]  [<ffffffff810bb429>] ? __lock_page+0x66/0x66
[68145.180494]  [<ffffffff81362059>] ? io_schedule+0x55/0x6b
[68145.180496]  [<ffffffff810bb42f>] ? sleep_on_page+0x6/0xa
[68145.180499]  [<ffffffff81361054>] ? __wait_on_bit+0x3e/0x71
[68145.180502]  [<ffffffff810bb577>] ? wait_on_page_bit+0x6e/0x73
[68145.180505]  [<ffffffff81052e69>] ? autoremove_wake_function+0x2a/0x2a
[68145.180540]  [<ffffffffa0199d18>] ? read_extent_buffer_pages+0x1fb/0x24b [btrfs]
[68145.180552]  [<ffffffffa0175112>] ? lock_page+0x11/0x11 [btrfs]
[68145.180564]  [<ffffffffa0176e69>] ? btree_read_extent_buffer_pages.constprop.110+0x5c/0xf3 [btrfs]
[68145.180577]  [<ffffffffa0177412>] ? read_tree_block+0x25/0x2d [btrfs]
[68145.180586]  [<ffffffffa0163087>] ? read_block_for_search.isra.32+0x2af/0x2e7 [btrfs]
[68145.180597]  [<ffffffffa01649c2>] ? btrfs_search_slot+0x48d/0x659 [btrfs]
[68145.180608]  [<ffffffffa0172d7d>] ? btrfs_lookup_csum+0x66/0x106 [btrfs]
[68145.180611]  [<ffffffff8136583e>] ? sub_preempt_count+0x83/0x94
[68145.180622]  [<ffffffffa0172fad>] ? __btrfs_lookup_bio_sums+0x190/0x30a [btrfs]
[68145.180635]  [<ffffffffa017d021>] ? btrfs_submit_bio_hook+0xa9/0x12a [btrfs]
[68145.180649]  [<ffffffffa0194af3>] ? submit_one_bio+0x82/0xb9 [btrfs]
[68145.180662]  [<ffffffffa01977c7>] ? submit_extent_page.isra.26+0x10a/0x1b2 [btrfs]
[68145.180675]  [<ffffffffa0196d91>] ? repair_io_failure+0x18e/0x18e [btrfs]
[68145.180687]  [<ffffffffa0197cf5>] ? __extent_read_full_page+0x424/0x4be [btrfs]
[68145.180699]  [<ffffffffa0196d91>] ? repair_io_failure+0x18e/0x18e [btrfs]
[68145.180712]  [<ffffffffa017eca4>] ? btrfs_writepage+0x4b/0x4b [btrfs]
[68145.180716]  [<ffffffff810c56b3>] ? __lru_cache_add+0x7b/0x8d
[68145.180728]  [<ffffffffa01988e8>] ? extent_readpages+0xaf/0xf3 [btrfs]
[68145.180741]  [<ffffffffa017eca4>] ? btrfs_writepage+0x4b/0x4b [btrfs]
[68145.180744]  [<ffffffff810c4640>] ? __do_page_cache_readahead+0x139/0x1de
[68145.180747]  [<ffffffff810c493d>] ? ra_submit+0x19/0x1d
[68145.180749]  [<ffffffff810bca23>] ? generic_file_aio_read+0x2b0/0x5d3
[68145.180753]  [<ffffffff81110627>] ? set_fd_set+0x23/0x26
[68145.180756]  [<ffffffff8110252a>] ? do_sync_read+0xab/0xe3
[68145.180759]  [<ffffffff81362b73>] ? _raw_spin_unlock_irqrestore+0x30/0x3e
[68145.180762]  [<ffffffff810595b4>] ? __wake_up+0x35/0x46
[68145.180764]  [<ffffffff81102c22>] ? vfs_read+0x9f/0xe6
[68145.180767]  [<ffffffff81103c2d>] ? fget_light+0x33/0x8d
[68145.180769]  [<ffffffff81102cae>] ? sys_read+0x45/0x6b
[68145.180772]  [<ffffffff8136723d>] ? system_call_fastpath+0x1a/0x1f
[68145.180783] Sched Debug Version: v0.10, 3.4.4-amd64-preempt-noide-20120410 #1
[68145.180785] ktime                                   : 68319835.436522
[68145.180787] sched_clk                               : 68145180.782078
[68145.180788] cpu_clk                                 : 68145180.782141
[68145.180790] jiffies                                 : 4311972254
[68145.180791] sched_clock_stable                      : 1
[68145.180792] 
[68145.180793] sysctl_sched
[68145.180795]   .sysctl_sched_latency                    : 18.000000
[68145.180796]   .sysctl_sched_min_granularity            : 2.250000
[68145.180798]   .sysctl_sched_wakeup_granularity         : 3.000000
[68145.180799]   .sysctl_sched_child_runs_first           : 0
[68145.180801]   .sysctl_sched_features                   : 24119
[68145.180803]   .sysctl_sched_tunable_scaling            : 1 (logaritmic)
[68145.180805] 
[68145.180805] cpu#0, 2893.340 MHz
[68145.180806]   .nr_running                    : 1
[68145.180808]   .load                          : 1024
[68145.180809]   .nr_switches                   : 220345791
[68145.180810]   .nr_load_updates               : 21415127
[68145.180812]   .nr_uninterruptible            : -4403
[68145.180813]   .next_balance                  : 4311.972255
[68145.180815]   .curr->pid                     : 8408
[68145.180816]   .clock                         : 68145180.058636
[68145.180818]   .cpu_load[0]                   : 0
[68145.180819]   .cpu_load[1]                   : 664
[68145.180820]   .cpu_load[2]                   : 639
[68145.180821]   .cpu_load[3]                   : 486
[68145.180823]   .cpu_load[4]                   : 397
[68145.180825] 
[68145.180825] cfs_rq[0]:/autogroup-105
[68145.180827]   .exec_clock                    : 0.000000
[68145.180828]   .MIN_vruntime                  : 0.000001
[68145.180830]   .min_vruntime                  : 632596.935417
[68145.180832]   .max_vruntime                  : 0.000001
[68145.180833]   .spread                        : 0.000000
[68145.180835]   .spread0                       : -16307488.857263
[68145.180836]   .nr_spread_over                : 0
[68145.180837]   .nr_running                    : 0
[68145.180838]   .load                          : 0
[68145.180840]   .load_avg                      : 2629.999737
[68145.180841]   .load_period                   : 5.116983
[68145.180843]   .load_contrib                  : 513
[68145.180844]   .load_tg                       : 515
[68145.180846]   .se->exec_start                : 68145177.126165
[68145.180847]   .se->vruntime                  : 16940076.867799
[68145.180849]   .se->sum_exec_runtime          : 525985.491293
[68145.180850]   .se->load.weight               : 2
[68145.180852] 
[68145.180852] cfs_rq[0]:/autogroup-728
[68145.180854]   .exec_clock                    : 0.000000
[68145.180855]   .MIN_vruntime                  : 0.000001
[68145.180857]   .min_vruntime                  : 1465.002075
[68145.180858]   .max_vruntime                  : 0.000001
[68145.180859]   .spread                        : 0.000000
[68145.180861]   .spread0                       : -16938620.790605
[68145.180862]   .nr_spread_over                : 0
[68145.180864]   .nr_running                    : 0
[68145.180865]   .load                          : 0
[68145.180866]   .load_avg                      : 718.614822
[68145.180868]   .load_period                   : 5.560256
[68145.180869]   .load_contrib                  : 121
[68145.180870]   .load_tg                       : 643
[68145.180872]   .se->exec_start                : 68145179.581565
[68145.180873]   .se->vruntime                  : 16940077.021084
[68145.180875]   .se->sum_exec_runtime          : 238.261122
[68145.180876]   .se->load.weight               : 2
[68145.180878] 
[68145.180878] cfs_rq[0]:/autogroup-77
[68145.180880]   .exec_clock                    : 0.000000
[68145.180881]   .MIN_vruntime                  : 0.000001
[68145.180883]   .min_vruntime                  : 7008.558906
[68145.180884]   .max_vruntime                  : 0.000001
[68145.180886]   .spread                        : 0.000000
[68145.180887]   .spread0                       : -16933077.233774
[68145.180888]   .nr_spread_over                : 0
[68145.180890]   .nr_running                    : 0
[68145.180891]   .load                          : 0
[68145.180892]   .load_avg                      : 639.999936
[68145.180894]   .load_period                   : 6.666654
[68145.180895]   .load_contrib                  : 96
[68145.180896]   .load_tg                       : 202
[68145.180898]   .se->exec_start                : 68145155.292812
[68145.180899]   .se->vruntime                  : 16940072.397295
[68145.180901]   .se->sum_exec_runtime          : 6875.737275
[68145.180902]   .se->load.weight               : 2
[68145.180904] 
[68145.180904] cfs_rq[0]:/autogroup-74
[68145.180906]   .exec_clock                    : 0.000000
[68145.180907]   .MIN_vruntime                  : 0.000001
[68145.180908]   .min_vruntime                  : 9096.628700
[68145.180910]   .max_vruntime                  : 0.000001
[68145.180911]   .spread                        : 0.000000
[68145.180913]   .spread0                       : -16930989.163980
[68145.180914]   .nr_spread_over                : 0
[68145.180915]   .nr_running                    : 0
[68145.180917]   .load                          : 0
[68145.180918]   .load_avg                      : 643.893952
[68145.180919]   .load_period                   : 8.513359
[68145.180921]   .load_contrib                  : 75
[68145.180922]   .load_tg                       : 75
[68145.180924]   .se->exec_start                : 68145155.254096
[68145.180926]   .se->vruntime                  : 16940072.391504
[68145.180927]   .se->sum_exec_runtime          : 8917.069991
[68145.180928]   .se->load.weight               : 2
[68145.180930] 
[68145.180930] cfs_rq[0]:/autogroup-85
[68145.180932]   .exec_clock                    : 0.000000
[68145.180933]   .MIN_vruntime                  : 0.000001
[68145.180935]   .min_vruntime                  : 461037.358394
[68145.180936]   .max_vruntime                  : 0.000001
[68145.180938]   .spread                        : 0.000000
[68145.180939]   .spread0                       : -16479048.434286
[68145.180940]   .nr_spread_over                : 0
[68145.180942]   .nr_running                    : 0
[68145.180943]   .load                          : 0
[68145.180944]   .load_avg                      : 159.999984
[68145.180946]   .load_period                   : 8.582743
[68145.180947]   .load_contrib                  : 18
[68145.180948]   .load_tg                       : 20
[68145.180950]   .se->exec_start                : 68145144.752282
[68145.180951]   .se->vruntime                  : 16940070.960978
[68145.180953]   .se->sum_exec_runtime          : 523059.429556
[68145.180954]   .se->load.weight               : 2
[68145.180956] 
[68145.180956] cfs_rq[0]:/autogroup-99
[68145.180958]   .exec_clock                    : 0.000000
[68145.180959]   .MIN_vruntime                  : 0.000001
[68145.180961]   .min_vruntime                  : 919853.728128
[68145.180962]   .max_vruntime                  : 0.000001
[68145.180963]   .spread                        : 0.000000
[68145.180965]   .spread0                       : -16020232.064552
[68145.180966]   .nr_spread_over                : 0
[68145.180968]   .nr_running                    : 0
[68145.180969]   .load                          : 0
[68145.180970]   .load_avg                      : 95.570772
[68145.180972]   .load_period                   : 8.631443
[68145.180973]   .load_contrib                  : 11
[68145.180974]   .load_tg                       : 25
[68145.180976]   .se->exec_start                : 68145166.483609
[68145.180977]   .se->vruntime                  : 16940075.698106
[68145.180979]   .se->sum_exec_runtime          : 649839.065400
[68145.180980]   .se->load.weight               : 2
[68145.180982] 
[68145.180982] cfs_rq[0]:/autogroup-53
[68145.180984]   .exec_clock                    : 0.000000
[68145.180985]   .MIN_vruntime                  : 0.000001
[68145.180987]   .min_vruntime                  : 8270588.687589
[68145.180988]   .max_vruntime                  : 0.000001
[68145.180989]   .spread                        : 0.000000
[68145.180991]   .spread0                       : -8669497.105091
[68145.180992]   .nr_spread_over                : 0
[68145.180993]   .nr_running                    : 0
[68145.180995]   .load                          : 0
[68145.180996]   .load_avg                      : 51.067853
[68145.180998]   .load_period                   : 6.626155
[68145.180999]   .load_contrib                  : 2
[68145.181000]   .load_tg                       : 1479
[68145.181002]   .se->exec_start                : 68145177.453635
[68145.181003]   .se->vruntime                  : 16940076.966610
[68145.181005]   .se->sum_exec_runtime          : 7731181.203365
[68145.181006]   .se->load.weight               : 2
[68145.181008] 
[68145.181008] cfs_rq[0]:/autogroup-216
[68145.181009]   .exec_clock                    : 0.000000
[68145.181011]   .MIN_vruntime                  : 0.000001
[68145.181012]   .min_vruntime                  : 106577.197911
[68145.181014]   .max_vruntime                  : 0.000001
[68145.181015]   .spread                        : 0.000000
[68145.181017]   .spread0                       : -16833508.594769
[68145.181018]   .nr_spread_over                : 0
[68145.181019]   .nr_running                    : 0
[68145.181020]   .load                          : 0
[68145.181022]   .load_avg                      : 22.437006
[68145.181023]   .load_period                   : 6.581971
[68145.181025]   .load_contrib                  : 3
[68145.181026]   .load_tg                       : 3
[68145.181028]   .se->exec_start                : 68145151.713745
[68145.181029]   .se->vruntime                  : 16940072.220775
[68145.181031]   .se->sum_exec_runtime          : 55613.402140
[68145.181032]   .se->load.weight               : 2
[68145.181034] 
[68145.181034] cfs_rq[0]:/autogroup-499
[68145.181036]   .exec_clock                    : 0.000000
[68145.181037]   .MIN_vruntime                  : 0.000001
[68145.181039]   .min_vruntime                  : 12143.287893
[68145.181040]   .max_vruntime                  : 0.000001
[68145.181042]   .spread                        : 0.000000
[68145.181043]   .spread0                       : -16927942.504787
[68145.181045]   .nr_spread_over                : 0
[68145.181046]   .nr_running                    : 0
[68145.181047]   .load                          : 0
[68145.181048]   .load_avg                      : 2288.736922
[68145.181050]   .load_period                   : 7.063528
[68145.181051]   .load_contrib                  : 324
[68145.181053]   .load_tg                       : 324
[68145.181054]   .se->exec_start                : 68145176.734792
[68145.181056]   .se->vruntime                  : 16940085.859991
[68145.181057]   .se->sum_exec_runtime          : 5891.241766
[68145.181058]   .se->load.weight               : 2
[68145.181060] 
[68145.181060] cfs_rq[0]:/autogroup-345
[68145.181062]   .exec_clock                    : 0.000000
[68145.181063]   .MIN_vruntime                  : 0.000001
[68145.181065]   .min_vruntime                  : 337573.928223
[68145.181066]   .max_vruntime                  : 0.000001
[68145.181067]   .spread                        : 0.000000
[68145.181069]   .spread0                       : -16602511.864457
[68145.181070]   .nr_spread_over                : 0
[68145.181072]   .nr_running                    : 0
[68145.181073]   .load                          : 0
[68145.181074]   .load_avg                      : 694.208403
[68145.181076]   .load_period                   : 6.188356
[68145.181077]   .load_contrib                  : 112
[68145.181078]   .load_tg                       : 511
[68145.181080]   .se->exec_start                : 68145174.673610
[68145.181081]   .se->vruntime                  : 16940076.816044
[68145.181083]   .se->sum_exec_runtime          : 453037.061998
[68145.181084]   .se->load.weight               : 2
[68145.181086] 
[68145.181086] cfs_rq[0]:/
[68145.181088]   .exec_clock                    : 0.000000
[68145.181089]   .MIN_vruntime                  : 0.000001
[68145.181090]   .min_vruntime                  : 16940085.792680
[68145.181092]   .max_vruntime                  : 0.000001
[68145.181093]   .spread                        : 0.000000
[68145.181095]   .spread0                       : 0.000000
[68145.181096]   .nr_spread_over                : 0
[68145.181097]   .nr_running                    : 1
[68145.181098]   .load                          : 1024
[68145.181100]   .load_avg                      : 0.000000
[68145.181101]   .load_period                   : 0.000000
[68145.181102]   .load_contrib                  : 0
[68145.181104]   .load_tg                       : 0
[68145.181105] 
[68145.181105] rt_rq[0]:
[68145.181107]   .rt_nr_running                 : 0
[68145.181108]   .rt_throttled                  : 0
[68145.181109]   .rt_time                       : 0.005109
[68145.181111]   .rt_runtime                    : 950.000000
[68145.181112] 
[68145.181113] runnable tasks:
[68145.181113]             task   PID         tree-key  switches  prio     exec-runtime         sum-exec        sum-sleep
[68145.181114] ----------------------------------------------------------------------------------------------------------
[68145.181136] R    kworker/0:1  8408  16940085.792680     29268   120               0               0               0.000000               0.000000               0.000000 /
[68145.181144] 
[68145.181144] cpu#1, 2893.340 MHz
[68145.181146]   .nr_running                    : 1
[68145.181147]   .load                          : 832
[68145.181148]   .nr_switches                   : 215554245
[68145.181150]   .nr_load_updates               : 7918869
[68145.181151]   .nr_uninterruptible            : 2380
[68145.181152]   .next_balance                  : 4311.972282
[68145.181154]   .curr->pid                     : 8880
[68145.181155]   .clock                         : 68145180.268053
[68145.181157]   .cpu_load[0]                   : 879
[68145.181158]   .cpu_load[1]                   : 706
[68145.181159]   .cpu_load[2]                   : 505
[68145.181160]   .cpu_load[3]                   : 322
[68145.181162]   .cpu_load[4]                   : 232
[68145.181163] 
[68145.181164] cfs_rq[1]:/autogroup-53
[68145.181165]   .exec_clock                    : 0.000000
[68145.181166]   .MIN_vruntime                  : 0.000001
[68145.181168]   .min_vruntime                  : 10319271.413405
[68145.181169]   .max_vruntime                  : 0.000001
[68145.181171]   .spread                        : 0.000000
[68145.181172]   .spread0                       : -6620814.379275
[68145.181174]   .nr_spread_over                : 0
[68145.181175]   .nr_running                    : 0
[68145.181176]   .load                          : 0
[68145.181177]   .load_avg                      : 2668.957401
[68145.181179]   .load_period                   : 6.733473
[68145.181180]   .load_contrib                  : 430
[68145.181181]   .load_tg                       : 1479
[68145.181183]   .se->exec_start                : 68145180.194582
[68145.181184]   .se->vruntime                  : 19124295.713641
[68145.181186]   .se->sum_exec_runtime          : 9934376.153325
[68145.181187]   .se->load.weight               : 2
[68145.181189] 
[68145.181189] cfs_rq[1]:/autogroup-728
[68145.181191]   .exec_clock                    : 0.000000
[68145.181192]   .MIN_vruntime                  : 0.000001
[68145.181193]   .min_vruntime                  : 805.626907
[68145.181195]   .max_vruntime                  : 0.000001
[68145.181196]   .spread                        : 0.000000
[68145.181198]   .spread0                       : -16939280.165773
[68145.181199]   .nr_spread_over                : 0
[68145.181200]   .nr_running                    : 1
[68145.181202]   .load                          : 526
[68145.181203]   .load_avg                      : 4387.132719
[68145.181204]   .load_period                   : 8.401223
[68145.181206]   .load_contrib                  : 522
[68145.181207]   .load_tg                       : 643
[68145.181209]   .se->exec_start                : 68145180.268053
[68145.181210]   .se->vruntime                  : 19124304.586462
[68145.181212]   .se->sum_exec_runtime          : 174.761132
[68145.181213]   .se->load.weight               : 832
[68145.181214] 
[68145.181215] cfs_rq[1]:/autogroup-77
[68145.181216]   .exec_clock                    : 0.000000
[68145.181217]   .MIN_vruntime                  : 0.000001
[68145.181219]   .min_vruntime                  : 12354.484454
[68145.181220]   .max_vruntime                  : 0.000001
[68145.181222]   .spread                        : 0.000000
[68145.181223]   .spread0                       : -16927731.308226
[68145.181225]   .nr_spread_over                : 0
[68145.181226]   .nr_running                    : 0
[68145.181227]   .load                          : 0
[68145.181228]   .load_avg                      : 639.999936
[68145.181230]   .load_period                   : 5.988008
[68145.181231]   .load_contrib                  : 106
[68145.181232]   .load_tg                       : 202
[68145.181234]   .se->exec_start                : 68145155.131954
[68145.181235]   .se->vruntime                  : 19124277.088025
[68145.181237]   .se->sum_exec_runtime          : 12258.830758
[68145.181238]   .se->load.weight               : 2
[68145.181240] 
[68145.181240] cfs_rq[1]:/autogroup-13
[68145.181242]   .exec_clock                    : 0.000000
[68145.181243]   .MIN_vruntime                  : 0.000001
[68145.181245]   .min_vruntime                  : 7268.331092
[68145.181246]   .max_vruntime                  : 0.000001
[68145.181247]   .spread                        : 0.000000
[68145.181249]   .spread0                       : -16932817.461588
[68145.181250]   .nr_spread_over                : 0
[68145.181251]   .nr_running                    : 0
[68145.181253]   .load                          : 0
[68145.181254]   .load_avg                      : 324.714656
[68145.181255]   .load_period                   : 5.489619
[68145.181257]   .load_contrib                  : 59
[68145.181258]   .load_tg                       : 59
[68145.181259]   .se->exec_start                : 68145150.974036
[68145.181261]   .se->vruntime                  : 19124277.210880
[68145.181262]   .se->sum_exec_runtime          : 3628.001811
[68145.181264]   .se->load.weight               : 2
[68145.181265] 
[68145.181266] cfs_rq[1]:/autogroup-52
[68145.181267]   .exec_clock                    : 0.000000
[68145.181268]   .MIN_vruntime                  : 0.000001
[68145.181270]   .min_vruntime                  : 244250.987333
[68145.181271]   .max_vruntime                  : 0.000001
[68145.181273]   .spread                        : 0.000000
[68145.181274]   .spread0                       : -16695834.805347
[68145.181275]   .nr_spread_over                : 0
[68145.181277]   .nr_running                    : 0
[68145.181278]   .load                          : 0
[68145.181279]   .load_avg                      : 339.904928
[68145.181281]   .load_period                   : 5.831847
[68145.181282]   .load_contrib                  : 58
[68145.181283]   .load_tg                       : 58
[68145.181285]   .se->exec_start                : 68145155.473960
[68145.181287]   .se->vruntime                  : 19124277.987459
[68145.181288]   .se->sum_exec_runtime          : 244497.514450
[68145.181289]   .se->load.weight               : 2
[68145.181291] 
[68145.181292] cfs_rq[1]:/autogroup-751
[68145.181293]   .exec_clock                    : 0.000000
[68145.181294]   .MIN_vruntime                  : 0.000001
[68145.181296]   .min_vruntime                  : 52.906532
[68145.181297]   .max_vruntime                  : 0.000001
[68145.181299]   .spread                        : 0.000000
[68145.181300]   .spread0                       : -16940032.886148
[68145.181301]   .nr_spread_over                : 0
[68145.181303]   .nr_running                    : 0
[68145.181304]   .load                          : 0
[68145.181305]   .load_avg                      : 329.284768
[68145.181307]   .load_period                   : 5.489338
[68145.181308]   .load_contrib                  : 59
[68145.181309]   .load_tg                       : 59
[68145.181311]   .se->exec_start                : 68145151.059469
[68145.181312]   .se->vruntime                  : 19124277.756132
[68145.181314]   .se->sum_exec_runtime          : 44.955108
[68145.181315]   .se->load.weight               : 2
[68145.181316] 
[68145.181317] cfs_rq[1]:/autogroup-105
[68145.181318]   .exec_clock                    : 0.000000
[68145.181320]   .MIN_vruntime                  : 0.000001
[68145.181321]   .min_vruntime                  : 586685.915590
[68145.181323]   .max_vruntime                  : 0.000001
[68145.181324]   .spread                        : 0.000000
[68145.181326]   .spread0                       : -16353399.877090
[68145.181327]   .nr_spread_over                : 0
[68145.181328]   .nr_running                    : 0
[68145.181329]   .load                          : 0
[68145.181331]   .load_avg                      : 13.583718
[68145.181332]   .load_period                   : 5.192153
[68145.181333]   .load_contrib                  : 2
[68145.181335]   .load_tg                       : 515
[68145.181336]   .se->exec_start                : 68145167.025810
[68145.181338]   .se->vruntime                  : 19124277.682220
[68145.181339]   .se->sum_exec_runtime          : 451469.872335
[68145.181341]   .se->load.weight               : 2
[68145.181342] 
[68145.181342] cfs_rq[1]:/autogroup-85
[68145.181344]   .exec_clock                    : 0.000000
[68145.181345]   .MIN_vruntime                  : 0.000001
[68145.181347]   .min_vruntime                  : 457615.974873
[68145.181348]   .max_vruntime                  : 0.000001
[68145.181350]   .spread                        : 0.000000
[68145.181351]   .spread0                       : -16482469.817807
[68145.181353]   .nr_spread_over                : 0
[68145.181354]   .nr_running                    : 0
[68145.181355]   .load                          : 0
[68145.181357]   .load_avg                      : 22.432540
[68145.181358]   .load_period                   : 9.882693
[68145.181360]   .load_contrib                  : 2
[68145.181361]   .load_tg                       : 14
[68145.181362]   .se->exec_start                : 68145166.823349
[68145.181364]   .se->vruntime                  : 19124277.441135
[68145.181366]   .se->sum_exec_runtime          : 496686.043300
[68145.181367]   .se->load.weight               : 2
[68145.181368] 
[68145.181369] cfs_rq[1]:/autogroup-99
[68145.181370]   .exec_clock                    : 0.000000
[68145.181372]   .MIN_vruntime                  : 0.000001
[68145.181373]   .min_vruntime                  : 886895.365957
[68145.181374]   .max_vruntime                  : 0.000001
[68145.181376]   .spread                        : 0.000000
[68145.181377]   .spread0                       : -16053191.714824
[68145.181379]   .nr_spread_over                : 0
[68145.181380]   .nr_running                    : 0
[68145.181381]   .load                          : 0
[68145.181382]   .load_avg                      : 19.102799
[68145.181384]   .load_period                   : 6.184140
[68145.181385]   .load_contrib                  : 3
[68145.181386]   .load_tg                       : 20
[68145.181388]   .se->exec_start                : 68145174.705985
[68145.181389]   .se->vruntime                  : 19124286.557447
[68145.181391]   .se->sum_exec_runtime          : 563791.520677
[68145.181392]   .se->load.weight               : 2
[68145.181394] 
[68145.181394] cfs_rq[1]:/autogroup-345
[68145.181395]   .exec_clock                    : 0.000000
[68145.181397]   .MIN_vruntime                  : 0.000001
[68145.181398]   .min_vruntime                  : 390277.323389
[68145.181400]   .max_vruntime                  : 0.000001
[68145.181401]   .spread                        : 0.000000
[68145.181402]   .spread0                       : -16549809.757392
[68145.181404]   .nr_spread_over                : 0
[68145.181405]   .nr_running                    : 0
[68145.181406]   .load                          : 0
[68145.181408]   .load_avg                      : 3.038718
[68145.181409]   .load_period                   : 9.886871
[68145.181410]   .load_contrib                  : 0
[68145.181412]   .load_tg                       : 431
[68145.181413]   .se->exec_start                : 68145154.709399
[68145.181415]   .se->vruntime                  : 19124277.152141
[68145.181416]   .se->sum_exec_runtime          : 488635.600464
[68145.181418]   .se->load.weight               : 2
[68145.181419] 
[68145.181419] cfs_rq[1]:/
[68145.181420]   .exec_clock                    : 0.000000
[68145.181422]   .MIN_vruntime                  : 0.000001
[68145.181423]   .min_vruntime                  : 19124304.595285
[68145.181425]   .max_vruntime                  : 0.000001
[68145.181426]   .spread                        : 0.000000
[68145.181428]   .spread0                       : 2184217.514504
[68145.181429]   .nr_spread_over                : 0
[68145.181430]   .nr_running                    : 1
[68145.181432]   .load                          : 832
[68145.181433]   .load_avg                      : 0.000000
[68145.181434]   .load_period                   : 0.000000
[68145.181436]   .load_contrib                  : 0
[68145.181437]   .load_tg                       : 0
[68145.181438] 
[68145.181438] rt_rq[1]:
[68145.181439]   .rt_nr_running                 : 0
[68145.181441]   .rt_throttled                  : 0
[68145.181442]   .rt_time                       : 0.002810
[68145.181443]   .rt_runtime                    : 950.000000
[68145.181445] 
[68145.181446] runnable tasks:
[68145.181446]             task   PID         tree-key  switches  prio     exec-runtime         sum-exec        sum-sleep
[68145.181447] ----------------------------------------------------------------------------------------------------------
[68145.181467] R           bash  8880       804.468551      1051   123               0               0               0.000000               0.000000               0.000000 /autogroup-728
[68145.181474] 
[68145.181474] cpu#2, 2893.340 MHz
[68145.181475]   .nr_running                    : 1
[68145.181477]   .load                          : 1006
[68145.181478]   .nr_switches                   : 281539588
[68145.181479]   .nr_load_updates               : 5389825
[68145.181481]   .nr_uninterruptible            : 4408
[68145.181482]   .next_balance                  : 4311.972269
[68145.181484]   .curr->pid                     : 8272
[68145.181485]   .clock                         : 68145181.347401
[68145.181486]   .cpu_load[0]                   : 1006
[68145.181488]   .cpu_load[1]                   : 1003
[68145.181489]   .cpu_load[2]                   : 979
[68145.181490]   .cpu_load[3]                   : 940
[68145.181491]   .cpu_load[4]                   : 926
[68145.181493] 
[68145.181493] cfs_rq[2]:/autogroup-53
[68145.181495]   .exec_clock                    : 0.000000
[68145.181496]   .MIN_vruntime                  : 0.000001
[68145.181498]   .min_vruntime                  : 14748239.445043
[68145.181499]   .max_vruntime                  : 0.000001
[68145.181500]   .spread                        : 0.000000
[68145.181502]   .spread0                       : -2191847.635738
[68145.181503]   .nr_spread_over                : 0
[68145.181505]   .nr_running                    : 1
[68145.181506]   .load                          : 1024
[68145.181507]   .load_avg                      : 8875.504327
[68145.181509]   .load_period                   : 8.171060
[68145.181510]   .load_contrib                  : 1043
[68145.181511]   .load_tg                       : 1480
[68145.181513]   .se->exec_start                : 68145181.347401
[68145.181514]   .se->vruntime                  : 23834838.341563
[68145.181516]   .se->sum_exec_runtime          : 14308302.530944
[68145.181517]   .se->load.weight               : 1006
[68145.181518] 
[68145.181519] cfs_rq[2]:/
[68145.181520]   .exec_clock                    : 0.000000
[68145.181522]   .MIN_vruntime                  : 0.000001
[68145.181523]   .min_vruntime                  : 23834838.341563
[68145.181525]   .max_vruntime                  : 0.000001
[68145.181526]   .spread                        : 0.000000
[68145.181527]   .spread0                       : 6894751.260782
[68145.181529]   .nr_spread_over                : 0
[68145.181530]   .nr_running                    : 1
[68145.181531]   .load                          : 1006
[68145.181533]   .load_avg                      : 0.000000
[68145.181534]   .load_period                   : 0.000000
[68145.181535]   .load_contrib                  : 0
[68145.181536]   .load_tg                       : 0
[68145.181538] 
[68145.181538] rt_rq[2]:
[68145.181539]   .rt_nr_running                 : 0
[68145.181540]   .rt_throttled                  : 0
[68145.181542]   .rt_time                       : 0.002557
[68145.181543]   .rt_runtime                    : 950.000000
[68145.181545] 
[68145.181545] runnable tasks:
[68145.181546]             task   PID         tree-key  switches  prio     exec-runtime         sum-exec        sum-sleep
[68145.181547] ----------------------------------------------------------------------------------------------------------
[68145.181556] Rplugin-containe  8272  14748239.445043  94987642   120               0               0               0.000000               0.000000               0.000000 /autogroup-53
[68145.181571] 
[68145.181571] cpu#3, 2893.340 MHz
[68145.181573]   .nr_running                    : 0
[68145.181574]   .load                          : 0
[68145.181575]   .nr_switches                   : 284183655
[68145.181576]   .nr_load_updates               : 5361929
[68145.181578]   .nr_uninterruptible            : -2384
[68145.181579]   .next_balance                  : 4311.972267
[68145.181581]   .curr->pid                     : 0
[68145.181582]   .clock                         : 68145181.363470
[68145.181583]   .cpu_load[0]                   : 923
[68145.181584]   .cpu_load[1]                   : 466
[68145.181586]   .cpu_load[2]                   : 287
[68145.181587]   .cpu_load[3]                   : 227
[68145.181588]   .cpu_load[4]                   : 173
[68145.181590] 
[68145.181590] cfs_rq[3]:/autogroup-345
[68145.181591]   .exec_clock                    : 0.000000
[68145.181593]   .MIN_vruntime                  : 0.000001
[68145.181594]   .min_vruntime                  : 271574.282861
[68145.181596]   .max_vruntime                  : 0.000001
[68145.181597]   .spread                        : 0.000000
[68145.181598]   .spread0                       : -16668512.797920
[68145.181600]   .nr_spread_over                : 0
[68145.181601]   .nr_running                    : 0
[68145.181602]   .load                          : 0
[68145.181604]   .load_avg                      : 2929.355520
[68145.181605]   .load_period                   : 8.069625
[68145.181606]   .load_contrib                  : 363
[68145.181608]   .load_tg                       : 431
[68145.181609]   .se->exec_start                : 68145181.362481
[68145.181611]   .se->vruntime                  : 24275013.527067
[68145.181612]   .se->sum_exec_runtime          : 337338.760030
[68145.181614]   .se->load.weight               : 2
[68145.181615] 
[68145.181616] cfs_rq[3]:/autogroup-99
[68145.181617]   .exec_clock                    : 0.000000
[68145.181618]   .MIN_vruntime                  : 0.000001
[68145.181620]   .min_vruntime                  : 709630.528938
[68145.181621]   .max_vruntime                  : 0.000001
[68145.181623]   .spread                        : 0.000000
[68145.181624]   .spread0                       : -16230456.551843
[68145.181625]   .nr_spread_over                : 0
[68145.181627]   .nr_running                    : 0
[68145.181628]   .load                          : 0
[68145.181629]   .load_avg                      : 80.933248
[68145.181631]   .load_period                   : 7.593330
[68145.181632]   .load_contrib                  : 10
[68145.181633]   .load_tg                       : 20
[68145.181635]   .se->exec_start                : 68145172.520017
[68145.181636]   .se->vruntime                  : 24275013.536371
[68145.181638]   .se->sum_exec_runtime          : 455298.062310
[68145.181639]   .se->load.weight               : 2
[68145.181640] 
[68145.181641] cfs_rq[3]:/autogroup-53
[68145.181642]   .exec_clock                    : 0.000000
[68145.181643]   .MIN_vruntime                  : 0.000001
[68145.181645]   .min_vruntime                  : 15285191.917887
[68145.181646]   .max_vruntime                  : 0.000001
[68145.181648]   .spread                        : 0.000000
[68145.181649]   .spread0                       : -1654895.162894
[68145.181651]   .nr_spread_over                : 0
[68145.181652]   .nr_running                    : 0
[68145.181653]   .load                          : 0
[68145.181654]   .load_avg                      : 27.589206
[68145.181656]   .load_period                   : 7.598913
[68145.181657]   .load_contrib                  : 3
[68145.181658]   .load_tg                       : 1480
[68145.181660]   .se->exec_start                : 68145171.243075
[68145.181661]   .se->vruntime                  : 24275021.987757
[68145.181663]   .se->sum_exec_runtime          : 15157215.916175
[68145.181664]   .se->load.weight               : 2
[68145.181666] 
[68145.181666] cfs_rq[3]:/
[68145.181667]   .exec_clock                    : 0.000000
[68145.181669]   .MIN_vruntime                  : 0.000001
[68145.181670]   .min_vruntime                  : 24275021.987757
[68145.181672]   .max_vruntime                  : 0.000001
[68145.181673]   .spread                        : 0.000000
[68145.181674]   .spread0                       : 7334934.906976
[68145.181675]   .nr_spread_over                : 0
[68145.181676]   .nr_running                    : 0
[68145.181677]   .load                          : 0
[68145.181678]   .load_avg                      : 0.000000
[68145.181680]   .load_period                   : 0.000000
[68145.181681]   .load_contrib                  : 0
[68145.181683]   .load_tg                       : 0
[68145.181684] 
[68145.181685] rt_rq[3]:
[68145.181686]   .rt_nr_running                 : 0
[68145.181687]   .rt_throttled                  : 0
[68145.181689]   .rt_time                       : 0.004236
[68145.181691]   .rt_runtime                    : 950.000000
[68145.181693] 
[68145.181693] runnable tasks:
[68145.181694]             task   PID         tree-key  switches  prio     exec-runtime         sum-exec        sum-sleep
[68145.181695] ----------------------------------------------------------------------------------------------------------
[68145.181712] R  btrfs-endio-3 13855  24275012.987757    163715   120               0               0               0.000000               0.000000               0.000000 /
[68145.181719] 
[68151.449220] kworker/0:0: page allocation failure: order:1, mode:0x4020
[68151.449225] Pid: 9004, comm: kworker/0:0 Tainted: G         C O 3.4.4-amd64-preempt-noide-20120410 #1
[68151.449227] Call Trace:
[68151.449228]  <IRQ>  [<ffffffff810bf8d8>] ? warn_alloc_failed+0x11f/0x132
[68151.449239]  [<ffffffff810453ff>] ? __mod_timer+0x13a/0x14c
[68151.449243]  [<ffffffff810c26e2>] ? __alloc_pages_nodemask+0x72f/0x7df
[68151.449256]  [<ffffffff810ecf10>] ? alloc_pages_current+0xc7/0xe4
[68151.449266]  [<ffffffffa04ca247>] ? iwlagn_rx_allocate+0x97/0x24d [iwlwifi]
[68151.449273]  [<ffffffffa04cb27b>] ? iwl_irq_tasklet+0x6e4/0x838 [iwlwifi]
[68151.449279]  [<ffffffff8103f6e9>] ? tasklet_action+0x79/0xc8
[68151.449283]  [<ffffffff8103f205>] ? __do_softirq+0xc0/0x188
[68151.449288]  [<ffffffff8136879c>] ? call_softirq+0x1c/0x30
[68151.449293]  [<ffffffff8100fcf9>] ? do_softirq+0x3c/0x7b
[68151.449297]  [<ffffffff8103f4f3>] ? irq_exit+0x3d/0xa5
[68151.449301]  [<ffffffff8100fa1e>] ? do_IRQ+0x81/0x97
[68151.449306]  [<ffffffff81362e2a>] ? common_interrupt+0x6a/0x6a
[68151.449308]  <EOI>  [<ffffffff811b7bca>] ? idr_get_next+0x7d/0x92
[68151.449318]  [<ffffffff81088d74>] ? css_get_next+0x59/0x97
[68151.449322]  [<ffffffff810fc115>] ? mem_cgroup_iter+0x109/0x1ab
[68151.449328]  [<ffffffff810cb31a>] ? shrink_zone+0x89/0x9b
[68151.449333]  [<ffffffff810cb73c>] ? do_try_to_free_pages+0x1e4/0x434
[68151.449338]  [<ffffffff810cbc11>] ? try_to_free_pages+0xb3/0xf9
[68151.449343]  [<ffffffff810c24a2>] ? __alloc_pages_nodemask+0x4ef/0x7df
[68151.449349]  [<ffffffff810ecf10>] ? alloc_pages_current+0xc7/0xe4
[68151.449356]  [<ffffffffa04ca247>] ? iwlagn_rx_allocate+0x97/0x24d [iwlwifi]
[68151.449363]  [<ffffffffa04ca81e>] ? iwlagn_rx_replenish+0x3a/0x3a [iwlwifi]
[68151.449369]  [<ffffffffa04ca7fc>] ? iwlagn_rx_replenish+0x18/0x3a [iwlwifi]
[68151.449373]  [<ffffffff8104ea7d>] ? process_one_work+0x16d/0x298
[68151.449379]  [<ffffffff8104f4d9>] ? worker_thread+0xc2/0x145
[68151.449383]  [<ffffffff8104f417>] ? manage_workers.isra.23+0x15b/0x15b
[68151.449386]  [<ffffffff81052788>] ? kthread+0x7d/0x85
[68151.449390]  [<ffffffff813686a4>] ? kernel_thread_helper+0x4/0x10
[68151.449395]  [<ffffffff8105270b>] ? kthread_freezable_should_stop+0x37/0x37
[68151.449399]  [<ffffffff813686a0>] ? gs_change+0x13/0x13
[68151.449401] Mem-Info:
[68151.449403] Node 0 DMA per-cpu:
[68151.449406] CPU    0: hi:    0, btch:   1 usd:   0
[68151.449409] CPU    1: hi:    0, btch:   1 usd:   0
[68151.449411] CPU    2: hi:    0, btch:   1 usd:   0
[68151.449413] CPU    3: hi:    0, btch:   1 usd:   0
[68151.449415] Node 0 DMA32 per-cpu:
[68151.449418] CPU    0: hi:  186, btch:  31 usd:  40
[68151.449421] CPU    1: hi:  186, btch:  31 usd:   0
[68151.449423] CPU    2: hi:  186, btch:  31 usd:   0
[68151.449425] CPU    3: hi:  186, btch:  31 usd:   0
[68151.449427] Node 0 Normal per-cpu:
[68151.449430] CPU    0: hi:  186, btch:  31 usd:  34
[68151.449432] CPU    1: hi:  186, btch:  31 usd:  28
[68151.449434] CPU    2: hi:  186, btch:  31 usd:   0
[68151.449436] CPU    3: hi:  186, btch:  31 usd:   0
[68151.449441] active_anon:722200 inactive_anon:259222 isolated_anon:0
[68151.449442]  active_file:278984 inactive_file:316208 isolated_file:0
[68151.449443]  unevictable:1 dirty:180847 writeback:0 unstable:0
[68151.449444]  free:40350 slab_reclaimable:31197 slab_unreclaimable:22223
[68151.449446]  mapped:306850 shmem:33690 pagetables:19223 bounce:0
[68151.449449] Node 0 DMA free:15904kB min:132kB low:164kB high:196kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15680kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? yes
[68151.449461] lowmem_reserve[]: 0 3257 7777 7777
[68151.449468] Node 0 DMA32 free:81536kB min:28252kB low:35312kB high:42376kB active_anon:1292064kB inactive_anon:514608kB active_file:593820kB inactive_file:686892kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:3335900kB mlocked:0kB dirty:332952kB writeback:0kB mapped:185252kB shmem:48512kB slab_reclaimable:40220kB slab_unreclaimable:22240kB kernel_stack:1600kB pagetables:15604kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
[68151.449480] lowmem_reserve[]: 0 0 4519 4519
[68151.449486] Node 0 Normal free:63960kB min:39196kB low:48992kB high:58792kB active_anon:1596736kB inactive_anon:522280kB active_file:522116kB inactive_file:577940kB unevictable:4kB isolated(anon):0kB isolated(file):0kB present:4627820kB mlocked:4kB dirty:390436kB writeback:0kB mapped:1042148kB shmem:86248kB slab_reclaimable:84568kB slab_unreclaimable:66652kB kernel_stack:4920kB pagetables:61288kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:34 all_unreclaimable? no
[68151.449500] lowmem_reserve[]: 0 0 0 0
[68151.449506] Node 0 DMA: 0*4kB 0*8kB 0*16kB 1*32kB 2*64kB 1*128kB 1*256kB 0*512kB 1*1024kB 1*2048kB 3*4096kB = 15904kB
[68151.449521] Node 0 DMA32: 19072*4kB 12*8kB 0*16kB 1*32kB 1*64kB 0*128kB 0*256kB 0*512kB 1*1024kB 0*2048kB 1*4096kB = 81600kB
[68151.449537] Node 0 Normal: 14790*4kB 86*8kB 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 1*4096kB = 63944kB
[68151.449552] 664276 total pagecache pages
[68151.449555] 35263 pages in swap cache
[68151.449557] Swap cache stats: add 373242, delete 337979, find 73039/82302
[68151.449560] Free swap  = 3029760kB
[68151.449562] Total swap = 4106248kB
[68151.475328] 2057712 pages RAM
[68151.475330] 340431 pages reserved
[68151.475331] 589330 pages shared
[68151.475332] 1252827 pages non-shared
[68151.475337] iwlwifi 0000:03:00.0: Failed to alloc_pages with GFP_ATOMIC.Only 5 free buffers remaining.
[69042.086472] btrfs: unlinked 9 orphans
[69042.307617] btrfs: unlinked 78 orphans
[69042.434375] btrfs: unlinked 1 orphans
[69042.598993] btrfs: unlinked 27 orphans
[69647.593815] nfs: server gargamel not responding, still trying
[69647.593830] nfs: server gargamel not responding, still trying
[69647.593835] nfs: server gargamel not responding, still trying
[69647.593840] nfs: server gargamel not responding, still trying
[69647.593845] nfs: server gargamel not responding, still trying
(...)
[69658.194846] nfs: server gargamel not responding, still trying
[69658.194848] nfs: server gargamel not responding, still trying
[69658.194849] nfs: server gargamel not responding, still trying
[69671.113619] nfs: server gargamel OK
[69671.456875] nfs: server gargamel OK
[69673.105176] SysRq : Show Blocked State
[69673.105182]   task                        PC stack   pid father
[69673.105325] automount       D ffff88020e94e810     0 10363      1 0x00000080
[69673.105330]  ffff88020e94e810 0000000000000086 ffff88021142f0c0 ffff880002250140
[69673.105334]  0000000000013680 ffff8800099e3fd8 ffff8800099e3fd8 ffff88020e94e810
[69673.105338]  ffff88021142f0c0 7fffffffffffffff 7fffffffffffffff ffff8800099e3480
[69673.105342] Call Trace:
[69673.105350]  [<ffffffff81360e2b>] ? schedule_timeout+0x2c/0xdb
[69673.105355]  [<ffffffff8104e736>] ? __queue_work+0x22e/0x256
[69673.105360]  [<ffffffff8105dd51>] ? get_parent_ip+0x9/0x1b
[69673.105363]  [<ffffffff8136583e>] ? sub_preempt_count+0x83/0x94
[69673.105366]  [<ffffffff81361967>] ? wait_for_common+0x9a/0x112
[69673.105369]  [<ffffffff8105f46a>] ? try_to_wake_up+0x1b4/0x1b4
[69673.105372]  [<ffffffff8104c785>] ? call_usermodehelper_exec+0x9b/0xdd
[69673.105376]  [<ffffffff811679f6>] ? call_sbin_request_key+0x265/0x2d1
[69673.105385]  [<ffffffff81167d75>] ? request_key_and_link+0x313/0x39a
[69673.105389]  [<ffffffff810f3e0c>] ? ____cache_alloc+0x19/0x205
[69673.105393]  [<ffffffff81167eae>] ? request_key+0x37/0x69
[69673.105396]  [<ffffffff810570aa>] ? override_creds+0x1f/0x32
[69673.105411]  [<ffffffffa070196f>] ? nfs_idmap_request_key+0xe6/0x18b [nfs]
[69673.105422]  [<ffffffffa0701b25>] ? nfs_idmap_lookup_id+0x53/0xca [nfs]
[69673.105432]  [<ffffffffa06f8389>] ? decode_getfattr_attrs+0x8de/0xcf4 [nfs]
[69673.105436]  [<ffffffff81061521>] ? set_next_entity+0x32/0x52
[69673.105445]  [<ffffffffa06fdcb0>] ? nfs4_xdr_dec_lookup+0x71/0x71 [nfs]
[69673.105454]  [<ffffffffa06fd665>] ? decode_getfattr_generic.constprop.83+0x7f/0xcd [nfs]
[69673.105464]  [<ffffffffa06fdcb0>] ? nfs4_xdr_dec_lookup+0x71/0x71 [nfs]
[69673.105475]  [<ffffffffa06fdcf9>] ? nfs4_xdr_dec_getattr+0x49/0x50 [nfs]
[69673.105484]  [<ffffffffa069be59>] ? rpc_make_runnable+0x6a/0x6a [sunrpc]
[69673.105492]  [<ffffffffa069d7f3>] ? rpcauth_unwrap_resp+0x9b/0xa8 [sunrpc]
[69673.105498]  [<ffffffffa069be59>] ? rpc_make_runnable+0x6a/0x6a [sunrpc]
[69673.105504]  [<ffffffffa0693f70>] ? call_decode+0x673/0x708 [sunrpc]
[69673.105509]  [<ffffffffa06938fd>] ? call_bc_transmit+0x109/0x109 [sunrpc]
[69673.105514]  [<ffffffffa06938fd>] ? call_bc_transmit+0x109/0x109 [sunrpc]
[69673.105520]  [<ffffffffa069c6a1>] ? __rpc_execute+0xbe/0x320 [sunrpc]
[69673.105524]  [<ffffffff81052c0e>] ? wake_up_bit+0xd/0x1e
[69673.105530]  [<ffffffffa06954e7>] ? rpc_run_task+0x77/0x7e [sunrpc]
[69673.105535]  [<ffffffffa06955dd>] ? rpc_call_sync+0x42/0x62 [sunrpc]
[69673.105545]  [<ffffffffa06ef1d4>] ? _nfs4_proc_getattr+0x8b/0x94 [nfs]
[69673.105555]  [<ffffffffa06f2629>] ? nfs4_proc_getattr+0x2b/0x48 [nfs]
[69673.105562]  [<ffffffffa06de0f5>] ? __nfs_revalidate_inode+0xab/0x194 [nfs]
[69673.105569]  [<ffffffffa06d94c3>] ? nfs_check_verifier+0x42/0x60 [nfs]
[69673.105575]  [<ffffffffa06d9607>] ? nfs_lookup_revalidate+0xf0/0x353 [nfs]
[69673.105582]  [<ffffffffa06db091>] ? nfs_open_revalidate+0x1bc/0x1cb [nfs]
[69673.105585]  [<ffffffff8136583e>] ? sub_preempt_count+0x83/0x94
[69673.105588]  [<ffffffff81362b37>] ? _raw_spin_unlock+0x25/0x31
[69673.105592]  [<ffffffff81113e7f>] ? __d_lookup+0xd1/0xe3
[69673.105594]  [<ffffffff8110bf3f>] ? inode_permission+0x65/0xda
[69673.105597]  [<ffffffff8110ba37>] ? walk_component+0x202/0x3c8
[69673.105601]  [<ffffffff811be7eb>] ? put_dec+0x2e/0x33
[69673.105604]  [<ffffffff8110c81a>] ? path_lookupat+0x7c/0x29b
[69673.105608]  [<ffffffff810fcc74>] ? __mem_cgroup_uncharge_common+0x100/0x1fd
[69673.105612]  [<ffffffff810c0392>] ? get_pageblock_flags_group+0x3a/0x6e
[69673.105616]  [<ffffffffa07bfb60>] ? find_autofs_mount+0x9f/0x9f [autofs4]
[69673.105619]  [<ffffffff8110ca55>] ? do_path_lookup+0x1c/0x87
[69673.105622]  [<ffffffffa07bfb60>] ? find_autofs_mount+0x9f/0x9f [autofs4]
[69673.105625]  [<ffffffff8110cc79>] ? kern_path+0x1d/0x3a
[69673.105629]  [<ffffffffa07c0020>] ? _autofs_dev_ioctl+0x82/0x32e [autofs4]
[69673.105632]  [<ffffffff810f49aa>] ? __kmalloc_track_caller+0xf6/0x108
[69673.105636]  [<ffffffffa07bfbb5>] ? autofs_dev_ioctl_ismountpoint+0x55/0x137 [autofs4]
[69673.105640]  [<ffffffffa07c0239>] ? _autofs_dev_ioctl+0x29b/0x32e [autofs4]
[69673.105644]  [<ffffffffa07c02d7>] ? autofs_dev_ioctl+0xb/0xf [autofs4]
[69673.105647]  [<ffffffff81110091>] ? do_vfs_ioctl+0x403/0x444
[69673.105650]  [<ffffffff810dcecf>] ? do_munmap+0x2da/0x2f3
[69673.105653]  [<ffffffff8111011d>] ? sys_ioctl+0x4b/0x72
[69673.105657]  [<ffffffff8136723d>] ? system_call_fastpath+0x1a/0x1f
[69673.105666] mc              D ffff8802134ff750     0  6526  21734 0x00000080
[69673.105670]  ffff8802134ff750 0000000000000086 0000000000000008 ffff880175d69890
[69673.105673]  0000000000013680 ffff88001a98dfd8 ffff88001a98dfd8 ffff8802134ff750
[69673.105677]  ffff8801a299ede0 ffff88021e213680 ffff8802134ff750 ffffffff810bb429
[69673.105681] Call Trace:
[69673.105684]  [<ffffffff810bb429>] ? __lock_page+0x66/0x66
[69673.105687]  [<ffffffff81362059>] ? io_schedule+0x55/0x6b
[69673.105690]  [<ffffffff810bb42f>] ? sleep_on_page+0x6/0xa
[69673.105692]  [<ffffffff81361054>] ? __wait_on_bit+0x3e/0x71
[69673.105696]  [<ffffffff810bb577>] ? wait_on_page_bit+0x6e/0x73
[69673.105699]  [<ffffffff81052e69>] ? autoremove_wake_function+0x2a/0x2a
[69673.105731]  [<ffffffffa0199d18>] ? read_extent_buffer_pages+0x1fb/0x24b [btrfs]
[69673.105745]  [<ffffffffa0175112>] ? lock_page+0x11/0x11 [btrfs]
[69673.105758]  [<ffffffffa0176e69>] ? btree_read_extent_buffer_pages.constprop.110+0x5c/0xf3 [btrfs]
[69673.105771]  [<ffffffffa0177412>] ? read_tree_block+0x25/0x2d [btrfs]
[69673.105782]  [<ffffffffa0163087>] ? read_block_for_search.isra.32+0x2af/0x2e7 [btrfs]
[69673.105793]  [<ffffffffa01649c2>] ? btrfs_search_slot+0x48d/0x659 [btrfs]
[69673.105805]  [<ffffffffa0172d7d>] ? btrfs_lookup_csum+0x66/0x106 [btrfs]
[69673.105808]  [<ffffffff8136583e>] ? sub_preempt_count+0x83/0x94
[69673.105820]  [<ffffffffa0172fad>] ? __btrfs_lookup_bio_sums+0x190/0x30a [btrfs]
[69673.105834]  [<ffffffffa017d021>] ? btrfs_submit_bio_hook+0xa9/0x12a [btrfs]
[69673.105849]  [<ffffffffa0194af3>] ? submit_one_bio+0x82/0xb9 [btrfs]
[69673.105863]  [<ffffffffa01977c7>] ? submit_extent_page.isra.26+0x10a/0x1b2 [btrfs]
[69673.105876]  [<ffffffffa0196d91>] ? repair_io_failure+0x18e/0x18e [btrfs]
[69673.105889]  [<ffffffffa0197cf5>] ? __extent_read_full_page+0x424/0x4be [btrfs]
[69673.105902]  [<ffffffffa0196d91>] ? repair_io_failure+0x18e/0x18e [btrfs]
[69673.105916]  [<ffffffffa017eca4>] ? btrfs_writepage+0x4b/0x4b [btrfs]
[69673.105920]  [<ffffffff810c56b3>] ? __lru_cache_add+0x7b/0x8d
[69673.105934]  [<ffffffffa01988e8>] ? extent_readpages+0xaf/0xf3 [btrfs]
[69673.105947]  [<ffffffffa017eca4>] ? btrfs_writepage+0x4b/0x4b [btrfs]
[69673.105950]  [<ffffffff810c4640>] ? __do_page_cache_readahead+0x139/0x1de
[69673.105953]  [<ffffffff810c493d>] ? ra_submit+0x19/0x1d
[69673.105956]  [<ffffffff810bca23>] ? generic_file_aio_read+0x2b0/0x5d3
[69673.105960]  [<ffffffff81110627>] ? set_fd_set+0x23/0x26
[69673.105963]  [<ffffffff8110252a>] ? do_sync_read+0xab/0xe3
[69673.105967]  [<ffffffff81362b73>] ? _raw_spin_unlock_irqrestore+0x30/0x3e
[69673.105971]  [<ffffffff810595b4>] ? __wake_up+0x35/0x46
[69673.105973]  [<ffffffff81102c22>] ? vfs_read+0x9f/0xe6
[69673.105977]  [<ffffffff81103c2d>] ? fget_light+0x33/0x8d
[69673.105979]  [<ffffffff81102cae>] ? sys_read+0x45/0x6b
[69673.105983]  [<ffffffff8136723d>] ? system_call_fastpath+0x1a/0x1f
[69673.105986] mplayer         D ffff8802017ef040     0  8109    538 0x00000080
[69673.105990]  ffff8802017ef040 0000000000000086 ffff88012ff54700 ffff880121d77080
[69673.105993]  0000000000013680 ffff88013fc0dfd8 ffff88013fc0dfd8 ffff8802017ef040
[69673.105996]  0000000000000000 ffff88013fc0d890 ffff88021e5b5208 ffffffff8116761a
[69673.106000] Call Trace:
[69673.106003]  [<ffffffff8116761a>] ? wait_for_key_construction+0x59/0x59
[69673.106005]  [<ffffffff81167620>] ? key_wait_bit+0x6/0xa
[69673.106008]  [<ffffffff81361054>] ? __wait_on_bit+0x3e/0x71
[69673.106011]  [<ffffffff813610f6>] ? out_of_line_wait_on_bit+0x6f/0x78
[69673.106014]  [<ffffffff8116761a>] ? wait_for_key_construction+0x59/0x59
[69673.106017]  [<ffffffff81052e69>] ? autoremove_wake_function+0x2a/0x2a
[69673.106020]  [<ffffffff811675fb>] ? wait_for_key_construction+0x3a/0x59
[69673.106022]  [<ffffffff81167ec3>] ? request_key+0x4c/0x69
[69673.106025]  [<ffffffff810570aa>] ? override_creds+0x1f/0x32
[69673.106036]  [<ffffffffa070196f>] ? nfs_idmap_request_key+0xe6/0x18b [nfs]
[69673.106046]  [<ffffffffa0701b25>] ? nfs_idmap_lookup_id+0x53/0xca [nfs]
[69673.106056]  [<ffffffffa06f8389>] ? decode_getfattr_attrs+0x8de/0xcf4 [nfs]
[69673.106059]  [<ffffffff81061521>] ? set_next_entity+0x32/0x52
[69673.106069]  [<ffffffffa06fdcb0>] ? nfs4_xdr_dec_lookup+0x71/0x71 [nfs]
[69673.106078]  [<ffffffffa06fd665>] ? decode_getfattr_generic.constprop.83+0x7f/0xcd [nfs]
[69673.106087]  [<ffffffffa06fdcb0>] ? nfs4_xdr_dec_lookup+0x71/0x71 [nfs]
[69673.106096]  [<ffffffffa06fdcf9>] ? nfs4_xdr_dec_getattr+0x49/0x50 [nfs]
[69673.106104]  [<ffffffffa069be59>] ? rpc_make_runnable+0x6a/0x6a [sunrpc]
[69673.106111]  [<ffffffffa069d7f3>] ? rpcauth_unwrap_resp+0x9b/0xa8 [sunrpc]
[69673.106117]  [<ffffffffa069be59>] ? rpc_make_runnable+0x6a/0x6a [sunrpc]
[69673.106122]  [<ffffffffa0693f70>] ? call_decode+0x673/0x708 [sunrpc]
[69673.106128]  [<ffffffffa06938fd>] ? call_bc_transmit+0x109/0x109 [sunrpc]
[69673.106133]  [<ffffffffa06938fd>] ? call_bc_transmit+0x109/0x109 [sunrpc]
[69673.106139]  [<ffffffffa069c6a1>] ? __rpc_execute+0xbe/0x320 [sunrpc]
[69673.106141]  [<ffffffff81052c0e>] ? wake_up_bit+0xd/0x1e
[69673.106147]  [<ffffffffa06954e7>] ? rpc_run_task+0x77/0x7e [sunrpc]
[69673.106152]  [<ffffffffa06955dd>] ? rpc_call_sync+0x42/0x62 [sunrpc]
[69673.106162]  [<ffffffffa06ef1d4>] ? _nfs4_proc_getattr+0x8b/0x94 [nfs]
[69673.106172]  [<ffffffffa06f2629>] ? nfs4_proc_getattr+0x2b/0x48 [nfs]
[69673.106179]  [<ffffffffa06de0f5>] ? __nfs_revalidate_inode+0xab/0x194 [nfs]
[69673.106186]  [<ffffffffa06de21a>] ? nfs_revalidate_mapping+0x3c/0xf5 [nfs]
[69673.106193]  [<ffffffffa06db88d>] ? nfs_file_read+0x8d/0xce [nfs]
[69673.106196]  [<ffffffff8110252a>] ? do_sync_read+0xab/0xe3
[69673.106199]  [<ffffffff810f3bb4>] ? kfree+0x65/0x76
[69673.106204]  [<ffffffffa035f8d6>] ? snd_timer_user_read+0x10a/0x1b4 [snd_timer]
[69673.106207]  [<ffffffff81102c22>] ? vfs_read+0x9f/0xe6
[69673.106209]  [<ffffffff81103c2d>] ? fget_light+0x33/0x8d
[69673.106212]  [<ffffffff81102cae>] ? sys_read+0x45/0x6b
[69673.106215]  [<ffffffff8136723d>] ? system_call_fastpath+0x1a/0x1f

Thanks,
Marc
-- 
"A mouse is a device used to point at the xterm you want to type in" - A.S.R.
Microsoft is to operating systems ....
                                      .... what McDonalds is to gourmet cooking
Home page: http://marc.merlins.org/  

^ permalink raw reply

* Re: resurrecting tcphealth
From: Piotr Sawuk @ 2012-07-16 15:12 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel
In-Reply-To: <1342445557.2523.8.camel@bwh-desktop.uk.solarflarecom.com>

On Mo, 16.07.2012, 15:32, Ben Hutchings wrote:
> On Sat, 2012-07-14 at 09:56 +0200, Piotr Sawuk wrote:
>> On Sa, 14.07.2012, 03:31, valdis.kletnieks@vt.edu wrote:
>> > On Fri, 13 Jul 2012 16:55:44 -0700, Stephen Hemminger said:
>> >
>> >> >+			/* Course retransmit inefficiency- this packet has been received
>> >> twice. */
>> >> >+			tp->dup_pkts_recv++;
>> >> I don't understand that comment, could you use a better sentence
>> please?
>> >
>> > I think what was intended was:
>> >
>> > /* Curse you, retransmit inefficiency! This packet has been received at
>> least twice */
>> >
>>
>> LOL, no. I think "course retransmit" is short for "course-grained timeout
>> caused retransmit" but I can't be sure since I'm not the author of these
>> lines. I'll replace that comment with the non-shorthand version though.
>> however, I think the real comment here should be:
> [...]
>
> The word you are looking for is 'coarse' not 'course' (they are
> generally pronounced the same, to confuse you).

that was my first thought too.
but then I noticed the word "course" in the kernel's comments.
judging by context it describes the events of a round-trip.
so I guess Course-Grained means RTT-grained.
especially since this misspelling was consistent in the author's paper.

anyway, here is a new patch; I made some mistakes in my previous version.
also, I added the jiffies_to_msecs conversion no one dared to mention.
comments and suggestions are, as always, welcome:

diff -rub A/include/linux/tcp.h B/include/linux/tcp.h
--- A/include/linux/tcp.h	2012-07-08 02:23:56.000000000 +0200
+++ B/include/linux/tcp.h	2012-07-16 16:42:08.000000000 +0200
@@ -492,6 +492,17 @@
 	 * contains related tcp_cookie_transactions fields.
 	 */
 	struct tcp_cookie_values  *cookie_values;
+
+#ifdef CONFIG_TCPHEALTH
+	/*
+	 * TCP health monitoring counters.
+	 */
+	__u32	dup_acks_sent;
+	__u32	dup_pkts_recv;
+	__u32	acks_sent;
+	__u32	pkts_recv;
+	__u32	last_ack_sent;	/* Sequence number of the last ack we sent. */
+#endif
 };

 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
diff -rub A/net/ipv4/Kconfig B/net/ipv4/Kconfig
--- A/net/ipv4/Kconfig	2012-07-08 02:23:56.000000000 +0200
+++ B/net/ipv4/Kconfig	2012-07-16 11:56:15.000000000 +0200
@@ -619,6 +619,28 @@
 	default "reno" if DEFAULT_RENO
 	default "cubic"

+config TCPHEALTH
+	bool "TCP client-side health-statistics (/proc/net/tcphealth)"
+	default n
+	---help---
+	TCP Health Monitoring (established connections only):
+	 -Duplicate ACKs indicate there could be lost or reordered packets
+	  on the connection.
+	 -Duplicate Packets Received signal a slow and badly inefficient
+	  connection.
+	 -RttEst estimates how long future packets will take on a round trip
+	  over the connection.
+
+	Additionally you get the total number of sent ACKs and received Packets.
+	All these values are displayed separately for each connection.
+	If you are running a dedicated server you won't need this.
+	Duplicate ACKs refers only to those sent upon receiving a Packet.
+	A server most likely doesn't receive many Packets to count.
+	Hence for a server these statistics won't be meaningful,
+	especially since they are split into individual connections.
+
+	If you plan to investigate why some download is slow, say Y.
+
 config TCP_MD5SIG
 	bool "TCP: MD5 Signature Option support (RFC2385) (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
diff -rub A/net/ipv4/tcp_input.c B/net/ipv4/tcp_input.c
--- A/net/ipv4/tcp_input.c	2012-07-08 02:23:56.000000000 +0200
+++ B/net/ipv4/tcp_input.c	2012-07-16 16:45:17.000000000 +0200
@@ -4492,6 +4492,11 @@
 		}

 		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+#ifdef CONFIG_TCPHEALTH
+			/* Course-Grained Timeout caused retransmit inefficiency-
+			 * this packet has been received twice. */
+			tp->dup_pkts_recv++;
+#endif
 			SOCK_DEBUG(sk, "ofo packet was already received\n");
 			__skb_unlink(skb, &tp->out_of_order_queue);
 			__kfree_skb(skb);
@@ -4824,6 +4829,12 @@
 		return;
 	}

+#ifdef CONFIG_TCPHEALTH
+	/* A packet is a "duplicate" if it contains bytes we have already received. */
+	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+		tp->dup_pkts_recv++;
+#endif
+
 	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
 		/* A retransmit, 2nd most common case.  Force an immediate ack. */
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
@@ -5535,6 +5546,12 @@

 	tp->rx_opt.saw_tstamp = 0;

+#ifdef CONFIG_TCPHEALTH
+	/*
+	 *	total per-connection packet arrivals.
+	 */
+	tp->pkts_recv++;
+#endif
 	/*	pred_flags is 0xS?10 << 16 + snd_wnd
 	 *	if header_prediction is to be made
 	 *	'S' will always be tp->tcp_header_len >> 2
diff -rub A/net/ipv4/tcp_ipv4.c B/net/ipv4/tcp_ipv4.c
--- A/net/ipv4/tcp_ipv4.c	2012-07-08 02:23:56.000000000 +0200
+++ B/net/ipv4/tcp_ipv4.c	2012-07-16 16:29:05.000000000 +0200
@@ -2500,6 +2500,57 @@
 	return 0;
 }

+#ifdef CONFIG_TCPHEALTH
+/*
+ *	Output /proc/net/tcphealth
+ */
+#define LINESZ 128
+
+int tcp_health_seq_show(struct seq_file *seq, void *v)
+{
+	int len;
+	struct tcp_iter_state *st;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq,
+		"id   Local Address        Remote Address       RttEst(ms) AcksSent "
+		"DupAcksSent PktsRecv DupPktsRecv\n");
+		goto out;
+	}
+
+	/* Loop through established TCP connections */
+	st = seq->private;
+
+
+	if (st->state == TCP_SEQ_STATE_ESTABLISHED)
+	{
+		const struct tcp_sock *tp = tcp_sk(v);
+		const struct inet_sock *inet = inet_sk(v);
+
+		seq_printf(seq, "%d: %-21pI4:%u %-21pI4:%u "
+				"%8u %8lu %8lu %8lu %8lu%n",
+				st->num,
+				&inet->inet_rcv_saddr,
+				ntohs(inet->inet_sport),
+				&inet->inet_daddr,
+				ntohs(inet->inet_dport),
+				jiffies_to_msecs(tp->srtt),
+				tp->acks_sent,
+				tp->dup_acks_sent,
+				tp->pkts_recv,
+				tp->dup_pkts_recv,
+
+				&len
+			);
+
+		seq_printf(seq, "%*s\n", LINESZ - 1 - len, "");
+	}
+
+out:
+	return 0;
+}
+#endif /* CONFIG_TCPHEALTH */
+
 static const struct file_operations tcp_afinfo_seq_fops = {
 	.owner   = THIS_MODULE,
 	.open    = tcp_seq_open,
@@ -2508,6 +2559,17 @@
 	.release = seq_release_net
 };

+#ifdef CONFIG_TCPHEALTH
+static struct tcp_seq_afinfo tcphealth_seq_afinfo = {
+	.name		= "tcphealth",
+	.family		= AF_INET,
+	.seq_fops	= &tcp_afinfo_seq_fops,
+	.seq_ops	= {
+		.show		= tcp_health_seq_show,
+	},
+};
+#endif
+
 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
 	.name		= "tcp",
 	.family		= AF_INET,
@@ -2519,12 +2581,20 @@

 static int __net_init tcp4_proc_init_net(struct net *net)
 {
-	return tcp_proc_register(net, &tcp4_seq_afinfo);
+	int ret = tcp_proc_register(net, &tcp4_seq_afinfo);
+#ifdef CONFIG_TCPHEALTH
+	if(ret == 0)
+		ret = tcp_proc_register(net, &tcphealth_seq_afinfo);
+#endif
+	return ret;
 }

 static void __net_exit tcp4_proc_exit_net(struct net *net)
 {
 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
+#ifdef CONFIG_TCPHEALTH
+	tcp_proc_unregister(net, &tcphealth_seq_afinfo);
+#endif
 }

 static struct pernet_operations tcp4_net_ops = {
diff -rub A/net/ipv4/tcp_output.c B/net/ipv4/tcp_output.c
--- A/net/ipv4/tcp_output.c	2012-07-08 02:23:56.000000000 +0200
+++ B/net/ipv4/tcp_output.c	2012-07-16 09:44:02.000000000 +0200
@@ -2772,8 +2772,19 @@
 	skb_reserve(buff, MAX_TCP_HEADER);
 	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);

+#ifdef CONFIG_TCPHEALTH
+	/* If the rcv_nxt has not advanced since sending our last ACK, this is a duplicate. */
+	if (tcp_sk(sk)->rcv_nxt == tcp_sk(sk)->last_ack_sent)
+		tcp_sk(sk)->dup_acks_sent++;
+	/* Record the total number of acks sent on this connection. */
+	tcp_sk(sk)->acks_sent++;
+#endif
+
 	/* Send it off, this clears delayed acks for us. */
 	TCP_SKB_CB(buff)->when = tcp_time_stamp;
+#ifdef CONFIG_TCPHEALTH
+	tcp_sk(sk)->last_ack_sent = tcp_sk(sk)->rcv_nxt;
+#endif
 	tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
 }

^ permalink raw reply

* RE: Is TCP vulneribility patch (as in RFC 5961) done in linux?
From: Eric Dumazet @ 2012-07-16 14:47 UTC (permalink / raw)
  To: Kiran (Kiran Kumar) Kella; +Cc: netdev@vger.kernel.org
In-Reply-To: <68700EDA775E5E47B5EBA9FF8AC0F15C078898@SJEXCHMB09.corp.ad.broadcom.com>

On Mon, 2012-07-16 at 13:50 +0000, Kiran (Kiran Kumar) Kella wrote:
> Eric,
> 
>   Thanks a lot for the patch.
> I shall try it out and let you know if I see any issues.

Please note followup patches are needed to address RFC 5961 Sections 4 &
5

(4 Blind Reset Attack Using the SYN Bit)
(5 Blind Data Injection Attack)

^ permalink raw reply

* Re: timer expiry check at icmp.c in ipv6
From: Eric Dumazet @ 2012-07-16 14:33 UTC (permalink / raw)
  To: BALAKUMARAN KANNAN; +Cc: netdev@vger.kernel.org
In-Reply-To: <4A71D24947E78D43BC584A7CD4391A41017DBBAF@SIXPRD0410MB359.apcprd04.prod.outlook.com>

On Mon, 2012-07-16 at 13:44 +0000, BALAKUMARAN KANNAN wrote:
> Dear all,
> 
> In kernel-3.0.26 code net/ipv6/icmp.c, while sending ICMP reply where
> it checks for the timer expiry. It should check the value given by a
> router advertisement. I think the expiry value is stored in
> rt->rt6_expires in ndisc.c (line no: 1284). Then while sending an ICMP
> reply, it should check with the expiry timer right? Where that check
> is happening? Please somebody explain me.
> 
> Thank you.

It's probably done in net/ipv6/ip6_fib.c

fib6_gc_timer_cb() -> fib6_run_gc()

Every 30 seconds.

(you can change /proc/sys/net/ipv6/route/gc_interval )

^ permalink raw reply

* [PATCH] net-next: make sock diag per-namespace (v2)
From: Andrew Vagin @ 2012-07-16 14:28 UTC (permalink / raw)
  To: David S. Miller
  Cc: Andrew Vagin, David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, Pavel Emelyanov, Eric Dumazet,
	linux-kernel, netdev

Before this patch sock_diag works for init_net only and dumps
information about sockets from all namespaces.

This patch expands sock_diag for all name-spaces.
It creates a netlink kernel socket for each netns and filters
data during dumping.

v2: filter according to netns in all places
    remove an unused variable.

Cc: "David S. Miller" <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: James Morris <jmorris@namei.org>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: Patrick McHardy <kaber@trash.net>
Cc: Pavel Emelyanov <xemul@parallels.com>
CC: Eric Dumazet <eric.dumazet@gmail.com>
Cc: linux-kernel@vger.kernel.org
Cc: netdev@vger.kernel.org
Signed-off-by: Andrew Vagin <avagin@openvz.org>
---
 include/linux/sock_diag.h   |    1 -
 include/net/net_namespace.h |    1 +
 net/core/sock_diag.c        |   27 ++++++++++++++++++++-------
 net/ipv4/inet_diag.c        |   21 ++++++++++++++++-----
 net/ipv4/udp_diag.c         |   10 +++++++---
 net/unix/diag.c             |    9 +++++++--
 6 files changed, 51 insertions(+), 18 deletions(-)

diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h
index 6793fac..e3e395a 100644
--- a/include/linux/sock_diag.h
+++ b/include/linux/sock_diag.h
@@ -44,6 +44,5 @@ void sock_diag_save_cookie(void *sk, __u32 *cookie);
 
 int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attr);
 
-extern struct sock *sock_diag_nlsk;
 #endif /* KERNEL */
 #endif
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index ac9195e..ae1cd6c 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -101,6 +101,7 @@ struct net {
 	struct netns_xfrm	xfrm;
 #endif
 	struct netns_ipvs	*ipvs;
+	struct sock		*diag_nlsk;
 };
 
 
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 07a29eb..9d8755e 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -166,23 +166,36 @@ static void sock_diag_rcv(struct sk_buff *skb)
 	mutex_unlock(&sock_diag_mutex);
 }
 
-struct sock *sock_diag_nlsk;
-EXPORT_SYMBOL_GPL(sock_diag_nlsk);
-
-static int __init sock_diag_init(void)
+static int __net_init diag_net_init(struct net *net)
 {
 	struct netlink_kernel_cfg cfg = {
 		.input	= sock_diag_rcv,
 	};
 
-	sock_diag_nlsk = netlink_kernel_create(&init_net, NETLINK_SOCK_DIAG,
+	net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG,
 					       THIS_MODULE, &cfg);
-	return sock_diag_nlsk == NULL ? -ENOMEM : 0;
+	return net->diag_nlsk == NULL ? -ENOMEM : 0;
+}
+
+static void __net_exit diag_net_exit(struct net *net)
+{
+	netlink_kernel_release(net->diag_nlsk);
+	net->diag_nlsk = NULL;
+}
+
+static struct pernet_operations diag_net_ops = {
+	.init = diag_net_init,
+	.exit = diag_net_exit,
+};
+
+static int __init sock_diag_init(void)
+{
+	return register_pernet_subsys(&diag_net_ops);
 }
 
 static void __exit sock_diag_exit(void)
 {
-	netlink_kernel_release(sock_diag_nlsk);
+	unregister_pernet_subsys(&diag_net_ops);
 }
 
 module_init(sock_diag_init);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 38064a2..570e61f 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -272,16 +272,17 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
 	int err;
 	struct sock *sk;
 	struct sk_buff *rep;
+	struct net *net = sock_net(in_skb->sk);
 
 	err = -EINVAL;
 	if (req->sdiag_family == AF_INET) {
-		sk = inet_lookup(&init_net, hashinfo, req->id.idiag_dst[0],
+		sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0],
 				 req->id.idiag_dport, req->id.idiag_src[0],
 				 req->id.idiag_sport, req->id.idiag_if);
 	}
 #if IS_ENABLED(CONFIG_IPV6)
 	else if (req->sdiag_family == AF_INET6) {
-		sk = inet6_lookup(&init_net, hashinfo,
+		sk = inet6_lookup(net, hashinfo,
 				  (struct in6_addr *)req->id.idiag_dst,
 				  req->id.idiag_dport,
 				  (struct in6_addr *)req->id.idiag_src,
@@ -317,7 +318,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
 		nlmsg_free(rep);
 		goto out;
 	}
-	err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid,
+	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid,
 			      MSG_DONTWAIT);
 	if (err > 0)
 		err = 0;
@@ -724,6 +725,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
 {
 	int i, num;
 	int s_i, s_num;
+	struct net *net = sock_net(skb->sk);
 
 	s_i = cb->args[1];
 	s_num = num = cb->args[2];
@@ -743,6 +745,9 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
 			sk_nulls_for_each(sk, node, &ilb->head) {
 				struct inet_sock *inet = inet_sk(sk);
 
+				if (!net_eq(sock_net(sk), net))
+					continue;
+
 				if (num < s_num) {
 					num++;
 					continue;
@@ -813,6 +818,8 @@ skip_listen_ht:
 		sk_nulls_for_each(sk, node, &head->chain) {
 			struct inet_sock *inet = inet_sk(sk);
 
+			if (!net_eq(sock_net(sk), net))
+				continue;
 			if (num < s_num)
 				goto next_normal;
 			if (!(r->idiag_states & (1 << sk->sk_state)))
@@ -839,6 +846,8 @@ next_normal:
 
 			inet_twsk_for_each(tw, node,
 				    &head->twchain) {
+				if (!net_eq(twsk_net(tw), net))
+					continue;
 
 				if (num < s_num)
 					goto next_dying;
@@ -943,6 +952,7 @@ static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
 static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	int hdrlen = sizeof(struct inet_diag_req);
+	struct net *net = sock_net(skb->sk);
 
 	if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX ||
 	    nlmsg_len(nlh) < hdrlen)
@@ -963,7 +973,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
 			struct netlink_dump_control c = {
 				.dump = inet_diag_dump_compat,
 			};
-			return netlink_dump_start(sock_diag_nlsk, skb, nlh, &c);
+			return netlink_dump_start(net->diag_nlsk, skb, nlh, &c);
 		}
 	}
 
@@ -973,6 +983,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
 static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
 {
 	int hdrlen = sizeof(struct inet_diag_req_v2);
+	struct net *net = sock_net(skb->sk);
 
 	if (nlmsg_len(h) < hdrlen)
 		return -EINVAL;
@@ -991,7 +1002,7 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
 			struct netlink_dump_control c = {
 				.dump = inet_diag_dump,
 			};
-			return netlink_dump_start(sock_diag_nlsk, skb, h, &c);
+			return netlink_dump_start(net->diag_nlsk, skb, h, &c);
 		}
 	}
 
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index a7f86a3..16d0960 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -34,15 +34,16 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
 	int err = -EINVAL;
 	struct sock *sk;
 	struct sk_buff *rep;
+	struct net *net = sock_net(in_skb->sk);
 
 	if (req->sdiag_family == AF_INET)
-		sk = __udp4_lib_lookup(&init_net,
+		sk = __udp4_lib_lookup(net,
 				req->id.idiag_src[0], req->id.idiag_sport,
 				req->id.idiag_dst[0], req->id.idiag_dport,
 				req->id.idiag_if, tbl);
 #if IS_ENABLED(CONFIG_IPV6)
 	else if (req->sdiag_family == AF_INET6)
-		sk = __udp6_lib_lookup(&init_net,
+		sk = __udp6_lib_lookup(net,
 				(struct in6_addr *)req->id.idiag_src,
 				req->id.idiag_sport,
 				(struct in6_addr *)req->id.idiag_dst,
@@ -75,7 +76,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
 		kfree_skb(rep);
 		goto out;
 	}
-	err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid,
+	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid,
 			      MSG_DONTWAIT);
 	if (err > 0)
 		err = 0;
@@ -90,6 +91,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlin
 		struct inet_diag_req_v2 *r, struct nlattr *bc)
 {
 	int num, s_num, slot, s_slot;
+	struct net *net = sock_net(skb->sk);
 
 	s_slot = cb->args[0];
 	num = s_num = cb->args[1];
@@ -106,6 +108,8 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlin
 		sk_nulls_for_each(sk, node, &hslot->head) {
 			struct inet_sock *inet = inet_sk(sk);
 
+			if (!net_eq(sock_net(sk), net))
+				continue;
 			if (num < s_num)
 				goto next;
 			if (!(r->idiag_states & (1 << sk->sk_state)))
diff --git a/net/unix/diag.c b/net/unix/diag.c
index a74864e..750b134 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -177,6 +177,7 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct unix_diag_req *req;
 	int num, s_num, slot, s_slot;
+	struct net *net = sock_net(skb->sk);
 
 	req = nlmsg_data(cb->nlh);
 
@@ -192,6 +193,8 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 
 		num = 0;
 		sk_for_each(sk, node, &unix_socket_table[slot]) {
+			if (!net_eq(sock_net(sk), net))
+				continue;
 			if (num < s_num)
 				goto next;
 			if (!(req->udiag_states & (1 << sk->sk_state)))
@@ -243,6 +246,7 @@ static int unix_diag_get_exact(struct sk_buff *in_skb,
 	struct sock *sk;
 	struct sk_buff *rep;
 	unsigned int extra_len;
+	struct net *net = sock_net(in_skb->sk);
 
 	if (req->udiag_ino == 0)
 		goto out_nosk;
@@ -273,7 +277,7 @@ again:
 
 		goto again;
 	}
-	err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid,
+	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid,
 			      MSG_DONTWAIT);
 	if (err > 0)
 		err = 0;
@@ -287,6 +291,7 @@ out_nosk:
 static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
 {
 	int hdrlen = sizeof(struct unix_diag_req);
+	struct net *net = sock_net(skb->sk);
 
 	if (nlmsg_len(h) < hdrlen)
 		return -EINVAL;
@@ -295,7 +300,7 @@ static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
 		struct netlink_dump_control c = {
 			.dump = unix_diag_dump,
 		};
-		return netlink_dump_start(sock_diag_nlsk, skb, h, &c);
+		return netlink_dump_start(net->diag_nlsk, skb, h, &c);
 	} else
 		return unix_diag_get_exact(skb, h, nlmsg_data(h));
 }
-- 
1.7.1

^ permalink raw reply related

* RE: [PATCH net-next] be2net: dont pull too much data in skb linear part
From: Padmanabh.Ratnakar @ 2012-07-16 14:02 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, Sathya.Perla
In-Reply-To: <1342185581.3265.8355.camel@edumazet-glaptop>



> -----Original Message-----
> From: Eric Dumazet [mailto:eric.dumazet@gmail.com]
> Sent: Friday, July 13, 2012 6:50 PM
> To: Ratnakar, Padmanabh
> Cc: netdev
> Subject: [PATCH net-next] be2net: dont pull too much data in skb linear part
> 
> From: Eric Dumazet <edumazet@google.com>
> 
> skb_fill_rx_data() pulls 64 bytes of data into skb->data
> 
> It's too much for TCP (with no options) on IPv4, as the total size of headers is 14 +
> 40 = 54
> 
> This means the tcp stack and splice() are suboptimal, since the tcp payload is in part in
> skb->data, and in part in the skb frag.
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
>  drivers/net/ethernet/emulex/benet/be_main.c |    6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/ethernet/emulex/benet/be_main.c
> b/drivers/net/ethernet/emulex/benet/be_main.c
> index 7e989d0..f18375c 100644
> --- a/drivers/net/ethernet/emulex/benet/be_main.c
> +++ b/drivers/net/ethernet/emulex/benet/be_main.c
> @@ -1228,16 +1228,16 @@ static void skb_fill_rx_data(struct be_rx_obj
> *rxo, struct sk_buff *skb,
>  	/* Copy data in the first descriptor of this completion */
>  	curr_frag_len = min(rxcp->pkt_size, rx_frag_size);
> 
> -	/* Copy the header portion into skb_data */
> -	hdr_len = min(BE_HDR_LEN, curr_frag_len);
> -	memcpy(skb->data, start, hdr_len);
>  	skb->len = curr_frag_len;
>  	if (curr_frag_len <= BE_HDR_LEN) { /* tiny packet */
> +		memcpy(skb->data, start, curr_frag_len);
>  		/* Complete packet has now been moved to data */
>  		put_page(page_info->page);
>  		skb->data_len = 0;
>  		skb->tail += curr_frag_len;
>  	} else {
> +		hdr_len = ETH_HLEN;
> +		memcpy(skb->data, start, hdr_len);
>  		skb_shinfo(skb)->nr_frags = 1;
>  		skb_frag_set_page(skb, 0, page_info->page);
>  		skb_shinfo(skb)->frags[0].page_offset =
> 
Change looks good to me. I tested this and it is working.
Thanks
Acked-by: Padmanabh Ratnakar <padmanabh.ratnakar@emulex.com>



^ permalink raw reply

* Re: [PATCH] net-next: make sock diag per-namespace
From: Ben Hutchings @ 2012-07-16 14:00 UTC (permalink / raw)
  To: Andrew Vagin
  Cc: David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy, Pavel Emelyanov, linux-kernel,
	netdev
In-Reply-To: <1342440849-1757320-1-git-send-email-avagin@openvz.org>

On Mon, 2012-07-16 at 16:14 +0400, Andrew Vagin wrote:
> Before this patch sock_diag works for init_net only and dumps
> information about sockets from all namespaces.
> 
> This patch expands sock_diag for all name-spaces.
> It creates a netlink kernel socket for each netns and filters
> data during dumping.
[...]
> --- a/net/core/sock_diag.c
> +++ b/net/core/sock_diag.c
> @@ -166,23 +166,39 @@ static void sock_diag_rcv(struct sk_buff *skb)
>  	mutex_unlock(&sock_diag_mutex);
>  }
>  
> -struct sock *sock_diag_nlsk;
> -EXPORT_SYMBOL_GPL(sock_diag_nlsk);
> +struct sock *diag_nlsk;
> +EXPORT_SYMBOL_GPL(diag_nlsk);
[...]

This new variable seems to be unused.

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* for all benefits
From: Martins @ 2012-07-16 13:58 UTC (permalink / raw)


[-- Attachment #1: Type: text/plain, Size: 31 bytes --]

for all benefits, note attached

[-- Attachment #2: for all benefits, note attached.JPG --]
[-- Type: image/jpeg, Size: 132701 bytes --]

^ permalink raw reply

* Re: [PATCH] natsemi: make cable length magic configurable
From: Mark Brown @ 2012-07-16 13:57 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: Jean Delvare, netdev, Tim Hockin, Olaf Kirch
In-Reply-To: <1342444134.2523.4.camel@bwh-desktop.uk.solarflarecom.com>

On Mon, Jul 16, 2012 at 02:08:54PM +0100, Ben Hutchings wrote:
> On Mon, 2012-07-16 at 14:26 +0200, Jean Delvare wrote:

> > Furthermore I don't quite get why we can't just go with the module
> > parameter. As I understand it, this is a crappy driver for crappy, rare
> > hardware. The driver already has a module parameter to work around a
> > hardware bug (dspcfg_workaround), I don't quite see why adding a second
> > one would be a problem. At least it is consistent.

> David can be quite insistent about finding an alternative to module
> parameters.

The hardware isn't that rare or bad - it's pretty widely deployed in
100Mbit systems (including lots of embedded ones) and performs well for
the systems it's targeting.  You'd not use it for a modern server but
if what you need is 100M it's a fairly good part.

The dspcfg_workaround change was added for one specific board.  I'd be
really surprised if it were useful for anything other than the board it
was originally developed for, and I'd not be surprised to learn that it's
a hardware issue in that board.  It disables a documented erratum
workaround to give a performance improvement; it was done for one board
that for some reason triggered the erratum a lot but with less severe
consequences than normal.

^ permalink raw reply

* RE: Is TCP vulneribility patch (as in RFC 5961) done in linux?
From: Kiran (Kiran Kumar) Kella @ 2012-07-16 13:50 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev@vger.kernel.org
In-Reply-To: <1342446411.23494.76.camel@edumazet-glaptop>

Eric,

  Thanks a lot for the patch.
I shall try it out and let you know if I see any issues.

Regards,
Kiran

-----Original Message-----
From: Eric Dumazet [mailto:eric.dumazet@gmail.com] 
Sent: Monday, July 16, 2012 7:17 PM
To: Kiran (Kiran Kumar) Kella
Cc: netdev@vger.kernel.org
Subject: RE: Is TCP vulneribility patch (as in RFC 5961) done in linux?

On Mon, 2012-07-16 at 10:33 +0200, Eric Dumazet wrote:
> On Mon, 2012-07-16 at 07:06 +0000, Kiran (Kiran Kumar) Kella wrote:
> > Looking into the file tcp_input.c in the latest stable linux release 3.4.4 source, I understand the fix for this recommendation is not implemented in Linux.
> > Any reason why it was not addressed?
> 
> Nobody cared ?
> 
> Are you planning to send a patch ?
> 


Here is an RFC patch implementing RFC 5961 3.2

[PATCH net-next] tcp: implement RFC 5961 3.2

Implement the RFC 5961 mitigation against Blind
Reset attack using RST bit.

Add a new sysctl, tcp_challengeack_limit, to limit
number of challenge ACK sent per second.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Kiran Kumar Kella <kkiran@broadcom.com>
---
 Documentation/networking/ip-sysctl.txt |    5 +++
 include/linux/snmp.h                   |    1 
 include/net/tcp.h                      |    1 
 net/ipv4/proc.c                        |    1 
 net/ipv4/sysctl_net_ipv4.c             |    7 +++++
 net/ipv4/tcp_input.c                   |   31 ++++++++++++++++++++++-
 6 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index e20c17a..f785fd1 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -565,6 +565,11 @@ tcp_limit_output_bytes - INTEGER
 	reduce the size of individual GSO packet (64KB being the max)
 	Default: 131072
 
+tcp_challengeack_limit - INTEGER
+	Limits number of Challenge ACK sent per second, as recommended
+	in RFC 5961 (Improving TCP's Robustness to Blind In-Window Attacks)
+	Default: 100
+
 UDP variables:
 
 udp_mem - vector of 3 INTEGERs: min, pressure, max
diff --git a/include/linux/snmp.h b/include/linux/snmp.h
index 2e68f5b..594638e 100644
--- a/include/linux/snmp.h
+++ b/include/linux/snmp.h
@@ -234,6 +234,7 @@ enum
 	LINUX_MIB_TCPREQQFULLDROP,		/* TCPReqQFullDrop */
 	LINUX_MIB_TCPRETRANSFAIL,		/* TCPRetransFail */
 	LINUX_MIB_TCPRCVCOALESCE,			/* TCPRcvCoalesce */
+	LINUX_MIB_TCPCHALLENGEACK,		/* TCPChallengeACK */
 	__LINUX_MIB_MAX
 };
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 439984b..fc14419 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -254,6 +254,7 @@ extern int sysctl_tcp_thin_linear_timeouts;
 extern int sysctl_tcp_thin_dupack;
 extern int sysctl_tcp_early_retrans;
 extern int sysctl_tcp_limit_output_bytes;
+extern int sysctl_tcp_challengeack_limit;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 8af0d44..d589468 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -258,6 +258,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP),
 	SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL),
 	SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE),
+	SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 70730f7..12df8e8 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -605,6 +605,13 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "tcp_challengeack_limit",
+		.data		= &sysctl_tcp_challengeack_limit,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 #ifdef CONFIG_NET_DMA
 	{
 		.procname	= "tcp_dma_copybreak",
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 055ac49..8e7edff 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -88,6 +88,9 @@ int sysctl_tcp_app_win __read_mostly = 31;
 int sysctl_tcp_adv_win_scale __read_mostly = 1;
 EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
 
+/* rfc5961 challenge ack rate limiting */
+int sysctl_tcp_challengeack_limit = 100;
+
 int sysctl_tcp_stdurg __read_mostly;
 int sysctl_tcp_rfc1337 __read_mostly;
 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
@@ -5244,6 +5247,23 @@ out:
 }
 #endif /* CONFIG_NET_DMA */
 
+static void tcp_send_challenge_ack(struct sock *sk)
+{
+	/* unprotected vars, we dont care of overwrites */
+	static u32 challenge_time;
+	static unsigned int challenge_count;
+	u32 now = tcp_time_stamp / HZ;
+
+	if (now != challenge_time) {
+		challenge_time = now;
+		challenge_count = 0;
+	}
+	if (++challenge_count <= sysctl_tcp_challengeack_limit) {
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
+		tcp_send_ack(sk);
+	}
+}
+
 /* Does PAWS and seqno based validation of an incoming segment, flags will
  * play significant role here.
  */
@@ -5280,7 +5300,16 @@ static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
 
 	/* Step 2: check RST bit */
 	if (th->rst) {
-		tcp_reset(sk);
+		/* RFC 5961 3.2 : 
+		 * If sequence number exactly matches RCV.NXT, then
+		 *     RESET the connection
+		 * else
+		 *     Send a challenge ACK
+		 */
+		if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
+			tcp_reset(sk);
+		else
+			tcp_send_challenge_ack(sk);
 		goto discard;
 	}
 




^ permalink raw reply related

* RE: Is TCP vulneribility patch (as in RFC 5961) done in linux?
From: Kiran (Kiran Kumar) Kella @ 2012-07-16 13:50 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev@vger.kernel.org
In-Reply-To: <1342446411.23494.76.camel@edumazet-glaptop>

Thanks a lot Eric for the patch.
I shall try it out.

Regards,
Kiran

-----Original Message-----
From: Eric Dumazet [mailto:eric.dumazet@gmail.com] 
Sent: Monday, July 16, 2012 7:17 PM
To: Kiran (Kiran Kumar) Kella
Cc: netdev@vger.kernel.org
Subject: RE: Is TCP vulneribility patch (as in RFC 5961) done in linux?

On Mon, 2012-07-16 at 10:33 +0200, Eric Dumazet wrote:
> On Mon, 2012-07-16 at 07:06 +0000, Kiran (Kiran Kumar) Kella wrote:
> > Looking into the file tcp_input.c in the latest stable linux release 3.4.4 source, I understand the fix for this recommendation is not implemented in Linux.
> > Any reason why it was not addressed?
> 
> Nobody cared ?
> 
> Are you planning to send a patch ?
> 


Here is an RFC patch implementing RFC 5961 3.2

[PATCH net-next] tcp: implement RFC 5961 3.2

Implement the RFC 5961 mitigation against Blind
Reset attack using RST bit.

Add a new sysctl, tcp_challengeack_limit, to limit
number of challenge ACK sent per second.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Kiran Kumar Kella <kkiran@broadcom.com>
---
 Documentation/networking/ip-sysctl.txt |    5 +++
 include/linux/snmp.h                   |    1 
 include/net/tcp.h                      |    1 
 net/ipv4/proc.c                        |    1 
 net/ipv4/sysctl_net_ipv4.c             |    7 +++++
 net/ipv4/tcp_input.c                   |   31 ++++++++++++++++++++++-
 6 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index e20c17a..f785fd1 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -565,6 +565,11 @@ tcp_limit_output_bytes - INTEGER
 	reduce the size of individual GSO packet (64KB being the max)
 	Default: 131072
 
+tcp_challengeack_limit - INTEGER
+	Limits number of Challenge ACK sent per second, as recommended
+	in RFC 5961 (Improving TCP's Robustness to Blind In-Window Attacks)
+	Default: 100
+
 UDP variables:
 
 udp_mem - vector of 3 INTEGERs: min, pressure, max
diff --git a/include/linux/snmp.h b/include/linux/snmp.h
index 2e68f5b..594638e 100644
--- a/include/linux/snmp.h
+++ b/include/linux/snmp.h
@@ -234,6 +234,7 @@ enum
 	LINUX_MIB_TCPREQQFULLDROP,		/* TCPReqQFullDrop */
 	LINUX_MIB_TCPRETRANSFAIL,		/* TCPRetransFail */
 	LINUX_MIB_TCPRCVCOALESCE,			/* TCPRcvCoalesce */
+	LINUX_MIB_TCPCHALLENGEACK,		/* TCPChallengeACK */
 	__LINUX_MIB_MAX
 };
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 439984b..fc14419 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -254,6 +254,7 @@ extern int sysctl_tcp_thin_linear_timeouts;
 extern int sysctl_tcp_thin_dupack;
 extern int sysctl_tcp_early_retrans;
 extern int sysctl_tcp_limit_output_bytes;
+extern int sysctl_tcp_challengeack_limit;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 8af0d44..d589468 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -258,6 +258,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP),
 	SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL),
 	SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE),
+	SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 70730f7..12df8e8 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -605,6 +605,13 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "tcp_challengeack_limit",
+		.data		= &sysctl_tcp_challengeack_limit,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 #ifdef CONFIG_NET_DMA
 	{
 		.procname	= "tcp_dma_copybreak",
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 055ac49..8e7edff 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -88,6 +88,9 @@ int sysctl_tcp_app_win __read_mostly = 31;
 int sysctl_tcp_adv_win_scale __read_mostly = 1;
 EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
 
+/* rfc5961 challenge ack rate limiting */
+int sysctl_tcp_challengeack_limit = 100;
+
 int sysctl_tcp_stdurg __read_mostly;
 int sysctl_tcp_rfc1337 __read_mostly;
 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
@@ -5244,6 +5247,23 @@ out:
 }
 #endif /* CONFIG_NET_DMA */
 
+static void tcp_send_challenge_ack(struct sock *sk)
+{
+	/* unprotected vars, we dont care of overwrites */
+	static u32 challenge_time;
+	static unsigned int challenge_count;
+	u32 now = tcp_time_stamp / HZ;
+
+	if (now != challenge_time) {
+		challenge_time = now;
+		challenge_count = 0;
+	}
+	if (++challenge_count <= sysctl_tcp_challengeack_limit) {
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
+		tcp_send_ack(sk);
+	}
+}
+
 /* Does PAWS and seqno based validation of an incoming segment, flags will
  * play significant role here.
  */
@@ -5280,7 +5300,16 @@ static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
 
 	/* Step 2: check RST bit */
 	if (th->rst) {
-		tcp_reset(sk);
+		/* RFC 5961 3.2 : 
+		 * If sequence number exactly matches RCV.NXT, then
+		 *     RESET the connection
+		 * else
+		 *     Send a challenge ACK
+		 */
+		if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
+			tcp_reset(sk);
+		else
+			tcp_send_challenge_ack(sk);
 		goto discard;
 	}
 




^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox