* Re: [RFC] [ver3 PATCH 3/6] virtio_net: virtio_net driver changes
From: Sasha Levin @ 2011-11-18 6:24 UTC (permalink / raw)
To: Ben Hutchings
Cc: Krishna Kumar, rusty, mst, netdev, kvm, davem, virtualization
In-Reply-To: <1321578488.2749.66.camel@bwh-desktop>
On Fri, 2011-11-18 at 01:08 +0000, Ben Hutchings wrote:
> On Fri, 2011-11-11 at 18:34 +0530, Krishna Kumar wrote:
> > Changes for multiqueue virtio_net driver.
> [...]
> > @@ -677,25 +730,35 @@ static struct rtnl_link_stats64 *virtnet
> > {
> > struct virtnet_info *vi = netdev_priv(dev);
> > int cpu;
> > - unsigned int start;
> >
> > for_each_possible_cpu(cpu) {
> > - struct virtnet_stats __percpu *stats
> > - = per_cpu_ptr(vi->stats, cpu);
> > - u64 tpackets, tbytes, rpackets, rbytes;
> > -
> > - do {
> > - start = u64_stats_fetch_begin(&stats->syncp);
> > - tpackets = stats->tx_packets;
> > - tbytes = stats->tx_bytes;
> > - rpackets = stats->rx_packets;
> > - rbytes = stats->rx_bytes;
> > - } while (u64_stats_fetch_retry(&stats->syncp, start));
> > -
> > - tot->rx_packets += rpackets;
> > - tot->tx_packets += tpackets;
> > - tot->rx_bytes += rbytes;
> > - tot->tx_bytes += tbytes;
> > + int qpair;
> > +
> > + for (qpair = 0; qpair < vi->num_queue_pairs; qpair++) {
> > + struct virtnet_send_stats __percpu *tx_stat;
> > + struct virtnet_recv_stats __percpu *rx_stat;
>
> While you're at it, you can drop the per-CPU stats and make them only
> per-queue. There is unlikely to be any benefit in maintaining them
> per-CPU while receive and transmit processing is serialised per-queue.
It allows you to update stats without a lock.
Whats the benefit of having them per queue?
--
Sasha.
^ permalink raw reply
* [PATCH 1/1] PHY configuration for compatible issue
From: AriesLee @ 2011-11-18 14:54 UTC (permalink / raw)
To: Aries Lee, Guo-Fu Tseng, netdev; +Cc: AriesLee
To perform PHY calibration and set a different EA value by chip ID,
Whenever the NIC chip power on, ie booting or resuming, we need to
force HW to calibrate PHY parameter again, and also set a proper EA
value which gather from experiment.
Those procedures help to reduce compatible issues(NIC is unable to link
up in some special case) in giga speed.
Signed-off-by: AriesLee <AriesLee@jmicron.com>
---
drivers/net/ethernet/jme.c | 119 ++++++++++++++++++++++++++++++++++++++++++-
drivers/net/ethernet/jme.h | 19 +++++++
2 files changed, 135 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c
index df3ab83..46c68a4 100644
--- a/drivers/net/ethernet/jme.c
+++ b/drivers/net/ethernet/jme.c
@@ -1744,6 +1744,118 @@ jme_phy_off(struct jme_adapter *jme)
jme_new_phy_off(jme);
}
+static void
+jme_phy_specreg_read(struct jme_adapter *jme, u32 specreg, u32 *phy_data)
+{
+ u32 phy_addr;
+
+ phy_addr = JM_PHY_SPEC_REG_READ | specreg;
+ jme_mdio_write(jme->dev, jme->mii_if.phy_id, JM_PHY_SPEC_ADDR_REG,
+ phy_addr);
+ *phy_data = jme_mdio_read(jme->dev, jme->mii_if.phy_id,
+ JM_PHY_SPEC_DATA_REG);
+}
+
+static void
+jme_phy_specreg_write(struct jme_adapter *jme, u32 ext_reg, u32 phy_data)
+{
+ u32 phy_addr;
+
+ phy_addr = JM_PHY_SPEC_REG_WRITE | ext_reg;
+ jme_mdio_write(jme->dev, jme->mii_if.phy_id, JM_PHY_SPEC_DATA_REG,
+ phy_data);
+ jme_mdio_write(jme->dev, jme->mii_if.phy_id, JM_PHY_SPEC_ADDR_REG,
+ phy_addr);
+}
+
+static int
+jme_phy_calibration(struct jme_adapter *jme)
+{
+ u32 ctrl1000, bmcr, phy_addr, phy_data;
+
+ /* Turn PHY off */
+ bmcr = jme_mdio_read(jme->dev, jme->mii_if.phy_id, MII_BMCR);
+ bmcr |= BMCR_PDOWN;
+ jme_mdio_write(jme->dev, jme->mii_if.phy_id, MII_BMCR, bmcr);
+ /* Turn PHY on */
+ bmcr = jme_mdio_read(jme->dev, jme->mii_if.phy_id, MII_BMCR);
+ bmcr &= ~BMCR_PDOWN;
+ jme_mdio_write(jme->dev, jme->mii_if.phy_id, MII_BMCR, bmcr);
+ /* Enabel PHY test mode 1 */
+ ctrl1000 = jme_mdio_read(jme->dev, jme->mii_if.phy_id, MII_CTRL1000);
+ ctrl1000 &= ~PHY_GAD_TEST_MODE_MSK;
+ ctrl1000 |= PHY_GAD_TEST_MODE_1;
+ jme_mdio_write(jme->dev, jme->mii_if.phy_id, MII_CTRL1000, ctrl1000);
+
+ jme_phy_specreg_read(jme, JM_PHY_EXT_COMM_2_REG, &phy_data);
+ phy_data &= ~JM_PHY_EXT_COMM_2_CALI_MODE_0;
+ phy_data |= JM_PHY_EXT_COMM_2_CALI_LATCH |
+ JM_PHY_EXT_COMM_2_CALI_ENABLE;
+ jme_phy_specreg_write(jme, JM_PHY_EXT_COMM_2_REG, phy_data);
+ msleep(20);
+ jme_phy_specreg_read(jme, JM_PHY_EXT_COMM_2_REG, &phy_data);
+ phy_data &= ~(JM_PHY_EXT_COMM_2_CALI_ENABLE |
+ JM_PHY_EXT_COMM_2_CALI_MODE_0 |
+ JM_PHY_EXT_COMM_2_CALI_LATCH);
+ jme_phy_specreg_write(jme, JM_PHY_EXT_COMM_2_REG, phy_data);
+
+ /* Disable PHY test mode */
+ ctrl1000 = jme_mdio_read(jme->dev, jme->mii_if.phy_id, MII_CTRL1000);
+ ctrl1000 &= ~PHY_GAD_TEST_MODE_MSK;
+ jme_mdio_write(jme->dev, jme->mii_if.phy_id, MII_CTRL1000, ctrl1000);
+ return 0;
+}
+
+static int
+jme_phy_setEA(struct jme_adapter *jme)
+{
+ u32 phy_addr, phy_comm0 = 0, phy_comm1 = 0;
+ u8 nic_ctrl;
+
+ pci_read_config_byte(jme->pdev, PCI_PRIV_SHARE_NICCTRL, &nic_ctrl);
+ if ((nic_ctrl & 0x3) == JME_FLAG_PHYEA_ENABLE)
+ return 0;
+
+ switch (jme->pdev->device) {
+ case PCI_DEVICE_ID_JMICRON_JMC250:
+ if (((jme->chip_main_rev == 5) &&
+ ((jme->chip_sub_rev == 0) || (jme->chip_sub_rev == 1) ||
+ (jme->chip_sub_rev == 3))) ||
+ (jme->chip_main_rev >= 6)) {
+ phy_comm0 = 0x008A;
+ phy_comm1 = 0x4109;
+ }
+ if ((jme->chip_main_rev == 3) &&
+ ((jme->chip_sub_rev == 1) || (jme->chip_sub_rev == 2)))
+ phy_comm0 = 0xE088;
+ break;
+ case PCI_DEVICE_ID_JMICRON_JMC260:
+ if (((jme->chip_main_rev == 5) &&
+ ((jme->chip_sub_rev == 0) || (jme->chip_sub_rev == 1) ||
+ (jme->chip_sub_rev == 3))) ||
+ (jme->chip_main_rev >= 6)) {
+ phy_comm0 = 0x008A;
+ phy_comm1 = 0x4109;
+ }
+ if ((jme->chip_main_rev == 3) &&
+ ((jme->chip_sub_rev == 1) || (jme->chip_sub_rev == 2)))
+ phy_comm0 = 0xE088;
+ if ((jme->chip_main_rev == 2) && (jme->chip_sub_rev == 0))
+ phy_comm0 = 0x608A;
+ if ((jme->chip_main_rev == 2) && (jme->chip_sub_rev == 2))
+ phy_comm0 = 0x408A;
+ break;
+ default:
+ return -ENODEV;
+ }
+ if (phy_comm0)
+ jme_phy_specreg_write(jme, JM_PHY_EXT_COMM_0_REG, phy_comm0);
+ if (phy_comm1)
+ jme_phy_specreg_write(jme, JM_PHY_EXT_COMM_1_REG, phy_comm1);
+
+ return 0;
+}
+
static int
jme_open(struct net_device *netdev)
{
@@ -1769,7 +1881,8 @@ jme_open(struct net_device *netdev)
jme_set_settings(netdev, &jme->old_ecmd);
else
jme_reset_phy_processor(jme);
-
+ jme_phy_calibration(jme);
+ jme_phy_setEA(jme);
jme_reset_link(jme);
return 0;
@@ -3184,7 +3297,8 @@ jme_resume(struct device *dev)
jme_set_settings(netdev, &jme->old_ecmd);
else
jme_reset_phy_processor(jme);
-
+ jme_phy_calibration(jme);
+ jme_phy_setEA(jme);
jme_start_irq(jme);
netif_device_attach(netdev);
@@ -3239,4 +3353,3 @@ MODULE_DESCRIPTION("JMicron JMC2x0 PCI Express Ethernet driver");
MODULE_LICENSE("GPL");
MODULE_VERSION(DRV_VERSION);
MODULE_DEVICE_TABLE(pci, jme_pci_tbl);
-
diff --git a/drivers/net/ethernet/jme.h b/drivers/net/ethernet/jme.h
index 02ea27c..4304072 100644
--- a/drivers/net/ethernet/jme.h
+++ b/drivers/net/ethernet/jme.h
@@ -760,6 +760,25 @@ enum jme_rxmcs_bits {
RXMCS_CHECKSUM,
};
+/* Extern PHY common register 2 */
+
+#define PHY_GAD_TEST_MODE_1 0x00002000
+#define PHY_GAD_TEST_MODE_MSK 0x0000E000
+#define JM_PHY_SPEC_REG_READ 0x00004000
+#define JM_PHY_SPEC_REG_WRITE 0x00008000
+#define PHY_CALIBRATION_DELAY 20
+#define JM_PHY_SPEC_ADDR_REG 0x1E
+#define JM_PHY_SPEC_DATA_REG 0x1F
+
+#define JM_PHY_EXT_COMM_0_REG 0x30
+#define JM_PHY_EXT_COMM_1_REG 0x31
+#define JM_PHY_EXT_COMM_2_REG 0x32
+#define JM_PHY_EXT_COMM_2_CALI_ENABLE 0x01
+#define JM_PHY_EXT_COMM_2_CALI_MODE_0 0x02
+#define JM_PHY_EXT_COMM_2_CALI_LATCH 0x10
+#define PCI_PRIV_SHARE_NICCTRL 0xF5
+#define JME_FLAG_PHYEA_ENABLE 0x2
+
/*
* Wakeup Frame setup interface registers
*/
--
1.7.4.4
^ permalink raw reply related
* Re: [PATCH net-next] bnx2: switch to build_skb() infrastructure
From: David Miller @ 2011-11-18 7:09 UTC (permalink / raw)
To: mchan; +Cc: eric.dumazet, netdev, eilong
In-Reply-To: <1321591644.8833.1.camel@HP1>
From: "Michael Chan" <mchan@broadcom.com>
Date: Thu, 17 Nov 2011 20:47:24 -0800
>
> On Tue, 2011-11-15 at 09:30 -0800, Eric Dumazet wrote:
>> This is very similar to bnx2x conversion, but bnx2 only requires 16bytes
>> alignement at start of the received frame to store its l2_fhdr, so goal
>> was not to reduce skb truesize (in fact it should not change after this
>> patch)
>>
>> Using build_skb() reduces cache line misses in the driver, since we
>> use cache hot skb instead of cold ones. Number of in-flight sk_buff
>> structures is lower, they are more likely recycled in SLUB caches
>> while still hot.
>>
>> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
>> CC: Michael Chan <mchan@broadcom.com>
>> CC: Eilon Greenstein <eilong@broadcom.com>
>
> Looks good. Thanks.
>
> Reviewed-by: Michael Chan <mchan@broadcom.com>
Applied, thanks everyone.
^ permalink raw reply
* Re: [PATCH FIX net-next v1] net-forcedeth: fix possible stats inaccuracies on 32b hosts
From: David Miller @ 2011-11-18 7:09 UTC (permalink / raw)
To: eric.dumazet
Cc: david.decotigny, netdev, linux-kernel, ian.campbell, rick.jones2
In-Reply-To: <1321560495.2444.1.camel@edumazet-laptop>
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 17 Nov 2011 21:08:15 +0100
> Le jeudi 17 novembre 2011 à 11:38 -0800, David Decotigny a écrit :
>> The software stats are updated from BH, this change ensures that 32b
>> UP hosts use appropriate protection.
>>
>> Tested:
>> - HW/SW stats consistent with pktgen (in particular tx=rx)
>> - HW/SW stats consistent when tx/rx offloads disabled
>> - no problem with+without lockdep (SMP 16-way)
>>
>>
>>
>> Signed-off-by: David Decotigny <david.decotigny@google.com>
>
> Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Applied, thanks.
^ permalink raw reply
* RE: [PATCH 1/1] PHY configuration for compatible issue
From: Aries Lee @ 2011-11-18 7:13 UTC (permalink / raw)
To: 'Guo-Fu Tseng', netdev; +Cc: 'AriesLee'
In-Reply-To: <20111117070537.M4900@cooldavid.org>
Hi Guo-Fu and All
Because jme_phy_on() and jme_phy_off() just turn on/off the PHY, the
value of extern register is still the power on default value, not the most
robust value which we collect in the LAB.
It still need to config those registers to the proper value when the chip
lost the power and power on again.
B.R
Aries
-----Original Message-----
From: Guo-Fu Tseng [mailto:cooldavid@cooldavid.org]
Sent: Thursday, November 17, 2011 3:15 PM
To: AriesLee; netdev@vger.kernel.org
Cc: AriesLee
Subject: Re: [PATCH 1/1] PHY configuration for compatible issue
On Thu, 17 Nov 2011 22:05:42 +0800, AriesLee wrote
> From: Aries Lee <AriesLee@jmicron.com>
>
> To perform PHY calibration and set a different EA value by chip ID,
> Whenever the NIC chip power on, ie booting or resuming, we need to
> force HW to calibrate PHY parameter again, and also set a proper EA
> value which gathered from experiment.
>
> That process resolve the compatible issues(NIC is unable to link
> up in some special case) in giga speed.
Thank you Aries.
Here is some suggestions after a quick review:
It would be better if you implement the read/write function
for extended-phy-register, instead of using JM_PHY_SPEC_ADDR_REG
and JM_PHY_SPEC_DATA_REG directly all the time.
There are jme_phy_on() and jme_phy_off() function in place.
Should you simply using it?
^ permalink raw reply
* lock-warning seen on giving rds-info command
From: Kumar Sanghvi @ 2011-11-18 8:07 UTC (permalink / raw)
To: netdev; +Cc: davem, venkat.x.venkatsubra, Kumar Sanghvi
Hi netdev team,
I am trying to investigate one issue with RDS.
On giving command rds-info, we get below trace in dmesg:
------------[ cut here ]------------
WARNING: at /root/chelsio/git_repos/linux-2.6/kernel/softirq.c:159 local_bh_enable_ip+0x7a/0xa0()
Hardware name: X7DB8
Modules linked in: rds_rdma rds rdma_cm iw_cm ib_addr autofs4 sunrpc
p4_clockmod freq_table speedstep_lib ib_ipoib ib_cm ib_sa ipv6 ib_uverbs
ib_umad ib_mad iw_nes libcrc32c ib_core dm_mirror dm_region_hash dm_log
uinput ppdev sg parport_pc parport pcspkr serio_raw i2c_i801 iTCO_wdt
iTCO_vendor_support ioatdma dca i5k_amb i5000_edac edac_core e1000e shpchp
ext4 mbcache jbd2 sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_generic ata_piix
floppy radeon ttm drm_kms_helper drm hwmon i2c_algo_bit i2c_core dm_mod
[last unloaded: mdio]
Pid: 1937, comm: rds-info Not tainted 3.1.0 #1
Call Trace:
[<ffffffff8106446f>] warn_slowpath_common+0x7f/0xc0
[<ffffffff810644ca>] warn_slowpath_null+0x1a/0x20
[<ffffffff8106af6a>] local_bh_enable_ip+0x7a/0xa0
[<ffffffff814d3126>] _raw_read_unlock_bh+0x16/0x20
[<ffffffff813f83b4>] sock_i_ino+0x44/0x60
[<ffffffffa046ab02>] rds_sock_info+0xd2/0x140 [rds]
[<ffffffffa046c7a7>] rds_info_getsockopt+0x197/0x200 [rds]
[<ffffffffa046a1b4>] rds_getsockopt+0x84/0xe0 [rds]
[<ffffffff813f3453>] sys_getsockopt+0x73/0xe0
[<ffffffff814dafd7>] tracesys+0xd9/0xde
---[ end trace 4e7838d2e1e53231 ]---
I tested this on 3.1 kernel. However, this problem seems to be present on most
recent kernels. I tried to investigate, and the RDS problem seems to stem(I might be wrong)
after this upstream commit:
----
commit f064af1e500a2bf4607706f0f458163bdb2a6ea5
Author: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed Sep 22 12:43:39 2010 +0000
net: fix a lockdep splat
----
In the above commit, sock_i_ino is modified to use read_lock/unlock_bh instead
of plain read_lock/unlock.
Now, everywhere in Linux network stack, it seems that callers of sock_i_ino
are not disabling interrupts and are also not holding any lock while calling
sock_i_ino function. However, RDS seems to be only one who is calling sock_i_ino
with spin_lock_irqsave i.e. with interrupts disabled.
As a result, when sock_i_ino finally calls read_unlock_bh - which in turn will lead to
local_bh_enable_ip - which in turn throws the WARN-ON-ONCE since interrupts are
disabled at this stage.
Below patch is a quick fix which seems to make this lock-warning disappear.
With this patch, the WARN-ON-ONCE is no longer seen on giving rds-info command.
However, I am not much aware of RDS stack, and I don't know the motive of using
spin-lock-irqsave in rds code for the sock-lock. I am also not aware if the below
patch could introduce regressions in RDS code. So, it would be helpful if someone
who knows RDS can review the patch and comment on what should be the proper fix.
Thanks,
Kumar.
----------------------
[PATCH] net/rds: Use spin_lock for rds_sock_lock
On giving rds-info command, we get below lock-warning in dmesg:
....
Call Trace:
[<ffffffff8106446f>] warn_slowpath_common+0x7f/0xc0
[<ffffffff810644ca>] warn_slowpath_null+0x1a/0x20
[<ffffffff8106af6a>] local_bh_enable_ip+0x7a/0xa0
[<ffffffff814d3126>] _raw_read_unlock_bh+0x16/0x20
[<ffffffff813f83b4>] sock_i_ino+0x44/0x60
[<ffffffffa046ab02>] rds_sock_info+0xd2/0x140 [rds]
[<ffffffffa046c7a7>] rds_info_getsockopt+0x197/0x200 [rds]
[<ffffffffa046a1b4>] rds_getsockopt+0x84/0xe0 [rds]
[<ffffffff813f3453>] sys_getsockopt+0x73/0xe0
[<ffffffff814dafd7>] tracesys+0xd9/0xde
....
Fix this by using spin_lock for rds_sock_lock, instead of using
spin_lock_irqsave.
Signed-off-by: Kumar Sanghvi <kumaras@chelsio.com>
---
net/rds/af_rds.c | 20 ++++++++------------
1 files changed, 8 insertions(+), 12 deletions(-)
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index bb6ad81..5080ae2 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -68,7 +68,6 @@ static int rds_release(struct socket *sock)
{
struct sock *sk = sock->sk;
struct rds_sock *rs;
- unsigned long flags;
if (!sk)
goto out;
@@ -94,10 +93,10 @@ static int rds_release(struct socket *sock)
rds_rdma_drop_keys(rs);
rds_notify_queue_get(rs, NULL);
- spin_lock_irqsave(&rds_sock_lock, flags);
+ spin_lock(&rds_sock_lock);
list_del_init(&rs->rs_item);
rds_sock_count--;
- spin_unlock_irqrestore(&rds_sock_lock, flags);
+ spin_unlock(&rds_sock_lock);
rds_trans_put(rs->rs_transport);
@@ -409,7 +408,6 @@ static const struct proto_ops rds_proto_ops = {
static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
{
- unsigned long flags;
struct rds_sock *rs;
sock_init_data(sock, sk);
@@ -426,10 +424,10 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
spin_lock_init(&rs->rs_rdma_lock);
rs->rs_rdma_keys = RB_ROOT;
- spin_lock_irqsave(&rds_sock_lock, flags);
+ spin_lock(&rds_sock_lock);
list_add_tail(&rs->rs_item, &rds_sock_list);
rds_sock_count++;
- spin_unlock_irqrestore(&rds_sock_lock, flags);
+ spin_unlock(&rds_sock_lock);
return 0;
}
@@ -471,12 +469,11 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len,
{
struct rds_sock *rs;
struct rds_incoming *inc;
- unsigned long flags;
unsigned int total = 0;
len /= sizeof(struct rds_info_message);
- spin_lock_irqsave(&rds_sock_lock, flags);
+ spin_lock(&rds_sock_lock);
list_for_each_entry(rs, &rds_sock_list, rs_item) {
read_lock(&rs->rs_recv_lock);
@@ -492,7 +489,7 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len,
read_unlock(&rs->rs_recv_lock);
}
- spin_unlock_irqrestore(&rds_sock_lock, flags);
+ spin_unlock(&rds_sock_lock);
lens->nr = total;
lens->each = sizeof(struct rds_info_message);
@@ -504,11 +501,10 @@ static void rds_sock_info(struct socket *sock, unsigned int len,
{
struct rds_info_socket sinfo;
struct rds_sock *rs;
- unsigned long flags;
len /= sizeof(struct rds_info_socket);
- spin_lock_irqsave(&rds_sock_lock, flags);
+ spin_lock(&rds_sock_lock);
if (len < rds_sock_count)
goto out;
@@ -529,7 +525,7 @@ out:
lens->nr = rds_sock_count;
lens->each = sizeof(struct rds_info_socket);
- spin_unlock_irqrestore(&rds_sock_lock, flags);
+ spin_unlock(&rds_sock_lock);
}
static void rds_exit(void)
--
1.7.6.2
^ permalink raw reply related
* Re: WARNING: at mm/slub.c:3357, kernel BUG at mm/slub.c:3413
From: Markus Trippelsdorf @ 2011-11-18 7:55 UTC (permalink / raw)
To: linux-kernel
Cc: linux-mm, Christoph Lameter, Pekka Enberg, Matt Mackall, Alex Shi,
netdev, Eric Dumazet
In-Reply-To: <20111118072519.GA1615@x4.trippels.de>
On 2011.11.18 at 08:25 +0100, Markus Trippelsdorf wrote:
> This happend during boot this morning:
>
> Nov 18 07:57:12 x4 kernel: XFS (sdb2): Ending clean mount
> Nov 18 07:57:12 x4 kernel: VFS: Mounted root (xfs filesystem) readonly on device 8:18.
> Nov 18 07:57:12 x4 kernel: devtmpfs: mounted
> Nov 18 07:57:12 x4 kernel: Freeing unused kernel memory: 436k freed
> Nov 18 07:57:12 x4 kernel: Write protecting the kernel read-only data: 8192k
> Nov 18 07:57:12 x4 kernel: Freeing unused kernel memory: 1220k freed
> Nov 18 07:57:12 x4 kernel: Freeing unused kernel memory: 132k freed
> Nov 18 07:57:12 x4 kernel: XFS (sda): Mounting Filesystem
> Nov 18 07:57:12 x4 kernel: XFS (sda): Ending clean mount
> Nov 18 07:57:12 x4 kernel: ATL1E 0000:02:00.0: irq 40 for MSI/MSI-X
> Nov 18 07:57:12 x4 kernel: ATL1E 0000:02:00.0: eth0: NIC Link is Up <100 Mbps Full Duplex>
> Nov 18 07:57:12 x4 kernel: ATL1E 0000:02:00.0: eth0: NIC Link is Up <100 Mbps Full Duplex>
> Nov 18 07:57:13 x4 kernel: Adding 2097148k swap on /var/tmp/swap/swapfile. Priority:-1 extents:2 across:2634672k
> Nov 18 07:57:16 x4 kernel: ------------[ cut here ]------------
> Nov 18 07:57:16 x4 kernel: WARNING: at mm/slub.c:3357 ksize+0xa5/0xb0()
> Nov 18 07:57:16 x4 kernel: Hardware name: System Product Name
> Nov 18 07:57:16 x4 kernel: Pid: 1539, comm: wmii Not tainted 3.2.0-rc2-00057-ga9098b3-dirty #60
> Nov 18 07:57:16 x4 kernel: Call Trace: Nov 18 07:57:16 x4 kernel: [<ffffffff8106cb75>] warn_slowpath_common+0x75/0xb0
> Nov 18 07:57:16 x4 kernel: [<ffffffff8106cc75>] warn_slowpath_null+0x15/0x20
> Nov 18 07:57:16 x4 kernel: [<ffffffff81103985>] ksize+0xa5/0xb0
> Nov 18 07:57:16 x4 kernel: [<ffffffff8141e3ee>] __alloc_skb+0x7e/0x210
> Nov 18 07:57:16 x4 kernel: [<ffffffff8141b75e>] sock_alloc_send_pskb+0x1be/0x300
> Nov 18 07:57:16 x4 kernel: [<ffffffff8141a5d2>] ? sock_wfree+0x52/0x60 Nov 18 07:57:16 x4 kernel: [<ffffffff8141b8b0>] sock_alloc_send_skb+0x10/0x20 Nov 18 07:57:16 x4 kernel: [<ffffffff814a3c61>] unix_stream_sendmsg+0x261/0x420
> Nov 18 07:57:16 x4 kernel: [<ffffffff814176ff>] sock_aio_write+0xdf/0x100 Nov 18 07:57:16 x4 kernel: [<ffffffff81417620>] ? sock_aio_read+0x110/0x110
> Nov 18 07:57:16 x4 kernel: [<ffffffff8110b6ca>] do_sync_readv_writev+0xca/0x110
> Nov 18 07:57:16 x4 kernel: [<ffffffff8110b7fb>] ? rw_copy_check_uvector+0x6b/0x130
> Nov 18 07:57:16 x4 kernel: [<ffffffff8110a8a6>] ? do_sync_read+0xd6/0x110 Nov 18 07:57:16 x4 kernel: [<ffffffff8110b996>] do_readv_writev+0xd6/0x1e0
> Nov 18 07:57:16 x4 kernel: [<ffffffff8110bb20>] vfs_writev+0x30/0x60 Nov 18 07:57:16 x4 kernel: [<ffffffff8110bc25>] sys_writev+0x45/0x90
> Nov 18 07:57:16 x4 kernel: [<ffffffff8111d8d1>] ? sys_poll+0x71/0x110
> Nov 18 07:57:16 x4 kernel: [<ffffffff814ca5fb>] system_call_fastpath+0x16/0x1b
> Nov 18 07:57:16 x4 kernel: ---[ end trace 320a3cfbcb373e9a ]---
> Nov 18 07:57:16 x4 kernel: ------------[ cut here ]------------
> Nov 18 07:57:16 x4 kernel: kernel BUG at mm/slub.c:3413!
> Nov 18 07:57:16 x4 kernel: invalid opcode: 0000 [#1] PREEMPT SMP
> Nov 18 07:57:16 x4 kernel: CPU 1
> Nov 18 07:57:16 x4 kernel: Pid: 1500, comm: X Tainted: G W 3.2.0-rc2-00057-ga9098b3-dirty #60 System manufacturer System Product Name/M4A78T-E
> Nov 18 07:57:16 x4 kernel: RIP: 0010:[<ffffffff81103ac1>] [<ffffffff81103ac1>] kfree+0x131/0x140
> Nov 18 07:57:16 x4 kernel: RSP: 0018:ffff88020fbc1b88 EFLAGS: 00010246
> Nov 18 07:57:16 x4 kernel: RAX: 4000000000000000 RBX: ffff880200000000 RCX: 0000000000000304
> Nov 18 07:57:16 x4 kernel: RDX: ffffffff7fffffff RSI: 0000000000000282 RDI: ffff880200000000
> Nov 18 07:57:16 x4 kernel: RBP: ffff88020fbc1ba8 R08: 0000000000000304 R09: ffffea0008000000
> Nov 18 07:57:16 x4 kernel: R10: 00000000005d6ba0 R11: 0000000000003246 R12: ffff880213570700
> Nov 18 07:57:16 x4 kernel: R13: ffffffff8141e8c0 R14: ffff880213570700 R15: ffff880216bba3a0
> Nov 18 07:57:16 x4 kernel: FS: 00007fdc1da79880(0000) GS:ffff88021fc80000(0000) knlGS:0000000000000000
> Nov 18 07:57:16 x4 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> Nov 18 07:57:16 x4 kernel: CR2: 0000000001b39a98 CR3: 000000020fbfc000 CR4: 00000000000006e0
> Nov 18 07:57:16 x4 kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> Nov 18 07:57:16 x4 kernel: DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
> Nov 18 07:57:16 x4 kernel: Process X (pid: 1500, threadinfo ffff88020fbc0000, task ffff880216bba3a0)
> Nov 18 07:57:16 x4 kernel: Stack:
> Nov 18 07:57:16 x4 kernel: 0000000000000000 ffff880213570700 ffff880213570700 0000000000000000
> Nov 18 07:57:16 x4 kernel: ffff88020fbc1bc8 ffffffff8141e8c0 ffff880213570700 ffff880213570700
> Nov 18 07:57:16 x4 kernel: ffff88020fbc1be8 ffffffff8141e8e9 ffff880200000000 ffff880213570700
> Nov 18 07:57:16 x4 kernel: Call Trace:
> Nov 18 07:57:16 x4 kernel: [<ffffffff8141e8c0>] skb_release_data+0xf0/0x100
> Nov 18 07:57:16 x4 kernel: [<ffffffff8141e8e9>] skb_release_all+0x19/0x20
> Nov 18 07:57:16 x4 kernel: [<ffffffff8141e901>] __kfree_skb+0x11/0xa0
> Nov 18 07:57:16 x4 kernel: [<ffffffff8141e9b6>] consume_skb+0x26/0xa0
> Nov 18 07:57:16 x4 kernel: [<ffffffff814a5614>] unix_stream_recvmsg+0x2c4/0x790
> Nov 18 07:57:16 x4 kernel: [<ffffffff8111c1c0>] ? __pollwait+0xf0/0xf0
> Nov 18 07:57:16 x4 kernel: [<ffffffff814175f4>] sock_aio_read+0xe4/0x110
> Nov 18 07:57:16 x4 kernel: [<ffffffff8110a8a6>] do_sync_read+0xd6/0x110
> Nov 18 07:57:16 x4 kernel: [<ffffffff8108fa64>] ? enqueue_hrtimer+0x24/0xc0
> Nov 18 07:57:16 x4 kernel: [<ffffffff81090303>] ? hrtimer_start+0x13/0x20
> Nov 18 07:57:16 x4 kernel: [<ffffffff810714ac>] ? do_setitimer+0x1bc/0x240
> Nov 18 07:57:16 x4 kernel: [<ffffffff8110b095>] vfs_read+0x135/0x160
> Nov 18 07:57:16 x4 kernel: [<ffffffff8110b375>] sys_read+0x45/0x90
> Nov 18 07:57:16 x4 kernel: [<ffffffff814ca5fb>] system_call_fastpath+0x16/0x1b
> Nov 18 07:57:16 x4 kernel: Code: e9 3d ff ff ff 48 89 da 4c 89 ce e8 51 fe 3b 00 e9 77 ff ff ff 49 f7 01 00 c0 00 00 74 0d 4c 89 cf e8 64 24 fd ff e9 61 ff ff ff <0f> 0b 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 53 48
> Nov 18 07:57:16 x4 kernel: RIP [<ffffffff81103ac1>] kfree+0x131/0x140
> Nov 18 07:57:16 x4 kernel: RSP <ffff88020fbc1b88>
> Nov 18 07:57:16 x4 kernel: ---[ end trace 320a3cfbcb373e9b ]---
> Nov 18 08:01:30 x4 kernel: SysRq : Emergency Sync
> Nov 18 08:01:30 x4 kernel: Emergency Sync complete
>
> The dirty flag comes from a bunch of unrelated xfs patches from Christoph, that
> I'm testing right now.
>
> Please also see my previous post: http://thread.gmane.org/gmane.linux.kernel/1215023
> It looks like something is scribbling over memory.
>
> This machine uses ECC, so bit-flips should be impossible.
CC'ing netdev@vger.kernel.org and Eric Dumazet.
--
Markus
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply
* Re: [PATCH v7] Phonet: set the pipe handle using setsockopt
From: Rémi Denis-Courmont @ 2011-11-18 7:58 UTC (permalink / raw)
To: Hemant-vilas RAMDASI, netdev@vger.kernel.org
In-Reply-To: <AA50405D3B026C488CB3DB0AADE695401F56663F03@EXDCVYMBSTM005.EQ1STM.local>
Le Vendredi 18 Novembre 2011 06:27:44 ext Dinesh Kumar SHARMA a écrit :
> Hi,
>
> Can this be applied now?
I don't know. break after return looks a bit odd to me, but I am not sure what
the kernel style officially is.
--
Rémi Denis-Courmont
http://www.remlab.net/
^ permalink raw reply
* Re: Route cache problem.
From: Rogier Wolff @ 2011-11-18 8:17 UTC (permalink / raw)
To: Eric Dumazet; +Cc: linux-kernel, netdev
In-Reply-To: <1320333410.2344.13.camel@edumazet-HP-Compaq-6005-Pro-SFF-PC>
On Thu, Nov 03, 2011 at 04:16:50PM +0100, Eric Dumazet wrote:
> Le jeudi 03 novembre 2011 à 15:37 +0100, Rogier Wolff a écrit :
> > Hi,
> >
> > My workstation has an incorrect route cache entry:
> >
>
> What kernel version ?
Linux version 3.0.0-12-generic (from Ubunutu oneiric.)
> > assurancetourix:~> route -nC | head -2 ; route -nC | grep 234.34
> > Kernel IP routing cache
> > Source Destination Gateway Flags Metric Ref Use Iface
> > 192.168.235.8 192.168.234.34 192.168.235.251 0 0 3 eth0
> > 192.168.235.8 192.168.234.34 192.168.235.251 0 0 4 eth0
> > 192.168.235.8 192.168.234.34 192.168.235.251 0 0 2 eth0
> >
> > (I don't know why there are three).
Today there are four.
> 192.168.20.108 10.37.168.112 192.168.20.254 0 1 2 eth3
That indeed got me a full complement of route cache entries.
> Better use "ip -s route list cache" to diagnose problems (more
> information)
After doing the tos ping you suggested All TOS levels have a route
cache entry.
192.168.234.34 from 192.168.235.8 tos 0x1c via 192.168.235.251 dev eth0
cache <redirected> age 77sec ipid 0xaa09 rtt 47ms rttvar 15ms ssthresh 7 cwnd 9
192.168.234.34 tos 0x1c via 192.168.235.251 dev eth0 src 192.168.235.8
cache <redirected> used 3 age 72sec ipid 0xaa09 rtt 47ms rttvar 15ms ssthresh 7 cwnd 9
192.168.234.34 from 192.168.235.8 tos 0x1c via 192.168.235.251 dev eth0
cache <redirected> age 72sec ipid 0xaa09 rtt 47ms rttvar 15ms ssthresh 7 cwnd 9
> > Any suggestions? Any at all?
Last time, as well as this time, it is triggered by a network error
that leads to the 192.168.235.4 router not being able to reach
192.168.234.34 or any other host on the 192.168.234.0/24 network.
During that time the VPN to 192.168.234.0/24 is down, so 192.168.235.4
doesn't have a route to 192.168.234.0/24 and it is logical that
with that route gone, it sends packets for 192.168.234.0/24 to the default
router 192.168.235.251. As it sees itself forwarding packets that come
in on eth0 back to eth0, it will send a redirect. However that redirect
should somehow expire, and not survive things like dropping the route
to 192.168.234.0/24, dropping the default route, shutting down the
interface or some time passing (that network problem was solved 20
hours ago)......
Roger.
--
** R.E.Wolff@BitWizard.nl ** http://www.BitWizard.nl/ ** +31-15-2600998 **
** Delftechpark 26 2628 XH Delft, The Netherlands. KVK: 27239233 **
*-- BitWizard writes Linux device drivers for any device you may have! --*
The plan was simple, like my brother-in-law Phil. But unlike
Phil, this plan just might work.
^ permalink raw reply
* Re: lock-warning seen on giving rds-info command
From: Eric Dumazet @ 2011-11-18 8:26 UTC (permalink / raw)
To: Kumar Sanghvi; +Cc: netdev, davem, venkat.x.venkatsubra
In-Reply-To: <1321603654-17179-1-git-send-email-kumaras@chelsio.com>
Le vendredi 18 novembre 2011 à 13:37 +0530, Kumar Sanghvi a écrit :
> Hi netdev team,
>
> I am trying to investigate one issue with RDS.
> On giving command rds-info, we get below trace in dmesg:
>
> ------------[ cut here ]------------
> WARNING: at /root/chelsio/git_repos/linux-2.6/kernel/softirq.c:159 local_bh_enable_ip+0x7a/0xa0()
> Hardware name: X7DB8
> Modules linked in: rds_rdma rds rdma_cm iw_cm ib_addr autofs4 sunrpc
> p4_clockmod freq_table speedstep_lib ib_ipoib ib_cm ib_sa ipv6 ib_uverbs
> ib_umad ib_mad iw_nes libcrc32c ib_core dm_mirror dm_region_hash dm_log
> uinput ppdev sg parport_pc parport pcspkr serio_raw i2c_i801 iTCO_wdt
> iTCO_vendor_support ioatdma dca i5k_amb i5000_edac edac_core e1000e shpchp
> ext4 mbcache jbd2 sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_generic ata_piix
> floppy radeon ttm drm_kms_helper drm hwmon i2c_algo_bit i2c_core dm_mod
> [last unloaded: mdio]
> Pid: 1937, comm: rds-info Not tainted 3.1.0 #1
> Call Trace:
> [<ffffffff8106446f>] warn_slowpath_common+0x7f/0xc0
> [<ffffffff810644ca>] warn_slowpath_null+0x1a/0x20
> [<ffffffff8106af6a>] local_bh_enable_ip+0x7a/0xa0
> [<ffffffff814d3126>] _raw_read_unlock_bh+0x16/0x20
> [<ffffffff813f83b4>] sock_i_ino+0x44/0x60
> [<ffffffffa046ab02>] rds_sock_info+0xd2/0x140 [rds]
> [<ffffffffa046c7a7>] rds_info_getsockopt+0x197/0x200 [rds]
> [<ffffffffa046a1b4>] rds_getsockopt+0x84/0xe0 [rds]
> [<ffffffff813f3453>] sys_getsockopt+0x73/0xe0
> [<ffffffff814dafd7>] tracesys+0xd9/0xde
> ---[ end trace 4e7838d2e1e53231 ]---
>
> I tested this on 3.1 kernel. However, this problem seems to be present on most
> recent kernels. I tried to investigate, and the RDS problem seems to stem(I might be wrong)
> after this upstream commit:
>
> ----
> commit f064af1e500a2bf4607706f0f458163bdb2a6ea5
> Author: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Wed Sep 22 12:43:39 2010 +0000
>
> net: fix a lockdep splat
> ----
>
> In the above commit, sock_i_ino is modified to use read_lock/unlock_bh instead
> of plain read_lock/unlock.
>
> Now, everywhere in Linux network stack, it seems that callers of sock_i_ino
> are not disabling interrupts and are also not holding any lock while calling
> sock_i_ino function. However, RDS seems to be only one who is calling sock_i_ino
> with spin_lock_irqsave i.e. with interrupts disabled.
> As a result, when sock_i_ino finally calls read_unlock_bh - which in turn will lead to
> local_bh_enable_ip - which in turn throws the WARN-ON-ONCE since interrupts are
> disabled at this stage.
>
You mean that following sequence triggers a warning ?
spin_lock_irqsave(&rds_sock_lock, flags);
...
read_lock_bh(&sk->sk_callback_lock);
read_unlock_bh(&sk->sk_callback_lock); // HERE
...
I dont know why. How softirqs can trigger if we block hard irqs ?
In any case your patch is not the right solution, please read
vi +112 net/rds/af_rds.c
/*
* Careful not to race with rds_release -> sock_orphan which clears sk_sleep.
* _bh() isn't OK here, we're called from interrupt handlers. It's probably OK
* to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
* this seems more conservative.
* NB - normally, one would use sk_callback_lock for this, but we can
* get here from interrupts, whereas the network code grabs sk_callback_lock
* with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
*/
void rds_wake_sk_sleep(struct rds_sock *rs)
{
unsigned long flags;
read_lock_irqsave(&rs->rs_recv_lock, flags);
__rds_wake_sk_sleep(rds_rs_to_sk(rs));
read_unlock_irqrestore(&rs->rs_recv_lock, flags);
}
^ permalink raw reply
* Re: lock-warning seen on giving rds-info command
From: Kumar Sanghvi @ 2011-11-18 8:54 UTC (permalink / raw)
To: Eric Dumazet; +Cc: netdev, davem, venkat.x.venkatsubra
In-Reply-To: <1321604802.2444.40.camel@edumazet-laptop>
Hi Eric,
On Fri, Nov 18, 2011 at 09:26:42 +0100, Eric Dumazet wrote:
> Le vendredi 18 novembre 2011 à 13:37 +0530, Kumar Sanghvi a écrit :
> > Hi netdev team,
> >
> > I am trying to investigate one issue with RDS.
> > On giving command rds-info, we get below trace in dmesg:
> >
> > ------------[ cut here ]------------
> > WARNING: at /root/chelsio/git_repos/linux-2.6/kernel/softirq.c:159 local_bh_enable_ip+0x7a/0xa0()
> > Hardware name: X7DB8
> > Modules linked in: rds_rdma rds rdma_cm iw_cm ib_addr autofs4 sunrpc
> > p4_clockmod freq_table speedstep_lib ib_ipoib ib_cm ib_sa ipv6 ib_uverbs
> > ib_umad ib_mad iw_nes libcrc32c ib_core dm_mirror dm_region_hash dm_log
> > uinput ppdev sg parport_pc parport pcspkr serio_raw i2c_i801 iTCO_wdt
> > iTCO_vendor_support ioatdma dca i5k_amb i5000_edac edac_core e1000e shpchp
> > ext4 mbcache jbd2 sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_generic ata_piix
> > floppy radeon ttm drm_kms_helper drm hwmon i2c_algo_bit i2c_core dm_mod
> > [last unloaded: mdio]
> > Pid: 1937, comm: rds-info Not tainted 3.1.0 #1
> > Call Trace:
> > [<ffffffff8106446f>] warn_slowpath_common+0x7f/0xc0
> > [<ffffffff810644ca>] warn_slowpath_null+0x1a/0x20
> > [<ffffffff8106af6a>] local_bh_enable_ip+0x7a/0xa0
> > [<ffffffff814d3126>] _raw_read_unlock_bh+0x16/0x20
> > [<ffffffff813f83b4>] sock_i_ino+0x44/0x60
> > [<ffffffffa046ab02>] rds_sock_info+0xd2/0x140 [rds]
> > [<ffffffffa046c7a7>] rds_info_getsockopt+0x197/0x200 [rds]
> > [<ffffffffa046a1b4>] rds_getsockopt+0x84/0xe0 [rds]
> > [<ffffffff813f3453>] sys_getsockopt+0x73/0xe0
> > [<ffffffff814dafd7>] tracesys+0xd9/0xde
> > ---[ end trace 4e7838d2e1e53231 ]---
> >
> > I tested this on 3.1 kernel. However, this problem seems to be present on most
> > recent kernels. I tried to investigate, and the RDS problem seems to stem(I might be wrong)
> > after this upstream commit:
> >
> > ----
> > commit f064af1e500a2bf4607706f0f458163bdb2a6ea5
> > Author: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed Sep 22 12:43:39 2010 +0000
> >
> > net: fix a lockdep splat
> > ----
> >
> > In the above commit, sock_i_ino is modified to use read_lock/unlock_bh instead
> > of plain read_lock/unlock.
> >
> > Now, everywhere in Linux network stack, it seems that callers of sock_i_ino
> > are not disabling interrupts and are also not holding any lock while calling
> > sock_i_ino function. However, RDS seems to be only one who is calling sock_i_ino
> > with spin_lock_irqsave i.e. with interrupts disabled.
> > As a result, when sock_i_ino finally calls read_unlock_bh - which in turn will lead to
> > local_bh_enable_ip - which in turn throws the WARN-ON-ONCE since interrupts are
> > disabled at this stage.
> >
>
> You mean that following sequence triggers a warning ?
It seems to be. I am not completely sure however.
>
> spin_lock_irqsave(&rds_sock_lock, flags);
> ...
> read_lock_bh(&sk->sk_callback_lock);
> read_unlock_bh(&sk->sk_callback_lock); // HERE
> ...
>
>
> I dont know why. How softirqs can trigger if we block hard irqs ?
>
> In any case your patch is not the right solution, please read
>
> vi +112 net/rds/af_rds.c
Yes, I am aware of this. However, I thought I would better report this to
netdev with a possible, although incorrect, solution.
I am wondering how to correctly resolve this.
>
> /*
> * Careful not to race with rds_release -> sock_orphan which clears sk_sleep.
> * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK
> * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
> * this seems more conservative.
> * NB - normally, one would use sk_callback_lock for this, but we can
> * get here from interrupts, whereas the network code grabs sk_callback_lock
> * with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
> */
> void rds_wake_sk_sleep(struct rds_sock *rs)
> {
> unsigned long flags;
>
> read_lock_irqsave(&rs->rs_recv_lock, flags);
> __rds_wake_sk_sleep(rds_rs_to_sk(rs));
> read_unlock_irqrestore(&rs->rs_recv_lock, flags);
> }
>
>
>
^ permalink raw reply
* Re: WARNING: at mm/slub.c:3357, kernel BUG at mm/slub.c:3413
From: Alex,Shi @ 2011-11-18 8:43 UTC (permalink / raw)
To: Markus Trippelsdorf
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org,
Christoph Lameter, Pekka Enberg, Matt Mackall,
netdev@vger.kernel.org, Eric Dumazet
In-Reply-To: <20111118075521.GB1615@x4.trippels.de>
> >
> > The dirty flag comes from a bunch of unrelated xfs patches from Christoph, that
> > I'm testing right now.
Where is the xfs patchset? I am wondering if it is due to slub code. It
is also possible xfs set a incorrect page flags.
> >
> > Please also see my previous post: http://thread.gmane.org/gmane.linux.kernel/1215023
> > It looks like something is scribbling over memory.
> >
> > This machine uses ECC, so bit-flips should be impossible.
>
> CC'ing netdev@vger.kernel.org and Eric Dumazet.
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply
* Re: WARNING: at mm/slub.c:3357, kernel BUG at mm/slub.c:3413
From: Markus Trippelsdorf @ 2011-11-18 8:54 UTC (permalink / raw)
To: Alex,Shi
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org,
Christoph Lameter, Pekka Enberg, Matt Mackall,
netdev@vger.kernel.org, Eric Dumazet
In-Reply-To: <1321605837.30341.551.camel@debian>
On 2011.11.18 at 16:43 +0800, Alex,Shi wrote:
> > >
> > > The dirty flag comes from a bunch of unrelated xfs patches from Christoph, that
> > > I'm testing right now.
>
> Where is the xfs patchset? I am wondering if it is due to slub code. It
> is also possible xfs set a incorrect page flags.
http://thread.gmane.org/gmane.comp.file-systems.xfs.general/41810
--
Markus
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply
* Re: WARNING: at mm/slub.c:3357, kernel BUG at mm/slub.c:3413
From: Markus Trippelsdorf @ 2011-11-18 8:57 UTC (permalink / raw)
To: Alex,Shi
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org,
Christoph Lameter, Pekka Enberg, Matt Mackall,
netdev@vger.kernel.org, Eric Dumazet
In-Reply-To: <20111118085436.GC1615@x4.trippels.de>
On 2011.11.18 at 09:54 +0100, Markus Trippelsdorf wrote:
> On 2011.11.18 at 16:43 +0800, Alex,Shi wrote:
> > > >
> > > > The dirty flag comes from a bunch of unrelated xfs patches from Christoph, that
> > > > I'm testing right now.
> >
> > Where is the xfs patchset? I am wondering if it is due to slub code. It
> > is also possible xfs set a incorrect page flags.
>
> http://thread.gmane.org/gmane.comp.file-systems.xfs.general/41810
This is the diff:
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 574d4ee..b436581 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -26,6 +26,7 @@
#include "xfs_bmap_btree.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_rw.h"
@@ -99,24 +100,6 @@ xfs_destroy_ioend(
}
/*
- * If the end of the current ioend is beyond the current EOF,
- * return the new EOF value, otherwise zero.
- */
-STATIC xfs_fsize_t
-xfs_ioend_new_eof(
- xfs_ioend_t *ioend)
-{
- xfs_inode_t *ip = XFS_I(ioend->io_inode);
- xfs_fsize_t isize;
- xfs_fsize_t bsize;
-
- bsize = ioend->io_offset + ioend->io_size;
- isize = MAX(ip->i_size, ip->i_new_size);
- isize = MIN(isize, bsize);
- return isize > ip->i_d.di_size ? isize : 0;
-}
-
-/*
* Fast and loose check if this write could update the on-disk inode size.
*/
static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
@@ -131,30 +114,40 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
* will be the intended file size until i_size is updated. If this write does
* not extend all the way to the valid file size then restrict this update to
* the end of the write.
- *
- * This function does not block as blocking on the inode lock in IO completion
- * can lead to IO completion order dependency deadlocks.. If it can't get the
- * inode ilock it will return EAGAIN. Callers must handle this.
*/
STATIC int
xfs_setfilesize(
- xfs_ioend_t *ioend)
+ struct xfs_ioend *ioend)
{
- xfs_inode_t *ip = XFS_I(ioend->io_inode);
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
xfs_fsize_t isize;
+ int error = 0;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ if (!isize)
+ return 0;
- if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
- return EAGAIN;
+ trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
- isize = xfs_ioend_new_eof(ioend);
- if (isize) {
- trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
- ip->i_d.di_size = isize;
- xfs_mark_inode_dirty(ip);
+ tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+ error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return error;
}
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return 0;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ ip->i_d.di_size = isize;
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ return xfs_trans_commit(tp, 0);
}
/*
@@ -168,10 +161,12 @@ xfs_finish_ioend(
struct xfs_ioend *ioend)
{
if (atomic_dec_and_test(&ioend->io_remaining)) {
+ struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
+
if (ioend->io_type == IO_UNWRITTEN)
- queue_work(xfsconvertd_workqueue, &ioend->io_work);
+ queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
else if (xfs_ioend_is_append(ioend))
- queue_work(xfsdatad_workqueue, &ioend->io_work);
+ queue_work(mp->m_data_workqueue, &ioend->io_work);
else
xfs_destroy_ioend(ioend);
}
@@ -206,29 +201,14 @@ xfs_end_io(
ioend->io_error = -error;
goto done;
}
+ } else if (xfs_ioend_is_append(ioend)) {
+ error = xfs_setfilesize(ioend);
+ if (error)
+ ioend->io_error = error;
}
- /*
- * We might have to update the on-disk file size after extending
- * writes.
- */
- error = xfs_setfilesize(ioend);
- ASSERT(!error || error == EAGAIN);
-
done:
- /*
- * If we didn't complete processing of the ioend, requeue it to the
- * tail of the workqueue for another attempt later. Otherwise destroy
- * it.
- */
- if (error == EAGAIN) {
- atomic_inc(&ioend->io_remaining);
- xfs_finish_ioend(ioend);
- /* ensure we don't spin on blocked ioends */
- delay(1);
- } else {
- xfs_destroy_ioend(ioend);
- }
+ xfs_destroy_ioend(ioend);
}
/*
@@ -385,13 +365,6 @@ xfs_submit_ioend_bio(
bio->bi_private = ioend;
bio->bi_end_io = xfs_end_bio;
- /*
- * If the I/O is beyond EOF we mark the inode dirty immediately
- * but don't update the inode size until I/O completion.
- */
- if (xfs_ioend_new_eof(ioend))
- xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
-
submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
}
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 116dd5c..06e4caf 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -18,8 +18,6 @@
#ifndef __XFS_AOPS_H__
#define __XFS_AOPS_H__
-extern struct workqueue_struct *xfsdatad_workqueue;
-extern struct workqueue_struct *xfsconvertd_workqueue;
extern mempool_t *xfs_ioend_pool;
/*
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index d4906e7..8bb1052 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -110,6 +110,7 @@ xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
/*
* Query whether the requested number of additional bytes of extended
* attribute space will be able to fit inline.
+ *
* Returns zero if not, else the di_forkoff fork offset to be used in the
* literal area for attribute data once the new bytes have been added.
*
@@ -136,11 +137,26 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
return (offset >= minforkoff) ? minforkoff : 0;
}
- if (!(mp->m_flags & XFS_MOUNT_ATTR2)) {
- if (bytes <= XFS_IFORK_ASIZE(dp))
- return dp->i_d.di_forkoff;
+ /*
+ * If the requested numbers of bytes is smaller or equal to the
+ * current attribute fork size we can always proceed.
+ *
+ * Note that if_bytes in the data fork might actually be larger than
+ * the current data fork size is due to delalloc extents. In that
+ * case either the extent count will go down when they are converted
+ * to ral extents, or the delalloc conversion will take care of the
+ * literal area rebalancing.
+ */
+ if (bytes <= XFS_IFORK_ASIZE(dp))
+ return dp->i_d.di_forkoff;
+
+ /*
+ * For attr2 we can try to move the forkoff if there is space in the
+ * literal area, but for the old format we are done if there is no
+ * space in the fixes attribute fork.
+ */
+ if (!(mp->m_flags & XFS_MOUNT_ATTR2))
return 0;
- }
dsize = dp->i_df.if_bytes;
@@ -157,10 +173,9 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
xfs_default_attroffset(dp))
dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
break;
-
case XFS_DINODE_FMT_BTREE:
/*
- * If have data btree then keep forkoff if we have one,
+ * If have a data btree then keep forkoff if we have one,
* otherwise we are adding a new attr, so then we set
* minforkoff to where the btree root can finish so we have
* plenty of room for attrs
@@ -168,10 +183,9 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
if (dp->i_d.di_forkoff) {
if (offset < dp->i_d.di_forkoff)
return 0;
- else
- return dp->i_d.di_forkoff;
- } else
- dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot);
+ return dp->i_d.di_forkoff;
+ }
+ dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot);
break;
}
@@ -186,10 +200,10 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
maxforkoff = maxforkoff >> 3; /* rounded down */
- if (offset >= minforkoff && offset < maxforkoff)
- return offset;
if (offset >= maxforkoff)
return maxforkoff;
+ if (offset >= minforkoff)
+ return offset;
return 0;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index cf0ac05..fcde6c3 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -45,8 +45,6 @@ static kmem_zone_t *xfs_buf_zone;
STATIC int xfsbufd(void *);
static struct workqueue_struct *xfslogd_workqueue;
-struct workqueue_struct *xfsdatad_workqueue;
-struct workqueue_struct *xfsconvertd_workqueue;
#ifdef XFS_BUF_LOCK_TRACKING
# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid)
@@ -1797,21 +1795,8 @@ xfs_buf_init(void)
if (!xfslogd_workqueue)
goto out_free_buf_zone;
- xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1);
- if (!xfsdatad_workqueue)
- goto out_destroy_xfslogd_workqueue;
-
- xfsconvertd_workqueue = alloc_workqueue("xfsconvertd",
- WQ_MEM_RECLAIM, 1);
- if (!xfsconvertd_workqueue)
- goto out_destroy_xfsdatad_workqueue;
-
return 0;
- out_destroy_xfsdatad_workqueue:
- destroy_workqueue(xfsdatad_workqueue);
- out_destroy_xfslogd_workqueue:
- destroy_workqueue(xfslogd_workqueue);
out_free_buf_zone:
kmem_zone_destroy(xfs_buf_zone);
out:
@@ -1821,8 +1806,6 @@ xfs_buf_init(void)
void
xfs_buf_terminate(void)
{
- destroy_workqueue(xfsconvertd_workqueue);
- destroy_workqueue(xfsdatad_workqueue);
destroy_workqueue(xfslogd_workqueue);
kmem_zone_destroy(xfs_buf_zone);
}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 753ed9b..2560248 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -436,6 +436,36 @@ xfs_aio_write_isize_update(
}
}
+STATIC int
+xfs_aio_write_isize_reset(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error = 0;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+ error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return error;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ if (ip->i_d.di_size <= ip->i_size) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_cancel(tp, 0);
+ return 0;
+ }
+
+ ip->i_d.di_size = ip->i_size;
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ return xfs_trans_commit(tp, 0);
+}
+
/*
* If this was a direct or synchronous I/O that failed (such as ENOSPC) then
* part of the I/O may have been written to disk before the error occurred. In
@@ -447,14 +477,18 @@ xfs_aio_write_newsize_update(
struct xfs_inode *ip,
xfs_fsize_t new_size)
{
+ bool reset = false;
if (new_size == ip->i_new_size) {
xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
if (new_size == ip->i_new_size)
ip->i_new_size = 0;
if (ip->i_d.di_size > ip->i_size)
- ip->i_d.di_size = ip->i_size;
+ reset = true;
xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
}
+
+ if (reset)
+ xfs_aio_write_isize_reset(ip);
}
/*
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 760140d..8ed926b 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -278,6 +278,20 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
}
/*
+ * If this I/O goes past the on-disk inode size update it unless it would
+ * be past the current in-core or write in-progress inode size.
+ */
+static inline xfs_fsize_t
+xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size)
+{
+ xfs_fsize_t i_size = max(ip->i_size, ip->i_new_size);
+
+ if (new_size > i_size)
+ new_size = i_size;
+ return new_size > ip->i_d.di_size ? new_size : 0;
+}
+
+/*
* i_flags helper functions
*/
static inline void
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 9afa282..9c86fa1 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -31,6 +31,7 @@
#include "xfs_ialloc_btree.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_inode_item.h"
#include "xfs_btree.h"
#include "xfs_bmap.h"
#include "xfs_rtalloc.h"
@@ -645,6 +646,7 @@ xfs_iomap_write_unwritten(
xfs_trans_t *tp;
xfs_bmbt_irec_t imap;
xfs_bmap_free_t free_list;
+ xfs_fsize_t i_size;
uint resblks;
int committed;
int error;
@@ -705,7 +707,22 @@ xfs_iomap_write_unwritten(
if (error)
goto error_on_bmapi_transaction;
- error = xfs_bmap_finish(&(tp), &(free_list), &committed);
+ /*
+ * Log the updated inode size as we go. We have to be careful
+ * to only log it up to the actual write offset if it is
+ * halfway into a block.
+ */
+ i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
+ if (i_size > offset + count)
+ i_size = offset + count;
+
+ i_size = xfs_new_eof(ip, i_size);
+ if (i_size) {
+ ip->i_d.di_size = i_size;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ }
+
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
if (error)
goto error_on_bmapi_transaction;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index bb24dac..4267fd8 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -211,6 +211,12 @@ typedef struct xfs_mount {
struct shrinker m_inode_shrink; /* inode reclaim shrinker */
int64_t m_low_space[XFS_LOWSP_MAX];
/* low free space thresholds */
+
+ struct workqueue_struct *m_data_workqueue;
+ struct workqueue_struct *m_unwritten_workqueue;
+#define XFS_WQ_NAME_LEN 512
+ char m_data_workqueue_name[XFS_WQ_NAME_LEN];
+ char m_unwritten_workqueue_name[XFS_WQ_NAME_LEN];
} xfs_mount_t;
/*
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 3eca58f..8c7a6e4 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -769,6 +769,42 @@ xfs_setup_devices(
return 0;
}
+STATIC int
+xfs_init_mount_workqueues(
+ struct xfs_mount *mp)
+{
+ snprintf(mp->m_data_workqueue_name, XFS_WQ_NAME_LEN,
+ "xfs-data/%s", mp->m_fsname);
+ mp->m_data_workqueue =
+ alloc_workqueue(mp->m_data_workqueue_name, WQ_MEM_RECLAIM, 1);
+ if (!mp->m_data_workqueue)
+ goto out;
+
+ snprintf(mp->m_unwritten_workqueue_name, XFS_WQ_NAME_LEN,
+ "xfs-conv/%s", mp->m_fsname);
+ mp->m_unwritten_workqueue =
+ alloc_workqueue(mp->m_unwritten_workqueue_name,
+ WQ_MEM_RECLAIM, 1);
+ if (!mp->m_unwritten_workqueue)
+ goto out_destroy_data_iodone_queue;
+
+ return 0;
+
+out_destroy_data_iodone_queue:
+ destroy_workqueue(mp->m_data_workqueue);
+out:
+ return -ENOMEM;
+#undef XFS_WQ_NAME_LEN
+}
+
+STATIC void
+xfs_destroy_mount_workqueues(
+ struct xfs_mount *mp)
+{
+ destroy_workqueue(mp->m_data_workqueue);
+ destroy_workqueue(mp->m_unwritten_workqueue);
+}
+
/* Catch misguided souls that try to use this interface on XFS */
STATIC struct inode *
xfs_fs_alloc_inode(
@@ -1020,6 +1056,7 @@ xfs_fs_put_super(
xfs_unmountfs(mp);
xfs_freesb(mp);
xfs_icsb_destroy_counters(mp);
+ xfs_destroy_mount_workqueues(mp);
xfs_close_devices(mp);
xfs_free_fsname(mp);
kfree(mp);
@@ -1353,10 +1390,14 @@ xfs_fs_fill_super(
if (error)
goto out_free_fsname;
- error = xfs_icsb_init_counters(mp);
+ error = xfs_init_mount_workqueues(mp);
if (error)
goto out_close_devices;
+ error = xfs_icsb_init_counters(mp);
+ if (error)
+ goto out_destroy_workqueues;
+
error = xfs_readsb(mp, flags);
if (error)
goto out_destroy_counters;
@@ -1419,6 +1460,8 @@ xfs_fs_fill_super(
xfs_freesb(mp);
out_destroy_counters:
xfs_icsb_destroy_counters(mp);
+out_destroy_workqueues:
+ xfs_destroy_mount_workqueues(mp);
out_close_devices:
xfs_close_devices(mp);
out_free_fsname:
--
Markus
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related
* [PATCH] vhost-net: Acquire device lock when releasing device
From: Sasha Levin @ 2011-11-18 9:19 UTC (permalink / raw)
To: linux-kernel; +Cc: netdev, virtualization, Sasha Levin, kvm, Michael S. Tsirkin
Device lock should be held when releasing a device, and specifically
when calling vhost_dev_cleanup(). Otherwise, RCU complains about it:
[ 2025.642835] ===============================
[ 2025.643838] [ INFO: suspicious RCU usage. ]
[ 2025.645182] -------------------------------
[ 2025.645927] drivers/vhost/vhost.c:475 suspicious rcu_dereference_protected() usage!
[ 2025.647329]
[ 2025.647330] other info that might help us debug this:
[ 2025.647331]
[ 2025.649042]
[ 2025.649043] rcu_scheduler_active = 1, debug_locks = 1
[ 2025.650235] no locks held by trinity/21042.
[ 2025.650971]
[ 2025.650972] stack backtrace:
[ 2025.651789] Pid: 21042, comm: trinity Not tainted 3.2.0-rc2-sasha-00057-ga9098b3 #5
[ 2025.653342] Call Trace:
[ 2025.653792] [<ffffffff810b4a6a>] lockdep_rcu_suspicious+0xaf/0xb9
[ 2025.654916] [<ffffffff818d4c2c>] vhost_dev_cleanup+0x342/0x3ac
[ 2025.655985] [<ffffffff818d4f18>] vhost_net_release+0x48/0x7f
[ 2025.657247] [<ffffffff811416e3>] fput+0x11e/0x1dc
[ 2025.658091] [<ffffffff8113f1ec>] filp_close+0x6e/0x79
[ 2025.658964] [<ffffffff81089ed7>] put_files_struct+0xcc/0x196
[ 2025.659971] [<ffffffff8108a034>] exit_files+0x46/0x4f
[ 2025.660865] [<ffffffff8108a716>] do_exit+0x264/0x75c
[ 2025.662201] [<ffffffff8113f490>] ? fsnotify_modify+0x60/0x68
[ 2025.663260] [<ffffffff81bbdbca>] ? sysret_check+0x2e/0x69
[ 2025.664269] [<ffffffff8108acc1>] do_group_exit+0x83/0xb1
[ 2025.665448] [<ffffffff8108ad01>] sys_exit_group+0x12/0x16
[ 2025.666396] [<ffffffff81bbdb92>] system_call_fastpath+0x16/0x1b
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: kvm@vger.kernel.org
Cc: virtualization@lists.linux-foundation.org
Cc: netdev@vger.kernel.org
Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
drivers/vhost/net.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 882a51f..c9be601 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -586,6 +586,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
struct socket *tx_sock;
struct socket *rx_sock;
+ mutex_lock(&n->dev.mutex);
vhost_net_stop(n, &tx_sock, &rx_sock);
vhost_net_flush(n);
vhost_dev_cleanup(&n->dev);
@@ -596,6 +597,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
/* We do an extra flush before freeing memory,
* since jobs can re-queue themselves. */
vhost_net_flush(n);
+ mutex_unlock(&n->dev.mutex);
kfree(n);
return 0;
}
--
1.7.8.rc1
^ permalink raw reply related
* Re: [PATCH] net, bridge: print log message after state changed
From: Holger Brunck @ 2011-11-18 9:20 UTC (permalink / raw)
To: David Lamparter; +Cc: David Miller, netdev, shemminger, bridge
In-Reply-To: <20111117192353.GD2051622@jupiter.n2.diac24.net>
On 11/17/2011 08:23 PM, David Lamparter wrote:
> On Mon, Nov 14, 2011 at 02:07:49AM -0500, David Miller wrote:
>>>> The message is therefore appropriately timed as far as I'm concerned.
>>>>
>>>
>>> It's not.
>>>
>>> Please test: create a bridge with STP disabled, add an interface to the
>>> bridge and set that interface down. You get the message "... entering
>>> forwarding state". That's wrong because it's "about to enter" disabled
>>> state some lines later.
>>>
>>> All other (4) calls to br_log_state are located after the state change,
>>> see for example br_stp_enable_port() just some lines above the patch.
>>
>> I would never expect an "entering" message to print out after the event
>> happens, and in fact I'd _always_ want to see it beforehand so that if
>> the state change caused a crash or similar it'd be that much easier
>> to pinpoint the proper location.
>
> There seems to be a misunderstanding here. The patch effectively does:
> - br_log_state(p);
> p->state = BR_STATE_DISABLED;
> + br_log_state(p);
>
> and the issue it is trying to fix is not the timing but rather the code
> printing the wrong (old, now-left) state.
>
Yes exactly this is the problem. We got confusing state messages in the current
implementation.
> I do agree that the log message should be printed before anything
> happens, so, Holger, can you brew a patch that does that?
>
This can't be changed easily. Because in some situations we don't know which
state we will enter at the entry point of the function.
For example br_make_forwarding does something like
if ..
p->state = STATE_A
else if ...
p->state = STATE_B
....
So my approach would be to change the log message from "port entering state" to
"port entered state" and print it out after the state changed.
Regards
Holger
^ permalink raw reply
* Re: [PATCH net-next] r8169: Add 64bit statistics
From: Francois Romieu @ 2011-11-18 9:18 UTC (permalink / raw)
To: Junchang Wang; +Cc: nic_swsd, eric.dumazet, netdev
In-Reply-To: <CABoNC816GL4vqbHSR8BYrOFFuL_ZSqxS1xqmG02uwpKkwKNJDQ@mail.gmail.com>
Junchang Wang <junchangwang@gmail.com> :
[...]
> I'm not sure which hardware stats registers are accurate.
See rtl8169_gstrings.
> Besides, it seem r8169.c are also designed for other chipsets (8129?).
Afaik the basic statistics are consistent through the 816x and
810{2, 3} family.
> I would like other people who are quite familiar with those chipsets
> to do this job.
>
> Is that OK ? Thanks.
Fair enough.
Eric's "ethtool -S = hardware, ip -s = software" remark makes sense
anyway. So there is no reason to be retentive.
--
Ueimor
^ permalink raw reply
* [PATCH] net: Use kzalloc rather than kmalloc followed by memset with 0
From: Thomas Meyer @ 2011-11-17 22:43 UTC (permalink / raw)
To: netdev, linux-kernel
This considers some simple cases that are common and easy to validate
Note in particular that there are no ...s in the rule, so all of the
matched code has to be contiguous
The semantic patch that makes this change is available
in scripts/coccinelle/api/alloc/kzalloc-simple.cocci.
Signed-off-by: Thomas Meyer <thomas@m3y3r.de>
---
diff -u -p a/drivers/net/ethernet/micrel/ksz884x.c b/drivers/net/ethernet/micrel/ksz884x.c
--- a/drivers/net/ethernet/micrel/ksz884x.c 2011-11-07 19:37:59.173455664 +0100
+++ b/drivers/net/ethernet/micrel/ksz884x.c 2011-11-08 09:32:00.426428212 +0100
@@ -4382,12 +4382,10 @@ static void ksz_update_timer(struct ksz_
*/
static int ksz_alloc_soft_desc(struct ksz_desc_info *desc_info, int transmit)
{
- desc_info->ring = kmalloc(sizeof(struct ksz_desc) * desc_info->alloc,
- GFP_KERNEL);
+ desc_info->ring = kzalloc(sizeof(struct ksz_desc) * desc_info->alloc,
+ GFP_KERNEL);
if (!desc_info->ring)
return 1;
- memset((void *) desc_info->ring, 0,
- sizeof(struct ksz_desc) * desc_info->alloc);
hw_init_desc(desc_info, transmit);
return 0;
}
^ permalink raw reply
* Re: [RFC] The Linux kernel IPv6 stack don't follow the RFC 4942 recommendation
From: Bjørn Mork @ 2011-11-18 10:15 UTC (permalink / raw)
To: Eric Dumazet; +Cc: François-Xavier Le Bail, netdev@vger.kernel.org
In-Reply-To: <1320484852.16908.0.camel@edumazet-laptop>
Eric Dumazet <eric.dumazet@gmail.com> writes:
> Le samedi 05 novembre 2011 à 01:39 -0700, François-Xavier Le Bail a
> écrit :
>
>> Agreed, but remain the case of ICMPv6 echo request/reply, which I think is in kernel.
>
> Yes, please describe your setup and how to reproduce the problem.
I agree with François-Xavier that the current behaviour is confusing. An
anycast address should really not be treated different from any other
global unicast address by the kernel.
1) The router anycast address does not show up in the list of local
addresses:
router:~$ ip -6 addr show dev br0.666
15: br0.666@br0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500
inet6 2001:db8:9:29a::1/64 scope global
valid_lft forever preferred_lft forever
inet6 fe80::215:17ff:fe1e:5e35/64 scope link
valid_lft forever preferred_lft forever
2) pinging the anycast address will produce a reply from another
address:
frtest1:~# ping6 -n -c1 2001:db8:9:29a::
PING 2001:db8:9:29a::(2001:db8:9:29a::) 56 data bytes
64 bytes from 2001:db8:9:29a::1: icmp_seq=1 ttl=64 time=0.275 ms
--- 2001:db8:9:29a:: ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.275/0.275/0.275/0.000 ms
frtest1:~# ping6 -n -c1 2001:db8:9:29a::1
PING 2001:db8:9:29a::1(2001:db8:9:29a::1) 56 data bytes
64 bytes from 2001:db8:9:29a::1: icmp_seq=1 ttl=64 time=0.229 ms
--- 2001:db8:9:29a::1 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.229/0.229/0.229/0.000 ms
3) the previous issue will become even more obviously buggy if we add a
second address to the router, in the same subnet while pinging the
anycast address. Here I'm doing
router:/tmp# ip addr add 2001:db8:9:29a::5/64 dev br0.666
on the router while I'm pinging the anycast address from a another host:
frtest1:~# ping6 -n 2001:db8:9:29a::
PING 2001:db8:9:29a::(2001:db8:9:29a::) 56 data bytes
64 bytes from 2001:db8:9:29a::1: icmp_seq=1 ttl=64 time=330 ms
64 bytes from 2001:db8:9:29a::1: icmp_seq=2 ttl=64 time=0.268 ms
64 bytes from 2001:db8:9:29a::1: icmp_seq=3 ttl=64 time=0.234 ms
64 bytes from 2001:db8:9:29a::5: icmp_seq=4 ttl=64 time=0.232 ms
64 bytes from 2001:db8:9:29a::5: icmp_seq=5 ttl=64 time=0.298 ms
64 bytes from 2001:db8:9:29a::5: icmp_seq=6 ttl=64 time=0.279 ms
Now, THAT doesn't look good, does it?
4) just to add to issue 2. After adding the second address on the
router, we have 3 global unicast addresses in the same subnet. You would
then expect the kernel to either use the destionation address of the
incoming requests as source, or always select the same address as
source.
It does neither. This is inconsistent, unless you treat the router
anycast address as something special. Which you should not:
frtest1:~# ping-n -c1 2001:db8:9:29a::
PING 2001:db8:9:29a::(2001:db8:9:29a::) 56 data bytes
64 bytes from 2001:db8:9:29a::5: icmp_seq=1 ttl=64 time=337 ms
--- 2001:db8:9:29a:: ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 337.251/337.251/337.251/0.000 ms
frtest1:~# ping6 -n -c1 2001:db8:9:29a::1
PING 2001:db8:9:29a::1(2001:db8:9:29a::1) 56 data bytes
64 bytes from 2001:db8:9:29a::1: icmp_seq=1 ttl=64 time=0.163 ms
--- 2001:db8:9:29a::1 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.163/0.163/0.163/0.000 ms
frtest1:~# ping6 -n -c1 2001:db8:9:29a::5
PING 2001:db8:9:29a::5(2001:db8:9:29a::5) 56 data bytes
64 bytes from 2001:db8:9:29a::5: icmp_seq=1 ttl=64 time=3.87 ms
--- 2001:db8:9:29a::5 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 3.873/3.873/3.873/0.000 ms
Bjørn
^ permalink raw reply
* Re: [PATCH] Don't allow sharing of tx skbs on xen-netfront
From: Ian Campbell @ 2011-11-18 10:30 UTC (permalink / raw)
To: Neil Horman
Cc: netdev@vger.kernel.org, David S. Miller, Jeremy Fitzhardinge,
Konrad Rzeszutek Wilk, xen-devel@lists.xensource.com
In-Reply-To: <20111117204553.GD26847@hmsreliant.think-freely.org>
On Thu, 2011-11-17 at 20:45 +0000, Neil Horman wrote:
> On Thu, Nov 17, 2011 at 08:17:01PM +0000, Ian Campbell wrote:
> > On Thu, 2011-11-17 at 19:25 +0000, Neil Horman wrote:
> > > On Thu, Nov 17, 2011 at 03:20:38PM +0000, Ian Campbell wrote:
> > > > On Mon, 2011-11-14 at 14:22 -0500, Neil Horman wrote:
> > > > > It was pointed out to me recently that the xen-netfront driver can't safely
> > > > > support shared skbs on transmit, since, while it doesn't maintain skb state
> > > > > directly, it does pass a pointer to the skb to the hypervisor via a list, and
> > > > > the hypervisor may expect the contents of the skb to remain stable. Clearing
> > > > > the IFF_TX_SKB_SHARING flag after the call to alloc_etherdev to make it safe.
> > > >
> > > > What are the actual constraints here? The skb is used as a handle to the
> > > > skb->data and shinfo (frags) and to complete at the end. It's actually
> > > > those which are passed to the hypervisor (effectively the same as
> > > > passing those addresses to the h/w for DMA).
> > > >
> > > > Which parts of the skb are expected/allowed to not remain stable?
> > > >
> > > > (Appologies if the above seems naive, I seem to have missed the
> > > > introduction of shared tx skbs and IFF_TX_SKB_SHARING)
> > > >
> > > Its ok, this is the most accurate description from the previous threads on the
> > > subject:
> > > 2
> > >
> > > The basic problem boils down the notion that some drivers, when they receive an
> > > skb in their xmit paths, presume to have sole ownership of the skb, and as a
> > > result may do things like add the skb to a list, or otherwise store stateful
> > > data in the skb. If the skb is shared, thats unsafe to do, as the stack still
> > > holds a reference to the skb, and make make changes without serializing them
> > > against the driver. So we have to flag those drivers which preform these kinds
> > > of actions. xen-netfront doesn't strictly speaking modify any state directly ni
> > > an skb, but it does place a pointer to the skb in a data structure here:
> > >
> > > np->tx_skbs[id].skb = skb;
> > >
> > > Which then gets handed off to the hypervisior. Since the hypervisor now has
> > > access to that skb pointer, and we can't be sure (from the guest perspective),
> > > what it does with that information, it would be better to be safe by disallowing
> > > shared skbs in this path.
> >
> > The skb pointer itself doesn't get given to the backend/hypervisor. The
> > page which skb->data refers to is granted to the backend domain, as are
> > the pages in the frags.
> >
> > I think we only need to be sure that the frontend doesn't rely on
> > anything in the skb itself, right? Does skb->data or shinfo count from
> > that perspective?
> shinfo is definately a problem, as other devices may make modifications to it.
> skb->data is probably safer, but is also potentially suspect (for instance if
> another device appends an additional header to the data for instance)
A device is allowed to rely on these things being stable while in its
start_xmit, right? (otherwise I don't see how any device can ever
cope...).
netfront only uses shinfo and ->data during start_xmit in order to
create the necessary grant reference (which can be thought of as a DMA
address passed to the virtual hardware). The only use of the stashed skb
pointer outside of this are to dev_kfree_skb on tx completion (from
either tx_buf_gc (normal completion) or release_tx_buf ("hardware"
reset).
Ian.
^ permalink raw reply
* [PATCH v8] Phonet: set the pipe handle using setsockopt
From: Hemant Vilas RAMDASI @ 2011-11-18 11:22 UTC (permalink / raw)
To: netdev-owner, davem
Cc: netdev, remi.denis-courmont, Dinesh Kumar Sharma, Hemant Ramdasi
From: Dinesh Kumar Sharma <dinesh.sharma@stericsson.com>
This provides flexibility to set the pipe handle
using setsockopt. The pipe can be enabled (if disabled) later
using ioctl.
Signed-off-by: Hemant Ramdasi <hemant.ramdasi@stericsson.com>
Signed-off-by: Dinesh Kumar Sharma <dinesh.sharma@stericsson.com>
---
include/linux/phonet.h | 2 +
net/phonet/pep.c | 106 +++++++++++++++++++++++++++++++++++++++++++-----
2 files changed, 97 insertions(+), 11 deletions(-)
diff --git a/include/linux/phonet.h b/include/linux/phonet.h
index 6fb1384..1847ef9 100644
--- a/include/linux/phonet.h
+++ b/include/linux/phonet.h
@@ -37,6 +37,7 @@
#define PNPIPE_ENCAP 1
#define PNPIPE_IFINDEX 2
#define PNPIPE_HANDLE 3
+#define PNPIPE_INITSTATE 4
#define PNADDR_ANY 0
#define PNADDR_BROADCAST 0xFC
@@ -48,6 +49,7 @@
/* ioctls */
#define SIOCPNGETOBJECT (SIOCPROTOPRIVATE + 0)
+#define SIOCPNENABLEPIPE (SIOCPROTOPRIVATE + 13)
#define SIOCPNADDRESOURCE (SIOCPROTOPRIVATE + 14)
#define SIOCPNDELRESOURCE (SIOCPROTOPRIVATE + 15)
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index f17fd84..1ad2892 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -533,6 +533,29 @@ static int pep_connresp_rcv(struct sock *sk, struct sk_buff *skb)
return pipe_handler_send_created_ind(sk);
}
+static int pep_enableresp_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ struct pnpipehdr *hdr = pnp_hdr(skb);
+
+ if (hdr->error_code != PN_PIPE_NO_ERROR)
+ return -ECONNREFUSED;
+
+ return pep_indicate(sk, PNS_PIPE_ENABLED_IND, 0 /* sub-blocks */,
+ NULL, 0, GFP_ATOMIC);
+
+}
+
+static void pipe_start_flow_control(struct sock *sk)
+{
+ struct pep_sock *pn = pep_sk(sk);
+
+ if (!pn_flow_safe(pn->tx_fc)) {
+ atomic_set(&pn->tx_credits, 1);
+ sk->sk_write_space(sk);
+ }
+ pipe_grant_credits(sk, GFP_ATOMIC);
+}
+
/* Queue an skb to an actively connected sock.
* Socket lock must be held. */
static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb)
@@ -578,13 +601,25 @@ static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb)
sk->sk_state = TCP_CLOSE_WAIT;
break;
}
+ if (pn->init_enable == PN_PIPE_DISABLE)
+ sk->sk_state = TCP_SYN_RECV;
+ else {
+ sk->sk_state = TCP_ESTABLISHED;
+ pipe_start_flow_control(sk);
+ }
+ break;
- sk->sk_state = TCP_ESTABLISHED;
- if (!pn_flow_safe(pn->tx_fc)) {
- atomic_set(&pn->tx_credits, 1);
- sk->sk_write_space(sk);
+ case PNS_PEP_ENABLE_RESP:
+ if (sk->sk_state != TCP_SYN_SENT)
+ break;
+
+ if (pep_enableresp_rcv(sk, skb)) {
+ sk->sk_state = TCP_CLOSE_WAIT;
+ break;
}
- pipe_grant_credits(sk, GFP_ATOMIC);
+
+ sk->sk_state = TCP_ESTABLISHED;
+ pipe_start_flow_control(sk);
break;
case PNS_PEP_DISCONNECT_RESP:
@@ -863,14 +898,32 @@ static int pep_sock_connect(struct sock *sk, struct sockaddr *addr, int len)
int err;
u8 data[4] = { 0 /* sub-blocks */, PAD, PAD, PAD };
- pn->pipe_handle = 1; /* anything but INVALID_HANDLE */
+ if (pn->pipe_handle == PN_PIPE_INVALID_HANDLE)
+ pn->pipe_handle = 1; /* anything but INVALID_HANDLE */
+
err = pipe_handler_request(sk, PNS_PEP_CONNECT_REQ,
- PN_PIPE_ENABLE, data, 4);
+ pn->init_enable, data, 4);
if (err) {
pn->pipe_handle = PN_PIPE_INVALID_HANDLE;
return err;
}
+
sk->sk_state = TCP_SYN_SENT;
+
+ return 0;
+}
+
+static int pep_sock_enable(struct sock *sk, struct sockaddr *addr, int len)
+{
+ int err;
+
+ err = pipe_handler_request(sk, PNS_PEP_ENABLE_REQ, PAD,
+ NULL, 0);
+ if (err)
+ return err;
+
+ sk->sk_state = TCP_SYN_SENT;
+
return 0;
}
@@ -878,11 +931,14 @@ static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
struct pep_sock *pn = pep_sk(sk);
int answ;
+ int ret = -ENOIOCTLCMD;
switch (cmd) {
case SIOCINQ:
- if (sk->sk_state == TCP_LISTEN)
- return -EINVAL;
+ if (sk->sk_state == TCP_LISTEN) {
+ ret = -EINVAL;
+ break;
+ }
lock_sock(sk);
if (sock_flag(sk, SOCK_URGINLINE) &&
@@ -893,10 +949,22 @@ static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg)
else
answ = 0;
release_sock(sk);
- return put_user(answ, (int __user *)arg);
+ ret = put_user(answ, (int __user *)arg);
+ break;
+
+ case SIOCPNENABLEPIPE:
+ lock_sock(sk);
+ if (sk->sk_state == TCP_SYN_SENT)
+ ret = -EBUSY;
+ else if (sk->sk_state == TCP_ESTABLISHED)
+ ret = -EISCONN;
+ else
+ ret = pep_sock_enable(sk, NULL, 0);
+ release_sock(sk);
+ break;
}
- return -ENOIOCTLCMD;
+ return ret;
}
static int pep_init(struct sock *sk)
@@ -959,6 +1027,18 @@ static int pep_setsockopt(struct sock *sk, int level, int optname,
}
goto out_norel;
+ case PNPIPE_HANDLE:
+ if ((sk->sk_state == TCP_CLOSE) &&
+ (val >= 0) && (val < PN_PIPE_INVALID_HANDLE))
+ pn->pipe_handle = val;
+ else
+ err = -EINVAL;
+ break;
+
+ case PNPIPE_INITSTATE:
+ pn->init_enable = !!val;
+ break;
+
default:
err = -ENOPROTOOPT;
}
@@ -994,6 +1074,10 @@ static int pep_getsockopt(struct sock *sk, int level, int optname,
return -EINVAL;
break;
+ case PNPIPE_INITSTATE:
+ val = pn->init_enable;
+ break;
+
default:
return -ENOPROTOOPT;
}
--
1.7.4.3
^ permalink raw reply related
* Re: [PATCH v8] Phonet: set the pipe handle using setsockopt
From: Rémi Denis-Courmont @ 2011-11-18 11:41 UTC (permalink / raw)
To: netdev
In-Reply-To: <1321615325-2025-1-git-send-email-hemant.ramdasi@stericsson.com>
Le Vendredi 18 Novembre 2011 16:52:05 ext Hemant Vilas RAMDASI a écrit :
> From: Dinesh Kumar Sharma <dinesh.sharma@stericsson.com>
>
> This provides flexibility to set the pipe handle
> using setsockopt. The pipe can be enabled (if disabled) later
> using ioctl.
>
> Signed-off-by: Hemant Ramdasi <hemant.ramdasi@stericsson.com>
> Signed-off-by: Dinesh Kumar Sharma <dinesh.sharma@stericsson.com>
Acked-by: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>
--
Rémi Denis-Courmont
http://www.remlab.net/
^ permalink raw reply
* Re: [PATCH] Don't allow sharing of tx skbs on xen-netfront
From: Neil Horman @ 2011-11-18 11:48 UTC (permalink / raw)
To: Ian Campbell
Cc: netdev@vger.kernel.org, David S. Miller, Jeremy Fitzhardinge,
Konrad Rzeszutek Wilk, xen-devel@lists.xensource.com
In-Reply-To: <1321612213.3664.293.camel@zakaz.uk.xensource.com>
On Fri, Nov 18, 2011 at 10:30:13AM +0000, Ian Campbell wrote:
> On Thu, 2011-11-17 at 20:45 +0000, Neil Horman wrote:
> > On Thu, Nov 17, 2011 at 08:17:01PM +0000, Ian Campbell wrote:
> > > On Thu, 2011-11-17 at 19:25 +0000, Neil Horman wrote:
> > > > On Thu, Nov 17, 2011 at 03:20:38PM +0000, Ian Campbell wrote:
> > > > > On Mon, 2011-11-14 at 14:22 -0500, Neil Horman wrote:
> > > > > > It was pointed out to me recently that the xen-netfront driver can't safely
> > > > > > support shared skbs on transmit, since, while it doesn't maintain skb state
> > > > > > directly, it does pass a pointer to the skb to the hypervisor via a list, and
> > > > > > the hypervisor may expect the contents of the skb to remain stable. Clearing
> > > > > > the IFF_TX_SKB_SHARING flag after the call to alloc_etherdev to make it safe.
> > > > >
> > > > > What are the actual constraints here? The skb is used as a handle to the
> > > > > skb->data and shinfo (frags) and to complete at the end. It's actually
> > > > > those which are passed to the hypervisor (effectively the same as
> > > > > passing those addresses to the h/w for DMA).
> > > > >
> > > > > Which parts of the skb are expected/allowed to not remain stable?
> > > > >
> > > > > (Appologies if the above seems naive, I seem to have missed the
> > > > > introduction of shared tx skbs and IFF_TX_SKB_SHARING)
> > > > >
> > > > Its ok, this is the most accurate description from the previous threads on the
> > > > subject:
> > > > 2
> > > >
> > > > The basic problem boils down the notion that some drivers, when they receive an
> > > > skb in their xmit paths, presume to have sole ownership of the skb, and as a
> > > > result may do things like add the skb to a list, or otherwise store stateful
> > > > data in the skb. If the skb is shared, thats unsafe to do, as the stack still
> > > > holds a reference to the skb, and make make changes without serializing them
> > > > against the driver. So we have to flag those drivers which preform these kinds
> > > > of actions. xen-netfront doesn't strictly speaking modify any state directly ni
> > > > an skb, but it does place a pointer to the skb in a data structure here:
> > > >
> > > > np->tx_skbs[id].skb = skb;
> > > >
> > > > Which then gets handed off to the hypervisior. Since the hypervisor now has
> > > > access to that skb pointer, and we can't be sure (from the guest perspective),
> > > > what it does with that information, it would be better to be safe by disallowing
> > > > shared skbs in this path.
> > >
> > > The skb pointer itself doesn't get given to the backend/hypervisor. The
> > > page which skb->data refers to is granted to the backend domain, as are
> > > the pages in the frags.
> > >
> > > I think we only need to be sure that the frontend doesn't rely on
> > > anything in the skb itself, right? Does skb->data or shinfo count from
> > > that perspective?
> > shinfo is definately a problem, as other devices may make modifications to it.
> > skb->data is probably safer, but is also potentially suspect (for instance if
> > another device appends an additional header to the data for instance)
>
> A device is allowed to rely on these things being stable while in its
> start_xmit, right? (otherwise I don't see how any device can ever
> cope...).
>
While the start_xmit routine is executing, yes. Its only after the driver
returns, that it can have no expectation of an skb's data to remain stable.
> netfront only uses shinfo and ->data during start_xmit in order to
> create the necessary grant reference (which can be thought of as a DMA
> address passed to the virtual hardware). The only use of the stashed skb
> pointer outside of this are to dev_kfree_skb on tx completion (from
> either tx_buf_gc (normal completion) or release_tx_buf ("hardware"
> reset).
>
Ok, if you're certain you can guarantee that the hypervisior makes no inspection
of the skb after the return from the driver, then you're safe
Neil
> Ian.
>
>
^ permalink raw reply
* Re: [PATCH 1/2] net: add network priority cgroup infrastructure (v2)
From: Neil Horman @ 2011-11-18 11:50 UTC (permalink / raw)
To: John Fastabend; +Cc: netdev@vger.kernel.org, Love, Robert W, David S. Miller
In-Reply-To: <4EC5CE48.8020603@intel.com>
On Thu, Nov 17, 2011 at 07:17:28PM -0800, John Fastabend wrote:
> On 11/17/2011 1:47 PM, Neil Horman wrote:
> > This patch adds in the infrastructure code to create the network priority
> > cgroup. The cgroup, in addition to the standard processes file creates two
> > control files:
> >
> > 1) prioidx - This is a read-only file that exports the index of this cgroup.
> > This is a value that is both arbitrary and unique to a cgroup in this subsystem,
> > and is used to index the per-device priority map
> >
> > 2) priomap - This is a writeable file. On read it reports a table of 2-tuples
> > <name:priority> where name is the name of a network interface and priority is
> > indicates the priority assigned to frames egresessing on the named interface and
> > originating from a pid in this cgroup
> >
> > This cgroup allows for skb priority to be set prior to a root qdisc getting
> > selected. This is benenficial for DCB enabled systems, in that it allows for any
> > application to use dcb configured priorities so without application modification
> >
> > Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> > Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> > CC: Robert Love <robert.w.love@intel.com>
> > CC: "David S. Miller" <davem@davemloft.net>
> > ---
>
> one more nit... can we convert the rcu_dereference() into rtnl_dereference()
> where it is relevant?
>
Yeah, I'm sorry, I thought you had typo'ed that, I didn't realize we actualy had
a rtnl specific variant of the rcu_dereference macro. I'll take care of it
today.
Neil
^ permalink raw reply
* Re: [PATCH] Don't allow sharing of tx skbs on xen-netfront
From: Ian Campbell @ 2011-11-18 11:53 UTC (permalink / raw)
To: Neil Horman
Cc: netdev@vger.kernel.org, David S. Miller, Jeremy Fitzhardinge,
Konrad Rzeszutek Wilk, xen-devel@lists.xensource.com
In-Reply-To: <20111118114839.GA7907@hmsreliant.think-freely.org>
On Fri, 2011-11-18 at 11:48 +0000, Neil Horman wrote:
> On Fri, Nov 18, 2011 at 10:30:13AM +0000, Ian Campbell wrote:
> > On Thu, 2011-11-17 at 20:45 +0000, Neil Horman wrote:
> > > On Thu, Nov 17, 2011 at 08:17:01PM +0000, Ian Campbell wrote:
> > > > On Thu, 2011-11-17 at 19:25 +0000, Neil Horman wrote:
> > > > > On Thu, Nov 17, 2011 at 03:20:38PM +0000, Ian Campbell wrote:
> > > > > > On Mon, 2011-11-14 at 14:22 -0500, Neil Horman wrote:
> > > > > > > It was pointed out to me recently that the xen-netfront driver can't safely
> > > > > > > support shared skbs on transmit, since, while it doesn't maintain skb state
> > > > > > > directly, it does pass a pointer to the skb to the hypervisor via a list, and
> > > > > > > the hypervisor may expect the contents of the skb to remain stable. Clearing
> > > > > > > the IFF_TX_SKB_SHARING flag after the call to alloc_etherdev to make it safe.
> > > > > >
> > > > > > What are the actual constraints here? The skb is used as a handle to the
> > > > > > skb->data and shinfo (frags) and to complete at the end. It's actually
> > > > > > those which are passed to the hypervisor (effectively the same as
> > > > > > passing those addresses to the h/w for DMA).
> > > > > >
> > > > > > Which parts of the skb are expected/allowed to not remain stable?
> > > > > >
> > > > > > (Appologies if the above seems naive, I seem to have missed the
> > > > > > introduction of shared tx skbs and IFF_TX_SKB_SHARING)
> > > > > >
> > > > > Its ok, this is the most accurate description from the previous threads on the
> > > > > subject:
> > > > > 2
> > > > >
> > > > > The basic problem boils down the notion that some drivers, when they receive an
> > > > > skb in their xmit paths, presume to have sole ownership of the skb, and as a
> > > > > result may do things like add the skb to a list, or otherwise store stateful
> > > > > data in the skb. If the skb is shared, thats unsafe to do, as the stack still
> > > > > holds a reference to the skb, and make make changes without serializing them
> > > > > against the driver. So we have to flag those drivers which preform these kinds
> > > > > of actions. xen-netfront doesn't strictly speaking modify any state directly ni
> > > > > an skb, but it does place a pointer to the skb in a data structure here:
> > > > >
> > > > > np->tx_skbs[id].skb = skb;
> > > > >
> > > > > Which then gets handed off to the hypervisior. Since the hypervisor now has
> > > > > access to that skb pointer, and we can't be sure (from the guest perspective),
> > > > > what it does with that information, it would be better to be safe by disallowing
> > > > > shared skbs in this path.
> > > >
> > > > The skb pointer itself doesn't get given to the backend/hypervisor. The
> > > > page which skb->data refers to is granted to the backend domain, as are
> > > > the pages in the frags.
> > > >
> > > > I think we only need to be sure that the frontend doesn't rely on
> > > > anything in the skb itself, right? Does skb->data or shinfo count from
> > > > that perspective?
> > > shinfo is definately a problem, as other devices may make modifications to it.
> > > skb->data is probably safer, but is also potentially suspect (for instance if
> > > another device appends an additional header to the data for instance)
> >
> > A device is allowed to rely on these things being stable while in its
> > start_xmit, right? (otherwise I don't see how any device can ever
> > cope...).
> >
> While the start_xmit routine is executing, yes. Its only after the driver
> returns, that it can have no expectation of an skb's data to remain stable.
>
> > netfront only uses shinfo and ->data during start_xmit in order to
> > create the necessary grant reference (which can be thought of as a DMA
> > address passed to the virtual hardware). The only use of the stashed skb
> > pointer outside of this are to dev_kfree_skb on tx completion (from
> > either tx_buf_gc (normal completion) or release_tx_buf ("hardware"
> > reset).
> >
> Ok, if you're certain you can guarantee that the hypervisior makes no inspection
> of the skb after the return from the driver, then you're safe
I believe this is the case, all that is exposed to the backend is the
pfn, offset and length of the skb->data and frags at the time start_xmit
was called.
> Neil
>
> > Ian.
> >
> >
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox