* [PATCH net-next 3/6] net: mvpp2: 1000baseX support
From: Antoine Tenart @ 2017-12-27 22:14 UTC (permalink / raw)
To: davem, kishon, andrew, jason, sebastian.hesselbarth,
gregory.clement, linux
Cc: Antoine Tenart, mw, stefanc, ymarkman, thomas.petazzoni,
miquel.raynal, nadavh, netdev, linux-arm-kernel, linux-kernel
In-Reply-To: <20171227221446.18459-1-antoine.tenart@free-electrons.com>
This patch adds the 1000Base-X PHY mode support in the Marvell PPv2
driver. 1000Base-X is quite close to SGMII and uses nearly the same
code path.
Signed-off-by: Antoine Tenart <antoine.tenart@free-electrons.com>
---
drivers/net/ethernet/marvell/mvpp2.c | 45 ++++++++++++++++++++++++++++--------
1 file changed, 35 insertions(+), 10 deletions(-)
diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
index a19760736b71..094db9dd633f 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -4501,6 +4501,7 @@ static int mvpp22_gop_init(struct mvpp2_port *port)
mvpp22_gop_init_rgmii(port);
break;
case PHY_INTERFACE_MODE_SGMII:
+ case PHY_INTERFACE_MODE_1000BASEX:
mvpp22_gop_init_sgmii(port);
break;
case PHY_INTERFACE_MODE_10GKR:
@@ -4538,7 +4539,8 @@ static void mvpp22_gop_unmask_irq(struct mvpp2_port *port)
u32 val;
if (phy_interface_mode_is_rgmii(port->phy_interface) ||
- port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
+ port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
+ port->phy_interface == PHY_INTERFACE_MODE_1000BASEX) {
/* Enable the GMAC link status irq for this port */
val = readl(port->base + MVPP22_GMAC_INT_SUM_MASK);
val |= MVPP22_GMAC_INT_SUM_MASK_LINK_STAT;
@@ -4568,7 +4570,8 @@ static void mvpp22_gop_mask_irq(struct mvpp2_port *port)
}
if (phy_interface_mode_is_rgmii(port->phy_interface) ||
- port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
+ port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
+ port->phy_interface == PHY_INTERFACE_MODE_1000BASEX) {
val = readl(port->base + MVPP22_GMAC_INT_SUM_MASK);
val &= ~MVPP22_GMAC_INT_SUM_MASK_LINK_STAT;
writel(val, port->base + MVPP22_GMAC_INT_SUM_MASK);
@@ -4580,7 +4583,8 @@ static void mvpp22_gop_setup_irq(struct mvpp2_port *port)
u32 val;
if (phy_interface_mode_is_rgmii(port->phy_interface) ||
- port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
+ port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
+ port->phy_interface == PHY_INTERFACE_MODE_1000BASEX) {
val = readl(port->base + MVPP22_GMAC_INT_MASK);
val |= MVPP22_GMAC_INT_MASK_LINK_STAT;
writel(val, port->base + MVPP22_GMAC_INT_MASK);
@@ -4605,6 +4609,7 @@ static int mvpp22_comphy_init(struct mvpp2_port *port)
switch (port->phy_interface) {
case PHY_INTERFACE_MODE_SGMII:
+ case PHY_INTERFACE_MODE_1000BASEX:
mode = PHY_MODE_SGMII;
break;
case PHY_INTERFACE_MODE_10GKR:
@@ -4625,7 +4630,8 @@ static void mvpp2_port_mii_gmac_configure_mode(struct mvpp2_port *port)
{
u32 val;
- if (port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
+ if (port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
+ port->phy_interface == PHY_INTERFACE_MODE_1000BASEX) {
val = readl(port->base + MVPP22_GMAC_CTRL_4_REG);
val |= MVPP22_CTRL4_SYNC_BYPASS_DIS | MVPP22_CTRL4_DP_CLK_SEL |
MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE;
@@ -4640,9 +4646,11 @@ static void mvpp2_port_mii_gmac_configure_mode(struct mvpp2_port *port)
writel(val, port->base + MVPP22_GMAC_CTRL_4_REG);
}
- /* The port is connected to a copper PHY */
val = readl(port->base + MVPP2_GMAC_CTRL_0_REG);
- val &= ~MVPP2_GMAC_PORT_TYPE_MASK;
+ if (port->phy_interface == PHY_INTERFACE_MODE_1000BASEX)
+ val |= MVPP2_GMAC_PORT_TYPE_MASK;
+ else
+ val &= ~MVPP2_GMAC_PORT_TYPE_MASK;
writel(val, port->base + MVPP2_GMAC_CTRL_0_REG);
val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
@@ -4651,6 +4659,19 @@ static void mvpp2_port_mii_gmac_configure_mode(struct mvpp2_port *port)
MVPP2_GMAC_AN_DUPLEX_EN;
if (port->phy_interface == PHY_INTERFACE_MODE_SGMII)
val |= MVPP2_GMAC_IN_BAND_AUTONEG;
+
+ if (port->phy_interface == PHY_INTERFACE_MODE_1000BASEX)
+ /* 1000BaseX port cannot negotiate speed nor can it
+ * negotiate duplex: they are always operating with a
+ * fixed speed of 1000Mbps in full duplex, so force
+ * 1000 speed and full duplex here.
+ */
+ val |= MVPP2_GMAC_CONFIG_GMII_SPEED |
+ MVPP2_GMAC_CONFIG_FULL_DUPLEX;
+ else
+ val |= MVPP2_GMAC_AN_SPEED_EN |
+ MVPP2_GMAC_AN_DUPLEX_EN;
+
writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
}
@@ -4671,7 +4692,8 @@ static void mvpp2_port_mii_gmac_configure(struct mvpp2_port *port)
/* Configure the PCS and in-band AN */
val = readl(port->base + MVPP2_GMAC_CTRL_2_REG);
- if (port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
+ if (port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
+ port->phy_interface == PHY_INTERFACE_MODE_1000BASEX) {
val |= MVPP2_GMAC_INBAND_AN_MASK | MVPP2_GMAC_PCS_ENABLE_MASK;
} else if (phy_interface_mode_is_rgmii(port->phy_interface)) {
val &= ~MVPP2_GMAC_PCS_ENABLE_MASK;
@@ -4733,7 +4755,8 @@ static void mvpp2_port_mii_set(struct mvpp2_port *port)
mvpp22_port_mii_set(port);
if (phy_interface_mode_is_rgmii(port->phy_interface) ||
- port->phy_interface == PHY_INTERFACE_MODE_SGMII)
+ port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
+ port->phy_interface == PHY_INTERFACE_MODE_1000BASEX)
mvpp2_port_mii_gmac_configure(port);
else if (port->phy_interface == PHY_INTERFACE_MODE_10GKR)
mvpp2_port_mii_xlg_configure(port);
@@ -4810,7 +4833,8 @@ static void mvpp2_port_loopback_set(struct mvpp2_port *port)
else
val &= ~MVPP2_GMAC_GMII_LB_EN_MASK;
- if (port->phy_interface == PHY_INTERFACE_MODE_SGMII)
+ if (port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
+ port->phy_interface == PHY_INTERFACE_MODE_1000BASEX)
val |= MVPP2_GMAC_PCS_LB_EN_MASK;
else
val &= ~MVPP2_GMAC_PCS_LB_EN_MASK;
@@ -6023,7 +6047,8 @@ static irqreturn_t mvpp2_link_status_isr(int irq, void *dev_id)
link = true;
}
} else if (phy_interface_mode_is_rgmii(port->phy_interface) ||
- port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
+ port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
+ port->phy_interface == PHY_INTERFACE_MODE_1000BASEX) {
val = readl(port->base + MVPP22_GMAC_INT_STAT);
if (val & MVPP22_GMAC_INT_STAT_LINK) {
event = true;
--
2.14.3
^ permalink raw reply related
* [PATCH net-next 2/6] phy: cp110-comphy: 2.5G SGMII mode
From: Antoine Tenart @ 2017-12-27 22:14 UTC (permalink / raw)
To: davem, kishon, andrew, jason, sebastian.hesselbarth,
gregory.clement, linux
Cc: Antoine Tenart, mw, stefanc, ymarkman, thomas.petazzoni,
miquel.raynal, nadavh, netdev, linux-arm-kernel, linux-kernel
In-Reply-To: <20171227221446.18459-1-antoine.tenart@free-electrons.com>
This patch allows the CP110 comphy to configure some lanes in the
2.5G SGMII mode. This mode is quite close to SGMII and uses nearly the
same code path.
Signed-off-by: Antoine Tenart <antoine.tenart@free-electrons.com>
---
drivers/phy/marvell/phy-mvebu-cp110-comphy.c | 17 ++++++++++++++---
1 file changed, 14 insertions(+), 3 deletions(-)
diff --git a/drivers/phy/marvell/phy-mvebu-cp110-comphy.c b/drivers/phy/marvell/phy-mvebu-cp110-comphy.c
index a0d522154cdf..946a6ed7b66f 100644
--- a/drivers/phy/marvell/phy-mvebu-cp110-comphy.c
+++ b/drivers/phy/marvell/phy-mvebu-cp110-comphy.c
@@ -135,19 +135,25 @@ struct mvebu_comhy_conf {
static const struct mvebu_comhy_conf mvebu_comphy_cp110_modes[] = {
/* lane 0 */
MVEBU_COMPHY_CONF(0, 1, PHY_MODE_SGMII, 0x1),
+ MVEBU_COMPHY_CONF(0, 1, PHY_MODE_SGMII_2_5G, 0x1),
/* lane 1 */
MVEBU_COMPHY_CONF(1, 2, PHY_MODE_SGMII, 0x1),
+ MVEBU_COMPHY_CONF(1, 2, PHY_MODE_SGMII_2_5G, 0x1),
/* lane 2 */
MVEBU_COMPHY_CONF(2, 0, PHY_MODE_SGMII, 0x1),
+ MVEBU_COMPHY_CONF(2, 0, PHY_MODE_SGMII_2_5G, 0x1),
MVEBU_COMPHY_CONF(2, 0, PHY_MODE_10GKR, 0x1),
/* lane 3 */
MVEBU_COMPHY_CONF(3, 1, PHY_MODE_SGMII, 0x2),
+ MVEBU_COMPHY_CONF(3, 1, PHY_MODE_SGMII_2_5G, 0x2),
/* lane 4 */
MVEBU_COMPHY_CONF(4, 0, PHY_MODE_SGMII, 0x2),
+ MVEBU_COMPHY_CONF(4, 0, PHY_MODE_SGMII_2_5G, 0x2),
MVEBU_COMPHY_CONF(4, 0, PHY_MODE_10GKR, 0x2),
MVEBU_COMPHY_CONF(4, 1, PHY_MODE_SGMII, 0x1),
/* lane 5 */
MVEBU_COMPHY_CONF(5, 2, PHY_MODE_SGMII, 0x1),
+ MVEBU_COMPHY_CONF(5, 2, PHY_MODE_SGMII_2_5G, 0x1),
};
struct mvebu_comphy_priv {
@@ -206,6 +212,10 @@ static void mvebu_comphy_ethernet_init_reset(struct mvebu_comphy_lane *lane,
if (mode == PHY_MODE_10GKR)
val |= MVEBU_COMPHY_SERDES_CFG0_GEN_RX(0xe) |
MVEBU_COMPHY_SERDES_CFG0_GEN_TX(0xe);
+ else if (mode == PHY_MODE_SGMII_2_5G)
+ val |= MVEBU_COMPHY_SERDES_CFG0_GEN_RX(0x8) |
+ MVEBU_COMPHY_SERDES_CFG0_GEN_TX(0x8) |
+ MVEBU_COMPHY_SERDES_CFG0_HALF_BUS;
else if (mode == PHY_MODE_SGMII)
val |= MVEBU_COMPHY_SERDES_CFG0_GEN_RX(0x6) |
MVEBU_COMPHY_SERDES_CFG0_GEN_TX(0x6) |
@@ -296,13 +306,13 @@ static int mvebu_comphy_init_plls(struct mvebu_comphy_lane *lane,
return 0;
}
-static int mvebu_comphy_set_mode_sgmii(struct phy *phy)
+static int mvebu_comphy_set_mode_sgmii(struct phy *phy, enum phy_mode mode)
{
struct mvebu_comphy_lane *lane = phy_get_drvdata(phy);
struct mvebu_comphy_priv *priv = lane->priv;
u32 val;
- mvebu_comphy_ethernet_init_reset(lane, PHY_MODE_SGMII);
+ mvebu_comphy_ethernet_init_reset(lane, mode);
val = readl(priv->base + MVEBU_COMPHY_RX_CTRL1(lane->id));
val &= ~MVEBU_COMPHY_RX_CTRL1_CLK8T_EN;
@@ -487,7 +497,8 @@ static int mvebu_comphy_power_on(struct phy *phy)
switch (lane->mode) {
case PHY_MODE_SGMII:
- ret = mvebu_comphy_set_mode_sgmii(phy);
+ case PHY_MODE_SGMII_2_5G:
+ ret = mvebu_comphy_set_mode_sgmii(phy, lane->mode);
break;
case PHY_MODE_10GKR:
ret = mvebu_comphy_set_mode_10gkr(phy);
--
2.14.3
^ permalink raw reply related
* [PATCH net-next 1/6] phy: add 2.5G SGMII mode to the phy_mode enum
From: Antoine Tenart @ 2017-12-27 22:14 UTC (permalink / raw)
To: davem, kishon, andrew, jason, sebastian.hesselbarth,
gregory.clement, linux
Cc: Antoine Tenart, mw, stefanc, ymarkman, thomas.petazzoni,
miquel.raynal, nadavh, netdev, linux-arm-kernel, linux-kernel
In-Reply-To: <20171227221446.18459-1-antoine.tenart@free-electrons.com>
This patch adds one more generic PHY mode to the phy_mode enum, to allow
configuring generic PHYs to the 2.5G SGMII mode by using the set_mode
callback.
Signed-off-by: Antoine Tenart <antoine.tenart@free-electrons.com>
---
include/linux/phy/phy.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h
index 4f8423a948d5..70459a28f3a1 100644
--- a/include/linux/phy/phy.h
+++ b/include/linux/phy/phy.h
@@ -28,6 +28,7 @@ enum phy_mode {
PHY_MODE_USB_DEVICE,
PHY_MODE_USB_OTG,
PHY_MODE_SGMII,
+ PHY_MODE_SGMII_2_5G,
PHY_MODE_10GKR,
PHY_MODE_UFS_HS_A,
PHY_MODE_UFS_HS_B,
--
2.14.3
^ permalink raw reply related
* [PATCH net-next 0/6] net: mvpp2: 1000BaseX and 2500BaseX support
From: Antoine Tenart @ 2017-12-27 22:14 UTC (permalink / raw)
To: davem, kishon, andrew, jason, sebastian.hesselbarth,
gregory.clement, linux
Cc: Antoine Tenart, mw, stefanc, ymarkman, thomas.petazzoni,
miquel.raynal, nadavh, netdev, linux-arm-kernel, linux-kernel
Hi all,
This series adds 1000BaseX and 2500BaseX support to the Marvell PPv2
driver. In order to use it, the 2.5G SGMII mode is added in the Marvell
common PHY driver (cp110-comphy).
Thanks to these patches the fourth network interface can be used on the
mcbin, and two patches are attached: one to describe the interface in
the mcbin device tree, and another one adding Ethernet aliases now that
the four interfaces are described.
This was tested on a mcbin.
Patches 1 and 2 should go through the PHY tree, patches 3 and 4 through
the net-next tree and patches 5 and 6 through the mvebu one.
Please note the two mvpp2 patches do not conflict with the ACPI series
Marcin sent a few days ago, and the two series can be processed in
parallel. (Marcin is aware of me sending this series).
Thanks!
Antoine
Antoine Tenart (5):
phy: add 2.5G SGMII mode to the phy_mode enum
phy: cp110-comphy: 2.5G SGMII mode
net: mvpp2: 1000baseX support
net: mvpp2: 2500baseX support
arm64: dts: marvell: mcbin: enable the fourth network interface
Yan Markman (1):
arm64: dts: marvell: add Ethernet aliases
arch/arm64/boot/dts/marvell/armada-7040-db.dts | 6 ++
arch/arm64/boot/dts/marvell/armada-8040-db.dts | 7 +++
arch/arm64/boot/dts/marvell/armada-8040-mcbin.dts | 15 +++++
drivers/net/ethernet/marvell/mvpp2.c | 67 +++++++++++++++++++----
drivers/phy/marvell/phy-mvebu-cp110-comphy.c | 17 +++++-
include/linux/phy/phy.h | 1 +
6 files changed, 100 insertions(+), 13 deletions(-)
--
2.14.3
^ permalink raw reply
* Re: [patch iproute2 v3 3/4] tc: Add -bs option to batch mode
From: Marcelo Ricardo Leitner @ 2017-12-27 22:06 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Ahern, Chris Mi, netdev, gerlitz.or
In-Reply-To: <20171227134024.0706d33f@xeon-e3>
On Wed, Dec 27, 2017 at 01:40:24PM -0800, Stephen Hemminger wrote:
> On Wed, 27 Dec 2017 18:39:29 -0200
> Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> wrote:
>
> > > > + send = false;
> > > > + else
> > > > + send = true;
> > > > +
> > > > + ret = do_cmd(largc, largv, batch_size, msg_iov_index++, send);
> > >
> > > What happens if tc commands are interlaced in the file -- qdisc add,
> > > class add, filter add, then a delete, show, exec, etc.? Right now each
> > > command is handled one at a time so an add followed by a delete will
> > > work. Your proposed batching loop won't work for this case as some
> > > commands are executed when that line is reached and others are batched
> > > for later send. Not all of the tc commands need to be batched in a
> > > single message so perhaps those commands cause the queue to be flushed
> > > (ie, message sent), then that command is executed and you start the
> > > batching over.
> > >
> > > Further, I really think the batching can be done without the global
> > > variables and without the command handlers knowing it is batching or
> > > part of an iov. e.g., in the case of batching try having the commands
> > > malloc the request buffer and return the pointer back to this loop in
> > > which case this loop calls rtnl_talk_msg and frees the buffers.
> >
> > Sounds like the batching is being done at the wrong level. If it was
> > done by rtnl_talk(), it should be easier.
> > We can keep rtnl_talk() for previous users and make rtnl_talk_msg() do
> > the batching, mostly independent of which kind of msg it is.
> >
> > As you need to inform it that it was the last entry, that may be
> > detected with feof(stdin). Just add a 'bool flush' parameter to it.
> > rtnl_talk_msg(...., flush=feof(stdin));
> >
> > Next step then would be to add a memory manager layer to it, so
> > libnetlink wouldn't need to copy the messages but recycle pointers:
> > rtnl_get_msgbuf(): returns a buffer that one can use to fill in the
> > msg and use with rtnl_talk_msg()
> > and the free is done by libnetlink itself when the message is
> > finally sent, so no need to keep track of what one needs to free or
> > can reuse.
>
>
> What about using sendmmsg instead?
> That would allow sending multiple messages in one syscall.
Could be. Although the batching effect would be very different.
sendmmsg calls cond_resched() between messages, for instance.
^ permalink raw reply
* Re: [PATCH v2 net-next] net/trace: fix printk format in inet_sock_set_state
From: David Miller @ 2017-12-27 22:04 UTC (permalink / raw)
To: laoar.shao; +Cc: netdev, sergei.shtylyov
In-Reply-To: <1514128239-2331-1-git-send-email-laoar.shao@gmail.com>
From: Yafang Shao <laoar.shao@gmail.com>
Date: Sun, 24 Dec 2017 23:10:39 +0800
> There's a space character missing in the printk messages.
>
> Putting the message on one line also simplifies searching for
> the messages in the kernel source.
>
> Fixes: 563e0bb0dc74("net: tracepoint: replace tcp_set_state tracepoint with
> inet_sock_set_state tracepoint")
Please do not break up long "Fixes: " tag lines, for the same reason
you shouldn't break up long kernel log message and trace message
strings.
I fixed it this time.
> Cc: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Applied.
^ permalink raw reply
* Re: [PATCH net-next 07/10] net: qualcomm: rmnet: Add support for RX checksum offload
From: kbuild test robot @ 2017-12-27 22:02 UTC (permalink / raw)
To: Subash Abhinov Kasiviswanathan
Cc: kbuild-all, davem, netdev, Subash Abhinov Kasiviswanathan
In-Reply-To: <1514341685-11262-8-git-send-email-subashab@codeaurora.org>
Hi Subash,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on net-next/master]
url: https://github.com/0day-ci/linux/commits/Subash-Abhinov-Kasiviswanathan/net-qualcomm-rmnet-Enable-csum-offloads/20171228-041216
reproduce:
# apt-get install sparse
make ARCH=x86_64 allmodconfig
make C=1 CF=-D__CHECK_ENDIAN__
sparse warnings: (new ones prefixed by >>)
vim +30 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
26
27 static u16 *rmnet_map_get_csum_field(unsigned char protocol,
28 const void *txporthdr)
29 {
> 30 u16 *check = 0;
31
32 switch (protocol) {
33 case IPPROTO_TCP:
> 34 check = &(((struct tcphdr *)txporthdr)->check);
35 break;
36
37 case IPPROTO_UDP:
> 38 check = &(((struct udphdr *)txporthdr)->check);
39 break;
40
41 default:
42 check = 0;
43 break;
44 }
45
46 return check;
47 }
48
49 static int
50 rmnet_map_ipv4_dl_csum_trailer(struct sk_buff *skb,
51 struct rmnet_map_dl_csum_trailer *csum_trailer)
52 {
53 u16 ip_pseudo_payload_csum, pseudo_csum, ip_hdr_csum, *csum_field;
54 u16 csum_value, ip_payload_csum, csum_value_final;
55 struct iphdr *ip4h;
56 void *txporthdr;
57
58 ip4h = (struct iphdr *)(skb->data);
59 if ((ntohs(ip4h->frag_off) & IP_MF) ||
60 ((ntohs(ip4h->frag_off) & IP_OFFSET) > 0))
61 return -EOPNOTSUPP;
62
63 txporthdr = skb->data + ip4h->ihl * 4;
64
65 csum_field = rmnet_map_get_csum_field(ip4h->protocol, txporthdr);
66
67 if (!csum_field)
68 return -EPROTONOSUPPORT;
69
70 /* RFC 768 - Skip IPv4 UDP packets where sender checksum field is 0 */
71 if (*csum_field == 0 && ip4h->protocol == IPPROTO_UDP)
72 return 0;
73
> 74 csum_value = ~ntohs(csum_trailer->csum_value);
> 75 ip_hdr_csum = ~ip_fast_csum(ip4h, (int)ip4h->ihl);
> 76 ip_payload_csum = csum16_sub(csum_value, ip_hdr_csum);
77
> 78 pseudo_csum = ~ntohs(csum_tcpudp_magic(ip4h->saddr, ip4h->daddr,
79 (u16)(ntohs(ip4h->tot_len) - ip4h->ihl * 4),
80 (u16)ip4h->protocol, 0));
> 81 ip_pseudo_payload_csum = csum16_add(ip_payload_csum, pseudo_csum);
82
> 83 csum_value_final = ~csum16_sub(ip_pseudo_payload_csum,
> 84 ntohs(*csum_field));
85
86 if (unlikely(csum_value_final == 0)) {
87 switch (ip4h->protocol) {
88 case IPPROTO_UDP:
89 /* RFC 768 - DL4 1's complement rule for UDP csum 0 */
90 csum_value_final = ~csum_value_final;
91 break;
92
93 case IPPROTO_TCP:
94 /* DL4 Non-RFC compliant TCP checksum found */
95 if (*csum_field == 0xFFFF)
96 csum_value_final = ~csum_value_final;
97 break;
98 }
99 }
100
101 if (csum_value_final == ntohs(*csum_field))
102 return 0;
103 else
104 return -EINVAL;
105 }
106
107 #if IS_ENABLED(CONFIG_IPV6)
108 static int
109 rmnet_map_ipv6_dl_csum_trailer(struct sk_buff *skb,
110 struct rmnet_map_dl_csum_trailer *csum_trailer)
111 {
112 u16 ip_pseudo_payload_csum, pseudo_csum, ip6_hdr_csum, *csum_field;
113 u16 csum_value, ip6_payload_csum, csum_value_final;
114 struct ipv6hdr *ip6h;
115 void *txporthdr;
116 u32 length;
117
118 ip6h = (struct ipv6hdr *)(skb->data);
119
120 txporthdr = skb->data + sizeof(struct ipv6hdr);
121 csum_field = rmnet_map_get_csum_field(ip6h->nexthdr, txporthdr);
122
123 if (!csum_field)
124 return -EPROTONOSUPPORT;
125
126 csum_value = ~ntohs(csum_trailer->csum_value);
> 127 ip6_hdr_csum = ~ntohs(ip_compute_csum(ip6h,
128 (int)(txporthdr - (void *)(skb->data))));
> 129 ip6_payload_csum = csum16_sub(csum_value, ip6_hdr_csum);
130
131 length = (ip6h->nexthdr == IPPROTO_UDP) ?
132 ntohs(((struct udphdr *)txporthdr)->len) :
133 ntohs(ip6h->payload_len);
134 pseudo_csum = ~ntohs(csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
135 length, ip6h->nexthdr, 0));
136 ip_pseudo_payload_csum = csum16_add(ip6_payload_csum, pseudo_csum);
137
138 csum_value_final = ~csum16_sub(ip_pseudo_payload_csum,
139 ntohs(*csum_field));
140
141 if (unlikely(csum_value_final == 0)) {
142 switch (ip6h->nexthdr) {
143 case IPPROTO_UDP:
144 /* RFC 2460 section 8.1
145 * DL6 One's complement rule for UDP checksum 0
146 */
147 csum_value_final = ~csum_value_final;
148 break;
149
150 case IPPROTO_TCP:
151 /* DL6 Non-RFC compliant TCP checksum found */
152 if (*csum_field == 0xFFFF)
153 csum_value_final = ~csum_value_final;
154 break;
155 }
156 }
157
158 if (csum_value_final == ntohs(*csum_field))
159 return 0;
160 else
161 return -EINVAL;
162 }
163 #endif
164
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
^ permalink raw reply
* Re: [pull request][for-next V2 00/11] Mellanox, mlx5 E-Switch updates 2017-12-19
From: David Miller @ 2017-12-27 22:03 UTC (permalink / raw)
To: saeedm; +Cc: dledford, netdev, linux-rdma, leonro
In-Reply-To: <20171227.170122.256080933244367774.davem@davemloft.net>
From: David Miller <davem@davemloft.net>
Date: Wed, 27 Dec 2017 17:01:22 -0500 (EST)
> Pulled, thank you.
Actually, I had to revert. Please fix this and resubmit:
drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c: In function ‘esw_offloads_load_reps’:
drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c:774:2: warning: this ‘for’ clause does not guard... [-Wmisleading-indentation]
for (rep_type = 0; rep_type < NUM_REP_TYPES; rep_type++)
^~~
drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c:776:3: note: ...this statement, but the latter is misleadingly indented as if it is guarded by the ‘for’
if (err)
^~
^ permalink raw reply
* Re: [pull request][for-next V2 00/11] Mellanox, mlx5 E-Switch updates 2017-12-19
From: David Miller @ 2017-12-27 22:01 UTC (permalink / raw)
To: saeedm-VPRAkNaXOzVWk0Htik3J/w
Cc: dledford-H+wXaHxf7aLQT0dZR+AlfA, netdev-u79uwXL29TY76Z2rM5mHXA,
linux-rdma-u79uwXL29TY76Z2rM5mHXA, leonro-VPRAkNaXOzVWk0Htik3J/w
In-Reply-To: <20171224134547.600-1-saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
From: Saeed Mahameed <saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Date: Sun, 24 Dec 2017 15:45:36 +0200
> Hi Dave and Doug,
>
> ==============
> This series includes updates for mlx5 E-Switch infrastructures,
> to be merged into net-next and rdma-next trees.
>
> Mark's patches provide E-Switch refactoring that generalize the mlx5
> E-Switch vf representors interfaces and data structures. The series is
> mainly focused on moving ethernet (netdev) specific representors logic out
> of E-Switch (eswitch.c) into mlx5e representor module (en_rep.c), which
> provides better separation and allows future support for other types of vf
> representors (e.g. RDMA).
>
> Gal's patches at the end of this series provide a simple syntax fix and
> two other patches that handle vport ingress/egress ACL steering name
> spaces to be aligned with the Firmware/Hardware specs.
> ===============
>
> V1->V2:
> - Addressed coding style comments in patches #1 and #7
> - The series is still based on rc4, as now I see net-next is also @rc4.
>
> Please pull and let me know if there's any problem.
Pulled, thank you.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [PATCH iproute] qdisc: Print offload indication
From: Stephen Hemminger @ 2017-12-27 21:55 UTC (permalink / raw)
To: Yuval Mintz; +Cc: netdev, mlxsw
In-Reply-To: <1514281725-47488-1-git-send-email-yuvalm@mellanox.com>
On Tue, 26 Dec 2017 11:48:45 +0200
Yuval Mintz <yuvalm@mellanox.com> wrote:
> Use the newly added TCA_HW_OFFLOAD indication from kernel
> to print a consistent 'offloaded' message to user when listing qdiscs.
>
> Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Applied to master (since TCA_HW_OFFLOAD is already present).
^ permalink raw reply
* Re: [PATCH v2 net-next 0/2] kcm: Fix two locking issues
From: David Miller @ 2017-12-27 21:56 UTC (permalink / raw)
To: tom; +Cc: netdev, dvyukov, rohit
In-Reply-To: <20171223171716.16130-1-tom@quantonium.net>
From: Tom Herbert <tom@quantonium.net>
Date: Sat, 23 Dec 2017 09:17:14 -0800
> One issue is lockdep warnings when sock_owned_by_user returns true
> in strparser. Fix is to add and call sock_owned_by_user_nocheck since
> the check for owned by user is not an error condition in this case.
>
> The other issue is a potential deadlock between TX and RX paths
>
> KCM socket lock and the psock socket lock are acquired in both
> the RX and TX path, however they take the locks in opposite order
> which can lead to deadlock. The fix is to add try_sock_lock to see
> if psock socket lock can get acquired in the TX path with KCM lock
> held. If not, then KCM socket is released and the psock socket lock
> and KCM socket lock are acquired in the same order as the RX path.
>
> Tested:
>
> Ran KCM traffic without incident.
>
> v2: Remove patches to address potential deadlock. I couldn't convince
> myself this is an issue after looking at the code some more.
If this fixes real locking bugs you should target them at 'net' not
'net-next'.
I also see you telling some people hitting kcm locking problems to
test "the patch" but you give them no idea what patch you are even
talking about.
Is it this series? Nobody knows.
Please point them at exactly what patch you want them to test, get
their testing results, and add appropriate Tested-by: tags as you
respin this for 'net'.
Thanks.
^ permalink raw reply
* Re: [PATCH iproute2 0/3] ip/tunnel: Fix noencap- and document "external" parameter handling.
From: Stephen Hemminger @ 2017-12-27 21:47 UTC (permalink / raw)
To: Serhey Popovych; +Cc: netdev
In-Reply-To: <1514374096-1473-1-git-send-email-serhe.popovych@gmail.com>
On Wed, 27 Dec 2017 13:28:13 +0200
Serhey Popovych <serhe.popovych@gmail.com> wrote:
> In this series I present next set of improvements/fixes:
>
> 1) Fix noencap- option handling: we need to clear bit, instead
> of setting all, but one we expect to clear.
>
> 2) Document "external" parameter both in ip-link(8) and help
> output for link_gre.c. Add "noexternal" option variant to
> bring inline with GENEVE and VXLAN.
>
> 3) Trivial: clear flowlabel/tclass from flowinfo in case of
> inherit to stop sending garbage to the kernel. It has no
> functional change, but follows similar behaviour in link_ip6tnl.c
>
> See individual patch description message for details.
>
> Thanks,
> Serhii
>
> Serhey Popovych (3):
> gre,ip6tnl/tunnel: Fix noencap- support
> gre6/tunnel: Do not submit garbage in flowinfo
> ip/tunnel: Document "external" parameter
>
> ip/link_gre.c | 7 +++++--
> ip/link_gre6.c | 4 ++--
> ip/link_ip6tnl.c | 6 ++++--
> ip/link_iptnl.c | 4 +++-
> man/man8/ip-link.8.in | 6 ++++++
> 5 files changed, 20 insertions(+), 7 deletions(-)
>
These are really disjoint. I applied the noencap and the flowinfo patch.
Agree with William that having noexternal which does nothing useful is of little value now.
^ permalink raw reply
* Re: [PATCH net-next 0/4] zerocopy refinements
From: David Miller @ 2017-12-27 21:45 UTC (permalink / raw)
To: willemdebruijn.kernel; +Cc: netdev, willemb
In-Reply-To: <20171223000020.55509-1-willemdebruijn.kernel@gmail.com>
From: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
Date: Fri, 22 Dec 2017 19:00:16 -0500
> From: Willem de Bruijn <willemb@google.com>
>
> 1/4 is a small optimization follow-up to the earlier fix to skb_segment:
> check skb state once per skb, instead of once per frag.
> 2/4 makes behavior more consistent between standard and zerocopy send:
> set the PSH bit when hitting MAX_SKB_FRAGS. This helps GRO.
> 3/4 resolves a surprising inconsistency in notification:
> because small packets were not stored in frags, they would not set
> the copied error code over loopback. This change also optimizes
> the path by removing copying and making tso_fragment cheaper.
> 4/4 follows up on 3/4 by no longer allocating now-unused memory.
> this was actually already in RFC patches, but dropped as I pared
> down the patch set during revisions.
Looks good, series applied, thanks.
^ permalink raw reply
* Re: [PATCH net-next v2 1/3] virtio_net: propagate linkspeed/duplex settings from the hypervisor
From: David Miller @ 2017-12-27 21:43 UTC (permalink / raw)
To: jbaron; +Cc: netdev, virtualization, qemu-devel, mst, jasowang
In-Reply-To: <44da522ecee60792ec918234ee4d61a84e4574f0.1513974243.git.jbaron@akamai.com>
From: Jason Baron <jbaron@akamai.com>
Date: Fri, 22 Dec 2017 16:54:01 -0500
> The ability to set speed and duplex for virtio_net is useful in various
> scenarios as described here:
>
> 16032be virtio_net: add ethtool support for set and get of settings
>
> However, it would be nice to be able to set this from the hypervisor,
> such that virtio_net doesn't require custom guest ethtool commands.
>
> Introduce a new feature flag, VIRTIO_NET_F_SPEED_DUPLEX, which allows
> the hypervisor to export a linkspeed and duplex setting. The user can
> subsequently overwrite it later if desired via: 'ethtool -s'.
>
> Signed-off-by: Jason Baron <jbaron@akamai.com>
> Cc: "Michael S. Tsirkin" <mst@redhat.com>
> Cc: Jason Wang <jasowang@redhat.com>
Looks mostly fine to me but need some virtio_net reviewers on this one.
> @@ -57,6 +57,8 @@
> * Steering */
> #define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */
>
> +#define VIRTIO_NET_F_SPEED_DUPLEX 63 /* Host set linkspeed and duplex */
> +
Why use a value so far away from the largest existing one?
Just curious.
^ permalink raw reply
* Re: [patch iproute2 v3 3/4] tc: Add -bs option to batch mode
From: Stephen Hemminger @ 2017-12-27 21:40 UTC (permalink / raw)
To: Marcelo Ricardo Leitner; +Cc: David Ahern, Chris Mi, netdev, gerlitz.or
In-Reply-To: <20171227203929.GB22042@localhost.localdomain>
On Wed, 27 Dec 2017 18:39:29 -0200
Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> wrote:
> > > + send = false;
> > > + else
> > > + send = true;
> > > +
> > > + ret = do_cmd(largc, largv, batch_size, msg_iov_index++, send);
> >
> > What happens if tc commands are interlaced in the file -- qdisc add,
> > class add, filter add, then a delete, show, exec, etc.? Right now each
> > command is handled one at a time so an add followed by a delete will
> > work. Your proposed batching loop won't work for this case as some
> > commands are executed when that line is reached and others are batched
> > for later send. Not all of the tc commands need to be batched in a
> > single message so perhaps those commands cause the queue to be flushed
> > (ie, message sent), then that command is executed and you start the
> > batching over.
> >
> > Further, I really think the batching can be done without the global
> > variables and without the command handlers knowing it is batching or
> > part of an iov. e.g., in the case of batching try having the commands
> > malloc the request buffer and return the pointer back to this loop in
> > which case this loop calls rtnl_talk_msg and frees the buffers.
>
> Sounds like the batching is being done at the wrong level. If it was
> done by rtnl_talk(), it should be easier.
> We can keep rtnl_talk() for previous users and make rtnl_talk_msg() do
> the batching, mostly independent of which kind of msg it is.
>
> As you need to inform it that it was the last entry, that may be
> detected with feof(stdin). Just add a 'bool flush' parameter to it.
> rtnl_talk_msg(...., flush=feof(stdin));
>
> Next step then would be to add a memory manager layer to it, so
> libnetlink wouldn't need to copy the messages but recycle pointers:
> rtnl_get_msgbuf(): returns a buffer that one can use to fill in the
> msg and use with rtnl_talk_msg()
> and the free is done by libnetlink itself when the message is
> finally sent, so no need to keep track of what one needs to free or
> can reuse.
What about using sendmmsg instead?
That would allow sending multiple messages in one syscall.
^ permalink raw reply
* Re: lost connection to test machine (3)
From: Florian Westphal @ 2017-12-27 21:36 UTC (permalink / raw)
To: Dmitry Vyukov
Cc: syzbot, LKML, syzkaller-bugs, Pablo Neira Ayuso, Jozsef Kadlecsik,
Florian Westphal, David Miller, netfilter-devel, coreteam, netdev
In-Reply-To: <CACT4Y+ZtCkx8WUjY95n-if60MWed2UpjM7H-_TQjfVq2PDXKhQ@mail.gmail.com>
Dmitry Vyukov <dvyukov@google.com> wrote:
> On Wed, Dec 27, 2017 at 7:18 PM, syzbot
> <syzbot+4396883fa8c4f64e0175@syzkaller.appspotmail.com> wrote:
> > Hello,
> >
> > syzkaller hit the following crash on
> > beacbc68ac3e23821a681adb30b45dc55b17488d
> > git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/master
> > compiler: gcc (GCC) 7.1.1 20170620
> > .config is attached
> > Raw console output is attached.
> > C reproducer is attached
> > syzkaller reproducer is attached. See https://goo.gl/kgGztJ
> > for information about syzkaller reproducers
> >
> >
> > IMPORTANT: if you fix the bug, please add the following tag to the commit:
> > Reported-by: <syzbot+4396883fa8c4f64e0175@syzkaller.appspotmail.com>
> > It will help syzbot understand when the bug is fixed. See footer for
> > details.
> > If you forward the report, please keep this part and the footer.
>
> +netfilter maintainers
>
> Here is cleaned reproducer:
>
> // autogenerated by syzkaller (http://github.com/google/syzkaller)
> #include <sys/types.h>
> #include <sys/socket.h>
> #include <netinet/in.h>
> #include <netinet/tcp.h>
> #include <linux/if.h>
> #include <linux/netfilter_ipv4/ip_tables.h>
>
> int main()
> {
> int fd;
>
> fd = socket(AF_INET, SOCK_STREAM, IPPROTO_IP);
> struct ipt_replace opt = {};
> opt.num_counters = 1;
> opt.size = -1;
> setsockopt(fd, SOL_IP, 0x40, &opt, 0x4);
> return 0;
> }
>
>
> What happens there is that here:
>
> struct xt_table_info *xt_alloc_table_info(unsigned int size)
> {
> ...
> if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages)
> return NULL;
>
> size = -1 and SMP_ALIGN(size) = 0, so this still tries to allocate
> 4GB+delta bytes.
>
> I don't understand why this uses SMP_ALIGN since we add 2 pages on
> top, it seems that we could just drop SMP_ALIGN and local SMP_ALIGN
> definition altogether.
Looking at history.git this seems to be a left over from back when
iptables allocated size * num_cpus() (and used an SMP_ALIGN based offset
for each cpu).
So yes, I think we can just toss/drop this.
^ permalink raw reply
* [PATCH] Staging: ipx: fixed several no space before tabs coding style issues
From: Jianshen Liu @ 2017-12-27 21:25 UTC (permalink / raw)
To: gregkh; +Cc: devel, netdev, linux-kernel, Jianshen Liu
Fixed several coding style warnings of "please, no space before tabs".
Signed-off-by: Jianshen Liu <ljishen@gmail.com>
---
drivers/staging/ipx/af_ipx.c | 56 ++++++++++++++++++++---------------------
drivers/staging/ipx/ipx_proc.c | 2 +-
drivers/staging/ipx/ipx_route.c | 6 ++---
3 files changed, 32 insertions(+), 32 deletions(-)
diff --git a/drivers/staging/ipx/af_ipx.c b/drivers/staging/ipx/af_ipx.c
index d21a9d1..d8be06c 100644
--- a/drivers/staging/ipx/af_ipx.c
+++ b/drivers/staging/ipx/af_ipx.c
@@ -2,7 +2,7 @@
* Implements an IPX socket layer.
*
* This code is derived from work by
- * Ross Biro : Writing the original IP stack
+ * Ross Biro : Writing the original IP stack
* Fred Van Kempen : Tidying up the TCP/IP
*
* Many thanks go to Keith Baker, Institute For Industrial Information
@@ -20,7 +20,7 @@
* provide warranty for any of this software. This material is provided
* "AS-IS" and at no charge.
*
- * Portions Copyright (c) 1995 Caldera, Inc. <greg@caldera.com>
+ * Portions Copyright (c) 1995 Caldera, Inc. <greg@caldera.com>
* Neither Greg Page nor Caldera, Inc. admit liability nor provide
* warranty for any of this software. This material is provided
* "AS-IS" and at no charge.
@@ -758,7 +758,7 @@ static void ipxitf_discover_netnum(struct ipx_interface *intrfc,
/**
* ipxitf_pprop - Process packet propagation IPX packet type 0x14, used for
- * NetBIOS broadcasts
+ * NetBIOS broadcasts
* @intrfc: IPX interface receiving this packet
* @skb: Received packet
*
@@ -870,11 +870,11 @@ static struct ipx_interface *ipxitf_alloc(struct net_device *dev, __be32 netnum,
if (intrfc) {
intrfc->if_dev = dev;
intrfc->if_netnum = netnum;
- intrfc->if_dlink_type = dlink_type;
- intrfc->if_dlink = dlink;
- intrfc->if_internal = internal;
- intrfc->if_ipx_offset = ipx_offset;
- intrfc->if_sknum = IPX_MIN_EPHEMERAL_SOCKET;
+ intrfc->if_dlink_type = dlink_type;
+ intrfc->if_dlink = dlink;
+ intrfc->if_internal = internal;
+ intrfc->if_ipx_offset = ipx_offset;
+ intrfc->if_sknum = IPX_MIN_EPHEMERAL_SOCKET;
INIT_HLIST_HEAD(&intrfc->if_sklist);
refcount_set(&intrfc->refcnt, 1);
spin_lock_init(&intrfc->if_sklist_lock);
@@ -965,23 +965,23 @@ static int ipxitf_create(struct ipx_interface_definition *idef)
switch (idef->ipx_dlink_type) {
case IPX_FRAME_8022:
- dlink_type = htons(ETH_P_802_2);
- datalink = p8022_datalink;
+ dlink_type = htons(ETH_P_802_2);
+ datalink = p8022_datalink;
break;
case IPX_FRAME_ETHERII:
if (dev->type != ARPHRD_IEEE802) {
- dlink_type = htons(ETH_P_IPX);
- datalink = pEII_datalink;
+ dlink_type = htons(ETH_P_IPX);
+ datalink = pEII_datalink;
break;
}
/* fall through */
case IPX_FRAME_SNAP:
- dlink_type = htons(ETH_P_SNAP);
- datalink = pSNAP_datalink;
+ dlink_type = htons(ETH_P_SNAP);
+ datalink = pSNAP_datalink;
break;
case IPX_FRAME_8023:
- dlink_type = htons(ETH_P_802_3);
- datalink = p8023_datalink;
+ dlink_type = htons(ETH_P_802_3);
+ datalink = p8023_datalink;
break;
case IPX_FRAME_NONE:
default:
@@ -1522,7 +1522,7 @@ static int ipx_connect(struct socket *sock, struct sockaddr *uaddr,
struct ipx_route *rt;
sk->sk_state = TCP_CLOSE;
- sock->state = SS_UNCONNECTED;
+ sock->state = SS_UNCONNECTED;
lock_sock(sk);
if (addr_len != sizeof(*addr))
@@ -1534,7 +1534,7 @@ static int ipx_connect(struct socket *sock, struct sockaddr *uaddr,
struct sockaddr_ipx uaddr;
uaddr.sipx_port = 0;
- uaddr.sipx_network = 0;
+ uaddr.sipx_network = 0;
#ifdef CONFIG_IPX_INTERN
rc = -ENETDOWN;
@@ -1563,8 +1563,8 @@ static int ipx_connect(struct socket *sock, struct sockaddr *uaddr,
ipxs->type = addr->sipx_type;
if (sock->type == SOCK_DGRAM) {
- sock->state = SS_CONNECTED;
- sk->sk_state = TCP_ESTABLISHED;
+ sock->state = SS_CONNECTED;
+ sk->sk_state = TCP_ESTABLISHED;
}
if (rt)
@@ -1736,10 +1736,10 @@ static int ipx_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
goto out;
usipx = &local_sipx;
- usipx->sipx_family = AF_IPX;
- usipx->sipx_type = ipxs->type;
- usipx->sipx_port = ipxs->dest_addr.sock;
- usipx->sipx_network = ipxs->dest_addr.net;
+ usipx->sipx_family = AF_IPX;
+ usipx->sipx_type = ipxs->type;
+ usipx->sipx_port = ipxs->dest_addr.sock;
+ usipx->sipx_network = ipxs->dest_addr.net;
memcpy(usipx->sipx_node, ipxs->dest_addr.node, IPX_NODE_LEN);
}
@@ -1769,7 +1769,7 @@ static int ipx_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
struct sockaddr_ipx uaddr;
uaddr.sipx_port = 0;
- uaddr.sipx_network = 0;
+ uaddr.sipx_network = 0;
#ifdef CONFIG_IPX_INTERN
rc = -ENETDOWN;
@@ -1798,8 +1798,8 @@ static int ipx_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
goto out;
}
- ipx = ipx_hdr(skb);
- copied = ntohs(ipx->ipx_pktsize) - sizeof(struct ipxhdr);
+ ipx = ipx_hdr(skb);
+ copied = ntohs(ipx->ipx_pktsize) - sizeof(struct ipxhdr);
if (copied > size) {
copied = size;
msg->msg_flags |= MSG_TRUNC;
@@ -1816,7 +1816,7 @@ static int ipx_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
sipx->sipx_port = ipx->ipx_source.sock;
memcpy(sipx->sipx_node, ipx->ipx_source.node, IPX_NODE_LEN);
sipx->sipx_network = IPX_SKB_CB(skb)->ipx_source_net;
- sipx->sipx_type = ipx->ipx_type;
+ sipx->sipx_type = ipx->ipx_type;
sipx->sipx_zero = 0;
msg->msg_namelen = sizeof(*sipx);
}
diff --git a/drivers/staging/ipx/ipx_proc.c b/drivers/staging/ipx/ipx_proc.c
index 38a3d51..893b2ab 100644
--- a/drivers/staging/ipx/ipx_proc.c
+++ b/drivers/staging/ipx/ipx_proc.c
@@ -2,7 +2,7 @@
/*
* IPX proc routines
*
- * Copyright(C) Arnaldo Carvalho de Melo <acme@conectiva.com.br>, 2002
+ * Copyright(C) Arnaldo Carvalho de Melo <acme@conectiva.com.br>, 2002
*/
#include <linux/init.h>
diff --git a/drivers/staging/ipx/ipx_route.c b/drivers/staging/ipx/ipx_route.c
index 3cf93aa9..833206c 100644
--- a/drivers/staging/ipx/ipx_route.c
+++ b/drivers/staging/ipx/ipx_route.c
@@ -71,8 +71,8 @@ int ipxrtr_add_route(__be32 network, struct ipx_interface *intrfc,
goto out_put;
}
- rt->ir_net = network;
- rt->ir_intrfc = intrfc;
+ rt->ir_net = network;
+ rt->ir_intrfc = intrfc;
if (!node) {
memset(rt->ir_router_node, '\0', IPX_NODE_LEN);
rt->ir_routed = 0;
@@ -207,7 +207,7 @@ int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx,
ipx = ipx_hdr(skb);
ipx->ipx_pktsize = htons(len + sizeof(struct ipxhdr));
IPX_SKB_CB(skb)->ipx_tctrl = 0;
- ipx->ipx_type = usipx->sipx_type;
+ ipx->ipx_type = usipx->sipx_type;
IPX_SKB_CB(skb)->last_hop.index = -1;
#ifdef CONFIG_IPX_INTERN
--
2.7.4
^ permalink raw reply related
* Re: [patch iproute2 v3 3/4] tc: Add -bs option to batch mode
From: Marcelo Ricardo Leitner @ 2017-12-27 20:39 UTC (permalink / raw)
To: David Ahern; +Cc: Chris Mi, netdev, gerlitz.or, stephen
In-Reply-To: <28563dd5-01be-9198-2911-658bbd0ba3d7@gmail.com>
On Wed, Dec 27, 2017 at 09:39:15AM -0600, David Ahern wrote:
> On 12/25/17 2:46 AM, Chris Mi wrote:
> > Signed-off-by: Chris Mi <chrism@mellanox.com>
> > ---
> > tc/m_action.c | 91 +++++++++++++++++++++++++++++++++----------
> > tc/tc.c | 47 ++++++++++++++++++----
> > tc/tc_common.h | 8 +++-
> > tc/tc_filter.c | 121 +++++++++++++++++++++++++++++++++++++++++----------------
> > 4 files changed, 204 insertions(+), 63 deletions(-)
> >
> > diff --git a/tc/m_action.c b/tc/m_action.c
> > index fc422364..c4c3b862 100644
> > --- a/tc/m_action.c
> > +++ b/tc/m_action.c
> > @@ -23,6 +23,7 @@
> > #include <arpa/inet.h>
> > #include <string.h>
> > #include <dlfcn.h>
> > +#include <errno.h>
> >
> > #include "utils.h"
> > #include "tc_common.h"
> > @@ -546,40 +547,88 @@ bad_val:
> > return ret;
> > }
> >
> > +typedef struct {
> > + struct nlmsghdr n;
> > + struct tcamsg t;
> > + char buf[MAX_MSG];
> > +} tc_action_req;
> > +
> > +static tc_action_req *action_reqs;
> > +static struct iovec msg_iov[MSG_IOV_MAX];
> > +
> > +void free_action_reqs(void)
> > +{
> > + free(action_reqs);
> > +}
> > +
> > +static tc_action_req *get_action_req(int batch_size, int index)
> > +{
> > + tc_action_req *req;
> > +
> > + if (action_reqs == NULL) {
> > + action_reqs = malloc(batch_size * sizeof (tc_action_req));
> > + if (action_reqs == NULL)
> > + return NULL;
> > + }
> > + req = &action_reqs[index];
> > + memset(req, 0, sizeof (*req));
> > +
> > + return req;
> > +}
> > +
> > static int tc_action_modify(int cmd, unsigned int flags,
> > - int *argc_p, char ***argv_p)
> > + int *argc_p, char ***argv_p,
> > + int batch_size, int index, bool send)
> > {
> > int argc = *argc_p;
> > char **argv = *argv_p;
> > int ret = 0;
> > - struct {
> > - struct nlmsghdr n;
> > - struct tcamsg t;
> > - char buf[MAX_MSG];
> > - } req = {
> > - .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
> > - .n.nlmsg_flags = NLM_F_REQUEST | flags,
> > - .n.nlmsg_type = cmd,
> > - .t.tca_family = AF_UNSPEC,
> > + tc_action_req *req;
> > + struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
> > + struct iovec *iov = &msg_iov[index];
> > +
> > + req = get_action_req(batch_size, index);
> > + if (req == NULL) {
> > + fprintf(stderr, "get_action_req error: not enough buffer\n");
> > + return -ENOMEM;
> > + }
> > +
> > + req->n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg));
> > + req->n.nlmsg_flags = NLM_F_REQUEST | flags;
> > + req->n.nlmsg_type = cmd;
> > + req->t.tca_family = AF_UNSPEC;
> > + struct rtattr *tail = NLMSG_TAIL(&req->n);
> > +
> > + struct msghdr msg = {
> > + .msg_name = &nladdr,
> > + .msg_namelen = sizeof(nladdr),
> > + .msg_iov = msg_iov,
> > + .msg_iovlen = index + 1,
> > };
> > - struct rtattr *tail = NLMSG_TAIL(&req.n);
> >
> > argc -= 1;
> > argv += 1;
> > - if (parse_action(&argc, &argv, TCA_ACT_TAB, &req.n)) {
> > + if (parse_action(&argc, &argv, TCA_ACT_TAB, &req->n)) {
> > fprintf(stderr, "Illegal \"action\"\n");
> > return -1;
> > }
> > - tail->rta_len = (void *) NLMSG_TAIL(&req.n) - (void *) tail;
> > + tail->rta_len = (void *) NLMSG_TAIL(&req->n) - (void *) tail;
> > +
> > + *argc_p = argc;
> > + *argv_p = argv;
> > +
> > + iov->iov_base = &req->n;
> > + iov->iov_len = req->n.nlmsg_len;
> > +
> > + if (!send)
> > + return 0;
> >
> > - if (rtnl_talk(&rth, &req.n, NULL) < 0) {
> > + ret = rtnl_talk_msg(&rth, &msg, NULL);
> > + if (ret < 0) {
> > fprintf(stderr, "We have an error talking to the kernel\n");
> > ret = -1;
> > }
> >
> > - *argc_p = argc;
> > - *argv_p = argv;
> > -
> > return ret;
> > }
> >
> > @@ -679,7 +728,7 @@ bad_val:
> > return ret;
> > }
> >
> > -int do_action(int argc, char **argv)
> > +int do_action(int argc, char **argv, int batch_size, int index, bool send)
> > {
> >
> > int ret = 0;
> > @@ -689,12 +738,14 @@ int do_action(int argc, char **argv)
> > if (matches(*argv, "add") == 0) {
> > ret = tc_action_modify(RTM_NEWACTION,
> > NLM_F_EXCL | NLM_F_CREATE,
> > - &argc, &argv);
> > + &argc, &argv, batch_size,
> > + index, send);
> > } else if (matches(*argv, "change") == 0 ||
> > matches(*argv, "replace") == 0) {
> > ret = tc_action_modify(RTM_NEWACTION,
> > NLM_F_CREATE | NLM_F_REPLACE,
> > - &argc, &argv);
> > + &argc, &argv, batch_size,
> > + index, send);
> > } else if (matches(*argv, "delete") == 0) {
> > argc -= 1;
> > argv += 1;
> > diff --git a/tc/tc.c b/tc/tc.c
> > index ad9f07e9..7ea2fc89 100644
> > --- a/tc/tc.c
> > +++ b/tc/tc.c
> > @@ -189,20 +189,20 @@ static void usage(void)
> > fprintf(stderr, "Usage: tc [ OPTIONS ] OBJECT { COMMAND | help }\n"
> > " tc [-force] -batch filename\n"
> > "where OBJECT := { qdisc | class | filter | action | monitor | exec }\n"
> > - " OPTIONS := { -s[tatistics] | -d[etails] | -r[aw] | -p[retty] | -b[atch] [filename] | -n[etns] name |\n"
> > + " OPTIONS := { -s[tatistics] | -d[etails] | -r[aw] | -p[retty] | -b[atch] [filename] | -bs | -batchsize [size] | -n[etns] name |\n"
> > " -nm | -nam[es] | { -cf | -conf } path } | -j[son]\n");
> > }
> >
> > -static int do_cmd(int argc, char **argv)
> > +static int do_cmd(int argc, char **argv, int batch_size, int index, bool send)
> > {
> > if (matches(*argv, "qdisc") == 0)
> > return do_qdisc(argc-1, argv+1);
> > if (matches(*argv, "class") == 0)
> > return do_class(argc-1, argv+1);
> > if (matches(*argv, "filter") == 0)
> > - return do_filter(argc-1, argv+1);
> > + return do_filter(argc-1, argv+1, batch_size, index, send);
> > if (matches(*argv, "actions") == 0)
> > - return do_action(argc-1, argv+1);
> > + return do_action(argc-1, argv+1, batch_size, index, send);
> > if (matches(*argv, "monitor") == 0)
> > return do_tcmonitor(argc-1, argv+1);
> > if (matches(*argv, "exec") == 0)
> > @@ -217,11 +217,16 @@ static int do_cmd(int argc, char **argv)
> > return -1;
> > }
> >
> > -static int batch(const char *name)
> > +static int batch(const char *name, int batch_size)
> > {
> > + int msg_iov_index = 0;
> > char *line = NULL;
> > size_t len = 0;
> > int ret = 0;
> > + bool send;
> > +
> > + if (batch_size > 1)
> > + setcmdlinetotal(name);
> >
> > batch_mode = 1;
> > if (name && strcmp(name, "-") != 0) {
> > @@ -248,15 +253,30 @@ static int batch(const char *name)
> > if (largc == 0)
> > continue; /* blank line */
> >
> > - if (do_cmd(largc, largv)) {
> > + /*
> > + * In batch mode, if we haven't accumulated enough commands
> > + * and this is not the last command, don't send the message
> > + * immediately.
> > + */
> > + if (batch_size > 1 && msg_iov_index + 1 != batch_size
> > + && cmdlineno != cmdlinetotal)
I think you can replace the cmdlineno check with a simple !feof(stdin)
> > + send = false;
> > + else
> > + send = true;
> > +
> > + ret = do_cmd(largc, largv, batch_size, msg_iov_index++, send);
>
> What happens if tc commands are interlaced in the file -- qdisc add,
> class add, filter add, then a delete, show, exec, etc.? Right now each
> command is handled one at a time so an add followed by a delete will
> work. Your proposed batching loop won't work for this case as some
> commands are executed when that line is reached and others are batched
> for later send. Not all of the tc commands need to be batched in a
> single message so perhaps those commands cause the queue to be flushed
> (ie, message sent), then that command is executed and you start the
> batching over.
>
> Further, I really think the batching can be done without the global
> variables and without the command handlers knowing it is batching or
> part of an iov. e.g., in the case of batching try having the commands
> malloc the request buffer and return the pointer back to this loop in
> which case this loop calls rtnl_talk_msg and frees the buffers.
Sounds like the batching is being done at the wrong level. If it was
done by rtnl_talk(), it should be easier.
We can keep rtnl_talk() for previous users and make rtnl_talk_msg() do
the batching, mostly independent of which kind of msg it is.
As you need to inform it that it was the last entry, that may be
detected with feof(stdin). Just add a 'bool flush' parameter to it.
rtnl_talk_msg(...., flush=feof(stdin));
Next step then would be to add a memory manager layer to it, so
libnetlink wouldn't need to copy the messages but recycle pointers:
rtnl_get_msgbuf(): returns a buffer that one can use to fill in the
msg and use with rtnl_talk_msg()
and the free is done by libnetlink itself when the message is
finally sent, so no need to keep track of what one needs to free or
can reuse.
>
> > + if (ret < 0) {
> > fprintf(stderr, "Command failed %s:%d\n", name, cmdlineno);
> > ret = 1;
> > if (!force)
> > break;
> > }
> > + msg_iov_index %= batch_size;
> > }
> > if (line)
> > free(line);
> > + free_filter_reqs();
> > + free_action_reqs();
> >
> > rtnl_close(&rth);
> > return ret;
>
^ permalink raw reply
* Re: [PATCH v3 1/4] security: Add support for SCTP security hooks
From: Paul Moore @ 2017-12-27 20:35 UTC (permalink / raw)
To: Richard Haines
Cc: Marcelo Ricardo Leitner, Casey Schaufler, selinux, netdev,
linux-sctp, linux-security-module, Vlad Yasevich, nhorman,
Stephen Smalley, Eric Paris
In-Reply-To: <1514391741.2780.4.camel@btinternet.com>
On Wed, Dec 27, 2017 at 11:22 AM, Richard Haines
<richard_c_haines@btinternet.com> wrote:
> On Fri, 2017-12-22 at 15:45 -0200, Marcelo Ricardo Leitner wrote:
>> On Fri, Dec 22, 2017 at 09:20:45AM -0800, Casey Schaufler wrote:
>> > On 12/22/2017 5:05 AM, Marcelo Ricardo Leitner wrote:
>> > > From: Richard Haines <richard_c_haines@btinternet.com>
>> > >
>> > > The SCTP security hooks are explained in:
>> > > Documentation/security/LSM-sctp.rst
>>
>> Thanks Casey for your comments. However, I'm not that acquainted with
>> these area of codes and I cannot work on them. I'll just wait for
>> Richard then.
>
> I'm back online and will post a V4 set of patches within a week. These
> will address Paul's comments as per [1] and Casey's regarding the
> documentation.
> Sorry for the delay
No worries, thanks.
--
paul moore
www.paul-moore.com
^ permalink raw reply
* Re: [PATCH 3/4] libbpf: break loop earlier
From: Eric Leblond @ 2017-12-27 20:30 UTC (permalink / raw)
To: Alexei Starovoitov; +Cc: netdev, daniel, linux-kernel
In-Reply-To: <20171227190042.hwxk6ccazdtnob77@ast-mbp>
Hello,
On Wed, 2017-12-27 at 11:00 -0800, Alexei Starovoitov wrote:
> On Wed, Dec 27, 2017 at 07:02:28PM +0100, Eric Leblond wrote:
> > Get out of the loop when we have a match.
> >
> > Signed-off-by: Eric Leblond <eric@regit.org>
> > ---
> > tools/lib/bpf/libbpf.c | 1 +
> > 1 file changed, 1 insertion(+)
> >
> > diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> > index 5fe8aaa2123e..d263748aa341 100644
> > --- a/tools/lib/bpf/libbpf.c
> > +++ b/tools/lib/bpf/libbpf.c
> > @@ -412,6 +412,7 @@ bpf_object__init_prog_names(struct bpf_object
> > *obj)
> > prog->section_name);
> > return -LIBBPF_ERRNO__LIBELF;
> > }
> > + break;
>
> why this is needed?
It was just cosmetic, no related bug.
> The top of the loop is:
> for (si = 0; si < symbols->d_size / sizeof(GElf_Sym) && !name;
>
> so as soon as name is found the loop will exit.
OK, I've missed that. Please disregard this patch.
BR,
--
Eric Leblond <eric@regit.org>
^ permalink raw reply
* Re: [PATCHv3 0/2] capability controlled user-namespaces
From: Michael Kerrisk (man-pages) @ 2017-12-27 20:23 UTC (permalink / raw)
To: Mahesh Bandewar (महेश बंडेवार)
Cc: James Morris, LKML, Netdev, Kernel-hardening, Linux API,
Kees Cook, Serge Hallyn, Eric W . Biederman, Eric Dumazet,
David Miller, Mahesh Bandewar
In-Reply-To: <CAF2d9jit74_VCdD-pEFy3bJo2W1-0cDo0BOJC5beJiy8yFPCWg@mail.gmail.com>
Hello Mahesh,
On 27 December 2017 at 18:09, Mahesh Bandewar (महेश बंडेवार)
<maheshb@google.com> wrote:
> Hello James,
>
> Seems like I missed your name to be added into the review of this
> patch series. Would you be willing to pull this into the security
> tree? Serge Hallyn has already ACKed it.
We seem to have no formal documentation/specification of this feature.
I think that should be written up before this patch goes into
mainline...
Cheers,
Michael
>
> On Tue, Dec 5, 2017 at 2:30 PM, Mahesh Bandewar <mahesh@bandewar.net> wrote:
>> From: Mahesh Bandewar <maheshb@google.com>
>>
>> TL;DR version
>> -------------
>> Creating a sandbox environment with namespaces is challenging
>> considering what these sandboxed processes can engage into. e.g.
>> CVE-2017-6074, CVE-2017-7184, CVE-2017-7308 etc. just to name a few.
>> Current form of user-namespaces, however, if changed a bit can allow
>> us to create a sandbox environment without locking down user-
>> namespaces.
>>
>> Detailed version
>> ----------------
>>
>> Problem
>> -------
>> User-namespaces in the current form have increased the attack surface as
>> any process can acquire capabilities which are not available to them (by
>> default) by performing combination of clone()/unshare()/setns() syscalls.
>>
>> #define _GNU_SOURCE
>> #include <stdio.h>
>> #include <sched.h>
>> #include <netinet/in.h>
>>
>> int main(int ac, char **av)
>> {
>> int sock = -1;
>>
>> printf("Attempting to open RAW socket before unshare()...\n");
>> sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
>> if (sock < 0) {
>> perror("socket() SOCK_RAW failed: ");
>> } else {
>> printf("Successfully opened RAW-Sock before unshare().\n");
>> close(sock);
>> sock = -1;
>> }
>>
>> if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) {
>> perror("unshare() failed: ");
>> return 1;
>> }
>>
>> printf("Attempting to open RAW socket after unshare()...\n");
>> sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
>> if (sock < 0) {
>> perror("socket() SOCK_RAW failed: ");
>> } else {
>> printf("Successfully opened RAW-Sock after unshare().\n");
>> close(sock);
>> sock = -1;
>> }
>>
>> return 0;
>> }
>>
>> The above example shows how easy it is to acquire NET_RAW capabilities
>> and once acquired, these processes could take benefit of above mentioned
>> or similar issues discovered/undiscovered with malicious intent. Note
>> that this is just an example and the problem/solution is not limited
>> to NET_RAW capability *only*.
>>
>> The easiest fix one can apply here is to lock-down user-namespaces which
>> many of the distros do (i.e. don't allow users to create user namespaces),
>> but unfortunately that prevents everyone from using them.
>>
>> Approach
>> --------
>> Introduce a notion of 'controlled' user-namespaces. Every process on
>> the host is allowed to create user-namespaces (governed by the limit
>> imposed by per-ns sysctl) however, mark user-namespaces created by
>> sandboxed processes as 'controlled'. Use this 'mark' at the time of
>> capability check in conjunction with a global capability whitelist.
>> If the capability is not whitelisted, processes that belong to
>> controlled user-namespaces will not be allowed.
>>
>> Once a user-ns is marked as 'controlled'; all its child user-
>> namespaces are marked as 'controlled' too.
>>
>> A global whitelist is list of capabilities governed by the
>> sysctl which is available to (privileged) user in init-ns to modify
>> while it's applicable to all controlled user-namespaces on the host.
>>
>> Marking user-namespaces controlled without modifying the whitelist is
>> equivalent of the current behavior. The default value of whitelist includes
>> all capabilities so that the compatibility is maintained. However it gives
>> admins fine-grained ability to control various capabilities system wide
>> without locking down user-namespaces.
>>
>> Please see individual patches in this series.
>>
>> Mahesh Bandewar (2):
>> capability: introduce sysctl for controlled user-ns capability whitelist
>> userns: control capabilities of some user namespaces
>>
>> Documentation/sysctl/kernel.txt | 21 +++++++++++++++++
>> include/linux/capability.h | 7 ++++++
>> include/linux/user_namespace.h | 25 ++++++++++++++++++++
>> kernel/capability.c | 52 +++++++++++++++++++++++++++++++++++++++++
>> kernel/sysctl.c | 5 ++++
>> kernel/user_namespace.c | 4 ++++
>> security/commoncap.c | 8 +++++++
>> 7 files changed, 122 insertions(+)
>>
>> --
>> 2.15.0.531.g2ccb3012c9-goog
>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-api" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/
^ permalink raw reply
* Re: [patch net-next v2 00/10] Add support for resource abstraction
From: David Ahern @ 2017-12-27 20:16 UTC (permalink / raw)
To: Andrew Lunn, Jiri Pirko
Cc: netdev, davem, arkadis, mlxsw, vivien.didelot, f.fainelli,
michael.chan, ganeshgr, saeedm, matanb, leonro, idosch,
jakub.kicinski, ast, daniel, simon.horman, pieter.jansenvanvuuren,
john.hurley, alexander.h.duyck, linville, gospo, steven.lin1,
yuvalm, ogerlitz, roopa
In-Reply-To: <20171227193110.GA5494@lunn.ch>
On 12/27/17 1:31 PM, Andrew Lunn wrote:
>> Hmm. That documents mainly sysfs. No mention of Netlink at all. But
>> maybe I missed it. Also, that defines the interface as is. However we
>> are talking about the data exchanged over the interface, not the
>> interface itself. I don't see how ASIC/HW specific thing, like for
>> example KVD in our case could be part of kernel ABI.
>
> You need to be very careful here. As soon as somebody starts using it,
> it might become an ABI. Or you need to clearly document it is not ABI,
> there is no guarantee it will not disappear or change its meaning in
> the next kernel, and it should be used with extreme caution.
>
+1
Once the names go in, people can write scripts that invoke devlink at
boot to partition resources. With the proposed patch set, the name
(e.g., kvd/linear) becomes part of the ABI.
^ permalink raw reply
* Re: [patch net-next v2 00/10] Add support for resource abstraction
From: Arkadi Sharshevsky @ 2017-12-27 20:15 UTC (permalink / raw)
To: David Ahern, Jiri Pirko
Cc: netdev, davem, mlxsw, andrew, vivien.didelot, f.fainelli,
michael.chan, ganeshgr, saeedm, matanb, leonro, idosch,
jakub.kicinski, ast, daniel, simon.horman, pieter.jansenvanvuuren,
john.hurley, alexander.h.duyck, linville, gospo, steven.lin1,
yuvalm, ogerlitz, roopa
In-Reply-To: <ae70d810-8277-899b-b2a9-6b2dbdd5eb21@cumulusnetworks.com>
On 12/27/2017 06:34 PM, David Ahern wrote:
> On 12/27/17 2:09 AM, Jiri Pirko wrote:
>> Wed, Dec 27, 2017 at 05:05:09AM CET, dsa@cumulusnetworks.com wrote:
>>> On 12/26/17 5:23 AM, Jiri Pirko wrote:
>>>> From: Jiri Pirko <jiri@mellanox.com>
>>>>
>>>> Many of the ASIC's internal resources are limited and are shared between
>>>> several hardware procedures. For example, unified hash-based memory can
>>>> be used for many lookup purposes, like FDB and LPM. In many cases the user
>>>> can provide a partitioning scheme for such a resource in order to perform
>>>> fine tuning for his application. In such cases performing driver reload is
>>>> needed for the changes to take place, thus this patchset also adds support
>>>> for hot reload.
>>>>
>>>> Such an abstraction can be coupled with devlink's dpipe interface, which
>>>> models the ASIC's pipeline as a graph of match/action tables. By modeling
>>>> the hardware resource object, and by coupling it to several dpipe tables,
>>>> further visibility can be achieved in order to debug ASIC-wide issues.
>>>>
>>>> The proposed interface will provide the user the ability to understand the
>>>> limitations of the hardware, and receive notification regarding its occupancy.
>>>> Furthermore, monitoring the resource occupancy can be done in real-time and
>>>> can be useful in many cases.
>>>
>>> In the last RFC (not v1, but RFC) I asked for some kind of description
>>> for each resource, and you and Arkadi have pushed back. Let's walk
>>> through an example to see what I mean:
>>>
>>> $ devlink resource show pci/0000:03:00.0
>>> pci/0000:03:00.0:
>>> name kvd size 245760 size_valid true
>>> resources:
>>> name linear size 98304 occ 0
>>> name hash_double size 60416
>>> name hash_single size 87040
>>>
>>> So this 2700 has 3 resources that can be managed -- some table or
>>> resource or something named 'kvd' with linear, hash_double and
>>> hash_single sub-resources. What are these names referring to? The above
>>> output gives no description, and 'kvd' is not an industry term. Further,
>>
>> These are internal resources specific to the ASIC. Would you like some
>> description to each or something like that?
>
> devlink has some nice self-documenting capabilities. What's missing here
> is a description of what the resource is used for in standard terms --
> ipv4 host routes, fdb, nexthops, rifs, etc. Even if the description is a
> short list versus an exhaustive list of everything it is used for. e.g.,
> Why would a user decrease linear and increase hash_single or vice versa?
>
>>
>>
>>> what are these sizes that a user can control? The output contains no
>>> units, no description, nothing. In short, the above output provides
>>> random numbers associated with random names.
>>
>> Units are now exposed from kernel, just this version of iproute2 patch
>> does not display it.
>
> please provide an iproute2 patch that does so the full context of this
> patch set can be reviewed from a user perspective.
>
>>
>>
>>>
>>> I can see dpipe tables exported by this device:
>>>
>>> $ devlink dpipe header show pci/0000:03:00.0
>>>
>>> pci/0000:03:00.0:
>>> name mlxsw_meta
>>> field:
>>> name erif_port bitwidth 32 mapping_type ifindex
>>> name l3_forward bitwidth 1
>>> name l3_drop bitwidth 1
>>> name adj_index bitwidth 32
>>> name adj_size bitwidth 32
>>> name adj_hash_index bitwidth 32
>>>
>>> name ipv6
>>> field:
>>> name destination ip bitwidth 128
>>>
>>> name ipv4
>>> field:
>>> name destination ip bitwidth 32
>>>
>>> name ethernet
>>> field:
>>> name destination mac bitwidth 48
>>>
>>> but none mention 'kvd' or 'linear' or 'hash' and none of the other
>>> various devlink options:
>>>
>>> $ devlink
>>> Usage: devlink [ OPTIONS ] OBJECT { COMMAND | help }
>>> where OBJECT := { dev | port | sb | monitor | dpipe }
>>>
>>> seem to be related to resources.
>>>
>>> So how does a user know what they are controlling by this 'resource'
>>> option? Is the user expected to have a PRM or user guide on hand for the
>>> specific device model that is being configured?
>>
>> The relation of specific dpipe table to specific resource is exposed by
>> the kernel as well. Probably the iproute2 patch just does not display
>> it.
>
> please provide an iproute2 patch that does so the full context of this
> patch set can be reviewed from a user perspective.
>
As Yuval stated you are using the wrong command here.
You are printing the headers not the tables. On each dpipe
table you can see the resource it is using (the resource
path aka host table uses /kvd/hash_single for example).
This is already working. Just try it.
>>
>>
>>>
>>> Again, I have no objections to kvd, linear, hash, etc terms as they do
>>> relate to Mellanox products. But kvd/linear, for example, does correlate
>>> to industry standard concepts in some way. My request is that the
>>> resource listing guide the user in some way, stating what these
>>> resources mean.
>>
>> So the showed relation to dpipe table would be enough or you would still
>> like to see some description? I don't like the description concept here
>> as the relations to dpipe table should tell user exactly what he needs
>> to know.
>
> I believe it is useful to have a 1-line, short description that gives
> the user some memory jogger as to what the resource is used for. It does
> not have to be an exhaustive list, but the user should not have to do
> mental jumping jacks running a bunch of commands to understand the
> resources for vendor specific asics.
>
^ permalink raw reply
* Re: WARNING in strp_data_ready
From: Dmitry Vyukov @ 2017-12-27 20:14 UTC (permalink / raw)
To: Ozgur
Cc: Tom Herbert, John Fastabend, syzbot, David S. Miller,
Eric Biggers, LKML, Linux Kernel Network Developers,
syzkaller-bugs@googlegroups.com, Tom Herbert, Cong Wang
In-Reply-To: <1164631514405294@web52j.yandex.ru>
On Wed, Dec 27, 2017 at 9:08 PM, Ozgur <ozgur@goosey.org> wrote:
>
>
> 27.12.2017, 22:21, "Dmitry Vyukov" <dvyukov@google.com>:
>> On Wed, Dec 27, 2017 at 8:09 PM, Tom Herbert <tom@herbertland.com> wrote:
>>> Did you try the patch I posted?
>>
>> Hi Tom,
>
> Hello Dmitry,
>
>> No. And I didn't know I need to. Why?
>> If you think the patch needs additional testing, you can ask syzbot to
>> test it. See https://github.com/google/syzkaller/blob/master/docs/syzbot.md#communication-with-syzbot
>> Otherwise proceed with committing it. Or what are we waiting for?
>>
>> Thanks
>
> I think we need to fixed patch for crash, in fact check to patch code and test solve the bug.
> How do test it because there is no patch in the following bug?
Hi Ozgur,
I am not sure I completely understand what you mean. But the
reproducer for this bug (which one can use for testing) is here:
https://groups.google.com/forum/#!topic/syzkaller-bugs/Kxs05ziCpgY
Tom also mentions there is some patch for this, but I don't know where
it is, it doesn't seem to be referenced from this thread.
> The fix patch should be for this net/kcm/kcmsock.c file and lock functions must be added calling sk_data_ready ().
> Regards
>
> Ozgur
>
>>> On Wed, Dec 27, 2017 at 10:25 AM, Dmitry Vyukov <dvyukov@google.com> wrote:
>>>> On Wed, Dec 6, 2017 at 4:44 PM, Dmitry Vyukov <dvyukov@google.com> wrote:
>>>>>> <john.fastabend@gmail.com> wrote:
>>>>>>> On 10/24/2017 08:20 AM, syzbot wrote:
>>>>>>>> Hello,
>>>>>>>>
>>>>>>>> syzkaller hit the following crash on 73d3393ada4f70fa3df5639c8d438f2f034c0ecb
>>>>>>>> git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/master
>>>>>>>> compiler: gcc (GCC) 7.1.1 20170620
>>>>>>>> .config is attached
>>>>>>>> Raw console output is attached.
>>>>>>>> C reproducer is attached
>>>>>>>> syzkaller reproducer is attached. See https://goo.gl/kgGztJ
>>>>>>>> for information about syzkaller reproducers
>>>>>>>>
>>>>>>>> WARNING: CPU: 0 PID: 2996 at ./include/net/sock.h:1505 sock_owned_by_me include/net/sock.h:1505 [inline]
>>>>>>>> WARNING: CPU: 0 PID: 2996 at ./include/net/sock.h:1505 sock_owned_by_user include/net/sock.h:1511 [inline]
>>>>>>>> WARNING: CPU: 0 PID: 2996 at ./include/net/sock.h:1505 strp_data_ready+0x2b7/0x390 net/strparser/strparser.c:404
>>>>>>>> Kernel panic - not syncing: panic_on_warn set ...
>>>>>>>>
>>>>>>>> CPU: 0 PID: 2996 Comm: syzkaller142210 Not tainted 4.14.0-rc5+ #138
>>>>>>>> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
>>>>>>>> Call Trace:
>>>>>>>> <IRQ>
>>>>>>>> __dump_stack lib/dump_stack.c:16 [inline]
>>>>>>>> dump_stack+0x194/0x257 lib/dump_stack.c:52
>>>>>>>> panic+0x1e4/0x417 kernel/panic.c:181
>>>>>>>> __warn+0x1c4/0x1d9 kernel/panic.c:542
>>>>>>>> report_bug+0x211/0x2d0 lib/bug.c:183
>>>>>>>> fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:178
>>>>>>>> do_trap_no_signal arch/x86/kernel/traps.c:212 [inline]
>>>>>>>> do_trap+0x260/0x390 arch/x86/kernel/traps.c:261
>>>>>>>> do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:298
>>>>>>>> do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:311
>>>>>>>> invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905
>>>>>>>> RIP: 0010:sock_owned_by_me include/net/sock.h:1505 [inline]
>>>>>>>> RIP: 0010:sock_owned_by_user include/net/sock.h:1511 [inline]
>>>>>>>> RIP: 0010:strp_data_ready+0x2b7/0x390 net/strparser/strparser.c:404
>>>>>>>> RSP: 0018:ffff8801db206b18 EFLAGS: 00010206
>>>>>>>> RAX: ffff8801d1e02080 RBX: ffff8801dad74c48 RCX: 0000000000000000
>>>>>>>> RDX: 0000000000000100 RSI: ffff8801d29fa0a0 RDI: ffffffff85cbede0
>>>>>>>> RBP: ffff8801db206b38 R08: 0000000000000005 R09: 1ffffffff0ce0bcd
>>>>>>>> R10: ffff8801db206a00 R11: dffffc0000000000 R12: ffff8801d29fa000
>>>>>>>> R13: ffff8801dad74c50 R14: ffff8801d4350a92 R15: 0000000000000001
>>>>>>>> psock_data_ready+0x56/0x70 net/kcm/kcmsock.c:353
>>>>>>>
>>>>>>> Looks like KCM is calling sk_data_ready() without first taking the
>>>>>>> sock lock.
>>>>>>>
>>>>>>> /* Called with lower sock held */
>>>>>>> static void kcm_rcv_strparser(struct strparser *strp, struct sk_buff *skb)
>>>>>>> {
>>>>>>> [...]
>>>>>>> if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
>>>>>>>
>>>>>>> In this case kcm->sk is not the same lock the comment is referring to.
>>>>>>> And kcm_queue_rcv_skb() will eventually call sk_data_ready().
>>>>>>>
>>>>>>> @Tom, how about wrapping the sk_data_ready call in {lock|release}_sock?
>>>>>>> I don't have anything better in mind immediately.
>>>>>> The sock locks are taken in reverse order in the send path, so
>>>>>> grabbing the kcm sock lock with the lower lock held to call sk_data_ready
>>>>>> may lead to deadlock, I think.
>>>>>>
>>>>>> It might be possible to change the order in the send path to do this.
>>>>>> Something like:
>>>>>>
>>>>>> trylock on lower socket lock
>>>>>> -if trylock fails
>>>>>> - release kcm sock lock
>>>>>> - lock lower sock
>>>>>> - lock kcm sock
>>>>>> - call sendpage locked function
>>>>>>
>>>>>> I admit that dealing with two levels of socket locks in the data path
>>>>>> is quite a pain :-)
>>>>>
>>>>> up
>>>>>
>>>>> still happening and we've lost 50K+ test VMs on this
>>>>
>>>> up
>>>>
>>>> Still happening, and the number of crashes has crossed 60K. Can we do
>>>> something about this, please?
^ permalink raw reply
* Re: [patch iproute2 v3 3/4] tc: Add -bs option to batch mode
From: Marcelo Ricardo Leitner @ 2017-12-27 19:56 UTC (permalink / raw)
To: Chris Mi; +Cc: netdev, gerlitz.or, stephen, dsahern
In-Reply-To: <20171225084658.24076-4-chrism@mellanox.com>
On Mon, Dec 25, 2017 at 05:46:57PM +0900, Chris Mi wrote:
> @@ -267,6 +287,7 @@ int main(int argc, char **argv)
> {
> int ret;
> char *batch_file = NULL;
> + int batch_size = 1;
>
> while (argc > 1) {
> if (argv[1][0] != '-')
> @@ -297,6 +318,14 @@ int main(int argc, char **argv)
> if (argc <= 1)
> usage();
> batch_file = argv[1];
> + } else if (matches(argv[1], "-batchsize") == 0 ||
> + matches(argv[1], "-bs") == 0) {
> + argc--; argv++;
> + if (argc <= 1)
> + usage();
> + batch_size = atoi(argv[1]);
> + if (batch_size > MSG_IOV_MAX)
> + batch_size = MSG_IOV_MAX;
what about
if (batch_size < 1)
batch_size = 1;
> } else if (matches(argv[1], "-netns") == 0) {
> NEXT_ARG();
> if (netns_switch(argv[1]))
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox