* [PATCH V4 net-next 08/10] net: hns3: add interrupt affinity support for misc interrupt
From: Huazhong Tan @ 2019-07-29 2:53 UTC (permalink / raw)
To: davem
Cc: netdev, linux-kernel, salil.mehta, yisen.zhuang, linuxarm, saeedm,
Yunsheng Lin, Peng Li, Huazhong Tan
In-Reply-To: <1564368811-65492-1-git-send-email-tanhuazhong@huawei.com>
From: Yunsheng Lin <linyunsheng@huawei.com>
The misc interrupt is used to schedule the reset and mailbox
subtask, and service_task delayed_work is used to do periodic
management work each second.
This patch sets the affinity of the above three subtasks using the
misc interrupt's affinity.
This patch also sets up an affinity notifier for the misc interrupt to
allow the user to change the affinity of the above three subtasks.
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
---
.../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 53 ++++++++++++++++++++--
.../ethernet/hisilicon/hns3/hns3pf/hclge_main.h | 4 ++
2 files changed, 53 insertions(+), 4 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 13c9697..30a7074 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -1270,6 +1270,12 @@ static int hclge_configure(struct hclge_dev *hdev)
hclge_init_kdump_kernel_config(hdev);
+ /* Set the init affinity based on pci func number */
+ i = cpumask_weight(cpumask_of_node(dev_to_node(&hdev->pdev->dev)));
+ i = i ? PCI_FUNC(hdev->pdev->devfn) % i : 0;
+ cpumask_set_cpu(cpumask_local_spread(i, dev_to_node(&hdev->pdev->dev)),
+ &hdev->affinity_mask);
+
return ret;
}
@@ -2499,14 +2505,16 @@ static void hclge_mbx_task_schedule(struct hclge_dev *hdev)
{
if (!test_bit(HCLGE_STATE_CMD_DISABLE, &hdev->state) &&
!test_and_set_bit(HCLGE_STATE_MBX_SERVICE_SCHED, &hdev->state))
- schedule_work(&hdev->mbx_service_task);
+ queue_work_on(cpumask_first(&hdev->affinity_mask), system_wq,
+ &hdev->mbx_service_task);
}
static void hclge_reset_task_schedule(struct hclge_dev *hdev)
{
if (!test_bit(HCLGE_STATE_REMOVING, &hdev->state) &&
!test_and_set_bit(HCLGE_STATE_RST_SERVICE_SCHED, &hdev->state))
- schedule_work(&hdev->rst_service_task);
+ queue_work_on(cpumask_first(&hdev->affinity_mask), system_wq,
+ &hdev->rst_service_task);
}
static void hclge_task_schedule(struct hclge_dev *hdev)
@@ -2516,8 +2524,9 @@ static void hclge_task_schedule(struct hclge_dev *hdev)
!test_and_set_bit(HCLGE_STATE_SERVICE_SCHED, &hdev->state)) {
hdev->hw_stats.stats_timer++;
hdev->fd_arfs_expire_timer++;
- mod_delayed_work(system_wq, &hdev->service_task,
- round_jiffies_relative(HZ));
+ mod_delayed_work_on(cpumask_first(&hdev->affinity_mask),
+ system_wq, &hdev->service_task,
+ round_jiffies_relative(HZ));
}
}
@@ -2903,6 +2912,36 @@ static void hclge_get_misc_vector(struct hclge_dev *hdev)
hdev->num_msi_used += 1;
}
+static void hclge_irq_affinity_notify(struct irq_affinity_notify *notify,
+ const cpumask_t *mask)
+{
+ struct hclge_dev *hdev = container_of(notify, struct hclge_dev,
+ affinity_notify);
+
+ cpumask_copy(&hdev->affinity_mask, mask);
+}
+
+static void hclge_irq_affinity_release(struct kref *ref)
+{
+}
+
+static void hclge_misc_affinity_setup(struct hclge_dev *hdev)
+{
+ irq_set_affinity_hint(hdev->misc_vector.vector_irq,
+ &hdev->affinity_mask);
+
+ hdev->affinity_notify.notify = hclge_irq_affinity_notify;
+ hdev->affinity_notify.release = hclge_irq_affinity_release;
+ irq_set_affinity_notifier(hdev->misc_vector.vector_irq,
+ &hdev->affinity_notify);
+}
+
+static void hclge_misc_affinity_teardown(struct hclge_dev *hdev)
+{
+ irq_set_affinity_notifier(hdev->misc_vector.vector_irq, NULL);
+ irq_set_affinity_hint(hdev->misc_vector.vector_irq, NULL);
+}
+
static int hclge_misc_irq_init(struct hclge_dev *hdev)
{
int ret;
@@ -8794,6 +8833,11 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
INIT_WORK(&hdev->rst_service_task, hclge_reset_service_task);
INIT_WORK(&hdev->mbx_service_task, hclge_mailbox_service_task);
+ /* Setup affinity after service timer setup because add_timer_on
+ * is called in affinity notify.
+ */
+ hclge_misc_affinity_setup(hdev);
+
hclge_clear_all_event_cause(hdev);
hclge_clear_resetting_state(hdev);
@@ -8955,6 +8999,7 @@ static void hclge_uninit_ae_dev(struct hnae3_ae_dev *ae_dev)
struct hclge_dev *hdev = ae_dev->priv;
struct hclge_mac *mac = &hdev->hw.mac;
+ hclge_misc_affinity_teardown(hdev);
hclge_state_uninit(hdev);
if (mac->phydev)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index dde8f22..688e425 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -863,6 +863,10 @@ struct hclge_dev {
DECLARE_KFIFO(mac_tnl_log, struct hclge_mac_tnl_stats,
HCLGE_MAC_TNL_LOG_SIZE);
+
+ /* affinity mask and notify for misc interrupt */
+ cpumask_t affinity_mask;
+ struct irq_affinity_notify affinity_notify;
};
/* VPort level vlan tag configuration for TX direction */
--
2.7.4
^ permalink raw reply related
* [PATCH V4 net-next 04/10] net: hns3: change GFP flag during lock period
From: Huazhong Tan @ 2019-07-29 2:53 UTC (permalink / raw)
To: davem
Cc: netdev, linux-kernel, salil.mehta, yisen.zhuang, linuxarm, saeedm,
Yufeng Mo, lipeng 00277521, Huazhong Tan
In-Reply-To: <1564368811-65492-1-git-send-email-tanhuazhong@huawei.com>
From: Yufeng Mo <moyufeng@huawei.com>
When allocating memory, GFP_KERNEL cannot be used while a spin_lock is
held, because the allocation may cause scheduling while holding the
spin_lock. This patch changes the GFP flag to GFP_ATOMIC in this case.
Fixes: dd74f815dd41 ("net: hns3: Add support for rule add/delete for flow director")
Signed-off-by: Yufeng Mo <moyufeng@huawei.com>
Signed-off-by: lipeng 00277521 <lipeng321@huawei.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
---
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 3c64d70..14199c4 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -5796,7 +5796,7 @@ static int hclge_add_fd_entry_by_arfs(struct hnae3_handle *handle, u16 queue_id,
return -ENOSPC;
}
- rule = kzalloc(sizeof(*rule), GFP_KERNEL);
+ rule = kzalloc(sizeof(*rule), GFP_ATOMIC);
if (!rule) {
spin_unlock_bh(&hdev->fd_rule_lock);
--
2.7.4
^ permalink raw reply related
* [PATCH net] net: ipv6: Fix a bug in ndisc_send_ns when netdev only has a global address
From: Su Yanjun @ 2019-07-29 2:49 UTC (permalink / raw)
To: davem, kuznet, yoshfuji; +Cc: netdev, linux-kernel, suyj.fnst
When we send MPLS packets and the interface only has a
manually configured global IPv6 address, the two hosts can't communicate.
I found that ndisc_send_ns only tries to get a link-local (ll) address.
In my case, the execution path is as below.
ip6_output
->ip6_finish_output
->lwtunnel_xmit
->mpls_xmit
->neigh_resolve_output
->neigh_probe
->ndisc_solicit
->ndisc_send_ns
In RFC4861, 7.2.2 says
"If the source address of the packet prompting the solicitation is the
same as one of the addresses assigned to the outgoing interface, that
address SHOULD be placed in the IP Source Address of the outgoing
solicitation. Otherwise, any one of the addresses assigned to the
interface should be used."
In this patch we try to get a global address if getting the ll address fails.
Signed-off-by: Su Yanjun <suyj.fnst@cn.fujitsu.com>
---
include/net/addrconf.h | 4 ++++
net/ipv6/addrconf.c | 34 ++++++++++++++++++++++++++++++++++
net/ipv6/ndisc.c | 8 ++++++--
3 files changed, 44 insertions(+), 2 deletions(-)
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index becdad5..006db8e 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -107,6 +107,10 @@ int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr,
u32 banned_flags);
int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
u32 banned_flags);
+int __ipv6_get_addr(struct inet6_dev *idev, struct in6_addr *addr,
+ u32 banned_flags);
+int ipv6_get_addr(struct net_device *dev, struct in6_addr *addr,
+ u32 banned_flags);
bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
bool match_wildcard);
bool inet_rcv_saddr_any(const struct sock *sk);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 521e320..4c0a43f 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1870,6 +1870,40 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
return err;
}
+int __ipv6_get_addr(struct inet6_dev *idev, struct in6_addr *addr,
+ u32 banned_flags)
+{
+ struct inet6_ifaddr *ifp;
+ int err = -EADDRNOTAVAIL;
+
+ list_for_each_entry_reverse(ifp, &idev->addr_list, if_list) {
+ if (ifp->scope == 0 &&
+ !(ifp->flags & banned_flags)) {
+ *addr = ifp->addr;
+ err = 0;
+ break;
+ }
+ }
+ return err;
+}
+
+int ipv6_get_addr(struct net_device *dev, struct in6_addr *addr,
+ u32 banned_flags)
+{
+ struct inet6_dev *idev;
+ int err = -EADDRNOTAVAIL;
+
+ rcu_read_lock();
+ idev = __in6_dev_get(dev);
+ if (idev) {
+ read_lock_bh(&idev->lock);
+ err = __ipv6_get_addr(idev, addr, banned_flags);
+ read_unlock_bh(&idev->lock);
+ }
+ rcu_read_unlock();
+ return err;
+}
+
static int ipv6_count_addresses(const struct inet6_dev *idev)
{
const struct inet6_ifaddr *ifp;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 083cc1c..18ac2fb 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -606,8 +606,12 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
if (!saddr) {
if (ipv6_get_lladdr(dev, &addr_buf,
- (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)))
- return;
+ (IFA_F_TENTATIVE | IFA_F_OPTIMISTIC))) {
+ /* try global address */
+ if (ipv6_get_addr(dev, &addr_buf,
+ (IFA_F_TENTATIVE | IFA_F_OPTIMISTIC)))
+ return;
+ }
saddr = &addr_buf;
}
--
2.7.4
^ permalink raw reply related
* Re: [PATCH net-next v4 2/3] flow_offload: Support get default block from tc immediately
From: wenxu @ 2019-07-29 2:43 UTC (permalink / raw)
To: Jakub Kicinski; +Cc: pablo, fw, netfilter-devel, netdev
In-Reply-To: <20190728131653.6af72a87@cakuba.netronome.com>
On 7/29/2019 4:16 AM, Jakub Kicinski wrote:
> .
> The TC default block is there because the indirect registration may
> happen _after_ the block is installed and populated. It's the device
> driver that usually does the indirect registration, the tunnel device
> and its rules may already be set when device driver is loaded or
> reloaded.
Yes, I know this scenario.
> I don't know the nft code, but it seems unlikely it wouldn't have the
> same problem/need..
nft doesn't have the same problem. The offload rule can only be attached to an offload base chain.
The offload base chain is created after the device driver is loaded (the device exists).
>
^ permalink raw reply
* [PATCH] net: sched: Fix a possible null-pointer dereference in dequeue_func()
From: Jia-Ju Bai @ 2019-07-29 2:21 UTC (permalink / raw)
To: jhs, xiyou.wangcong, jiri, davem; +Cc: netdev, linux-kernel, Jia-Ju Bai
In dequeue_func(), there is an if statement on line 74 to check whether
skb is NULL:
if (skb)
When skb is NULL, it is used on line 77:
prefetch(&skb->end);
Thus, a possible null-pointer dereference may occur.
To fix this bug, skb->end is used only when skb is not NULL.
This bug is found by a static analysis tool STCheck written by us.
Signed-off-by: Jia-Ju Bai <baijiaju1990@gmail.com>
---
net/sched/sch_codel.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index 25ef172c23df..30169b3adbbb 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -71,10 +71,10 @@ static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx)
struct Qdisc *sch = ctx;
struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
- if (skb)
+ if (skb) {
sch->qstats.backlog -= qdisc_pkt_len(skb);
-
- prefetch(&skb->end); /* we'll need skb_shinfo() */
+ prefetch(&skb->end); /* we'll need skb_shinfo() */
+ }
return skb;
}
--
2.17.0
^ permalink raw reply related
* Re: [PATCH 4.4 stable net] net: tcp: Fix use-after-free in tcp_write_xmit
From: maowenan @ 2019-07-29 1:26 UTC (permalink / raw)
To: Greg KH; +Cc: stable, davem, netdev, linux-kernel
In-Reply-To: <20190727114001.GA6685@kroah.com>
On 2019/7/27 19:40, Greg KH wrote:
> On Sat, Jul 27, 2019 at 07:22:30PM +0800, maowenan wrote:
>>
>>
>> On 2019/7/27 18:44, maowenan wrote:
>>>
>>>
>>> On 2019/7/24 20:13, maowenan wrote:
>>>>
>>>>
>>>> On 2019/7/24 19:05, Greg KH wrote:
>>>>> On Wed, Jul 24, 2019 at 05:17:15PM +0800, Mao Wenan wrote:
>>>>>> There is one report about tcp_write_xmit use-after-free with version 4.4.136:
>>>>>>
>>>>>> BUG: KASAN: use-after-free in tcp_skb_pcount include/net/tcp.h:796 [inline]
>>>>>> BUG: KASAN: use-after-free in tcp_init_tso_segs net/ipv4/tcp_output.c:1619 [inline]
>>>>>> BUG: KASAN: use-after-free in tcp_write_xmit+0x3fc2/0x4cb0 net/ipv4/tcp_output.c:2056
>>>>>> Read of size 2 at addr ffff8801d6fc87b0 by task syz-executor408/4195
>>>>>>
>>>>>> CPU: 0 PID: 4195 Comm: syz-executor408 Not tainted 4.4.136-gfb7e319 #59
>>>>>> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
>>>>>> 0000000000000000 7d8f38ecc03be946 ffff8801d73b7710 ffffffff81e0edad
>>>>>> ffffea00075bf200 ffff8801d6fc87b0 0000000000000000 ffff8801d6fc87b0
>>>>>> dffffc0000000000 ffff8801d73b7748 ffffffff815159b6 ffff8801d6fc87b0
>>>>>> Call Trace:
>>>>>> [<ffffffff81e0edad>] __dump_stack lib/dump_stack.c:15 [inline]
>>>>>> [<ffffffff81e0edad>] dump_stack+0xc1/0x124 lib/dump_stack.c:51
>>>>>> [<ffffffff815159b6>] print_address_description+0x6c/0x216 mm/kasan/report.c:252
>>>>>> [<ffffffff81515cd5>] kasan_report_error mm/kasan/report.c:351 [inline]
>>>>>> [<ffffffff81515cd5>] kasan_report.cold.7+0x175/0x2f7 mm/kasan/report.c:408
>>>>>> [<ffffffff814f9784>] __asan_report_load2_noabort+0x14/0x20 mm/kasan/report.c:427
>>>>>> [<ffffffff83286582>] tcp_skb_pcount include/net/tcp.h:796 [inline]
>>>>>> [<ffffffff83286582>] tcp_init_tso_segs net/ipv4/tcp_output.c:1619 [inline]
>>>>>> [<ffffffff83286582>] tcp_write_xmit+0x3fc2/0x4cb0 net/ipv4/tcp_output.c:2056
>>>>>> [<ffffffff83287a40>] __tcp_push_pending_frames+0xa0/0x290 net/ipv4/tcp_output.c:2307
>>>>>> [<ffffffff8328e966>] tcp_send_fin+0x176/0xab0 net/ipv4/tcp_output.c:2883
>>>>>> [<ffffffff8324c0d0>] tcp_close+0xca0/0xf70 net/ipv4/tcp.c:2112
>>>>>> [<ffffffff832f8d0f>] inet_release+0xff/0x1d0 net/ipv4/af_inet.c:435
>>>>>> [<ffffffff82f1a156>] sock_release+0x96/0x1c0 net/socket.c:586
>>>>>> [<ffffffff82f1a296>] sock_close+0x16/0x20 net/socket.c:1037
>>>>>> [<ffffffff81522da5>] __fput+0x235/0x6f0 fs/file_table.c:208
>>>>>> [<ffffffff815232e5>] ____fput+0x15/0x20 fs/file_table.c:244
>>>>>> [<ffffffff8118bd7f>] task_work_run+0x10f/0x190 kernel/task_work.c:115
>>>>>> [<ffffffff81135285>] exit_task_work include/linux/task_work.h:21 [inline]
>>>>>> [<ffffffff81135285>] do_exit+0x9e5/0x26b0 kernel/exit.c:759
>>>>>> [<ffffffff8113b1d1>] do_group_exit+0x111/0x330 kernel/exit.c:889
>>>>>> [<ffffffff8115e5cc>] get_signal+0x4ec/0x14b0 kernel/signal.c:2321
>>>>>> [<ffffffff8100e02b>] do_signal+0x8b/0x1d30 arch/x86/kernel/signal.c:712
>>>>>> [<ffffffff8100360a>] exit_to_usermode_loop+0x11a/0x160 arch/x86/entry/common.c:248
>>>>>> [<ffffffff81006535>] prepare_exit_to_usermode arch/x86/entry/common.c:283 [inline]
>>>>>> [<ffffffff81006535>] syscall_return_slowpath+0x1b5/0x1f0 arch/x86/entry/common.c:348
>>>>>> [<ffffffff838c29b5>] int_ret_from_sys_call+0x25/0xa3
>>>>>>
>>>>>> Allocated by task 4194:
>>>>>> [<ffffffff810341d6>] save_stack_trace+0x26/0x50 arch/x86/kernel/stacktrace.c:63
>>>>>> [<ffffffff814f8873>] save_stack+0x43/0xd0 mm/kasan/kasan.c:512
>>>>>> [<ffffffff814f8b57>] set_track mm/kasan/kasan.c:524 [inline]
>>>>>> [<ffffffff814f8b57>] kasan_kmalloc+0xc7/0xe0 mm/kasan/kasan.c:616
>>>>>> [<ffffffff814f9122>] kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:554
>>>>>> [<ffffffff814f4c1e>] slab_post_alloc_hook mm/slub.c:1349 [inline]
>>>>>> [<ffffffff814f4c1e>] slab_alloc_node mm/slub.c:2615 [inline]
>>>>>> [<ffffffff814f4c1e>] slab_alloc mm/slub.c:2623 [inline]
>>>>>> [<ffffffff814f4c1e>] kmem_cache_alloc+0xbe/0x2a0 mm/slub.c:2628
>>>>>> [<ffffffff82f380a6>] kmem_cache_alloc_node include/linux/slab.h:350 [inline]
>>>>>> [<ffffffff82f380a6>] __alloc_skb+0xe6/0x600 net/core/skbuff.c:218
>>>>>> [<ffffffff832466c3>] alloc_skb_fclone include/linux/skbuff.h:856 [inline]
>>>>>> [<ffffffff832466c3>] sk_stream_alloc_skb+0xa3/0x5d0 net/ipv4/tcp.c:833
>>>>>> [<ffffffff83249164>] tcp_sendmsg+0xd34/0x2b00 net/ipv4/tcp.c:1178
>>>>>> [<ffffffff83300ef3>] inet_sendmsg+0x203/0x4d0 net/ipv4/af_inet.c:755
>>>>>> [<ffffffff82f1e1fc>] sock_sendmsg_nosec net/socket.c:625 [inline]
>>>>>> [<ffffffff82f1e1fc>] sock_sendmsg+0xcc/0x110 net/socket.c:635
>>>>>> [<ffffffff82f1eedc>] SYSC_sendto+0x21c/0x370 net/socket.c:1665
>>>>>> [<ffffffff82f21560>] SyS_sendto+0x40/0x50 net/socket.c:1633
>>>>>> [<ffffffff838c2825>] entry_SYSCALL_64_fastpath+0x22/0x9e
>>>>>>
>>>>>> Freed by task 4194:
>>>>>> [<ffffffff810341d6>] save_stack_trace+0x26/0x50 arch/x86/kernel/stacktrace.c:63
>>>>>> [<ffffffff814f8873>] save_stack+0x43/0xd0 mm/kasan/kasan.c:512
>>>>>> [<ffffffff814f91a2>] set_track mm/kasan/kasan.c:524 [inline]
>>>>>> [<ffffffff814f91a2>] kasan_slab_free+0x72/0xc0 mm/kasan/kasan.c:589
>>>>>> [<ffffffff814f632e>] slab_free_hook mm/slub.c:1383 [inline]
>>>>>> [<ffffffff814f632e>] slab_free_freelist_hook mm/slub.c:1405 [inline]
>>>>>> [<ffffffff814f632e>] slab_free mm/slub.c:2859 [inline]
>>>>>> [<ffffffff814f632e>] kmem_cache_free+0xbe/0x340 mm/slub.c:2881
>>>>>> [<ffffffff82f3527f>] kfree_skbmem+0xcf/0x100 net/core/skbuff.c:635
>>>>>> [<ffffffff82f372fd>] __kfree_skb+0x1d/0x20 net/core/skbuff.c:676
>>>>>> [<ffffffff83288834>] sk_wmem_free_skb include/net/sock.h:1447 [inline]
>>>>>> [<ffffffff83288834>] tcp_write_queue_purge include/net/tcp.h:1460 [inline]
>>>>>> [<ffffffff83288834>] tcp_connect_init net/ipv4/tcp_output.c:3122 [inline]
>>>>>> [<ffffffff83288834>] tcp_connect+0xb24/0x30c0 net/ipv4/tcp_output.c:3261
>>>>>> [<ffffffff8329b991>] tcp_v4_connect+0xf31/0x1890 net/ipv4/tcp_ipv4.c:246
>>>>>> [<ffffffff832f9ca9>] __inet_stream_connect+0x2a9/0xc30 net/ipv4/af_inet.c:615
>>>>>> [<ffffffff832fa685>] inet_stream_connect+0x55/0xa0 net/ipv4/af_inet.c:676
>>>>>> [<ffffffff82f1eb78>] SYSC_connect+0x1b8/0x300 net/socket.c:1557
>>>>>> [<ffffffff82f214b4>] SyS_connect+0x24/0x30 net/socket.c:1538
>>>>>> [<ffffffff838c2825>] entry_SYSCALL_64_fastpath+0x22/0x9e
>>>>>>
>>>>>> Syzkaller reproducer():
>>>>>> r0 = socket$packet(0x11, 0x3, 0x300)
>>>>>> r1 = socket$inet_tcp(0x2, 0x1, 0x0)
>>>>>> bind$inet(r1, &(0x7f0000000300)={0x2, 0x4e21, @multicast1}, 0x10)
>>>>>> connect$inet(r1, &(0x7f0000000140)={0x2, 0x1000004e21, @loopback}, 0x10)
>>>>>> recvmmsg(r1, &(0x7f0000001e40)=[{{0x0, 0x0, &(0x7f0000000100)=[{&(0x7f00000005c0)=""/88, 0x58}], 0x1}}], 0x1, 0x40000000, 0x0)
>>>>>> sendto$inet(r1, &(0x7f0000000000)="e2f7ad5b661c761edf", 0x9, 0x8080, 0x0, 0x0)
>>>>>> r2 = fcntl$dupfd(r1, 0x0, r0)
>>>>>> connect$unix(r2, &(0x7f00000001c0)=@file={0x0, './file0\x00'}, 0x6e)
>>>>>>
>>>>>> C repro link: https://syzkaller.appspot.com/text?tag=ReproC&x=14db474f800000
>>>>>>
>>>>>> This is because when tcp_connect_init call tcp_write_queue_purge, it will
>>>>>> kfree all the skb in the write_queue, but the sk->sk_send_head forget to set NULL,
>>>>>> then tcp_write_xmit try to send skb, which has freed in tcp_write_queue_purge, UAF happens.
>>>>>>
>>>>>> Signed-off-by: Mao Wenan <maowenan@huawei.com>
>>>>>> ---
>>>>>> include/net/tcp.h | 1 +
>>>>>> 1 file changed, 1 insertion(+)
>>>>>>
>>>>>> diff --git a/include/net/tcp.h b/include/net/tcp.h
>>>>>> index bf8a0dae977a..8f8aace28cf8 100644
>>>>>> --- a/include/net/tcp.h
>>>>>> +++ b/include/net/tcp.h
>>>>>> @@ -1457,6 +1457,7 @@ static inline void tcp_write_queue_purge(struct sock *sk)
>>>>>>
>>>>>> while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL)
>>>>>> sk_wmem_free_skb(sk, skb);
>>>>>> + sk->sk_send_head = NULL;
>>>>>> sk_mem_reclaim(sk);
>>>>>> tcp_clear_all_retrans_hints(tcp_sk(sk));
>>>>>> inet_csk(sk)->icsk_backoff = 0;
>>>>>
>>>>> Does this correspond with a specific commit that is already in Linus's
>>>>> tree? If not, why, did we change/mess something up when doing
>>>>> backports, or is the code just that different?
>>>>>
>>>>> Also, is this needed in 4.9.y, 4.14.y, 4.19.y, and/or 5.2.y? Why just
>>>>> 4.4.y?
>>>
>>> Greg,
>>>
>>> I have tested latest stable tree
>>> 4.4.186 oops
>>> 4.9.151 oops
>>> 4.14.106 NO oops
>>>
>>> This patch can simply fix them.
>>
>> I have checked 4.14.y; it already contains the same fix as mine, which is why 4.14.106 does not oops.
>> commit dbbf2d1e4077bab0c65ece2765d3fc69cf7d610f
>> Author: Soheil Hassas Yeganeh <soheil@google.com>
>> Date: Thu Mar 15 12:09:13 2018 -0400
>>
>> tcp: reset sk_send_head in tcp_write_queue_purge
>>
>
> So if this patch is backported to 4.4.y and 4.9.y all will be fine?
>
Yes, all will be fine, but the scenarios are different, so an additional description should be added when backporting.
4.4 and 4.9 don't have commit abb4a8b870b5 ("tcp: purge write queue upon RST"), which is referred to in dbbf2d1e4077:
there, tcp_connect_init calls tcp_write_queue_purge, which does not reset sk_send_head, leading to the UAF.
4.14 has commit abb4a8b870b5 ("tcp: purge write queue upon RST"):
there, tcp_reset calls tcp_write_queue_purge(sk), which does not reset sk_send_head, leading to the UAF.
> thanks,
>
> greg k-h
>
> .
>
^ permalink raw reply
* Re: [PATCH V3 net-next 06/10] net: hns3: add debug messages to identify eth down cause
From: tanhuazhong @ 2019-07-29 1:21 UTC (permalink / raw)
To: David Miller
Cc: netdev, linux-kernel, salil.mehta, yisen.zhuang, linuxarm, saeedm,
liuyonglong, lipeng321
In-Reply-To: <20190727.190333.249806415176311786.davem@davemloft.net>
On 2019/7/28 10:03, David Miller wrote:
> From: Huazhong Tan <tanhuazhong@huawei.com>
> Date: Sat, 27 Jul 2019 13:46:08 +0800
>
>> From: Yonglong Liu <liuyonglong@huawei.com>
>>
>> Sometimes we just see via dmesg that the eth interface has gone
>> down/up, but cannot tell why it went down. So add some debug
>> messages to identify the cause.
>>
>> Signed-off-by: Yonglong Liu <liuyonglong@huawei.com>
>> Signed-off-by: Peng Li <lipeng321@huawei.com>
>> Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
>> ---
>> drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 18 ++++++++++++++++++
>> drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 19 +++++++++++++++++++
>> .../net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c | 11 +++++++++++
>> 3 files changed, 48 insertions(+)
>>
>> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
>> index 4d58c53..973c57b 100644
>> --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
>> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
>> @@ -459,6 +459,9 @@ static int hns3_nic_net_open(struct net_device *netdev)
>> h->ae_algo->ops->set_timer_task(priv->ae_handle, true);
>>
>> hns3_config_xps(priv);
>> +
>> + netif_info(h, drv, netdev, "net open\n");
>
> These will pollute everyone's kernel logs for normal operations.
>
> This is not reasonable at all, sorry.
>
> Furthermore, even if it was appropriate, "netif_info()" is not "debug".
>
Will replace it with netif_dbg.
thanks.
>
> .
>
^ permalink raw reply
* Re: memory leak in bio_copy_user_iov
From: Bob Liu @ 2019-07-29 1:03 UTC (permalink / raw)
To: syzbot, agk, axboe, coreteam, davem, dm-devel, hdanton, kaber,
kadlec, linux-block, linux-kernel, linux-raid, netdev,
netfilter-devel, pablo, shli, snitzer, syzkaller-bugs
In-Reply-To: <000000000000aec4ec058ec71a3d@google.com>
On 7/29/19 8:38 AM, syzbot wrote:
> syzbot has bisected this bug to:
>
> commit 664820265d70a759dceca87b6eb200cd2b93cda8
> Author: Mike Snitzer <snitzer@redhat.com>
> Date: Thu Feb 18 20:44:39 2016 +0000
>
> dm: do not return target from dm_get_live_table_for_ioctl()
>
This (and the previous bisection) does not look related to the reported leak.
A possible reason may be that KASAN can't recognize the failure path of bio_alloc_bioset()
where mempool_free() is called but not kmalloc(p).
But it's not a real bug, because we have the condition if (nr_iovecs > inline_vecs).
The fix below may avoid the syzbot bug report.
diff --git a/block/bio.c b/block/bio.c
index 4db1008..04a7879 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -513,8 +513,10 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool);
}
- if (unlikely(!bvl))
- goto err_free;
+ if (unlikely(!bvl)) {
+ mempool_free(p, &bs->bio_pool);
+ return NULL;
+ }
bio->bi_flags |= idx << BVEC_POOL_OFFSET;
} else if (nr_iovecs) {
@@ -525,10 +527,6 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
bio->bi_max_vecs = nr_iovecs;
bio->bi_io_vec = bvl;
return bio;
-
-err_free:
- mempool_free(p, &bs->bio_pool);
- return NULL;
}
EXPORT_SYMBOL(bio_alloc_bioset);
Regards, -Bob
> bisection log: https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_x_bisect.txt-3Fx-3D13f4eb64600000&d=DwIBaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=1ktT0U2YS_I8Zz2o-MS1YcCAzWZ6hFGtyTgvVMGM7gI&m=NfGQRVxYCfZacAKiml9Wue-G1r2h8qkuAhAMOx_uFcc&s=MNjYy_nft_s0ErmK2n89p7y2yhKmeWlxWch0z7_dsm8&e=start commit: 0011572c Merge branch 'for-5.2-fixes' of git://git.kernel...
> git tree: upstream
> final crash: https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_x_report.txt-3Fx-3D100ceb64600000&d=DwIBaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=1ktT0U2YS_I8Zz2o-MS1YcCAzWZ6hFGtyTgvVMGM7gI&m=NfGQRVxYCfZacAKiml9Wue-G1r2h8qkuAhAMOx_uFcc&s=iviPOQNPEIjkuqBma_VWEQ9l1Ve3eOiTwads42E4ZPo&e=console output: https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_x_log.txt-3Fx-3D17f4eb64600000&d=DwIBaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=1ktT0U2YS_I8Zz2o-MS1YcCAzWZ6hFGtyTgvVMGM7gI&m=NfGQRVxYCfZacAKiml9Wue-G1r2h8qkuAhAMOx_uFcc&s=MBwnFwjEcSQfYymfv8EYt_EawVdK9vD-OAqDMutO-YY&e=kernel config: https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_x_.config-3Fx-3Dcb38d33cd06d8d48&d=DwIBaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=1ktT0U2YS_I8Zz2o-MS1YcCAzWZ6hFGtyTgvVMGM7gI&m=NfGQRVxYCfZacAKiml9Wue-G1r2h8qkuAhAMOx_uFcc&s=SqmDUenNFS-961PGgiMW5mIUv0nIBrf0oBrzUxYZ8Do&e=dashboard link:
> https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_bug-3Fextid-3D03e5c8ebd22cc6c3a8cb&d=DwIBaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=1ktT0U2YS_I8Zz2o-MS1YcCAzWZ6hFGtyTgvVMGM7gI&m=NfGQRVxYCfZacAKiml9Wue-G1r2h8qkuAhAMOx_uFcc&s=jKd2ocY5X94uyB8Or-OC3yffbOgClPQPlXqFnLzvvSY&e=syz repro: https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_x_repro.syz-3Fx-3D13244221a00000&d=DwIBaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=1ktT0U2YS_I8Zz2o-MS1YcCAzWZ6hFGtyTgvVMGM7gI&m=NfGQRVxYCfZacAKiml9Wue-G1r2h8qkuAhAMOx_uFcc&s=K-C39Kcd1oEOtJKwnby-s1EyEZZA10mr9bcXZ0J9Kh0&e=C reproducer: https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_x_repro.c-3Fx-3D117b2432a00000&d=DwIBaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=1ktT0U2YS_I8Zz2o-MS1YcCAzWZ6hFGtyTgvVMGM7gI&m=NfGQRVxYCfZacAKiml9Wue-G1r2h8qkuAhAMOx_uFcc&s=7J685CwQN6_FA2KgO3Vgy1msF0zi5O0OqZj_bgvEqBE&e=
> Reported-by: syzbot+03e5c8ebd22cc6c3a8cb@syzkaller.appspotmail.com
> Fixes: 664820265d70 ("dm: do not return target from dm_get_live_table_for_ioctl()")
>
> For information about bisection process see: https://urldefense.proofpoint.com/v2/url?u=https-3A__goo.gl_tpsmEJ-23bisection&d=DwIBaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=1ktT0U2YS_I8Zz2o-MS1YcCAzWZ6hFGtyTgvVMGM7gI&m=NfGQRVxYCfZacAKiml9Wue-G1r2h8qkuAhAMOx_uFcc&s=rs52TkiEQCrV4V8YQa2wT55HD8E-0AX9pn7MNIDcje4&e=
^ permalink raw reply related
* [PATCH] net: ehea: Mark expected switch fall-through
From: Gustavo A. R. Silva @ 2019-07-29 0:30 UTC (permalink / raw)
To: Douglas Miller, David S. Miller
Cc: netdev, linux-kernel, Gustavo A. R. Silva, Stephen Rothwell,
Kees Cook
Mark switch cases where we are expecting to fall through.
This patch fixes the following warning:
drivers/net/ethernet/ibm/ehea/ehea_main.c: In function 'ehea_mem_notifier':
include/linux/printk.h:311:2: warning: this statement may fall through [-Wimplicit-fallthrough=]
printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
drivers/net/ethernet/ibm/ehea/ehea_main.c:3253:3: note: in expansion of macro 'pr_info'
pr_info("memory offlining canceled");
^~~~~~~
drivers/net/ethernet/ibm/ehea/ehea_main.c:3256:2: note: here
case MEM_ONLINE:
^~~~
Notice that, in this particular case, the code comment is
modified in accordance with what GCC is expecting to find.
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
drivers/net/ethernet/ibm/ehea/ehea_main.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/ibm/ehea/ehea_main.c b/drivers/net/ethernet/ibm/ehea/ehea_main.c
index 4138a8480347..cca71ba7a74a 100644
--- a/drivers/net/ethernet/ibm/ehea/ehea_main.c
+++ b/drivers/net/ethernet/ibm/ehea/ehea_main.c
@@ -3251,7 +3251,7 @@ static int ehea_mem_notifier(struct notifier_block *nb,
switch (action) {
case MEM_CANCEL_OFFLINE:
pr_info("memory offlining canceled");
- /* Fall through: re-add canceled memory block */
+ /* Fall through - re-add canceled memory block */
case MEM_ONLINE:
pr_info("memory is going online");
--
2.22.0
^ permalink raw reply related
* [PATCH] net: spider_net: Mark expected switch fall-through
From: Gustavo A. R. Silva @ 2019-07-29 0:32 UTC (permalink / raw)
To: Ishizaki Kou, David S. Miller
Cc: netdev, linux-kernel, Gustavo A. R. Silva, Stephen Rothwell,
Kees Cook
Mark switch cases where we are expecting to fall through.
This patch fixes the following warning:
drivers/net/ethernet/toshiba/spider_net.c: In function 'spider_net_release_tx_chain':
drivers/net/ethernet/toshiba/spider_net.c:783:7: warning: this statement may fall through [-Wimplicit-fallthrough=]
if (!brutal) {
^
drivers/net/ethernet/toshiba/spider_net.c:792:3: note: here
case SPIDER_NET_DESCR_RESPONSE_ERROR:
^~~~
Notice that, in this particular case, the code comment is
modified in accordance with what GCC is expecting to find.
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
drivers/net/ethernet/toshiba/spider_net.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/net/ethernet/toshiba/spider_net.c b/drivers/net/ethernet/toshiba/spider_net.c
index 5b196ebfed49..0f346761a2b2 100644
--- a/drivers/net/ethernet/toshiba/spider_net.c
+++ b/drivers/net/ethernet/toshiba/spider_net.c
@@ -788,6 +788,7 @@ spider_net_release_tx_chain(struct spider_net_card *card, int brutal)
/* fallthrough, if we release the descriptors
* brutally (then we don't care about
* SPIDER_NET_DESCR_CARDOWNED) */
+ /* Fall through */
case SPIDER_NET_DESCR_RESPONSE_ERROR:
case SPIDER_NET_DESCR_PROTECTION_ERROR:
--
2.22.0
^ permalink raw reply related
* Re: memory leak in bio_copy_user_iov
From: syzbot @ 2019-07-29 0:38 UTC (permalink / raw)
To: agk, axboe, coreteam, davem, dm-devel, hdanton, kaber, kadlec,
linux-block, linux-kernel, linux-raid, netdev, netfilter-devel,
pablo, shli, snitzer, syzkaller-bugs
In-Reply-To: <000000000000c75fb7058ba0c0e4@google.com>
syzbot has bisected this bug to:
commit 664820265d70a759dceca87b6eb200cd2b93cda8
Author: Mike Snitzer <snitzer@redhat.com>
Date: Thu Feb 18 20:44:39 2016 +0000
dm: do not return target from dm_get_live_table_for_ioctl()
bisection log: https://syzkaller.appspot.com/x/bisect.txt?x=13f4eb64600000
start commit: 0011572c Merge branch 'for-5.2-fixes' of git://git.kernel...
git tree: upstream
final crash: https://syzkaller.appspot.com/x/report.txt?x=100ceb64600000
console output: https://syzkaller.appspot.com/x/log.txt?x=17f4eb64600000
kernel config: https://syzkaller.appspot.com/x/.config?x=cb38d33cd06d8d48
dashboard link: https://syzkaller.appspot.com/bug?extid=03e5c8ebd22cc6c3a8cb
syz repro: https://syzkaller.appspot.com/x/repro.syz?x=13244221a00000
C reproducer: https://syzkaller.appspot.com/x/repro.c?x=117b2432a00000
Reported-by: syzbot+03e5c8ebd22cc6c3a8cb@syzkaller.appspotmail.com
Fixes: 664820265d70 ("dm: do not return target from
dm_get_live_table_for_ioctl()")
For information about bisection process see: https://goo.gl/tpsmEJ#bisection
^ permalink raw reply
* Re: [PATCH] net: bridge: Allow bridge to joing multicast groups
From: Andrew Lunn @ 2019-07-28 23:07 UTC (permalink / raw)
To: Allan W. Nielsen
Cc: Horatiu Vultur, Nikolay Aleksandrov, roopa, davem, bridge, netdev,
linux-kernel
In-Reply-To: <20190728191558.zuopgfqza2iz5d5b@lx-anielsen.microsemi.net>
> Trying to get back to the original problem:
>
> We have a network which implements the ODVA/DLR ring protocol. This protocol
> sends out a beacon frame as often as every 3 us (as far as I recall, default I
> believe is 400 us) to this MAC address: 01:21:6C:00:00:01.
>
> Try to take a quick look at slide 10 in [1].
>
> If we assume that the SwitchDev driver is implemented such that all multicast
> traffic goes to the CPU, then we should really have a way to install a HW
> offload path in the silicon, such that these packets do not go to the CPU (as
> they are known not to be useful, and a frame every 3 us is a significant load
> on small DMA connections and CPU resources).
>
> If we assume that the SwitchDev driver is implemented such that only "needed"
> multicast packets go to the CPU, then we need a way to get these packets in
> case we want to implement the DLR protocol.
>
> I'm sure that both models can work, and I do not think that this is the main
> issue here.
>
> Our initial attempt was to allow install static L2-MAC entries and append
> multiple ports to such an entry in the MAC table. This was rejected, for several
> good reasons it seems. But I'm not sure it was clear what we wanted to achieve,
> and why we find it to be important. Hopefully this is clear with a real world
> use-case.
>
> Any hints or ideas on what would be a better way to solve this problem will be
> much appreciated.
I always try to think about how this would work if i had a bunch of
discrete network interfaces, not a switch. What APIs are involved in
configuring such a system? How does the Linux network stack perform
software DLR? How is the reception and blocking of the multicast group
performed?
Once you understand how it works in the software implementation, it should
then be more obvious which switchdev hooks should be used to
accelerate this using hardware.
Andrew
^ permalink raw reply
* Re: [PATCH v3] net: dsa: qca8k: enable port flow control
From: Andrew Lunn @ 2019-07-28 22:31 UTC (permalink / raw)
To: xiaofeis
Cc: davem, vkoul, netdev, linux-arm-msm, bjorn.andersson,
vivien.didelot, f.fainelli, niklas.cassel, xiazha
In-Reply-To: <1564275470-52666-1-git-send-email-xiaofeis@codeaurora.org>
On Sun, Jul 28, 2019 at 08:57:50AM +0800, xiaofeis wrote:
> Set phy device advertising to enable MAC flow control.
Hi Xiaofei.
This is half of the needed change for MAC flow control.
phy_support_asym_pause(phy) is used by the MAC to tell the PHY layer
that the MAC supports flow control. The PHY will then advertise
this. When auto-negotiation is completed, the PHY layer will call
qca8k_adjust_link() with the results. It could be that the peer does
not support flow control, or only supports symmetric flow control. So
in that function, you need to program the MAC with the results of the
auto-neg. This is currently missing. You need to look at phydev->pause
and phydev->asym_pause to decide how to configure the MAC.
Andrew
^ permalink raw reply
* Re: [PATCH net] net: hns: fix LED configuration for marvell phy
From: Andrew Lunn @ 2019-07-28 22:14 UTC (permalink / raw)
To: Pavel Machek
Cc: liuyonglong, David Miller, netdev, linux-kernel, linuxarm,
salil.mehta, yisen.zhuang, shiju.jose
In-Reply-To: <20190728132412.GC8718@xo-6d-61-c0.localdomain>
On Sun, Jul 28, 2019 at 03:24:12PM +0200, Pavel Machek wrote:
> On Thu 2019-07-25 06:28:29, Andrew Lunn wrote:
> > On Thu, Jul 25, 2019 at 11:00:08AM +0800, liuyonglong wrote:
> > > > Revert "net: hns: fix LED configuration for marvell phy"
> > > > This reverts commit f4e5f775db5a4631300dccd0de5eafb50a77c131.
> > > >
> > > > Andrew Lunn says this should be handled another way.
> > > >
> > > > Signed-off-by: David S. Miller <davem@davemloft.net>
> > >
> > >
> > > Hi Andrew:
> > >
> > > I see this patch has been reverted, can you tell me the better way to do this?
> > > Thanks very much!
> >
> > Please take a look at the work Matthias Kaehlcke is doing. It has not
> > got too far yet, but when it is complete, it should define a generic
> > way to configure PHY LEDs.
>
> I don't remember PHY LED discussion from LED mailing list. Would you have a pointer?
Hi Pavel
So far, it has not made it onto the generic LED list. And the current
implementation is unlikely to go as far as using the generic LED
code. But i would like the binding to be compatible with it, so that
some time in the future it could be migrated to being part of the
generic LED code. But that would also require extensions to the
generic LED code to support hardware offload of triggers.
Andrew
^ permalink raw reply
* Re: [PATCH] tcp: add new tcp_mtu_probe_floor sysctl
From: Josh Hunt @ 2019-07-28 21:32 UTC (permalink / raw)
To: Eric Dumazet; +Cc: netdev, David Miller
In-Reply-To: <5a054ca5-4077-5e91-69d5-f1add8dc8bfa@akamai.com>
On 7/28/19 2:14 PM, Josh Hunt wrote:
> On 7/28/19 6:54 AM, Eric Dumazet wrote:
>> On Sun, Jul 28, 2019 at 1:21 AM Josh Hunt <johunt@akamai.com> wrote:
>>>
>>> On 7/27/19 12:05 AM, Eric Dumazet wrote:
>>>> On Sat, Jul 27, 2019 at 4:23 AM Josh Hunt <johunt@akamai.com> wrote:
>>>>>
>>>>> The current implementation of TCP MTU probing can considerably
>>>>> underestimate the MTU on lossy connections allowing the MSS to get
>>>>> down to
>>>>> 48. We have found that in almost all of these cases on our networks
>>>>> these
>>>>> paths can handle much larger MTUs meaning the connections are being
>>>>> artificially limited. Even though TCP MTU probing can raise the MSS
>>>>> back up
>>>>> we have seen this not to be the case causing connections to be
>>>>> "stuck" with
>>>>> an MSS of 48 when heavy loss is present.
>>>>>
>>>>> Prior to pushing out this change we could not keep TCP MTU probing
>>>>> enabled
>>>>> b/c of the above reasons. Now with a reasonable floor set we've had it
>>>>> enabled for the past 6 months.
>>>>
>>>> And what reasonable value have you used ???
>>>
>>> Reasonable for some may not be reasonable for others hence the new
>>> sysctl :) We're currently running with a fairly high value based off of
>>> the v6 min MTU minus headers and options, etc. We went conservative with
>>> our setting initially as it seemed a reasonable first step when
>>> re-enabling TCP MTU probing since with no configurable floor we saw a #
>>> of cases where connections were using severely reduced mss b/c of loss
>>> and not b/c of actual path restriction. I plan to reevaluate the setting
>>> at some point, but since the probing method is still the same it means
>>> the same clients who got stuck with mss of 48 before will land at
>>> whatever floor we set. Looking forward we are interested in trying to
>>> improve TCP MTU probing so it does not penalize clients like this.
>>>
>>> A suggestion for a more reasonable floor default would be 512, which is
>>> the same as the min_pmtu. Given both mechanisms are trying to achieve
>>> the same goal it seems like they should have a similar min/floor.
>>>
>>>>
>>>>>
>>>>> The new sysctl will still default to TCP_MIN_SND_MSS (48), but gives
>>>>> administrators the ability to control the floor of MSS probing.
>>>>>
>>>>> Signed-off-by: Josh Hunt <johunt@akamai.com>
>>>>> ---
>>>>> Documentation/networking/ip-sysctl.txt | 6 ++++++
>>>>> include/net/netns/ipv4.h | 1 +
>>>>> net/ipv4/sysctl_net_ipv4.c | 9 +++++++++
>>>>> net/ipv4/tcp_ipv4.c | 1 +
>>>>> net/ipv4/tcp_timer.c | 2 +-
>>>>> 5 files changed, 18 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/Documentation/networking/ip-sysctl.txt
>>>>> b/Documentation/networking/ip-sysctl.txt
>>>>> index df33674799b5..49e95f438ed7 100644
>>>>> --- a/Documentation/networking/ip-sysctl.txt
>>>>> +++ b/Documentation/networking/ip-sysctl.txt
>>>>> @@ -256,6 +256,12 @@ tcp_base_mss - INTEGER
>>>>> Path MTU discovery (MTU probing). If MTU probing is
>>>>> enabled,
>>>>> this is the initial MSS used by the connection.
>>>>>
>>>>> +tcp_mtu_probe_floor - INTEGER
>>>>> + If MTU probing is enabled this caps the minimum MSS used
>>>>> for search_low
>>>>> + for the connection.
>>>>> +
>>>>> + Default : 48
>>>>> +
>>>>> tcp_min_snd_mss - INTEGER
>>>>> TCP SYN and SYNACK messages usually advertise an ADVMSS
>>>>> option,
>>>>> as described in RFC 1122 and RFC 6691.
>>>>> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
>>>>> index bc24a8ec1ce5..c0c0791b1912 100644
>>>>> --- a/include/net/netns/ipv4.h
>>>>> +++ b/include/net/netns/ipv4.h
>>>>> @@ -116,6 +116,7 @@ struct netns_ipv4 {
>>>>> int sysctl_tcp_l3mdev_accept;
>>>>> #endif
>>>>> int sysctl_tcp_mtu_probing;
>>>>> + int sysctl_tcp_mtu_probe_floor;
>>>>> int sysctl_tcp_base_mss;
>>>>> int sysctl_tcp_min_snd_mss;
>>>>> int sysctl_tcp_probe_threshold;
>>>>> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
>>>>> index 0b980e841927..59ded25acd04 100644
>>>>> --- a/net/ipv4/sysctl_net_ipv4.c
>>>>> +++ b/net/ipv4/sysctl_net_ipv4.c
>>>>> @@ -820,6 +820,15 @@ static struct ctl_table ipv4_net_table[] = {
>>>>> .extra2 = &tcp_min_snd_mss_max,
>>>>> },
>>>>> {
>>>>> + .procname = "tcp_mtu_probe_floor",
>>>>> + .data =
>>>>> &init_net.ipv4.sysctl_tcp_mtu_probe_floor,
>>>>> + .maxlen = sizeof(int),
>>>>> + .mode = 0644,
>>>>> + .proc_handler = proc_dointvec_minmax,
>>>>> + .extra1 = &tcp_min_snd_mss_min,
>>>>> + .extra2 = &tcp_min_snd_mss_max,
>>>>> + },
>>>>> + {
>>>>> .procname = "tcp_probe_threshold",
>>>>> .data =
>>>>> &init_net.ipv4.sysctl_tcp_probe_threshold,
>>>>> .maxlen = sizeof(int),
>>>>> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
>>>>> index d57641cb3477..e0a372676329 100644
>>>>> --- a/net/ipv4/tcp_ipv4.c
>>>>> +++ b/net/ipv4/tcp_ipv4.c
>>>>> @@ -2637,6 +2637,7 @@ static int __net_init tcp_sk_init(struct net
>>>>> *net)
>>>>> net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
>>>>> net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
>>>>> net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
>>>>> + net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
>>>>>
>>>>> net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
>>>>> net->ipv4.sysctl_tcp_keepalive_probes =
>>>>> TCP_KEEPALIVE_PROBES;
>>>>> diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
>>>>> index c801cd37cc2a..dbd9d2d0ee63 100644
>>>>> --- a/net/ipv4/tcp_timer.c
>>>>> +++ b/net/ipv4/tcp_timer.c
>>>>> @@ -154,7 +154,7 @@ static void tcp_mtu_probing(struct
>>>>> inet_connection_sock *icsk, struct sock *sk)
>>>>> } else {
>>>>> mss = tcp_mtu_to_mss(sk,
>>>>> icsk->icsk_mtup.search_low) >> 1;
>>>>> mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
>>>>> - mss = max(mss, 68 - tcp_sk(sk)->tcp_header_len);
>>>>> + mss = max(mss, net->ipv4.sysctl_tcp_mtu_probe_floor);
>>>>> mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss);
>>>>> icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk,
>>>>> mss);
>>>>> }
>>>>
>>>>
>>>> Existing sysctl should be enough ?
>>>
>>> I don't think so. Changing tcp_min_snd_mss could impact clients that
>>> really want/need a small mss. When you added the new sysctl I tried to
>>> analyze the mss values we're seeing to understand what we could possibly
>>> raise it to. While not a huge amount, we see more clients than I
>>> expected announcing mss values in the 180-512 range. Given that I would
>>> not feel comfortable setting tcp_min_snd_mss to say 512 as I suggested
>>> above.
>>
>> If these clients need mss values in 180-512 ranges, how MTU probing
>> would work for them,
>> if you set a floor to 512 ?
>
> First, we already seem to be fine with ignoring these paths with ICMP
> based PMTU discovery b/c of our min_pmtu default of 512 and that is
> configurable. Second by adding this sysctl we're giving administrators
> the choice to decide if they'd like to attempt to support these very
> very small # of paths which may be below 512 (MSS <= 512 does not mean
> MTU <= 512) or cover themselves by being able to raise the floor to not
> penalize clients who may be on very lossy networks.
>
>>
>> Are we sure the intent of tcp_base_mss was not to act as a floor ?
>
> My understanding is that tcp_base_mss is meant to be the initial value
> of search_low (as per Docs). Then in RFC 4821 [1] Sections 7.2, shows
> search_low should be configurable, and 7.7 we see that in response to
> successive black hole detection search_low should be halved. So I don't
> think it was meant to be a floor, but just the initial search_low param.
> Also note that in that same section they suggest a floor of 68 for v4,
> but a floor of 1280 for v6 which we do not adhere to currently.
>
Clarification. We == Akamai in regards to setting tcp_base_mss to
1400-overheads. Upstream default is 1024.
> We actually set tcp_base_mss to something close to the value suggested
> towards the end of section 7.2 of the RFC of 1400 bytes minus IP and
> Transport overheads and options. This way we have more realistic
> searching based on the majority of clients that we see. The kernel winds
> up using initial search_low/tcp_base_mss as initial eff_pmtu, so we see
> something like:
>
> 21:03:41.314612 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [P.], seq
> 1:1461, ack 1, win 229, length 1460: HTTP
> 21:03:41.670307 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [P.], seq
> 1:1461, ack 1, win 229, length 1460: HTTP
> 21:03:42.030308 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [P.], seq
> 1:1461, ack 1, win 229, length 1460: HTTP
> 21:03:42.534307 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [P.], seq
> 1:1461, ack 1, win 229, length 1460: HTTP
> 21:03:43.198308 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [P.], seq
> 1:1461, ack 1, win 229, length 1460: HTTP
> 21:03:44.478307 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [P.], seq
> 1:1461, ack 1, win 229, length 1460: HTTP
> 21:03:47.742310 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [.], seq
> 1:1349, ack 1, win 229, length 1348: HTTP
> 21:03:56.702310 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [.], seq
> 1:675, ack 1, win 229, length 674: HTTP
>
> For further evidence this is a real problem here's a sample of mss
> values I found when originally investigating this problem for us:
>
> I dug up some #s I found when originally investigating this problem:
>
> # ss -emoitn | grep mss | sed "s/.*mss:\([0-9]*\).*/\1/" | sort -u |
> sort -g | head -5
>
> 36:11
> 64:7
> 72:1
> 128:13
> 144:4
>
> From what I could tell these connections were on paths much larger than
> the mss they were being forced to use. I determined this by looking at
> the mss used for other objects fetched from the same IPs.
>
> Josh
>
> [1] - https://www.ietf.org/rfc/rfc4821.txt
>
>>
>> diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
>> index
>> c801cd37cc2a9c11f2dd4b9681137755e501a538..6d15895e9dcfb2eff51bbcf3608c7e68c1970a9e
>>
>> 100644
>> --- a/net/ipv4/tcp_timer.c
>> +++ b/net/ipv4/tcp_timer.c
>> @@ -153,7 +153,7 @@ static void tcp_mtu_probing(struct
>> inet_connection_sock *icsk, struct sock *sk)
>> icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
>> } else {
>> mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)
>> >> 1;
>> - mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
>> + mss = max(net->ipv4.sysctl_tcp_base_mss, mss);
>> mss = max(mss, 68 - tcp_sk(sk)->tcp_header_len);
>> mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss);
>> icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
>>
>>
>>
>>>
>>>>
>>>> tcp_min_snd_mss documentation could be slightly updated.
>>>>
>>>> And maybe its default value could be raised a bit.
>>>>
>>>
>>> Thanks
>>> Josh
^ permalink raw reply
* Re: [PATCH] tcp: add new tcp_mtu_probe_floor sysctl
From: Josh Hunt @ 2019-07-28 21:14 UTC (permalink / raw)
To: Eric Dumazet; +Cc: netdev, David Miller
In-Reply-To: <CANn89iLqeixzZkop8tqOQka_9ZiKurZL9Vj05bgU99M5Pbenqw@mail.gmail.com>
On 7/28/19 6:54 AM, Eric Dumazet wrote:
> On Sun, Jul 28, 2019 at 1:21 AM Josh Hunt <johunt@akamai.com> wrote:
>>
>> On 7/27/19 12:05 AM, Eric Dumazet wrote:
>>> On Sat, Jul 27, 2019 at 4:23 AM Josh Hunt <johunt@akamai.com> wrote:
>>>>
>>>> The current implementation of TCP MTU probing can considerably
>>>> underestimate the MTU on lossy connections allowing the MSS to get down to
>>>> 48. We have found that in almost all of these cases on our networks these
>>>> paths can handle much larger MTUs meaning the connections are being
>>>> artificially limited. Even though TCP MTU probing can raise the MSS back up
>>>> we have seen this not to be the case causing connections to be "stuck" with
>>>> an MSS of 48 when heavy loss is present.
>>>>
>>>> Prior to pushing out this change we could not keep TCP MTU probing enabled
>>>> b/c of the above reasons. Now with a reasonable floor set we've had it
>>>> enabled for the past 6 months.
>>>
>>> And what reasonable value have you used ???
>>
>> Reasonable for some may not be reasonable for others hence the new
>> sysctl :) We're currently running with a fairly high value based off of
>> the v6 min MTU minus headers and options, etc. We went conservative with
>> our setting initially as it seemed a reasonable first step when
>> re-enabling TCP MTU probing since with no configurable floor we saw a #
>> of cases where connections were using severely reduced mss b/c of loss
>> and not b/c of actual path restriction. I plan to reevaluate the setting
>> at some point, but since the probing method is still the same it means
>> the same clients who got stuck with mss of 48 before will land at
>> whatever floor we set. Looking forward we are interested in trying to
>> improve TCP MTU probing so it does not penalize clients like this.
>>
>> A suggestion for a more reasonable floor default would be 512, which is
>> the same as the min_pmtu. Given both mechanisms are trying to achieve
>> the same goal it seems like they should have a similar min/floor.
>>
>>>
>>>>
>>>> The new sysctl will still default to TCP_MIN_SND_MSS (48), but gives
>>>> administrators the ability to control the floor of MSS probing.
>>>>
>>>> Signed-off-by: Josh Hunt <johunt@akamai.com>
>>>> ---
>>>> Documentation/networking/ip-sysctl.txt | 6 ++++++
>>>> include/net/netns/ipv4.h | 1 +
>>>> net/ipv4/sysctl_net_ipv4.c | 9 +++++++++
>>>> net/ipv4/tcp_ipv4.c | 1 +
>>>> net/ipv4/tcp_timer.c | 2 +-
>>>> 5 files changed, 18 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
>>>> index df33674799b5..49e95f438ed7 100644
>>>> --- a/Documentation/networking/ip-sysctl.txt
>>>> +++ b/Documentation/networking/ip-sysctl.txt
>>>> @@ -256,6 +256,12 @@ tcp_base_mss - INTEGER
>>>> Path MTU discovery (MTU probing). If MTU probing is enabled,
>>>> this is the initial MSS used by the connection.
>>>>
>>>> +tcp_mtu_probe_floor - INTEGER
>>>> + If MTU probing is enabled this caps the minimum MSS used for search_low
>>>> + for the connection.
>>>> +
>>>> + Default : 48
>>>> +
>>>> tcp_min_snd_mss - INTEGER
>>>> TCP SYN and SYNACK messages usually advertise an ADVMSS option,
>>>> as described in RFC 1122 and RFC 6691.
>>>> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
>>>> index bc24a8ec1ce5..c0c0791b1912 100644
>>>> --- a/include/net/netns/ipv4.h
>>>> +++ b/include/net/netns/ipv4.h
>>>> @@ -116,6 +116,7 @@ struct netns_ipv4 {
>>>> int sysctl_tcp_l3mdev_accept;
>>>> #endif
>>>> int sysctl_tcp_mtu_probing;
>>>> + int sysctl_tcp_mtu_probe_floor;
>>>> int sysctl_tcp_base_mss;
>>>> int sysctl_tcp_min_snd_mss;
>>>> int sysctl_tcp_probe_threshold;
>>>> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
>>>> index 0b980e841927..59ded25acd04 100644
>>>> --- a/net/ipv4/sysctl_net_ipv4.c
>>>> +++ b/net/ipv4/sysctl_net_ipv4.c
>>>> @@ -820,6 +820,15 @@ static struct ctl_table ipv4_net_table[] = {
>>>> .extra2 = &tcp_min_snd_mss_max,
>>>> },
>>>> {
>>>> + .procname = "tcp_mtu_probe_floor",
>>>> + .data = &init_net.ipv4.sysctl_tcp_mtu_probe_floor,
>>>> + .maxlen = sizeof(int),
>>>> + .mode = 0644,
>>>> + .proc_handler = proc_dointvec_minmax,
>>>> + .extra1 = &tcp_min_snd_mss_min,
>>>> + .extra2 = &tcp_min_snd_mss_max,
>>>> + },
>>>> + {
>>>> .procname = "tcp_probe_threshold",
>>>> .data = &init_net.ipv4.sysctl_tcp_probe_threshold,
>>>> .maxlen = sizeof(int),
>>>> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
>>>> index d57641cb3477..e0a372676329 100644
>>>> --- a/net/ipv4/tcp_ipv4.c
>>>> +++ b/net/ipv4/tcp_ipv4.c
>>>> @@ -2637,6 +2637,7 @@ static int __net_init tcp_sk_init(struct net *net)
>>>> net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
>>>> net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
>>>> net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
>>>> + net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
>>>>
>>>> net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
>>>> net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
>>>> diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
>>>> index c801cd37cc2a..dbd9d2d0ee63 100644
>>>> --- a/net/ipv4/tcp_timer.c
>>>> +++ b/net/ipv4/tcp_timer.c
>>>> @@ -154,7 +154,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
>>>> } else {
>>>> mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
>>>> mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
>>>> - mss = max(mss, 68 - tcp_sk(sk)->tcp_header_len);
>>>> + mss = max(mss, net->ipv4.sysctl_tcp_mtu_probe_floor);
>>>> mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss);
>>>> icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
>>>> }
>>>
>>>
>>> Existing sysctl should be enough ?
>>
>> I don't think so. Changing tcp_min_snd_mss could impact clients that
>> really want/need a small mss. When you added the new sysctl I tried to
>> analyze the mss values we're seeing to understand what we could possibly
>> raise it to. While not a huge amount, we see more clients than I
>> expected announcing mss values in the 180-512 range. Given that I would
>> not feel comfortable setting tcp_min_snd_mss to say 512 as I suggested
>> above.
>
> If these clients need mss values in 180-512 ranges, how MTU probing
> would work for them,
> if you set a floor to 512 ?
First, we already seem to be fine with ignoring these paths with ICMP
based PMTU discovery b/c of our min_pmtu default of 512 and that is
configurable. Second by adding this sysctl we're giving administrators
the choice to decide if they'd like to attempt to support these very
very small # of paths which may be below 512 (MSS <= 512 does not mean
MTU <= 512) or cover themselves by being able to raise the floor to not
penalize clients who may be on very lossy networks.
>
> Are we sure the intent of tcp_base_mss was not to act as a floor ?
My understanding is that tcp_base_mss is meant to be the initial value
of search_low (as per Docs). Then in RFC 4821 [1] Sections 7.2, shows
search_low should be configurable, and 7.7 we see that in response to
successive black hole detection search_low should be halved. So I don't
think it was meant to be a floor, but just the initial search_low param.
Also note that in that same section they suggest a floor of 68 for v4,
but a floor of 1280 for v6 which we do not adhere to currently.
We actually set tcp_base_mss to something close to the value suggested
towards the end of section 7.2 of the RFC of 1400 bytes minus IP and
Transport overheads and options. This way we have more realistic
searching based on the majority of clients that we see. The kernel winds
up using initial search_low/tcp_base_mss as initial eff_pmtu, so we see
something like:
21:03:41.314612 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [P.], seq
1:1461, ack 1, win 229, length 1460: HTTP
21:03:41.670307 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [P.], seq
1:1461, ack 1, win 229, length 1460: HTTP
21:03:42.030308 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [P.], seq
1:1461, ack 1, win 229, length 1460: HTTP
21:03:42.534307 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [P.], seq
1:1461, ack 1, win 229, length 1460: HTTP
21:03:43.198308 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [P.], seq
1:1461, ack 1, win 229, length 1460: HTTP
21:03:44.478307 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [P.], seq
1:1461, ack 1, win 229, length 1460: HTTP
21:03:47.742310 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [.], seq
1:1349, ack 1, win 229, length 1348: HTTP
21:03:56.702310 IP 192.168.0.1.8080 > 192.0.2.1.41523: Flags [.], seq
1:675, ack 1, win 229, length 674: HTTP
For further evidence this is a real problem here's a sample of mss
values I found when originally investigating this problem for us:
I dug up some #s I found when originally investigating this problem:
# ss -emoitn | grep mss | sed "s/.*mss:\([0-9]*\).*/\1/" | sort -u |
sort -g | head -5
36:11
64:7
72:1
128:13
144:4
From what I could tell these connections were on paths much larger than
the mss they were being forced to use. I determined this by looking at
the mss used for other objects fetched from the same IPs.
Josh
[1] - https://www.ietf.org/rfc/rfc4821.txt
>
> diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
> index c801cd37cc2a9c11f2dd4b9681137755e501a538..6d15895e9dcfb2eff51bbcf3608c7e68c1970a9e
> 100644
> --- a/net/ipv4/tcp_timer.c
> +++ b/net/ipv4/tcp_timer.c
> @@ -153,7 +153,7 @@ static void tcp_mtu_probing(struct
> inet_connection_sock *icsk, struct sock *sk)
> icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
> } else {
> mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
> - mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
> + mss = max(net->ipv4.sysctl_tcp_base_mss, mss);
> mss = max(mss, 68 - tcp_sk(sk)->tcp_header_len);
> mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss);
> icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
>
>
>
>>
>>>
>>> tcp_min_snd_mss documentation could be slightly updated.
>>>
>>> And maybe its default value could be raised a bit.
>>>
>>
>> Thanks
>> Josh
^ permalink raw reply
* Re: Slowness forming TIPC cluster with explicit node addresses
From: Chris Packham @ 2019-07-28 21:04 UTC (permalink / raw)
To: jon.maloy@ericsson.com, tipc-discussion@lists.sourceforge.net
Cc: netdev@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <CH2PR15MB35754D65AB240A74AE488E719AC00@CH2PR15MB3575.namprd15.prod.outlook.com>
On Fri, 2019-07-26 at 13:31 +0000, Jon Maloy wrote:
>
> >
> > -----Original Message-----
> > From: netdev-owner@vger.kernel.org <netdev-owner@vger.kernel.org>
> > On
> > Behalf Of Chris Packham
> > Sent: 25-Jul-19 19:37
> > To: tipc-discussion@lists.sourceforge.net
> > Cc: netdev@vger.kernel.org; linux-kernel@vger.kernel.org
> > Subject: Slowness forming TIPC cluster with explicit node addresses
> >
> > Hi,
> >
> > I'm having problems forming a TIPC cluster between 2 nodes.
> >
> > These are the basic steps I'm going through on each node.
> >
> > modprobe tipc
> > ip link set eth2 up
> > tipc node set addr 1.1.5 # or 1.1.6
> > tipc bearer enable media eth dev eth0
> eth2, I assume...
>
Yes, sorry, I keep switching between Ethernet ports for testing
so I hand edited the email.
> >
> >
> > Then to confirm if the cluster is formed I use tipc link list
> >
> > [root@node-5 ~]# tipc link list
> > broadcast-link: up
> > ...
> >
> > Looking at tcpdump the two nodes are sending packets
> >
> > 22:30:05.782320 TIPC v2.0 1.1.5 > 0.0.0, headerlength 60 bytes,
> > MessageSize
> > 76 bytes, Neighbor Detection Protocol internal, messageType Link
> > request
> > 22:30:05.863555 TIPC v2.0 1.1.6 > 0.0.0, headerlength 60 bytes,
> > MessageSize
> > 76 bytes, Neighbor Detection Protocol internal, messageType Link
> > request
> >
> > Eventually (after a few minutes) the link does come up
> >
> > [root@node-6 ~]# tipc link list
> > broadcast-link: up
> > 1001006:eth2-1001005:eth2: up
> >
> > [root@node-5 ~]# tipc link list
> > broadcast-link: up
> > 1001005:eth2-1001006:eth2: up
> >
> > When I remove the "tipc node set addr" things seem to kick into
> > life straight
> > away
> >
> > [root@node-5 ~]# tipc link list
> > broadcast-link: up
> > 0050b61bd2aa:eth2-0050b61e6dfa:eth2: up
> >
> > So there appears to be some difference in behaviour between having
> > an
> > explicit node address and using the default. Unfortunately our
> > application
> > relies on setting the node addresses.
> I do this many times a day, without any problems. If there would be
> any time difference, I would expect the 'auto configurable' version
> to be slower, because it involves a DAD step.
> Are you sure you don't have any other nodes running in your system?
>
> ///jon
>
Nope the two nodes are connected back to back. Does the number of
Ethernet interfaces make a difference? As you can see I've got 3 on
each node. One is completely disconnected, one is for booting over TFTP
(only used by U-boot) and the other is the USB Ethernet I'm using for
testing.
>
> >
> >
> > [root@node-5 ~]# uname -a
> > Linux linuxbox 5.2.0-at1+ #8 SMP Thu Jul 25 23:22:41 UTC 2019 ppc
> > GNU/Linux
> >
> > Any thoughts on the problem?
^ permalink raw reply
* Re: [PATCH net-next v4 2/3] flow_offload: Support get default block from tc immediately
From: Jakub Kicinski @ 2019-07-28 20:16 UTC (permalink / raw)
To: wenxu; +Cc: pablo, fw, netfilter-devel, netdev
In-Reply-To: <1564296769-32294-3-git-send-email-wenxu@ucloud.cn>
On Sun, 28 Jul 2019 14:52:48 +0800, wenxu@ucloud.cn wrote:
> From: wenxu <wenxu@ucloud.cn>
>
> When the indr device registers, it can get the default block
> from tc immediately if the block exists.
>
> Signed-off-by: wenxu <wenxu@ucloud.cn>
> ---
> v3: no change
> v4: get tc default block without callback
Please stop reposting new versions of the patches while discussion is
ongoing, it makes it harder to follow.
The TC default block is there because the indirect registration may
happen _after_ the block is installed and populated. It's the device
driver that usually does the indirect registration, the tunnel device
and its rules may already be set when device driver is loaded or
reloaded.
I don't know the nft code, but it seems unlikely it wouldn't have the
same problem/need..
Please explain.
^ permalink raw reply
* Linux Plumbers BPF micro-conference CFP (reminder)
From: Alexei Starovoitov @ 2019-07-28 19:24 UTC (permalink / raw)
To: Daniel Borkmann; +Cc: bpf, Network Development
In-Reply-To: <CAADnVQJ0ATngyqo8xjXdDsyFuuov3KRtbHMR1LcV8VnEDUK8Fg@mail.gmail.com>
Hey Folks,
August 2nd deadline to submit a proposal for BPF uconf
is quickly approaching.
If you're attending LPC in Lisbon and interested
in awesome BPF uconf you need to submit a proposal.
Some of you already submitted them to lpc-bpf@vger
per instructions that were sent back on July 12.
Some proposals were sent via website.
We'd like all proposals to be seen on the website.
Could you please re-enter your proposal there?
Please go to:
https://www.linuxplumbersconf.org/event/4/abstracts/
click on 'submit new proposal'
and copy-paste what you've already sent to lpc-bpf@vger.
Much appreciated, and sorry for the confusion.
There is still room for a few new proposals,
but space is getting very limited.
Please don't delay.
Thanks!
> ---------- Forwarded message ---------
> From: Daniel Borkmann <daniel@iogearbox.net>
> Date: Fri, Jul 12, 2019 at 7:26 AM
> Subject: Linux Plumbers BPF micro-conference CFP (reminder)
> To: <bpf@vger.kernel.org>
> Cc: <netdev@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
> <xdp-newbies@vger.kernel.org>, <iovisor-dev@lists.iovisor.org>,
> <lpc-bpf@vger.kernel.org>, <alexei.starovoitov@gmail.com>
>
>
> This is a call for proposals for the BPF micro-conference at this
> year's Linux Plumbers Conference (LPC) 2019 which will be held in
> Lisbon, Portugal for September 9-11.
>
> The goal of the BPF micro-conference is to bring BPF developers
> together to discuss topics around Linux kernel work related to
> the BPF core infrastructure as well as its many subsystems under
> tracing, networking, security, and BPF user space tooling (LLVM,
> libbpf, bpftool and many others).
>
> The format of the micro-conference has a main focus on discussion,
> therefore each accepted topic will provide a short 1-2 slide
> introduction with subsequent discussion for the rest of the given
> time slot.
>
> The BPF micro-conference is a community-driven event and open to
> all LPC attendees, there is no additional registration required.
>
> Please submit your discussion proposals to the LPC BPF micro-conference
> organizers at:
>
> lpc-bpf@vger.kernel.org
>
> Proposals must be submitted until August 2nd, and submitters will
> be notified of acceptance at latest by August 9. (Please note that
> proposals must not be sent as html mail as they are otherwise dropped
> by vger.)
>
> The format of the submission and many other details can be found at:
>
> http://vger.kernel.org/lpc-bpf.html
>
> Looking forward to seeing you all in Lisbon in September!
^ permalink raw reply
* Re: [PATCH] net: bridge: Allow bridge to joing multicast groups
From: Allan W. Nielsen @ 2019-07-28 19:15 UTC (permalink / raw)
To: Andrew Lunn
Cc: Horatiu Vultur, Nikolay Aleksandrov, roopa, davem, bridge, netdev,
linux-kernel
In-Reply-To: <20190727030223.GA29731@lunn.ch>
The 07/27/2019 05:02, Andrew Lunn wrote:
> > As you properly guessed, this model is quite different from what we are used to.
>
> Yes, it takes a while to get the idea that the hardware is just an
> accelerator for what the Linux stack can already do. And if the switch
> cannot do some feature, pass the frame to Linux so it can handle it.
This is understood, and not that different from what we are used to.
The surprise was to make all multicast traffic to go to the CPU.
> You need to keep in mind that there could be other ports in the bridge
> than switch ports, and those ports might be interested in the
> multicast traffic. Hence the CPU needs to see the traffic.
This is a good argument, but I was under the impression that not all HW/drivers
support foreign interfaces (see ocelot_netdevice_dev_check and
mlxsw_sp_port_dev_check).
> But IGMP snooping can be used to optimise this.
Yes, IGMP snooping can limit the multicast storm of multicast IP traffic, but
not for L2 non-IP multicast traffic.
We could really use something similar for non-IP multicast MAC addresses.
Trying to get back to the original problem:
We have a network which implements the ODVA/DLR ring protocol. This protocol
sends out a beacon frame as often as every 3 us (as far as I recall, default I
believe is 400 us) to this MAC address: 01:21:6C:00:00:01.
Try to take a quick look at slide 10 in [1].
If we assume that the SwitchDev driver is implemented such that all multicast
traffic goes to the CPU, then we should really have a way to install a HW
offload path in the silicon, such that these packets do not go to the CPU (as
they are known not to be useful, and a frame every 3 us is a significant load
on small DMA connections and CPU resources).
If we assume that the SwitchDev driver is implemented such that only "needed"
multicast packets go to the CPU, then we need a way to get these packets in
case we want to implement the DLR protocol.
I'm sure that both models can work, and I do not think that this is the main
issue here.
Our initial attempt was to allow installing static L2-MAC entries and appending
multiple ports to such an entry in the MAC table. This was rejected, for several
good reasons it seems. But I'm not sure it was clear what we wanted to achieve,
and why we find it to be important. Hopefully this is clear with a real world
use-case.
Any hints or ideas on what would be a better way to solve this problem will be
much appreciated.
/Allan
[1] https://www.odva.org/Portals/0/Library/Conference/2017-ODVA-Conference_Woods_High%20Availability_Guidelines%20for%20Use%20of%20DLR%20in%20EtherNetIP%20Networks_FINAL%20PPT.pdf
^ permalink raw reply
* [PATCH net] hv_sock: Fix hang when a connection is closed
From: Dexuan Cui @ 2019-07-28 18:32 UTC (permalink / raw)
To: Sunil Muthuswamy, David Miller, netdev@vger.kernel.org
Cc: KY Srinivasan, Haiyang Zhang, Stephen Hemminger,
sashal@kernel.org, Michael Kelley, linux-hyperv@vger.kernel.org,
linux-kernel@vger.kernel.org, olaf@aepfle.de, apw@canonical.com,
jasowang@redhat.com, vkuznets, marcelo.cerri@canonical.com
hvs_do_close_lock_held() may decrease the reference count to 0 and free the
sk struct completely, and then the following release_sock(sk) may hang.
Fixes: a9eeb998c28d ("hv_sock: Add support for delayed close")
Signed-off-by: Dexuan Cui <decui@microsoft.com>
Cc: stable@vger.kernel.org
---
With the proper kernel debugging options enabled, first a warning can
appear:
kworker/1:0/4467 is freeing memory ..., with a lock still held there!
stack backtrace:
Workqueue: events vmbus_onmessage_work [hv_vmbus]
Call Trace:
dump_stack+0x67/0x90
debug_check_no_locks_freed.cold.52+0x78/0x7d
slab_free_freelist_hook+0x85/0x140
kmem_cache_free+0xa5/0x380
__sk_destruct+0x150/0x260
hvs_close_connection+0x24/0x30 [hv_sock]
vmbus_onmessage_work+0x1d/0x30 [hv_vmbus]
process_one_work+0x241/0x600
worker_thread+0x3c/0x390
kthread+0x11b/0x140
ret_from_fork+0x24/0x30
and then the following release_sock(sk) can hang:
watchdog: BUG: soft lockup - CPU#1 stuck for 22s! [kworker/1:0:4467]
...
irq event stamp: 62890
CPU: 1 PID: 4467 Comm: kworker/1:0 Tainted: G W 5.2.0+ #39
Workqueue: events vmbus_onmessage_work [hv_vmbus]
RIP: 0010:queued_spin_lock_slowpath+0x2b/0x1e0
...
Call Trace:
do_raw_spin_lock+0xab/0xb0
release_sock+0x19/0xb0
vmbus_onmessage_work+0x1d/0x30 [hv_vmbus]
process_one_work+0x241/0x600
worker_thread+0x3c/0x390
kthread+0x11b/0x140
ret_from_fork+0x24/0x30
net/vmw_vsock/hyperv_transport.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index f2084e3f7aa4..efbda8ef1eff 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -309,9 +309,16 @@ static void hvs_close_connection(struct vmbus_channel *chan)
{
struct sock *sk = get_per_channel_state(chan);
+ /* Grab an extra reference since hvs_do_close_lock_held() may decrease
+ * the reference count to 0 by calling sock_put(sk).
+ */
+ sock_hold(sk);
+
lock_sock(sk);
hvs_do_close_lock_held(vsock_sk(sk), true);
release_sock(sk);
+
+ sock_put(sk);
}
static void hvs_open_connection(struct vmbus_channel *chan)
--
2.19.1
^ permalink raw reply related
* [PATCH net] net: bridge: delete local fdbs on device init failure
From: Nikolay Aleksandrov @ 2019-07-28 18:22 UTC (permalink / raw)
To: netdev
Cc: davem, roopa, bridge, Nikolay Aleksandrov,
syzbot+88533dc8b582309bf3ee
On initialization failure we have to delete all local fdbs which were
inserted due to the default pvid. This problem has been present since the
inception of default_pvid. Note that currently there are 2 cases:
1) in br_dev_init() when br_multicast_init() fails
2) if register_netdevice() fails after calling ndo_init()
This patch takes care of both since br_vlan_flush() is called on both
occasions. Also the new fdb delete would be a no-op on normal bridge device
destruction since the local fdbs would've been already flushed by
br_dev_delete(). This is not an issue for ports since nbp_vlan_init() is
called last when adding a port thus nothing can fail after it.
Reported-by: syzbot+88533dc8b582309bf3ee@syzkaller.appspotmail.com
Fixes: 5be5a2df40f0 ("bridge: Add filtering support for default_pvid")
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
---
Tested with the provided reproducer and can no longer trigger the leak.
Also tested the br_multicast_init() failure manually by making it always
return an error.
net/bridge/br_vlan.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 021cc9f66804..3e6a702e4c21 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -715,6 +715,11 @@ void br_vlan_flush(struct net_bridge *br)
ASSERT_RTNL();
+ /* delete auto-added default pvid local fdbs before flushing vlans
+ * otherwise these will be leaked on bridge device init failure
+ */
+ br_fdb_delete_by_port(br, NULL, 0, 1);
+
vg = br_vlan_group(br);
__vlan_flush(vg);
RCU_INIT_POINTER(br->vlgrp, NULL);
--
2.21.0
^ permalink raw reply related
* [PATCH net v2] mvpp2: refactor the HW checksum setup
From: Matteo Croce @ 2019-07-28 17:35 UTC (permalink / raw)
To: netdev
Cc: Antoine Tenart, Maxime Chevallier, Marcin Wojtas, Stefan Chulski,
LKML, David Miller
The hardware can only offload checksum calculation on first port due to
the Tx FIFO size limitation, and has a maximum L3 offset of 128 bytes.
Document this in a comment and move duplicated code in a function.
Fixes: 576193f2d579 ("net: mvpp2: jumbo frames support")
Signed-off-by: Matteo Croce <mcroce@redhat.com>
---
.../net/ethernet/marvell/mvpp2/mvpp2_main.c | 35 ++++++++++++-------
1 file changed, 22 insertions(+), 13 deletions(-)
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index 937e4b928b94..a99405135046 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -811,6 +811,26 @@ static int mvpp2_swf_bm_pool_init(struct mvpp2_port *port)
return 0;
}
+static void mvpp2_set_hw_csum(struct mvpp2_port *port,
+ enum mvpp2_bm_pool_log_num new_long_pool)
+{
+ const netdev_features_t csums = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+
+ /* Update L4 checksum when jumbo enable/disable on port.
+ * Only port 0 supports hardware checksum offload due to
+ * the Tx FIFO size limitation.
+ * Also, don't set NETIF_F_HW_CSUM because L3_offset in TX descriptor
+ * has 7 bits, so the maximum L3 offset is 128.
+ */
+ if (new_long_pool == MVPP2_BM_JUMBO && port->id != 0) {
+ port->dev->features &= ~csums;
+ port->dev->hw_features &= ~csums;
+ } else {
+ port->dev->features |= csums;
+ port->dev->hw_features |= csums;
+ }
+}
+
static int mvpp2_bm_update_mtu(struct net_device *dev, int mtu)
{
struct mvpp2_port *port = netdev_priv(dev);
@@ -843,15 +863,7 @@ static int mvpp2_bm_update_mtu(struct net_device *dev, int mtu)
/* Add port to new short & long pool */
mvpp2_swf_bm_pool_init(port);
- /* Update L4 checksum when jumbo enable/disable on port */
- if (new_long_pool == MVPP2_BM_JUMBO && port->id != 0) {
- dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
- dev->hw_features &= ~(NETIF_F_IP_CSUM |
- NETIF_F_IPV6_CSUM);
- } else {
- dev->features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
- dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
- }
+ mvpp2_set_hw_csum(port, new_long_pool);
}
dev->mtu = mtu;
@@ -5209,10 +5221,7 @@ static int mvpp2_port_probe(struct platform_device *pdev,
dev->features |= NETIF_F_NTUPLE;
}
- if (port->pool_long->id == MVPP2_BM_JUMBO && port->id != 0) {
- dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
- dev->hw_features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
- }
+ mvpp2_set_hw_csum(port, port->pool_long->id);
dev->vlan_features |= features;
dev->gso_max_segs = MVPP2_MAX_TSO_SEGS;
--
2.21.0
^ permalink raw reply related
* Re: memory leak in fdb_create
From: Nikolay Aleksandrov @ 2019-07-28 16:51 UTC (permalink / raw)
To: syzbot, bridge, bsingharora, coreteam, davem, duwe, kaber, kadlec,
linux-kernel, mingo, mpe, netdev, netfilter-devel, pablo, roopa,
rostedt, syzkaller-bugs
In-Reply-To: <0000000000008be1b2058ebe7805@google.com>
On 28/07/2019 17:20, syzbot wrote:
> syzbot has bisected this bug to:
>
> commit 04cf31a759ef575f750a63777cee95500e410994
> Author: Michael Ellerman <mpe@ellerman.id.au>
> Date: Thu Mar 24 11:04:01 2016 +0000
>
> ftrace: Make ftrace_location_range() global
>
> bisection log: https://syzkaller.appspot.com/x/bisect.txt?x=1538c778600000
> start commit: abf02e29 Merge tag 'pm-5.2-rc6' of git://git.kernel.org/pu..
> git tree: upstream
> final crash: https://syzkaller.appspot.com/x/report.txt?x=1738c778600000
> console output: https://syzkaller.appspot.com/x/log.txt?x=1338c778600000
> kernel config: https://syzkaller.appspot.com/x/.config?x=56f1da14935c3cce
> dashboard link: https://syzkaller.appspot.com/bug?extid=88533dc8b582309bf3ee
> syz repro: https://syzkaller.appspot.com/x/repro.syz?x=16de5c06a00000
> C reproducer: https://syzkaller.appspot.com/x/repro.c?x=10546026a00000
>
> Reported-by: syzbot+88533dc8b582309bf3ee@syzkaller.appspotmail.com
> Fixes: 04cf31a759ef ("ftrace: Make ftrace_location_range() global")
>
> For information about bisection process see: https://goo.gl/tpsmEJ#bisection
I see the problem: if the multicast stats memory allocation fails on bridge
init, the fdb added due to the default vlan would remain and the bridge kmem cache
would be destroyed while not empty (you can even trigger a BUG because of that).
I'll post a patch shortly after running a few tests.
Thanks,
Nik
^ permalink raw reply
* Re: ip route JSON format is unparseable for "unreachable" routes
From: Stephen Hemminger @ 2019-07-28 16:15 UTC (permalink / raw)
To: Michael Ziegler; +Cc: netdev
In-Reply-To: <6e88311b-5edc-4c62-1581-0f5b160a5f4e@michaelziegler.name>
On Sun, 28 Jul 2019 13:09:55 +0200
Michael Ziegler <ich@michaelziegler.name> wrote:
> Hi,
>
> I created a couple "unreachable" routes on one of my systems, like such:
>
> > ip route add unreachable 10.0.0.0/8 metric 255
> > ip route add unreachable 192.168.0.0/16 metric 255
>
> Unfortunately this results in unparseable JSON output from "ip":
>
> > # ip -j route show | jq .
> > parse error: Objects must consist of key:value pairs at line 1, column 84
>
> The offending JSON objects are these:
>
> > {"unreachable","dst":"10.0.0.0/8","metric":255,"flags":[]}
> > {"unreachable","dst":"192.168.0.0/16","metric":255,"flags":[]}
> "unreachable" cannot appear on its own here, it needs to be some kind of
> field.
>
> The manpage says to report here, thus I do :) I've searched the
> archives, but I wasn't able to find any existing bug reports about this.
> I'm running version
>
> > ip utility, iproute2-ss190107
>
> on Debian Buster.
>
> Regards,
> Michael.
Already fixed upstream by:
commit 073661773872709518d35d4d093f3a715281f21d
Author: Matteo Croce <mcroce@redhat.com>
Date: Mon Mar 18 18:19:29 2019 +0100
ip route: print route type in JSON output
ip route generates an invalid JSON if the route type has to be printed,
e.g. when detailed mode is active, or the type is different than unicast:
$ ip -d -j -p route show
[ {"unicast",
"dst": "192.168.122.0/24",
"dev": "virbr0",
"protocol": "kernel",
"scope": "link",
"prefsrc": "192.168.122.1",
"flags": [ "linkdown" ]
} ]
$ ip -j -p route show
[ {"unreachable",
"dst": "192.168.23.0/24",
"flags": [ ]
},{"prohibit",
"dst": "192.168.24.0/24",
"flags": [ ]
},{"blackhole",
"dst": "192.168.25.0/24",
"flags": [ ]
} ]
Fix it by printing the route type as the "type" attribute:
$ ip -d -j -p route show
[ {
"type": "unicast",
"dst": "default",
"gateway": "192.168.85.1",
"dev": "wlp3s0",
"protocol": "dhcp",
"scope": "global",
"metric": 600,
"flags": [ ]
},{
"type": "unreachable",
"dst": "192.168.23.0/24",
"protocol": "boot",
"scope": "global",
"flags": [ ]
},{
"type": "prohibit",
"dst": "192.168.24.0/24",
"protocol": "boot",
"scope": "global",
"flags": [ ]
},{
"type": "blackhole",
"dst": "192.168.25.0/24",
"protocol": "boot",
"scope": "global",
"flags": [ ]
} ]
Fixes: 663c3cb23103 ("iproute: implement JSON and color output")
Acked-by: Phil Sutter <phil@nwl.cc>
Reviewed-and-tested-by: Andrea Claudi <aclaudi@redhat.com>
Signed-off-by: Matteo Croce <mcroce@redhat.com>
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox