* [PATCH net 4/9] net: hns3: Fix for not setting rx private buffer size to zero
From: Yunsheng Lin @ 2017-09-20 10:52 UTC (permalink / raw)
To: davem
Cc: huangdaode, xuwei5, liguozhu, Yisen.Zhuang, gabriele.paoloni,
john.garry, linuxarm, yisen.zhuang, salil.mehta, lipeng321,
netdev, linux-kernel
In-Reply-To: <1505904778-53217-1-git-send-email-linyunsheng@huawei.com>
When the rx private buffer is disabled, there are cases in which
the rx private buffer size is not set to zero, which may cause the
buffer allocation process to fail.
This patch fixes this problem by setting priv->enable to 0 and
priv->buf_size to zero when rx private buffer is disabled.
Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support")
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 1876418..bf3179a 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -1504,6 +1504,11 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev, u32 tx_size)
priv->wl.high = 2 * hdev->mps;
priv->buf_size = priv->wl.high;
}
+ } else {
+ priv->enable = 0;
+ priv->wl.low = 0;
+ priv->wl.high = 0;
+ priv->buf_size = 0;
}
}
@@ -1516,8 +1521,15 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev, u32 tx_size)
for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
priv = &hdev->priv_buf[i];
- if (hdev->hw_tc_map & BIT(i))
- priv->enable = 1;
+ priv->enable = 0;
+ priv->wl.low = 0;
+ priv->wl.high = 0;
+ priv->buf_size = 0;
+
+ if (!(hdev->hw_tc_map & BIT(i)))
+ continue;
+
+ priv->enable = 1;
if (hdev->tm_info.hw_pfc_map & BIT(i)) {
priv->wl.low = 128;
--
1.9.1
^ permalink raw reply related
* [PATCH net 3/9] net: hns3: Fix for DEFAULT_DV when dev doesn't support DCB
From: Yunsheng Lin @ 2017-09-20 10:52 UTC (permalink / raw)
To: davem
Cc: huangdaode, xuwei5, liguozhu, Yisen.Zhuang, gabriele.paoloni,
john.garry, linuxarm, yisen.zhuang, salil.mehta, lipeng321,
netdev, linux-kernel
In-Reply-To: <1505904778-53217-1-git-send-email-linyunsheng@huawei.com>
When ae_dev doesn't support DCB, DEFAULT_DV must be set to
a lower value, otherwise the buffer allocation process will
fail.
This patch fixes it by setting it to 30K bytes.
Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support")
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h | 1 +
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 6 +++++-
2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
index c2b613b..30e2ad5 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
@@ -688,6 +688,7 @@ struct hclge_reset_tqp_queue {
#define HCLGE_DEFAULT_TX_BUF 0x4000 /* 16k bytes */
#define HCLGE_TOTAL_PKT_BUF 0x108000 /* 1.03125M bytes */
#define HCLGE_DEFAULT_DV 0xA000 /* 40k byte */
+#define HCLGE_DEFAULT_NON_DCB_DV 0x7800 /* 30K byte */
#define HCLGE_TYPE_CRQ 0
#define HCLGE_TYPE_CSQ 1
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index c515b84..1876418 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -1444,7 +1444,11 @@ static bool hclge_is_rx_buf_ok(struct hclge_dev *hdev, u32 rx_all)
tc_num = hclge_get_tc_num(hdev);
pfc_enable_num = hclge_get_pfc_enalbe_num(hdev);
- shared_buf_min = 2 * hdev->mps + HCLGE_DEFAULT_DV;
+ if (hnae3_dev_dcb_supported(hdev))
+ shared_buf_min = 2 * hdev->mps + HCLGE_DEFAULT_DV;
+ else
+ shared_buf_min = 2 * hdev->mps + HCLGE_DEFAULT_NON_DCB_DV;
+
shared_buf_tc = pfc_enable_num * hdev->mps +
(tc_num - pfc_enable_num) * hdev->mps / 2 +
hdev->mps;
--
1.9.1
^ permalink raw reply related
* [PATCH net 2/9] net: hns3: Fix initialization when cmd is not supported
From: Yunsheng Lin @ 2017-09-20 10:52 UTC (permalink / raw)
To: davem
Cc: huangdaode, xuwei5, liguozhu, Yisen.Zhuang, gabriele.paoloni,
john.garry, linuxarm, yisen.zhuang, salil.mehta, lipeng321,
netdev, linux-kernel
In-Reply-To: <1505904778-53217-1-git-send-email-linyunsheng@huawei.com>
When ae_dev doesn't support DCB, rx_priv_wl_config,
common_thrd_config and tm_qs_bp_cfg must not be called; otherwise
the cmd fails, which causes the hclge module initialization
process to fail.
This patch fixes it by adding a DCB capability flag to check whether
the ae_dev supports DCB.
Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support")
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
drivers/net/ethernet/hisilicon/hns3/hnae3.h | 7 ++++++
.../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 26 +++++++++++++---------
.../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 4 ++++
.../net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c | 10 ++++-----
4 files changed, 31 insertions(+), 16 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index 0f7b61a..ad685f5 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -50,10 +50,17 @@
#define HNAE3_DEV_INITED_B 0x0
#define HNAE3_DEV_SUPPORT_ROCE_B 0x1
+#define HNAE3_DEV_SUPPORT_DCB_B 0x2
+
+#define HNAE3_DEV_SUPPORT_ROCE_DCB_BITS (BIT(HNAE3_DEV_SUPPORT_DCB_B) |\
+ BIT(HNAE3_DEV_SUPPORT_ROCE_B))
#define hnae3_dev_roce_supported(hdev) \
hnae_get_bit(hdev->ae_dev->flag, HNAE3_DEV_SUPPORT_ROCE_B)
+#define hnae3_dev_dcb_supported(hdev) \
+ hnae_get_bit(hdev->ae_dev->flag, HNAE3_DEV_SUPPORT_DCB_B)
+
#define ring_ptr_move_fw(ring, p) \
((ring)->p = ((ring)->p + 1) % (ring)->desc_num)
#define ring_ptr_move_bw(ring, p) \
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index eb78c23..c515b84 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -1772,18 +1772,22 @@ int hclge_buffer_alloc(struct hclge_dev *hdev)
return ret;
}
- ret = hclge_rx_priv_wl_config(hdev);
- if (ret) {
- dev_err(&hdev->pdev->dev,
- "could not configure rx private waterline %d\n", ret);
- return ret;
- }
+ if (hnae3_dev_dcb_supported(hdev)) {
+ ret = hclge_rx_priv_wl_config(hdev);
+ if (ret) {
+ dev_err(&hdev->pdev->dev,
+ "could not configure rx private waterline %d\n",
+ ret);
+ return ret;
+ }
- ret = hclge_common_thrd_config(hdev);
- if (ret) {
- dev_err(&hdev->pdev->dev,
- "could not configure common threshold %d\n", ret);
- return ret;
+ ret = hclge_common_thrd_config(hdev);
+ if (ret) {
+ dev_err(&hdev->pdev->dev,
+ "could not configure common threshold %d\n",
+ ret);
+ return ret;
+ }
}
ret = hclge_common_wl_config(hdev);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
index 1c577d2..c91dbf1 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
@@ -976,6 +976,10 @@ int hclge_pause_setup_hw(struct hclge_dev *hdev)
if (ret)
return ret;
+ /* Only DCB-supported dev supports qset back pressure setting */
+ if (!hnae3_dev_dcb_supported(hdev))
+ return 0;
+
for (i = 0; i < hdev->tm_info.num_tc; i++) {
ret = hclge_tm_qs_bp_cfg(hdev, i);
if (ret)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c
index 94d8bb5..35369e1 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c
@@ -42,15 +42,15 @@
{PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_GE), 0},
{PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE), 0},
{PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA),
- BIT(HNAE3_DEV_SUPPORT_ROCE_B)},
+ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS},
{PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA_MACSEC),
- BIT(HNAE3_DEV_SUPPORT_ROCE_B)},
+ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS},
{PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA),
- BIT(HNAE3_DEV_SUPPORT_ROCE_B)},
+ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS},
{PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA_MACSEC),
- BIT(HNAE3_DEV_SUPPORT_ROCE_B)},
+ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS},
{PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC),
- BIT(HNAE3_DEV_SUPPORT_ROCE_B)},
+ HNAE3_DEV_SUPPORT_ROCE_DCB_BITS},
/* required last entry */
{0, }
};
--
1.9.1
^ permalink raw reply related
* [PATCH net 1/9] net: hns3: Cleanup for ROCE capability flag in ae_dev
From: Yunsheng Lin @ 2017-09-20 10:52 UTC (permalink / raw)
To: davem
Cc: huangdaode, xuwei5, liguozhu, Yisen.Zhuang, gabriele.paoloni,
john.garry, linuxarm, yisen.zhuang, salil.mehta, lipeng321,
netdev, linux-kernel
In-Reply-To: <1505904778-53217-1-git-send-email-linyunsheng@huawei.com>
This patch adds the ROCE supported flag to the driver_data
field of pci_device_id, deletes roce_pci_tbl and changes
HNAE_DEV_SUPPORT_ROCE_B to HNAE3_DEV_SUPPORT_ROCE_B.
This cleanup is done in order to support adding capability
in pci_device_id and to fix initialization failure when
cmd is not supported.
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
drivers/net/ethernet/hisilicon/hns3/hnae3.h | 5 ++++-
.../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 25 ++++------------------
.../net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c | 16 +++++++++-----
3 files changed, 19 insertions(+), 27 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index b2f28ae..0f7b61a 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -49,7 +49,10 @@
#define HNAE3_CLASS_NAME_SIZE 16
#define HNAE3_DEV_INITED_B 0x0
-#define HNAE_DEV_SUPPORT_ROCE_B 0x1
+#define HNAE3_DEV_SUPPORT_ROCE_B 0x1
+
+#define hnae3_dev_roce_supported(hdev) \
+ hnae_get_bit(hdev->ae_dev->flag, HNAE3_DEV_SUPPORT_ROCE_B)
#define ring_ptr_move_fw(ring, p) \
((ring)->p = ((ring)->p + 1) % (ring)->desc_num)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 44c722a..eb78c23 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -46,17 +46,7 @@ static int hclge_set_mta_filter_mode(struct hclge_dev *hdev,
{PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA), 0},
{PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA_MACSEC), 0},
{PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC), 0},
- /* Required last entry */
- {0, }
-};
-
-static const struct pci_device_id roce_pci_tbl[] = {
- {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA), 0},
- {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA_MACSEC), 0},
- {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA), 0},
- {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA_MACSEC), 0},
- {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC), 0},
- /* Required last entry */
+ /* required last entry */
{0, }
};
@@ -894,7 +884,7 @@ static int hclge_query_pf_resource(struct hclge_dev *hdev)
hdev->num_tqps = __le16_to_cpu(req->tqp_num);
hdev->pkt_buf_size = __le16_to_cpu(req->buf_size) << HCLGE_BUF_UNIT_S;
- if (hnae_get_bit(hdev->ae_dev->flag, HNAE_DEV_SUPPORT_ROCE_B)) {
+ if (hnae3_dev_roce_supported(hdev)) {
hdev->num_roce_msix =
hnae_get_field(__le16_to_cpu(req->pf_intr_vector_number),
HCLGE_PF_VEC_NUM_M, HCLGE_PF_VEC_NUM_S);
@@ -3931,8 +3921,7 @@ static int hclge_init_client_instance(struct hnae3_client *client,
goto err;
if (hdev->roce_client &&
- hnae_get_bit(hdev->ae_dev->flag,
- HNAE_DEV_SUPPORT_ROCE_B)) {
+ hnae3_dev_roce_supported(hdev)) {
struct hnae3_client *rc = hdev->roce_client;
ret = hclge_init_roce_base_info(vport);
@@ -3955,8 +3944,7 @@ static int hclge_init_client_instance(struct hnae3_client *client,
break;
case HNAE3_CLIENT_ROCE:
- if (hnae_get_bit(hdev->ae_dev->flag,
- HNAE_DEV_SUPPORT_ROCE_B)) {
+ if (hnae3_dev_roce_supported(hdev)) {
hdev->roce_client = client;
vport->roce.client = client;
}
@@ -4068,7 +4056,6 @@ static void hclge_pci_uninit(struct hclge_dev *hdev)
static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
{
struct pci_dev *pdev = ae_dev->pdev;
- const struct pci_device_id *id;
struct hclge_dev *hdev;
int ret;
@@ -4083,10 +4070,6 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
hdev->ae_dev = ae_dev;
ae_dev->priv = hdev;
- id = pci_match_id(roce_pci_tbl, ae_dev->pdev);
- if (id)
- hnae_set_bit(ae_dev->flag, HNAE_DEV_SUPPORT_ROCE_B, 1);
-
ret = hclge_pci_init(hdev);
if (ret) {
dev_err(&pdev->dev, "PCI init failed\n");
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c
index 4d68d6e..94d8bb5 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c
@@ -41,11 +41,16 @@
static const struct pci_device_id hns3_pci_tbl[] = {
{PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_GE), 0},
{PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE), 0},
- {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA), 0},
- {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA_MACSEC), 0},
- {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA), 0},
- {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA_MACSEC), 0},
- {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC), 0},
+ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA),
+ BIT(HNAE3_DEV_SUPPORT_ROCE_B)},
+ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA_MACSEC),
+ BIT(HNAE3_DEV_SUPPORT_ROCE_B)},
+ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA),
+ BIT(HNAE3_DEV_SUPPORT_ROCE_B)},
+ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA_MACSEC),
+ BIT(HNAE3_DEV_SUPPORT_ROCE_B)},
+ {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC),
+ BIT(HNAE3_DEV_SUPPORT_ROCE_B)},
/* required last entry */
{0, }
};
@@ -1348,6 +1353,7 @@ static int hns3_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
}
ae_dev->pdev = pdev;
+ ae_dev->flag = ent->driver_data;
ae_dev->dev_type = HNAE3_DEV_KNIC;
pci_set_drvdata(pdev, ae_dev);
--
1.9.1
^ permalink raw reply related
* [PATCH net 0/9] TM related bugfixes for the HNS3 Ethernet Driver
From: Yunsheng Lin @ 2017-09-20 10:52 UTC (permalink / raw)
To: davem
Cc: huangdaode, xuwei5, liguozhu, Yisen.Zhuang, gabriele.paoloni,
john.garry, linuxarm, yisen.zhuang, salil.mehta, lipeng321,
netdev, linux-kernel
This patch set contains a few bugfixes related to hclge_tm module.
Yunsheng Lin (9):
net: hns3: Cleanup for ROCE capability flag in ae_dev
net: hns3: Fix initialization when cmd is not supported
net: hns3: Fix for DEFAULT_DV when dev doesn't support DCB
net: hns3: Fix for not setting rx private buffer size to zero
net: hns3: Fix for rx_priv_buf_alloc not setting rx shared buffer
net: hns3: Fix for rx priv buf allocation when DCB is not supported
net: hns3: Fix typo error for feild in hclge_tm
net: hns3: Fix for setting rss_size incorrectly
net: hns3: Fix for pri to tc mapping in TM
drivers/net/ethernet/hisilicon/hns3/hnae3.h | 15 +-
.../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h | 4 +-
.../ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 163 +++++++++++----------
.../ethernet/hisilicon/hns3/hns3pf/hclge_main.h | 3 +-
.../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 41 +++---
.../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h | 4 +-
.../net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c | 16 +-
7 files changed, 143 insertions(+), 103 deletions(-)
--
1.9.1
^ permalink raw reply
* [PATCH 1/2] netfilter: nat: Do not use ARRAY_SIZE() on spinlocks to fix zero div
From: Pablo Neira Ayuso @ 2017-09-20 10:49 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1505904543-10004-1-git-send-email-pablo@netfilter.org>
From: Geert Uytterhoeven <geert@linux-m68k.org>
If no spinlock debugging options (CONFIG_GENERIC_LOCKBREAK,
CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_LOCK_ALLOC) are enabled on a UP
platform (e.g. m68k defconfig), arch_spinlock_t is an empty struct,
hence using ARRAY_SIZE(nf_nat_locks) causes a division by zero:
net/netfilter/nf_nat_core.c: In function ‘nf_nat_setup_info’:
net/netfilter/nf_nat_core.c:432: warning: division by zero
net/netfilter/nf_nat_core.c: In function ‘__nf_nat_cleanup_conntrack’:
net/netfilter/nf_nat_core.c:535: warning: division by zero
net/netfilter/nf_nat_core.c:537: warning: division by zero
net/netfilter/nf_nat_core.c: In function ‘nf_nat_init’:
net/netfilter/nf_nat_core.c:810: warning: division by zero
net/netfilter/nf_nat_core.c:811: warning: division by zero
net/netfilter/nf_nat_core.c:824: warning: division by zero
Fix this by using the CONNTRACK_LOCKS definition instead.
Suggested-by: Florian Westphal <fw@strlen.de>
Fixes: 8073e960a03bf7b5 ("netfilter: nat: use keyed locks")
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nf_nat_core.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index f393a7086025..af8345fc4fbd 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -429,7 +429,7 @@ nf_nat_setup_info(struct nf_conn *ct,
srchash = hash_by_src(net,
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
- lock = &nf_nat_locks[srchash % ARRAY_SIZE(nf_nat_locks)];
+ lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
spin_lock_bh(lock);
hlist_add_head_rcu(&ct->nat_bysource,
&nf_nat_bysource[srchash]);
@@ -532,9 +532,9 @@ static void __nf_nat_cleanup_conntrack(struct nf_conn *ct)
unsigned int h;
h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
- spin_lock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]);
+ spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
hlist_del_rcu(&ct->nat_bysource);
- spin_unlock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]);
+ spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
}
static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
@@ -807,8 +807,8 @@ static int __init nf_nat_init(void)
/* Leave them the same for the moment. */
nf_nat_htable_size = nf_conntrack_htable_size;
- if (nf_nat_htable_size < ARRAY_SIZE(nf_nat_locks))
- nf_nat_htable_size = ARRAY_SIZE(nf_nat_locks);
+ if (nf_nat_htable_size < CONNTRACK_LOCKS)
+ nf_nat_htable_size = CONNTRACK_LOCKS;
nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
if (!nf_nat_bysource)
@@ -821,7 +821,7 @@ static int __init nf_nat_init(void)
return ret;
}
- for (i = 0; i < ARRAY_SIZE(nf_nat_locks); i++)
+ for (i = 0; i < CONNTRACK_LOCKS; i++)
spin_lock_init(&nf_nat_locks[i]);
nf_ct_helper_expectfn_register(&follow_master_nat);
--
2.1.4
^ permalink raw reply related
* [PATCH 0/2] Netfilter fixes for net
From: Pablo Neira Ayuso @ 2017-09-20 10:49 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
Hi David,
The following patchset contains two Netfilter fixes for your net tree,
they are:
1) Fix NAT compilation with UP, from Geert Uytterhoeven.
2) Fix incorrect number of entries when dumping a set, from
Vishwanath Pai.
You can pull these changes from:
git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git
Thanks!
----------------------------------------------------------------
The following changes since commit 2bd6bf03f4c1c59381d62c61d03f6cc3fe71f66e:
Linux 4.14-rc1 (2017-09-16 15:47:51 -0700)
are available in the git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git HEAD
for you to fetch changes up to 7f4f7dd4417d9efd038b14d39c70170db2e0baa0:
netfilter: ipset: ipset list may return wrong member count for set with timeout (2017-09-18 17:35:32 +0200)
----------------------------------------------------------------
Geert Uytterhoeven (1):
netfilter: nat: Do not use ARRAY_SIZE() on spinlocks to fix zero div
Vishwanath Pai (1):
netfilter: ipset: ipset list may return wrong member count for set with timeout
net/netfilter/ipset/ip_set_hash_gen.h | 14 +++++++++++++-
net/netfilter/nf_nat_core.c | 12 ++++++------
2 files changed, 19 insertions(+), 7 deletions(-)
^ permalink raw reply
* [PATCH 2/2] netfilter: ipset: ipset list may return wrong member count for set with timeout
From: Pablo Neira Ayuso @ 2017-09-20 10:49 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1505904543-10004-1-git-send-email-pablo@netfilter.org>
From: Vishwanath Pai <vpai@akamai.com>
Simple testcase:
$ ipset create test hash:ip timeout 5
$ ipset add test 1.2.3.4
$ ipset add test 1.2.2.2
$ sleep 5
$ ipset l
Name: test
Type: hash:ip
Revision: 5
Header: family inet hashsize 1024 maxelem 65536 timeout 5
Size in memory: 296
References: 0
Number of entries: 2
Members:
We return "Number of entries: 2" but no members are listed. That is
because mtype_list runs "ip_set_timeout_expired" and does not list the
expired entries, but set->elements is never updated (until mtype_gc
cleans it up later).
Reviewed-by: Joshua Hunt <johunt@akamai.com>
Signed-off-by: Vishwanath Pai <vpai@akamai.com>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/ipset/ip_set_hash_gen.h | 14 +++++++++++++-
1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index f236c0bc7b3f..51063d9ed0f7 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -1041,12 +1041,24 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
static int
mtype_head(struct ip_set *set, struct sk_buff *skb)
{
- const struct htype *h = set->data;
+ struct htype *h = set->data;
const struct htable *t;
struct nlattr *nested;
size_t memsize;
u8 htable_bits;
+ /* If any members have expired, set->elements will be wrong
+ * mytype_expire function will update it with the right count.
+ * we do not hold set->lock here, so grab it first.
+ * set->elements can still be incorrect in the case of a huge set,
+ * because elements might time out during the listing.
+ */
+ if (SET_WITH_TIMEOUT(set)) {
+ spin_lock_bh(&set->lock);
+ mtype_expire(set, h);
+ spin_unlock_bh(&set->lock);
+ }
+
rcu_read_lock_bh();
t = rcu_dereference_bh_nfnl(h->table);
memsize = mtype_ahash_memsize(h, t) + set->ext_size;
--
2.1.4
^ permalink raw reply related
* Re: [PATCH v2 1/2] mac80211: Add rcu read side critical sections
From: Johannes Berg @ 2017-09-20 10:39 UTC (permalink / raw)
To: Ville Syrjala, linux-wireless-u79uwXL29TY76Z2rM5mHXA
Cc: David S. Miller, netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20170920101123.23312-1-ville.syrjala-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
On Wed, 2017-09-20 at 13:11 +0300, Ville Syrjala wrote:
> --- a/net/mac80211/tx.c
> +++ b/net/mac80211/tx.c
> @@ -1770,15 +1770,21 @@ bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
> struct ieee80211_tx_data tx;
> struct sk_buff *skb2;
>
> - if (ieee80211_tx_prepare(sdata, &tx, NULL, skb) == TX_DROP)
> + rcu_read_lock();
The documentation says:
/**
* ieee80211_tx_prepare_skb - prepare an 802.11 skb for transmission
* @hw: pointer as obtained from ieee80211_alloc_hw()
* @vif: virtual interface
* @skb: frame to be sent from within the driver
* @band: the band to transmit on
* @sta: optional pointer to get the station to send the frame to
*
* Note: must be called under RCU lock
*/
You can't even argue that it should be the function itself doing it,
because the (admittedly optional) sta pointer would otherwise not have
proper protection after you leave the function ... You can't pass out a
sta pointer that's RCU protected.
Side note: Perhaps some annotation should be there? not sure it's
possible - would have to be something like
struct ieee80211_sta * __rcu *sta;
I guess since the outer pointer isn't protected, only the inner ...
Therefore, this patch is wrong.
I actually think the same is true for ieee80211_tx_dequeue(), but I'm
less sure about it - the sta pointer there clearly is somehow safely
passed in (even if it's w/o RCU, the driver can potentially make that
safe), but the key pointer seems unsafe in this case (as well) if
there's no outer RCU protection.
johannes
^ permalink raw reply
* Re: [PATCH] net: ethernet: aquantia: default to no in config
From: Sergei Shtylyov @ 2017-09-20 10:28 UTC (permalink / raw)
To: Vito Caputo, linux-kernel; +Cc: netdev
In-Reply-To: <20170919224315.GA17797@shells.gnugeneration.com>
Hello!
On 9/20/2017 1:43 AM, Vito Caputo wrote:
> NET_VENDOR_AQUANTIA was "default y" for some reason, which seems
> obviously inappropriate.
> ---
> drivers/net/ethernet/aquantia/Kconfig | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/net/ethernet/aquantia/Kconfig b/drivers/net/ethernet/aquantia/Kconfig
> index cdf78e069a39..6167b13cf349 100644
> --- a/drivers/net/ethernet/aquantia/Kconfig
> +++ b/drivers/net/ethernet/aquantia/Kconfig
> @@ -4,7 +4,7 @@
>
> config NET_VENDOR_AQUANTIA
> bool "aQuantia devices"
> - default y
> + default n
Just remove it -- 'n' is the default default. :-)
[...]
MBR, Sergei
^ permalink raw reply
* Re: Latest net-next from GIT panic
From: Paweł Staszewski @ 2017-09-20 10:22 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Linux Kernel Network Developers
In-Reply-To: <6c073f86-ab71-0a8f-7b9a-91d5ae5da214@itcare.pl>
So far bisected and marked:
git bisect start
# bad: [07dd6cc1fff160143e82cf5df78c1db0b6e03355] Linux 4.13.2
git bisect bad 07dd6cc1fff160143e82cf5df78c1db0b6e03355
# good: [5d7d2e03e0f01a992e3521b180c3d3e67905f269] Linux 4.12.13
git bisect good 5d7d2e03e0f01a992e3521b180c3d3e67905f269
# good: [6f7da290413ba713f0cdd9ff1a2a9bb129ef4f6c] Linux 4.12
git bisect good 6f7da290413ba713f0cdd9ff1a2a9bb129ef4f6c
# bad: [ac7b75966c9c86426b55fe1c50ae148aa4571075] Merge tag
'pinctrl-v4.13-1' of
git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-pinctrl
git bisect bad ac7b75966c9c86426b55fe1c50ae148aa4571075
# good: [e24dd9ee5399747b71c1d982a484fc7601795f31] Merge branch 'next'
of git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/linux-security
git bisect good e24dd9ee5399747b71c1d982a484fc7601795f31
# good: [e24dd9ee5399747b71c1d982a484fc7601795f31] Merge branch 'next'
of git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/linux-security
git bisect good e24dd9ee5399747b71c1d982a484fc7601795f31
# good: [e24dd9ee5399747b71c1d982a484fc7601795f31] Merge branch 'next'
of git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/linux-security
git bisect good e24dd9ee5399747b71c1d982a484fc7601795f31
W dniu 2017-09-20 o 12:21, Paweł Staszewski pisze:
> OK, the kernel crashed with a different panic that I didn't catch when I was
> doing the bisect, and now my bisection is broken :)
>
> git bisect good
> Bisecting: 1787 revisions left to test after this (roughly 11 steps)
> error: Your local changes to the following files would be overwritten
> by checkout:
> Documentation/00-INDEX
> Documentation/ABI/stable/sysfs-class-udc
> Documentation/ABI/testing/configfs-usb-gadget-uac1
> Documentation/ABI/testing/ima_policy
> Documentation/ABI/testing/sysfs-bus-iio
> Documentation/ABI/testing/sysfs-bus-iio-meas-spec
> Documentation/ABI/testing/sysfs-bus-iio-timer-stm32
> Documentation/ABI/testing/sysfs-class-net
> Documentation/ABI/testing/sysfs-class-power-twl4030
> Documentation/ABI/testing/sysfs-class-typec
> Documentation/DMA-API.txt
> Documentation/IRQ-domain.txt
> Documentation/Makefile
> Documentation/PCI/MSI-HOWTO.txt
> Documentation/RCU/00-INDEX
> Documentation/RCU/Design/Requirements/Requirements.html
> Documentation/RCU/checklist.txt
> Documentation/admin-guide/README.rst
> Documentation/admin-guide/devices.txt
> Documentation/admin-guide/index.rst
> Documentation/admin-guide/kernel-parameters.txt
> Documentation/admin-guide/pm/cpufreq.rst
> Documentation/admin-guide/pm/intel_pstate.rst
> Documentation/admin-guide/ras.rst
> Documentation/arm/Atmel/README
> Documentation/block/biodoc.txt
> Documentation/conf.py
> Documentation/core-api/assoc_array.rst
> Documentation/core-api/atomic_ops.rst
> Documentation/core-api/index.rst
> Documentation/crypto/asymmetric-keys.txt
> Documentation/dev-tools/index.rst
> Documentation/dev-tools/sparse.rst
> Documentation/devicetree/bindings/arm/amlogic.txt
> Documentation/devicetree/bindings/arm/atmel-at91.txt
> Documentation/devicetree/bindings/arm/ccn.txt
> Documentation/devicetree/bindings/arm/cpus.txt
> Documentation/devicetree/bindings/arm/gemini.txt
> Documentation/devicetree/bindings/arm/hisilicon/hisilicon.txt
> Documentation/devicetree/bindings/arm/keystone/keystone.txt
> Documentation/devicetree/bindings/arm/mediatek.txt
> Documentation/devicetree/bindings/arm/rockchip.txt
> Documentation/devicetree/bindings/arm/shmobile.txt
> Documentation/devicetree/bindings/arm/tegra.txt
> Documentation/devicetree/bindings/ata/ahci-fsl-qoriq.txt
> Documentation/devicetree/bindings/bus/brcm,gisb-arb.txt
> Documentation/devicetree/bindings/clock/brcm,iproc-clocks.txt
> Documentation/devicetree/bindings/cpufreq/ti-cpufreq.txt
> Documentation/devicetree/bindings/gpio/gpio_atmel.txt
> Documentation/devicetree/bindings/iio/adc/amlogic,meson-saradc.txt
> Documentation/devicetree/bindings/iio/adc/renesas,gyroadc.txt
> Documentation/devicetree/bindings/iio/adc/st,stm32-adc.txt
> Documentation/devicetree/bindings/iio/imu/st_lsm6dsx.txt
> Documentation/devicetree/bindings/interrupt-controller/allwinner,sunxi-nmi.txt
>
> Documentation/devicetree/bindings/interrupt-controller/aspeed,ast2400-vic.txt
>
> Documentation/devicetree/bindings/interrupt-controller/mediatek,sysirq.txt
>
> Documentation/devicetree/bindings/leds/common.txt
> Documentation/devicetree/bindings/mfd/hi6421.txt
> Documentation/devicetree/bindings/mfd/tps65910.txt
> Documentation/devicetree/bindings/mmc/fsl-esdhc.txt
> Documentation/devicetree/bindings/mmc/k3-dw-mshc.txt
> Documentation/devicetree/bindings/mmc/rockchip-dw-mshc.txt
> Documentation/devicetree/bindings/mmc/ti-omap-hsmmc.txt
> Documentation/devicetree/bindings/mtd/atmel-nand.txt
> Documentation/devicetree/bindings/net/dsa/b53.txt
> Documentation/devicetree/bindings/net/ethernet.txt
> Documentation/devicetree/bindings/net/macb.txt
> Documentation/devicetree/bindings/net/marvell-orion-mdio.txt
> Documentation/devicetree/bindings/net/ti,wilink-st.txt
> Documentation/devicetree/bindings/net/wireless/ti,wlcore.txt
> Documentation/devicetree/bindings/nvmem/rockchip-efuse.txt
> Documentation/devicetree/bindings/opp/opp.txt
> Documentation/devicetree/bindings/phy/bcm-ns-usb3-phy.txt
> Documentation/devicetree/bindings/phy/brcm-sata-phy.txt
> Documentation/devicetree/bindings/phy/meson8b-usb2-phy.txt
> Documentation/devicetree/bindings/phy/phy-rockchip-inno-usb2.txt
> Documentation/devicetree/bindings/power/rockchip-io-domain.txt
> Documentation/devicetree/bindings/power/supply/bq27xxx.txt
> Documentation/devicetree/bindings/property-units.txt
> Documentation/devicetree/bindings/regulator/regulator.txt
> Documentation/devicetree/bindings/serial/8
> error: The following untracked working tree files would be overwritten
> by checkout:
> Documentation/ABI/testing/sysfs-class-net-phydev
> Documentation/DocBook/.gitignore
> Documentation/DocBook/Makefile
> Documentation/DocBook/filesystems.tmpl
> Documentation/DocBook/kernel-hacking.tmpl
> Documentation/DocBook/kernel-locking.tmpl
> Documentation/DocBook/kgdb.tmpl
> Documentation/DocBook/libata.tmpl
> Documentation/DocBook/librs.tmpl
> Documentation/DocBook/lsm.tmpl
> Documentation/DocBook/mtdnand.tmpl
> Documentation/DocBook/networking.tmpl
> Documentation/DocBook/rapidio.tmpl
> Documentation/DocBook/s390-drivers.tmpl
> Documentation/DocBook/scsi.tmpl
> Documentation/DocBook/sh.tmpl
> Documentation/DocBook/stylesheet.xsl
> Documentation/DocBook/w1.tmpl
> Documentation/DocBook/z8530book.tmpl
> Documentation/Makefile.sphinx
> Documentation/RCU/trace.txt
> Documentation/devicetree/bindings/i2c/i2c-mt6577.txt
> Documentation/devicetree/bindings/misc/allwinner,syscon.txt
> Documentation/devicetree/bindings/net/cortina.txt
> Documentation/devicetree/bindings/net/dsa/ksz.txt
> Documentation/devicetree/bindings/net/dwmac-sun8i.txt
> Documentation/devicetree/bindings/net/qca,qca7000.txt
> Documentation/devicetree/bindings/power/max8903-charger.txt
> Documentation/devicetree/bindings/power_supply/maxim,max14656.txt
> Documentation/devicetree/bindings/ptp/brcm,ptp-dte.txt
> Documentation/devicetree/bindings/timer/moxa,moxart-timer.txt
> Documentation/doc-guide/docbook.rst
> Documentation/networking/tls.txt
> Documentation/prctl/no_new_privs.txt
> Documentation/prctl/seccomp_filter.txt
> Documentation/security/00-INDEX
> Documentation/security/IMA-templates.txt
> Documentation/security/LSM.txt
> Documentation/security/LoadPin.txt
> Documentation/security/SELinux.txt
> Documentation/security/Smack.txt
> Documentation/security/Yama.txt
> Documentation/security/apparmor.txt
> Documentation/security/conf.py
> Documentation/security/credentials.txt
> Documentation/security/keys-ecryptfs.txt
> Documentation/security/keys-request-key.txt
> Documentation/security/keys-trusted-encrypted.txt
> Documentation/security/keys.txt
> Documentation/security/self-protection.txt
> Documentation/security/tomoyo.txt
> Documentation/sphinx/convert_template.sed
> Documentation/sphinx/post_convert.sed
> Documentation/sphinx/tmplcvt
> Documentation/usb/typec.rst
> Documentation/usb/usb3-debug-port.rst
> arch/arm/boot/dts/rk1108-evb.dts
> arch/arm/boot/dts/rk1108.dtsi
> arch/arm/boot/dts/tegra20-whistler.dts
> arch/arm/mach-omap2/opp.c
> arch/arm/mach-omap2/pmu.c
> arch/ia64/include/asm/siginfo.h
> arch/m32r/include/uapi/asm/siginfo.h
> arch/microblaze/include/asm/bitops.h
> arch/microblaze/include/asm/bug.h
> arch/microblaze/include/asm/bugs.h
> arch/microblaze/include/asm/div64.h
> arch/microblaze/include/asm/emergency-restart.h
> arch/microblaze/include/asm/fb.h
> arch/microblaze/include/asm/hardirq.h
> arch/microblaze/include/asm/irq_regs.h
> arch/microblaze/include/asm/kdebug.h
> arch/microblaze/include/asm/kmap_types.h
> arch/microblaze/include/asm/linkage.h
> arch/microblaze/include/asm/local.h
> arch/microblaze/include/asm/local64.h
> arch/microblaze/include/asm/parport.h
> arch/microblaze/include/asm/percpu.h
> arch/microblaze/include/asm/serial.h
> arch/microblaze/include/asm/shmparam.h
> arch/microblaze/include/asm/topology.h
> arch/microblaze/include/asm/ucontext.h
> arch/microblaze/include/asm/vga.h
> arch/microblaze/include/asm/xor.h
> arch/microblaze/include/uapi/asm/bitsperlong.h
> arch/microblaze/include/uapi/asm/errno.h
> arch/microblaze/include/uapi/asm/fcntl.h
> arch/microblaze/include/uapi/asm/ioctl.h
> arch/microblaze/include/uapi/asm/ioctls.h
> arch/microblaze/include/uapi/asm/ipcbuf.h
> arch/microblaze/include/uapi/asm/kvm_para.h
> arch/microblaze/include/uapi/asm/mman.h
> arch/microblaze/include/uapi/asm/msgbuf.h
> arch/microblaze/include/uapi/asm/param.h
> arch/microblaze/include/uapi/asm/poll.h
> arch/microblaze/include/uapi/asm/resource.h
> arch/microblaze/include/uapi/asm/sembuf.h
> arch/microblaze/include/uapi/asm/shmbuf.h
> arch/microblaze/include/uapi/asm/siginfo.h
> arch/microblaze/include/uapi/asm/signal.h
> arch/microblaze/includ
> Aborting
>
>
>
> W dniu 2017-09-20 o 11:45, Paweł Staszewski pisze:
>> Ok looks like ending bisection
>>
>>
>> Latest bisected kernel when there is no kernel panic 4.12.0+ (from
>> next) - but only this warning:
>>
>> [ 309.030019] NETDEV WATCHDOG: enp4s0f0 (ixgbe): transmit queue 0
>> timed out
>> [ 309.030034] ------------[ cut here ]------------
>> [ 309.030040] WARNING: CPU: 35 PID: 0 at dev_watchdog+0xcf/0x139
>> [ 309.030041] Modules linked in: bonding ipmi_si x86_pkg_temp_thermal
>> [ 309.030045] CPU: 35 PID: 0 Comm: swapper/35 Not tainted 4.12.0+ #5
>> [ 309.030046] task: ffff88086d98a000 task.stack: ffffc90003378000
>> [ 309.030048] RIP: 0010:dev_watchdog+0xcf/0x139
>> [ 309.030049] RSP: 0018:ffff88087fbc3ea8 EFLAGS: 00010246
>> [ 309.030050] RAX: 000000000000003d RBX: ffff88046b680000 RCX:
>> 0000000000000000
>> [ 309.030050] RDX: ffff88087fbd2f01 RSI: 0000000000000000 RDI:
>> ffff88087fbcda08
>> [ 309.030051] RBP: ffff88087fbc3eb8 R08: 0000000000000000 R09:
>> ffff88087ff80a04
>> [ 309.030051] R10: 0000000000000000 R11: ffff88086d98a001 R12:
>> 0000000000000000
>> [ 309.030052] R13: ffff88087fbc3ef8 R14: ffff88086d98a000 R15:
>> ffffffff81c06008
>> [ 309.030053] FS: 0000000000000000(0000) GS:ffff88087fbc0000(0000)
>> knlGS:0000000000000000
>> [ 309.030054] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [ 309.030054] CR2: 00007fba600f6098 CR3: 000000086b955000 CR4:
>> 00000000001406e0
>> [ 309.030055] Call Trace:
>> [ 309.030057] <IRQ>
>> [ 309.030059] ? netif_tx_lock+0x79/0x79
>> [ 309.030062] call_timer_fn.isra.24+0x17/0x77
>> [ 309.030063] run_timer_softirq+0x118/0x161
>> [ 309.030065] ? netif_tx_lock+0x79/0x79
>> [ 309.030066] ? ktime_get+0x2b/0x42
>> [ 309.030070] ? lapic_next_deadline+0x21/0x27
>> [ 309.030073] ? clockevents_program_event+0xa8/0xc5
>> [ 309.030076] __do_softirq+0xa8/0x19d
>> [ 309.030078] irq_exit+0x5d/0x6b
>> [ 309.030079] smp_apic_timer_interrupt+0x2a/0x36
>> [ 309.030082] apic_timer_interrupt+0x89/0x90
>> [ 309.030085] RIP: 0010:mwait_idle+0x4e/0x6a
>> [ 309.030086] RSP: 0018:ffffc9000337be98 EFLAGS: 00000246 ORIG_RAX:
>> ffffffffffffff10
>> [ 309.030087] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>> 0000000000000000
>> [ 309.030087] RDX: 0000000000000000 RSI: 0000000000000000 RDI:
>> ffff88086d98a000
>> [ 309.030088] RBP: ffffc9000337be98 R08: ffff88046f8279a0 R09:
>> ffff88046f827040
>> [ 309.030089] R10: ffff88086d98a000 R11: ffff88086d98a000 R12:
>> 0000000000000000
>> [ 309.030089] R13: ffff88086d98a000 R14: ffff88086d98a000 R15:
>> ffff88086d98a000
>> [ 309.030090] </IRQ>
>> [ 309.030094] arch_cpu_idle+0xa/0xc
>> [ 309.030095] default_idle_call+0x19/0x1b
>> [ 309.030102] do_idle+0xbc/0x196
>> [ 309.030104] cpu_startup_entry+0x1d/0x20
>> [ 309.030105] start_secondary+0xd8/0xdc
>> [ 309.030108] secondary_startup_64+0x9f/0x9f
>> [ 309.030109] Code: cc 75 bd eb 35 48 89 df c6 05 c3 dc 74 00 01 e8
>> 3a 62 fe ff 44 89 e1 48 89 de 48 89 c2 48 c7 c7 0f 65 a4 81 31 c0 e8
>> 3d 4c b5 ff <0f> ff 48 8b 83 e0 01 00 00 48 89 df ff 50 78 48 8b 05
>> a0 bc 6a
>> [ 309.030128] ---[ end trace 9102cb25703ae2d9 ]---
>>
>>
>> I just marked it as good - because the problem above is different -
>> and I'm going to:
>>
>> git bisect good
>> Bisecting: 1787 revisions left to test after this (roughly 11 steps)
>>
>>
>>
>>
>> W dniu 2017-09-20 o 10:44, Paweł Staszewski pisze:
>>> Trying to make video from ipmi :)
>>>
>>> with that results:
>>>
>>> https://bugzilla.kernel.org/attachment.cgi?id=258521
>>>
>>> caught two more lines where it starts - panic from 4.13.2.
>>>
>>>
>>> Now will try to do some bisection
>>>
>>>
>>>
>>> W dniu 2017-09-20 o 09:58, Paweł Staszewski pisze:
>>>> Hi
>>>>
>>>>
>>>> Will try bisecting tonight
>>>>
>>>>
>>>>
>>>> W dniu 2017-09-20 o 05:24, Eric Dumazet pisze:
>>>>> On Wed, 2017-09-20 at 02:06 +0200, Paweł Staszewski wrote:
>>>>>> Just checked kernel 4.13.2 and same problem
>>>>>>
>>>>>> Just after start all 6 bgp sessions - and kernel starts to learn
>>>>>> routes
>>>>>> it panic.
>>>>>>
>>>>>> https://bugzilla.kernel.org/attachment.cgi?id=258509
>>>>>>
>>>>>
>>>>> Unfortunately we have not enough information from these traces.
>>>>>
>>>>> Can you get a full stack trace ?
>>>>>
>>>>> Alternatively, can you bisect ?
>>>>>
>>>>> Thanks.
>>>>>
>>>>>
>>>>>
>>>>
>>>>
>>>
>>>
>>
>>
>
>
^ permalink raw reply
* Re: Latest net-next from GIT panic
From: Paweł Staszewski @ 2017-09-20 10:21 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Linux Kernel Network Developers
In-Reply-To: <54d058d4-b9d1-54cb-f064-45cec430fe5d@itcare.pl>
OK, the kernel crashed with a different panic that I didn't catch while I was
bisecting, and now my bisection is broken :)
git bisect good
Bisecting: 1787 revisions left to test after this (roughly 11 steps)
error: Your local changes to the following files would be overwritten by
checkout:
Documentation/00-INDEX
Documentation/ABI/stable/sysfs-class-udc
Documentation/ABI/testing/configfs-usb-gadget-uac1
Documentation/ABI/testing/ima_policy
Documentation/ABI/testing/sysfs-bus-iio
Documentation/ABI/testing/sysfs-bus-iio-meas-spec
Documentation/ABI/testing/sysfs-bus-iio-timer-stm32
Documentation/ABI/testing/sysfs-class-net
Documentation/ABI/testing/sysfs-class-power-twl4030
Documentation/ABI/testing/sysfs-class-typec
Documentation/DMA-API.txt
Documentation/IRQ-domain.txt
Documentation/Makefile
Documentation/PCI/MSI-HOWTO.txt
Documentation/RCU/00-INDEX
Documentation/RCU/Design/Requirements/Requirements.html
Documentation/RCU/checklist.txt
Documentation/admin-guide/README.rst
Documentation/admin-guide/devices.txt
Documentation/admin-guide/index.rst
Documentation/admin-guide/kernel-parameters.txt
Documentation/admin-guide/pm/cpufreq.rst
Documentation/admin-guide/pm/intel_pstate.rst
Documentation/admin-guide/ras.rst
Documentation/arm/Atmel/README
Documentation/block/biodoc.txt
Documentation/conf.py
Documentation/core-api/assoc_array.rst
Documentation/core-api/atomic_ops.rst
Documentation/core-api/index.rst
Documentation/crypto/asymmetric-keys.txt
Documentation/dev-tools/index.rst
Documentation/dev-tools/sparse.rst
Documentation/devicetree/bindings/arm/amlogic.txt
Documentation/devicetree/bindings/arm/atmel-at91.txt
Documentation/devicetree/bindings/arm/ccn.txt
Documentation/devicetree/bindings/arm/cpus.txt
Documentation/devicetree/bindings/arm/gemini.txt
Documentation/devicetree/bindings/arm/hisilicon/hisilicon.txt
Documentation/devicetree/bindings/arm/keystone/keystone.txt
Documentation/devicetree/bindings/arm/mediatek.txt
Documentation/devicetree/bindings/arm/rockchip.txt
Documentation/devicetree/bindings/arm/shmobile.txt
Documentation/devicetree/bindings/arm/tegra.txt
Documentation/devicetree/bindings/ata/ahci-fsl-qoriq.txt
Documentation/devicetree/bindings/bus/brcm,gisb-arb.txt
Documentation/devicetree/bindings/clock/brcm,iproc-clocks.txt
Documentation/devicetree/bindings/cpufreq/ti-cpufreq.txt
Documentation/devicetree/bindings/gpio/gpio_atmel.txt
Documentation/devicetree/bindings/iio/adc/amlogic,meson-saradc.txt
Documentation/devicetree/bindings/iio/adc/renesas,gyroadc.txt
Documentation/devicetree/bindings/iio/adc/st,stm32-adc.txt
Documentation/devicetree/bindings/iio/imu/st_lsm6dsx.txt
Documentation/devicetree/bindings/interrupt-controller/allwinner,sunxi-nmi.txt
Documentation/devicetree/bindings/interrupt-controller/aspeed,ast2400-vic.txt
Documentation/devicetree/bindings/interrupt-controller/mediatek,sysirq.txt
Documentation/devicetree/bindings/leds/common.txt
Documentation/devicetree/bindings/mfd/hi6421.txt
Documentation/devicetree/bindings/mfd/tps65910.txt
Documentation/devicetree/bindings/mmc/fsl-esdhc.txt
Documentation/devicetree/bindings/mmc/k3-dw-mshc.txt
Documentation/devicetree/bindings/mmc/rockchip-dw-mshc.txt
Documentation/devicetree/bindings/mmc/ti-omap-hsmmc.txt
Documentation/devicetree/bindings/mtd/atmel-nand.txt
Documentation/devicetree/bindings/net/dsa/b53.txt
Documentation/devicetree/bindings/net/ethernet.txt
Documentation/devicetree/bindings/net/macb.txt
Documentation/devicetree/bindings/net/marvell-orion-mdio.txt
Documentation/devicetree/bindings/net/ti,wilink-st.txt
Documentation/devicetree/bindings/net/wireless/ti,wlcore.txt
Documentation/devicetree/bindings/nvmem/rockchip-efuse.txt
Documentation/devicetree/bindings/opp/opp.txt
Documentation/devicetree/bindings/phy/bcm-ns-usb3-phy.txt
Documentation/devicetree/bindings/phy/brcm-sata-phy.txt
Documentation/devicetree/bindings/phy/meson8b-usb2-phy.txt
Documentation/devicetree/bindings/phy/phy-rockchip-inno-usb2.txt
Documentation/devicetree/bindings/power/rockchip-io-domain.txt
Documentation/devicetree/bindings/power/supply/bq27xxx.txt
Documentation/devicetree/bindings/property-units.txt
Documentation/devicetree/bindings/regulator/regulator.txt
Documentation/devicetree/bindings/serial/8
error: The following untracked working tree files would be overwritten
by checkout:
Documentation/ABI/testing/sysfs-class-net-phydev
Documentation/DocBook/.gitignore
Documentation/DocBook/Makefile
Documentation/DocBook/filesystems.tmpl
Documentation/DocBook/kernel-hacking.tmpl
Documentation/DocBook/kernel-locking.tmpl
Documentation/DocBook/kgdb.tmpl
Documentation/DocBook/libata.tmpl
Documentation/DocBook/librs.tmpl
Documentation/DocBook/lsm.tmpl
Documentation/DocBook/mtdnand.tmpl
Documentation/DocBook/networking.tmpl
Documentation/DocBook/rapidio.tmpl
Documentation/DocBook/s390-drivers.tmpl
Documentation/DocBook/scsi.tmpl
Documentation/DocBook/sh.tmpl
Documentation/DocBook/stylesheet.xsl
Documentation/DocBook/w1.tmpl
Documentation/DocBook/z8530book.tmpl
Documentation/Makefile.sphinx
Documentation/RCU/trace.txt
Documentation/devicetree/bindings/i2c/i2c-mt6577.txt
Documentation/devicetree/bindings/misc/allwinner,syscon.txt
Documentation/devicetree/bindings/net/cortina.txt
Documentation/devicetree/bindings/net/dsa/ksz.txt
Documentation/devicetree/bindings/net/dwmac-sun8i.txt
Documentation/devicetree/bindings/net/qca,qca7000.txt
Documentation/devicetree/bindings/power/max8903-charger.txt
Documentation/devicetree/bindings/power_supply/maxim,max14656.txt
Documentation/devicetree/bindings/ptp/brcm,ptp-dte.txt
Documentation/devicetree/bindings/timer/moxa,moxart-timer.txt
Documentation/doc-guide/docbook.rst
Documentation/networking/tls.txt
Documentation/prctl/no_new_privs.txt
Documentation/prctl/seccomp_filter.txt
Documentation/security/00-INDEX
Documentation/security/IMA-templates.txt
Documentation/security/LSM.txt
Documentation/security/LoadPin.txt
Documentation/security/SELinux.txt
Documentation/security/Smack.txt
Documentation/security/Yama.txt
Documentation/security/apparmor.txt
Documentation/security/conf.py
Documentation/security/credentials.txt
Documentation/security/keys-ecryptfs.txt
Documentation/security/keys-request-key.txt
Documentation/security/keys-trusted-encrypted.txt
Documentation/security/keys.txt
Documentation/security/self-protection.txt
Documentation/security/tomoyo.txt
Documentation/sphinx/convert_template.sed
Documentation/sphinx/post_convert.sed
Documentation/sphinx/tmplcvt
Documentation/usb/typec.rst
Documentation/usb/usb3-debug-port.rst
arch/arm/boot/dts/rk1108-evb.dts
arch/arm/boot/dts/rk1108.dtsi
arch/arm/boot/dts/tegra20-whistler.dts
arch/arm/mach-omap2/opp.c
arch/arm/mach-omap2/pmu.c
arch/ia64/include/asm/siginfo.h
arch/m32r/include/uapi/asm/siginfo.h
arch/microblaze/include/asm/bitops.h
arch/microblaze/include/asm/bug.h
arch/microblaze/include/asm/bugs.h
arch/microblaze/include/asm/div64.h
arch/microblaze/include/asm/emergency-restart.h
arch/microblaze/include/asm/fb.h
arch/microblaze/include/asm/hardirq.h
arch/microblaze/include/asm/irq_regs.h
arch/microblaze/include/asm/kdebug.h
arch/microblaze/include/asm/kmap_types.h
arch/microblaze/include/asm/linkage.h
arch/microblaze/include/asm/local.h
arch/microblaze/include/asm/local64.h
arch/microblaze/include/asm/parport.h
arch/microblaze/include/asm/percpu.h
arch/microblaze/include/asm/serial.h
arch/microblaze/include/asm/shmparam.h
arch/microblaze/include/asm/topology.h
arch/microblaze/include/asm/ucontext.h
arch/microblaze/include/asm/vga.h
arch/microblaze/include/asm/xor.h
arch/microblaze/include/uapi/asm/bitsperlong.h
arch/microblaze/include/uapi/asm/errno.h
arch/microblaze/include/uapi/asm/fcntl.h
arch/microblaze/include/uapi/asm/ioctl.h
arch/microblaze/include/uapi/asm/ioctls.h
arch/microblaze/include/uapi/asm/ipcbuf.h
arch/microblaze/include/uapi/asm/kvm_para.h
arch/microblaze/include/uapi/asm/mman.h
arch/microblaze/include/uapi/asm/msgbuf.h
arch/microblaze/include/uapi/asm/param.h
arch/microblaze/include/uapi/asm/poll.h
arch/microblaze/include/uapi/asm/resource.h
arch/microblaze/include/uapi/asm/sembuf.h
arch/microblaze/include/uapi/asm/shmbuf.h
arch/microblaze/include/uapi/asm/siginfo.h
arch/microblaze/include/uapi/asm/signal.h
arch/microblaze/includ
Aborting
W dniu 2017-09-20 o 11:45, Paweł Staszewski pisze:
> Ok looks like ending bisection
>
>
> Latest bisected kernel when there is no kernel panic 4.12.0+ (from
> next) - but only this warning:
>
> [ 309.030019] NETDEV WATCHDOG: enp4s0f0 (ixgbe): transmit queue 0
> timed out
> [ 309.030034] ------------[ cut here ]------------
> [ 309.030040] WARNING: CPU: 35 PID: 0 at dev_watchdog+0xcf/0x139
> [ 309.030041] Modules linked in: bonding ipmi_si x86_pkg_temp_thermal
> [ 309.030045] CPU: 35 PID: 0 Comm: swapper/35 Not tainted 4.12.0+ #5
> [ 309.030046] task: ffff88086d98a000 task.stack: ffffc90003378000
> [ 309.030048] RIP: 0010:dev_watchdog+0xcf/0x139
> [ 309.030049] RSP: 0018:ffff88087fbc3ea8 EFLAGS: 00010246
> [ 309.030050] RAX: 000000000000003d RBX: ffff88046b680000 RCX:
> 0000000000000000
> [ 309.030050] RDX: ffff88087fbd2f01 RSI: 0000000000000000 RDI:
> ffff88087fbcda08
> [ 309.030051] RBP: ffff88087fbc3eb8 R08: 0000000000000000 R09:
> ffff88087ff80a04
> [ 309.030051] R10: 0000000000000000 R11: ffff88086d98a001 R12:
> 0000000000000000
> [ 309.030052] R13: ffff88087fbc3ef8 R14: ffff88086d98a000 R15:
> ffffffff81c06008
> [ 309.030053] FS: 0000000000000000(0000) GS:ffff88087fbc0000(0000)
> knlGS:0000000000000000
> [ 309.030054] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 309.030054] CR2: 00007fba600f6098 CR3: 000000086b955000 CR4:
> 00000000001406e0
> [ 309.030055] Call Trace:
> [ 309.030057] <IRQ>
> [ 309.030059] ? netif_tx_lock+0x79/0x79
> [ 309.030062] call_timer_fn.isra.24+0x17/0x77
> [ 309.030063] run_timer_softirq+0x118/0x161
> [ 309.030065] ? netif_tx_lock+0x79/0x79
> [ 309.030066] ? ktime_get+0x2b/0x42
> [ 309.030070] ? lapic_next_deadline+0x21/0x27
> [ 309.030073] ? clockevents_program_event+0xa8/0xc5
> [ 309.030076] __do_softirq+0xa8/0x19d
> [ 309.030078] irq_exit+0x5d/0x6b
> [ 309.030079] smp_apic_timer_interrupt+0x2a/0x36
> [ 309.030082] apic_timer_interrupt+0x89/0x90
> [ 309.030085] RIP: 0010:mwait_idle+0x4e/0x6a
> [ 309.030086] RSP: 0018:ffffc9000337be98 EFLAGS: 00000246 ORIG_RAX:
> ffffffffffffff10
> [ 309.030087] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
> 0000000000000000
> [ 309.030087] RDX: 0000000000000000 RSI: 0000000000000000 RDI:
> ffff88086d98a000
> [ 309.030088] RBP: ffffc9000337be98 R08: ffff88046f8279a0 R09:
> ffff88046f827040
> [ 309.030089] R10: ffff88086d98a000 R11: ffff88086d98a000 R12:
> 0000000000000000
> [ 309.030089] R13: ffff88086d98a000 R14: ffff88086d98a000 R15:
> ffff88086d98a000
> [ 309.030090] </IRQ>
> [ 309.030094] arch_cpu_idle+0xa/0xc
> [ 309.030095] default_idle_call+0x19/0x1b
> [ 309.030102] do_idle+0xbc/0x196
> [ 309.030104] cpu_startup_entry+0x1d/0x20
> [ 309.030105] start_secondary+0xd8/0xdc
> [ 309.030108] secondary_startup_64+0x9f/0x9f
> [ 309.030109] Code: cc 75 bd eb 35 48 89 df c6 05 c3 dc 74 00 01 e8
> 3a 62 fe ff 44 89 e1 48 89 de 48 89 c2 48 c7 c7 0f 65 a4 81 31 c0 e8
> 3d 4c b5 ff <0f> ff 48 8b 83 e0 01 00 00 48 89 df ff 50 78 48 8b 05 a0
> bc 6a
> [ 309.030128] ---[ end trace 9102cb25703ae2d9 ]---
>
>
> I just marked it as good - because the problem above is different - and
> I'm going to:
>
> git bisect good
> Bisecting: 1787 revisions left to test after this (roughly 11 steps)
>
>
>
>
> W dniu 2017-09-20 o 10:44, Paweł Staszewski pisze:
>> Trying to make video from ipmi :)
>>
>> with that results:
>>
>> https://bugzilla.kernel.org/attachment.cgi?id=258521
>>
>> caught two more lines where it starts - panic from 4.13.2.
>>
>>
>> Now will try to do some bisection
>>
>>
>>
>> W dniu 2017-09-20 o 09:58, Paweł Staszewski pisze:
>>> Hi
>>>
>>>
>>> Will try bisecting tonight
>>>
>>>
>>>
>>> W dniu 2017-09-20 o 05:24, Eric Dumazet pisze:
>>>> On Wed, 2017-09-20 at 02:06 +0200, Paweł Staszewski wrote:
>>>>> Just checked kernel 4.13.2 and same problem
>>>>>
>>>>> Just after start all 6 bgp sessions - and kernel starts to learn
>>>>> routes
>>>>> it panic.
>>>>>
>>>>> https://bugzilla.kernel.org/attachment.cgi?id=258509
>>>>>
>>>>
>>>> Unfortunately we have not enough information from these traces.
>>>>
>>>> Can you get a full stack trace ?
>>>>
>>>> Alternatively, can you bisect ?
>>>>
>>>> Thanks.
>>>>
>>>>
>>>>
>>>
>>>
>>
>>
>
>
^ permalink raw reply
* [PATCH v2 1/2] mac80211: Add rcu read side critical sections
From: Ville Syrjala @ 2017-09-20 10:11 UTC (permalink / raw)
To: linux-wireless
Cc: Johannes Berg, David S. Miller, netdev, Ville Syrjälä
In-Reply-To: <20170918195919.15860-1-ville.syrjala@linux.intel.com>
From: Ville Syrjälä <ville.syrjala@linux.intel.com>
I got the following lockdep warning about the rcu_dereference()s in
ieee80211_tx_h_select_key(). After tracing all callers of
ieee80211_tx_h_select_key() I discovered that ieee80211_get_buffered_bc()
and ieee80211_build_data_template() had the rcu_read_lock/unlock() but
three other places did not. So I just blindly added them and made the
read side critical section extend as far as the lifetime of 'tx' which
is where we seem to be stuffing the rcu protected pointers. No real clue
whether this is correct or not.
[ 854.573700] ../net/mac80211/tx.c:594 suspicious rcu_dereference_check() usage!
[ 854.573704]
other info that might help us debug this:
[ 854.573707]
rcu_scheduler_active = 2, debug_locks = 1
[ 854.573712] 6 locks held by kworker/u2:0/2877:
[ 854.573715] #0: ("%s"wiphy_name(local->hw.wiphy)){++++.+}, at: [<c1067f37>] process_one_work+0x127/0x580
[ 854.573742] #1: ((&sdata->work)){+.+.+.}, at: [<c1067f37>] process_one_work+0x127/0x580
[ 854.573758] #2: (&wdev->mtx){+.+.+.}, at: [<f83271c3>] ieee80211_sta_work+0x23/0x1c70 [mac80211]
[ 854.573902] #3: (&local->sta_mtx){+.+.+.}, at: [<f82c9b10>] __sta_info_flush+0x60/0x160 [mac80211]
[ 854.573947] #4: (&(&txq->axq_lock)->rlock){+.-...}, at: [<f825729c>] ath_tx_node_cleanup+0x5c/0x180 [ath9k]
[ 854.573973] #5: (&(&fq->lock)->rlock){+.-...}, at: [<f82fb064>] ieee80211_tx_dequeue+0x24/0xa80 [mac80211]
[ 854.574023]
stack backtrace:
[ 854.574028] CPU: 0 PID: 2877 Comm: kworker/u2:0 Not tainted 4.13.0-mgm-ovl+ #52
[ 854.574032] Hardware name: FUJITSU SIEMENS LIFEBOOK S6120/FJNB16C, BIOS Version 1.26 05/10/2004
[ 854.574070] Workqueue: phy0 ieee80211_iface_work [mac80211]
[ 854.574076] Call Trace:
[ 854.574086] dump_stack+0x16/0x19
[ 854.574092] lockdep_rcu_suspicious+0xcb/0xf0
[ 854.574131] ieee80211_tx_h_select_key+0x1b5/0x500 [mac80211]
[ 854.574171] ieee80211_tx_dequeue+0x283/0xa80 [mac80211]
[ 854.574181] ath_tid_dequeue+0x84/0xf0 [ath9k]
[ 854.574189] ath_tx_node_cleanup+0xb8/0x180 [ath9k]
[ 854.574199] ath9k_sta_state+0x48/0xf0 [ath9k]
[ 854.574207] ? ath9k_del_ps_key.isra.19+0x60/0x60 [ath9k]
[ 854.574240] drv_sta_state+0xaf/0x8c0 [mac80211]
[ 854.574275] __sta_info_destroy_part2+0x10b/0x140 [mac80211]
[ 854.574309] __sta_info_flush+0xd5/0x160 [mac80211]
[ 854.574349] ieee80211_set_disassoc+0xd3/0x570 [mac80211]
[ 854.574390] ieee80211_sta_connection_lost+0x30/0x60 [mac80211]
[ 854.574431] ieee80211_sta_work+0x1ff/0x1c70 [mac80211]
[ 854.574436] ? mark_held_locks+0x62/0x90
[ 854.574443] ? _raw_spin_unlock_irqrestore+0x55/0x70
[ 854.574447] ? trace_hardirqs_on_caller+0x11c/0x1a0
[ 854.574452] ? trace_hardirqs_on+0xb/0x10
[ 854.574459] ? dev_mc_net_exit+0xe/0x20
[ 854.574467] ? skb_dequeue+0x48/0x70
[ 854.574504] ieee80211_iface_work+0x2d8/0x320 [mac80211]
[ 854.574509] process_one_work+0x1d1/0x580
[ 854.574513] ? process_one_work+0x127/0x580
[ 854.574519] worker_thread+0x31/0x380
[ 854.574525] kthread+0xd9/0x110
[ 854.574529] ? process_one_work+0x580/0x580
[ 854.574534] ? kthread_create_on_node+0x30/0x30
[ 854.574540] ret_from_fork+0x19/0x24
[ 854.574548] =============================
[ 854.574551] WARNING: suspicious RCU usage
[ 854.574555] 4.13.0-mgm-ovl+ #52 Not tainted
[ 854.574558] -----------------------------
[ 854.574561] ../net/mac80211/tx.c:608 suspicious rcu_dereference_check() usage!
[ 854.574564]
other info that might help us debug this:
[ 854.574568]
rcu_scheduler_active = 2, debug_locks = 1
[ 854.574572] 6 locks held by kworker/u2:0/2877:
[ 854.574574] #0: ("%s"wiphy_name(local->hw.wiphy)){++++.+}, at: [<c1067f37>] process_one_work+0x127/0x580
[ 854.574590] #1: ((&sdata->work)){+.+.+.}, at: [<c1067f37>] process_one_work+0x127/0x580
[ 854.574606] #2: (&wdev->mtx){+.+.+.}, at: [<f83271c3>] ieee80211_sta_work+0x23/0x1c70 [mac80211]
[ 854.574657] #3: (&local->sta_mtx){+.+.+.}, at: [<f82c9b10>] __sta_info_flush+0x60/0x160 [mac80211]
[ 854.574702] #4: (&(&txq->axq_lock)->rlock){+.-...}, at: [<f825729c>] ath_tx_node_cleanup+0x5c/0x180 [ath9k]
[ 854.574721] #5: (&(&fq->lock)->rlock){+.-...}, at: [<f82fb064>] ieee80211_tx_dequeue+0x24/0xa80 [mac80211]
[ 854.574771]
stack backtrace:
[ 854.574775] CPU: 0 PID: 2877 Comm: kworker/u2:0 Not tainted 4.13.0-mgm-ovl+ #52
[ 854.574779] Hardware name: FUJITSU SIEMENS LIFEBOOK S6120/FJNB16C, BIOS Version 1.26 05/10/2004
[ 854.574814] Workqueue: phy0 ieee80211_iface_work [mac80211]
[ 854.574821] Call Trace:
[ 854.574825] dump_stack+0x16/0x19
[ 854.574830] lockdep_rcu_suspicious+0xcb/0xf0
[ 854.574869] ieee80211_tx_h_select_key+0x44e/0x500 [mac80211]
[ 854.574908] ieee80211_tx_dequeue+0x283/0xa80 [mac80211]
[ 854.574919] ath_tid_dequeue+0x84/0xf0 [ath9k]
[ 854.574927] ath_tx_node_cleanup+0xb8/0x180 [ath9k]
[ 854.574936] ath9k_sta_state+0x48/0xf0 [ath9k]
[ 854.574945] ? ath9k_del_ps_key.isra.19+0x60/0x60 [ath9k]
[ 854.574978] drv_sta_state+0xaf/0x8c0 [mac80211]
[ 854.575012] __sta_info_destroy_part2+0x10b/0x140 [mac80211]
[ 854.575046] __sta_info_flush+0xd5/0x160 [mac80211]
[ 854.575087] ieee80211_set_disassoc+0xd3/0x570 [mac80211]
[ 854.575127] ieee80211_sta_connection_lost+0x30/0x60 [mac80211]
[ 854.575168] ieee80211_sta_work+0x1ff/0x1c70 [mac80211]
[ 854.575173] ? mark_held_locks+0x62/0x90
[ 854.575178] ? _raw_spin_unlock_irqrestore+0x55/0x70
[ 854.575182] ? trace_hardirqs_on_caller+0x11c/0x1a0
[ 854.575187] ? trace_hardirqs_on+0xb/0x10
[ 854.575192] ? dev_mc_net_exit+0xe/0x20
[ 854.575197] ? skb_dequeue+0x48/0x70
[ 854.575233] ieee80211_iface_work+0x2d8/0x320 [mac80211]
[ 854.575238] process_one_work+0x1d1/0x580
[ 854.575243] ? process_one_work+0x127/0x580
[ 854.575248] worker_thread+0x31/0x380
[ 854.575253] kthread+0xd9/0x110
[ 854.575257] ? process_one_work+0x580/0x580
[ 854.575262] ? kthread_create_on_node+0x30/0x30
[ 854.575267] ret_from_fork+0x19/0x24
v2: Callers of ieee80211_tx() already hold rcu_read_lock(), so
nothing is added there.
Move the RCU critical section inside the spinlock in
ieee80211_tx_dequeue() (Johannes Berg)
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: netdev@vger.kernel.org
Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
---
net/mac80211/tx.c | 17 +++++++++++++++--
1 file changed, 15 insertions(+), 2 deletions(-)
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 94826680cf2b..fc4d8294d664 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1770,15 +1770,21 @@ bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
struct ieee80211_tx_data tx;
struct sk_buff *skb2;
- if (ieee80211_tx_prepare(sdata, &tx, NULL, skb) == TX_DROP)
+ rcu_read_lock();
+
+ if (ieee80211_tx_prepare(sdata, &tx, NULL, skb) == TX_DROP) {
+ rcu_read_unlock();
return false;
+ }
info->band = band;
info->control.vif = vif;
info->hw_queue = vif->hw_queue[skb_get_queue_mapping(skb)];
- if (invoke_tx_handlers(&tx))
+ if (invoke_tx_handlers(&tx)) {
+ rcu_read_unlock();
return false;
+ }
if (sta) {
if (tx.sta)
@@ -1792,9 +1798,12 @@ bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
if (WARN_ON(skb2 != skb || !skb_queue_empty(&tx.skbs))) {
ieee80211_free_txskb(hw, skb2);
ieee80211_purge_tx_queue(hw, &tx.skbs);
+ rcu_read_unlock();
return false;
}
+ rcu_read_unlock();
+
return true;
}
EXPORT_SYMBOL(ieee80211_tx_prepare_skb);
@@ -3413,6 +3422,8 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
spin_lock_bh(&fq->lock);
+ rcu_read_lock();
+
if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
goto out;
@@ -3511,6 +3522,8 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
IEEE80211_SKB_CB(skb)->control.vif = vif;
out:
+ rcu_read_unlock();
+
spin_unlock_bh(&fq->lock);
return skb;
--
2.13.5
^ permalink raw reply related
* Re: Latest net-next from GIT panic
From: Paweł Staszewski @ 2017-09-20 9:45 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Linux Kernel Network Developers
In-Reply-To: <8f0b0143-657e-d574-c442-24d3d017bc87@itcare.pl>
Ok looks like ending bisection
Latest bisected kernel when there is no kernel panic 4.12.0+ (from
next) - but only this warning:
[ 309.030019] NETDEV WATCHDOG: enp4s0f0 (ixgbe): transmit queue 0 timed out
[ 309.030034] ------------[ cut here ]------------
[ 309.030040] WARNING: CPU: 35 PID: 0 at dev_watchdog+0xcf/0x139
[ 309.030041] Modules linked in: bonding ipmi_si x86_pkg_temp_thermal
[ 309.030045] CPU: 35 PID: 0 Comm: swapper/35 Not tainted 4.12.0+ #5
[ 309.030046] task: ffff88086d98a000 task.stack: ffffc90003378000
[ 309.030048] RIP: 0010:dev_watchdog+0xcf/0x139
[ 309.030049] RSP: 0018:ffff88087fbc3ea8 EFLAGS: 00010246
[ 309.030050] RAX: 000000000000003d RBX: ffff88046b680000 RCX:
0000000000000000
[ 309.030050] RDX: ffff88087fbd2f01 RSI: 0000000000000000 RDI:
ffff88087fbcda08
[ 309.030051] RBP: ffff88087fbc3eb8 R08: 0000000000000000 R09:
ffff88087ff80a04
[ 309.030051] R10: 0000000000000000 R11: ffff88086d98a001 R12:
0000000000000000
[ 309.030052] R13: ffff88087fbc3ef8 R14: ffff88086d98a000 R15:
ffffffff81c06008
[ 309.030053] FS: 0000000000000000(0000) GS:ffff88087fbc0000(0000)
knlGS:0000000000000000
[ 309.030054] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 309.030054] CR2: 00007fba600f6098 CR3: 000000086b955000 CR4:
00000000001406e0
[ 309.030055] Call Trace:
[ 309.030057] <IRQ>
[ 309.030059] ? netif_tx_lock+0x79/0x79
[ 309.030062] call_timer_fn.isra.24+0x17/0x77
[ 309.030063] run_timer_softirq+0x118/0x161
[ 309.030065] ? netif_tx_lock+0x79/0x79
[ 309.030066] ? ktime_get+0x2b/0x42
[ 309.030070] ? lapic_next_deadline+0x21/0x27
[ 309.030073] ? clockevents_program_event+0xa8/0xc5
[ 309.030076] __do_softirq+0xa8/0x19d
[ 309.030078] irq_exit+0x5d/0x6b
[ 309.030079] smp_apic_timer_interrupt+0x2a/0x36
[ 309.030082] apic_timer_interrupt+0x89/0x90
[ 309.030085] RIP: 0010:mwait_idle+0x4e/0x6a
[ 309.030086] RSP: 0018:ffffc9000337be98 EFLAGS: 00000246 ORIG_RAX:
ffffffffffffff10
[ 309.030087] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
0000000000000000
[ 309.030087] RDX: 0000000000000000 RSI: 0000000000000000 RDI:
ffff88086d98a000
[ 309.030088] RBP: ffffc9000337be98 R08: ffff88046f8279a0 R09:
ffff88046f827040
[ 309.030089] R10: ffff88086d98a000 R11: ffff88086d98a000 R12:
0000000000000000
[ 309.030089] R13: ffff88086d98a000 R14: ffff88086d98a000 R15:
ffff88086d98a000
[ 309.030090] </IRQ>
[ 309.030094] arch_cpu_idle+0xa/0xc
[ 309.030095] default_idle_call+0x19/0x1b
[ 309.030102] do_idle+0xbc/0x196
[ 309.030104] cpu_startup_entry+0x1d/0x20
[ 309.030105] start_secondary+0xd8/0xdc
[ 309.030108] secondary_startup_64+0x9f/0x9f
[ 309.030109] Code: cc 75 bd eb 35 48 89 df c6 05 c3 dc 74 00 01 e8 3a
62 fe ff 44 89 e1 48 89 de 48 89 c2 48 c7 c7 0f 65 a4 81 31 c0 e8 3d 4c
b5 ff <0f> ff 48 8b 83 e0 01 00 00 48 89 df ff 50 78 48 8b 05 a0 bc 6a
[ 309.030128] ---[ end trace 9102cb25703ae2d9 ]---
I just marked it as good - because the problem above is different - and
I'm going to:
git bisect good
Bisecting: 1787 revisions left to test after this (roughly 11 steps)
W dniu 2017-09-20 o 10:44, Paweł Staszewski pisze:
> Trying to make video from ipmi :)
>
> with that results:
>
> https://bugzilla.kernel.org/attachment.cgi?id=258521
>
> catched two more lines where it starts - panic from 4.13.2.
>
>
> Now will try tro do some bisection
>
>
>
> W dniu 2017-09-20 o 09:58, Paweł Staszewski pisze:
>> Hi
>>
>>
>> Will try bisecting tonight
>>
>>
>>
>> W dniu 2017-09-20 o 05:24, Eric Dumazet pisze:
>>> On Wed, 2017-09-20 at 02:06 +0200, Paweł Staszewski wrote:
>>>> Just checked kernel 4.13.2 and same problem
>>>>
>>>> Just after start all 6 bgp sessions - and kernel starts to learn
>>>> routes
>>>> it panic.
>>>>
>>>> https://bugzilla.kernel.org/attachment.cgi?id=258509
>>>>
>>>
>>> Unfortunately we have not enough information from these traces.
>>>
>>> Can you get a full stack trace ?
>>>
>>> Alternatively, can you bisect ?
>>>
>>> Thanks.
>>>
>>>
>>>
>>
>>
>
>
^ permalink raw reply
* Re: [lkp-robot] [test_rhashtable] c1bd3689a7: WARNING:at_lib/debugobjects.c:#__debug_object_init
From: Florian Westphal @ 2017-09-20 8:44 UTC (permalink / raw)
To: kernel test robot; +Cc: Florian Westphal, netdev, lkp
In-Reply-To: <20170920015457.GA30213@yexl-desktop>
kernel test robot <xiaolong.ye@intel.com> wrote:
> FYI, we noticed the following commit:
>
> commit: c1bd3689a70d1ba1a2f7c6781770920087166018 ("test_rhashtable: add test case for rhl_table interface")
> url: https://github.com/0day-ci/linux/commits/Florian-Westphal/test_rhashtable-add-test-case-for-rhl-table/20170919-135550
>
>
> in testcase: boot
>
> on test machine: qemu-system-x86_64 -enable-kvm -smp 2 -m 512M
>
> caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):
>
>
> +---------------------------------------------------------+------------+------------+
> [ 15.235031] WARNING: CPU: 0 PID: 1 at lib/debugobjects.c:328 __debug_object_init+0x794/0x930
[..]
This is with v1 of the patch where the rhltable struct was allocated on
stack, v2 is fine.
^ permalink raw reply
* Re: Latest net-next from GIT panic
From: Paweł Staszewski @ 2017-09-20 8:44 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Linux Kernel Network Developers
In-Reply-To: <07bde5d4-fab6-3ef9-f586-403dadbb0a2a@itcare.pl>
Trying to make video from ipmi :)
with that results:
https://bugzilla.kernel.org/attachment.cgi?id=258521
caught two more lines where it starts - panic from 4.13.2.
Now will try to do some bisection
W dniu 2017-09-20 o 09:58, Paweł Staszewski pisze:
> Hi
>
>
> Will try bisecting tonight
>
>
>
> W dniu 2017-09-20 o 05:24, Eric Dumazet pisze:
>> On Wed, 2017-09-20 at 02:06 +0200, Paweł Staszewski wrote:
>>> Just checked kernel 4.13.2 and same problem
>>>
>>> Just after start all 6 bgp sessions - and kernel starts to learn routes
>>> it panic.
>>>
>>> https://bugzilla.kernel.org/attachment.cgi?id=258509
>>>
>>
>> Unfortunately we have not enough information from these traces.
>>
>> Can you get a full stack trace ?
>>
>> Alternatively, can you bisect ?
>>
>> Thanks.
>>
>>
>>
>
>
^ permalink raw reply
* Re: [RFC PATCH 2/3] usbnet: Avoid potential races in usbnet_deferred_kevent()
From: Oliver Neukum @ 2017-09-20 8:25 UTC (permalink / raw)
To: Doug Anderson
Cc: Guenter Roeck, Grant Grundler, linux-kernel@vger.kernel.org,
linux-usb@vger.kernel.org, netdev
In-Reply-To: <CAD=FV=WJOrpEqpUGA3EL5BMQTMN9NVgekd7qBX+u=gX90eRGyA@mail.gmail.com>
Am Dienstag, den 19.09.2017, 13:53 -0700 schrieb Doug Anderson:
> Hi,
>
> On Tue, Sep 19, 2017 at 1:37 PM, Oliver Neukum <oneukum@suse.com> wrote:
> >
> > Am Dienstag, den 19.09.2017, 09:15 -0700 schrieb Douglas Anderson:
> > >
> > > In general when you've got a flag communicating that "something needs
> > > to be done" you want to clear that flag _before_ doing the task. If
> > > you clear the flag _after_ doing the task you end up with the risk
> > > that this will happen:
> > >
> > > 1. Requester sets flag saying task A needs to be done.
> > > 2. Worker comes and starts doing task A.
> > > 3. Worker finishes task A but hasn't yet cleared the flag.
> > > 4. Requester wants to set flag saying task A needs to be done again.
> > > 5. Worker clears the flag without doing anything.
> > >
> > > Let's make the usbnet codebase consistently clear the flag _before_ it
> > > does the requested work. That way if there's another request to do
> > > the work while the work is already in progress it won't be lost.
> > >
> > > NOTES:
> > > - No known bugs are fixed by this; it's just found by code inspection.
> >
> > Hi,
> >
> > unfortunately the patch is wrong. The flags must be cleared only
> > in case the handler is successful. That is not guaranteed.
> >
> > Regards
> > Oliver
> >
> > NACK
>
> OK, thanks for reviewing! I definitely wasn't super confident about
> the patch (hence the RFC).
>
> Do you think that the races I identified are possible to hit? In
As far as I can tell, we are safe, but you are right to say that the
driver is not quite clean at that point.
> other words: should I try to rework the patch somehow or just drop it?
> Originally I had the patch setting the flags back to true in the
> failure cases, but then I convinced myself that wasn't needed. I can
> certainly go back and try it that way...
Setting the flags again in the error case would certainly be an
improvement. I'd be happy with a patch doing that.
Regards
Oliver
^ permalink raw reply
* Re: [RFC PATCH 2/3] usbnet: Avoid potential races in usbnet_deferred_kevent()
From: Oliver Neukum @ 2017-09-20 8:23 UTC (permalink / raw)
To: Guenter Roeck
Cc: Douglas Anderson, Guenter Roeck, Grant Grundler, linux-kernel,
linux-usb, netdev
In-Reply-To: <CABXOdTdnaf5REYjgtR6+hoRhiB2c=xb+1CxMmv7XsukYDFDitA@mail.gmail.com>
Am Dienstag, den 19.09.2017, 13:51 -0700 schrieb Guenter Roeck:
> On Tue, Sep 19, 2017 at 1:37 PM, Oliver Neukum <oneukum@suse.com> wrote:
> >
> > Am Dienstag, den 19.09.2017, 09:15 -0700 schrieb Douglas Anderson:
> > >
[..]
> > > NOTES:
> > > - No known bugs are fixed by this; it's just found by code inspection.
> >
> > Hi,
> >
> > unfortunately the patch is wrong. The flags must be cleared only
> > in case the handler is successful. That is not guaranteed.
> >
>
> Just out of curiosity, what is the retry mechanism ? Whenever a new,
> possibly unrelated, event is scheduled ?
Hi,
that actually depends on the flag.
Look at the case of fail_lowmem. There we reschedule.
HTH
Oliver
^ permalink raw reply
* Re: Latest net-next from GIT panic
From: Paweł Staszewski @ 2017-09-20 7:58 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Linux Kernel Network Developers
In-Reply-To: <1505877870.29839.82.camel@edumazet-glaptop3.roam.corp.google.com>
Hi
Will try bisecting tonight
W dniu 2017-09-20 o 05:24, Eric Dumazet pisze:
> On Wed, 2017-09-20 at 02:06 +0200, Paweł Staszewski wrote:
>> Just checked kernel 4.13.2 and same problem
>>
>> Just after start all 6 bgp sessions - and kernel starts to learn routes
>> it panic.
>>
>> https://bugzilla.kernel.org/attachment.cgi?id=258509
>>
>
> Unfortunately we have not enough information from these traces.
>
> Can you get a full stack trace ?
>
> Alternatively, can you bisect ?
>
> Thanks.
>
>
>
^ permalink raw reply
* [net-next] macvlan: code refine to check data before using
From: Zhang Shengju @ 2017-09-20 0:12 UTC (permalink / raw)
To: davem, fgao, vyasevic, netdev
This patch checks data once, in one place, and returns early if it is NULL.
Signed-off-by: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
---
drivers/net/macvlan.c | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index d2aea96..1ffe77e 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -1231,11 +1231,14 @@ static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[],
return -EADDRNOTAVAIL;
}
- if (data && data[IFLA_MACVLAN_FLAGS] &&
+ if (!data)
+ return 0;
+
+ if (data[IFLA_MACVLAN_FLAGS] &&
nla_get_u16(data[IFLA_MACVLAN_FLAGS]) & ~MACVLAN_FLAG_NOPROMISC)
return -EINVAL;
- if (data && data[IFLA_MACVLAN_MODE]) {
+ if (data[IFLA_MACVLAN_MODE]) {
switch (nla_get_u32(data[IFLA_MACVLAN_MODE])) {
case MACVLAN_MODE_PRIVATE:
case MACVLAN_MODE_VEPA:
@@ -1248,7 +1251,7 @@ static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[],
}
}
- if (data && data[IFLA_MACVLAN_MACADDR_MODE]) {
+ if (data[IFLA_MACVLAN_MACADDR_MODE]) {
switch (nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE])) {
case MACVLAN_MACADDR_ADD:
case MACVLAN_MACADDR_DEL:
@@ -1260,7 +1263,7 @@ static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[],
}
}
- if (data && data[IFLA_MACVLAN_MACADDR]) {
+ if (data[IFLA_MACVLAN_MACADDR]) {
if (nla_len(data[IFLA_MACVLAN_MACADDR]) != ETH_ALEN)
return -EINVAL;
@@ -1268,7 +1271,7 @@ static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[],
return -EADDRNOTAVAIL;
}
- if (data && data[IFLA_MACVLAN_MACADDR_COUNT])
+ if (data[IFLA_MACVLAN_MACADDR_COUNT])
return -EINVAL;
return 0;
--
1.8.3.1
^ permalink raw reply related
* [PATCH] netfilter: nf_tables: Release memory obtained by kasprintf
From: Arvind Yadav @ 2017-09-20 7:01 UTC (permalink / raw)
To: pablo, kadlec, fw, davem; +Cc: netfilter-devel, coreteam, netdev, linux-kernel
Free memory region, if nf_tables_set_alloc_name is not successful.
Signed-off-by: Arvind Yadav <arvind.yadav.cs@gmail.com>
---
net/netfilter/nf_tables_api.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 9299271..393e37e 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2741,8 +2741,10 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
list_for_each_entry(i, &ctx->table->sets, list) {
if (!nft_is_active_next(ctx->net, i))
continue;
- if (!strcmp(set->name, i->name))
+ if (!strcmp(set->name, i->name)) {
+ kfree(set->name);
return -ENFILE;
+ }
}
return 0;
}
--
1.9.1
^ permalink raw reply related
* Re: Regression in throughput between kvm guests over virtual bridge
From: Jason Wang @ 2017-09-20 6:27 UTC (permalink / raw)
To: Matthew Rosato, netdev; +Cc: davem, mst
In-Reply-To: <7d444584-3854-ace2-008d-0fdef1c9cef4@linux.vnet.ibm.com>
[-- Attachment #1: Type: text/plain, Size: 4826 bytes --]
On 2017年09月19日 02:11, Matthew Rosato wrote:
> On 09/18/2017 03:36 AM, Jason Wang wrote:
>>
>> On 2017年09月18日 11:13, Jason Wang wrote:
>>>
>>> On 2017年09月16日 03:19, Matthew Rosato wrote:
>>>>> It looks like vhost is slowed down for some reason which leads to more
>>>>> idle time on 4.13+VHOST_RX_BATCH=1. Appreciated if you can collect the
>>>>> perf.diff on host, one for rx and one for tx.
>>>>>
>>>> perf data below for the associated vhost threads, baseline=4.12,
>>>> delta1=4.13, delta2=4.13+VHOST_RX_BATCH=1
>>>>
>>>> Client vhost:
>>>>
>>>> 60.12% -11.11% -12.34% [kernel.vmlinux] [k] raw_copy_from_user
>>>> 13.76% -1.28% -0.74% [kernel.vmlinux] [k] get_page_from_freelist
>>>> 2.00% +3.69% +3.54% [kernel.vmlinux] [k] __wake_up_sync_key
>>>> 1.19% +0.60% +0.66% [kernel.vmlinux] [k] __alloc_pages_nodemask
>>>> 1.12% +0.76% +0.86% [kernel.vmlinux] [k] copy_page_from_iter
>>>> 1.09% +0.28% +0.35% [vhost] [k] vhost_get_vq_desc
>>>> 1.07% +0.31% +0.26% [kernel.vmlinux] [k] alloc_skb_with_frags
>>>> 0.94% +0.42% +0.65% [kernel.vmlinux] [k] alloc_pages_current
>>>> 0.91% -0.19% -0.18% [kernel.vmlinux] [k] memcpy
>>>> 0.88% +0.26% +0.30% [kernel.vmlinux] [k] __next_zones_zonelist
>>>> 0.85% +0.05% +0.12% [kernel.vmlinux] [k] iov_iter_advance
>>>> 0.79% +0.09% +0.19% [vhost] [k] __vhost_add_used_n
>>>> 0.74% [kernel.vmlinux] [k] get_task_policy.part.7
>>>> 0.74% -0.01% -0.05% [kernel.vmlinux] [k] tun_net_xmit
>>>> 0.60% +0.17% +0.33% [kernel.vmlinux] [k] policy_nodemask
>>>> 0.58% -0.15% -0.12% [ebtables] [k] ebt_do_table
>>>> 0.52% -0.25% -0.22% [kernel.vmlinux] [k] __alloc_skb
>>>> ...
>>>> 0.42% +0.58% +0.59% [kernel.vmlinux] [k] eventfd_signal
>>>> ...
>>>> 0.32% +0.96% +0.93% [kernel.vmlinux] [k] finish_task_switch
>>>> ...
>>>> +1.50% +1.16% [kernel.vmlinux] [k] get_task_policy.part.9
>>>> +0.40% +0.42% [kernel.vmlinux] [k] __skb_get_hash_symmetr
>>>> +0.39% +0.40% [kernel.vmlinux] [k] _copy_from_iter_full
>>>> +0.24% +0.23% [vhost_net] [k] vhost_net_buf_peek
>>>>
>>>> Server vhost:
>>>>
>>>> 61.93% -10.72% -10.91% [kernel.vmlinux] [k] raw_copy_to_user
>>>> 9.25% +0.47% +0.86% [kernel.vmlinux] [k] free_hot_cold_page
>>>> 5.16% +1.41% +1.57% [vhost] [k] vhost_get_vq_desc
>>>> 5.12% -3.81% -3.78% [kernel.vmlinux] [k] skb_release_data
>>>> 3.30% +0.42% +0.55% [kernel.vmlinux] [k] raw_copy_from_user
>>>> 1.29% +2.20% +2.28% [kernel.vmlinux] [k] copy_page_to_iter
>>>> 1.24% +1.65% +0.45% [vhost_net] [k] handle_rx
>>>> 1.08% +3.03% +2.85% [kernel.vmlinux] [k] __wake_up_sync_key
>>>> 0.96% +0.70% +1.10% [vhost] [k] translate_desc
>>>> 0.69% -0.20% -0.22% [kernel.vmlinux] [k] tun_do_read.part.10
>>>> 0.69% [kernel.vmlinux] [k] tun_peek_len
>>>> 0.67% +0.75% +0.78% [kernel.vmlinux] [k] eventfd_signal
>>>> 0.52% +0.96% +0.98% [kernel.vmlinux] [k] finish_task_switch
>>>> 0.50% +0.05% +0.09% [vhost] [k] vhost_add_used_n
>>>> ...
>>>> +0.63% +0.58% [vhost_net] [k] vhost_net_buf_peek
>>>> +0.32% +0.32% [kernel.vmlinux] [k] _copy_to_iter
>>>> +0.19% +0.19% [kernel.vmlinux] [k] __skb_get_hash_symmetr
>>>> +0.11% +0.21% [vhost] [k] vhost_umem_interval_tr
>>>>
>>> Looks like for some unknown reason which leads more wakeups.
>>>
>>> Could you please try to attached patch to see if it solves or mitigate
>>> the issue?
>>>
>>> Thanks
>> My bad, please try this.
>>
>> Thanks
> Thanks Jason. Built 4.13 + supplied patch, I see some decrease in
> wakeups, but there's still quite a bit more compared to 4.12
> (baseline=4.12, delta1=4.13, delta2=4.13+patch):
>
> client:
> 2.00% +3.69% +2.55% [kernel.vmlinux] [k] __wake_up_sync_key
>
> server:
> 1.08% +3.03% +1.85% [kernel.vmlinux] [k] __wake_up_sync_key
>
>
> Throughput was roughly equivalent to base 4.13 (so, still seeing the
> regression w/ this patch applied).
>
Seems to make some progress on wakeup mitigation. The previous patch tries
to reduce the unnecessary traversal of the waitqueue during rx. The attached
patch goes even further and disables rx polling while processing tx.
Please try it to see if it makes any difference.
And two questions:
- Does the issue still exist if you run uperf between 2 VMs (instead of 4 VMs)?
- Can enabling batching in the tap of the sending VM improve the performance
(ethtool -C $tap rx-frames 64)?
Thanks
[-- Attachment #2: 0001-vhost_net-avoid-unnecessary-wakeups-during-tx.patch --]
[-- Type: text/x-patch, Size: 1938 bytes --]
>From d57ad96083fc57205336af1b5ea777e5185f1581 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Wed, 20 Sep 2017 11:44:49 +0800
Subject: [PATCH] vhost_net: avoid unnecessary wakeups during tx
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/vhost/net.c | 21 ++++++++++++++++++---
1 file changed, 18 insertions(+), 3 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index ed476fa..e7349cf 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -444,8 +444,11 @@ static bool vhost_exceeds_maxpend(struct vhost_net *net)
* read-size critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
{
+ struct vhost_net_virtqueue *rx_nvq = &net->vqs[VHOST_NET_VQ_RX];
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
struct vhost_virtqueue *vq = &nvq->vq;
+ struct vhost_virtqueue *rx_vq = &rx_nvq->vq;
+
unsigned out, in;
int head;
struct msghdr msg = {
@@ -462,6 +465,10 @@ static void handle_tx(struct vhost_net *net)
struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
bool zcopy, zcopy_used;
+ mutex_lock(&rx_vq->mutex);
+ vhost_net_disable_vq(net, rx_vq);
+ mutex_unlock(&rx_vq->mutex);
+
mutex_lock(&vq->mutex);
sock = vq->private_data;
if (!sock)
@@ -574,13 +581,21 @@ static void handle_tx(struct vhost_net *net)
else
vhost_zerocopy_signal_used(net, vq);
vhost_net_tx_packet(net);
- if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
- vhost_poll_queue(&vq->poll);
+ if (unlikely(total_len >= VHOST_NET_WEIGHT))
break;
- }
}
out:
mutex_unlock(&vq->mutex);
+
+ mutex_lock(&rx_vq->mutex);
+ vhost_net_enable_vq(net, rx_vq);
+ mutex_unlock(&rx_vq->mutex);
+
+ if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
+ mutex_lock(&vq->mutex);
+ vhost_poll_queue(&vq->poll);
+ mutex_unlock(&vq->mutex);
+ }
}
static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next v5 2/4] bpf: add a test case for helper bpf_perf_event_read_value
From: Yonghong Song @ 2017-09-20 6:09 UTC (permalink / raw)
To: peterz, rostedt, ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20170920060935.1102268-1-yhs@fb.com>
The bpf sample program tracex6 is enhanced to use the new
helper to read enabled/running time as well.
Signed-off-by: Yonghong Song <yhs@fb.com>
---
samples/bpf/tracex6_kern.c | 26 ++++++++++++++++++++++++++
samples/bpf/tracex6_user.c | 13 ++++++++++++-
tools/include/uapi/linux/bpf.h | 3 ++-
tools/testing/selftests/bpf/bpf_helpers.h | 3 +++
4 files changed, 43 insertions(+), 2 deletions(-)
diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c
index e7d1803..46c557a 100644
--- a/samples/bpf/tracex6_kern.c
+++ b/samples/bpf/tracex6_kern.c
@@ -15,6 +15,12 @@ struct bpf_map_def SEC("maps") values = {
.value_size = sizeof(u64),
.max_entries = 64,
};
+struct bpf_map_def SEC("maps") values2 = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(int),
+ .value_size = sizeof(struct bpf_perf_event_value),
+ .max_entries = 64,
+};
SEC("kprobe/htab_map_get_next_key")
int bpf_prog1(struct pt_regs *ctx)
@@ -37,5 +43,25 @@ int bpf_prog1(struct pt_regs *ctx)
return 0;
}
+SEC("kprobe/htab_map_lookup_elem")
+int bpf_prog2(struct pt_regs *ctx)
+{
+ u32 key = bpf_get_smp_processor_id();
+ struct bpf_perf_event_value *val, buf;
+ int error;
+
+ error = bpf_perf_event_read_value(&counters, key, &buf, sizeof(buf));
+ if (error)
+ return 0;
+
+ val = bpf_map_lookup_elem(&values2, &key);
+ if (val)
+ *val = buf;
+ else
+ bpf_map_update_elem(&values2, &key, &buf, BPF_NOEXIST);
+
+ return 0;
+}
+
char _license[] SEC("license") = "GPL";
u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c
index a05a99a..3341a96 100644
--- a/samples/bpf/tracex6_user.c
+++ b/samples/bpf/tracex6_user.c
@@ -22,6 +22,7 @@
static void check_on_cpu(int cpu, struct perf_event_attr *attr)
{
+ struct bpf_perf_event_value value2;
int pmu_fd, error = 0;
cpu_set_t set;
__u64 value;
@@ -46,8 +47,18 @@ static void check_on_cpu(int cpu, struct perf_event_attr *attr)
fprintf(stderr, "Value missing for CPU %d\n", cpu);
error = 1;
goto on_exit;
+ } else {
+ fprintf(stderr, "CPU %d: %llu\n", cpu, value);
+ }
+ /* The above bpf_map_lookup_elem should trigger the second kprobe */
+ if (bpf_map_lookup_elem(map_fd[2], &cpu, &value2)) {
+ fprintf(stderr, "Value2 missing for CPU %d\n", cpu);
+ error = 1;
+ goto on_exit;
+ } else {
+ fprintf(stderr, "CPU %d: counter: %llu, enabled: %llu, running: %llu\n", cpu,
+ value2.counter, value2.enabled, value2.running);
}
- fprintf(stderr, "CPU %d: %llu\n", cpu, value);
on_exit:
assert(bpf_map_delete_elem(map_fd[0], &cpu) == 0 || error);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 461811e..79eb529 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -632,7 +632,8 @@ union bpf_attr {
FN(skb_adjust_room), \
FN(redirect_map), \
FN(sk_redirect_map), \
- FN(sock_map_update),
+ FN(sock_map_update), \
+ FN(perf_event_read_value),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 36fb916..08e6f8c 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -70,6 +70,9 @@ static int (*bpf_sk_redirect_map)(void *map, int key, int flags) =
static int (*bpf_sock_map_update)(void *map, void *key, void *value,
unsigned long long flags) =
(void *) BPF_FUNC_sock_map_update;
+static int (*bpf_perf_event_read_value)(void *map, unsigned long long flags,
+ void *buf, unsigned int buf_size) =
+ (void *) BPF_FUNC_perf_event_read_value;
/* llvm builtin functions that eBPF C program may use to
--
2.9.5
^ permalink raw reply related
* [PATCH net-next v5 4/4] bpf: add a test case for helper bpf_perf_prog_read_value
From: Yonghong Song @ 2017-09-20 6:09 UTC (permalink / raw)
To: peterz, rostedt, ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20170920060935.1102268-1-yhs@fb.com>
The bpf sample program trace_event is enhanced to use the new
helper to print out enabled/running time.
Signed-off-by: Yonghong Song <yhs@fb.com>
---
samples/bpf/trace_event_kern.c | 10 ++++++++++
samples/bpf/trace_event_user.c | 13 ++++++++-----
tools/include/uapi/linux/bpf.h | 3 ++-
tools/testing/selftests/bpf/bpf_helpers.h | 3 +++
4 files changed, 23 insertions(+), 6 deletions(-)
diff --git a/samples/bpf/trace_event_kern.c b/samples/bpf/trace_event_kern.c
index 41b6115..a77a583d 100644
--- a/samples/bpf/trace_event_kern.c
+++ b/samples/bpf/trace_event_kern.c
@@ -37,10 +37,14 @@ struct bpf_map_def SEC("maps") stackmap = {
SEC("perf_event")
int bpf_prog1(struct bpf_perf_event_data *ctx)
{
+ char time_fmt1[] = "Time Enabled: %llu, Time Running: %llu";
+ char time_fmt2[] = "Get Time Failed, ErrCode: %d";
char fmt[] = "CPU-%d period %lld ip %llx";
u32 cpu = bpf_get_smp_processor_id();
+ struct bpf_perf_event_value value_buf;
struct key_t key;
u64 *val, one = 1;
+ int ret;
if (ctx->sample_period < 10000)
/* ignore warmup */
@@ -54,6 +58,12 @@ int bpf_prog1(struct bpf_perf_event_data *ctx)
return 0;
}
+ ret = bpf_perf_prog_read_value(ctx, (void *)&value_buf, sizeof(struct bpf_perf_event_value));
+ if (!ret)
+ bpf_trace_printk(time_fmt1, sizeof(time_fmt1), value_buf.enabled, value_buf.running);
+ else
+ bpf_trace_printk(time_fmt2, sizeof(time_fmt2), ret);
+
val = bpf_map_lookup_elem(&counts, &key);
if (val)
(*val)++;
diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c
index 7bd827b..bf4f1b6 100644
--- a/samples/bpf/trace_event_user.c
+++ b/samples/bpf/trace_event_user.c
@@ -127,6 +127,9 @@ static void test_perf_event_all_cpu(struct perf_event_attr *attr)
int *pmu_fd = malloc(nr_cpus * sizeof(int));
int i, error = 0;
+ /* system wide perf event, no need to inherit */
+ attr->inherit = 0;
+
/* open perf_event on all cpus */
for (i = 0; i < nr_cpus; i++) {
pmu_fd[i] = sys_perf_event_open(attr, -1, i, -1, 0);
@@ -154,6 +157,11 @@ static void test_perf_event_task(struct perf_event_attr *attr)
{
int pmu_fd;
+ /* per task perf event, enable inherit so the "dd ..." command can be traced properly.
+ * Enabling inherit will cause bpf_perf_prog_read_time helper failure.
+ */
+ attr->inherit = 1;
+
/* open task bound event */
pmu_fd = sys_perf_event_open(attr, 0, -1, -1, 0);
if (pmu_fd < 0) {
@@ -175,14 +183,12 @@ static void test_bpf_perf_event(void)
.freq = 1,
.type = PERF_TYPE_HARDWARE,
.config = PERF_COUNT_HW_CPU_CYCLES,
- .inherit = 1,
};
struct perf_event_attr attr_type_sw = {
.sample_freq = SAMPLE_FREQ,
.freq = 1,
.type = PERF_TYPE_SOFTWARE,
.config = PERF_COUNT_SW_CPU_CLOCK,
- .inherit = 1,
};
struct perf_event_attr attr_hw_cache_l1d = {
.sample_freq = SAMPLE_FREQ,
@@ -192,7 +198,6 @@ static void test_bpf_perf_event(void)
PERF_COUNT_HW_CACHE_L1D |
(PERF_COUNT_HW_CACHE_OP_READ << 8) |
(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
- .inherit = 1,
};
struct perf_event_attr attr_hw_cache_branch_miss = {
.sample_freq = SAMPLE_FREQ,
@@ -202,7 +207,6 @@ static void test_bpf_perf_event(void)
PERF_COUNT_HW_CACHE_BPU |
(PERF_COUNT_HW_CACHE_OP_READ << 8) |
(PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
- .inherit = 1,
};
struct perf_event_attr attr_type_raw = {
.sample_freq = SAMPLE_FREQ,
@@ -210,7 +214,6 @@ static void test_bpf_perf_event(void)
.type = PERF_TYPE_RAW,
/* Intel Instruction Retired */
.config = 0xc0,
- .inherit = 1,
};
printf("Test HW_CPU_CYCLES\n");
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 79eb529..50d2bcd 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -633,7 +633,8 @@ union bpf_attr {
FN(redirect_map), \
FN(sk_redirect_map), \
FN(sock_map_update), \
- FN(perf_event_read_value),
+ FN(perf_event_read_value), \
+ FN(perf_prog_read_value),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 08e6f8c..1d3dcd4 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -73,6 +73,9 @@ static int (*bpf_sock_map_update)(void *map, void *key, void *value,
static int (*bpf_perf_event_read_value)(void *map, unsigned long long flags,
void *buf, unsigned int buf_size) =
(void *) BPF_FUNC_perf_event_read_value;
+static int (*bpf_perf_prog_read_value)(void *ctx, void *buf,
+ unsigned int buf_size) =
+ (void *) BPF_FUNC_perf_prog_read_value;
/* llvm builtin functions that eBPF C program may use to
--
2.9.5
^ permalink raw reply related
* [PATCH net-next v5 1/4] bpf: add helper bpf_perf_event_read_value for perf event array map
From: Yonghong Song @ 2017-09-20 6:09 UTC (permalink / raw)
To: peterz, rostedt, ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20170920060935.1102268-1-yhs@fb.com>
Hardware pmu counters are limited resources. When there are more
pmu based perf events opened than available counters, kernel will
multiplex these events so each event gets certain percentage
(but not 100%) of the pmu time. In case that multiplexing happens,
the number of samples or counter value will not reflect the
case compared to no multiplexing. This makes comparison between
different runs difficult.
Typically, the number of samples or counter value should be
normalized before comparing to other experiments. The typical
normalization is done like:
normalized_num_samples = num_samples * time_enabled / time_running
normalized_counter_value = counter_value * time_enabled / time_running
where time_enabled is the time enabled for event and time_running is
the time running for event since last normalization.
This patch adds helper bpf_perf_event_read_value for kprobe based perf
event array map, to read perf counter and enabled/running time.
The enabled/running time is accumulated since the perf event open.
To compute the scaling factor between two bpf invocations, users
can use cpu_id as the key (which is typical for perf array usage model)
to remember the previous value and do the calculation inside the
bpf program.
Signed-off-by: Yonghong Song <yhs@fb.com>
---
include/linux/perf_event.h | 6 ++++--
include/uapi/linux/bpf.h | 19 ++++++++++++++++++-
kernel/bpf/arraymap.c | 2 +-
kernel/bpf/verifier.c | 4 +++-
kernel/events/core.c | 15 ++++++++++++---
kernel/trace/bpf_trace.c | 46 +++++++++++++++++++++++++++++++++++++++++-----
6 files changed, 79 insertions(+), 13 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8e22f24..21d8c12 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -884,7 +884,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr,
void *context);
extern void perf_pmu_migrate_context(struct pmu *pmu,
int src_cpu, int dst_cpu);
-int perf_event_read_local(struct perf_event *event, u64 *value);
+int perf_event_read_local(struct perf_event *event, u64 *value,
+ u64 *enabled, u64 *running);
extern u64 perf_event_read_value(struct perf_event *event,
u64 *enabled, u64 *running);
@@ -1286,7 +1287,8 @@ static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *
{
return ERR_PTR(-EINVAL);
}
-static inline int perf_event_read_local(struct perf_event *event, u64 *value)
+static inline int perf_event_read_local(struct perf_event *event, u64 *value,
+ u64 *enabled, u64 *running)
{
return -EINVAL;
}
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 43ab5c4..ccfe1b1 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -582,6 +582,14 @@ union bpf_attr {
* @map: pointer to sockmap to update
* @key: key to insert/update sock in map
* @flags: same flags as map update elem
+ *
+ * int bpf_perf_event_read_value(map, flags, buf, buf_size)
+ * read perf event counter value and perf event enabled/running time
+ * @map: pointer to perf_event_array map
+ * @flags: index of event in the map or bitmask flags
+ * @buf: buf to fill
+ * @buf_size: size of the buf
+ * Return: 0 on success or negative error code
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -638,6 +646,7 @@ union bpf_attr {
FN(redirect_map), \
FN(sk_redirect_map), \
FN(sock_map_update), \
+ FN(perf_event_read_value), \
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
@@ -681,7 +690,9 @@ enum bpf_func_id {
#define BPF_F_ZERO_CSUM_TX (1ULL << 1)
#define BPF_F_DONT_FRAGMENT (1ULL << 2)
-/* BPF_FUNC_perf_event_output and BPF_FUNC_perf_event_read flags. */
+/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and
+ * BPF_FUNC_perf_event_read_value flags.
+ */
#define BPF_F_INDEX_MASK 0xffffffffULL
#define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK
/* BPF_FUNC_perf_event_output for sk_buff input context. */
@@ -864,4 +875,10 @@ enum {
#define TCP_BPF_IW 1001 /* Set TCP initial congestion window */
#define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */
+struct bpf_perf_event_value {
+ __u64 counter;
+ __u64 enabled;
+ __u64 running;
+};
+
#endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 98c0f00..68d8666 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -492,7 +492,7 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
ee = ERR_PTR(-EOPNOTSUPP);
event = perf_file->private_data;
- if (perf_event_read_local(event, &value) == -EOPNOTSUPP)
+ if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP)
goto err_out;
ee = bpf_event_entry_gen(perf_file, map_file);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 799b245..1bf9d7b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1494,7 +1494,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
break;
case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
if (func_id != BPF_FUNC_perf_event_read &&
- func_id != BPF_FUNC_perf_event_output)
+ func_id != BPF_FUNC_perf_event_output &&
+ func_id != BPF_FUNC_perf_event_read_value)
goto error;
break;
case BPF_MAP_TYPE_STACK_TRACE:
@@ -1537,6 +1538,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
break;
case BPF_FUNC_perf_event_read:
case BPF_FUNC_perf_event_output:
+ case BPF_FUNC_perf_event_read_value:
if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
goto error;
break;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3e691b7..2d5bbe5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3684,10 +3684,12 @@ static inline u64 perf_event_count(struct perf_event *event)
* will not be local and we cannot read them atomically
* - must not have a pmu::count method
*/
-int perf_event_read_local(struct perf_event *event, u64 *value)
+int perf_event_read_local(struct perf_event *event, u64 *value,
+ u64 *enabled, u64 *running)
{
unsigned long flags;
int ret = 0;
+ u64 now;
/*
* Disabling interrupts avoids all counter scheduling (context
@@ -3718,14 +3720,21 @@ int perf_event_read_local(struct perf_event *event, u64 *value)
goto out;
}
+ now = event->shadow_ctx_time + perf_clock();
+ if (enabled)
+ *enabled = now - event->tstamp_enabled;
/*
* If the event is currently on this CPU, its either a per-task event,
* or local to this CPU. Furthermore it means its ACTIVE (otherwise
* oncpu == -1).
*/
- if (event->oncpu == smp_processor_id())
+ if (event->oncpu == smp_processor_id()) {
event->pmu->read(event);
-
+ if (running)
+ *running = now - event->tstamp_running;
+ } else if (running) {
+ *running = event->total_time_running;
+ }
*value = local64_read(&event->count);
out:
local_irq_restore(flags);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index dc498b6..686dfa1 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -255,14 +255,13 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
return &bpf_trace_printk_proto;
}
-BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
-{
+static __always_inline int
+get_map_perf_counter(struct bpf_map *map, u64 flags,
+ u64 *value, u64 *enabled, u64 *running) {
struct bpf_array *array = container_of(map, struct bpf_array, map);
unsigned int cpu = smp_processor_id();
u64 index = flags & BPF_F_INDEX_MASK;
struct bpf_event_entry *ee;
- u64 value = 0;
- int err;
if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
return -EINVAL;
@@ -275,7 +274,15 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
if (!ee)
return -ENOENT;
- err = perf_event_read_local(ee->event, &value);
+ return perf_event_read_local(ee->event, value, enabled, running);
+}
+
+BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
+{
+ u64 value = 0;
+ int err;
+
+ err = get_map_perf_counter(map, flags, &value, NULL, NULL);
/*
* this api is ugly since we miss [-22..-2] range of valid
* counter values, but that's uapi
@@ -293,6 +300,33 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
.arg2_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags,
+ struct bpf_perf_event_value *, buf, u32, size)
+{
+ int err;
+
+ if (unlikely(size != sizeof(struct bpf_perf_event_value)))
+ return -EINVAL;
+
+ err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled,
+ &buf->running);
+ if (unlikely(err)) {
+ memset(buf, 0, size);
+ return err;
+ }
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
+ .func = bpf_perf_event_read_value,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+};
+
static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd);
static __always_inline u64
@@ -499,6 +533,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
return &bpf_perf_event_output_proto;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto;
+ case BPF_FUNC_perf_event_read_value:
+ return &bpf_perf_event_read_value_proto;
default:
return tracing_func_proto(func_id);
}
--
2.9.5
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.