Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH 1/1] net: dsa: b53: Fix for brcm tag issue in Cygnus SoC
From: Clément Péron @ 2018-06-06  8:32 UTC (permalink / raw)
  To: scott.branden
  Cc: arun.parameswaran, Florian Fainelli, vivien.didelot, andrew,
	davem, netdev, linux-kernel, BCM Kernel Feedback
In-Reply-To: <8bb9ab39-a38b-9492-975e-d76c864ff8ff@broadcom.com>

Hi Scott,

On Tue, 5 Jun 2018 at 22:55, Scott Branden <scott.branden@broadcom.com> wrote:
>
> Clement,
>
> Please test this works for your issue.

Tested on top of 4.17.0 with a BCM58305

Tested-by: Clément Péron <peron.clem@gmail.com>

>
>
> On 18-06-05 01:38 PM, Arun Parameswaran wrote:
> > In the Broadcom Cygnus SoC, the brcm tag needs to be inserted
> > in between the mac address and the ether type (should use
> > 'DSA_PROTO_TAG_BRCM') for the packets sent to the internal
> > b53 switch.
> >
> > Since the Cygnus was added with the BCM58XX device id and the
> > BCM58XX uses 'DSA_PROTO_TAG_BRCM_PREPEND', the data path is
> > broken, due to the incorrect brcm tag location.
> >
> > Add a new b53 device id (BCM583XX) for Cygnus family to fix the
> > issue. Add the new device id to the BCM58XX family as Cygnus
> > is similar to the BCM58XX in most other functionalities.
> >
> > Fixes: 11606039604c ("net: dsa: b53: Support prepended Broadcom tags")
> >
> > Signed-off-by: Arun Parameswaran <arun.parameswaran@broadcom.com>
> Acked-by: Scott Branden <scott.branden@broadcom.com>
> > ---
> >   drivers/net/dsa/b53/b53_common.c | 15 ++++++++++++++-
> >   drivers/net/dsa/b53/b53_priv.h   |  2 ++
> >   drivers/net/dsa/b53/b53_srab.c   |  4 ++--
> >   3 files changed, 18 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
> > index 3da5fca..bbc6cc6 100644
> > --- a/drivers/net/dsa/b53/b53_common.c
> > +++ b/drivers/net/dsa/b53/b53_common.c
> > @@ -684,7 +684,8 @@ static int b53_switch_reset(struct b53_device *dev)
> >        * still use this driver as a library and need to perform the reset
> >        * earlier.
> >        */
> > -     if (dev->chip_id == BCM58XX_DEVICE_ID) {
> > +     if (dev->chip_id == BCM58XX_DEVICE_ID ||
> > +         dev->chip_id == BCM583XX_DEVICE_ID) {
> >               b53_read8(dev, B53_CTRL_PAGE, B53_SOFTRESET, &reg);
> >               reg |= SW_RST | EN_SW_RST | EN_CH_RST;
> >               b53_write8(dev, B53_CTRL_PAGE, B53_SOFTRESET, reg);
> > @@ -1880,6 +1881,18 @@ struct b53_chip_data {
> >               .jumbo_size_reg = B53_JUMBO_MAX_SIZE,
> >       },
> >       {
> > +             .chip_id = BCM583XX_DEVICE_ID,
> > +             .dev_name = "BCM583xx/11360",
> > +             .vlans = 4096,
> > +             .enabled_ports = 0x103,
> > +             .arl_entries = 4,
> > +             .cpu_port = B53_CPU_PORT,
> > +             .vta_regs = B53_VTA_REGS,
> > +             .duplex_reg = B53_DUPLEX_STAT_GE,
> > +             .jumbo_pm_reg = B53_JUMBO_PORT_MASK,
> > +             .jumbo_size_reg = B53_JUMBO_MAX_SIZE,
> > +     },
> > +     {
> >               .chip_id = BCM7445_DEVICE_ID,
> >               .dev_name = "BCM7445",
> >               .vlans  = 4096,
> > diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
> > index 3b57f47..b232aaa 100644
> > --- a/drivers/net/dsa/b53/b53_priv.h
> > +++ b/drivers/net/dsa/b53/b53_priv.h
> > @@ -62,6 +62,7 @@ enum {
> >       BCM53018_DEVICE_ID = 0x53018,
> >       BCM53019_DEVICE_ID = 0x53019,
> >       BCM58XX_DEVICE_ID = 0x5800,
> > +     BCM583XX_DEVICE_ID = 0x58300,
> >       BCM7445_DEVICE_ID = 0x7445,
> >       BCM7278_DEVICE_ID = 0x7278,
> >   };
> > @@ -181,6 +182,7 @@ static inline int is5301x(struct b53_device *dev)
> >   static inline int is58xx(struct b53_device *dev)
> >   {
> >       return dev->chip_id == BCM58XX_DEVICE_ID ||
> > +             dev->chip_id == BCM583XX_DEVICE_ID ||
> >               dev->chip_id == BCM7445_DEVICE_ID ||
> >               dev->chip_id == BCM7278_DEVICE_ID;
> >   }
> > diff --git a/drivers/net/dsa/b53/b53_srab.c b/drivers/net/dsa/b53/b53_srab.c
> > index c37ffd1..8247481 100644
> > --- a/drivers/net/dsa/b53/b53_srab.c
> > +++ b/drivers/net/dsa/b53/b53_srab.c
> > @@ -364,7 +364,7 @@ static int b53_srab_write64(struct b53_device *dev, u8 page, u8 reg,
> >       { .compatible = "brcm,bcm53018-srab" },
> >       { .compatible = "brcm,bcm53019-srab" },
> >       { .compatible = "brcm,bcm5301x-srab" },
> > -     { .compatible = "brcm,bcm11360-srab", .data = (void *)BCM58XX_DEVICE_ID },
> > +     { .compatible = "brcm,bcm11360-srab", .data = (void *)BCM583XX_DEVICE_ID },
> >       { .compatible = "brcm,bcm58522-srab", .data = (void *)BCM58XX_DEVICE_ID },
> >       { .compatible = "brcm,bcm58525-srab", .data = (void *)BCM58XX_DEVICE_ID },
> >       { .compatible = "brcm,bcm58535-srab", .data = (void *)BCM58XX_DEVICE_ID },
> > @@ -372,7 +372,7 @@ static int b53_srab_write64(struct b53_device *dev, u8 page, u8 reg,
> >       { .compatible = "brcm,bcm58623-srab", .data = (void *)BCM58XX_DEVICE_ID },
> >       { .compatible = "brcm,bcm58625-srab", .data = (void *)BCM58XX_DEVICE_ID },
> >       { .compatible = "brcm,bcm88312-srab", .data = (void *)BCM58XX_DEVICE_ID },
> > -     { .compatible = "brcm,cygnus-srab", .data = (void *)BCM58XX_DEVICE_ID },
> > +     { .compatible = "brcm,cygnus-srab", .data = (void *)BCM583XX_DEVICE_ID },
> >       { .compatible = "brcm,nsp-srab", .data = (void *)BCM58XX_DEVICE_ID },
> >       { /* sentinel */ },
> >   };
>

^ permalink raw reply

* Re: [PATCH net-next 2/3] net: hns3: Fix for VF mailbox receiving unknown message
From: kbuild test robot @ 2018-06-06  8:48 UTC (permalink / raw)
  To: Salil Mehta
  Cc: kbuild-all, davem, salil.mehta, yisen.zhuang, lipeng321,
	mehta.salil, netdev, linux-kernel, linuxarm, Xi Wang
In-Reply-To: <20180605114201.29900-3-salil.mehta@huawei.com>

[-- Attachment #1: Type: text/plain, Size: 9646 bytes --]

Hi Xi,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Salil-Mehta/Bug-fixes-optimization-for-HNS3-Driver/20180606-155040
config: xtensa-allmodconfig (attached as .config)
compiler: xtensa-linux-gcc (GCC) 8.1.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=xtensa 

All errors (new ones prefixed by >>):

   In file included from include/linux/init.h:5,
                    from drivers/net/ethernet/hisilicon/hns3/hclge_mbx.h:6,
                    from drivers/net/ethernet//hisilicon/hns3/hns3vf/hclgevf_mbx.c:4:
   drivers/net/ethernet//hisilicon/hns3/hns3vf/hclgevf_mbx.c: In function 'hclgevf_mbx_handler':
>> drivers/net/ethernet//hisilicon/hns3/hns3vf/hclgevf_mbx.c:155:17: error: implicit declaration of function 'hnae3_get_bit'; did you mean 'hnae_get_bit'? [-Werror=implicit-function-declaration]
      if (unlikely(!hnae3_get_bit(flag, HCLGEVF_CMDQ_RX_OUTVLD_B))) {
                    ^~~~~~~~~~~~~
   include/linux/compiler.h:77:42: note: in definition of macro 'unlikely'
    # define unlikely(x) __builtin_expect(!!(x), 0)
                                             ^
   cc1: some warnings being treated as errors

vim +155 drivers/net/ethernet//hisilicon/hns3/hns3vf/hclgevf_mbx.c

     3	
   > 4	#include "hclge_mbx.h"
     5	#include "hclgevf_main.h"
     6	#include "hnae3.h"
     7	
     8	static void hclgevf_reset_mbx_resp_status(struct hclgevf_dev *hdev)
     9	{
    10		/* this function should be called with mbx_resp.mbx_mutex held
    11		 * to prtect the received_response from race condition
    12		 */
    13		hdev->mbx_resp.received_resp  = false;
    14		hdev->mbx_resp.origin_mbx_msg = 0;
    15		hdev->mbx_resp.resp_status    = 0;
    16		memset(hdev->mbx_resp.additional_info, 0, HCLGE_MBX_MAX_RESP_DATA_SIZE);
    17	}
    18	
    19	/* hclgevf_get_mbx_resp: used to get a response from PF after VF sends a mailbox
    20	 * message to PF.
    21	 * @hdev: pointer to struct hclgevf_dev
    22	 * @resp_msg: pointer to store the original message type and response status
    23	 * @len: the resp_msg data array length.
    24	 */
    25	static int hclgevf_get_mbx_resp(struct hclgevf_dev *hdev, u16 code0, u16 code1,
    26					u8 *resp_data, u16 resp_len)
    27	{
    28	#define HCLGEVF_MAX_TRY_TIMES	500
    29	#define HCLGEVF_SLEEP_USCOEND	1000
    30		struct hclgevf_mbx_resp_status *mbx_resp;
    31		u16 r_code0, r_code1;
    32		int i = 0;
    33	
    34		if (resp_len > HCLGE_MBX_MAX_RESP_DATA_SIZE) {
    35			dev_err(&hdev->pdev->dev,
    36				"VF mbx response len(=%d) exceeds maximum(=%d)\n",
    37				resp_len,
    38				HCLGE_MBX_MAX_RESP_DATA_SIZE);
    39			return -EINVAL;
    40		}
    41	
    42		while ((!hdev->mbx_resp.received_resp) && (i < HCLGEVF_MAX_TRY_TIMES)) {
    43			udelay(HCLGEVF_SLEEP_USCOEND);
    44			i++;
    45		}
    46	
    47		if (i >= HCLGEVF_MAX_TRY_TIMES) {
    48			dev_err(&hdev->pdev->dev,
    49				"VF could not get mbx resp(=%d) from PF in %d tries\n",
    50				hdev->mbx_resp.received_resp, i);
    51			return -EIO;
    52		}
    53	
    54		mbx_resp = &hdev->mbx_resp;
    55		r_code0 = (u16)(mbx_resp->origin_mbx_msg >> 16);
    56		r_code1 = (u16)(mbx_resp->origin_mbx_msg & 0xff);
    57	
    58		if (mbx_resp->resp_status)
    59			return mbx_resp->resp_status;
    60	
    61		if (resp_data)
    62			memcpy(resp_data, &mbx_resp->additional_info[0], resp_len);
    63	
    64		hclgevf_reset_mbx_resp_status(hdev);
    65	
    66		if (!(r_code0 == code0 && r_code1 == code1 && !mbx_resp->resp_status)) {
    67			dev_err(&hdev->pdev->dev,
    68				"VF could not match resp code(code0=%d,code1=%d), %d",
    69				code0, code1, mbx_resp->resp_status);
    70			return -EIO;
    71		}
    72	
    73		return 0;
    74	}
    75	
    76	int hclgevf_send_mbx_msg(struct hclgevf_dev *hdev, u16 code, u16 subcode,
    77				 const u8 *msg_data, u8 msg_len, bool need_resp,
    78				 u8 *resp_data, u16 resp_len)
    79	{
    80		struct hclge_mbx_vf_to_pf_cmd *req;
    81		struct hclgevf_desc desc;
    82		int status;
    83	
    84		req = (struct hclge_mbx_vf_to_pf_cmd *)desc.data;
    85	
    86		/* first two bytes are reserved for code & subcode */
    87		if (msg_len > (HCLGE_MBX_MAX_MSG_SIZE - 2)) {
    88			dev_err(&hdev->pdev->dev,
    89				"VF send mbx msg fail, msg len %d exceeds max len %d\n",
    90				msg_len, HCLGE_MBX_MAX_MSG_SIZE);
    91			return -EINVAL;
    92		}
    93	
    94		hclgevf_cmd_setup_basic_desc(&desc, HCLGEVF_OPC_MBX_VF_TO_PF, false);
    95		req->msg[0] = code;
    96		req->msg[1] = subcode;
    97		memcpy(&req->msg[2], msg_data, msg_len);
    98	
    99		/* synchronous send */
   100		if (need_resp) {
   101			mutex_lock(&hdev->mbx_resp.mbx_mutex);
   102			hclgevf_reset_mbx_resp_status(hdev);
   103			status = hclgevf_cmd_send(&hdev->hw, &desc, 1);
   104			if (status) {
   105				dev_err(&hdev->pdev->dev,
   106					"VF failed(=%d) to send mbx message to PF\n",
   107					status);
   108				mutex_unlock(&hdev->mbx_resp.mbx_mutex);
   109				return status;
   110			}
   111	
   112			status = hclgevf_get_mbx_resp(hdev, code, subcode, resp_data,
   113						      resp_len);
   114			mutex_unlock(&hdev->mbx_resp.mbx_mutex);
   115		} else {
   116			/* asynchronous send */
   117			status = hclgevf_cmd_send(&hdev->hw, &desc, 1);
   118			if (status) {
   119				dev_err(&hdev->pdev->dev,
   120					"VF failed(=%d) to send mbx message to PF\n",
   121					status);
   122				return status;
   123			}
   124		}
   125	
   126		return status;
   127	}
   128	
   129	static bool hclgevf_cmd_crq_empty(struct hclgevf_hw *hw)
   130	{
   131		u32 tail = hclgevf_read_dev(hw, HCLGEVF_NIC_CRQ_TAIL_REG);
   132	
   133		return tail == hw->cmq.crq.next_to_use;
   134	}
   135	
   136	void hclgevf_mbx_handler(struct hclgevf_dev *hdev)
   137	{
   138		struct hclgevf_mbx_resp_status *resp;
   139		struct hclge_mbx_pf_to_vf_cmd *req;
   140		struct hclgevf_cmq_ring *crq;
   141		struct hclgevf_desc *desc;
   142		u16 *msg_q;
   143		u16 flag;
   144		u8 *temp;
   145		int i;
   146	
   147		resp = &hdev->mbx_resp;
   148		crq = &hdev->hw.cmq.crq;
   149	
   150		while (!hclgevf_cmd_crq_empty(&hdev->hw)) {
   151			desc = &crq->desc[crq->next_to_use];
   152			req = (struct hclge_mbx_pf_to_vf_cmd *)desc->data;
   153	
   154			flag = le16_to_cpu(crq->desc[crq->next_to_use].flag);
 > 155			if (unlikely(!hnae3_get_bit(flag, HCLGEVF_CMDQ_RX_OUTVLD_B))) {
   156				dev_warn(&hdev->pdev->dev,
   157					 "dropped invalid mailbox message, code = %d\n",
   158					 req->msg[0]);
   159	
   160				/* dropping/not processing this invalid message */
   161				crq->desc[crq->next_to_use].flag = 0;
   162				hclge_mbx_ring_ptr_move_crq(crq);
   163				continue;
   164			}
   165	
   166			/* synchronous messages are time critical and need preferential
   167			 * treatment. Therefore, we need to acknowledge all the sync
   168			 * responses as quickly as possible so that waiting tasks do not
   169			 * timeout and simultaneously queue the async messages for later
   170			 * prcessing in context of mailbox task i.e. the slow path.
   171			 */
   172			switch (req->msg[0]) {
   173			case HCLGE_MBX_PF_VF_RESP:
   174				if (resp->received_resp)
   175					dev_warn(&hdev->pdev->dev,
   176						 "VF mbx resp flag not clear(%d)\n",
   177						 req->msg[1]);
   178				resp->received_resp = true;
   179	
   180				resp->origin_mbx_msg = (req->msg[1] << 16);
   181				resp->origin_mbx_msg |= req->msg[2];
   182				resp->resp_status = req->msg[3];
   183	
   184				temp = (u8 *)&req->msg[4];
   185				for (i = 0; i < HCLGE_MBX_MAX_RESP_DATA_SIZE; i++) {
   186					resp->additional_info[i] = *temp;
   187					temp++;
   188				}
   189				break;
   190			case HCLGE_MBX_LINK_STAT_CHANGE:
   191			case HCLGE_MBX_ASSERTING_RESET:
   192				/* set this mbx event as pending. This is required as we
   193				 * might loose interrupt event when mbx task is busy
   194				 * handling. This shall be cleared when mbx task just
   195				 * enters handling state.
   196				 */
   197				hdev->mbx_event_pending = true;
   198	
   199				/* we will drop the async msg if we find ARQ as full
   200				 * and continue with next message
   201				 */
   202				if (hdev->arq.count >= HCLGE_MBX_MAX_ARQ_MSG_NUM) {
   203					dev_warn(&hdev->pdev->dev,
   204						 "Async Q full, dropping msg(%d)\n",
   205						 req->msg[1]);
   206					break;
   207				}
   208	
   209				/* tail the async message in arq */
   210				msg_q = hdev->arq.msg_q[hdev->arq.tail];
   211				memcpy(&msg_q[0], req->msg, HCLGE_MBX_MAX_ARQ_MSG_SIZE);
   212				hclge_mbx_tail_ptr_move_arq(hdev->arq);
   213				hdev->arq.count++;
   214	
   215				hclgevf_mbx_task_schedule(hdev);
   216	
   217				break;
   218			default:
   219				dev_err(&hdev->pdev->dev,
   220					"VF received unsupported(%d) mbx msg from PF\n",
   221					req->msg[0]);
   222				break;
   223			}
   224			crq->desc[crq->next_to_use].flag = 0;
   225			hclge_mbx_ring_ptr_move_crq(crq);
   226		}
   227	
   228		/* Write back CMDQ_RQ header pointer, M7 need this pointer */
   229		hclgevf_write_dev(&hdev->hw, HCLGEVF_NIC_CRQ_HEAD_REG,
   230				  crq->next_to_use);
   231	}
   232	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 53124 bytes --]

^ permalink raw reply

* Re: [RFC PATCH 0/6] net: ethernet: ti: cpsw: add MQPRIO and CBS Qdisc offload
From: Ivan Khoronzhuk @ 2018-06-06  8:53 UTC (permalink / raw)
  To: Vinicius Costa Gomes
  Cc: grygorii.strashko, davem, corbet, akpm, netdev, linux-doc,
	linux-kernel, linux-omap, henrik, jesus.sanchez-palencia
In-Reply-To: <87602y1n4j.fsf@intel.com>

On Mon, Jun 04, 2018 at 02:23:56PM -0300, Vinicius Costa Gomes wrote:
>Hi Ivan,
>
>Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org> writes:
>
>> This series adds MQPRIO and CBS Qdisc offload for TI cpsw driver.
>> It potentially can be used in audio video bridging (AVB) and time
>> sensitive networking (TSN).
>>
>> Patchset was tested on AM572x EVM and BBB boards. Last patch from this
>> series adds detailed description of configuration with examples. For
>> consistency reasons, in role of talker and listener, tools from
>> patchset "TSN: Add qdisc based config interface for CBS" were used and
>> can be seen here: https://www.spinics.net/lists/netdev/msg460869.html
>>
>> Based on net-next/master
>>
>
>I didn't test this, but it looks fine from my side.
>
>I agree with Grygorii, that if no comments, this should be re-sent as a
>patch series next.
Thanks, I'll do it this week.

>
>
>> Ivan Khoronzhuk (6):
>>   net: ethernet: ti: cpsw: use cpdma channels in backward order for txq
>>   net: ethernet: ti: cpdma: fit rated channels in backward order
>>   net: ethernet: ti: cpsw: add MQPRIO Qdisc offload
>>   net: ethernet: ti: cpsw: add CBS Qdisc offload
>>   net: ethernet: ti: cpsw: restore shaper configuration while down/up
>>   Documentation: networking: cpsw: add MQPRIO & CBS offload examples
>>
>>  Documentation/networking/cpsw.txt       | 540 ++++++++++++++++++++++++
>>  drivers/net/ethernet/ti/cpsw.c          | 364 +++++++++++++++-
>>  drivers/net/ethernet/ti/davinci_cpdma.c |  31 +-
>>  3 files changed, 913 insertions(+), 22 deletions(-)
>>  create mode 100644 Documentation/networking/cpsw.txt
>>
>> --
>> 2.17.0
>
>
>Cheers,
>--
>Vinicius

-- 
Regards,
Ivan Khoronzhuk

^ permalink raw reply

* Re: [PATCH v2] ppp: remove the PPPIOCDETACH ioctl
From: Walter Harms @ 2018-06-06  9:01 UTC (permalink / raw)
  To: linux-ppp, Eric Biggers, Paul Mackerras
  Cc: syzkaller-bugs, Guillaume Nault, Eric Biggers, netdev,
	linux-kernel, linux-fsdevel
In-Reply-To: <20180523213738.146911-1-ebiggers3@gmail.com>



> Eric Biggers <ebiggers3@gmail.com> hat am 23. Mai 2018 um 23:37 geschrieben:
> 
> 
> From: Eric Biggers <ebiggers@google.com>
> 
> The PPPIOCDETACH ioctl effectively tries to "close" the given ppp file
> before f_count has reached 0, which is fundamentally a bad idea.  It
> does check 'f_count < 2', which excludes concurrent operations on the
> file since they would only be possible with a shared fd table, in which
> case each fdget() would take a file reference.  However, it fails to
> account for the fact that even with 'f_count == 1' the file can still be
> linked into epoll instances.  As reported by syzbot, this can trivially
> be used to cause a use-after-free.
> 
> Yet, the only known user of PPPIOCDETACH is pppd versions older than
> ppp-2.4.2, which was released almost 15 years ago (November 2003).
> Also, PPPIOCDETACH apparently stopped working reliably at around the
> same time, when the f_count check was added to the kernel, e.g. see
> https://lkml.org/lkml/2002/12/31/83.  Also, the current 'f_count < 2'
> check makes PPPIOCDETACH only work in single-threaded applications; it
> always fails if called from a multithreaded application.
> 
> All pppd versions released in the last 15 years just close() the file
> descriptor instead.
> 
> Therefore, instead of hacking around this bug by exporting epoll
> internals to modules, and probably missing other related bugs, just
> remove the PPPIOCDETACH ioctl and see if anyone actually notices.  Leave
> a stub in place that prints a one-time warning and returns EINVAL.
> 
> Reported-by: syzbot+16363c99d4134717c05b@syzkaller.appspotmail.com
> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
> Cc: stable@vger.kernel.org
> Signed-off-by: Eric Biggers <ebiggers@google.com>
> ---
> 
> v2: leave a stub in place, rather than removing the ioctl completely.
> 
>  Documentation/networking/ppp_generic.txt |  6 ------
>  drivers/net/ppp/ppp_generic.c            | 27 +++++-------------------
>  include/uapi/linux/ppp-ioctl.h           |  2 +-
>  3 files changed, 6 insertions(+), 29 deletions(-)
> 
> diff --git a/Documentation/networking/ppp_generic.txt
> b/Documentation/networking/ppp_generic.txt
> index 091d20273dcb..61daf4b39600 100644
> --- a/Documentation/networking/ppp_generic.txt
> +++ b/Documentation/networking/ppp_generic.txt
> @@ -300,12 +300,6 @@ unattached instance are:
>  The ioctl calls available on an instance of /dev/ppp attached to a
>  channel are:
>  
> -* PPPIOCDETACH detaches the instance from the channel.  This ioctl is
> -  deprecated since the same effect can be achieved by closing the
> -  instance.  In order to prevent possible races this ioctl will fail
> -  with an EINVAL error if more than one file descriptor refers to this
> -  instance (i.e. as a result of dup(), dup2() or fork()).
> -
>  * PPPIOCCONNECT connects this channel to a PPP interface.  The
>    argument should point to an int containing the interface unit
>    number.  It will return an EINVAL error if the channel is already
> diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
> index dc7c7ec43202..02ad03a2fab7 100644
> --- a/drivers/net/ppp/ppp_generic.c
> +++ b/drivers/net/ppp/ppp_generic.c
> @@ -605,30 +605,13 @@ static long ppp_ioctl(struct file *file, unsigned int
> cmd, unsigned long arg)
>  
>  	if (cmd == PPPIOCDETACH) {
>  		/*
> -		 * We have to be careful here... if the file descriptor
> -		 * has been dup'd, we could have another process in the
> -		 * middle of a poll using the same file *, so we had
> -		 * better not free the interface data structures -
> -		 * instead we fail the ioctl.  Even in this case, we
> -		 * shut down the interface if we are the owner of it.
> -		 * Actually, we should get rid of PPPIOCDETACH, userland
> -		 * (i.e. pppd) could achieve the same effect by closing
> -		 * this fd and reopening /dev/ppp.
> +		 * PPPIOCDETACH is no longer supported as it was heavily broken,
> +		 * and is only known to have been used by pppd older than
> +		 * ppp-2.4.2 (released November 2003).
>  		 */
> +		pr_warn_once("%s (%d) used obsolete PPPIOCDETACH ioctl\n",
> +			     current->comm, current->pid);
>  		err = -EINVAL;
> -		if (pf->kind == INTERFACE) {
> -			ppp = PF_TO_PPP(pf);
> -			rtnl_lock();
> -			if (file == ppp->owner)
> -				unregister_netdevice(ppp->dev);
> -			rtnl_unlock();
> -		}
> -		if (atomic_long_read(&file->f_count) < 2) {
> -			ppp_release(NULL, file);
> -			err = 0;
> -		} else
> -			pr_warn("PPPIOCDETACH file->f_count=%ld\n",
> -				atomic_long_read(&file->f_count));
>  		goto out;
>  	}
>  
> diff --git a/include/uapi/linux/ppp-ioctl.h b/include/uapi/linux/ppp-ioctl.h
> index b19a9c249b15..784c2e3e572e 100644
> --- a/include/uapi/linux/ppp-ioctl.h
> +++ b/include/uapi/linux/ppp-ioctl.h
> @@ -106,7 +106,7 @@ struct pppol2tp_ioc_stats {
>  #define PPPIOCGIDLE	_IOR('t', 63, struct ppp_idle) /* get idle time */
>  #define PPPIOCNEWUNIT	_IOWR('t', 62, int)	/* create new ppp unit */
>  #define PPPIOCATTACH	_IOW('t', 61, int)	/* attach to ppp unit */
> -#define PPPIOCDETACH	_IOW('t', 60, int)	/* detach from ppp unit/chan */
> +#define PPPIOCDETACH	_IOW('t', 60, int)	/* obsolete, do not use */

It's a bit late but ...
I would suggest stronger wording here, like /* removed 2003 */.
To me, "obsolete" sounds like "maybe it will be removed in a distant future".

just my 2 cents,

re,
 wh

>  #define PPPIOCSMRRU	_IOW('t', 59, int)	/* set multilink MRU */
>  #define PPPIOCCONNECT	_IOW('t', 58, int)	/* connect channel to unit */
>  #define PPPIOCDISCONN	_IO('t', 57)		/* disconnect channel */
> -- 
> 2.17.0.441.gb46fe60e1d-goog
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ppp" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [RFC PATCH] kcm: hold rx mux lock when updating the receive queue.
From: Paolo Abeni @ 2018-06-06  9:44 UTC (permalink / raw)
  To: Tom Herbert, David Miller
  Cc: Linux Kernel Network Developers, Tom Herbert, ktkhai
In-Reply-To: <68f848e2b7bedeeec8327af78a23cce20eea39e5.camel@redhat.com>

On Tue, 2018-06-05 at 18:06 +0200, Paolo Abeni wrote:
> On Tue, 2018-06-05 at 08:35 -0700, Tom Herbert wrote:
> > Paolo, thanks for looking into this! Can you try replacing
> > __skb_dequeue in requeue_rx_msgs with skb_dequeue to see if that is
> > the fix.
> 
> Sure, I'll retrigger the test, and report the result here (or directly
> a new patch, should the test be successful)

Contrary to my expectations, the suggested change does not fix the
issue. I'm still investigating the overall locking schema.

Cheers,

Paolo

^ permalink raw reply

* Re: [PATCH net-next 1/6] net: hippi: use pci_zalloc_consistent
From: Sergei Shtylyov @ 2018-06-06  9:53 UTC (permalink / raw)
  To: YueHaibing, davem
  Cc: netdev, linux-kernel, jcliburn, chris.snook, benve, jdmason,
	chessman, jes, rahul.verma
In-Reply-To: <20180605122851.23912-2-yuehaibing@huawei.com>

Hello!

On 6/5/2018 3:28 PM, YueHaibing wrote:

> use pci_zalloc_consistent to remove unnecessary memset.
> Also remove a local intermediate pointer 'tmpptr'.

    Don't do 2 things with 1 patch.

> Signed-off-by: YueHaibing <yuehaibing@huawei.com>
[...]

MBR, Sergei

^ permalink raw reply

* Re: [PATCH net-next 4/6] netxen_nic: use pci_zalloc_consistent
From: Sergei Shtylyov @ 2018-06-06  9:56 UTC (permalink / raw)
  To: YueHaibing, davem
  Cc: netdev, linux-kernel, jcliburn, chris.snook, benve, jdmason,
	chessman, jes, rahul.verma
In-Reply-To: <20180605122851.23912-5-yuehaibing@huawei.com>

On 6/5/2018 3:28 PM, YueHaibing wrote:

> use pci_zalloc_consistent to remove unnecessary memset.
> Also remove a local intermediate pointer 'void *addr'.

    Again, don't do 2 things with 1 patch.

> Signed-off-by: YueHaibing <yuehaibing@huawei.com>
[...]

MBR, Sergei

^ permalink raw reply

* Re: general protection fault in sockfs_setattr
From: Tetsuo Handa @ 2018-06-06 10:17 UTC (permalink / raw)
  To: shankarapailoor, Cong Wang
  Cc: David Miller, LKML, syzkaller, Linux Kernel Network Developers
In-Reply-To: <CAB+yDaahHYeWE1kS=4GEAC3GKyrAenaSat9Wi_iKQbt1yTp_zg@mail.gmail.com>

Pastebin says that it was 4.17.0-rc4+ rather than 4.17-rc7.
I suggest reporting to Al Viro and the linux-fsdevel ML after
confirming that this bug still happens with linux.git, in
case this is a dentry-related bug (e.g. someone is mistakenly
calling dput() without holding a refcount).

Also, please don't eliminate kernel messages prior to the
crash. Sometimes previous kernel messages (e.g. memory
allocation fault injection) as-is indicate the cause.

On 2018/06/06 11:19, shankarapailoor wrote:
> Hi Cong,
> 
> I added that check and it seems to stop the crash. Like you said, I
> don't see where the reference count for the file is increased. The
> inode lock also seems to be held during this call.
> 
> Regards,
> Shankara
> 
> 
> 
> On Tue, Jun 5, 2018 at 12:14 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>> On Mon, Jun 4, 2018 at 9:53 PM, shankarapailoor
>> <shankarapailoor@gmail.com> wrote:
>>> Hi,
>>>
>>> I have been fuzzing Linux 4.17-rc7 with Syzkaller and found the
>>> following crash: https://pastebin.com/ixX3RB9j
>>>
>>> Syzkaller isolated the cause of the bug to the following program:
>>>
>>> socketpair$unix(0x1, 0x1, 0x0,
>>> &(0x7f0000000000)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
>>> getresuid(&(0x7f0000000080)=<r2=>0x0, &(0x7f00000000c0),
>>> &(0x7f0000000700))r3 = getegid()
>>> fchownat(r0, &(0x7f0000000040)='\x00', r2, r3, 0x1000)
>>> dup3(r1, r0, 0x80000)
>>>
>>>
>>> The problematic area appears to be here:
>>>
>>> static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr)
>>> {
>>>     int err = simple_setattr(dentry, iattr);
>>>
>>>     if (!err && (iattr->ia_valid & ATTR_UID)) {
>>>          struct socket *sock = SOCKET_I(d_inode(dentry));
>>>
>>>          sock->sk->sk_uid = iattr->ia_uid; //KASAN GPF
>>>     }
>>>     return err;
>>> }
>>>
>>> If dup3 is called concurrently with fchownat then can sock->sk be NULL?
>>
>> Although dup3() implies a close(), fd is refcnt'ted, if dup3() runs
>> concurrently with fchownat() it should not be closed until whoever
>> the last closes it.
>>
>> Or maybe fchownat() doesn't even hold refcnt of fd, since it aims
>> to change the file backed.
>>
>>
>> Not sure if the following is sufficient, inode might need to be protected
>> with some lock...
>>
>> diff --git a/net/socket.c b/net/socket.c
>> index f10f1d947c78..6294b4b3132e 100644
>> --- a/net/socket.c
>> +++ b/net/socket.c
>> @@ -537,7 +537,10 @@ static int sockfs_setattr(struct dentry *dentry,
>> struct iattr *iattr)
>>         if (!err && (iattr->ia_valid & ATTR_UID)) {
>>                 struct socket *sock = SOCKET_I(d_inode(dentry));
>>
>> -               sock->sk->sk_uid = iattr->ia_uid;
>> +               if (sock->sk)
>> +                       sock->sk->sk_uid = iattr->ia_uid;
>> +               else
>> +                       err = -ENOENT;
>>         }
>>
>>         return err;
> 
> 
> 

^ permalink raw reply

* Question about force_primary in bonding driver
From: Xiangning Yu @ 2018-06-06 10:18 UTC (permalink / raw)
  To: Linux Kernel Network Developers

Hi netdev folks,

While playing with bonding active-standby mode, we found a possible
timing issue where the primary might not be initialized in bond_enslave():

        if (bond_uses_primary(bond) && bond->params.primary[0]) {
                /* if there is a primary slave, remember it */
                if (strcmp(bond->params.primary, new_slave->dev->name) == 0) {
                        rcu_assign_pointer(bond->primary_slave, new_slave);
                        bond->force_primary = true;
                }
        }

Hence bond->force_primary won't be set and bond_select_active_slave()
ends up not selecting the primary slave as specified in bond params.

We applied the following patch to make sure force_primary is set when
primary is copied, and it seems to fix the issue. Could you please
provide some comments/thoughts on this?

Thank you very much!

- Xiangning

diff --git a/drivers/net/bonding/bond_options.c
b/drivers/net/bonding/bond_options.c
index 58c705f..b594bae 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -1142,6 +1142,7 @@ static int bond_option_primary_set(struct bonding *bond,
                                   slave->dev->name);
                        rcu_assign_pointer(bond->primary_slave, slave);
                        strcpy(bond->params.primary, slave->dev->name);
+                       bond->force_primary = true;
                        bond_select_active_slave(bond);
                        goto out;
                }

^ permalink raw reply related

* Re: [RFC PATCH] kcm: hold rx mux lock when updating the receive queue.
From: Kirill Tkhai @ 2018-06-06 10:25 UTC (permalink / raw)
  To: Paolo Abeni, Tom Herbert, David Miller
  Cc: Linux Kernel Network Developers, Tom Herbert
In-Reply-To: <e8506389509c13db6b7837d55cecee742517f34d.camel@redhat.com>

Hi, Paolo,

below are a couple of my thoughts about this.

On 06.06.2018 12:44, Paolo Abeni wrote:
> On Tue, 2018-06-05 at 18:06 +0200, Paolo Abeni wrote:
>> On Tue, 2018-06-05 at 08:35 -0700, Tom Herbert wrote:
>>> Paolo, thanks for looking into this! Can you try replacing
>>> __skb_dequeue in requeue_rx_msgs with skb_dequeue to see if that is
>>> the fix.
>>
>> Sure, I'll retrigger the test, and report the result here (or directly
>> a new patch, should the test be succesful)
> 
> Contrary to my expectations, the suggested change does not fix the
> issue. I'm still investigating the overall locking schema.

kcm_rcv_strparser()->unreserve_rx_kcm()->requeue_rx_msgs()->__skb_dequeue()

seems to need to be synchronized with:

kcm_recvmsg()->kcm_wait_data().

Otherwise, requeue_rx_msgs() removes kcm_recvmsg() peeked skb.

The solution could be to take lock_sock(&kcm->sk) in requeue_rx_msgs(), but
we can't do that since another socket is already locked (and this could
potentially lead to a deadlock).

The approach you took in the initial patch seems good to me for solving this problem.
The only thing I'm not sure about is whether lock_sock() is needed in kcm_recvmsg()
after this.

Thanks,
Kirill

^ permalink raw reply

* Re: BUG: 4.14.11 unable to handle kernel NULL pointer dereference in xfrm_lookup
From: Kristian Evensen @ 2018-06-06 10:41 UTC (permalink / raw)
  To: Steffen Klassert; +Cc: Markus Berner, netdev-list, Network Development
In-Reply-To: <20180202080945.vj3qbgegodew2hxr@gauss3.secunet.de>

Hi,

I am experiencing the same issue on a PC Engines APU2 running kernel
4.14.34, both with and without hardware encryption. With hw.
encryption, the crash occurs within 2-4 hours. Without hw. encryption,
it takes 7-8 hours. My setup is nothing crazy, between 7 and 20
tunnels with heavy RX/TX.

On Fri, Feb 2, 2018 at 9:09 AM, Steffen Klassert
<steffen.klassert@secunet.com> wrote:
> Thanks for offering help, but I fear we have to wait until
> Tobias has bisected it.

Has any progress been made here? I would like to try a bisect myself,
but my board is running OpenWRT, which makes bisecting hard. The only
observation I have so far, is that I did not see the issue with kernel
4.9.

BR,
Kristian

^ permalink raw reply

* Re: BUG: 4.14.11 unable to handle kernel NULL pointer dereference in xfrm_lookup
From: Kristian Evensen @ 2018-06-06 10:58 UTC (permalink / raw)
  To: Steffen Klassert; +Cc: Markus Berner, netdev-list, Network Development
In-Reply-To: <CAKfDRXhHMg5HKX=T1BKs5mx+BL-L8VyzqApS+8E52mDipZzU6Q@mail.gmail.com>

On Wed, Jun 6, 2018 at 12:41 PM, Kristian Evensen
<kristian.evensen@gmail.com> wrote:
> Hi,
>
> I am experiencing the same issue on a PC Engines APU2 running kernel
> 4.14.34, both with and without hardware encryption. With hw.
> encryption, the crash occurs within 2-4 hours. Without hw. encryption,
> it takes 7-8 hours. My setup is nothing crazy, between 7 and 20
> tunnels with heavy RX/TX.

I am very sorry, I forgot to c&p the oops. Here it is:

[26675.130763] BUG: unable to handle kernel NULL pointer dereference
at 0000000000000020
[26675.139335] IP: xfrm_lookup+0x1d/0x7b0
[26675.143318] PGD 11726e067 P4D 11726e067 PUD 11697e067 PMD 0
[26675.149542] Oops: 0000 [#1] SMP NOPTI
[26675.153451] Modules linked in: qcserial ppp_async option cdc_mbim
usb_wwan rt2800pci rt2800mmio rt2800lib rndis_host qmi_wwan
ppp_generic nf_nat_pptp nf_conntrack_pptp nf_conntrack_ipv6
iptable_nat ipt_REJECT ipt_MASQUERADE huawei_cdc_ncm cdc_ncm cdc_ether
ax88179_178a asix xt_time xt_tcpudp xt_tcpmss xt_string xt_statistic
xt_state xt_recent xt_quota xt_policy xt_pkttype xt_owner xt_nfacct
xt_nat xt_multiport xt_mark xt_mac xt_limit xt_length xt_hl xt_helper
xt_esp xt_ecn xt_dscp xt_conntrack xt_connmark xt_connlimit
xt_connlabel xt_connbytes xt_comment xt_addrtype xt_TCPMSS xt_REDIRECT
xt_NFQUEUE xt_LOG xt_HL xt_DSCP xt_CT xt_CLASSIFY usbserial usbnet
ts_fsm ts_bm slhc rt2x00pci rt2x00mmio rt2x00lib r8169 nfnetlink_queue
nfnetlink_acct nf_reject_ipv4 nf_nat_tftp nf_nat_snmp_basic nf_nat_sip
nf_nat_redirect
[26675.227953]  nf_nat_proto_gre nf_nat_masquerade_ipv4 nf_nat_irc
nf_conntrack_ipv4 nf_nat_ipv4 nf_nat_h323 nf_nat_ftp nf_nat_amanda
nf_nat nf_log_ipv4 nf_defrag_ipv6 nf_defrag_ipv4 nf_conntrack_tftp
nf_conntrack_snmp nf_conntrack_sip nf_conntrack_proto_gre
nf_conntrack_netlink nf_conntrack_irc nf_conntrack_h323
nf_conntrack_ftp nf_conntrack_broadcast ts_kmp nf_conntrack_amanda
lib80211_crypt_wep lib80211_crypt_tkip lib80211_crypt_ccmp lib80211
iptable_raw iptable_mangle iptable_filter ipt_ah ipt_ECN ip6table_raw
ip_tables crc_itu_t crc_ccitt compat_xtables cdc_wdm br_netfilter
ath10k_pci ath10k_core sch_cake ath9k ath9k_common ath9k_hw ath
mac80211 cfg80211 compat act_connmark nf_conntrack act_skbedit
act_mirred em_u32 cls_u32 cls_tcindex cls_flow cls_route cls_fw
sch_tbf sch_htb sch_hfsc sch_ingress
[26675.300777]  evdev i2c_dev cryptodev xt_set ip_set_list_set
ip_set_hash_netiface ip_set_hash_netport ip_set_hash_netnet
ip_set_hash_net ip_set_hash_netportnet ip_set_hash_mac
ip_set_hash_ipportnet ip_set_hash_ipportip ip_set_hash_ipport
ip_set_hash_ipmark ip_set_hash_ip ip_set_bitmap_port
ip_set_bitmap_ipmac ip_set_bitmap_ip ip_set nfnetlink ip6t_REJECT
nf_reject_ipv6 nf_log_ipv6 nf_log_common ip6table_mangle
ip6table_filter ip6_tables x_tables igb i2c_algo_bit e1000 ifb ipcomp6
xfrm6_tunnel xfrm6_mode_tunnel xfrm6_mode_transport xfrm6_mode_beet
esp6 ah6 ipcomp xfrm4_tunnel xfrm4_mode_tunnel xfrm4_mode_transport
xfrm4_mode_beet esp4 ah4 tunnel6 tunnel4 tun af_key xfrm_user
xfrm_ipcomp xfrm_algo eeprom_93cx6 sha256_ssse3 sha1_ssse3
sha1_generic michael_mic md5 echainiv des_generic deflate zlib_deflate
[26675.373443]  cmac authenc crypto_acompress xhci_plat_hcd uhci_hcd
leds_apu2 ehci_platform gpio_button_hotplug ptp pps_core mii libphy
[26675.385812] CPU: 3 PID: 24 Comm: ksoftirqd/3 Not tainted 4.14.34 #0
[26675.392308] Hardware name: PC Engines apu2/apu2, BIOS 88a4f96 03/07/2016
[26675.399200] task: ffff88011a0fc680 task.stack: ffffc900000e0000
[26675.405368] RIP: 0010:xfrm_lookup+0x1d/0x7b0
[26675.409736] RSP: 0018:ffffc900000e3b68 EFLAGS: 00010292
[26675.415066] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
[26675.422407] RDX: ffffc900000e3be8 RSI: 0000000000000000 RDI: ffffffff81c6b480
[26675.429732] RBP: ffffc900000e3bd8 R08: 0000000000000002 R09: 0000000000000001
[26675.437107] R10: 0000000000000032 R11: 0000000000000000 R12: ffff880112954800
[26675.444524] R13: 0000000000000002 R14: ffffc900000e3be8 R15: ffffffff81c6b480
[26675.451876] FS:  0000000000000000(0000) GS:ffff88011ed80000(0000)
knlGS:0000000000000000
[26675.460101] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[26675.465971] CR2: 0000000000000020 CR3: 0000000116928000 CR4: 00000000000406e0
[26675.473282] Call Trace:
[26675.475801]  ? __xfrm_policy_check+0x1cc/0x520
[26675.480419]  __xfrm_route_forward+0x96/0xe0
[26675.484698]  ip_forward+0x10f/0x3a0
[26675.488233]  ? ip_route_input_noref+0x14/0x20
[26675.492713]  ip_rcv_finish+0x26d/0x2c0
[26675.496525]  ip_rcv+0x253/0x2c0
[26675.499767]  ? ip_local_deliver_finish+0x180/0x180
[26675.504752]  __netif_receive_skb_core+0x7b6/0x920
[26675.509577]  __netif_receive_skb+0x54/0x60
[26675.513754]  ? __netif_receive_skb+0x54/0x60
[26675.518140]  process_backlog+0x91/0x130
[26675.522194]  net_rx_action+0xf3/0x250
[26675.525933]  __do_softirq+0xb5/0x1d1
[26675.529600]  run_ksoftirqd+0x1b/0x40
[26675.533275]  smpboot_thread_fn+0x152/0x170
[26675.537538]  kthread+0x112/0x120
[26675.540873]  ? sort_range+0x20/0x20
[26675.544422]  ? kthread_create_on_node+0x40/0x40
[26675.549092]  ret_from_fork+0x22/0x40
[26675.552737] Code: 1f 44 00 00 66 2e 0f 1f 84 00 00 00 00 00 55 48
89 e5 41 57 41 56 41 55 41 54 49 89 ff 53 49 89 d6 48 89 f3 45 89 c5
48 83 ec 48 <48> 8b 46 20 48 85 c9 0f b7 00 c7 45 ac 00 00 00 00 66 89
45 a0
[26675.572124] RIP: xfrm_lookup+0x1d/0x7b0 RSP: ffffc900000e3b68
[26675.578059] CR2: 0000000000000020
[26675.581516] ---[ end trace a1b7aabd4d96add5 ]---
[26675.586317] Kernel panic - not syncing: Fatal exception in interrupt
[26675.592914] Kernel Offset: disabled
[26675.596505] Rebooting in 3 seconds..

BR,
Kristian

^ permalink raw reply

* Re: [PATCH bpf-next v4 2/2] samples/bpf: Add xdp_sample_pkts example
From: Toke Høiland-Jørgensen @ 2018-06-06 11:01 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: netdev
In-Reply-To: <20180605152434.0149b8a7@cakuba.netronome.com>

Jakub Kicinski <jakub.kicinski@netronome.com> writes:

> On Tue, 05 Jun 2018 16:50:00 +0200, Toke Høiland-Jørgensen wrote:
>> Add an example program showing how to sample packets from XDP using the
>> perf event buffer. The example userspace program just prints the ethernet
>> header for every packet sampled.
>> 
>> Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
>
>> diff --git a/samples/bpf/xdp_sample_pkts_kern.c b/samples/bpf/xdp_sample_pkts_kern.c
>> new file mode 100644
>> index 000000000000..4560522ca015
>> --- /dev/null
>> +++ b/samples/bpf/xdp_sample_pkts_kern.c
>> @@ -0,0 +1,62 @@
>> +#include <linux/ptrace.h>
>> +#include <linux/version.h>
>> +#include <uapi/linux/bpf.h>
>> +#include "bpf_helpers.h"
>> +
>> +#define SAMPLE_SIZE 64ul
>> +#define MAX_CPUS 24
>
> That may be a lil' too few for modern HW with hyper-threading on ;)  
> My development machine says:
>
> $ ncpus
> 28
>
> 128, maybe?

What? 24 CPUs should be enough for everyone, surely? ;)

(I just took twice the number of CPUs in my largest machine; but fair
point, other people have access to more expensive toys I guess... will fix)

>> +#define bpf_printk(fmt, ...)					\
>> +({								\
>> +	       char ____fmt[] = fmt;				\
>> +	       bpf_trace_printk(____fmt, sizeof(____fmt),	\
>> +				##__VA_ARGS__);			\
>> +})
>> +
>> +struct bpf_map_def SEC("maps") my_map = {
>> +	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
>> +	.key_size = sizeof(int),
>> +	.value_size = sizeof(u32),
>> +	.max_entries = MAX_CPUS,
>> +};
>> +
>> +SEC("xdp_sample")
>> +int xdp_sample_prog(struct xdp_md *ctx)
>> +{
>> +	void *data_end = (void *)(long)ctx->data_end;
>> +	void *data = (void *)(long)ctx->data;
>> +
>> +        /* Metadata will be in the perf event before the packet data. */
>> +	struct S {
>> +		u16 cookie;
>> +		u16 pkt_len;
>> +	} __attribute__((packed)) metadata;
>> +
>> +	if (data + SAMPLE_SIZE < data_end) {
>> +		/* The XDP perf_event_output handler will use the upper 32 bits
>> +		 * of the flags argument as a number of bytes to include of the
>> +		 * packet payload in the event data. If the size is too big, the
>> +		 * call to bpf_perf_event_output will fail and return -EFAULT.
>> +		 *
>> +		 * See bpf_xdp_event_output in net/core/filter.c.
>> +		 *
>> +		 * The BPF_F_CURRENT_CPU flag means that the event output fd
>> +		 * will be indexed by the CPU number in the event map.
>> +		 */
>> +		u64 flags = (SAMPLE_SIZE << 32) | BPF_F_CURRENT_CPU;
>> +		int ret;
>> +
>> +		metadata.cookie = 0xdead;
>> +		metadata.pkt_len = (u16)(data_end - data);
>> +
>> +		ret = bpf_perf_event_output(ctx, &my_map, flags,
>> +				      &metadata, sizeof(metadata));
>> +		if(ret)
>
> Please run checkpatch --strict on the samples.

Will do!

>> +			bpf_printk("perf_event_output failed: %d\n", ret);
>> +	}
>> +
>> +	return XDP_PASS;
>> +}
>> +
>> +char _license[] SEC("license") = "GPL";
>> +u32 _version SEC("version") = LINUX_VERSION_CODE;

^ permalink raw reply

* Re: [PATCH bpf-next v4 2/2] samples/bpf: Add xdp_sample_pkts example
From: Jesper Dangaard Brouer @ 2018-06-06 11:33 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: brouer, Jakub Kicinski, netdev, Martin KaFai Lau
In-Reply-To: <87vaawyy8v.fsf@toke.dk>

On Wed, 06 Jun 2018 13:01:52 +0200
Toke Høiland-Jørgensen <toke@toke.dk> wrote:

> Jakub Kicinski <jakub.kicinski@netronome.com> writes:
> 
> > On Tue, 05 Jun 2018 16:50:00 +0200, Toke Høiland-Jørgensen wrote:  
> >> Add an example program showing how to sample packets from XDP using the
> >> perf event buffer. The example userspace program just prints the ethernet
> >> header for every packet sampled.
> >> 
> >> Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>  
> >  
> >> diff --git a/samples/bpf/xdp_sample_pkts_kern.c b/samples/bpf/xdp_sample_pkts_kern.c
> >> new file mode 100644
> >> index 000000000000..4560522ca015
> >> --- /dev/null
> >> +++ b/samples/bpf/xdp_sample_pkts_kern.c
> >> @@ -0,0 +1,62 @@
> >> +#include <linux/ptrace.h>
> >> +#include <linux/version.h>
> >> +#include <uapi/linux/bpf.h>
> >> +#include "bpf_helpers.h"
> >> +
> >> +#define SAMPLE_SIZE 64ul
> >> +#define MAX_CPUS 24  
> >
> > That may be a lil' too few for modern HW with hyper-threading on ;)  
> > My development machine says:
> >
> > $ ncpus
> > 28
> >
> > 128, maybe?  
> 
> What? 24 CPUs should be enough for everyone, surely? ;)
> 
> (I just took twice the number of CPUs in my largest machine; but fair
> point, other people have access to more expensive toys I guess... will fix)
> 
> >> +#define bpf_printk(fmt, ...)					\
> >> +({								\
> >> +	       char ____fmt[] = fmt;				\
> >> +	       bpf_trace_printk(____fmt, sizeof(____fmt),	\
> >> +				##__VA_ARGS__);			\
> >> +})
> >> +
> >> +struct bpf_map_def SEC("maps") my_map = {
> >> +	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
> >> +	.key_size = sizeof(int),
> >> +	.value_size = sizeof(u32),
> >> +	.max_entries = MAX_CPUS,
> >> +};

I agree, we should increase it to something larger than 24, just to
avoid people getting surprised, but this is only sample code.

At some point I want us to add another example that shows how to adjust
this at runtime, which is actually possible with bpf_load.c via a map
callback that Martin added.  I've not looked at how libbpf can support
this.  I do have sample code[1] that could be adjusted to do this, but
I don't want to create dependencies to bpf_load.c, as we all agree that
everything should be moved to use libbpf (tools/lib/bpf).

[1] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/samples/bpf/xdp_ddos01_blacklist_user.c#L186-L218

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [PATCH bpf-next v2 2/2] samples/bpf: Add xdp_sample_pkts example
From: David Beckett @ 2018-06-06 11:40 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, netdev
In-Reply-To: <152813003614.3465.11049830599815911223.stgit@alrua-kau>

On 04/06/18 17:33, Toke Høiland-Jørgensen wrote:
> +
> +#define SAMPLE_SIZE 64ul
> +
The program currently cannot sample minimum-sized packets, as the 4-byte
CRC checksum isn't present in ctx.
It may be better to use a 60ul sample size to allow these packets to be
processed?
> +	if (data + SAMPLE_SIZE < data_end) {
> +		/* The XDP perf_event_output handler will use the upper 32 bits
> +		 * of the flags argument as a number of bytes to include of the
I may be wrong on this but should this also be <= to allow for packets 
at SAMPLE_SIZE to be sampled?

^ permalink raw reply

* RE: [PATCH 09/10] dpaa_eth: add support for hardware timestamping
From: Y.b. Lu @ 2018-06-06 11:48 UTC (permalink / raw)
  To: Richard Cochran
  Cc: netdev@vger.kernel.org, Madalin-cristian Bucur, Rob Herring,
	Shawn Guo, David S . Miller, devicetree@vger.kernel.org,
	linuxppc-dev@lists.ozlabs.org,
	linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <20180605135748.mlarwiyzf2oe27ax@localhost>

Hi Richard,

> -----Original Message-----
> From: Richard Cochran [mailto:richardcochran@gmail.com]
> Sent: Tuesday, June 5, 2018 9:58 PM
> To: Y.b. Lu <yangbo.lu@nxp.com>
> Cc: netdev@vger.kernel.org; Madalin-cristian Bucur
> <madalin.bucur@nxp.com>; Rob Herring <robh+dt@kernel.org>; Shawn Guo
> <shawnguo@kernel.org>; David S . Miller <davem@davemloft.net>;
> devicetree@vger.kernel.org; linuxppc-dev@lists.ozlabs.org;
> linux-arm-kernel@lists.infradead.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH 09/10] dpaa_eth: add support for hardware timestamping
> 
> On Tue, Jun 05, 2018 at 03:35:28AM +0000, Y.b. Lu wrote:
> > [Y.b. Lu] Actually these timestamping codes affected DPAA networking
> performance in our previous performance test.
> > That's why we used ifdef for it.
> 
> How much does time stamping hurt performance?
> 
> If the time stamping is compiled in but not enabled at run time, does it still
> affect performace?

[Y.b. Lu] I can't remember or find the old data since it has been a long time.
I just did an iperf test today between two 10G ports. I didn’t see any performance changes with the timestamping code 😊
So, let me remove the ifdef in the next version.
Thanks a lot.


> 
> Thanks,
> Richard

^ permalink raw reply

* Re: [PATCH net] failover: eliminate callback hell
From: Michael S. Tsirkin @ 2018-06-06 12:19 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Samudrala, Sridhar, kys, haiyangz, davem, netdev,
	Stephen Hemminger
In-Reply-To: <20180605205118.439a7873@xeon-e3>

On Tue, Jun 05, 2018 at 08:51:18PM -0700, Stephen Hemminger wrote:
> > I think the push back was with the usage of the delay, not bringing up the primary/standby
> > device in the name change event handler.
> > Can't netvsc use this mechanism instead of depending on the delay?
> > 
> > 
> 
> The patch that was rejected for netvsc was about using name change.

So failover is now doing exactly what you wanted netvsc to do.  Rather
than reverting everyone to old behaviour how about using more pieces
from failover?

> Also, you can't depend on name change; you still need a timer. Not all distributions
> change name of devices.

So failover chose not to implement the delayed open so far.
If it does I suspect we'll have to keep it around forever -
kind of like netvsc seems to be stuck with it.
But let's see if there's enough actual demand from people running
ancient distros with latest kernels to even start looking for
a solution for failover.

And this kind of behaviour change really should be split out
so we can discuss it separately.

> Or user has blocked that by udev rules.

Don't do that then?

-- 
MST

^ permalink raw reply

* Re: [RFC PATCH] kcm: hold rx mux lock when updating the receive queue.
From: Paolo Abeni @ 2018-06-06 12:23 UTC (permalink / raw)
  To: Kirill Tkhai, Tom Herbert, David Miller
  Cc: Linux Kernel Network Developers, Tom Herbert
In-Reply-To: <9b036d06-f011-6e80-956a-1162af25a5b8@virtuozzo.com>

Hi,

On Wed, 2018-06-06 at 13:25 +0300, Kirill Tkhai wrote:
> Hi, Paolo,
> 
> below is couple my thoughts about this.
> 
> On 06.06.2018 12:44, Paolo Abeni wrote:
> > On Tue, 2018-06-05 at 18:06 +0200, Paolo Abeni wrote:
> > > On Tue, 2018-06-05 at 08:35 -0700, Tom Herbert wrote:
> > > > Paolo, thanks for looking into this! Can you try replacing
> > > > __skb_dequeue in requeue_rx_msgs with skb_dequeue to see if that is
> > > > the fix.
> > > 
> > > Sure, I'll retrigger the test, and report the result here (or directly
> > > a new patch, should the test be succesful)
> > 
> > Contrary to my expectations, the suggested change does not fix the
> > issue. I'm still investigating the overall locking schema.
> 
> kcm_rcv_strparser()->unreserve_rx_kcm()->requeue_rx_msgs()->__skb_dequeue()
> 
> seems needed to be synchronized with:
> 
> kcm_recvmsg()->kcm_wait_data().
> 
> Otherwise, requeue_rx_msgs() removes kcm_recvmsg() peeked skb.
> 
> The solution could be to take lock_sock(&kcm->sk) in requeue_rx_msgs(), but
> we can't do that since there is already locked another socket (and potentially,
> this may be a reason of deadlock).
> 
> The approach you made in initial patch seems good for me to solve this problem.
> The only thing I'm not sure is either lock_sock() is needed in kcm_recvmsg() after
> this.

Thank you for the feedback!

I tried a different approach (add an explicit 'peek' argument to
kcm_wait_data, and dequeue the packet there if not explicitly asked
otherwise). It solves the issue and looks reasonably clean.

I'll post the patch soon.

Cheers,

Paolo

^ permalink raw reply

* Re: [PATCH bpf-next v2 2/2] samples/bpf: Add xdp_sample_pkts example
From: Toke Høiland-Jørgensen @ 2018-06-06 12:28 UTC (permalink / raw)
  To: David Beckett, netdev
In-Reply-To: <1448c449-b853-b2df-2654-44c9c52adfe4@netronome.com>

David Beckett <david.beckett@netronome.com> writes:

> On 04/06/18 17:33, Toke Høiland-Jørgensen wrote:
>> +
>> +#define SAMPLE_SIZE 64ul
>> +
> The program currently cannot sample minimum sized packets, as the 4 Byte 
> crc checksum isnt present in ctx,
> may be better to use 60ul sample size to allow for these packets to be 
> processed?

Right. However, this also reminds me that I wanted to make the sampling
size dynamic, so it is possible to dump packets that are smaller than
the configured SAMPLE_SIZE. Will fix :)

>> +	if (data + SAMPLE_SIZE < data_end) {
>> +		/* The XDP perf_event_output handler will use the upper 32 bits
>> +		 * of the flags argument as a number of bytes to include of the
> I may be wrong on this but should this also be <= to allow for packets 
> at SAMPLE_SIZE to be sampled?

Yes, you are right, but that goes away with the change I mentioned above.

-Toke

^ permalink raw reply

* Re: [PATCH net] failover: eliminate callback hell
From: Michael S. Tsirkin @ 2018-06-06 12:30 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Stephen Hemminger, kys, haiyangz, davem, sridhar.samudrala,
	netdev, Stephen Hemminger
In-Reply-To: <20180606072512.GA2289@nanopsycho.orion>

On Wed, Jun 06, 2018 at 09:25:12AM +0200, Jiri Pirko wrote:
> Tue, Jun 05, 2018 at 05:42:31AM CEST, stephen@networkplumber.org wrote:
> >The net failover should be a simple library, not a virtual
> >object with function callbacks (see callback hell).
> 
> Why just a library? It should do a common things. I think it should be a
> virtual object. Looks like your patch again splits the common
> functionality into multiple drivers. That is kind of backwards attitude.
> I don't get it. We should rather focus on fixing the mess the
> introduction of netvsc-bonding caused and switch netvsc to 3-netdev
> model.

So it seems that at least one benefit for netvsc would be better
handling of renames.

Question is how can this change to 3-netdev happen?  Stephen is
concerned about risk of breaking some userspace.

Stephen, this seems to be the usecase that IFF_HIDDEN was trying to
address, and you said then "why not use existing network namespaces
rather than inventing a new abstraction". So how about it then? Do you
want to find a way to use namespaces to hide the PV device for netvsc
compatibility?

-- 
MST

^ permalink raw reply

* Re: [PATCH bpf-next v5 00/10] BTF: BPF Type Format
From: Arnaldo Carvalho de Melo @ 2018-06-06 12:33 UTC (permalink / raw)
  To: Martin KaFai Lau; +Cc: netdev, Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <20180605212548.lwtaw4svvydo2lhy@kafai-mbp.dhcp.thefacebook.com>

Em Tue, Jun 05, 2018 at 02:25:48PM -0700, Martin KaFai Lau escreveu:
> On Thu, Apr 19, 2018 at 04:40:34PM -0300, Arnaldo Carvalho de Melo wrote:
> > Em Wed, Apr 18, 2018 at 03:55:56PM -0700, Martin KaFai Lau escreveu:
> > > This patch introduces BPF Type Format (BTF).
> > > 
> > > BTF (BPF Type Format) is the meta data format which describes
> > > the data types of BPF program/map.  Hence, it basically focus
> > > on the C programming language which the modern BPF is primary
> > > using.  The first use case is to provide a generic pretty print
> > > capability for a BPF map.
> > > 
> > > A modified pahole that can convert dwarf to BTF is here:
> > > https://github.com/iamkafai/pahole/tree/btf
> > > (Arnaldo, there is some BTF_KIND numbering changes on
> > >  Apr 18th, d61426c1571)
> > 
> > Thanks for letting me know, I'm starting to look at this,
> Hi Arnaldo,
> 
> Do you have a chance to take a look and pull it?  The kernel
> changes will be in 4.18, so it will be handy if it is available in
> the pahole repository.
> 
> [ btw, the latest commit (1 commit) should be 94a11b59e592 ].

Got sidetracked, will get back to it later today.

- Arnaldo

^ permalink raw reply

* Re: [PATCH] r8169: Reinstate ALDPS and ASPM support
From: Andrew Lunn @ 2018-06-06 12:34 UTC (permalink / raw)
  To: Kai-Heng Feng
  Cc: David Miller, Hayes Wang, hkallweit1, romieu, Linux Netdev List,
	Linux Kernel Mailing List, Ryankao
In-Reply-To: <8BAED2AC-DA94-4F63-BA16-EB3D54649BC5@canonical.com>

> Hi Andrew,
> 
> Sure.
> 
> Do you think we should also strip out the enable/disable logic?
> 
> Or just remove the parameter?

Hi Kai-Heng

How would you use the logic if you remove the parameter?

    Andrew

^ permalink raw reply

* Re: [PATCH] net/phy: Micrel KSZ8061 PHY link failure after cable connect
From: Andrew Lunn @ 2018-06-06 12:39 UTC (permalink / raw)
  To: Alexander Onnasch; +Cc: Florian Fainelli, netdev, linux-kernel
In-Reply-To: <1528272198-10825-1-git-send-email-alexander.onnasch@landisgyr.com>

On Wed, Jun 06, 2018 at 10:03:18AM +0200, Alexander Onnasch wrote:
> With Micrel KSZ8061 PHY, the link may occasionally not come up after
> Ethernet cable connect. The vendor's (Microchip, former Micrel) errata
> sheet 80000688A.pdf describes the problem and possible workarounds in
> detail, see below.
> The patch implements workaround 1, which permanently fixes the issue.
> 
> DESCRIPTION
> Link-up may not occur properly when the Ethernet cable is initially
> connected. This issue occurs more commonly when the cable is connected
> slowly, but it may occur any time a cable is connected. This issue occurs
> in the auto-negotiation circuit, and will not occur if auto-negotiation
> is disabled (which requires that the two link partners be set to the
> same speed and duplex).
> 
> END USER IMPLICATIONS
> When this issue occurs, link is not established. Subsequent cable
> plug/unplug cycles will not correct the issue.
> 
> WORK AROUND
> There are four approaches to work around this issue:
> 1.  This issue can be prevented by setting bit 15 in MMD device address 1,
>     register 2, prior to connecting the  cable or prior to setting the
>     Restart Auto-Negotiation bit in register 0h.The MMD registers are
>     accessed via the indirect access registers Dh and Eh, or via the Micrel
>     EthUtil utility as shown here:
>     •  If using the EthUtil utility (usually with a Micrel KSZ8061
>        Evaluation Board), type the following commands:
>        > address 1
>        > mmd 1
>        > iw 2 b61a
>     •  Alternatively, write the following registers to write to the
>        indirect MMD register:
>        Write register Dh, data 0001h
>        Write register Eh, data 0002h
>        Write register Dh, data 4001h
>        Write register Eh, data B61Ah
> 2.  The issue can be avoided by disabling auto-negotiation in the KSZ8061,
>     either by the strapping option, or by clearing bit 12 in register 0h.
>     Care must be taken to ensure that the KSZ8061 and the link partner
>     will link with the same speed and duplex. Note that the KSZ8061
>     defaults to full-duplex when auto-negotiation is off, but other
>     devices may default to half-duplex in the event of failed
>     auto-negotiation.
> 3.  The issue can be avoided by connecting the cable prior to powering-up
>     or resetting the KSZ8061, and leaving it plugged in thereafter.
> 4.  If the above measures are not taken and the problem occurs, link can
>     be recovered by setting the Restart Auto-Negotiation bit in
>     register 0h, or by resetting or power cycling the device. Reset may
>     be either hardware reset or software reset (register 0h, bit 15).
> 
> PLAN
> This errata will not be corrected in a future revision.
> 
> Signed-off-by: Alexander Onnasch <alexander.onnasch@landisgyr.com>
> ---
>  drivers/net/phy/micrel.c | 24 +++++++++++++++++++++++-
>  1 file changed, 23 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
> index 6c45ff6..7d80a00 100644
> --- a/drivers/net/phy/micrel.c
> +++ b/drivers/net/phy/micrel.c
> @@ -339,6 +339,28 @@ static int ksz8041_config_aneg(struct phy_device *phydev)
>  	return genphy_config_aneg(phydev);
>  }
>  
> +#define MII_KSZ8061RN_MMD_CTRL_REG	    0x0d
> +#define MII_KSZ8061RN_MMD_REGDATA_REG	0x0e
> +
> +static int ksz8061_extended_write(struct phy_device *phydev,
> +				  u8 mode, u32 dev_addr, u32 regnum, u16 val)
> +{
> +	phy_write(phydev, MII_KSZ8061RN_MMD_CTRL_REG, dev_addr);
> +	phy_write(phydev, MII_KSZ8061RN_MMD_REGDATA_REG, regnum);
> +	phy_write(phydev, MII_KSZ8061RN_MMD_CTRL_REG, (mode << 14) | dev_addr);
> +	return phy_write(phydev, MII_KSZ8061RN_MMD_REGDATA_REG, val);
> +}

Hi Alexander

This looks a lot like phy_write_mmd().

     Andrew

^ permalink raw reply

* [PATCH bpf-next v5 2/2] samples/bpf: Add xdp_sample_pkts example
From: Toke Høiland-Jørgensen @ 2018-06-06 12:43 UTC (permalink / raw)
  To: netdev; +Cc: Jesper Dangaard Brouer
In-Reply-To: <152828901936.9599.10143519677183418086.stgit@alrua-kau>

Add an example program showing how to sample packets from XDP using the
perf event buffer. The example userspace program just prints the ethernet
header for every packet sampled.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
 samples/bpf/Makefile               |    4 +
 samples/bpf/xdp_sample_pkts_kern.c |   66 ++++++++++++++
 samples/bpf/xdp_sample_pkts_user.c |  176 ++++++++++++++++++++++++++++++++++++
 3 files changed, 246 insertions(+)
 create mode 100644 samples/bpf/xdp_sample_pkts_kern.c
 create mode 100644 samples/bpf/xdp_sample_pkts_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 1303af10e54d..9ea2f7b64869 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -52,6 +52,7 @@ hostprogs-y += xdp_adjust_tail
 hostprogs-y += xdpsock
 hostprogs-y += xdp_fwd
 hostprogs-y += task_fd_query
+hostprogs-y += xdp_sample_pkts
 
 # Libbpf dependencies
 LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
@@ -107,6 +108,7 @@ xdp_adjust_tail-objs := xdp_adjust_tail_user.o
 xdpsock-objs := bpf_load.o xdpsock_user.o
 xdp_fwd-objs := bpf_load.o xdp_fwd_user.o
 task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
+xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS)
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -163,6 +165,7 @@ always += xdp_adjust_tail_kern.o
 always += xdpsock_kern.o
 always += xdp_fwd_kern.o
 always += task_fd_query_kern.o
+always += xdp_sample_pkts_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -179,6 +182,7 @@ HOSTCFLAGS_spintest_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_trace_event_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_sampleip_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_task_fd_query_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_xdp_sample_pkts_user.o += -I$(srctree)/tools/lib/bpf/
 
 HOST_LOADLIBES		+= $(LIBBPF) -lelf
 HOSTLOADLIBES_tracex4		+= -lrt
diff --git a/samples/bpf/xdp_sample_pkts_kern.c b/samples/bpf/xdp_sample_pkts_kern.c
new file mode 100644
index 000000000000..f7ca8b850978
--- /dev/null
+++ b/samples/bpf/xdp_sample_pkts_kern.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ptrace.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+#define SAMPLE_SIZE 64ul
+#define MAX_CPUS 128
+
+#define bpf_printk(fmt, ...)					\
+({								\
+	       char ____fmt[] = fmt;				\
+	       bpf_trace_printk(____fmt, sizeof(____fmt),	\
+				##__VA_ARGS__);			\
+})
+
+struct bpf_map_def SEC("maps") my_map = {
+	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(u32),
+	.max_entries = MAX_CPUS,
+};
+
+SEC("xdp_sample")
+int xdp_sample_prog(struct xdp_md *ctx)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data = (void *)(long)ctx->data;
+
+	/* Metadata will be in the perf event before the packet data. */
+	struct S {
+		u16 cookie;
+		u16 pkt_len;
+	} __packed metadata;
+
+	if (data < data_end) {
+		/* The XDP perf_event_output handler will use the upper 32 bits
+		 * of the flags argument as a number of bytes to include of the
+		 * packet payload in the event data. If the size is too big, the
+		 * call to bpf_perf_event_output will fail and return -EFAULT.
+		 *
+		 * See bpf_xdp_event_output in net/core/filter.c.
+		 *
+		 * The BPF_F_CURRENT_CPU flag means that the event output fd
+		 * will be indexed by the CPU number in the event map.
+		 */
+		u64 flags = BPF_F_CURRENT_CPU;
+		u16 sample_size;
+		int ret;
+
+		metadata.cookie = 0xdead;
+		metadata.pkt_len = (u16)(data_end - data);
+		sample_size = min(metadata.pkt_len, SAMPLE_SIZE);
+		flags |= (u64)sample_size << 32;
+
+		ret = bpf_perf_event_output(ctx, &my_map, flags,
+					    &metadata, sizeof(metadata));
+		if (ret)
+			bpf_printk("perf_event_output failed: %d\n", ret);
+	}
+
+	return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/xdp_sample_pkts_user.c b/samples/bpf/xdp_sample_pkts_user.c
new file mode 100644
index 000000000000..944ff7512d2b
--- /dev/null
+++ b/samples/bpf/xdp_sample_pkts_user.c
@@ -0,0 +1,176 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <linux/perf_event.h>
+#include <linux/bpf.h>
+#include <net/if.h>
+#include <errno.h>
+#include <assert.h>
+#include <sys/sysinfo.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <signal.h>
+#include <libbpf.h>
+#include <bpf/bpf.h>
+
+#include "perf-sys.h"
+#include "trace_helpers.h"
+
+#define MAX_CPUS 128
+static int pmu_fds[MAX_CPUS], if_idx;
+static struct perf_event_mmap_page *headers[MAX_CPUS];
+static char *if_name;
+
+static int do_attach(int idx, int fd, const char *name)
+{
+	int err;
+
+	err = bpf_set_link_xdp_fd(idx, fd, 0);
+	if (err < 0)
+		printf("ERROR: failed to attach program to %s\n", name);
+
+	return err;
+}
+
+static int do_detach(int idx, const char *name)
+{
+	int err;
+
+	err = bpf_set_link_xdp_fd(idx, -1, 0);
+	if (err < 0)
+		printf("ERROR: failed to detach program from %s\n", name);
+
+	return err;
+}
+
+#define SAMPLE_SIZE 64
+
+static int print_bpf_output(void *data, int size)
+{
+	struct {
+		__u16 cookie;
+		__u16 pkt_len;
+		__u8  pkt_data[SAMPLE_SIZE];
+	} __packed *e = data;
+	int i;
+
+	if (e->cookie != 0xdead) {
+		printf("BUG cookie %x sized %d\n",
+		       e->cookie, size);
+		return LIBBPF_PERF_EVENT_ERROR;
+	}
+
+	printf("Pkt len: %-5d bytes. Ethernet hdr: ", e->pkt_len);
+	for (i = 0; i < 14 && i < e->pkt_len; i++)
+		printf("%02x ", e->pkt_data[i]);
+	printf("\n");
+
+	return LIBBPF_PERF_EVENT_CONT;
+}
+
+static void test_bpf_perf_event(int map_fd, int num)
+{
+	struct perf_event_attr attr = {
+		.sample_type = PERF_SAMPLE_RAW,
+		.type = PERF_TYPE_SOFTWARE,
+		.config = PERF_COUNT_SW_BPF_OUTPUT,
+		.wakeup_events = 1, /* get an fd notification for every event */
+	};
+	int i;
+
+	for (i = 0; i < num; i++) {
+		int key = i;
+
+		pmu_fds[i] = sys_perf_event_open(&attr, -1/*pid*/, i/*cpu*/,
+						 -1/*group_fd*/, 0);
+
+		assert(pmu_fds[i] >= 0);
+		assert(bpf_map_update_elem(map_fd, &key,
+					   &pmu_fds[i], BPF_ANY) == 0);
+		ioctl(pmu_fds[i], PERF_EVENT_IOC_ENABLE, 0);
+	}
+}
+
+static void sig_handler(int signo)
+{
+	do_detach(if_idx, if_name);
+	exit(0);
+}
+
+int main(int argc, char **argv)
+{
+	struct bpf_prog_load_attr prog_load_attr = {
+		.prog_type	= BPF_PROG_TYPE_XDP,
+	};
+	struct bpf_object *obj;
+	struct bpf_map *map;
+	int prog_fd, map_fd;
+	char filename[256];
+	int ret, err, i;
+	int numcpus;
+
+	if (argc < 2) {
+		printf("Usage: %s <ifname>\n", argv[0]);
+		return 1;
+	}
+
+	numcpus = get_nprocs();
+	if (numcpus > MAX_CPUS)
+		numcpus = MAX_CPUS;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	prog_load_attr.file = filename;
+
+	if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
+		return 1;
+
+	if (!prog_fd) {
+		printf("load_bpf_file: %s\n", strerror(errno));
+		return 1;
+	}
+
+	map = bpf_map__next(NULL, obj);
+	if (!map) {
+		printf("finding a map in obj file failed\n");
+		return 1;
+	}
+	map_fd = bpf_map__fd(map);
+
+	if_idx = if_nametoindex(argv[1]);
+	if (!if_idx)
+		if_idx = strtoul(argv[1], NULL, 0);
+
+	if (!if_idx) {
+		fprintf(stderr, "Invalid ifname\n");
+		return 1;
+	}
+	if_name = argv[1];
+	err = do_attach(if_idx, prog_fd, argv[1]);
+	if (err)
+		return err;
+
+	if (signal(SIGINT, sig_handler) ||
+	    signal(SIGHUP, sig_handler) ||
+	    signal(SIGTERM, sig_handler)) {
+		perror("signal");
+		return 1;
+	}
+
+	test_bpf_perf_event(map_fd, numcpus);
+
+	for (i = 0; i < numcpus; i++)
+		if (perf_event_mmap_header(pmu_fds[i], &headers[i]) < 0)
+			return 1;
+
+	ret = perf_event_poller_multi(pmu_fds, headers, numcpus,
+				      print_bpf_output);
+	kill(0, SIGINT);
+	return ret;
+}

^ permalink raw reply related

* [PATCH bpf-next v5 1/2] trace_helpers.c: Add helpers to poll multiple perf FDs for events
From: Toke Høiland-Jørgensen @ 2018-06-06 12:43 UTC (permalink / raw)
  To: netdev; +Cc: Jesper Dangaard Brouer

Add two new helper functions to trace_helpers that support polling
multiple perf file descriptors for events. These are used by the XDP
perf_event_output example, which needs to work with one perf fd per CPU.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
 tools/testing/selftests/bpf/trace_helpers.c |   49 ++++++++++++++++++++++++++-
 tools/testing/selftests/bpf/trace_helpers.h |    4 ++
 2 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
index 3868dcb63420..0673e8840cc8 100644
--- a/tools/testing/selftests/bpf/trace_helpers.c
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -88,7 +88,7 @@ static int page_size;
 static int page_cnt = 8;
 static struct perf_event_mmap_page *header;
 
-int perf_event_mmap(int fd)
+int perf_event_mmap_header(int fd, struct perf_event_mmap_page **header)
 {
 	void *base;
 	int mmap_size;
@@ -102,10 +102,15 @@ int perf_event_mmap(int fd)
 		return -1;
 	}
 
-	header = base;
+	*header = base;
 	return 0;
 }
 
+int perf_event_mmap(int fd)
+{
+	return perf_event_mmap_header(fd, &header);
+}
+
 static int perf_event_poll(int fd)
 {
 	struct pollfd pfd = { .fd = fd, .events = POLLIN };
@@ -163,3 +168,43 @@ int perf_event_poller(int fd, perf_event_print_fn output_fn)
 
 	return ret;
 }
+
+int perf_event_poller_multi(int *fds, struct perf_event_mmap_page **headers,
+			    int num_fds, perf_event_print_fn output_fn)
+{
+	enum bpf_perf_event_ret ret;
+	struct pollfd *pfds;
+	void *buf = NULL;
+	size_t len = 0;
+	int i;
+
+	pfds = malloc(sizeof(*pfds) * num_fds);
+	if (!pfds)
+		return LIBBPF_PERF_EVENT_ERROR;
+
+	memset(pfds, 0, sizeof(*pfds) * num_fds);
+	for (i = 0; i < num_fds; i++) {
+		pfds[i].fd = fds[i];
+		pfds[i].events = POLLIN;
+	}
+
+	for (;;) {
+		poll(pfds, num_fds, 1000);
+		for (i = 0; i < num_fds; i++) {
+			if (!pfds[i].revents)
+				continue;
+
+			ret = bpf_perf_event_read_simple(headers[i],
+							 page_cnt * page_size,
+							 page_size, &buf, &len,
+							 bpf_perf_event_print,
+							 output_fn);
+			if (ret != LIBBPF_PERF_EVENT_CONT)
+				break;
+		}
+	}
+	free(buf);
+	free(pfds);
+
+	return ret;
+}
diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h
index 3b4bcf7f5084..18924f23db1b 100644
--- a/tools/testing/selftests/bpf/trace_helpers.h
+++ b/tools/testing/selftests/bpf/trace_helpers.h
@@ -3,6 +3,7 @@
 #define __TRACE_HELPER_H
 
 #include <libbpf.h>
+#include <linux/perf_event.h>
 
 struct ksym {
 	long addr;
@@ -16,6 +17,9 @@ long ksym_get_addr(const char *name);
 typedef enum bpf_perf_event_ret (*perf_event_print_fn)(void *data, int size);
 
 int perf_event_mmap(int fd);
+int perf_event_mmap_header(int fd, struct perf_event_mmap_page **header);
 /* return LIBBPF_PERF_EVENT_DONE or LIBBPF_PERF_EVENT_ERROR */
 int perf_event_poller(int fd, perf_event_print_fn output_fn);
+int perf_event_poller_multi(int *fds, struct perf_event_mmap_page **headers,
+			    int num_fds, perf_event_print_fn output_fn);
 #endif

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox