Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 0/3] net: dsa: mv88e6xxx: add PCL support
From: Vivien Didelot @ 2019-09-07 20:00 UTC (permalink / raw)
  To: netdev; +Cc: davem, f.fainelli, andrew, Vivien Didelot

This small series implements the ethtool RXNFC operations in the
mv88e6xxx DSA driver to configure a port's Layer 2 Policy Control List
(PCL) supported by models such as 88E6352 and 88E6390 and equivalent.

This allows to configure a port to discard frames based on a configured
destination or source MAC address and an optional VLAN, with e.g.:

    # ethtool --config-nfc lan1 flow-type ether src 00:11:22:33:44:55 action -1

Vivien Didelot (3):
  net: dsa: mv88e6xxx: complete ATU state definitions
  net: dsa: mv88e6xxx: introduce .port_set_policy
  net: dsa: mv88e6xxx: add RXNFC support

 drivers/net/dsa/mv88e6xxx/chip.c        | 241 ++++++++++++++++++++++--
 drivers/net/dsa/mv88e6xxx/chip.h        |  35 ++++
 drivers/net/dsa/mv88e6xxx/global1.h     |  43 +++--
 drivers/net/dsa/mv88e6xxx/global1_atu.c |   6 +-
 drivers/net/dsa/mv88e6xxx/port.c        |  74 ++++++++
 drivers/net/dsa/mv88e6xxx/port.h        |  17 +-
 6 files changed, 388 insertions(+), 28 deletions(-)

-- 
2.23.0


^ permalink raw reply

* [PATCH net-next 1/3] net: dsa: mv88e6xxx: complete ATU state definitions
From: Vivien Didelot @ 2019-09-07 20:00 UTC (permalink / raw)
  To: netdev; +Cc: davem, f.fainelli, andrew, Vivien Didelot
In-Reply-To: <20190907200049.25273-1-vivien.didelot@gmail.com>

Marvell has different values for the state of a MAC address,
depending on its multicast bit. This patch completes the definitions
for these states.

At the same time, use 0 which is intuitive enough and simplifies the
code a bit, instead of the UC or MC unused value.

Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>
---
 drivers/net/dsa/mv88e6xxx/chip.c        | 19 +++++------
 drivers/net/dsa/mv88e6xxx/global1.h     | 43 +++++++++++++++++--------
 drivers/net/dsa/mv88e6xxx/global1_atu.c |  6 ++--
 3 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 30365a54c31b..0d54a69f3622 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -1497,7 +1497,7 @@ static int mv88e6xxx_port_db_load_purge(struct mv88e6xxx_chip *chip, int port,
 		fid = vlan.fid;
 	}
 
-	entry.state = MV88E6XXX_G1_ATU_DATA_STATE_UNUSED;
+	entry.state = 0;
 	ether_addr_copy(entry.mac, addr);
 	eth_addr_dec(entry.mac);
 
@@ -1506,17 +1506,16 @@ static int mv88e6xxx_port_db_load_purge(struct mv88e6xxx_chip *chip, int port,
 		return err;
 
 	/* Initialize a fresh ATU entry if it isn't found */
-	if (entry.state == MV88E6XXX_G1_ATU_DATA_STATE_UNUSED ||
-	    !ether_addr_equal(entry.mac, addr)) {
+	if (!entry.state || !ether_addr_equal(entry.mac, addr)) {
 		memset(&entry, 0, sizeof(entry));
 		ether_addr_copy(entry.mac, addr);
 	}
 
 	/* Purge the ATU entry only if no port is using it anymore */
-	if (state == MV88E6XXX_G1_ATU_DATA_STATE_UNUSED) {
+	if (!state) {
 		entry.portvec &= ~BIT(port);
 		if (!entry.portvec)
-			entry.state = MV88E6XXX_G1_ATU_DATA_STATE_UNUSED;
+			entry.state = 0;
 	} else {
 		entry.portvec |= BIT(port);
 		entry.state = state;
@@ -1732,8 +1731,7 @@ static int mv88e6xxx_port_fdb_del(struct dsa_switch *ds, int port,
 	int err;
 
 	mv88e6xxx_reg_lock(chip);
-	err = mv88e6xxx_port_db_load_purge(chip, port, addr, vid,
-					   MV88E6XXX_G1_ATU_DATA_STATE_UNUSED);
+	err = mv88e6xxx_port_db_load_purge(chip, port, addr, vid, 0);
 	mv88e6xxx_reg_unlock(chip);
 
 	return err;
@@ -1747,7 +1745,7 @@ static int mv88e6xxx_port_db_dump_fid(struct mv88e6xxx_chip *chip,
 	bool is_static;
 	int err;
 
-	addr.state = MV88E6XXX_G1_ATU_DATA_STATE_UNUSED;
+	addr.state = 0;
 	eth_broadcast_addr(addr.mac);
 
 	do {
@@ -1755,7 +1753,7 @@ static int mv88e6xxx_port_db_dump_fid(struct mv88e6xxx_chip *chip,
 		if (err)
 			return err;
 
-		if (addr.state == MV88E6XXX_G1_ATU_DATA_STATE_UNUSED)
+		if (!addr.state)
 			break;
 
 		if (addr.trunk || (addr.portvec & BIT(port)) == 0)
@@ -4690,8 +4688,7 @@ static int mv88e6xxx_port_mdb_del(struct dsa_switch *ds, int port,
 	int err;
 
 	mv88e6xxx_reg_lock(chip);
-	err = mv88e6xxx_port_db_load_purge(chip, port, mdb->addr, mdb->vid,
-					   MV88E6XXX_G1_ATU_DATA_STATE_UNUSED);
+	err = mv88e6xxx_port_db_load_purge(chip, port, mdb->addr, mdb->vid, 0);
 	mv88e6xxx_reg_unlock(chip);
 
 	return err;
diff --git a/drivers/net/dsa/mv88e6xxx/global1.h b/drivers/net/dsa/mv88e6xxx/global1.h
index 78b9ae22d18c..0870fcc8bfc8 100644
--- a/drivers/net/dsa/mv88e6xxx/global1.h
+++ b/drivers/net/dsa/mv88e6xxx/global1.h
@@ -128,19 +128,36 @@
 #define MV88E6XXX_G1_ATU_OP_FULL_VIOLATION		BIT(4)
 
 /* Offset 0x0C: ATU Data Register */
-#define MV88E6XXX_G1_ATU_DATA				0x0c
-#define MV88E6XXX_G1_ATU_DATA_TRUNK			0x8000
-#define MV88E6XXX_G1_ATU_DATA_TRUNK_ID_MASK		0x00f0
-#define MV88E6XXX_G1_ATU_DATA_PORT_VECTOR_MASK		0x3ff0
-#define MV88E6XXX_G1_ATU_DATA_STATE_MASK		0x000f
-#define MV88E6XXX_G1_ATU_DATA_STATE_UNUSED		0x0000
-#define MV88E6XXX_G1_ATU_DATA_STATE_UC_MGMT		0x000d
-#define MV88E6XXX_G1_ATU_DATA_STATE_UC_STATIC		0x000e
-#define MV88E6XXX_G1_ATU_DATA_STATE_UC_PRIO_OVER	0x000f
-#define MV88E6XXX_G1_ATU_DATA_STATE_MC_NONE_RATE	0x0005
-#define MV88E6XXX_G1_ATU_DATA_STATE_MC_STATIC		0x0007
-#define MV88E6XXX_G1_ATU_DATA_STATE_MC_MGMT		0x000e
-#define MV88E6XXX_G1_ATU_DATA_STATE_MC_PRIO_OVER	0x000f
+#define MV88E6XXX_G1_ATU_DATA					0x0c
+#define MV88E6XXX_G1_ATU_DATA_TRUNK				0x8000
+#define MV88E6XXX_G1_ATU_DATA_TRUNK_ID_MASK			0x00f0
+#define MV88E6XXX_G1_ATU_DATA_PORT_VECTOR_MASK			0x3ff0
+#define MV88E6XXX_G1_ATU_DATA_STATE_MASK			0x000f
+#define MV88E6XXX_G1_ATU_DATA_STATE_UC_UNUSED			0x0000
+#define MV88E6XXX_G1_ATU_DATA_STATE_UC_AGE_1_OLDEST		0x0001
+#define MV88E6XXX_G1_ATU_DATA_STATE_UC_AGE_2			0x0002
+#define MV88E6XXX_G1_ATU_DATA_STATE_UC_AGE_3			0x0003
+#define MV88E6XXX_G1_ATU_DATA_STATE_UC_AGE_4			0x0004
+#define MV88E6XXX_G1_ATU_DATA_STATE_UC_AGE_5			0x0005
+#define MV88E6XXX_G1_ATU_DATA_STATE_UC_AGE_6			0x0006
+#define MV88E6XXX_G1_ATU_DATA_STATE_UC_AGE_7_NEWEST		0x0007
+#define MV88E6XXX_G1_ATU_DATA_STATE_UC_STATIC_POLICY		0x0008
+#define MV88E6XXX_G1_ATU_DATA_STATE_UC_STATIC_POLICY_PO		0x0009
+#define MV88E6XXX_G1_ATU_DATA_STATE_UC_STATIC_AVB_NRL		0x000a
+#define MV88E6XXX_G1_ATU_DATA_STATE_UC_STATIC_AVB_NRL_PO	0x000b
+#define MV88E6XXX_G1_ATU_DATA_STATE_UC_STATIC_DA_MGMT		0x000c
+#define MV88E6XXX_G1_ATU_DATA_STATE_UC_STATIC_DA_MGMT_PO	0x000d
+#define MV88E6XXX_G1_ATU_DATA_STATE_UC_STATIC			0x000e
+#define MV88E6XXX_G1_ATU_DATA_STATE_UC_STATIC_PO		0x000f
+#define MV88E6XXX_G1_ATU_DATA_STATE_MC_UNUSED			0x0000
+#define MV88E6XXX_G1_ATU_DATA_STATE_MC_STATIC_POLICY		0x0004
+#define MV88E6XXX_G1_ATU_DATA_STATE_MC_STATIC_AVB_NRL		0x0005
+#define MV88E6XXX_G1_ATU_DATA_STATE_MC_STATIC_DA_MGMT		0x0006
+#define MV88E6XXX_G1_ATU_DATA_STATE_MC_STATIC			0x0007
+#define MV88E6XXX_G1_ATU_DATA_STATE_MC_STATIC_POLICY_PO		0x000c
+#define MV88E6XXX_G1_ATU_DATA_STATE_MC_STATIC_AVB_NRL_PO	0x000d
+#define MV88E6XXX_G1_ATU_DATA_STATE_MC_STATIC_DA_MGMT_PO	0x000e
+#define MV88E6XXX_G1_ATU_DATA_STATE_MC_STATIC_PO		0x000f
 
 /* Offset 0x0D: ATU MAC Address Register Bytes 0 & 1
  * Offset 0x0E: ATU MAC Address Register Bytes 2 & 3
diff --git a/drivers/net/dsa/mv88e6xxx/global1_atu.c b/drivers/net/dsa/mv88e6xxx/global1_atu.c
index 18b86515b6bc..792a96ef418f 100644
--- a/drivers/net/dsa/mv88e6xxx/global1_atu.c
+++ b/drivers/net/dsa/mv88e6xxx/global1_atu.c
@@ -135,7 +135,7 @@ static int mv88e6xxx_g1_atu_data_read(struct mv88e6xxx_chip *chip,
 		return err;
 
 	entry->state = val & 0xf;
-	if (entry->state != MV88E6XXX_G1_ATU_DATA_STATE_UNUSED) {
+	if (entry->state) {
 		entry->trunk = !!(val & MV88E6XXX_G1_ATU_DATA_TRUNK);
 		entry->portvec = (val >> 4) & mv88e6xxx_port_mask(chip);
 	}
@@ -148,7 +148,7 @@ static int mv88e6xxx_g1_atu_data_write(struct mv88e6xxx_chip *chip,
 {
 	u16 data = entry->state & 0xf;
 
-	if (entry->state != MV88E6XXX_G1_ATU_DATA_STATE_UNUSED) {
+	if (entry->state) {
 		if (entry->trunk)
 			data |= MV88E6XXX_G1_ATU_DATA_TRUNK;
 
@@ -209,7 +209,7 @@ int mv88e6xxx_g1_atu_getnext(struct mv88e6xxx_chip *chip, u16 fid,
 		return err;
 
 	/* Write the MAC address to iterate from only once */
-	if (entry->state == MV88E6XXX_G1_ATU_DATA_STATE_UNUSED) {
+	if (!entry->state) {
 		err = mv88e6xxx_g1_atu_mac_write(chip, entry);
 		if (err)
 			return err;
-- 
2.23.0


^ permalink raw reply related

* [PATCH net-next 2/3] net: dsa: mv88e6xxx: introduce .port_set_policy
From: Vivien Didelot @ 2019-09-07 20:00 UTC (permalink / raw)
  To: netdev; +Cc: davem, f.fainelli, andrew, Vivien Didelot
In-Reply-To: <20190907200049.25273-1-vivien.didelot@gmail.com>

Introduce a new .port_set_policy operation to configure a port's
Policy Control List, based on mapping such as DA, SA, Etype and so on.

Models similar to 88E6352 and 88E6390 are supported at the moment.

Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>
---
 drivers/net/dsa/mv88e6xxx/chip.c |  9 ++++
 drivers/net/dsa/mv88e6xxx/chip.h | 22 ++++++++++
 drivers/net/dsa/mv88e6xxx/port.c | 74 ++++++++++++++++++++++++++++++++
 drivers/net/dsa/mv88e6xxx/port.h | 17 +++++++-
 4 files changed, 121 insertions(+), 1 deletion(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 0d54a69f3622..6f4d5303a1f3 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -3132,6 +3132,7 @@ static const struct mv88e6xxx_ops mv88e6172_ops = {
 	.port_set_rgmii_delay = mv88e6352_port_set_rgmii_delay,
 	.port_set_speed = mv88e6352_port_set_speed,
 	.port_tag_remap = mv88e6095_port_tag_remap,
+	.port_set_policy = mv88e6352_port_set_policy,
 	.port_set_frame_mode = mv88e6351_port_set_frame_mode,
 	.port_set_egress_floods = mv88e6352_port_set_egress_floods,
 	.port_set_ether_type = mv88e6351_port_set_ether_type,
@@ -3218,6 +3219,7 @@ static const struct mv88e6xxx_ops mv88e6176_ops = {
 	.port_set_rgmii_delay = mv88e6352_port_set_rgmii_delay,
 	.port_set_speed = mv88e6352_port_set_speed,
 	.port_tag_remap = mv88e6095_port_tag_remap,
+	.port_set_policy = mv88e6352_port_set_policy,
 	.port_set_frame_mode = mv88e6351_port_set_frame_mode,
 	.port_set_egress_floods = mv88e6352_port_set_egress_floods,
 	.port_set_ether_type = mv88e6351_port_set_ether_type,
@@ -3303,6 +3305,7 @@ static const struct mv88e6xxx_ops mv88e6190_ops = {
 	.port_set_speed = mv88e6390_port_set_speed,
 	.port_max_speed_mode = mv88e6390_port_max_speed_mode,
 	.port_tag_remap = mv88e6390_port_tag_remap,
+	.port_set_policy = mv88e6352_port_set_policy,
 	.port_set_frame_mode = mv88e6351_port_set_frame_mode,
 	.port_set_egress_floods = mv88e6352_port_set_egress_floods,
 	.port_set_ether_type = mv88e6351_port_set_ether_type,
@@ -3351,6 +3354,7 @@ static const struct mv88e6xxx_ops mv88e6190x_ops = {
 	.port_set_speed = mv88e6390x_port_set_speed,
 	.port_max_speed_mode = mv88e6390x_port_max_speed_mode,
 	.port_tag_remap = mv88e6390_port_tag_remap,
+	.port_set_policy = mv88e6352_port_set_policy,
 	.port_set_frame_mode = mv88e6351_port_set_frame_mode,
 	.port_set_egress_floods = mv88e6352_port_set_egress_floods,
 	.port_set_ether_type = mv88e6351_port_set_ether_type,
@@ -3448,6 +3452,7 @@ static const struct mv88e6xxx_ops mv88e6240_ops = {
 	.port_set_rgmii_delay = mv88e6352_port_set_rgmii_delay,
 	.port_set_speed = mv88e6352_port_set_speed,
 	.port_tag_remap = mv88e6095_port_tag_remap,
+	.port_set_policy = mv88e6352_port_set_policy,
 	.port_set_frame_mode = mv88e6351_port_set_frame_mode,
 	.port_set_egress_floods = mv88e6352_port_set_egress_floods,
 	.port_set_ether_type = mv88e6351_port_set_ether_type,
@@ -3539,6 +3544,7 @@ static const struct mv88e6xxx_ops mv88e6290_ops = {
 	.port_set_speed = mv88e6390_port_set_speed,
 	.port_max_speed_mode = mv88e6390_port_max_speed_mode,
 	.port_tag_remap = mv88e6390_port_tag_remap,
+	.port_set_policy = mv88e6352_port_set_policy,
 	.port_set_frame_mode = mv88e6351_port_set_frame_mode,
 	.port_set_egress_floods = mv88e6352_port_set_egress_floods,
 	.port_set_ether_type = mv88e6351_port_set_ether_type,
@@ -3809,6 +3815,7 @@ static const struct mv88e6xxx_ops mv88e6352_ops = {
 	.port_set_rgmii_delay = mv88e6352_port_set_rgmii_delay,
 	.port_set_speed = mv88e6352_port_set_speed,
 	.port_tag_remap = mv88e6095_port_tag_remap,
+	.port_set_policy = mv88e6352_port_set_policy,
 	.port_set_frame_mode = mv88e6351_port_set_frame_mode,
 	.port_set_egress_floods = mv88e6352_port_set_egress_floods,
 	.port_set_ether_type = mv88e6351_port_set_ether_type,
@@ -3863,6 +3870,7 @@ static const struct mv88e6xxx_ops mv88e6390_ops = {
 	.port_set_speed = mv88e6390_port_set_speed,
 	.port_max_speed_mode = mv88e6390_port_max_speed_mode,
 	.port_tag_remap = mv88e6390_port_tag_remap,
+	.port_set_policy = mv88e6352_port_set_policy,
 	.port_set_frame_mode = mv88e6351_port_set_frame_mode,
 	.port_set_egress_floods = mv88e6352_port_set_egress_floods,
 	.port_set_ether_type = mv88e6351_port_set_ether_type,
@@ -3915,6 +3923,7 @@ static const struct mv88e6xxx_ops mv88e6390x_ops = {
 	.port_set_speed = mv88e6390x_port_set_speed,
 	.port_max_speed_mode = mv88e6390x_port_max_speed_mode,
 	.port_tag_remap = mv88e6390_port_tag_remap,
+	.port_set_policy = mv88e6352_port_set_policy,
 	.port_set_frame_mode = mv88e6351_port_set_frame_mode,
 	.port_set_egress_floods = mv88e6352_port_set_egress_floods,
 	.port_set_ether_type = mv88e6351_port_set_ether_type,
diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h
index 6bc0a4e4fe7b..04a329a98158 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.h
+++ b/drivers/net/dsa/mv88e6xxx/chip.h
@@ -189,6 +189,24 @@ struct mv88e6xxx_port_hwtstamp {
 	struct hwtstamp_config tstamp_config;
 };
 
+enum mv88e6xxx_policy_mapping {
+	MV88E6XXX_POLICY_MAPPING_DA,
+	MV88E6XXX_POLICY_MAPPING_SA,
+	MV88E6XXX_POLICY_MAPPING_VTU,
+	MV88E6XXX_POLICY_MAPPING_ETYPE,
+	MV88E6XXX_POLICY_MAPPING_PPPOE,
+	MV88E6XXX_POLICY_MAPPING_VBAS,
+	MV88E6XXX_POLICY_MAPPING_OPT82,
+	MV88E6XXX_POLICY_MAPPING_UDP,
+};
+
+enum mv88e6xxx_policy_action {
+	MV88E6XXX_POLICY_ACTION_NORMAL,
+	MV88E6XXX_POLICY_ACTION_MIRROR,
+	MV88E6XXX_POLICY_ACTION_TRAP,
+	MV88E6XXX_POLICY_ACTION_DISCARD,
+};
+
 struct mv88e6xxx_port {
 	struct mv88e6xxx_chip *chip;
 	int port;
@@ -381,6 +399,10 @@ struct mv88e6xxx_ops {
 
 	int (*port_tag_remap)(struct mv88e6xxx_chip *chip, int port);
 
+	int (*port_set_policy)(struct mv88e6xxx_chip *chip, int port,
+			       enum mv88e6xxx_policy_mapping mapping,
+			       enum mv88e6xxx_policy_action action);
+
 	int (*port_set_frame_mode)(struct mv88e6xxx_chip *chip, int port,
 				   enum mv88e6xxx_frame_mode mode);
 	int (*port_set_egress_floods)(struct mv88e6xxx_chip *chip, int port,
diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c
index 04006344adb2..15ef81654b67 100644
--- a/drivers/net/dsa/mv88e6xxx/port.c
+++ b/drivers/net/dsa/mv88e6xxx/port.c
@@ -1341,3 +1341,77 @@ int mv88e6390_port_tag_remap(struct mv88e6xxx_chip *chip, int port)
 
 	return 0;
 }
+
+/* Offset 0x0E: Policy Control Register */
+
+int mv88e6352_port_set_policy(struct mv88e6xxx_chip *chip, int port,
+			      enum mv88e6xxx_policy_mapping mapping,
+			      enum mv88e6xxx_policy_action action)
+{
+	u16 reg, mask, val;
+	int shift;
+	int err;
+
+	switch (mapping) {
+	case MV88E6XXX_POLICY_MAPPING_DA:
+		shift = __bf_shf(MV88E6XXX_PORT_POLICY_CTL_DA_MASK);
+		mask = MV88E6XXX_PORT_POLICY_CTL_DA_MASK;
+		break;
+	case MV88E6XXX_POLICY_MAPPING_SA:
+		shift = __bf_shf(MV88E6XXX_PORT_POLICY_CTL_SA_MASK);
+		mask = MV88E6XXX_PORT_POLICY_CTL_SA_MASK;
+		break;
+	case MV88E6XXX_POLICY_MAPPING_VTU:
+		shift = __bf_shf(MV88E6XXX_PORT_POLICY_CTL_VTU_MASK);
+		mask = MV88E6XXX_PORT_POLICY_CTL_VTU_MASK;
+		break;
+	case MV88E6XXX_POLICY_MAPPING_ETYPE:
+		shift = __bf_shf(MV88E6XXX_PORT_POLICY_CTL_ETYPE_MASK);
+		mask = MV88E6XXX_PORT_POLICY_CTL_ETYPE_MASK;
+		break;
+	case MV88E6XXX_POLICY_MAPPING_PPPOE:
+		shift = __bf_shf(MV88E6XXX_PORT_POLICY_CTL_PPPOE_MASK);
+		mask = MV88E6XXX_PORT_POLICY_CTL_PPPOE_MASK;
+		break;
+	case MV88E6XXX_POLICY_MAPPING_VBAS:
+		shift = __bf_shf(MV88E6XXX_PORT_POLICY_CTL_VBAS_MASK);
+		mask = MV88E6XXX_PORT_POLICY_CTL_VBAS_MASK;
+		break;
+	case MV88E6XXX_POLICY_MAPPING_OPT82:
+		shift = __bf_shf(MV88E6XXX_PORT_POLICY_CTL_OPT82_MASK);
+		mask = MV88E6XXX_PORT_POLICY_CTL_OPT82_MASK;
+		break;
+	case MV88E6XXX_POLICY_MAPPING_UDP:
+		shift = __bf_shf(MV88E6XXX_PORT_POLICY_CTL_UDP_MASK);
+		mask = MV88E6XXX_PORT_POLICY_CTL_UDP_MASK;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	switch (action) {
+	case MV88E6XXX_POLICY_ACTION_NORMAL:
+		val = MV88E6XXX_PORT_POLICY_CTL_NORMAL;
+		break;
+	case MV88E6XXX_POLICY_ACTION_MIRROR:
+		val = MV88E6XXX_PORT_POLICY_CTL_MIRROR;
+		break;
+	case MV88E6XXX_POLICY_ACTION_TRAP:
+		val = MV88E6XXX_PORT_POLICY_CTL_TRAP;
+		break;
+	case MV88E6XXX_POLICY_ACTION_DISCARD:
+		val = MV88E6XXX_PORT_POLICY_CTL_DISCARD;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	err = mv88e6xxx_port_read(chip, port, MV88E6XXX_PORT_POLICY_CTL, &reg);
+	if (err)
+		return err;
+
+	reg &= ~mask;
+	reg |= (val << shift) & mask;
+
+	return mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_POLICY_CTL, reg);
+}
diff --git a/drivers/net/dsa/mv88e6xxx/port.h b/drivers/net/dsa/mv88e6xxx/port.h
index d4e9bea6e82f..03a480cd71b9 100644
--- a/drivers/net/dsa/mv88e6xxx/port.h
+++ b/drivers/net/dsa/mv88e6xxx/port.h
@@ -222,7 +222,19 @@
 #define MV88E6XXX_PORT_PRI_OVERRIDE	0x0d
 
 /* Offset 0x0E: Policy Control Register */
-#define MV88E6XXX_PORT_POLICY_CTL	0x0e
+#define MV88E6XXX_PORT_POLICY_CTL		0x0e
+#define MV88E6XXX_PORT_POLICY_CTL_DA_MASK	0xc000
+#define MV88E6XXX_PORT_POLICY_CTL_SA_MASK	0x3000
+#define MV88E6XXX_PORT_POLICY_CTL_VTU_MASK	0x0c00
+#define MV88E6XXX_PORT_POLICY_CTL_ETYPE_MASK	0x0300
+#define MV88E6XXX_PORT_POLICY_CTL_PPPOE_MASK	0x00c0
+#define MV88E6XXX_PORT_POLICY_CTL_VBAS_MASK	0x0030
+#define MV88E6XXX_PORT_POLICY_CTL_OPT82_MASK	0x000c
+#define MV88E6XXX_PORT_POLICY_CTL_UDP_MASK	0x0003
+#define MV88E6XXX_PORT_POLICY_CTL_NORMAL	0x0000
+#define MV88E6XXX_PORT_POLICY_CTL_MIRROR	0x0001
+#define MV88E6XXX_PORT_POLICY_CTL_TRAP		0x0002
+#define MV88E6XXX_PORT_POLICY_CTL_DISCARD	0x0003
 
 /* Offset 0x0F: Port Special Ether Type */
 #define MV88E6XXX_PORT_ETH_TYPE		0x0f
@@ -324,6 +336,9 @@ int mv88e6185_port_set_egress_floods(struct mv88e6xxx_chip *chip, int port,
 				     bool unicast, bool multicast);
 int mv88e6352_port_set_egress_floods(struct mv88e6xxx_chip *chip, int port,
 				     bool unicast, bool multicast);
+int mv88e6352_port_set_policy(struct mv88e6xxx_chip *chip, int port,
+			      enum mv88e6xxx_policy_mapping mapping,
+			      enum mv88e6xxx_policy_action action);
 int mv88e6351_port_set_ether_type(struct mv88e6xxx_chip *chip, int port,
 				  u16 etype);
 int mv88e6xxx_port_set_message_port(struct mv88e6xxx_chip *chip, int port,
-- 
2.23.0


^ permalink raw reply related

* [PATCH net-next 3/3] net: dsa: mv88e6xxx: add RXNFC support
From: Vivien Didelot @ 2019-09-07 20:00 UTC (permalink / raw)
  To: netdev; +Cc: davem, f.fainelli, andrew, Vivien Didelot
In-Reply-To: <20190907200049.25273-1-vivien.didelot@gmail.com>

Implement the .get_rxnfc and .set_rxnfc DSA operations to configure
a port's Layer 2 Policy Control List (PCL) via ethtool.

Currently only dropping frames based on MAC Destination or Source
Address (including the option VLAN parameter) is supported.

Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>
---
 drivers/net/dsa/mv88e6xxx/chip.c | 213 +++++++++++++++++++++++++++++++
 drivers/net/dsa/mv88e6xxx/chip.h |  13 ++
 2 files changed, 226 insertions(+)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 6f4d5303a1f3..6787d560e9e3 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -1524,6 +1524,216 @@ static int mv88e6xxx_port_db_load_purge(struct mv88e6xxx_chip *chip, int port,
 	return mv88e6xxx_g1_atu_loadpurge(chip, fid, &entry);
 }
 
+static int mv88e6xxx_policy_apply(struct mv88e6xxx_chip *chip, int port,
+				  const struct mv88e6xxx_policy *policy)
+{
+	enum mv88e6xxx_policy_mapping mapping = policy->mapping;
+	enum mv88e6xxx_policy_action action = policy->action;
+	const u8 *addr = policy->addr;
+	u16 vid = policy->vid;
+	u8 state;
+	int err;
+	int id;
+
+	if (!chip->info->ops->port_set_policy)
+		return -EOPNOTSUPP;
+
+	switch (mapping) {
+	case MV88E6XXX_POLICY_MAPPING_DA:
+	case MV88E6XXX_POLICY_MAPPING_SA:
+		if (action == MV88E6XXX_POLICY_ACTION_NORMAL)
+			state = 0; /* Dissociate the port and address */
+		else if (action == MV88E6XXX_POLICY_ACTION_DISCARD &&
+			 is_multicast_ether_addr(addr))
+			state = MV88E6XXX_G1_ATU_DATA_STATE_MC_STATIC_POLICY;
+		else if (action == MV88E6XXX_POLICY_ACTION_DISCARD &&
+			 is_unicast_ether_addr(addr))
+			state = MV88E6XXX_G1_ATU_DATA_STATE_UC_STATIC_POLICY;
+		else
+			return -EOPNOTSUPP;
+
+		err = mv88e6xxx_port_db_load_purge(chip, port, addr, vid,
+						   state);
+		if (err)
+			return err;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	/* Skip the port's policy clearing if the mapping is still in use */
+	if (action == MV88E6XXX_POLICY_ACTION_NORMAL)
+		idr_for_each_entry(&chip->policies, policy, id)
+			if (policy->port == port &&
+			    policy->mapping == mapping &&
+			    policy->action != action)
+				return 0;
+
+	return chip->info->ops->port_set_policy(chip, port, mapping, action);
+}
+
+static int mv88e6xxx_policy_insert(struct mv88e6xxx_chip *chip, int port,
+				   struct ethtool_rx_flow_spec *fs)
+{
+	struct ethhdr *mac_entry = &fs->h_u.ether_spec;
+	struct ethhdr *mac_mask = &fs->m_u.ether_spec;
+	enum mv88e6xxx_policy_mapping mapping;
+	enum mv88e6xxx_policy_action action;
+	struct mv88e6xxx_policy *policy;
+	u16 vid = 0;
+	u8 *addr;
+	int err;
+	int id;
+
+	if (fs->location != RX_CLS_LOC_ANY)
+		return -EINVAL;
+
+	if (fs->ring_cookie == RX_CLS_FLOW_DISC)
+		action = MV88E6XXX_POLICY_ACTION_DISCARD;
+	else
+		return -EOPNOTSUPP;
+
+	switch (fs->flow_type & ~FLOW_EXT) {
+	case ETHER_FLOW:
+		if (!is_zero_ether_addr(mac_mask->h_dest) &&
+		    is_zero_ether_addr(mac_mask->h_source)) {
+			mapping = MV88E6XXX_POLICY_MAPPING_DA;
+			addr = mac_entry->h_dest;
+		} else if (is_zero_ether_addr(mac_mask->h_dest) &&
+		    !is_zero_ether_addr(mac_mask->h_source)) {
+			mapping = MV88E6XXX_POLICY_MAPPING_SA;
+			addr = mac_entry->h_source;
+		} else {
+			/* Cannot support DA and SA mapping in the same rule */
+			return -EOPNOTSUPP;
+		}
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	if ((fs->flow_type & FLOW_EXT) && fs->m_ext.vlan_tci) {
+		if (fs->m_ext.vlan_tci != 0xffff)
+			return -EOPNOTSUPP;
+		vid = be16_to_cpu(fs->h_ext.vlan_tci) & VLAN_VID_MASK;
+	}
+
+	idr_for_each_entry(&chip->policies, policy, id) {
+		if (policy->port == port && policy->mapping == mapping &&
+		    policy->action == action && policy->vid == vid &&
+		    ether_addr_equal(policy->addr, addr))
+			return -EEXIST;
+	}
+
+	policy = devm_kzalloc(chip->dev, sizeof(*policy), GFP_KERNEL);
+	if (!policy)
+		return -ENOMEM;
+
+	fs->location = 0;
+	err = idr_alloc_u32(&chip->policies, policy, &fs->location, 0xffffffff,
+			    GFP_KERNEL);
+	if (err) {
+		devm_kfree(chip->dev, policy);
+		return err;
+	}
+
+	memcpy(&policy->fs, fs, sizeof(*fs));
+	ether_addr_copy(policy->addr, addr);
+	policy->mapping = mapping;
+	policy->action = action;
+	policy->port = port;
+	policy->vid = vid;
+
+	err = mv88e6xxx_policy_apply(chip, port, policy);
+	if (err) {
+		idr_remove(&chip->policies, fs->location);
+		devm_kfree(chip->dev, policy);
+		return err;
+	}
+
+	return 0;
+}
+
+static int mv88e6xxx_get_rxnfc(struct dsa_switch *ds, int port,
+			       struct ethtool_rxnfc *rxnfc, u32 *rule_locs)
+{
+	struct ethtool_rx_flow_spec *fs = &rxnfc->fs;
+	struct mv88e6xxx_chip *chip = ds->priv;
+	struct mv88e6xxx_policy *policy;
+	int err;
+	int id;
+
+	mv88e6xxx_reg_lock(chip);
+
+	switch (rxnfc->cmd) {
+	case ETHTOOL_GRXCLSRLCNT:
+		rxnfc->data = 0;
+		rxnfc->data |= RX_CLS_LOC_SPECIAL;
+		rxnfc->rule_cnt = 0;
+		idr_for_each_entry(&chip->policies, policy, id)
+			if (policy->port == port)
+				rxnfc->rule_cnt++;
+		err = 0;
+		break;
+	case ETHTOOL_GRXCLSRULE:
+		err = -ENOENT;
+		policy = idr_find(&chip->policies, fs->location);
+		if (policy) {
+			memcpy(fs, &policy->fs, sizeof(*fs));
+			err = 0;
+		}
+		break;
+	case ETHTOOL_GRXCLSRLALL:
+		rxnfc->data = 0;
+		rxnfc->rule_cnt = 0;
+		idr_for_each_entry(&chip->policies, policy, id)
+			if (policy->port == port)
+				rule_locs[rxnfc->rule_cnt++] = id;
+		err = 0;
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+	mv88e6xxx_reg_unlock(chip);
+
+	return err;
+}
+
+static int mv88e6xxx_set_rxnfc(struct dsa_switch *ds, int port,
+			       struct ethtool_rxnfc *rxnfc)
+{
+	struct ethtool_rx_flow_spec *fs = &rxnfc->fs;
+	struct mv88e6xxx_chip *chip = ds->priv;
+	struct mv88e6xxx_policy *policy;
+	int err;
+
+	mv88e6xxx_reg_lock(chip);
+
+	switch (rxnfc->cmd) {
+	case ETHTOOL_SRXCLSRLINS:
+		err = mv88e6xxx_policy_insert(chip, port, fs);
+		break;
+	case ETHTOOL_SRXCLSRLDEL:
+		err = -ENOENT;
+		policy = idr_remove(&chip->policies, fs->location);
+		if (policy) {
+			policy->action = MV88E6XXX_POLICY_ACTION_NORMAL;
+			err = mv88e6xxx_policy_apply(chip, port, policy);
+			devm_kfree(chip->dev, policy);
+		}
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+	mv88e6xxx_reg_unlock(chip);
+
+	return err;
+}
+
 static int mv88e6xxx_port_add_broadcast(struct mv88e6xxx_chip *chip, int port,
 					u16 vid)
 {
@@ -4655,6 +4865,7 @@ static struct mv88e6xxx_chip *mv88e6xxx_alloc_chip(struct device *dev)
 
 	mutex_init(&chip->reg_lock);
 	INIT_LIST_HEAD(&chip->mdios);
+	idr_init(&chip->policies);
 
 	return chip;
 }
@@ -4739,6 +4950,8 @@ static const struct dsa_switch_ops mv88e6xxx_switch_ops = {
 	.set_eeprom		= mv88e6xxx_set_eeprom,
 	.get_regs_len		= mv88e6xxx_get_regs_len,
 	.get_regs		= mv88e6xxx_get_regs,
+	.get_rxnfc		= mv88e6xxx_get_rxnfc,
+	.set_rxnfc		= mv88e6xxx_set_rxnfc,
 	.set_ageing_time	= mv88e6xxx_set_ageing_time,
 	.port_bridge_join	= mv88e6xxx_port_bridge_join,
 	.port_bridge_leave	= mv88e6xxx_port_bridge_leave,
diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h
index 04a329a98158..e9b1a1ac9a8e 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.h
+++ b/drivers/net/dsa/mv88e6xxx/chip.h
@@ -8,6 +8,7 @@
 #ifndef _MV88E6XXX_CHIP_H
 #define _MV88E6XXX_CHIP_H
 
+#include <linux/idr.h>
 #include <linux/if_vlan.h>
 #include <linux/irq.h>
 #include <linux/gpio/consumer.h>
@@ -207,6 +208,15 @@ enum mv88e6xxx_policy_action {
 	MV88E6XXX_POLICY_ACTION_DISCARD,
 };
 
+struct mv88e6xxx_policy {
+	enum mv88e6xxx_policy_mapping mapping;
+	enum mv88e6xxx_policy_action action;
+	struct ethtool_rx_flow_spec fs;
+	u8 addr[ETH_ALEN];
+	int port;
+	u16 vid;
+};
+
 struct mv88e6xxx_port {
 	struct mv88e6xxx_chip *chip;
 	int port;
@@ -265,6 +275,9 @@ struct mv88e6xxx_chip {
 	/* List of mdio busses */
 	struct list_head mdios;
 
+	/* Policy Control List IDs and rules */
+	struct idr policies;
+
 	/* There can be two interrupt controllers, which are chained
 	 * off a GPIO as interrupt source
 	 */
-- 
2.23.0


^ permalink raw reply related

* Re: [PATCH net-next 3/3] net: dsa: mv88e6xxx: add RXNFC support
From: Andrew Lunn @ 2019-09-07 20:32 UTC (permalink / raw)
  To: Vivien Didelot; +Cc: netdev, davem, f.fainelli
In-Reply-To: <20190907200049.25273-4-vivien.didelot@gmail.com>

> +static int mv88e6xxx_policy_insert(struct mv88e6xxx_chip *chip, int port,
> +				   struct ethtool_rx_flow_spec *fs)
> +{
> +	struct ethhdr *mac_entry = &fs->h_u.ether_spec;
> +	struct ethhdr *mac_mask = &fs->m_u.ether_spec;
> +	enum mv88e6xxx_policy_mapping mapping;
> +	enum mv88e6xxx_policy_action action;
> +	struct mv88e6xxx_policy *policy;
> +	u16 vid = 0;
> +	u8 *addr;
> +	int err;
> +	int id;
> +
> +	if (fs->location != RX_CLS_LOC_ANY)
> +		return -EINVAL;
> +
> +	if (fs->ring_cookie == RX_CLS_FLOW_DISC)
> +		action = MV88E6XXX_POLICY_ACTION_DISCARD;
> +	else
> +		return -EOPNOTSUPP;
> +
> +	switch (fs->flow_type & ~FLOW_EXT) {
> +	case ETHER_FLOW:
> +		if (!is_zero_ether_addr(mac_mask->h_dest) &&
> +		    is_zero_ether_addr(mac_mask->h_source)) {
> +			mapping = MV88E6XXX_POLICY_MAPPING_DA;
> +			addr = mac_entry->h_dest;
> +		} else if (is_zero_ether_addr(mac_mask->h_dest) &&
> +		    !is_zero_ether_addr(mac_mask->h_source)) {
> +			mapping = MV88E6XXX_POLICY_MAPPING_SA;
> +			addr = mac_entry->h_source;
> +		} else {
> +			/* Cannot support DA and SA mapping in the same rule */
> +			return -EOPNOTSUPP;
> +		}
> +		break;
> +	default:
> +		return -EOPNOTSUPP;
> +	}
> +
> +	if ((fs->flow_type & FLOW_EXT) && fs->m_ext.vlan_tci) {
> +		if (fs->m_ext.vlan_tci != 0xffff)
> +			return -EOPNOTSUPP;
> +		vid = be16_to_cpu(fs->h_ext.vlan_tci) & VLAN_VID_MASK;
> +	}
> +
> +	idr_for_each_entry(&chip->policies, policy, id) {
> +		if (policy->port == port && policy->mapping == mapping &&
> +		    policy->action == action && policy->vid == vid &&
> +		    ether_addr_equal(policy->addr, addr))
> +			return -EEXIST;
> +	}
> +
> +	policy = devm_kzalloc(chip->dev, sizeof(*policy), GFP_KERNEL);
> +	if (!policy)
> +		return -ENOMEM;

Hi Vivien

I think this might be the first time we have done dynamic memory
allocation in the mv88e6xxx driver. It might even be a first for a DSA
driver?

I'm not saying it is wrong, but maybe we should discuss it. 

I assume you are doing this because the ATU entry itself is not
sufficient?

How much memory is involved here, worst case? I assume one struct
mv88e6xxx_policy per ATU entry? Which you think is too much to
allocate as part of chip? I guess most users will never use this
feature, so for most users it would be wasted memory. So i do see the
point for dynamically allocating it.

Thanks
	Andrew

^ permalink raw reply

* Re: [PATCH net-next 1/3] net: dsa: mv88e6xxx: complete ATU state definitions
From: Andrew Lunn @ 2019-09-07 20:33 UTC (permalink / raw)
  To: Vivien Didelot; +Cc: netdev, davem, f.fainelli
In-Reply-To: <20190907200049.25273-2-vivien.didelot@gmail.com>

On Sat, Sep 07, 2019 at 04:00:47PM -0400, Vivien Didelot wrote:
> Marvell has different values for the state of a MAC address,
> depending on its multicast bit. This patch completes the definitions
> for these states.
> 
> At the same time, use 0 which is intuitive enough and simplifies the
> code a bit, instead of the UC or MC unused value.
> 
> Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* Re: [PATCH net-next 2/3] net: dsa: mv88e6xxx: introduce .port_set_policy
From: Andrew Lunn @ 2019-09-07 20:33 UTC (permalink / raw)
  To: Vivien Didelot; +Cc: netdev, davem, f.fainelli
In-Reply-To: <20190907200049.25273-3-vivien.didelot@gmail.com>

On Sat, Sep 07, 2019 at 04:00:48PM -0400, Vivien Didelot wrote:
> Introduce a new .port_set_policy operation to configure a port's
> Policy Control List, based on mapping such as DA, SA, Etype and so on.
> 
> Models similar to 88E6352 and 88E6390 are supported at the moment.
> 
> Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* Re: [patch net-next 3/3] net: devlink: move reload fail indication to devlink core and expose to user
From: Jiri Pirko @ 2019-09-07 20:38 UTC (permalink / raw)
  To: David Ahern; +Cc: netdev, davem, idosch, jakub.kicinski, tariqt, mlxsw
In-Reply-To: <6ff0726a-f910-8107-883e-83476f80b9de@gmail.com>

Sat, Sep 07, 2019 at 05:08:59PM CEST, dsahern@gmail.com wrote:
>On 9/6/19 7:44 PM, Jiri Pirko wrote:
>> From: Jiri Pirko <jiri@mellanox.com>
>> 
>> Currently the fact that devlink failed is stored in drivers. Move this
>> flag into devlink core. Also, expose it to the user.
>
>you mean 'reload failed', not 'devlink failed'?

Yeah, "reload failed".

>

^ permalink raw reply

* [patch net-next v2 0/3] net: devlink: move reload fail indication to devlink core and expose to user
From: Jiri Pirko @ 2019-09-07 20:53 UTC (permalink / raw)
  To: netdev; +Cc: davem, idosch, dsahern, jakub.kicinski, tariqt, mlxsw

From: Jiri Pirko <jiri@mellanox.com>

First two patches are dependencies of the last one. That moves devlink
reload failure indication to the devlink code, so the drivers do not
have to track it themselves. Currently it is only mlxsw, but I will send
a follow-up patchset that introduces this in netdevsim too.

Jiri Pirko (3):
  mlx4: Split restart_one into two functions
  net: devlink: split reload op into two
  net: devlink: move reload fail indication to devlink core and expose
    to user

 drivers/net/ethernet/mellanox/mlx4/catas.c |  2 +-
 drivers/net/ethernet/mellanox/mlx4/main.c  | 44 ++++++++++++++++++----
 drivers/net/ethernet/mellanox/mlx4/mlx4.h  |  3 +-
 drivers/net/ethernet/mellanox/mlxsw/core.c | 30 +++++++++------
 drivers/net/netdevsim/dev.c                | 13 +++++--
 include/net/devlink.h                      |  8 +++-
 include/uapi/linux/devlink.h               |  2 +
 net/core/devlink.c                         | 35 +++++++++++++++--
 8 files changed, 106 insertions(+), 31 deletions(-)

-- 
2.21.0


^ permalink raw reply

* [patch net-next v2 1/3] mlx4: Split restart_one into two functions
From: Jiri Pirko @ 2019-09-07 20:53 UTC (permalink / raw)
  To: netdev; +Cc: davem, idosch, dsahern, jakub.kicinski, tariqt, mlxsw
In-Reply-To: <20190907205400.14589-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Split the function restart_one into two functions and separate teardown
and buildup.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/catas.c |  2 +-
 drivers/net/ethernet/mellanox/mlx4/main.c  | 25 ++++++++++++++++++----
 drivers/net/ethernet/mellanox/mlx4/mlx4.h  |  3 +--
 3 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/catas.c b/drivers/net/ethernet/mellanox/mlx4/catas.c
index 87e90b5d4d7d..5b11557f1ae4 100644
--- a/drivers/net/ethernet/mellanox/mlx4/catas.c
+++ b/drivers/net/ethernet/mellanox/mlx4/catas.c
@@ -210,7 +210,7 @@ static void mlx4_handle_error_state(struct mlx4_dev_persistent *persist)
 	mutex_lock(&persist->interface_state_mutex);
 	if (persist->interface_state & MLX4_INTERFACE_STATE_UP &&
 	    !(persist->interface_state & MLX4_INTERFACE_STATE_DELETION)) {
-		err = mlx4_restart_one(persist->pdev, false, NULL);
+		err = mlx4_restart_one(persist->pdev);
 		mlx4_info(persist->dev, "mlx4_restart_one was ended, ret=%d\n",
 			  err);
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 07c204bd3fc4..a39c647c12dc 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -3931,6 +3931,10 @@ static void mlx4_devlink_param_load_driverinit_values(struct devlink *devlink)
 	}
 }
 
+static void mlx4_restart_one_down(struct pci_dev *pdev);
+static int mlx4_restart_one_up(struct pci_dev *pdev, bool reload,
+			       struct devlink *devlink);
+
 static int mlx4_devlink_reload(struct devlink *devlink,
 			       struct netlink_ext_ack *extack)
 {
@@ -3941,9 +3945,11 @@ static int mlx4_devlink_reload(struct devlink *devlink,
 
 	if (persist->num_vfs)
 		mlx4_warn(persist->dev, "Reload performed on PF, will cause reset on operating Virtual Functions\n");
-	err = mlx4_restart_one(persist->pdev, true, devlink);
+	mlx4_restart_one_down(persist->pdev);
+	err = mlx4_restart_one_up(persist->pdev, true, devlink);
 	if (err)
-		mlx4_err(persist->dev, "mlx4_restart_one failed, ret=%d\n", err);
+		mlx4_err(persist->dev, "mlx4_restart_one_up failed, ret=%d\n",
+			 err);
 
 	return err;
 }
@@ -4163,7 +4169,13 @@ static int restore_current_port_types(struct mlx4_dev *dev,
 	return err;
 }
 
-int mlx4_restart_one(struct pci_dev *pdev, bool reload, struct devlink *devlink)
+static void mlx4_restart_one_down(struct pci_dev *pdev)
+{
+	mlx4_unload_one(pdev);
+}
+
+static int mlx4_restart_one_up(struct pci_dev *pdev, bool reload,
+			       struct devlink *devlink)
 {
 	struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev);
 	struct mlx4_dev	 *dev  = persist->dev;
@@ -4175,7 +4187,6 @@ int mlx4_restart_one(struct pci_dev *pdev, bool reload, struct devlink *devlink)
 	total_vfs = dev->persist->num_vfs;
 	memcpy(nvfs, dev->persist->nvfs, sizeof(dev->persist->nvfs));
 
-	mlx4_unload_one(pdev);
 	if (reload)
 		mlx4_devlink_param_load_driverinit_values(devlink);
 	err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv, 1);
@@ -4194,6 +4205,12 @@ int mlx4_restart_one(struct pci_dev *pdev, bool reload, struct devlink *devlink)
 	return err;
 }
 
+int mlx4_restart_one(struct pci_dev *pdev)
+{
+	mlx4_restart_one_down(pdev);
+	return mlx4_restart_one_up(pdev, false, NULL);
+}
+
 #define MLX_SP(id) { PCI_VDEVICE(MELLANOX, id), MLX4_PCI_DEV_FORCE_SENSE_PORT }
 #define MLX_VF(id) { PCI_VDEVICE(MELLANOX, id), MLX4_PCI_DEV_IS_VF }
 #define MLX_GN(id) { PCI_VDEVICE(MELLANOX, id), 0 }
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index 23f1b5b512c2..527b52e48276 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -1043,8 +1043,7 @@ int mlx4_catas_init(struct mlx4_dev *dev);
 void mlx4_catas_end(struct mlx4_dev *dev);
 int mlx4_crdump_init(struct mlx4_dev *dev);
 void mlx4_crdump_end(struct mlx4_dev *dev);
-int mlx4_restart_one(struct pci_dev *pdev, bool reload,
-		     struct devlink *devlink);
+int mlx4_restart_one(struct pci_dev *pdev);
 int mlx4_register_device(struct mlx4_dev *dev);
 void mlx4_unregister_device(struct mlx4_dev *dev);
 void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type,
-- 
2.21.0


^ permalink raw reply related

* [patch net-next v2 2/3] net: devlink: split reload op into two
From: Jiri Pirko @ 2019-09-07 20:53 UTC (permalink / raw)
  To: netdev; +Cc: davem, idosch, dsahern, jakub.kicinski, tariqt, mlxsw
In-Reply-To: <20190907205400.14589-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

In order to properly implement failure indication during reload,
split the reload op into two ops, one for down phase and one for
up phase.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/main.c  | 19 +++++++++++++++----
 drivers/net/ethernet/mellanox/mlxsw/core.c | 19 +++++++++++++++----
 drivers/net/netdevsim/dev.c                | 13 ++++++++++---
 include/net/devlink.h                      |  5 ++++-
 net/core/devlink.c                         | 16 ++++++++++++----
 5 files changed, 56 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index a39c647c12dc..ef3f3d06ff1e 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -3935,17 +3935,27 @@ static void mlx4_restart_one_down(struct pci_dev *pdev);
 static int mlx4_restart_one_up(struct pci_dev *pdev, bool reload,
 			       struct devlink *devlink);
 
-static int mlx4_devlink_reload(struct devlink *devlink,
-			       struct netlink_ext_ack *extack)
+static int mlx4_devlink_reload_down(struct devlink *devlink,
+				    struct netlink_ext_ack *extack)
 {
 	struct mlx4_priv *priv = devlink_priv(devlink);
 	struct mlx4_dev *dev = &priv->dev;
 	struct mlx4_dev_persistent *persist = dev->persist;
-	int err;
 
 	if (persist->num_vfs)
 		mlx4_warn(persist->dev, "Reload performed on PF, will cause reset on operating Virtual Functions\n");
 	mlx4_restart_one_down(persist->pdev);
+	return 0;
+}
+
+static int mlx4_devlink_reload_up(struct devlink *devlink,
+				  struct netlink_ext_ack *extack)
+{
+	struct mlx4_priv *priv = devlink_priv(devlink);
+	struct mlx4_dev *dev = &priv->dev;
+	struct mlx4_dev_persistent *persist = dev->persist;
+	int err;
+
 	err = mlx4_restart_one_up(persist->pdev, true, devlink);
 	if (err)
 		mlx4_err(persist->dev, "mlx4_restart_one_up failed, ret=%d\n",
@@ -3956,7 +3966,8 @@ static int mlx4_devlink_reload(struct devlink *devlink,
 
 static const struct devlink_ops mlx4_devlink_ops = {
 	.port_type_set	= mlx4_devlink_port_type_set,
-	.reload		= mlx4_devlink_reload,
+	.reload_down	= mlx4_devlink_reload_down,
+	.reload_up	= mlx4_devlink_reload_up,
 };
 
 static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index 963a2b4b61b1..c71a1d9ea17b 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -984,16 +984,26 @@ mlxsw_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req,
 	return 0;
 }
 
-static int mlxsw_devlink_core_bus_device_reload(struct devlink *devlink,
-						struct netlink_ext_ack *extack)
+static int
+mlxsw_devlink_core_bus_device_reload_down(struct devlink *devlink,
+					  struct netlink_ext_ack *extack)
 {
 	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
-	int err;
 
 	if (!(mlxsw_core->bus->features & MLXSW_BUS_F_RESET))
 		return -EOPNOTSUPP;
 
 	mlxsw_core_bus_device_unregister(mlxsw_core, true);
+	return 0;
+}
+
+static int
+mlxsw_devlink_core_bus_device_reload_up(struct devlink *devlink,
+					struct netlink_ext_ack *extack)
+{
+	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
+	int err;
+
 	err = mlxsw_core_bus_device_register(mlxsw_core->bus_info,
 					     mlxsw_core->bus,
 					     mlxsw_core->bus_priv, true,
@@ -1066,7 +1076,8 @@ mlxsw_devlink_trap_group_init(struct devlink *devlink,
 }
 
 static const struct devlink_ops mlxsw_devlink_ops = {
-	.reload				= mlxsw_devlink_core_bus_device_reload,
+	.reload_down		= mlxsw_devlink_core_bus_device_reload_down,
+	.reload_up		= mlxsw_devlink_core_bus_device_reload_up,
 	.port_type_set			= mlxsw_devlink_port_type_set,
 	.port_split			= mlxsw_devlink_port_split,
 	.port_unsplit			= mlxsw_devlink_port_unsplit,
diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c
index 39cdb6c18ec0..7fba7b271a57 100644
--- a/drivers/net/netdevsim/dev.c
+++ b/drivers/net/netdevsim/dev.c
@@ -521,8 +521,14 @@ static void nsim_dev_traps_exit(struct devlink *devlink)
 	kfree(nsim_dev->trap_data);
 }
 
-static int nsim_dev_reload(struct devlink *devlink,
-			   struct netlink_ext_ack *extack)
+static int nsim_dev_reload_down(struct devlink *devlink,
+				struct netlink_ext_ack *extack)
+{
+	return 0;
+}
+
+static int nsim_dev_reload_up(struct devlink *devlink,
+			      struct netlink_ext_ack *extack)
 {
 	enum nsim_resource_id res_ids[] = {
 		NSIM_RESOURCE_IPV4_FIB, NSIM_RESOURCE_IPV4_FIB_RULES,
@@ -638,7 +644,8 @@ nsim_dev_devlink_trap_action_set(struct devlink *devlink,
 }
 
 static const struct devlink_ops nsim_dev_devlink_ops = {
-	.reload = nsim_dev_reload,
+	.reload_down = nsim_dev_reload_down,
+	.reload_up = nsim_dev_reload_up,
 	.flash_update = nsim_dev_flash_update,
 	.trap_init = nsim_dev_devlink_trap_init,
 	.trap_action_set = nsim_dev_devlink_trap_action_set,
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 460bc629d1a4..c17709c0d0ec 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -637,7 +637,10 @@ enum devlink_trap_group_generic_id {
 	}
 
 struct devlink_ops {
-	int (*reload)(struct devlink *devlink, struct netlink_ext_ack *extack);
+	int (*reload_down)(struct devlink *devlink,
+			   struct netlink_ext_ack *extack);
+	int (*reload_up)(struct devlink *devlink,
+			 struct netlink_ext_ack *extack);
 	int (*port_type_set)(struct devlink_port *devlink_port,
 			     enum devlink_port_type port_type);
 	int (*port_split)(struct devlink *devlink, unsigned int port_index,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 6e52d639dac6..1e3a2288b0b2 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2672,12 +2672,17 @@ devlink_resources_validate(struct devlink *devlink,
 	return err;
 }
 
+static bool devlink_reload_supported(struct devlink *devlink)
+{
+	return devlink->ops->reload_down && devlink->ops->reload_up;
+}
+
 static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
 {
 	struct devlink *devlink = info->user_ptr[0];
 	int err;
 
-	if (!devlink->ops->reload)
+	if (!devlink_reload_supported(devlink))
 		return -EOPNOTSUPP;
 
 	err = devlink_resources_validate(devlink, NULL, info);
@@ -2685,7 +2690,10 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
 		NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed");
 		return err;
 	}
-	return devlink->ops->reload(devlink, info->extack);
+	err = devlink->ops->reload_down(devlink, info->extack);
+	if (err)
+		return err;
+	return devlink->ops->reload_up(devlink, info->extack);
 }
 
 static int devlink_nl_flash_update_fill(struct sk_buff *msg,
@@ -7145,7 +7153,7 @@ __devlink_param_driverinit_value_set(struct devlink *devlink,
 int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id,
 				       union devlink_param_value *init_val)
 {
-	if (!devlink->ops->reload)
+	if (!devlink_reload_supported(devlink))
 		return -EOPNOTSUPP;
 
 	return __devlink_param_driverinit_value_get(&devlink->param_list,
@@ -7192,7 +7200,7 @@ int devlink_port_param_driverinit_value_get(struct devlink_port *devlink_port,
 {
 	struct devlink *devlink = devlink_port->devlink;
 
-	if (!devlink->ops->reload)
+	if (!devlink_reload_supported(devlink))
 		return -EOPNOTSUPP;
 
 	return __devlink_param_driverinit_value_get(&devlink_port->param_list,
-- 
2.21.0


^ permalink raw reply related

* [patch net-next v2 3/3] net: devlink: move reload fail indication to devlink core and expose to user
From: Jiri Pirko @ 2019-09-07 20:54 UTC (permalink / raw)
  To: netdev; +Cc: davem, idosch, dsahern, jakub.kicinski, tariqt, mlxsw
In-Reply-To: <20190907205400.14589-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Currently the fact that devlink reload failed is stored in drivers.
Move this flag into devlink core. Also, expose it to the user.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
v1->v2:
- s/devlink failed/devlink reload failed/ in description
---
 drivers/net/ethernet/mellanox/mlxsw/core.c | 15 +++++----------
 include/net/devlink.h                      |  3 +++
 include/uapi/linux/devlink.h               |  2 ++
 net/core/devlink.c                         | 21 ++++++++++++++++++++-
 4 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index c71a1d9ea17b..3fa96076e8a5 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -80,7 +80,6 @@ struct mlxsw_core {
 	struct mlxsw_thermal *thermal;
 	struct mlxsw_core_port *ports;
 	unsigned int max_ports;
-	bool reload_fail;
 	bool fw_flash_in_progress;
 	unsigned long driver_priv[0];
 	/* driver_priv has to be always the last item */
@@ -1002,15 +1001,11 @@ mlxsw_devlink_core_bus_device_reload_up(struct devlink *devlink,
 					struct netlink_ext_ack *extack)
 {
 	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
-	int err;
-
-	err = mlxsw_core_bus_device_register(mlxsw_core->bus_info,
-					     mlxsw_core->bus,
-					     mlxsw_core->bus_priv, true,
-					     devlink);
-	mlxsw_core->reload_fail = !!err;
 
-	return err;
+	return mlxsw_core_bus_device_register(mlxsw_core->bus_info,
+					      mlxsw_core->bus,
+					      mlxsw_core->bus_priv, true,
+					      devlink);
 }
 
 static int mlxsw_devlink_flash_update(struct devlink *devlink,
@@ -1254,7 +1249,7 @@ void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core,
 {
 	struct devlink *devlink = priv_to_devlink(mlxsw_core);
 
-	if (mlxsw_core->reload_fail) {
+	if (devlink_is_reload_failed(devlink)) {
 		if (!reload)
 			/* Only the parts that were not de-initialized in the
 			 * failed reload attempt need to be de-initialized.
diff --git a/include/net/devlink.h b/include/net/devlink.h
index c17709c0d0ec..9c881dc25273 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -38,6 +38,7 @@ struct devlink {
 	struct device *dev;
 	possible_net_t _net;
 	struct mutex lock;
+	bool reload_failed;
 	char priv[0] __aligned(NETDEV_ALIGN);
 };
 
@@ -940,6 +941,8 @@ void
 devlink_health_reporter_state_update(struct devlink_health_reporter *reporter,
 				     enum devlink_health_reporter_state state);
 
+bool devlink_is_reload_failed(struct devlink *devlink);
+
 void devlink_flash_update_begin_notify(struct devlink *devlink);
 void devlink_flash_update_end_notify(struct devlink *devlink);
 void devlink_flash_update_status_notify(struct devlink *devlink,
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 546e75dd74ac..7cb5e8c5ae0d 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -410,6 +410,8 @@ enum devlink_attr {
 	DEVLINK_ATTR_TRAP_METADATA,			/* nested */
 	DEVLINK_ATTR_TRAP_GROUP_NAME,			/* string */
 
+	DEVLINK_ATTR_RELOAD_FAILED,			/* u8 0 or 1 */
+
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 1e3a2288b0b2..e00a4a643d17 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -471,6 +471,8 @@ static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink,
 
 	if (devlink_nl_put_handle(msg, devlink))
 		goto nla_put_failure;
+	if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_FAILED, devlink->reload_failed))
+		goto nla_put_failure;
 
 	genlmsg_end(msg, hdr);
 	return 0;
@@ -2677,6 +2679,21 @@ static bool devlink_reload_supported(struct devlink *devlink)
 	return devlink->ops->reload_down && devlink->ops->reload_up;
 }
 
+static void devlink_reload_failed_set(struct devlink *devlink,
+				      bool reload_failed)
+{
+	if (devlink->reload_failed == reload_failed)
+		return;
+	devlink->reload_failed = reload_failed;
+	devlink_notify(devlink, DEVLINK_CMD_NEW);
+}
+
+bool devlink_is_reload_failed(struct devlink *devlink)
+{
+	return devlink->reload_failed;
+}
+EXPORT_SYMBOL_GPL(devlink_is_reload_failed);
+
 static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
 {
 	struct devlink *devlink = info->user_ptr[0];
@@ -2693,7 +2710,9 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
 	err = devlink->ops->reload_down(devlink, info->extack);
 	if (err)
 		return err;
-	return devlink->ops->reload_up(devlink, info->extack);
+	err = devlink->ops->reload_up(devlink, info->extack);
+	devlink_reload_failed_set(devlink, !!err);
+	return err;
 }
 
 static int devlink_nl_flash_update_fill(struct sk_buff *msg,
-- 
2.21.0


^ permalink raw reply related

* Re: [PATCH net-next 3/3] net: dsa: mv88e6xxx: add RXNFC support
From: Vivien Didelot @ 2019-09-07 21:25 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: netdev, davem, f.fainelli
In-Reply-To: <20190907203256.GA18741@lunn.ch>

Hi Andrew,

On Sat, 7 Sep 2019 22:32:56 +0200, Andrew Lunn <andrew@lunn.ch> wrote:
> > +	policy = devm_kzalloc(chip->dev, sizeof(*policy), GFP_KERNEL);
> > +	if (!policy)
> > +		return -ENOMEM;
> 
> I think this might be the first time we have done dynamic memory
> allocation in the mv88e6xxx driver. It might even be a first for a DSA
> driver?
> 
> I'm not saying it is wrong, but maybe we should discuss it. 
> 
> I assume you are doing this because the ATU entry itself is not
> sufficient?
> 
> How much memory is involved here, worst case? I assume one struct
> mv88e6xxx_policy per ATU entry? Which you think is too much to
> allocate as part of chip? I guess most users will never use this
> feature, so for most users it would be wasted memory. So i do see the
> point for dynamically allocating it.

A layer 2 policy is not limited to the ATU. It can also be based on a VTU
entry, on the port's Etype, or frame's Etype. We can have 0, 1 or literally
thousands of policies programmed by the user. The ethtool API does not
store the entries and requires the driver to dump them on get operations,
hence the allocation for simplicity. But we may accomodate the DSA layer in
the future if there are more RXNFC users than just bcm_sf2 and mv88e6xxx.


Thanks,

	Vivien

^ permalink raw reply

* [RFC bpf-next 0/7] bpf: packet capture helpers, bpftool support
From: Alan Maguire @ 2019-09-07 21:40 UTC (permalink / raw)
  To: ast, daniel, kafai, songliubraving, yhs, davem, jakub.kicinski,
	hawk, john.fastabend, rostedt, mingo, quentin.monnet, rdna, joe,
	acme, jolsa, alexey.budankov, gregkh, namhyung, sdf, f.fainelli,
	shuah, peter, ivan, andriin, bhole_prashant_q7, david.calavera,
	danieltimlee, ctakshak, netdev, bpf, linux-kselftest
  Cc: Alan Maguire

Packet capture is useful from a general debugging standpoint, and is
useful in particular in debugging BPF programs that do packet processing.
For general debugging, being able to initiate arbitrary packet capture
from kprobes and tracepoints is highly valuable; e.g. what do the packets
that reach kfree_skb() - representing error codepaths - look like?
Arbitrary packet capture is distinct from the traditional concept of
pre-defined hooks, and gives much more flexibility in probing system
behaviour. For packet-processing BPF programs, packet capture can be useful
for doing things such as debugging checksum errors.

The intent of this RFC patchset is to initiate discussion on if and how to
work packet capture-specific capabilities into BPF.  It is possible -
and indeed projects like xdpcap [1] have demonstrated how - to carry out
packet capture in BPF today via perf events, but the aim here is to
simplify both the in-BPF capture and the userspace collection.

The suggested approach is to add a new bpf helper - bpf_pcap() - to
simplify packet capture within BPF programs, and to enhance bpftool
to add a "pcap" subcommand to aid in retrieving packets.  The helper
is for the most part a wrapper around perf event sending, using
data relevant for packet capture as metadata.

The end result is being able to capture packet data in the following
manner.  For example if we add an iptables drop rule, we can observe
TCP SYN segments being freed at kfree_skb:

$ iptables -A INPUT -p tcp --dport 6666 -j DROP
$ bpftool pcap trace kprobe:kfree_skb proto ip data_out /tmp/cap &
$ nc 127.0.0.1 6666
Ncat: Connection timed out.
$ fg
^C
$ tshark -r /tmp/cap
Running as user "root" and group "root". This could be dangerous.
...
  3          7    127.0.0.1 -> 127.0.0.1    TCP 60 54732 > ircu [SYN] Seq=0 Win=65495 Len=0 MSS=65495 SACK_PERM=1 TSval=696475539 TSecr=0 WS=128
...

Tracepoints are also supported, and by default data is sent to
stdout, so we can pipe to tcpdump:

$ bpftool pcap trace tracepoint:net_dev_xmit:arg1 proto eth | tcpdump -r -
reading from file -, link-type EN10MB (Ethernet)
00:16:49.150880 IP 10.11.12.13 > 10.11.12.14: ICMP echo reply, id 10519, seq 1, length 64
...

Patch 1 adds support for bpf_pcap() in skb and XDP programs.  In those cases,
the argument is the relevant context (struct __sk_buff or xdp metadata)
from which we capture.
Patch 2 extends the helper to allow it to work for tracing programs, and in
that case the data argument is a pointer to an skb, derived from raw
tracepoint or kprobe arguments.
Patch 3 syncs uapi and tools headers for the new helper, flags and associated
pcap header type.
Patch 4 adds a feature test for libpcap which will be used in the next patch.
Patch 5 adds a "pcap" subcommand to bpftool to collect packet data from
BPF-driven perf event maps in existing programs.  Also supplied are simple
tracepoint and kprobe programs which can be used to attach to a kprobe
or raw tracepoint to retrieve arguments and capture the associated skb.
Patch 6 adds documentation for the new pcap subcommand.
Patch 7 tests the pcap subcommand for tracing, skb and xdp programs.

Alan Maguire (7):
  bpf: add bpf_pcap() helper to simplify packet capture
  bpf: extend bpf_pcap support to tracing programs
  bpf: sync tools/include/uapi/linux/bpf.h for pcap support
  bpf: add libpcap feature test
  bpf: add pcap support to bpftool
  bpf: add documentation for bpftool pcap subcommand
  bpf: add tests for bpftool packet capture

 include/linux/bpf.h                                |  20 +
 include/uapi/linux/bpf.h                           |  92 +++-
 kernel/bpf/verifier.c                              |   4 +-
 kernel/trace/bpf_trace.c                           | 214 +++++++++
 net/core/filter.c                                  |  67 +++
 tools/bpf/bpftool/Documentation/bpftool-btf.rst    |   1 +
 tools/bpf/bpftool/Documentation/bpftool-cgroup.rst |   1 +
 .../bpf/bpftool/Documentation/bpftool-feature.rst  |   1 +
 tools/bpf/bpftool/Documentation/bpftool-map.rst    |   1 +
 tools/bpf/bpftool/Documentation/bpftool-net.rst    |   1 +
 tools/bpf/bpftool/Documentation/bpftool-pcap.rst   | 119 +++++
 tools/bpf/bpftool/Documentation/bpftool-perf.rst   |   1 +
 tools/bpf/bpftool/Documentation/bpftool-prog.rst   |   1 +
 tools/bpf/bpftool/Documentation/bpftool.rst        |   1 +
 tools/bpf/bpftool/Makefile                         |  39 +-
 tools/bpf/bpftool/main.c                           |   3 +-
 tools/bpf/bpftool/main.h                           |   1 +
 tools/bpf/bpftool/pcap.c                           | 496 +++++++++++++++++++++
 tools/bpf/bpftool/progs/bpftool_pcap_kprobe.c      |  80 ++++
 tools/bpf/bpftool/progs/bpftool_pcap_tracepoint.c  |  68 +++
 tools/build/Makefile.feature                       |   2 +
 tools/build/feature/Makefile                       |   4 +
 tools/build/feature/test-libpcap.c                 |  26 ++
 tools/include/uapi/linux/bpf.h                     |  92 +++-
 tools/testing/selftests/bpf/Makefile               |   3 +-
 tools/testing/selftests/bpf/bpf_helpers.h          |  11 +
 .../testing/selftests/bpf/progs/bpftool_pcap_tc.c  |  41 ++
 .../testing/selftests/bpf/progs/bpftool_pcap_xdp.c |  39 ++
 tools/testing/selftests/bpf/test_bpftool_pcap.sh   | 132 ++++++
 29 files changed, 1549 insertions(+), 12 deletions(-)
 create mode 100644 tools/bpf/bpftool/Documentation/bpftool-pcap.rst
 create mode 100644 tools/bpf/bpftool/pcap.c
 create mode 100644 tools/bpf/bpftool/progs/bpftool_pcap_kprobe.c
 create mode 100644 tools/bpf/bpftool/progs/bpftool_pcap_tracepoint.c
 create mode 100644 tools/build/feature/test-libpcap.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpftool_pcap_tc.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpftool_pcap_xdp.c
 create mode 100755 tools/testing/selftests/bpf/test_bpftool_pcap.sh

-- 
1.8.3.1


^ permalink raw reply

* [RFC bpf-next 2/7] bpf: extend bpf_pcap support to tracing programs
From: Alan Maguire @ 2019-09-07 21:40 UTC (permalink / raw)
  To: ast, daniel, kafai, songliubraving, yhs, davem, jakub.kicinski,
	hawk, john.fastabend, rostedt, mingo, quentin.monnet, rdna, joe,
	acme, jolsa, alexey.budankov, gregkh, namhyung, sdf, f.fainelli,
	shuah, peter, ivan, andriin, bhole_prashant_q7, david.calavera,
	danieltimlee, ctakshak, netdev, bpf, linux-kselftest
  Cc: Alan Maguire
In-Reply-To: <1567892444-16344-1-git-send-email-alan.maguire@oracle.com>

packet capture is especially valuable in tracing contexts, so
extend bpf_pcap helper to take a tracing-derived skb pointer
as an argument.

In the case of tracing programs, the starting protocol
(corresponding to libpcap DLT_* values; 1 for Ethernet, 12 for
IP, etc) needs to be specified and should reflect the protocol
type which is pointed to by the skb->data pointer; i.e. the
start of the packet.  This can derived in a limited set of cases,
but should be specified where possible.  For skb and xdp programs
this protocol will nearly always be 1 (BPF_PCAP_TYPE_ETH).

Example usage for a tracing program, where we use a
struct bpf_pcap_hdr array map to pass in preferences for
protocol and max len:

struct bpf_map_def SEC("maps") pcap_conf_map = {
	.type = BPF_MAP_TYPE_ARRAY,
	.key_size = sizeof(int),
	.value_size = sizeof(struct bpf_pcap_hdr),
	.max_entries = 1,
};

struct bpf_map_def SEC("maps") pcap_map = {
	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size = sizeof(int),
	.value_size = sizeof(int),
	.max_entries = 1024,
};

SEC("kprobe/kfree_skb")
int probe_kfree_skb(struct pt_regs *ctx)
{
	struct bpf_pcap_hdr *conf;
	int key = 0;

	conf = bpf_map_lookup_elem(&pcap_conf_map, &key);
	if (!conf)
		return 0;

	bpf_pcap((void *)PT_REGS_PARM1(ctx), conf->cap_len, &pcap_map,
		 conf->protocol, BPF_F_CURRENT_CPU);
	return 0;
}

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
---
 include/uapi/linux/bpf.h |  21 ++++-
 kernel/trace/bpf_trace.c | 214 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 233 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a27e58e..13f86d3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2758,7 +2758,9 @@ struct bpf_stack_build_id {
  *              held by *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This
  *		perf event has the same attributes as perf events generated
  *		by bpf_perf_event_output.  For skb and xdp programs, *data*
- *		is the relevant context.
+ *		is the relevant context, while for tracing programs,
+ *		*data* must be a pointer to a **struct sk_buff** derived
+ *		from kprobe or tracepoint arguments.
  *
  *		Metadata for this event is a **struct bpf_pcap_hdr**; this
  *		contains the capture length, actual packet length and
@@ -2771,6 +2773,14 @@ struct bpf_stack_build_id {
  *		to 48 bits; the id can be used to correlate captured packets
  *		with other trace data, since the passed-in flags value is stored
  *		stored in the **struct bpf_pcap_hdr** in the **flags** field.
+ *		Specifying **BPF_F_PCAP_ID_IIFINDEX** and a non-zero value in
+ *		the id portion of the flags limits capture events to skbs
+ *		with the specified incoming ifindex, allowing limiting of
+ *		tracing to the the associated interface.  Specifying
+ *		**BPF_F_PCAP_STRICT_TYPE** will cause *bpf_pcap* to return
+ *		-EPROTO and skip capture if a specific protocol is specified
+ *		and it does not match the current skb.  These additional flags
+ *		are only valid (and useful) for tracing programs.
  *
  *		The *protocol* value specifies the protocol type of the start
  *		of the packet so that packet capture can carry out
@@ -2780,7 +2790,12 @@ struct bpf_stack_build_id {
  *	Return
  *		0 on success, or a negative error in case of failure.
  *		-ENOENT will be returned if the associated perf event
- *		map entry is empty, or the skb is zero-length.
+ *		map entry is empty, the skb is zero-length,  or the incoming
+ *		ifindex was specified and we failed to match.
+ *		-EPROTO will be returned if **BPF_PCAP_TYPE_UNSET** is specified
+ *		and no protocol can be determined, or if we specify a protocol
+ *		along with **BPF_F_PCAP_STRICT_TYPE** and the skb protocol does
+ *		not match.
  *		-EINVAL will be returned if the flags value is invalid.
  *
  */
@@ -2977,6 +2992,8 @@ enum bpf_func_id {
 
 /* BPF_FUNC_pcap flags */
 #define	BPF_F_PCAP_ID_MASK		0xffffffffffff
+#define BPF_F_PCAP_ID_IIFINDEX		(1ULL << 48)
+#define BPF_F_PCAP_STRICT_TYPE         (1ULL << 56)
 
 /* Mode for BPF_FUNC_skb_adjust_room helper. */
 enum bpf_adj_room_mode {
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ca1255d..311883b 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -13,6 +13,8 @@
 #include <linux/kprobes.h>
 #include <linux/syscalls.h>
 #include <linux/error-injection.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
 
 #include <asm/tlb.h>
 
@@ -530,6 +532,216 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 	return __bpf_perf_event_output(regs, map, flags, sd);
 }
 
+/* Essentially just skb_copy_bits() using probe_kernel_read() where needed. */
+static unsigned long bpf_trace_skb_copy(void *tobuf, const void *from,
+					unsigned long offset,
+					unsigned long len)
+{
+	const struct sk_buff *frag_iterp, *skb = from;
+	struct skb_shared_info *shinfop, shinfo;
+	struct sk_buff frag_iter;
+	unsigned long copy, start;
+	void *to = tobuf;
+	int i, ret;
+
+	start = skb_headlen(skb);
+
+	copy = start - offset;
+	if (copy > 0) {
+		if (copy > len)
+			copy = len;
+		ret = probe_kernel_read(to, skb->data, copy);
+		if (unlikely(ret < 0))
+			goto out;
+		len -= copy;
+		if (len == 0)
+			return 0;
+		offset += copy;
+		to += copy;
+	}
+
+	if (skb->data_len == 0)
+		goto out;
+
+	shinfop = skb_shinfo(skb);
+
+	ret = probe_kernel_read(&shinfo, shinfop, sizeof(shinfo));
+	if (unlikely(ret < 0))
+		goto out;
+
+	if (shinfo.nr_frags > MAX_SKB_FRAGS) {
+		ret = -EINVAL;
+		goto out;
+	}
+	for (i = 0; i < shinfo.nr_frags; i++) {
+		skb_frag_t *f = &shinfo.frags[i];
+		int end;
+
+		if (start > offset + len) {
+			ret = -E2BIG;
+			goto out;
+		}
+
+		end = start + skb_frag_size(f);
+		copy = end - offset;
+		if (copy > 0) {
+			u32 poff, p_len, copied;
+			struct page *p;
+			u8 *vaddr;
+
+			if (copy > len)
+				copy = len;
+
+			skb_frag_foreach_page(f,
+					      skb_frag_off(f) + offset - start,
+					      copy, p, poff, p_len, copied) {
+
+				vaddr = kmap_atomic(p);
+				ret = probe_kernel_read(to + copied,
+							vaddr + poff, p_len);
+				kunmap_atomic(vaddr);
+
+				if (unlikely(ret < 0))
+					goto out;
+			}
+			len -= copy;
+			if (len == 0)
+				return 0;
+			offset += copy;
+			to += copy;
+		}
+		start = end;
+	}
+
+	for (frag_iterp = shinfo.frag_list; frag_iterp;
+	     frag_iterp = frag_iter.next) {
+		int end;
+
+		if (start > offset + len) {
+			ret = -E2BIG;
+			goto out;
+		}
+		ret = probe_kernel_read(&frag_iter, frag_iterp,
+					sizeof(frag_iter));
+		if (ret)
+			goto out;
+
+		end = start + frag_iter.len;
+		copy = end - offset;
+		if (copy > 0) {
+			if (copy > len)
+				copy = len;
+			ret = bpf_trace_skb_copy(to, &frag_iter,
+						offset - start,
+						copy);
+			if (ret)
+				goto out;
+
+			len -= copy;
+			if (len == 0)
+				return 0;
+			offset += copy;
+			to += copy;
+		}
+		start = end;
+	}
+out:
+	if (ret)
+		memset(tobuf, 0, len);
+
+	return ret;
+}
+
+/* Derive protocol for some of the easier cases.  For tracing, a probe point
+ * may be dealing with packets in various states. Common cases are IP
+ * packets prior to adding MAC header (_PCAP_TYPE_IP) and a full packet
+ * (_PCAP_TYPE_ETH).  For other cases the caller must specify the
+ * protocol they expect.  Other heuristics for packet identification
+ * should be added here as needed, since determining the packet type
+ * ensures we do not capture packets that fail to match the desired
+ * pcap type in BPF_F_PCAP_STRICT_TYPE mode.
+ */
+static inline int bpf_skb_protocol_get(struct sk_buff *skb)
+{
+	switch (htons(skb->protocol)) {
+	case ETH_P_IP:
+	case ETH_P_IPV6:
+		if (skb_network_header(skb) == skb->data)
+			return BPF_PCAP_TYPE_IP;
+		else
+			return BPF_PCAP_TYPE_ETH;
+	default:
+		return BPF_PCAP_TYPE_UNSET;
+	}
+}
+
+BPF_CALL_5(bpf_trace_pcap, void *, data, u32, size, struct bpf_map *, map,
+	   int, protocol_wanted, u64, flags)
+{
+	struct bpf_pcap_hdr pcap;
+	struct sk_buff skb;
+	int protocol;
+	int ret;
+
+	if (unlikely(flags & ~(BPF_F_PCAP_ID_IIFINDEX | BPF_F_PCAP_ID_MASK |
+			       BPF_F_PCAP_STRICT_TYPE)))
+		return -EINVAL;
+
+	ret = probe_kernel_read(&skb, data, sizeof(skb));
+	if (unlikely(ret < 0))
+		return ret;
+
+	/* Sanity check skb len in case we get bogus data. */
+	if (unlikely(!skb.len))
+		return -ENOENT;
+	if (unlikely(skb.len > GSO_MAX_SIZE || skb.data_len > skb.len))
+		return -E2BIG;
+
+	protocol = bpf_skb_protocol_get(&skb);
+
+	if (protocol_wanted == BPF_PCAP_TYPE_UNSET) {
+		/* If we cannot determine protocol type, bail. */
+		if (protocol == BPF_PCAP_TYPE_UNSET)
+			return -EPROTO;
+	} else {
+		/* if we determine protocol type, and it's not what we asked
+		 * for _and_ we are in strict mode, bail.  Otherwise we assume
+		 * the packet is the requested protocol type and drive on.
+		 */
+		if (flags & BPF_F_PCAP_STRICT_TYPE &&
+		    protocol != BPF_PCAP_TYPE_UNSET &&
+		    protocol != protocol_wanted)
+			return -EPROTO;
+		protocol = protocol_wanted;
+	}
+
+	/* If we specified a matching incoming ifindex, bail if not a match. */
+	if (flags & BPF_F_PCAP_ID_IIFINDEX) {
+		int iif = flags & BPF_F_PCAP_ID_MASK;
+
+		if (iif && skb.skb_iif != iif)
+			return -ENOENT;
+	}
+
+	ret = bpf_pcap_prepare(protocol, size, skb.len, flags, &pcap);
+	if (ret)
+		return ret;
+
+	return bpf_event_output(map, BPF_F_CURRENT_CPU, &pcap, sizeof(pcap),
+				&skb, pcap.cap_len, bpf_trace_skb_copy);
+}
+
+static const struct bpf_func_proto bpf_trace_pcap_proto = {
+	.func		= bpf_trace_pcap,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_ANYTHING,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_CONST_MAP_PTR,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_0(bpf_get_current_task)
 {
 	return (long) current;
@@ -709,6 +921,8 @@ static void do_bpf_send_signal(struct irq_work *entry)
 #endif
 	case BPF_FUNC_send_signal:
 		return &bpf_send_signal_proto;
+	case BPF_FUNC_pcap:
+		return &bpf_trace_pcap_proto;
 	default:
 		return NULL;
 	}
-- 
1.8.3.1


^ permalink raw reply related

* [RFC bpf-next 4/7] bpf: add libpcap feature test
From: Alan Maguire @ 2019-09-07 21:40 UTC (permalink / raw)
  To: ast, daniel, kafai, songliubraving, yhs, davem, jakub.kicinski,
	hawk, john.fastabend, rostedt, mingo, quentin.monnet, rdna, joe,
	acme, jolsa, alexey.budankov, gregkh, namhyung, sdf, f.fainelli,
	shuah, peter, ivan, andriin, bhole_prashant_q7, david.calavera,
	danieltimlee, ctakshak, netdev, bpf, linux-kselftest
  Cc: Alan Maguire
In-Reply-To: <1567892444-16344-1-git-send-email-alan.maguire@oracle.com>

this test will be used when deciding whether to add the pcap
support features in the following patch

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
---
 tools/build/Makefile.feature       |  2 ++
 tools/build/feature/Makefile       |  4 ++++
 tools/build/feature/test-libpcap.c | 26 ++++++++++++++++++++++++++
 3 files changed, 32 insertions(+)
 create mode 100644 tools/build/feature/test-libpcap.c

diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature
index 86b793d..35e65418 100644
--- a/tools/build/Makefile.feature
+++ b/tools/build/Makefile.feature
@@ -85,6 +85,7 @@ FEATURE_TESTS_EXTRA :=                  \
          libbfd-liberty                 \
          libbfd-liberty-z               \
          libopencsd                     \
+         libpcap                        \
          libunwind-x86                  \
          libunwind-x86_64               \
          libunwind-arm                  \
@@ -113,6 +114,7 @@ FEATURE_DISPLAY ?=              \
          libelf                 \
          libnuma                \
          numa_num_possible_cpus \
+         libpcap                \
          libperl                \
          libpython              \
          libcrypto              \
diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index 0658b8c..c7585a1 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -27,6 +27,7 @@ FILES=                                          \
          test-libelf-mmap.bin                   \
          test-libnuma.bin                       \
          test-numa_num_possible_cpus.bin        \
+         test-libpcap.bin                       \
          test-libperl.bin                       \
          test-libpython.bin                     \
          test-libpython-version.bin             \
@@ -209,6 +210,9 @@ FLAGS_PERL_EMBED=$(PERL_EMBED_CCOPTS) $(PERL_EMBED_LDOPTS)
 $(OUTPUT)test-libperl.bin:
 	$(BUILD) $(FLAGS_PERL_EMBED)
 
+$(OUTPUT)test-libpcap.bin:
+	$(BUILD) -lpcap
+
 $(OUTPUT)test-libpython.bin:
 	$(BUILD) $(FLAGS_PYTHON_EMBED)
 
diff --git a/tools/build/feature/test-libpcap.c b/tools/build/feature/test-libpcap.c
new file mode 100644
index 0000000..7f60eb9
--- /dev/null
+++ b/tools/build/feature/test-libpcap.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <pcap.h>
+
+#define PKTLEN 100
+
+int main(void)
+{
+	char dummy_data[PKTLEN] = { 0 };
+	pcap_dumper_t *pcap_dumper;
+	struct pcap_pkthdr hdr;
+	int proto = 1;
+	pcap_t *pcap;
+
+	pcap = pcap_open_dead(proto, PKTLEN);
+	pcap_dumper = pcap_dump_open(pcap, "-");
+	hdr.caplen = PKTLEN;
+	hdr.len = PKTLEN;
+	hdr.ts.tv_sec = 0;
+	hdr.ts.tv_usec = 0;
+	pcap_dump((u_char *)pcap_dumper, &hdr, (const u_char *)dummy_data);
+	pcap_dump_flush(pcap_dumper);
+	pcap_dump_close(pcap_dumper);
+	pcap_close(pcap);
+
+	return 0;
+}
-- 
1.8.3.1


^ permalink raw reply related

* [RFC bpf-next 3/7] bpf: sync tools/include/uapi/linux/bpf.h for pcap support
From: Alan Maguire @ 2019-09-07 21:40 UTC (permalink / raw)
  To: ast, daniel, kafai, songliubraving, yhs, davem, jakub.kicinski,
	hawk, john.fastabend, rostedt, mingo, quentin.monnet, rdna, joe,
	acme, jolsa, alexey.budankov, gregkh, namhyung, sdf, f.fainelli,
	shuah, peter, ivan, andriin, bhole_prashant_q7, david.calavera,
	danieltimlee, ctakshak, netdev, bpf, linux-kselftest
  Cc: Alan Maguire
In-Reply-To: <1567892444-16344-1-git-send-email-alan.maguire@oracle.com>

sync bpf.h updates for bpf_pcap helper and associated definitions

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
---
 tools/include/uapi/linux/bpf.h | 92 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 91 insertions(+), 1 deletion(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 77c6be9..13f86d3 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2750,6 +2750,54 @@ struct bpf_stack_build_id {
  *		**-EOPNOTSUPP** kernel configuration does not enable SYN cookies
  *
  *		**-EPROTONOSUPPORT** IP packet version is not 4 or 6
+ *
+ * int bpf_pcap(void *data, u32 size, struct bpf_map *map, int protocol,
+ *		u64 flags)
+ *	Description
+ *		Write packet data from *data* into a special BPF perf event
+ *              held by *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This
+ *		perf event has the same attributes as perf events generated
+ *		by bpf_perf_event_output.  For skb and xdp programs, *data*
+ *		is the relevant context, while for tracing programs,
+ *		*data* must be a pointer to a **struct sk_buff** derived
+ *		from kprobe or tracepoint arguments.
+ *
+ *		Metadata for this event is a **struct bpf_pcap_hdr**; this
+ *		contains the capture length, actual packet length and
+ *		the starting protocol.
+ *
+ *		The max number of bytes of context to store is specified via
+ *		*size*.
+ *
+ *		The flags value can be used to specify an id value of up
+ *		to 48 bits; the id can be used to correlate captured packets
+ *		with other trace data, since the passed-in flags value is stored
+ *		stored in the **struct bpf_pcap_hdr** in the **flags** field.
+ *		Specifying **BPF_F_PCAP_ID_IIFINDEX** and a non-zero value in
+ *		the id portion of the flags limits capture events to skbs
+ *		with the specified incoming ifindex, allowing limiting of
+ *		tracing to the the associated interface.  Specifying
+ *		**BPF_F_PCAP_STRICT_TYPE** will cause *bpf_pcap* to return
+ *		-EPROTO and skip capture if a specific protocol is specified
+ *		and it does not match the current skb.  These additional flags
+ *		are only valid (and useful) for tracing programs.
+ *
+ *		The *protocol* value specifies the protocol type of the start
+ *		of the packet so that packet capture can carry out
+ *		interpretation.  See **pcap-linktype** (7) for details on
+ *		the supported values.
+ *
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *		-ENOENT will be returned if the associated perf event
+ *		map entry is empty, the skb is zero-length,  or the incoming
+ *		ifindex was specified and we failed to match.
+ *		-EPROTO will be returned if **BPF_PCAP_TYPE_UNSET** is specified
+ *		and no protocol can be determined, or if we specify a protocol
+ *		along with **BPF_F_PCAP_STRICT_TYPE** and the skb protocol does
+ *		not match.
+ *		-EINVAL will be returned if the flags value is invalid.
+ *
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2862,7 +2910,8 @@ struct bpf_stack_build_id {
 	FN(sk_storage_get),		\
 	FN(sk_storage_delete),		\
 	FN(send_signal),		\
-	FN(tcp_gen_syncookie),
+	FN(tcp_gen_syncookie),		\
+	FN(pcap),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2941,6 +2990,11 @@ enum bpf_func_id {
 /* BPF_FUNC_sk_storage_get flags */
 #define BPF_SK_STORAGE_GET_F_CREATE	(1ULL << 0)
 
+/* BPF_FUNC_pcap flags */
+#define	BPF_F_PCAP_ID_MASK		0xffffffffffff
+#define BPF_F_PCAP_ID_IIFINDEX		(1ULL << 48)
+#define BPF_F_PCAP_STRICT_TYPE         (1ULL << 56)
+
 /* Mode for BPF_FUNC_skb_adjust_room helper. */
 enum bpf_adj_room_mode {
 	BPF_ADJ_ROOM_NET,
@@ -3613,4 +3667,40 @@ struct bpf_sockopt {
 	__s32	retval;
 };
 
+/* bpf_pcap_hdr contains information related to a particular packet capture
+ * flow.  It specifies
+ *
+ * - a magic number BPF_PCAP_MAGIC which identifies the perf event as
+ *   a pcap-related event.
+ * - a starting protocol is the protocol associated with the header
+ * - a flags value, copied from the flags value passed into bpf_pcap().
+ *   IDs can be used to correlate packet capture data and other tracing data.
+ *
+ * bpf_pcap_hdr also contains the information relating to the to-be-captured
+ * packet, and closely corresponds to the struct pcap_pkthdr used by
+ * pcap_dump (3PCAP).  The bpf_pcap helper sets ktime_ns (nanoseconds since
+ * boot) to the ktime_ns value; to get sensible pcap times this value should
+ * be converted to a struct timeval time since epoch in the struct pcap_pkthdr.
+ *
+ * When bpf_pcap() is used, a "struct bpf_pcap_hdr" is stored as we
+ * need both information about the particular packet and the protocol
+ * we are capturing.
+ */
+
+#define BPF_PCAP_MAGIC		0xb7fca7
+
+struct bpf_pcap_hdr {
+	__u32			magic;
+	int			protocol;
+	__u64			flags;
+	__u64			ktime_ns;
+	__u32			tot_len;
+	__u32			cap_len;
+	__u8			data[0];
+};
+
+#define BPF_PCAP_TYPE_UNSET	-1
+#define BPF_PCAP_TYPE_ETH	1
+#define	BPF_PCAP_TYPE_IP	12
+
 #endif /* _UAPI__LINUX_BPF_H__ */
-- 
1.8.3.1


^ permalink raw reply related

* [RFC bpf-next 1/7] bpf: add bpf_pcap() helper to simplify packet capture
From: Alan Maguire @ 2019-09-07 21:40 UTC (permalink / raw)
  To: ast, daniel, kafai, songliubraving, yhs, davem, jakub.kicinski,
	hawk, john.fastabend, rostedt, mingo, quentin.monnet, rdna, joe,
	acme, jolsa, alexey.budankov, gregkh, namhyung, sdf, f.fainelli,
	shuah, peter, ivan, andriin, bhole_prashant_q7, david.calavera,
	danieltimlee, ctakshak, netdev, bpf, linux-kselftest
  Cc: Alan Maguire
In-Reply-To: <1567892444-16344-1-git-send-email-alan.maguire@oracle.com>

bpf_pcap() simplifies packet capture for skb and XDP
BPF programs by creating a BPF perf event containing information
relevant for packet capture (protocol, actual/captured packet
size, time of capture, etc) along with the packet payload itself.
All of this is stored in a "struct bpf_pcap_hdr".

This header information can then be retrieved from the perf
event map and used by packet capture frameworks such as libpcap
to carry out packet capture.

skb and XDP programs currently deal in Ethernet-based traffic
exclusively, so should specify BPF_PCAP_TYPE_ETH or
BPF_PCAP_TYPE_UNSET.  The protocol parameter will be used
in a later commit.

Note that libpcap assumes times are relative to the epoch while
we record nanoseconds since boot; as a result any times need
to be normalized with respect to the boot time for libpcap
storage; sysinfo(2) can be used to retrieve boot time to normalize
values appropriately.

Example usage for a tc-bpf program:

struct bpf_map_def SEC("maps") pcap_map = {
	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size = sizeof(int),
	.value_size = sizeof(int),
	.max_entries = 1024,
};

SEC("cap")
int cap(struct __sk_buff *skb)
{
	bpf_pcap(skb, 1514, &pcap_map, BPF_PCAP_TYPE_ETH, 0);

	return TC_ACT_OK;
}

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
---
 include/linux/bpf.h      | 20 +++++++++++++
 include/uapi/linux/bpf.h | 75 +++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/bpf/verifier.c    |  4 ++-
 net/core/filter.c        | 67 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 164 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5b9d223..033c9cf 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1145,4 +1145,24 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
 }
 #endif /* CONFIG_INET */
 
+
+static inline int bpf_pcap_prepare(int protocol, u32 cap_len, u32 tot_len,
+				   u64 flags, struct bpf_pcap_hdr *pcap)
+{
+	if (protocol < 0 || pcap == NULL)
+		return -EINVAL;
+
+	pcap->magic = BPF_PCAP_MAGIC;
+	pcap->protocol = protocol;
+	pcap->flags = flags;
+
+	if (cap_len == 0 || tot_len < cap_len)
+		cap_len = tot_len;
+	pcap->cap_len = cap_len;
+	pcap->tot_len = tot_len;
+	pcap->ktime_ns = ktime_get_mono_fast_ns();
+
+	return 0;
+}
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 77c6be9..a27e58e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2750,6 +2750,39 @@ struct bpf_stack_build_id {
  *		**-EOPNOTSUPP** kernel configuration does not enable SYN cookies
  *
  *		**-EPROTONOSUPPORT** IP packet version is not 4 or 6
+ *
+ * int bpf_pcap(void *data, u32 size, struct bpf_map *map, int protocol,
+ *		u64 flags)
+ *	Description
+ *		Write packet data from *data* into a special BPF perf event
+ *              held by *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This
+ *		perf event has the same attributes as perf events generated
+ *		by bpf_perf_event_output.  For skb and xdp programs, *data*
+ *		is the relevant context.
+ *
+ *		Metadata for this event is a **struct bpf_pcap_hdr**; this
+ *		contains the capture length, actual packet length and
+ *		the starting protocol.
+ *
+ *		The max number of bytes of context to store is specified via
+ *		*size*.
+ *
+ *		The flags value can be used to specify an id value of up
+ *		to 48 bits; the id can be used to correlate captured packets
+ *		with other trace data, since the passed-in flags value is stored
+ *		stored in the **struct bpf_pcap_hdr** in the **flags** field.
+ *
+ *		The *protocol* value specifies the protocol type of the start
+ *		of the packet so that packet capture can carry out
+ *		interpretation.  See **pcap-linktype** (7) for details on
+ *		the supported values.
+ *
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *		-ENOENT will be returned if the associated perf event
+ *		map entry is empty, or the skb is zero-length.
+ *		-EINVAL will be returned if the flags value is invalid.
+ *
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2862,7 +2895,8 @@ struct bpf_stack_build_id {
 	FN(sk_storage_get),		\
 	FN(sk_storage_delete),		\
 	FN(send_signal),		\
-	FN(tcp_gen_syncookie),
+	FN(tcp_gen_syncookie),		\
+	FN(pcap),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2941,6 +2975,9 @@ enum bpf_func_id {
 /* BPF_FUNC_sk_storage_get flags */
 #define BPF_SK_STORAGE_GET_F_CREATE	(1ULL << 0)
 
+/* BPF_FUNC_pcap flags */
+#define	BPF_F_PCAP_ID_MASK		0xffffffffffff
+
 /* Mode for BPF_FUNC_skb_adjust_room helper. */
 enum bpf_adj_room_mode {
 	BPF_ADJ_ROOM_NET,
@@ -3613,4 +3650,40 @@ struct bpf_sockopt {
 	__s32	retval;
 };
 
+/* bpf_pcap_hdr contains information related to a particular packet capture
+ * flow.  It specifies
+ *
+ * - a magic number BPF_PCAP_MAGIC which identifies the perf event as
+ *   a pcap-related event.
+ * - a starting protocol is the protocol associated with the header
+ * - a flags value, copied from the flags value passed into bpf_pcap().
+ *   IDs can be used to correlate packet capture data and other tracing data.
+ *
+ * bpf_pcap_hdr also contains the information relating to the to-be-captured
+ * packet, and closely corresponds to the struct pcap_pkthdr used by
+ * pcap_dump (3PCAP).  The bpf_pcap helper sets ktime_ns (nanoseconds since
+ * boot) to the ktime_ns value; to get sensible pcap times this value should
+ * be converted to a struct timeval time since epoch in the struct pcap_pkthdr.
+ *
+ * When bpf_pcap() is used, a "struct bpf_pcap_hdr" is stored as we
+ * need both information about the particular packet and the protocol
+ * we are capturing.
+ */
+
+#define BPF_PCAP_MAGIC		0xb7fca7
+
+struct bpf_pcap_hdr {
+	__u32			magic;
+	int			protocol;
+	__u64			flags;
+	__u64			ktime_ns;
+	__u32			tot_len;
+	__u32			cap_len;
+	__u8			data[0];
+};
+
+#define BPF_PCAP_TYPE_UNSET	-1
+#define BPF_PCAP_TYPE_ETH	1
+#define	BPF_PCAP_TYPE_IP	12
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3fb5075..a33ed24 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3440,7 +3440,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 	case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
 		if (func_id != BPF_FUNC_perf_event_read &&
 		    func_id != BPF_FUNC_perf_event_output &&
-		    func_id != BPF_FUNC_perf_event_read_value)
+		    func_id != BPF_FUNC_perf_event_read_value &&
+		    func_id != BPF_FUNC_pcap)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_STACK_TRACE:
@@ -3527,6 +3528,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 	case BPF_FUNC_perf_event_read:
 	case BPF_FUNC_perf_event_output:
 	case BPF_FUNC_perf_event_read_value:
+	case BPF_FUNC_pcap:
 		if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
 			goto error;
 		break;
diff --git a/net/core/filter.c b/net/core/filter.c
index ed65636..e0e23ee 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4158,6 +4158,35 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
 	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
 };
 
+BPF_CALL_5(bpf_xdp_pcap, struct xdp_buff *, xdp, u32, size,
+	   struct bpf_map *, map, int, protocol, u64, flags)
+{
+	unsigned long len = (unsigned long)(xdp->data_end - xdp->data);
+	struct bpf_pcap_hdr pcap;
+	int ret;
+
+	if (unlikely(flags & ~BPF_F_PCAP_ID_MASK))
+		return -EINVAL;
+
+	ret = bpf_pcap_prepare(protocol, size, len, flags, &pcap);
+	if (ret)
+		return ret;
+
+	return bpf_event_output(map, BPF_F_CURRENT_CPU, &pcap, sizeof(pcap),
+				xdp->data, pcap.cap_len, bpf_xdp_copy);
+}
+
+static const struct bpf_func_proto bpf_xdp_pcap_proto = {
+	.func		= bpf_xdp_pcap,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_CONST_MAP_PTR,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
 {
 	return skb->sk ? sock_gen_cookie(skb->sk) : 0;
@@ -5926,6 +5955,34 @@ u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
 
 #endif /* CONFIG_INET */
 
+BPF_CALL_5(bpf_skb_pcap, struct sk_buff *, skb, u32, size,
+	   struct bpf_map *, map, int, protocol, u64, flags)
+{
+	struct bpf_pcap_hdr pcap;
+	int ret;
+
+	if (unlikely(flags & ~BPF_F_PCAP_ID_MASK))
+		return -EINVAL;
+
+	ret = bpf_pcap_prepare(protocol, size, skb->len, flags, &pcap);
+	if (ret)
+		return ret;
+
+	return bpf_event_output(map, BPF_F_CURRENT_CPU, &pcap, sizeof(pcap),
+				skb, pcap.cap_len, bpf_skb_copy);
+}
+
+static const struct bpf_func_proto bpf_skb_pcap_proto = {
+	.func		= bpf_skb_pcap,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_CONST_MAP_PTR,
+	.arg4_type      = ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
 bool bpf_helper_changes_pkt_data(void *func)
 {
 	if (func == bpf_skb_vlan_push ||
@@ -6075,6 +6132,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 		return &bpf_get_socket_uid_proto;
 	case BPF_FUNC_perf_event_output:
 		return &bpf_skb_event_output_proto;
+	case BPF_FUNC_pcap:
+		return &bpf_skb_pcap_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -6216,6 +6275,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 	case BPF_FUNC_tcp_gen_syncookie:
 		return &bpf_tcp_gen_syncookie_proto;
 #endif
+	case BPF_FUNC_pcap:
+		return &bpf_skb_pcap_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -6256,6 +6317,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 		return &bpf_tcp_check_syncookie_proto;
 	case BPF_FUNC_tcp_gen_syncookie:
 		return &bpf_tcp_gen_syncookie_proto;
+	case BPF_FUNC_pcap:
+		return &bpf_xdp_pcap_proto;
 #endif
 	default:
 		return bpf_base_func_proto(func_id);
@@ -6361,6 +6424,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 	case BPF_FUNC_skc_lookup_tcp:
 		return &bpf_skc_lookup_tcp_proto;
 #endif
+	case BPF_FUNC_pcap:
+		return &bpf_skb_pcap_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -6399,6 +6464,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 		return &bpf_get_smp_processor_id_proto;
 	case BPF_FUNC_skb_under_cgroup:
 		return &bpf_skb_under_cgroup_proto;
+	case BPF_FUNC_pcap:
+		return &bpf_skb_pcap_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
1.8.3.1


^ permalink raw reply related

* [RFC bpf-next 6/7] bpf: add documentation for bpftool pcap subcommand
From: Alan Maguire @ 2019-09-07 21:40 UTC (permalink / raw)
  To: ast, daniel, kafai, songliubraving, yhs, davem, jakub.kicinski,
	hawk, john.fastabend, rostedt, mingo, quentin.monnet, rdna, joe,
	acme, jolsa, alexey.budankov, gregkh, namhyung, sdf, f.fainelli,
	shuah, peter, ivan, andriin, bhole_prashant_q7, david.calavera,
	danieltimlee, ctakshak, netdev, bpf, linux-kselftest
  Cc: Alan Maguire
In-Reply-To: <1567892444-16344-1-git-send-email-alan.maguire@oracle.com>

Document supported "bpf pcap" subcommands.

"prog" is used to capture packets from already-loaded programs.
"trace" loads/atttaches tracing programs to capture packets.

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
---
 tools/bpf/bpftool/Documentation/bpftool-btf.rst    |   1 +
 tools/bpf/bpftool/Documentation/bpftool-cgroup.rst |   1 +
 .../bpf/bpftool/Documentation/bpftool-feature.rst  |   1 +
 tools/bpf/bpftool/Documentation/bpftool-map.rst    |   1 +
 tools/bpf/bpftool/Documentation/bpftool-net.rst    |   1 +
 tools/bpf/bpftool/Documentation/bpftool-pcap.rst   | 119 +++++++++++++++++++++
 tools/bpf/bpftool/Documentation/bpftool-perf.rst   |   1 +
 tools/bpf/bpftool/Documentation/bpftool-prog.rst   |   1 +
 tools/bpf/bpftool/Documentation/bpftool.rst        |   1 +
 9 files changed, 127 insertions(+)
 create mode 100644 tools/bpf/bpftool/Documentation/bpftool-pcap.rst

diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst
index 39615f8..54045f0 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst
@@ -235,4 +235,5 @@ SEE ALSO
 	**bpftool-cgroup**\ (8),
 	**bpftool-feature**\ (8),
 	**bpftool-net**\ (8),
+	**bpftool-pcap**\ (8),
 	**bpftool-perf**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
index 06a28b0..1df98e1 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
@@ -164,5 +164,6 @@ SEE ALSO
 	**bpftool-map**\ (8),
 	**bpftool-feature**\ (8),
 	**bpftool-net**\ (8),
+	**bpftool-pcap**\ (8),
 	**bpftool-perf**\ (8),
 	**bpftool-btf**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst
index 4d08f35..0f36ad8 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst
@@ -86,5 +86,6 @@ SEE ALSO
 	**bpftool-map**\ (8),
 	**bpftool-cgroup**\ (8),
 	**bpftool-net**\ (8),
+	**bpftool-pcap**\ (8),
 	**bpftool-perf**\ (8),
 	**bpftool-btf**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst
index 1c0f714..8408022 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -271,5 +271,6 @@ SEE ALSO
 	**bpftool-cgroup**\ (8),
 	**bpftool-feature**\ (8),
 	**bpftool-net**\ (8),
+	**bpftool-pcap**\ (8),
 	**bpftool-perf**\ (8),
 	**bpftool-btf**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst
index 8651b00..6bd24bb 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-net.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst
@@ -198,5 +198,6 @@ SEE ALSO
 	**bpftool-map**\ (8),
 	**bpftool-cgroup**\ (8),
 	**bpftool-feature**\ (8),
+	**bpftool-pcap**\ (8),
 	**bpftool-perf**\ (8),
 	**bpftool-btf**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool-pcap.rst b/tools/bpf/bpftool/Documentation/bpftool-pcap.rst
new file mode 100644
index 0000000..53ed226d
--- /dev/null
+++ b/tools/bpf/bpftool/Documentation/bpftool-pcap.rst
@@ -0,0 +1,119 @@
+================
+bpftool-pcap
+================
+-------------------------------------------------------------------------------
+tool for inspection and simple manipulation of eBPF progs
+-------------------------------------------------------------------------------
+
+:Manual section: 8
+
+SYNOPSIS
+========
+
+	**bpftool** [*OPTIONS*] **pcap** *COMMAND*
+
+	*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-f** | **--bpffs** } }
+
+	*COMMANDS* :=
+	{ **prog** | **trace** | **help** }
+
+PCAP COMMANDS
+=============
+
+|	**bpftool** **pcap** **prog **  *PROG* [{**data_out** *FILE* | **proto** *PROTOCOL* | **len** *MAXLEN* | **pages** *NUMPAGES*}]
+|	**bpftool** **pcap** **trace** [*OBJ*] *TRACE* [{**data_out** *FILE* | **proto** *PROTOCOL* | **len** *MAXLEN* | **dev** *DEVNAME* | **pages** *NUMPAGES*}]
+|	**bpftool** **pcap help**
+|
+|	*PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
+|	*PROTOCOL* := {
+|		**eth** | **ip** | **ieee_80211** | ... }
+|       *TRACE* := {
+|		**kprobe**|**tracepoint**:*probename*[:arg{1-4}] }
+
+
+DESCRIPTION
+===========
+	**bpftool pcap prog [*PROG*] *PROG* [{**data_out** *FILE* | **proto** *PROTOCOL* | **len** *MAXLEN* | **pages** *NUMPAGES*}]
+
+		  Capture packet data from perf event map associated with
+		  program specified.  By default capture data is displayed on
+		  stdout, but if a capture file is preferred the data_out FILE
+		  option can be used.  The link type (termed DLT_TYPE in
+		  libpcap) is assumed to be Ethernet if not explicitly
+		  specified via the **proto** option.
+
+		  Maximum capture length can be adjusted via the **len**
+		  option.
+
+		  To work with bpftool pcap, the associated BPF program must
+		  at least define a perf event map, but if config options
+		  (protocol, max len) are to be supported it should also
+		  provide an array map with a single value of at least
+		  *struct bpf_pcap_conf* size.
+
+	**bpftool** **pcap** **trace** [*OBJ*] *TRACE* [{**data_out** *FILE* | **proto** *PROTOCOL* | **len** *MAXLEN* | **dev** *DEV* | **pages** *NUMPAGES*}]
+
+		  Attach the specified program in *OBJ* or load a
+		  pre-existing BPF kprobe/tracepoint program capable
+		  of capturing packets.
+
+		  Trace specification is of the form
+
+			trace_type:probe[:arg]
+
+		  For example tracepoint:iwlwifi_dev_tx_tb:arg2 will
+		  capture packet data from the second argument to the
+		  iwlwifi_dev_tx_tb tracepoint.  *DEV* can be used to
+		  limit capture to a specific incoming interface.
+
+	**bpftool prog help**
+		  Print short help message.
+
+OPTIONS
+=======
+	-h, --help
+		  Print short generic help message (similar to **bpftool help**).
+
+	-V, --version
+		  Print version number (similar to **bpftool version**).
+
+	-j, --json
+		  Generate JSON output. For commands that cannot produce JSON, this
+		  option has no effect.
+
+	-p, --pretty
+		  Generate human-readable JSON output. Implies **-j**.
+
+	-f, --bpffs
+		  When showing BPF programs, show file names of pinned
+		  programs.
+
+	-m, --mapcompat
+		  Allow loading maps with unknown map definitions.
+
+	-n, --nomount
+		  Do not automatically attempt to mount any virtual file system
+		  (such as tracefs or BPF virtual file system) when necessary.
+
+	-d, --debug
+		  Print all logs available, even debug-level information. This
+		  includes logs from libbpf as well as from the verifier, when
+		  attempting to load programs.
+
+EXAMPLES
+========
+**# bpftool pcap trace tracepoint:net_dev_xmit:arg1 proto eth | tcpdump -r -**
+reading from file -, link-type EN10MB (Ethernet)
+00:16:49.150880 IP 10.11.12.13 > 10.11.12.14: ICMP echo reply, id 10519, seq 1, length 64
+
+SEE ALSO
+========
+	**bpf**\ (2),
+	**bpf-helpers**\ (7),
+	**bpftool**\ (8),
+	**bpftool-map**\ (8),
+	**bpftool-cgroup**\ (8),
+	**bpftool-feature**\ (8),
+	**bpftool-net**\ (8),
+	**bpftool-perf**\ (8),
+	**bpftool-btf**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
index e252bd0..d618bbd 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-perf.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
@@ -90,4 +90,5 @@ SEE ALSO
 	**bpftool-cgroup**\ (8),
 	**bpftool-feature**\ (8),
 	**bpftool-net**\ (8),
+	**bpftool-pcap**\ (8),
 	**bpftool-btf**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index 7a374b3..b4dd779 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -311,5 +311,6 @@ SEE ALSO
 	**bpftool-cgroup**\ (8),
 	**bpftool-feature**\ (8),
 	**bpftool-net**\ (8),
+	**bpftool-pcap**\ (8),
 	**bpftool-perf**\ (8),
 	**bpftool-btf**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst
index 6a9c52e..4126246 100644
--- a/tools/bpf/bpftool/Documentation/bpftool.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool.rst
@@ -80,5 +80,6 @@ SEE ALSO
 	**bpftool-cgroup**\ (8),
 	**bpftool-feature**\ (8),
 	**bpftool-net**\ (8),
+	**bpftool-pcap**\ (8),
 	**bpftool-perf**\ (8),
 	**bpftool-btf**\ (8)
-- 
1.8.3.1


^ permalink raw reply related

* [RFC bpf-next 5/7] bpf: add pcap support to bpftool
From: Alan Maguire @ 2019-09-07 21:40 UTC (permalink / raw)
  To: ast, daniel, kafai, songliubraving, yhs, davem, jakub.kicinski,
	hawk, john.fastabend, rostedt, mingo, quentin.monnet, rdna, joe,
	acme, jolsa, alexey.budankov, gregkh, namhyung, sdf, f.fainelli,
	shuah, peter, ivan, andriin, bhole_prashant_q7, david.calavera,
	danieltimlee, ctakshak, netdev, bpf, linux-kselftest
  Cc: Alan Maguire
In-Reply-To: <1567892444-16344-1-git-send-email-alan.maguire@oracle.com>

bpftool is enhanced to be able to both capture from existing skb/xdp
programs ("bpftool pcap prog") and to load tracing programs - including
built-in simple kprobe/raw_tracepoint programs.  The end result is
to have a way of dynamically tapping BPF programs, kernel functions
and tracepoints to capture packet data.

"bpftool pcap" support depends on libpcap library and headers presence,
hence the new feature test is used to check for these.

If present, "bpftool pcap" can be used to capture pcap perf event
data from the perf event map associated with a program.  For example,

$ bpftool pcap prog id 32 data_out /tmp/cap

...will capture perf event data from the BPF program with id 32,
storing it in the capture file /tmp/pcap.

bpftool looks for a perf event map associated with that program and
then captures packets from it in a loop until Ctrl^C is pressed.
By default stdout is used, so the following also works:

$ bpftool pcap prog id 32 | tcpdump -r -

Configuration can also be passed to pcap programs, provided they
define a single-element BPF_MAP_TYPE_ARRAY with value size
greater than "sizeof struct bpf_pcap_hdr".  Options include

 data_out FILE  packet capture file to use (stdout is default)
 proto PROTO    DLT_* type as per libpcap; by specifying a type
                BPF programs can query the map in-kernel and
                capture packets of that type.  A string
                or numeric value can be used.  It is set in the
                bpf_pcap_hdr associated with the configuration
                map as the "protocol" value.
 len MAXLEN     maximum capture length in bytes.  It is set in
                the bpf_pcap_hdr associated with the configuration
                map as the "cap_len" value.
 dev DEVICE     incoming interface.  Tracing will be restricted
                to skbs which have this incoming interface set.
                The flags associated with the bpf_pcap_hdr
                in the configuration map are adjusted to record
                the associated ifindex to limit tracing.

In addition to capturing from existing programs, it is possible
to load provided programs which trace kprobe entry and raw_tracepoints,
making the first four arguments of each available for tracing.
For example

$ bpftool pcap trace kprobe/ip_rcv proto ip | tcpdump -r -

...will load a provided kprobe program, set the configuration options
in its associated map and capture packets which the bpf_pcap()
helper identifies as of type IPv[4,6].  Similarly for tracepoints,

$ bpftool pcap trace tracepoint:net_dev_xmit:arg1 proto eth | tcpdump -r -

In this case we explicitly specify an argument (:arg1), but the
default assumption is the first argument is to be captured.

To achieve the built-in tracing capabilities, two BPF objects need
to be delivered with bpftool - bpftool_pcap_kprobe.o and
bpftool_pcap_tracepoint.o.  These are accordingly added to the
install target for bpftool.  Each contains a separate program for
extracting arg1, arg2, arg3 and arg4.  This may seem wasteful -
why not just have the arg number as a map parameter?  In practice
tracepoints fail to attach with that approach.

A question arises here.  First, if we deliver a kprobe program, won't
it only work for the specific kernel?  Just by dumb luck on my part
the program appears to dodge the kernel version check in libbpf by not
passing an explicit program type at load time.  That said, the
program does not reference any data structures outside of the context
provided (struct pt_regs *), so maybe there's something else going
on too?

Note that a user-defined tracing program can also be passed in,
and that program will be attached to the target probe in a similar
manner.  We first look for programs with "arg[1-4]" in the name
if an argnum is specified, otherwise we fall back to using the
first program.

$ bpftool pcap trace mytraceprog.o tracepoint:net_dev_xmit:arg1 \
	data_out /tmp/cap

bpftool looks for a BPF_MAP_TYPE_ARRAY containing one value of
size >= "struct bpf_pcap_hdr", and assumes that configuration
provided to the program should be set in that map.  This allows
the user to provide a maximum packet length, starting protocol
etc to tracing programs.

The idea behind providing packet capture/tracing functionality in
bpftool is to similify developer access to dynamic packet capture.
An alternative approach would be to provide libbpf interfaces, but
this would require linking libbpf with libpcap.

A possible approach would be to take the code from bpftool that
interacts with programs (to retrieve pcap-related maps and
set config) and move it to libbpf, but it may make sense to
start with the functionality in bpftool and see if other
consumers need/want it.

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
---
 tools/bpf/bpftool/Makefile                        |  39 +-
 tools/bpf/bpftool/main.c                          |   3 +-
 tools/bpf/bpftool/main.h                          |   1 +
 tools/bpf/bpftool/pcap.c                          | 496 ++++++++++++++++++++++
 tools/bpf/bpftool/progs/bpftool_pcap_kprobe.c     |  80 ++++
 tools/bpf/bpftool/progs/bpftool_pcap_tracepoint.c |  68 +++
 tools/testing/selftests/bpf/bpf_helpers.h         |  11 +
 7 files changed, 690 insertions(+), 8 deletions(-)
 create mode 100644 tools/bpf/bpftool/pcap.c
 create mode 100644 tools/bpf/bpftool/progs/bpftool_pcap_kprobe.c
 create mode 100644 tools/bpf/bpftool/progs/bpftool_pcap_tracepoint.c

diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index 39bc6f0..16c4104 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 include ../../scripts/Makefile.include
+include ../../scripts/Makefile.arch
 include ../../scripts/utilities.mak
 
 ifeq ($(srctree),)
@@ -61,8 +62,8 @@ INSTALL ?= install
 RM ?= rm -f
 
 FEATURE_USER = .bpftool
-FEATURE_TESTS = libbfd disassembler-four-args reallocarray zlib
-FEATURE_DISPLAY = libbfd disassembler-four-args zlib
+FEATURE_TESTS = libbfd disassembler-four-args reallocarray zlib libpcap
+FEATURE_DISPLAY = libbfd disassembler-four-args zlib libpcap
 
 check_feat := 1
 NON_CHECK_FEAT_TARGETS := clean uninstall doc doc-clean doc-install doc-uninstall
@@ -90,7 +91,14 @@ endif
 
 include $(wildcard $(OUTPUT)*.d)
 
-all: $(OUTPUT)bpftool
+ifeq ($(feature-libpcap),1)
+  LIBS += -lpcap
+  CFLAGS += -DHAVE_LIBPCAP_SUPPORT
+  BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c)))
+  BPF_SRCS = $(wildcard progs/*.c)
+endif
+
+all: $(OUTPUT)bpftool $(OUTPUT)$(BPF_OBJ_FILES)
 
 BFD_SRCS = jit_disasm.c
 
@@ -109,6 +117,18 @@ CFLAGS += -DHAVE_LIBBFD_SUPPORT
 SRCS += $(BFD_SRCS)
 endif
 
+CLANG           ?= clang
+LLC             ?= llc
+
+CLANG_SYS_INCLUDES := $(shell $(CLANG) -v -E - </dev/null 2>&1 \
+	| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
+
+CLANG_FLAGS = -I. -I$(srctree)/tools/include/uapi \
+	      -I$(srctree)/tools/testing/selftests/bpf \
+	      $(CLANG_SYS_INCLUDES) \
+	      -Wno-compare-distinct-pointer-types \
+	      -D__TARGET_ARCH_$(SRCARCH)
+
 OBJS = $(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o
 
 $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c
@@ -122,24 +142,29 @@ $(OUTPUT)bpftool: $(OBJS) $(LIBBPF)
 $(OUTPUT)%.o: %.c
 	$(QUIET_CC)$(COMPILE.c) -MMD -o $@ $<
 
+$(OUTPUT)$(BPF_OBJ_FILES): $(BPF_SRCS)
+	($(CLANG) $(CLANG_FLAGS) -O2 -target bpf -emit-llvm \
+		-c $(patsubst %.o,progs/%.c,$@) -o - || echo "clang failed") | \
+	$(LLC) -march=bpf -mcpu=$(CPU) $(LLC_FLAGS) -filetype=obj -o $@
+
 clean: $(LIBBPF)-clean
 	$(call QUIET_CLEAN, bpftool)
-	$(Q)$(RM) -- $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d
+	$(Q)$(RM) -- $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d $(OUTPUT)$(BPF_OBJ_FILES)
 	$(Q)$(RM) -r -- $(OUTPUT)libbpf/
 	$(call QUIET_CLEAN, core-gen)
 	$(Q)$(RM) -- $(OUTPUT)FEATURE-DUMP.bpftool
 	$(Q)$(RM) -r -- $(OUTPUT)feature/
 
-install: $(OUTPUT)bpftool
+install: $(OUTPUT)bpftool $(OUTPUT)$(BPF_OBJ_FILES)
 	$(call QUIET_INSTALL, bpftool)
 	$(Q)$(INSTALL) -m 0755 -d $(DESTDIR)$(prefix)/sbin
-	$(Q)$(INSTALL) $(OUTPUT)bpftool $(DESTDIR)$(prefix)/sbin/bpftool
+	$(Q)$(INSTALL) $(OUTPUT)bpftool $(OUTPUT)$(BPF_OBJ_FILES) $(DESTDIR)$(prefix)/sbin/
 	$(Q)$(INSTALL) -m 0755 -d $(DESTDIR)$(bash_compdir)
 	$(Q)$(INSTALL) -m 0644 bash-completion/bpftool $(DESTDIR)$(bash_compdir)
 
 uninstall:
 	$(call QUIET_UNINST, bpftool)
-	$(Q)$(RM) -- $(DESTDIR)$(prefix)/sbin/bpftool
+	$(Q)$(RM) -- $(DESTDIR)$(prefix)/sbin/bpftool*
 	$(Q)$(RM) -- $(DESTDIR)$(bash_compdir)/bpftool
 
 doc:
diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
index 93d0086..e7c7969 100644
--- a/tools/bpf/bpftool/main.c
+++ b/tools/bpf/bpftool/main.c
@@ -58,7 +58,7 @@ static int do_help(int argc, char **argv)
 		"       %s batch file FILE\n"
 		"       %s version\n"
 		"\n"
-		"       OBJECT := { prog | map | cgroup | perf | net | feature | btf }\n"
+		"       OBJECT := { prog | map | cgroup | perf | net | feature | btf | pcap }\n"
 		"       " HELP_SPEC_OPTIONS "\n"
 		"",
 		bin_name, bin_name, bin_name);
@@ -227,6 +227,7 @@ static int make_args(char *line, char *n_argv[], int maxargs, int cmd_nb)
 	{ "net",	do_net },
 	{ "feature",	do_feature },
 	{ "btf",	do_btf },
+	{ "pcap",	do_pcap },
 	{ "version",	do_version },
 	{ 0 }
 };
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index af9ad56..079409c 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -155,6 +155,7 @@ int cmd_select(const struct cmd *cmds, int argc, char **argv,
 int do_tracelog(int argc, char **arg);
 int do_feature(int argc, char **argv);
 int do_btf(int argc, char **argv);
+int do_pcap(int argc, char **argv);
 
 int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what);
 int prog_parse_fd(int *argc, char ***argv);
diff --git a/tools/bpf/bpftool/pcap.c b/tools/bpf/bpftool/pcap.c
new file mode 100644
index 0000000..ab18d1f
--- /dev/null
+++ b/tools/bpf/bpftool/pcap.c
@@ -0,0 +1,496 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */
+
+#include <assert.h>
+#include <bpf.h>
+#include <errno.h>
+#include <libbpf.h>
+#include <libgen.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/limits.h>
+#include <linux/perf_event.h>
+#include <linux/sysinfo.h>
+#include <net/if.h>
+#include <signal.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/resource.h>
+#include <sys/sysinfo.h>
+#include <time.h>
+
+#include "json_writer.h"
+#include "main.h"
+
+#ifdef HAVE_LIBPCAP_SUPPORT
+
+/* To avoid conflicting definitions of bpf_insn */
+#define PCAP_DONT_INCLUDE_PCAP_BPF_H
+#include <pcap.h>
+
+#include <perf-sys.h>
+
+#define	PCAP_MAX_MAPS	8
+#define	PCAP_PROTOCOL_DEFAULT	BPF_PCAP_TYPE_ETH
+#define PCAP_NUM_PAGES_DEFAULT	16
+#define PCAP_NUM_PAGES_MIN	8
+#define PCAP_MAX_LEN		65536
+#define	PCAP_FILE_STDOUT	"-"
+#define	PCAP_FILE_DEFAULT	PCAP_FILE_STDOUT
+#define NANOSEC			1000000000
+
+
+pcap_dumper_t *pcap_dumper;
+static bool flush = true;
+volatile bool stop;
+
+static __u64 boottime;		/* seconds since epoch at boot time. */
+
+static unsigned int proto_from_str(const char *proto_str)
+{
+	int proto;
+
+	/* Names for DLT_ ethernet (en10mb) and IP (raw) aren't obvious. */
+	if (strcmp(proto_str, "eth") == 0)
+		proto = BPF_PCAP_TYPE_ETH;
+	else if (strcmp(proto_str, "ip") == 0)
+		proto = BPF_PCAP_TYPE_IP;
+	else {
+		proto = pcap_datalink_name_to_val(proto_str);
+		if (proto == PCAP_ERROR) {
+			proto = strtol(proto_str, NULL, 10);
+			if (errno == ERANGE)
+				proto = -1;
+		}
+	}
+	return proto;
+}
+
+static int verify_map(int map_fd, enum bpf_map_type map_type,
+		      __u32 num_entries, __u32 min_value_size)
+{
+	__u32 info_len = sizeof(struct bpf_map_info);
+	struct bpf_map_info info;
+
+	if (!bpf_obj_get_info_by_fd(map_fd, &info, &info_len) &&
+	    info.type == map_type &&
+	    (!num_entries || info.max_entries == num_entries) &&
+	    (!min_value_size || info.value_size >= min_value_size))
+		return 0;
+	return -1;
+}
+
+static void int_exit(int signo)
+{
+	stop = true;
+}
+
+static void handle_pcap_event(void *ctx, int cpu, void *data, __u32 size)
+{
+	struct bpf_pcap_hdr *conf = ctx;
+	struct bpf_pcap_hdr *hdr = data;
+	struct pcap_pkthdr caphdr;
+
+	if (hdr->magic != BPF_PCAP_MAGIC)
+		return;
+
+	/* If we are looking for a specific protocol, and this isn't a
+	 * match, ignore.
+	 */
+	if (conf->protocol != BPF_PCAP_TYPE_UNSET &&
+	    conf->protocol != hdr->protocol)
+		return;
+
+	caphdr.len = hdr->tot_len;
+	caphdr.caplen = hdr->cap_len;
+	caphdr.ts.tv_sec = boottime + (hdr->ktime_ns/NANOSEC);
+	caphdr.ts.tv_usec = (hdr->ktime_ns % NANOSEC) / 1000;
+
+	pcap_dump((u_char *)pcap_dumper, &caphdr, hdr->data);
+	if (flush)
+		pcap_dump_flush(pcap_dumper);
+}
+
+static int handle_pcap(int data_map_fd, int conf_map_fd,
+		       struct bpf_pcap_hdr *conf, const char *pcap_filename,
+		       int pages, struct bpf_link *trace_link)
+{
+	struct perf_buffer_opts pb_opts = {};
+	struct perf_buffer *pb;
+	struct sysinfo info;
+	pcap_t *pcap;
+	int err;
+
+	if (signal(SIGHUP, int_exit) ||
+	    signal(SIGTERM, int_exit)) {
+		perror("signal");
+		return 1;
+	}
+	(void) signal(SIGINT, int_exit);
+
+	/* pcap expects time since epoch and bpf_pcap() records nanoseconds
+	 * since boot; get time of boot to add to pcap time to give a (rough)
+	 * time since epoch for capture event.
+	 */
+	if (sysinfo(&info) == 0)
+		boottime = time(NULL) - info.uptime;
+
+	pcap = pcap_open_dead(conf->protocol, conf->cap_len ?
+					      conf->cap_len : PCAP_MAX_LEN);
+	if (!pcap) {
+		perror("pcap_open");
+		return -1;
+	}
+	pcap_dumper = pcap_dump_open(pcap, pcap_filename);
+	if (!pcap_dumper) {
+		perror("pcap_dumper");
+		return -1;
+	}
+
+	pb_opts.sample_cb = handle_pcap_event;
+	pb_opts.ctx = conf;
+	pb = perf_buffer__new(data_map_fd, pages, &pb_opts);
+	if (libbpf_get_error(pb)) {
+		perror("perf_buffer setup failed");
+		return -1;
+	}
+
+	while (!stop) {
+		err = perf_buffer__poll(pb, 1000);
+		if (err < 0 && err != -EINTR) {
+			p_err("perf buffer polling failed: %s (%d)",
+			      strerror(err), err);
+			break;
+		}
+	}
+
+	/* detach program if we attached one. */
+	if (trace_link)
+		bpf_link__destroy(trace_link);
+	perf_buffer__free(pb);
+	close(data_map_fd);
+	if (conf_map_fd >= 0)
+		close(conf_map_fd);
+	if (pcap_dumper) {
+		pcap_dump_flush(pcap_dumper);
+		pcap_dump_close(pcap_dumper);
+	}
+	if (pcap)
+		pcap_close(pcap);
+
+	return 0;
+}
+
+static int handle_opts(int argc, char **argv,
+		       struct bpf_pcap_hdr *conf,
+		       const char **pcap_filename, int *pages)
+{
+	int conf_used = 0;
+
+	while (argc) {
+		if (!REQ_ARGS(2))
+			return -1;
+
+		if (is_prefix(*argv, "data_out")) {
+			NEXT_ARG();
+			*pcap_filename = *argv;
+			/* no need to flush to capture file if not stdout */
+			if (strcmp(*pcap_filename, PCAP_FILE_STDOUT) != 0)
+				flush = false;
+			NEXT_ARG();
+		} else if (is_prefix(*argv, "proto")) {
+			NEXT_ARG();
+			conf->protocol = proto_from_str(*argv);
+			if (conf->protocol == -1) {
+				p_err("unrecognized protocol %s", *argv);
+				return -1;
+			}
+			conf_used = 1;
+			NEXT_ARG();
+		} else if (is_prefix(*argv, "len")) {
+			NEXT_ARG();
+			conf->cap_len = atoi(*argv);
+			conf_used = 1;
+			NEXT_ARG();
+		} else if (is_prefix(*argv, "dev")) {
+			unsigned long iifindex;
+
+			NEXT_ARG();
+			iifindex = if_nametoindex(*argv);
+			if (!iifindex) {
+				p_err("no such device %s", *argv);
+				return -1;
+			}
+			conf->flags |= (BPF_F_PCAP_ID_IIFINDEX |
+					(iifindex & BPF_F_PCAP_ID_MASK));
+			conf_used = 1;
+			NEXT_ARG();
+		} else if (is_prefix(*argv, "pages")) {
+			NEXT_ARG();
+			*pages = atoi(*argv);
+			if (*pages < PCAP_NUM_PAGES_MIN) {
+				p_err("at least %d pages are required",
+				      PCAP_NUM_PAGES_MIN);
+				return -1;
+			}
+			NEXT_ARG();
+		} else {
+			p_err("unknown arg %s", *argv);
+			return -1;
+		}
+	}
+	return conf_used;
+}
+
+static int handle_conf_map(int conf_map_fd, struct bpf_pcap_hdr *conf)
+{
+	int key = 0;
+
+	if (bpf_map_update_elem(conf_map_fd, &key, conf, BPF_ANY)) {
+		p_err("could not populate config in map");
+		return -1;
+	}
+	return 0;
+}
+
+/* For the prog specified, the conf map is optional but the data map must
+ * be present to facilitate capture.
+ */
+static int prog_info(int prog_fd, enum bpf_prog_type *type,
+		     int *data_map_fd, int *conf_map_fd)
+{
+	__u32 info_len = sizeof(struct bpf_prog_info);
+	struct bpf_prog_info prog_info;
+	__u32 map_ids[PCAP_MAX_MAPS];
+	int map_fd;
+	__u32 i;
+
+	*data_map_fd = -1;
+	*conf_map_fd = -1;
+
+	/* Find data and (optionally) conf map associated with program. */
+	memset(&prog_info, 0, sizeof(prog_info));
+	prog_info.nr_map_ids = PCAP_MAX_MAPS;
+	prog_info.map_ids =  ptr_to_u64(map_ids);
+	if (bpf_obj_get_info_by_fd(prog_fd, &prog_info, &info_len) < 0) {
+		p_err("could not retrieve info for program");
+		return -1;
+	}
+	*type = prog_info.type;
+
+	for (i = 0; i < prog_info.nr_map_ids; i++) {
+		map_fd = bpf_map_get_fd_by_id(map_ids[i]);
+
+		if (!verify_map(map_fd,
+				BPF_MAP_TYPE_PERF_EVENT_ARRAY, 0, 0)) {
+			*data_map_fd = map_fd;
+			continue;
+		}
+		if (!verify_map(map_fd,
+				BPF_MAP_TYPE_ARRAY, 1,
+				sizeof(struct bpf_pcap_hdr)))
+			*conf_map_fd = map_fd;
+	}
+
+	/* For the prog specified, the conf map is optional but the data map
+	 * must be present to facilitate capture.
+	 */
+	if (*data_map_fd == -1) {
+		p_err("no perf event map associated with program");
+		return -1;
+	}
+	return 0;
+}
+
+static struct bpf_link *trace_attach(int prog_fd, enum bpf_prog_type prog_type,
+			struct bpf_program *prog, const char *trace)
+{
+	switch (prog_type) {
+	case BPF_PROG_TYPE_KPROBE:
+		return bpf_program__attach_kprobe(prog, false, trace);
+
+	case BPF_PROG_TYPE_RAW_TRACEPOINT:
+		return bpf_program__attach_raw_tracepoint(prog, trace);
+	default:
+		p_err("unexpected type; kprobes, raw tracepoints supported");
+		return NULL;
+	}
+}
+
+static int do_pcap_common(int argc, char **argv, int prog_fd,
+			  struct bpf_program *prog, char *trace)
+{
+	struct bpf_pcap_hdr conf = { .protocol = PCAP_PROTOCOL_DEFAULT,
+				     .cap_len = 0,
+				     .flags = 0 };
+	const char *pcap_filename = PCAP_FILE_DEFAULT;
+	int data_map_fd = -1, conf_map_fd = -1;
+	int pages = PCAP_NUM_PAGES_DEFAULT;
+	struct bpf_link *trace_link = NULL;
+	enum bpf_prog_type prog_type;
+	int conf_used;
+
+	if (prog_info(prog_fd, &prog_type, &data_map_fd, &conf_map_fd) < 0)
+		return -1;
+
+	conf_used = handle_opts(argc, argv, &conf, &pcap_filename, &pages);
+	switch (conf_used) {
+	case 0:
+		break;
+	case 1:
+		if (conf_map_fd < 0) {
+			p_err("no single-element BPF array map to store configuration found");
+			return -1;
+		}
+		break;
+	default:
+		return -1;
+	}
+
+	set_max_rlimit();
+
+	if (conf_map_fd >= 0 && handle_conf_map(conf_map_fd, &conf) < 0)
+		return -1;
+
+	if (trace && !prog) {
+		p_err("to specify trace option, '%s pcap load' must be used",
+		      bin_name);
+		return -1;
+	}
+	if (trace) {
+		trace_link = trace_attach(prog_fd, prog_type, prog, trace);
+		if (IS_ERR(trace_link))
+			return -1;
+	}
+
+	return handle_pcap(data_map_fd, conf_map_fd, &conf, pcap_filename,
+			   pages, trace_link);
+}
+
+static int do_pcap_trace(int argc, char **argv)
+{
+	char trace_prog[PATH_MAX], trace[PATH_MAX], trace_type[PATH_MAX];
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	int trace_argnum = 1;
+	char argstr[8];
+	int prog_fd;
+
+	if (!REQ_ARGS(1))
+		return -1;
+
+	trace_prog[0] = '\0';
+
+	/* Optional trace program; if not specified we load a builtin program
+	 * based on probe prefix (kprobe|tracepoint).
+	 */
+	if (strcmp(*argv + strlen(*argv) - 2, ".o") == 0) {
+		strncpy(trace_prog, *argv, sizeof(trace_prog));
+		if (!REQ_ARGS(2))
+			return -1;
+		NEXT_ARG();
+	}
+
+	if (sscanf(*argv, "%[^:]:%[^:]:arg%d", trace_type, trace, &trace_argnum)
+	    != 3 &&
+	    sscanf(*argv, "%[^:]:%[^:]", trace_type, trace) != 2) {
+		p_err("expected '[kprobe|tracepoint]:PROBENAME[:arg[1-4]]'");
+		return -1;
+	}
+	if (strcmp(trace_type, "kprobe") != 0 &&
+	    strcmp(trace_type, "tracepoint") != 0) {
+		p_err("invalid trace type %s, expected '[kprobe|tracepoint]:PROBENAME[:arg[1-4]]'",
+		      trace_type);
+		return -1;
+	}
+	if (trace_argnum < 1 || trace_argnum > 4) {
+		p_err("'arg%d' invalid, expected '[kprobe|tracepoint]:PROBENAME[:arg[1-4]]'",
+		      trace_argnum);
+		return -1;
+	}
+	NEXT_ARG();
+
+	if (strlen(trace_prog) == 0) {
+		char bin_path[PATH_MAX];
+
+		/* derive path of currently-executing command; BPF programs will
+		 * be in the same directory, with suffix based on trace type.
+		 */
+		if (readlink("/proc/self/exe", bin_path, sizeof(bin_path)) <= 0)
+			return -1;
+		snprintf(trace_prog, sizeof(trace_prog), "%s_pcap_%s.o",
+			 bin_path, trace_type);
+	}
+
+	if (bpf_prog_load(trace_prog, BPF_PROG_TYPE_UNSPEC, &obj, &prog_fd) < 0)
+		return -1;
+
+	snprintf(argstr, sizeof(argstr), "arg%d", trace_argnum);
+
+	bpf_object__for_each_program(prog, obj) {
+		if (strstr(bpf_program__title(prog, false), argstr))
+			break;
+	}
+	/* No argnum-specific program, fall back to first program. */
+	if (!prog)
+		prog = bpf_program__next(NULL, obj);
+	if (!prog) {
+		p_err("could not get program");
+		return -1;
+	}
+
+	return do_pcap_common(argc, argv, prog_fd, prog, trace);
+}
+
+static int do_pcap_prog(int argc, char **argv)
+{
+	int prog_fd;
+
+	prog_fd = prog_parse_fd(&argc, &argv);
+	if (prog_fd == -1)
+		return -1;
+
+	return do_pcap_common(argc, argv, prog_fd, NULL, NULL);
+}
+
+static int do_help(int argc, char **argv)
+{
+	if (json_output) {
+		jsonw_null(json_wtr);
+		return 0;
+	}
+	fprintf(stderr,
+		"        %s %s prog {id ID | pinned PATH }\n"
+		"              [data_out FILE] [proto PROTOCOL] [len MAXLEN]\n"
+		"              [pages NUMPAGES]\n"
+		"        %s %s trace [OBJ] {kprobe|tracepoint}:probename[:arg[1-4]]]\n"
+		"              [data_out FILE] [proto PROTOCOL] [len MAXLEN]\n"
+		"              [dev DEVICE] [pages NUMPAGES]\n"
+		"        %s %s help\n",
+		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]);
+
+	return 0;
+}
+
+static const struct cmd cmds[] = {
+	{ "prog",		do_pcap_prog },
+	{ "trace",		do_pcap_trace },
+	{ "help",		do_help },
+	{ 0 }
+};
+
+#endif /* HAVE_LIBPCAP_SUPPORT */
+
+int do_pcap(int argc, char **argv)
+{
+#ifdef HAVE_LIBPCAP_SUPPORT
+	return cmd_select(cmds, argc, argv, do_help);
+#else
+	p_err("pcap support was not compiled into bpftool as libpcap\n"
+	      "and associated headers were not available at build time.\n");
+	return -1;
+#endif /* HAVE_LIBPCAP_SUPPORT */
+}
diff --git a/tools/bpf/bpftool/progs/bpftool_pcap_kprobe.c b/tools/bpf/bpftool/progs/bpftool_pcap_kprobe.c
new file mode 100644
index 0000000..00a945d
--- /dev/null
+++ b/tools/bpf/bpftool/progs/bpftool_pcap_kprobe.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */
+
+#include <stddef.h>
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+
+#include <bpf_helpers.h>
+
+#define KBUILD_MODNAME "foo"
+
+struct bpf_map_def SEC("maps") pcap_data_map = {
+	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 1024,
+};
+
+struct bpf_map_def SEC("maps") pcap_conf_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(struct bpf_pcap_hdr),
+	.max_entries = 1,
+};
+
+static __always_inline int kprobe_pcap(struct pt_regs *ctx, int argnum)
+{
+	struct bpf_pcap_hdr *conf;
+	int key = 0;
+
+	conf = bpf_map_lookup_elem(&pcap_conf_map, &key);
+	if (!conf)
+		return 0;
+
+	switch (argnum) {
+	case 1:
+		bpf_pcap((void *)PT_REGS_PARM1(ctx), conf->cap_len,
+			 &pcap_data_map, conf->protocol, conf->flags);
+		return 0;
+	case 2:
+		bpf_pcap((void *)PT_REGS_PARM2(ctx), conf->cap_len,
+			 &pcap_data_map, conf->protocol, conf->flags);
+		return 0;
+	case 3:
+		bpf_pcap((void *)PT_REGS_PARM3(ctx), conf->cap_len,
+			 &pcap_data_map, conf->protocol, conf->flags);
+		return 0;
+	case 4:
+		bpf_pcap((void *)PT_REGS_PARM4(ctx), conf->cap_len,
+			 &pcap_data_map, conf->protocol, conf->flags);
+		return 0;
+	}
+	return 0;
+}
+
+SEC("kprobe/pcap_arg1")
+int pcap_arg1(struct pt_regs *ctx)
+{
+	return kprobe_pcap(ctx, 1);
+}
+
+SEC("kprobe/pcap_arg2")
+int pcap_arg2(struct pt_regs *ctx)
+{
+	return kprobe_pcap(ctx, 2);
+}
+
+SEC("kprobe/pcap_arg3")
+int pcap_arg3(struct pt_regs *ctx)
+{
+	return kprobe_pcap(ctx, 3);
+}
+
+SEC("kprobe/pcap_arg4")
+int pcap_arg4(struct pt_regs *ctx)
+{
+	return kprobe_pcap(ctx, 4);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/bpf/bpftool/progs/bpftool_pcap_tracepoint.c b/tools/bpf/bpftool/progs/bpftool_pcap_tracepoint.c
new file mode 100644
index 0000000..639806a
--- /dev/null
+++ b/tools/bpf/bpftool/progs/bpftool_pcap_tracepoint.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */
+
+#include <stddef.h>
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+
+#include <bpf_helpers.h>
+
+#define KBUILD_MODNAME "foo"
+
+struct bpf_map_def SEC("maps") pcap_data_map = {
+	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 1024,
+};
+
+struct bpf_map_def SEC("maps") pcap_conf_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(struct bpf_pcap_hdr),
+	.max_entries = 1,
+};
+
+/* To attach to raw tracepoints, we need one program for each arg choice 1-4.
+ * Otherwise attach fails.
+ */
+static __always_inline int trace_pcap(struct bpf_raw_tracepoint_args *ctx,
+				      int argnum)
+{
+	struct bpf_pcap_hdr *conf;
+	int ret, key = 0;
+
+	conf = bpf_map_lookup_elem(&pcap_conf_map, &key);
+	if (!conf)
+		return 0;
+
+	bpf_pcap((void *)ctx->args[argnum], conf->cap_len,
+		 &pcap_data_map, conf->protocol, conf->flags);
+	return 0;
+}
+
+SEC("raw_tracepoint/pcap_arg1")
+int trace_arg1(struct bpf_raw_tracepoint_args *ctx)
+{
+	return trace_pcap(ctx, 0);
+}
+
+SEC("raw_tracepoint/pcap_arg2")
+int trace_arg2(struct bpf_raw_tracepoint_args *ctx)
+{
+	return trace_pcap(ctx, 1);
+}
+
+SEC("raw_tracepoint/pcap_arg3")
+int trace_arg3(struct bpf_raw_tracepoint_args *ctx)
+{
+	return trace_pcap(ctx, 2);
+}
+
+SEC("raw_tracepoint/pcap_arg4")
+int trace_arg4(struct bpf_raw_tracepoint_args *ctx)
+{
+	return trace_pcap(ctx, 3);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 6c4930b..2a61126 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -231,6 +231,9 @@ static int (*bpf_sk_storage_delete)(void *map, struct bpf_sock *sk) =
 static long long (*bpf_tcp_gen_syncookie)(struct bpf_sock *sk, void *ip,
 					  int ip_len, void *tcp, int tcp_len) =
 	(void *) BPF_FUNC_tcp_gen_syncookie;
+static int (*bpf_pcap)(void *data, __u32 size, void *map, int protocol,
+		       __u64 flags) =
+	(void *) BPF_FUNC_pcap;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
@@ -520,8 +523,16 @@ static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode,
  * actual field offset, based on target kernel BTF type that matches original
  * (local) BTF, used to record relocation.
  */
+#ifdef __builtin_preserve_access_index
 #define BPF_CORE_READ(dst, src)						\
 	bpf_probe_read((dst), sizeof(*(src)),				\
 		       __builtin_preserve_access_index(src))
 
+#else
+
+#define	BPF_CORE_READ(dst, src)						\
+	bpf_probe_read((dst), sizeof(*(src)), src)
+
+#endif /* __builtin_preserve_access_index */
+
 #endif
-- 
1.8.3.1


^ permalink raw reply related

* [RFC bpf-next 7/7] bpf: add tests for bpftool packet capture
From: Alan Maguire @ 2019-09-07 21:40 UTC (permalink / raw)
  To: ast, daniel, kafai, songliubraving, yhs, davem, jakub.kicinski,
	hawk, john.fastabend, rostedt, mingo, quentin.monnet, rdna, joe,
	acme, jolsa, alexey.budankov, gregkh, namhyung, sdf, f.fainelli,
	shuah, peter, ivan, andriin, bhole_prashant_q7, david.calavera,
	danieltimlee, ctakshak, netdev, bpf, linux-kselftest
  Cc: Alan Maguire
In-Reply-To: <1567892444-16344-1-git-send-email-alan.maguire@oracle.com>

add tests which verify packet capture works for tracing of
kprobes and raw tracepoints, and for capturing packets from
existing skb/xdp programs.

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
---
 tools/testing/selftests/bpf/Makefile               |   3 +-
 .../testing/selftests/bpf/progs/bpftool_pcap_tc.c  |  41 +++++++
 .../testing/selftests/bpf/progs/bpftool_pcap_xdp.c |  39 ++++++
 tools/testing/selftests/bpf/test_bpftool_pcap.sh   | 132 +++++++++++++++++++++
 4 files changed, 214 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/progs/bpftool_pcap_tc.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpftool_pcap_xdp.c
 create mode 100755 tools/testing/selftests/bpf/test_bpftool_pcap.sh

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 7f3196a..1e8b68d 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -66,7 +66,8 @@ TEST_PROGS := test_kmod.sh \
 	test_tc_tunnel.sh \
 	test_tc_edt.sh \
 	test_xdping.sh \
-	test_bpftool_build.sh
+	test_bpftool_build.sh \
+	test_bpftool_pcap.sh
 
 TEST_PROGS_EXTENDED := with_addr.sh \
 	with_tunnels.sh \
diff --git a/tools/testing/selftests/bpf/progs/bpftool_pcap_tc.c b/tools/testing/selftests/bpf/progs/bpftool_pcap_tc.c
new file mode 100644
index 0000000..b51f8fc
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpftool_pcap_tc.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */
+
+#include <stddef.h>
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+#include <linux/pkt_cls.h>
+
+#include <bpf_helpers.h>
+
+#define KBUILD_MODNAME "foo"
+
+struct bpf_map_def SEC("maps") pcap_data_map = {
+	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 1024,
+};
+
+struct bpf_map_def SEC("maps") pcap_conf_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(struct bpf_pcap_hdr),
+	.max_entries = 1,
+};
+
+SEC("tc_pcap")
+int tc_pcap(struct __sk_buff *skb)
+{
+	struct bpf_pcap_hdr *conf;
+	int key = 0;
+
+	conf = bpf_map_lookup_elem(&pcap_conf_map, &key);
+	if (!conf)
+		return 0;
+
+	bpf_pcap(skb, conf->cap_len, &pcap_data_map, conf->protocol,
+		 conf->flags);
+
+	return TC_ACT_OK;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpftool_pcap_xdp.c b/tools/testing/selftests/bpf/progs/bpftool_pcap_xdp.c
new file mode 100644
index 0000000..a7d6866
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpftool_pcap_xdp.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */
+
+#include <stddef.h>
+#include <linux/bpf.h>
+
+#include <bpf_helpers.h>
+
+#define KBUILD_MODNAME "foo"
+
+struct bpf_map_def SEC("maps") pcap_data_map = {
+	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 1024,
+};
+
+struct bpf_map_def SEC("maps") pcap_conf_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(struct bpf_pcap_hdr),
+	.max_entries = 1,
+};
+
+SEC("xdp_pcap")
+int xdp_pcap(struct xdp_md *ctx)
+{
+	struct bpf_pcap_hdr *conf;
+	int key = 0;
+
+	conf = bpf_map_lookup_elem(&pcap_conf_map, &key);
+	if (!conf)
+		return 0;
+
+	bpf_pcap(ctx, conf->cap_len, &pcap_data_map, conf->protocol,
+		 conf->flags);
+
+	return XDP_PASS;
+}
diff --git a/tools/testing/selftests/bpf/test_bpftool_pcap.sh b/tools/testing/selftests/bpf/test_bpftool_pcap.sh
new file mode 100755
index 0000000..92b5438
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_bpftool_pcap.sh
@@ -0,0 +1,132 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
+
+readonly src="../../../../"
+readonly bpftool="${src}/tools/bpf/bpftool/bpftool"
+readonly capfile="/tmp/cap.$$"
+readonly ns="ns-$$"
+readonly badport="5555"
+readonly addr1="192.168.1.1"
+readonly addr2="192.168.1.2"
+readonly pinpath="/sys/fs/bpf/"
+readonly veth1="${ns}-veth1"
+readonly veth2="${ns}-veth2"
+# 24 bytes for the pcap header
+readonly cap_minsize=24
+readonly caplens="0 8192"
+readonly addrs="127.0.0.1 ::1"
+readonly devs="none lo"
+
+cleanup() {
+  iptables -D INPUT -p tcp --dport $badport -j DROP
+  ip6tables -D INPUT -p tcp --dport $badport -j DROP
+  ip netns del $ns 2>/dev/null
+  rm -f $capfile
+}
+
+verify_capture() {
+  capsize=$(stat -c '%s' $capfile)
+  if [[ $capsize -le $cap_minsize ]]; then
+    exit 1
+  fi
+  if [[ $no_tcpdump == 0 ]]; then
+    count=$(tcpdump -lnr $capfile $1 2>/dev/null)
+    if [[ -z "$count" ]]; then
+      exit 1
+    fi
+  fi
+}
+
+which tcpdump 2>&1 > /dev/null
+no_tcpdump=$?
+
+pcap_supported=$(bpftool pcap help >/dev/null 2>&1)
+if [[ $? -ne 0 ]]; then
+	echo "no pcap support in bpftool, cannot test feature."
+	exit 0
+fi
+
+set -e
+
+trap cleanup EXIT
+
+iptables -A INPUT -p tcp --dport $badport -j DROP
+ip6tables -A INPUT -p tcp --dport $badport -j DROP
+
+# Test "bpftool pcap trace" - kprobe, tracepoint tracing
+for probe in kprobe tracepoint; do
+  for dev in $devs; do
+    devarg=
+    if [[ $dev != "none" ]]; then
+      devarg="dev $dev"
+    fi
+    args="$probe:kfree_skb proto ip data_out $capfile $devarg"
+    echo "Test trace $args"
+    for caplen in $caplens ; do
+      for progname in none $probe ; do
+        progpath=
+        if [[ $progname != "none" ]]; then
+          progpath=${bpftool}_pcap_${probe}.o
+        fi
+        allargs="$progpath $args len $caplen"
+        for addr in $addrs ; do
+          $bpftool pcap trace $allargs &
+          bpftool_pid=$!
+          set +e
+          timeout 2 nc $addr $badport 2>/dev/null
+          kill -TERM $bpftool_pid
+          set -e
+          sleep 1
+          verify_capture "host $addr and port $badport"
+          rm -f $capfile
+        done
+      done
+    done
+    echo "Test trace $args: PASS"
+  done
+done
+
+# Test "bpftool pcap prog" - skb, xdp program tracing
+ip netns add $ns
+ip link add dev $veth2 netns $ns type veth peer name $veth1
+ip link set $veth1 up
+ip addr add ${addr1}/24 dev $veth1
+ip -netns $ns link set $veth2 up
+ip netns exec $ns ip addr add ${addr2}/24 dev $veth2
+
+for prog in tc xdp ; do
+  if [[ $prog == tc ]]; then
+    ip netns exec $ns tc qdisc add dev $veth2 clsact
+    ip netns exec $ns tc filter add dev $veth2 ingress bpf da \
+      obj bpftool_pcap_${prog}.o sec ${prog}_pcap
+    id=$(ip netns exec $ns tc filter show dev $veth2 ingress | \
+         awk '/direct-action/ { for(i=1;i<=NF;i++)if($i=="id")print $(i+1)}')
+  else
+    ip netns exec $ns ip link set dev $veth2 xdp obj bpftool_pcap_${prog}.o \
+      sec ${prog}_pcap
+    id=$(ip netns exec $ns ip link show $veth2 | awk '/prog\/xdp/ { print $3 }')
+    sleep 5
+  fi
+  args="id $id data_out $capfile"
+  echo "Test prog $args"
+  for caplen in $caplens ; do
+    allargs="$args len $caplen"
+    $bpftool pcap prog $allargs &
+    bpftool_pid=$!
+    set +e
+    ping -q -c 5 $addr2 1>/dev/null
+    kill -TERM $bpftool_pid
+    set -e
+    sleep 1
+    verify_capture "host $addr1"
+    rm -f $capfile
+  done
+  if [[ $prog == tc ]]; then
+    ip netns exec $ns tc qdisc del dev $veth2 clsact
+    sleep 1
+  else
+    ip netns exec $ns ip link set dev $veth2 xdp off
+  fi
+  echo "Test trace $args: PASS"
+done
-- 
1.8.3.1


^ permalink raw reply related

* Re: [PATCH net-next 2/3] net: dsa: mv88e6xxx: introduce .port_set_policy
From: Marek Behun @ 2019-09-07 23:54 UTC (permalink / raw)
  To: Vivien Didelot; +Cc: netdev, davem, f.fainelli, andrew
In-Reply-To: <20190907200049.25273-3-vivien.didelot@gmail.com>

On Sat,  7 Sep 2019 16:00:48 -0400
Vivien Didelot <vivien.didelot@gmail.com> wrote:

> @@ -3132,6 +3132,7 @@ static const struct mv88e6xxx_ops mv88e6172_ops = {
>  	.port_set_rgmii_delay = mv88e6352_port_set_rgmii_delay,
>  	.port_set_speed = mv88e6352_port_set_speed,
>  	.port_tag_remap = mv88e6095_port_tag_remap,
> +	.port_set_policy = mv88e6352_port_set_policy,
>  	.port_set_frame_mode = mv88e6351_port_set_frame_mode,
>  	.port_set_egress_floods = mv88e6352_port_set_egress_floods,
>  	.port_set_ether_type = mv88e6351_port_set_ether_type,

Topaz also supports this, 6141 and 6341.

^ permalink raw reply

* Re: general protection fault in dev_map_hash_update_elem
From: syzbot @ 2019-09-08  1:59 UTC (permalink / raw)
  To: alexei.starovoitov, ast, bpf, daniel, davem, hawk, jakub.kicinski,
	jbrouer, john.fastabend, kafai, linux-kernel, netdev,
	songliubraving, syzkaller-bugs, toke, yhs
In-Reply-To: <0000000000005091a70591d3e1d9@google.com>

syzbot has found a reproducer for the following crash on:

HEAD commit:    a2c11b03 kcm: use BPF_PROG_RUN
git tree:       bpf-next
console output: https://syzkaller.appspot.com/x/log.txt?x=13d46ec1600000
kernel config:  https://syzkaller.appspot.com/x/.config?x=cf0c85d15c20ade3
dashboard link: https://syzkaller.appspot.com/bug?extid=4e7a85b1432052e8d6f8
compiler:       gcc (GCC) 9.0.0 20181231 (experimental)
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=1220b2d1600000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=1360b26e600000

Bisection is inconclusive: the first bad commit could be any of:

116e7dbe Merge branch 'gen-syn-cookie'
91bc3578 selftests/bpf: add test for bpf_tcp_gen_syncookie
637f71c0 selftests/bpf: bpf_tcp_gen_syncookie->bpf_helpers
bf8ff0f8 selftests/bpf: fix clearing buffered output between tests/subtests
3745ee18 bpf: sync bpf.h to tools/
a98bf573 tools: bpftool: add support for reporting the effective cgroup  
progs
70d66244 bpf: add bpf_tcp_gen_syncookie helper
9babe825 bpf: always allocate at least 16 bytes for setsockopt hook
9349d600 tcp: add skb-less helpers to retrieve SYN cookie
fd5ef31f selftests/bpf: extend sockopt_sk selftest with TCP_CONGESTION use  
case
02bc2b64 Merge branch 'setsockopt-extra-mem'
96511278 tcp: tcp_syn_flood_action read port from socket
a78d0dbe selftests/bpf: add loop test 4
d3406913 Merge branch 'devmap_hash'
1375dc4a tools: Add definitions for devmap_hash map type
8c303960 selftests/bpf: add loop test 5
946152b3 selftests/bpf: test_progs: switch to open_memstream
e4234619 tools/libbpf_probes: Add new devmap_hash type
10fbe211 tools/include/uapi: Add devmap_hash BPF map type
66bd2ec1 selftests/bpf: test_progs: test__printf -> printf
16e910d4 selftests/bpf: test_progs: drop extra trailing tab
6f9d451a xdp: Add devmap_hash map type for looking up devices by hashed  
index
682cdbdc Merge branch 'test_progs-stdio'
fca16e51 xdp: Refactor devmap allocation code for reuse
6dbff13c include/bpf.h: Remove map_insert_ctx() stubs
ef20a9b2 libbpf: add helpers for working with BTF types
475e31f8 Merge branch 'revamp-test_progs'
b03bc685 libbpf: convert libbpf code to use new btf helpers
4cedc0da libbpf: add .BTF.ext offset relocation section loading
b207edfe selftests/bpf: convert send_signal.c to use subtests
51436ed7 selftests/bpf: convert bpf_verif_scale.c to sub-tests API
ddc7c304 libbpf: implement BPF CO-RE offset relocation algorithm
2dc26d5a selftests/bpf: add BPF_CORE_READ relocatable read macro
3a516a0a selftests/bpf: add sub-tests support for test_progs
0ff97e56 selftests/bpf: abstract away test log output
df36e621 selftests/bpf: add CO-RE relocs testing setup
002d3afc selftests/bpf: add CO-RE relocs struct flavors tests
329e38f7 selftest/bpf: centralize libbpf logging management for test_progs
e87fd8ba libbpf: return previous print callback from libbpf_set_print
ec6438a9 selftests/bpf: add CO-RE relocs nesting tests
20a9ad2e selftests/bpf: add CO-RE relocs array tests
8160bae2 selftests/bpf: add test selectors by number and name to test_progs
766f2a59 selftests/bpf: revamp test_progs to allow more control
d9db3550 selftests/bpf: add CO-RE relocs enum/ptr/func_proto tests
61098e89 selftests/bpf: prevent headers to be compiled as C code
9654e2ae selftests/bpf: add CO-RE relocs modifiers/typedef tests
943e398d Merge branch 'flow_dissector-input-flags'
d698f9db selftests/bpf: add CO-RE relocs ptr-as-array tests
c1f5e7dd selftests/bpf: add CO-RE relocs ints tests
e853ae77 selftests/bpf: support BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP
29e1c668 selftests/bpf: add CO-RE relocs misc tests
71c99e32 bpf/flow_dissector: support ipv6 flow_label and  
BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL
726e333f Merge branch 'compile-once-run-everywhere'
ae173a91 selftests/bpf: support BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG
57debff2 tools/bpf: sync bpf_flow_keys flags
b7076592 tools/bpf: fix core_reloc.c compilation error
b2ca4e1c bpf/flow_dissector: support flags in BPF_PROG_TEST_RUN
d9973cec xdp: xdp_umem: fix umem pages mapping for 32bits systems
1ac6b126 bpf/flow_dissector: document flags
3783d437 samples/bpf: xdp_fwd rename devmap name to be xdp_tx_ports
086f9568 bpf/flow_dissector: pass input flags to BPF flow dissector program
a32a32cb samples/bpf: make xdp_fwd more practically usable via devmap lookup
03cd1d1a selftests/bpf: Add selftests for bpf_perf_event_output
abcce733 samples/bpf: xdp_fwd explain bpf_fib_lookup return codes
7c4b90d7 bpf: Allow bpf_skb_event_output for a few prog types
9f30cd56 Merge branch 'bpf-xdp-fwd-sample-improvements'
5e31d507 Merge branch 'convert-tests-to-libbpf'
a664a834 tools: bpftool: fix reading from /proc/config.gz
341dfcf8 btf: expose BTF info through sysfs
47da6e4d selftests/bpf: remove perf buffer helpers
c17bec54 samples/bpf: switch trace_output sample to perf_buffer API
d66fa3c7 tools: bpftool: add feature check for zlib
9840a4ff selftests/bpf: fix race in flow dissector tests
f58a4d51 samples/bpf: convert xdp_sample_pkts_user to perf_buffer API
7fd78568 btf: rename /sys/kernel/btf/kernel into /sys/kernel/btf/vmlinux
898ca681 selftests/bpf: switch test_tcpnotify to perf_buffer API
58b80815 selftests/bpf: convert test_get_stack_raw_tp to perf_buffer API
a1916a15 libbpf: attempt to load kernel BTF from sysfs first
72ef80b5 Merge branch 'bpf-libbpf-read-sysfs-btf'
f2a3e4e9 libbpf: provide more helpful message on uninitialized global var
708852dc Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=1130846e600000

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+4e7a85b1432052e8d6f8@syzkaller.appspotmail.com

kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] PREEMPT SMP KASAN
CPU: 1 PID: 10210 Comm: syz-executor910 Not tainted 5.3.0-rc7+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
RIP: 0010:__write_once_size include/linux/compiler.h:226 [inline]
RIP: 0010:__hlist_del include/linux/list.h:762 [inline]
RIP: 0010:hlist_del_rcu include/linux/rculist.h:455 [inline]
RIP: 0010:__dev_map_hash_update_elem kernel/bpf/devmap.c:668 [inline]
RIP: 0010:dev_map_hash_update_elem+0x3c8/0x6e0 kernel/bpf/devmap.c:691
Code: 48 89 f1 48 89 75 c8 48 c1 e9 03 80 3c 11 00 0f 85 d3 02 00 00 48 b9  
00 00 00 00 00 fc ff df 48 8b 53 10 48 89 d6 48 c1 ee 03 <80> 3c 0e 00 0f  
85 97 02 00 00 48 85 c0 48 89 02 74 38 48 89 55 b8
RSP: 0018:ffff88808c757c30 EFLAGS: 00010046
RAX: 0000000000000000 RBX: ffff8880a216a980 RCX: dffffc0000000000
RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8880a216a988
RBP: ffff88808c757c78 R08: 0000000000000004 R09: ffffed10118eaf73
R10: ffffed10118eaf72 R11: 0000000000000003 R12: ffff88808c7fb2c0
R13: ffff88808aa98800 R14: 0000000000000000 R15: ffff88808c7fb3e8
FS:  00007fd4c5528700(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fff47596210 CR3: 000000008b442000 CR4: 00000000001406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
  map_update_elem+0xc82/0x10b0 kernel/bpf/syscall.c:966
  __do_sys_bpf+0x8b5/0x3350 kernel/bpf/syscall.c:2854
  __se_sys_bpf kernel/bpf/syscall.c:2825 [inline]
  __x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:2825
  do_syscall_64+0xfd/0x6a0 arch/x86/entry/common.c:296
  entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x446a29
Code: e8 0c e8 ff ff 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7  
48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff  
ff 0f 83 5b 07 fc ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007fd4c5527db8 EFLAGS: 00000246 ORIG_RAX: 0000000000000141
RAX: ffffffffffffffda RBX: 00000000006dbc28 RCX: 0000000000446a29
RDX: 0000000000000020 RSI: 0000000020000180 RDI: 0000000000000002
RBP: 00000000006dbc20 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00000000006dbc2c
R13: 00007fff4759618f R14: 00007fd4c55289c0 R15: 0000000000000000
Modules linked in:
---[ end trace 9a6d00abce3fe1c8 ]---
RIP: 0010:__write_once_size include/linux/compiler.h:226 [inline]
RIP: 0010:__hlist_del include/linux/list.h:762 [inline]
RIP: 0010:hlist_del_rcu include/linux/rculist.h:455 [inline]
RIP: 0010:__dev_map_hash_update_elem kernel/bpf/devmap.c:668 [inline]
RIP: 0010:dev_map_hash_update_elem+0x3c8/0x6e0 kernel/bpf/devmap.c:691
Code: 48 89 f1 48 89 75 c8 48 c1 e9 03 80 3c 11 00 0f 85 d3 02 00 00 48 b9  
00 00 00 00 00 fc ff df 48 8b 53 10 48 89 d6 48 c1 ee 03 <80> 3c 0e 00 0f  
85 97 02 00 00 48 85 c0 48 89 02 74 38 48 89 55 b8
RSP: 0018:ffff88808c757c30 EFLAGS: 00010046
RAX: 0000000000000000 RBX: ffff8880a216a980 RCX: dffffc0000000000
RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8880a216a988
RBP: ffff88808c757c78 R08: 0000000000000004 R09: ffffed10118eaf73
R10: ffffed10118eaf72 R11: 0000000000000003 R12: ffff88808c7fb2c0
R13: ffff88808aa98800 R14: 0000000000000000 R15: ffff88808c7fb3e8
FS:  00007fd4c5528700(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fff47596210 CR3: 000000008b442000 CR4: 00000000001406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400


^ permalink raw reply

* Re: [PATCH net-next 3/3] net: dsa: mv88e6xxx: add RXNFC support
From: Florian Fainelli @ 2019-09-08  2:48 UTC (permalink / raw)
  To: Vivien Didelot, netdev; +Cc: davem, andrew
In-Reply-To: <20190907200049.25273-4-vivien.didelot@gmail.com>



On 9/7/2019 1:00 PM, Vivien Didelot wrote:
> Implement the .get_rxnfc and .set_rxnfc DSA operations to configure
> a port's Layer 2 Policy Control List (PCL) via ethtool.
> 
> Currently only dropping frames based on MAC Destination or Source
> Address (including the option VLAN parameter) is supported.
> 
> Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>

For the ethtool interface part:

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
-- 
Florian

^ permalink raw reply

* general protection fault in cbs_destroy
From: syzbot @ 2019-09-08  6:08 UTC (permalink / raw)
  To: davem, jhs, jiri, linux-kernel, netdev, syzkaller-bugs,
	xiyou.wangcong

Hello,

syzbot found the following crash on:

HEAD commit:    3b47fd5c Merge tag 'nfs-for-5.3-4' of git://git.linux-nfs...
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=14854e71600000
kernel config:  https://syzkaller.appspot.com/x/.config?x=144488c6c6c6d2b6
dashboard link: https://syzkaller.appspot.com/bug?extid=3a8d6a998cbb73bcf337
compiler:       clang version 9.0.0 (/home/glider/llvm/clang  
80fee25776c2fb61e74c1ecb1a523375c2500b69)
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=17998f9e600000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=10421efa600000

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+3a8d6a998cbb73bcf337@syzkaller.appspotmail.com

8021q: adding VLAN 0 to HW filter on device batadv0
netlink: 24 bytes leftover after parsing attributes in process  
`syz-executor457'.
kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] PREEMPT SMP KASAN
CPU: 0 PID: 9249 Comm: syz-executor457 Not tainted 5.3.0-rc7+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
RIP: 0010:__list_del_entry_valid+0x6b/0x100 lib/list_debug.c:51
Code: 4c 89 f7 e8 97 d0 58 fe 48 ba 00 01 00 00 00 00 ad de 49 8b 1e 48 39  
d3 74 54 48 83 c2 22 49 39 d7 74 5e 4c 89 f8 48 c1 e8 03 <42> 80 3c 20 00  
74 08 4c 89 ff e8 66 d0 58 fe 49 8b 17 4c 39 f2 75
RSP: 0018:ffff88809898f568 EFLAGS: 00010246
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000001
RDX: dead000000000122 RSI: 0000000000000004 RDI: ffff88809fb5a7e8
RBP: ffff88809898f588 R08: dffffc0000000000 R09: ffffed1013131ea8
R10: ffffed1013131ea8 R11: 0000000000000000 R12: dffffc0000000000
R13: ffff88809fb5a480 R14: ffff88809fb5a7e0 R15: 0000000000000000
FS:  00005555568cb880(0000) GS:ffff8880aea00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000020000610 CR3: 00000000a3968000 CR4: 00000000001406f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
  __list_del_entry include/linux/list.h:131 [inline]
  list_del include/linux/list.h:139 [inline]
  cbs_destroy+0x85/0x3e0 net/sched/sch_cbs.c:435
  qdisc_create+0xff8/0x13e0 net/sched/sch_api.c:1302
  tc_modify_qdisc+0x989/0x1ea0 net/sched/sch_api.c:1652
  rtnetlink_rcv_msg+0x889/0xd40 net/core/rtnetlink.c:5223
  netlink_rcv_skb+0x19e/0x3d0 net/netlink/af_netlink.c:2477
  rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:5241
  netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline]
  netlink_unicast+0x787/0x900 net/netlink/af_netlink.c:1328
  netlink_sendmsg+0x993/0xc50 net/netlink/af_netlink.c:1917
  sock_sendmsg_nosec net/socket.c:637 [inline]
  sock_sendmsg net/socket.c:657 [inline]
  ___sys_sendmsg+0x60d/0x910 net/socket.c:2311
  __sys_sendmsg net/socket.c:2356 [inline]
  __do_sys_sendmsg net/socket.c:2365 [inline]
  __se_sys_sendmsg net/socket.c:2363 [inline]
  __x64_sys_sendmsg+0x17c/0x200 net/socket.c:2363
  do_syscall_64+0xfe/0x140 arch/x86/entry/common.c:296
  entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x441b59
Code: 18 89 d0 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 48 89 f8 48 89 f7  
48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff  
ff 0f 83 7b 10 fc ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007ffe29572cf8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000441b59
RDX: 0000000000000000 RSI: 0000000020000240 RDI: 0000000000000003
RBP: 00007ffe29572d10 R08: 0000000001bbbbbb R09: 0000000001bbbbbb
R10: 0000000001bbbbbb R11: 0000000000000246 R12: 0000000000000000
R13: 00000000004030f0 R14: 0000000000000000 R15: 0000000000000000
Modules linked in:
---[ end trace 226030e488aca074 ]---
RIP: 0010:__list_del_entry_valid+0x6b/0x100 lib/list_debug.c:51
Code: 4c 89 f7 e8 97 d0 58 fe 48 ba 00 01 00 00 00 00 ad de 49 8b 1e 48 39  
d3 74 54 48 83 c2 22 49 39 d7 74 5e 4c 89 f8 48 c1 e8 03 <42> 80 3c 20 00  
74 08 4c 89 ff e8 66 d0 58 fe 49 8b 17 4c 39 f2 75
RSP: 0018:ffff88809898f568 EFLAGS: 00010246
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000001
RDX: dead000000000122 RSI: 0000000000000004 RDI: ffff88809fb5a7e8
RBP: ffff88809898f588 R08: dffffc0000000000 R09: ffffed1013131ea8
R10: ffffed1013131ea8 R11: 0000000000000000 R12: dffffc0000000000
R13: ffff88809fb5a480 R14: ffff88809fb5a7e0 R15: 0000000000000000
FS:  00005555568cb880(0000) GS:ffff8880aea00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000020000610 CR3: 00000000a3968000 CR4: 00000000001406f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400


---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
syzbot can test patches for this bug, for details see:
https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox