Netdev List
 help / color / mirror / Atom feed
* [PATCH] iproute2: add VF_PORT support
From: Roopa Prabhu @ 2010-11-10  0:47 UTC (permalink / raw)
  To: netdev; +Cc: chrisw, scofeldm, shemminger, arnd

From: Roopa Prabhu <roprabhu@cisco.com>

Resubmitting Scott Feldman's original patch with the following changes:

- Fix port profile strlen which was off by 1
- Added function to convert IFLA_PORT_RESPONSE codes to string

Add support for IFLA_VF_PORTS.  VF port netlink msg layout is:

        [IFLA_NUM_VF]
        [IFLA_VF_PORTS]
                [IFLA_VF_PORT]
                        [IFLA_PORT_*], ...
                [IFLA_VF_PORT]
                        [IFLA_PORT_*], ...
                ...
        [IFLA_PORT_SELF]
                [IFLA_PORT_*], ...

The iproute2 cmd line for link set is now:

Usage: ip link add link DEV [ name ] NAME
                   [ txqueuelen PACKETS ]
                   [ address LLADDR ]
                   [ broadcast LLADDR ]
                   [ mtu MTU ]
                   type TYPE [ ARGS ]
       ip link delete DEV type TYPE [ ARGS ]

       ip link set DEVICE [ { up | down } ]
                          [ arp { on | off } ]
                          [ dynamic { on | off } ]
                          [ multicast { on | off } ]
                          [ allmulticast { on | off } ]
                          [ promisc { on | off } ]
                          [ trailers { on | off } ]
                          [ txqueuelen PACKETS ]
                          [ name NEWNAME ]
                          [ address LLADDR ]
                          [ broadcast LLADDR ]
                          [ mtu MTU ]
                          [ netns PID ]
                          [ alias NAME ]
                          [ port MODE { PROFILE | VSI } ]
                          [ vf NUM [ mac LLADDR ]
                                   [ vlan VLANID [ qos VLAN-QOS ] ]
                                   [ rate TXRATE ]
                                   [ port MODE { PROFILE | VSI } ] ]
       ip link show [ DEVICE ]

TYPE := { vlan | veth | vcan | dummy | ifb | macvlan | can }
MODE := { assoc | preassoc | preassocrr | disassoc }
PROFILE := profile PROFILE
           [ instance UUID ]
           [ host UUID ]
VSI := vsi mgr MGRID type VTID ver VER
       [ instance UUID ]

Signed-off-by: Scott Feldman <scofeldm@cisco.com>
Signed-off-by: Roopa Prabhu <roprabhu@cisco.com>
---
 ip/ipaddress.c |  122 ++++++++++++++++++++++++++++++
 ip/iplink.c    |  227 +++++++++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 311 insertions(+), 38 deletions(-)


diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 19b3d6e..8b8f8c7 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -187,6 +187,114 @@ static void print_linktype(FILE *fp, struct rtattr *tb)
 	}
 }
 
+static const char *vf_port_response_n2a(__u16 response)
+{
+	switch (response) {
+	case PORT_VDP_RESPONSE_SUCCESS:
+		return "SUCCESS";
+	case PORT_VDP_RESPONSE_INVALID_FORMAT:
+		return "INVALID FORMAT";
+	case PORT_VDP_RESPONSE_INSUFFICIENT_RESOURCES:
+		return "INSUFFICIENT RESOURCES";
+	case PORT_VDP_RESPONSE_UNUSED_VTID:
+		return "UNUSED VTID";
+	case PORT_VDP_RESPONSE_VTID_VIOLATION:
+		return "VTID VIOLATION";
+	case PORT_VDP_RESPONSE_VTID_VERSION_VIOALTION:
+		return "VTID VERSION VIOLATION";
+	case PORT_VDP_RESPONSE_OUT_OF_SYNC:
+		return "OUT-OF-SYNC";
+	case PORT_PROFILE_RESPONSE_SUCCESS:
+		return "SUCCESS";
+	case PORT_PROFILE_RESPONSE_INPROGRESS:
+		return "IN-PROGRESS";
+	case PORT_PROFILE_RESPONSE_INVALID:
+		return "INVALID";
+	case PORT_PROFILE_RESPONSE_BADSTATE:
+		return "BAD STATE";
+	case PORT_PROFILE_RESPONSE_INSUFFICIENT_RESOURCES:
+		return "INSUFFICIENT RESOURCES";
+	case PORT_PROFILE_RESPONSE_ERROR:
+		return "ERROR";
+	default:
+		return "UNKNOWN RESPONSE";
+	}
+}
+
+static void print_port(FILE *fp, struct rtattr *port[])
+{
+	struct ifla_port_vsi *vsi;
+#define uuid_fmt "%02X%02X%02X%02X-%02X%02X-%02X%02X-" \
+	"%02X%02X-%02X%02X%02X%02X%02X%02X"
+	unsigned char *uuid;
+	__u8 request;
+	__u16 response;
+
+	if (port[IFLA_PORT_VF])
+		fprintf(fp, "\n    vf %d port",
+			*(__u32 *)RTA_DATA(port[IFLA_PORT_VF]));
+	else
+		fprintf(fp, "\n    port");
+
+	if (port[IFLA_PORT_REQUEST]) {
+		request = *(__u8 *)RTA_DATA(port[IFLA_PORT_REQUEST]);
+		fprintf(fp, " %s",
+			request == PORT_REQUEST_PREASSOCIATE ? "preassoc" :
+			request == PORT_REQUEST_PREASSOCIATE_RR ? "preassocrr" :
+			request == PORT_REQUEST_ASSOCIATE ? "assoc" :
+			request == PORT_REQUEST_DISASSOCIATE ? "disassoc" :
+			"unknown request");
+	}
+
+	if (port[IFLA_PORT_PROFILE])
+		fprintf(fp, " profile \"%s\"",
+			(char *)RTA_DATA(port[IFLA_PORT_PROFILE]));
+
+	if (port[IFLA_PORT_VSI_TYPE]) {
+		vsi = RTA_DATA(port[IFLA_PORT_VSI_TYPE]);
+		fprintf(fp, " vsi mgr %d type 0x%02x%02x%02x ver %d",
+			vsi->vsi_mgr_id, vsi->vsi_type_id[0],
+			vsi->vsi_type_id[1], vsi->vsi_type_id[2],
+			vsi->vsi_type_version);
+	}
+
+	if (port[IFLA_PORT_RESPONSE]) {
+		response = *(__u16 *)RTA_DATA(port[IFLA_PORT_RESPONSE]);
+		fprintf(fp, " status: %s", vf_port_response_n2a(response));
+	}
+
+	if (port[IFLA_PORT_INSTANCE_UUID]) {
+		uuid = RTA_DATA(port[IFLA_PORT_INSTANCE_UUID]);
+		fprintf(fp, "\n        instance "uuid_fmt,
+			uuid[0],  uuid[1],  uuid[2],  uuid[3],
+			uuid[4],  uuid[5],  uuid[6],  uuid[7],
+			uuid[8],  uuid[9],  uuid[10], uuid[11],
+			uuid[12], uuid[13], uuid[14], uuid[15]);
+	}
+
+	if (port[IFLA_PORT_HOST_UUID]) {
+		uuid = RTA_DATA(port[IFLA_PORT_HOST_UUID]);
+		fprintf(fp, "\n            host "uuid_fmt,
+			uuid[0],  uuid[1],  uuid[2],  uuid[3],
+			uuid[4],  uuid[5],  uuid[6],  uuid[7],
+			uuid[8],  uuid[9],  uuid[10], uuid[11],
+			uuid[12], uuid[13], uuid[14], uuid[15]);
+	}
+}
+
+static void print_vfport(FILE *fp, struct rtattr *vfport)
+{
+	struct rtattr *port[IFLA_PORT_MAX+1];
+
+	if (vfport->rta_type != IFLA_VF_PORT) {
+		fprintf(stderr, "BUG: rta type is %d\n", vfport->rta_type);
+		return;
+	}
+
+	parse_rtattr_nested(port, IFLA_PORT_MAX, vfport);
+	print_port(fp, port);
+}
+
 static void print_vfinfo(FILE *fp, struct rtattr *vfinfo)
 {
 	struct ifla_vf_mac *vf_mac;
@@ -421,6 +529,20 @@ int print_linkinfo(const struct sockaddr_nl *who,
 			print_vfinfo(fp, i);
 	}
 
+	if (do_link && tb[IFLA_PORT_SELF]) {
+		struct rtattr *port[IFLA_PORT_MAX+1];
+		parse_rtattr_nested(port, IFLA_PORT_MAX, tb[IFLA_PORT_SELF]);
+		print_port(fp, port);
+	}
+
+	if (do_link && tb[IFLA_VF_PORTS] && tb[IFLA_NUM_VF]) {
+		struct rtattr *i, *vfports = tb[IFLA_VF_PORTS];
+		int rem = RTA_PAYLOAD(vfports);
+		for (i = RTA_DATA(vfports); RTA_OK(i, rem);
+			i = RTA_NEXT(i, rem))
+			print_vfport(fp, i);
+	}
+
 	fprintf(fp, "\n");
 	fflush(fp);
 	return 0;
diff --git a/ip/iplink.c b/ip/iplink.c
index cb2c4f5..961a3ef 100644
--- a/ip/iplink.c
+++ b/ip/iplink.c
@@ -68,14 +68,22 @@ void iplink_usage(void)
 	fprintf(stderr, "	                  [ mtu MTU ]\n");
 	fprintf(stderr, "	                  [ netns PID ]\n");
 	fprintf(stderr, "			  [ alias NAME ]\n");
+	fprintf(stderr, "			  [ port MODE { PROFILE | VSI } ]\n");
 	fprintf(stderr, "	                  [ vf NUM [ mac LLADDR ]\n");
 	fprintf(stderr, "				   [ vlan VLANID [ qos VLAN-QOS ] ]\n");
-	fprintf(stderr, "				   [ rate TXRATE ] ] \n");
+	fprintf(stderr, "				   [ rate TXRATE ]\n");
+	fprintf(stderr, "				   [ port MODE { PROFILE | VSI } ] ]\n");
 	fprintf(stderr, "       ip link show [ DEVICE ]\n");
 
 	if (iplink_have_newlink()) {
 		fprintf(stderr, "\n");
 		fprintf(stderr, "TYPE := { vlan | veth | vcan | dummy | ifb | macvlan | can }\n");
+		fprintf(stderr, "MODE := { assoc | preassoc | preassocrr | disassoc }\n");
+		fprintf(stderr, "PROFILE := profile PROFILE\n");
+		fprintf(stderr, "           [ instance UUID ]\n");
+		fprintf(stderr, "           [ host UUID ]\n");
+		fprintf(stderr, "VSI := vsi mgr MGRID type VTID ver VER\n");
+		fprintf(stderr, "       [ instance UUID ]\n");
 	}
 	exit(-1);
 }
@@ -176,55 +184,170 @@ struct iplink_req {
 	char			buf[1024];
 };
 
-int iplink_parse_vf(int vf, int *argcp, char ***argvp,
-			   struct iplink_req *req)
+void iplink_parse_port(int vf, int *argcp, char ***argvp,
+		       struct iplink_req *req)
+{
+	int argc = *argcp;
+	char **argv = *argvp;
+	struct rtattr *nest, *nest_inner = NULL;
+	struct ifla_port_vsi port_vsi;
+	char *port_profile = NULL;
+	char *instance_uuid = NULL;
+	char *host_uuid = NULL;
+	unsigned char uuid[16];
+	char *uuid_fmt = "%02X%02X%02X%02X-%02X%02X-%02X%02X-"
+		"%02X%02X-%02X%02X%02X%02X%02X%02X";
+	int parsed;
+	int manager_id = -1;
+	int type_id = -1;
+	int type_id_version = -1;
+	int request = -1;
+	int vsi = 0;
+
+	if (NEXT_ARG_OK()) {
+		NEXT_ARG();
+		if (matches(*argv, "assoc") == 0)
+			request = PORT_REQUEST_ASSOCIATE;
+		else if (matches(*argv, "preassoc") == 0)
+			request = PORT_REQUEST_PREASSOCIATE;
+		else if (matches(*argv, "preassocrr") == 0)
+			request = PORT_REQUEST_PREASSOCIATE_RR;
+		else if (matches(*argv, "disassoc") == 0)
+			request = PORT_REQUEST_DISASSOCIATE;
+	}
+
+	while (NEXT_ARG_OK()) {
+		NEXT_ARG();
+		if (matches(*argv, "vsi") == 0) {
+			vsi = 1;
+		} else if (matches(*argv, "mgr") == 0) {
+			NEXT_ARG();
+			if (get_integer(&manager_id, *argv, 0))
+				invarg("Invalid \"mgr\" value\n", *argv);
+		} else if (matches(*argv, "type") == 0) {
+			NEXT_ARG();
+			if (get_integer(&type_id, *argv, 0))
+				invarg("Invalid \"type\" value\n", *argv);
+		} else if (matches(*argv, "ver") == 0) {
+			NEXT_ARG();
+			if (get_integer(&type_id_version, *argv, 0))
+				invarg("Invalid \"ver\" value\n", *argv);
+		} else if (matches(*argv, "profile") == 0) {
+			NEXT_ARG();
+			port_profile = *argv;
+		} else if (matches(*argv, "instance") == 0) {
+			NEXT_ARG();
+			instance_uuid = *argv;
+		} else if (matches(*argv, "host") == 0) {
+			NEXT_ARG();
+			host_uuid = *argv;
+		} else {
+			/* rewind arg */
+			PREV_ARG();
+			break;
+		}
+	}
+
+	if (argc == *argcp)
+		incomplete_command();
+
+	if (vf == PORT_SELF_VF) {
+		nest = addattr_nest(&req->n, sizeof(*req), IFLA_PORT_SELF);
+	} else {
+		nest = addattr_nest(&req->n, sizeof(*req), IFLA_VF_PORTS);
+		nest_inner = addattr_nest(&req->n, sizeof(*req), IFLA_VF_PORT);
+		addattr_l(&req->n, sizeof(*req), IFLA_PORT_VF,
+			(uint32_t *)&vf, sizeof(uint32_t));
+	}
+
+	if (port_profile)
+		addattr_l(&req->n, sizeof(*req), IFLA_PORT_PROFILE,
+			port_profile, strlen(port_profile) + 1);
+
+	if (instance_uuid) {
+		parsed = sscanf(instance_uuid, uuid_fmt,
+			&uuid[0],  &uuid[1],  &uuid[2],  &uuid[3],
+			&uuid[4],  &uuid[5],  &uuid[6],  &uuid[7],
+			&uuid[8],  &uuid[9],  &uuid[10], &uuid[11],
+			&uuid[12], &uuid[13], &uuid[14], &uuid[15]);
+		if (parsed != sizeof(uuid))
+			invarg("Invalid \"uuid\" value\n", instance_uuid);
+		addattr_l(&req->n, sizeof(*req), IFLA_PORT_INSTANCE_UUID,
+			uuid, sizeof(uuid));
+
+	}
+
+	if (host_uuid) {
+		parsed = sscanf(host_uuid, uuid_fmt,
+			&uuid[0],  &uuid[1],  &uuid[2],  &uuid[3],
+			&uuid[4],  &uuid[5],  &uuid[6],  &uuid[7],
+			&uuid[8],  &uuid[9],  &uuid[10], &uuid[11],
+			&uuid[12], &uuid[13], &uuid[14], &uuid[15]);
+		if (parsed != sizeof(uuid))
+			invarg("Invalid \"uuid\" value\n", host_uuid);
+		addattr_l(&req->n, sizeof(*req), IFLA_PORT_HOST_UUID,
+			uuid, sizeof(uuid));
+
+	}
+
+	if (vsi) {
+		port_vsi.vsi_mgr_id = manager_id;
+		memcpy(&port_vsi.vsi_type_id, &type_id,
+			sizeof(port_vsi.vsi_type_id));
+		port_vsi.vsi_type_version = type_id_version;
+		addattr_l(&req->n, sizeof(*req), IFLA_PORT_VSI_TYPE,
+			&port_vsi, sizeof(port_vsi));
+	}
+
+	addattr_l(&req->n, sizeof(*req), IFLA_PORT_REQUEST,
+		&request, 1);
+
+	if (nest_inner)
+		addattr_nest_end(&req->n, nest_inner);
+	addattr_nest_end(&req->n, nest);
+
+	*argcp = argc;
+	*argvp = argv;
+}
+
+void iplink_parse_vf(int vf, int *argcp, char ***argvp,
+		     struct iplink_req *req)
 {
 	int len, argc = *argcp;
 	char **argv = *argvp;
+	struct rtattr *vflist;
 	struct rtattr *vfinfo;
-
-	vfinfo = addattr_nest(&req->n, sizeof(*req), IFLA_VF_INFO);
+	char *mac = NULL;
+	char *vlan = NULL;
+	char *qos = NULL;
+	char *rate = NULL;
+	struct ifla_vf_mac ivm = { .vf = vf, };
+	struct ifla_vf_vlan ivv = { .vf = vf, .qos = 0, };
+	struct ifla_vf_tx_rate ivt = { .vf = vf, };
 
 	while (NEXT_ARG_OK()) {
 		NEXT_ARG();
-		if (matches(*argv, "mac") == 0) {
-			struct ifla_vf_mac ivm;
+		if (matches(*argv, "port") == 0) {
+			iplink_parse_port(vf, &argc, &argv, req);
+		} else if (matches(*argv, "mac") == 0) {
 			NEXT_ARG();
-			ivm.vf = vf;
-			len = ll_addr_a2n((char *)ivm.mac, 32, *argv);
-			if (len < 0)
-				return -1;
-			addattr_l(&req->n, sizeof(*req), IFLA_VF_MAC, &ivm, sizeof(ivm));
+			mac = *argv;
 		} else if (matches(*argv, "vlan") == 0) {
-			struct ifla_vf_vlan ivv;
 			NEXT_ARG();
-			if (get_unsigned(&ivv.vlan, *argv, 0)) {
-				invarg("Invalid \"vlan\" value\n", *argv);
-			}
-			ivv.vf = vf;
-			ivv.qos = 0;
+			vlan = *argv;
 			if (NEXT_ARG_OK()) {
 				NEXT_ARG();
 				if (matches(*argv, "qos") == 0) {
 					NEXT_ARG();
-					if (get_unsigned(&ivv.qos, *argv, 0)) {
-						invarg("Invalid \"qos\" value\n", *argv);
-					}
+					qos = *argv;
 				} else {
 					/* rewind arg */
 					PREV_ARG();
 				}
 			}
-			addattr_l(&req->n, sizeof(*req), IFLA_VF_VLAN, &ivv, sizeof(ivv));
 		} else if (matches(*argv, "rate") == 0) {
-			struct ifla_vf_tx_rate ivt;
 			NEXT_ARG();
-			if (get_unsigned(&ivt.rate, *argv, 0)) {
-				invarg("Invalid \"rate\" value\n", *argv);
-			}
-			ivt.vf = vf;
-			addattr_l(&req->n, sizeof(*req), IFLA_VF_TX_RATE, &ivt, sizeof(ivt));
-		
+			rate = *argv;
 		} else {
 			/* rewind arg */
 			PREV_ARG();
@@ -235,11 +358,43 @@ int iplink_parse_vf(int vf, int *argcp, char ***argvp,
 	if (argc == *argcp)
 		incomplete_command();
 
-	addattr_nest_end(&req->n, vfinfo);
+	if (mac || vlan || rate) {
+
+		vflist = addattr_nest(&req->n, sizeof(*req), IFLA_VFINFO_LIST);
+		vfinfo = addattr_nest(&req->n, sizeof(*req), IFLA_VF_INFO);
+
+		if (mac) {
+			len = ll_addr_a2n((char *)ivm.mac, 32, mac);
+			if (len < 0)
+				invarg("Invalid \"mac\" value\n", mac);
+			addattr_l(&req->n, sizeof(*req), IFLA_VF_MAC,
+				&ivm, sizeof(ivm));
+		}
+
+		if (vlan) {
+			if (get_unsigned(&ivv.vlan, vlan, 0))
+				invarg("Invalid \"vlan\" value\n", vlan);
+			if (qos) {
+				if (get_unsigned(&ivv.qos, qos, 0))
+					invarg("Invalid \"qos\" value\n", qos);
+			}
+			addattr_l(&req->n, sizeof(*req), IFLA_VF_VLAN,
+				&ivv, sizeof(ivv));
+		}
+
+		if (rate) {
+			if (get_unsigned(&ivt.rate, rate, 0))
+				invarg("Invalid \"rate\" value\n", rate);
+			addattr_l(&req->n, sizeof(*req), IFLA_VF_TX_RATE,
+				&ivt, sizeof(ivt));
+		}
+
+		addattr_nest_end(&req->n, vfinfo);
+		addattr_nest_end(&req->n, vflist);
+	}
 
 	*argcp = argc;
 	*argvp = argv;
-	return 0;
 }
 
 
@@ -349,18 +504,14 @@ int iplink_parse(int argc, char **argv, struct iplink_req *req,
 				req->i.ifi_flags |= IFF_NOARP;
 			} else
 				return on_off("noarp");
+		} else if (strcmp(*argv, "port") == 0) {
+			iplink_parse_port(vf, &argc, &argv, req);
 		} else if (strcmp(*argv, "vf") == 0) {
-			struct rtattr *vflist;
 			NEXT_ARG();
 			if (get_integer(&vf,  *argv, 0)) {
 				invarg("Invalid \"vf\" value\n", *argv);
 			}
-			vflist = addattr_nest(&req->n, sizeof(*req),
-					      IFLA_VFINFO_LIST);
-			len = iplink_parse_vf(vf, &argc, &argv, req);
-			if (len < 0)
-				return -1;
-			addattr_nest_end(&req->n, vflist);
+			iplink_parse_vf(vf, &argc, &argv, req);
 #ifdef IFF_DYNAMIC
 		} else if (matches(*argv, "dynamic") == 0) {
 			NEXT_ARG();


^ permalink raw reply related

* [PATCH net-2.6 1/3] vlan: Add function to retrieve EtherType from vlan packets.
From: Jesse Gross @ 2010-11-10  1:09 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Hao Zheng

From: Hao Zheng <hzheng@nicira.com>

Depending on how a packet is vlan tagged (i.e. hardware accelerated or
not), the encapsulated protocol is stored in different locations.  This
provides a consistent method of accessing that protocol, which is needed
by drivers, security checks, etc.

Signed-off-by: Hao Zheng <hzheng@nicira.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
---
 include/linux/if_vlan.h |   20 ++++++++++++++++++++
 1 files changed, 20 insertions(+), 0 deletions(-)

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index c2f3a72..ee06c52 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -339,6 +339,26 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci)
 	}
 }
 
+/**
+ * vlan_get_protocol - get protocol EtherType.
+ * @skb: skbuff to query
+ *
+ * Returns the EtherType of the packet, regardless of whether it is
+ * vlan encapsulated (normal or hardware accelerated) or not.
+ */
+static inline __be16 vlan_get_protocol(struct sk_buff *skb)
+{
+	__be16 protocol = 0;
+
+	if (vlan_tx_tag_present(skb) ||
+	     skb->protocol != cpu_to_be16(ETH_P_8021Q))
+		protocol = skb->protocol;
+	else if (likely(pskb_may_pull(skb, VLAN_ETH_HLEN)))
+		protocol = ((const struct vlan_ethhdr *)skb->data)->
+			   h_vlan_encapsulated_proto;
+
+	return protocol;
+}
 #endif /* __KERNEL__ */
 
 /* VLAN IOCTLs are found in sockios.h */
-- 
1.7.1


^ permalink raw reply related

* [PATCH net-2.6 2/3] bnx2x: Look inside vlan when determining checksum proto.
From: Jesse Gross @ 2010-11-10  1:09 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Hao Zheng, Eilon Greenstein
In-Reply-To: <1289351344-14340-1-git-send-email-jesse@nicira.com>

From: Hao Zheng <hzheng@nicira.com>

Currently the skb->protocol field is used to set up checksum
offloading on transmit for the correct protocol.  However, if
vlan offloading is disabled or otherwise not used, the protocol
field will be ETH_P_8021Q, not the actual protocol.  This will
cause the checksum to be computed incorrectly, even though the
hardware is capable of looking inside vlan tags.  Instead,
look inside the header if necessary to determine the correct
protocol type.

To some extent this fixes a regression from 2.6.36 because it
was previously not possible to disable vlan offloading and this
error case was not exposed.

Signed-off-by: Hao Zheng <hzheng@nicira.com>
CC: Eilon Greenstein <eilong@broadcom.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
---
 drivers/net/bnx2x/bnx2x_cmn.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/bnx2x/bnx2x_cmn.c b/drivers/net/bnx2x/bnx2x_cmn.c
index 459614d..94d5f59 100644
--- a/drivers/net/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/bnx2x/bnx2x_cmn.c
@@ -1680,7 +1680,7 @@ static inline u32 bnx2x_xmit_type(struct bnx2x *bp, struct sk_buff *skb)
 		rc = XMIT_PLAIN;
 
 	else {
-		if (skb->protocol == htons(ETH_P_IPV6)) {
+		if (vlan_get_protocol(skb) == htons(ETH_P_IPV6)) {
 			rc = XMIT_CSUM_V6;
 			if (ipv6_hdr(skb)->nexthdr == IPPROTO_TCP)
 				rc |= XMIT_CSUM_TCP;
-- 
1.7.1


^ permalink raw reply related

* [PATCH net-2.6 3/3] ixgbe: Look inside vlan when determining offload protocol.
From: Jesse Gross @ 2010-11-10  1:09 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, Hao Zheng, Jeff Kirsher, Alex Duyck, Jesse Brandeburg
In-Reply-To: <1289351344-14340-1-git-send-email-jesse@nicira.com>

From: Hao Zheng <hzheng@nicira.com>

Currently the skb->protocol field is used to set up various
offloading parameters on transmit for the correct protocol.
However, if vlan offloading is disabled or otherwise not used,
the protocol field will be ETH_P_8021Q, not the actual protocol.
This will cause the offloading to be performed incorrectly,
even though the hardware is capable of looking inside vlan tags.
Instead, look inside the header if necessary to determine the
correct protocol type.

To some extent this fixes a regression from 2.6.36 because it
was previously not possible to disable vlan offloading and this
error case was not exposed.

Signed-off-by: Hao Zheng <hzheng@nicira.com>
CC: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
CC: Alex Duyck <alexander.h.duyck@intel.com>
CC: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
---
 drivers/net/ixgbe/ixgbe_main.c |   60 +++++++++++++++++++++------------------
 1 files changed, 32 insertions(+), 28 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 2bd3eb4..fbad4d8 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -764,8 +764,9 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
 #ifdef IXGBE_FCOE
 				/* adjust for FCoE Sequence Offload */
 				if ((adapter->flags & IXGBE_FLAG_FCOE_ENABLED)
-				    && (skb->protocol == htons(ETH_P_FCOE)) &&
-				    skb_is_gso(skb)) {
+				    && skb_is_gso(skb)
+				    && vlan_get_protocol(skb) ==
+				    htons(ETH_P_FCOE)) {
 					hlen = skb_transport_offset(skb) +
 						sizeof(struct fc_frame_header) +
 						sizeof(struct fcoe_crc_eof);
@@ -5823,7 +5824,7 @@ static void ixgbe_watchdog_task(struct work_struct *work)
 
 static int ixgbe_tso(struct ixgbe_adapter *adapter,
 		     struct ixgbe_ring *tx_ring, struct sk_buff *skb,
-		     u32 tx_flags, u8 *hdr_len)
+		     u32 tx_flags, u8 *hdr_len, __be16 protocol)
 {
 	struct ixgbe_adv_tx_context_desc *context_desc;
 	unsigned int i;
@@ -5841,7 +5842,7 @@ static int ixgbe_tso(struct ixgbe_adapter *adapter,
 		l4len = tcp_hdrlen(skb);
 		*hdr_len += l4len;
 
-		if (skb->protocol == htons(ETH_P_IP)) {
+		if (protocol == htons(ETH_P_IP)) {
 			struct iphdr *iph = ip_hdr(skb);
 			iph->tot_len = 0;
 			iph->check = 0;
@@ -5880,7 +5881,7 @@ static int ixgbe_tso(struct ixgbe_adapter *adapter,
 		type_tucmd_mlhl = (IXGBE_TXD_CMD_DEXT |
 				   IXGBE_ADVTXD_DTYP_CTXT);
 
-		if (skb->protocol == htons(ETH_P_IP))
+		if (protocol == htons(ETH_P_IP))
 			type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
 		type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP;
 		context_desc->type_tucmd_mlhl = cpu_to_le32(type_tucmd_mlhl);
@@ -5906,16 +5907,10 @@ static int ixgbe_tso(struct ixgbe_adapter *adapter,
 	return false;
 }
 
-static u32 ixgbe_psum(struct ixgbe_adapter *adapter, struct sk_buff *skb)
+static u32 ixgbe_psum(struct ixgbe_adapter *adapter, struct sk_buff *skb,
+		      __be16 protocol)
 {
 	u32 rtn = 0;
-	__be16 protocol;
-
-	if (skb->protocol == cpu_to_be16(ETH_P_8021Q))
-		protocol = ((const struct vlan_ethhdr *)skb->data)->
-					h_vlan_encapsulated_proto;
-	else
-		protocol = skb->protocol;
 
 	switch (protocol) {
 	case cpu_to_be16(ETH_P_IP):
@@ -5943,7 +5938,7 @@ static u32 ixgbe_psum(struct ixgbe_adapter *adapter, struct sk_buff *skb)
 	default:
 		if (unlikely(net_ratelimit()))
 			e_warn(probe, "partial checksum but proto=%x!\n",
-			       skb->protocol);
+			       protocol);
 		break;
 	}
 
@@ -5952,7 +5947,8 @@ static u32 ixgbe_psum(struct ixgbe_adapter *adapter, struct sk_buff *skb)
 
 static bool ixgbe_tx_csum(struct ixgbe_adapter *adapter,
 			  struct ixgbe_ring *tx_ring,
-			  struct sk_buff *skb, u32 tx_flags)
+			  struct sk_buff *skb, u32 tx_flags,
+			  __be16 protocol)
 {
 	struct ixgbe_adv_tx_context_desc *context_desc;
 	unsigned int i;
@@ -5981,7 +5977,7 @@ static bool ixgbe_tx_csum(struct ixgbe_adapter *adapter,
 				    IXGBE_ADVTXD_DTYP_CTXT);
 
 		if (skb->ip_summed == CHECKSUM_PARTIAL)
-			type_tucmd_mlhl |= ixgbe_psum(adapter, skb);
+			type_tucmd_mlhl |= ixgbe_psum(adapter, skb, protocol);
 
 		context_desc->type_tucmd_mlhl = cpu_to_le32(type_tucmd_mlhl);
 		/* use index zero for tx checksum offload */
@@ -6179,7 +6175,7 @@ static void ixgbe_tx_queue(struct ixgbe_adapter *adapter,
 }
 
 static void ixgbe_atr(struct ixgbe_adapter *adapter, struct sk_buff *skb,
-		      int queue, u32 tx_flags)
+		      int queue, u32 tx_flags, __be16 protocol)
 {
 	struct ixgbe_atr_input atr_input;
 	struct tcphdr *th;
@@ -6190,7 +6186,7 @@ static void ixgbe_atr(struct ixgbe_adapter *adapter, struct sk_buff *skb,
 	u8 l4type = 0;
 
 	/* Right now, we support IPv4 only */
-	if (skb->protocol != htons(ETH_P_IP))
+	if (protocol != htons(ETH_P_IP))
 		return;
 	/* check if we're UDP or TCP */
 	if (iph->protocol == IPPROTO_TCP) {
@@ -6257,10 +6253,13 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb)
 {
 	struct ixgbe_adapter *adapter = netdev_priv(dev);
 	int txq = smp_processor_id();
-
 #ifdef IXGBE_FCOE
-	if ((skb->protocol == htons(ETH_P_FCOE)) ||
-	    (skb->protocol == htons(ETH_P_FIP))) {
+	__be16 protocol;
+
+	protocol = vlan_get_protocol(skb);
+
+	if ((protocol == htons(ETH_P_FCOE)) ||
+	    (protocol == htons(ETH_P_FIP))) {
 		if (adapter->flags & IXGBE_FLAG_FCOE_ENABLED) {
 			txq &= (adapter->ring_feature[RING_F_FCOE].indices - 1);
 			txq += adapter->ring_feature[RING_F_FCOE].mask;
@@ -6303,6 +6302,9 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb, struct net_device *netdev
 	int tso;
 	int count = 0;
 	unsigned int f;
+	__be16 protocol;
+
+	protocol = vlan_get_protocol(skb);
 
 	if (vlan_tx_tag_present(skb)) {
 		tx_flags |= vlan_tx_tag_get(skb);
@@ -6323,8 +6325,8 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb, struct net_device *netdev
 	/* for FCoE with DCB, we force the priority to what
 	 * was specified by the switch */
 	if (adapter->flags & IXGBE_FLAG_FCOE_ENABLED &&
-	    (skb->protocol == htons(ETH_P_FCOE) ||
-	     skb->protocol == htons(ETH_P_FIP))) {
+	    (protocol == htons(ETH_P_FCOE) ||
+	     protocol == htons(ETH_P_FIP))) {
 #ifdef CONFIG_IXGBE_DCB
 		if (adapter->flags & IXGBE_FLAG_DCB_ENABLED) {
 			tx_flags &= ~(IXGBE_TX_FLAGS_VLAN_PRIO_MASK
@@ -6334,7 +6336,7 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb, struct net_device *netdev
 		}
 #endif
 		/* flag for FCoE offloads */
-		if (skb->protocol == htons(ETH_P_FCOE))
+		if (protocol == htons(ETH_P_FCOE))
 			tx_flags |= IXGBE_TX_FLAGS_FCOE;
 	}
 #endif
@@ -6368,9 +6370,10 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb, struct net_device *netdev
 			tx_flags |= IXGBE_TX_FLAGS_FSO;
 #endif /* IXGBE_FCOE */
 	} else {
-		if (skb->protocol == htons(ETH_P_IP))
+		if (protocol == htons(ETH_P_IP))
 			tx_flags |= IXGBE_TX_FLAGS_IPV4;
-		tso = ixgbe_tso(adapter, tx_ring, skb, tx_flags, &hdr_len);
+		tso = ixgbe_tso(adapter, tx_ring, skb, tx_flags, &hdr_len,
+				protocol);
 		if (tso < 0) {
 			dev_kfree_skb_any(skb);
 			return NETDEV_TX_OK;
@@ -6378,7 +6381,8 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb, struct net_device *netdev
 
 		if (tso)
 			tx_flags |= IXGBE_TX_FLAGS_TSO;
-		else if (ixgbe_tx_csum(adapter, tx_ring, skb, tx_flags) &&
+		else if (ixgbe_tx_csum(adapter, tx_ring, skb, tx_flags,
+				       protocol) &&
 			 (skb->ip_summed == CHECKSUM_PARTIAL))
 			tx_flags |= IXGBE_TX_FLAGS_CSUM;
 	}
@@ -6392,7 +6396,7 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb, struct net_device *netdev
 			     test_bit(__IXGBE_FDIR_INIT_DONE,
 				      &tx_ring->reinit_state)) {
 				ixgbe_atr(adapter, skb, tx_ring->queue_index,
-					  tx_flags);
+					  tx_flags, protocol);
 				tx_ring->atr_count = 0;
 			}
 		}
-- 
1.7.1


^ permalink raw reply related

* sk->sk_socket seems to disappear before connection termination
From: Jan Engelhardt @ 2010-11-10  1:09 UTC (permalink / raw)
  To: Netfilter Developer Mailing List; +Cc: netdev, Rafał Maj

Hi,


Rafał reported this to us on IRC, paraphrasing what has been observed:

Using a simple rule like `iptables -A OUTPUT -p tcp --dport 80 -j LOG 
--log-uid`, one can observe on creating a connection and terminating
it that the trailing packets have skb->sk->sk_socket == NULL.
Is this intended? Is the socket not retained until after TCP has
sent out the closing exchange?

As I can reproduce:

$ telnet 134.76.13.21 80
Trying 134.76.13.21...
Connected to 134.76.13.21.
Escape character is '^]'.
^]
telnet> ^D
Connection closed.

[491419.500978] IN= OUT=tun0 SRC=134.76.2.163 DST=134.76.13.21 LEN=60 TOS=0x10 PREC=0x00 TTL=64 ID=35420 DF PROTO=TCP SPT=58613 DPT=80 WINDOW=5488 RES=0x00 SYN URGP=0 UID=25121 GID=100 
[491419.511533] IN= OUT=tun0 SRC=134.76.2.163 DST=134.76.13.21 LEN=52 TOS=0x10 PREC=0x00 TTL=64 ID=35421 DF PROTO=TCP SPT=58613 DPT=80 WINDOW=86 RES=0x00 ACK URGP=0 UID=25121 GID=100 
[491420.052182] IN= OUT=tun0 SRC=134.76.2.163 DST=134.76.13.21 LEN=52 TOS=0x10 PREC=0x00 TTL=64 ID=35422 DF PROTO=TCP SPT=58613 DPT=80 WINDOW=86 RES=0x00 ACK FIN URGP=0 UID=25121 GID=100 
[491420.063619] IN= OUT=tun0 SRC=134.76.2.163 DST=134.76.13.21 LEN=52 TOS=0x10 PREC=0x00 TTL=64 ID=35423 DF PROTO=TCP SPT=58613 DPT=80 WINDOW=86 RES=0x00 ACK URGP=0 
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] Prevent reading uninitialized memory with socket filters
From: David Miller @ 2010-11-10  5:28 UTC (permalink / raw)
  To: drosenberg; +Cc: netdev, stable, security
In-Reply-To: <1289341724.7380.13.camel@dan>

From: Dan Rosenberg <drosenberg@vsecurity.com>
Date: Tue, 09 Nov 2010 17:28:44 -0500

> The "mem" array used as scratch space for socket filters is not
> initialized, allowing unprivileged users to leak kernel stack bytes.
> 
> Signed-off-by: Dan Rosenberg <drosenberg@vsecurity.com>

Prove it.

^ permalink raw reply

* Re: sk->sk_socket seems to disappear before connection termination
From: Eric Dumazet @ 2010-11-10  5:47 UTC (permalink / raw)
  To: Jan Engelhardt; +Cc: Netfilter Developer Mailing List, netdev, Rafał Maj
In-Reply-To: <alpine.LNX.2.01.1011100205220.17978@obet.zrqbmnf.qr>

Le mercredi 10 novembre 2010 à 02:09 +0100, Jan Engelhardt a écrit :
> Hi,
> 
> 
> Rafał reported this to us on IRC, paraphrasing what has been observed:
> 
> Using a simple rule like `iptables -A OUTPUT -p tcp --dport 80 -j LOG 
> --log-uid`, one can observe on creating a connection and terminating
> it that the trailing packets have skb->sk->sk_socket == NULL.
> Is this intended? Is the socket not retained until after TCP has
> sent out the closing exchange?
> 
> As I can reproduce:
> 
> $ telnet 134.76.13.21 80
> Trying 134.76.13.21...
> Connected to 134.76.13.21.
> Escape character is '^]'.
> ^]
> telnet> ^D
> Connection closed.
> 
> [491419.500978] IN= OUT=tun0 SRC=134.76.2.163 DST=134.76.13.21 LEN=60 TOS=0x10 PREC=0x00 TTL=64 ID=35420 DF PROTO=TCP SPT=58613 DPT=80 WINDOW=5488 RES=0x00 SYN URGP=0 UID=25121 GID=100 
> [491419.511533] IN= OUT=tun0 SRC=134.76.2.163 DST=134.76.13.21 LEN=52 TOS=0x10 PREC=0x00 TTL=64 ID=35421 DF PROTO=TCP SPT=58613 DPT=80 WINDOW=86 RES=0x00 ACK URGP=0 UID=25121 GID=100 
> [491420.052182] IN= OUT=tun0 SRC=134.76.2.163 DST=134.76.13.21 LEN=52 TOS=0x10 PREC=0x00 TTL=64 ID=35422 DF PROTO=TCP SPT=58613 DPT=80 WINDOW=86 RES=0x00 ACK FIN URGP=0 UID=25121 GID=100 
> [491420.063619] IN= OUT=tun0 SRC=134.76.2.163 DST=134.76.13.21 LEN=52 TOS=0x10 PREC=0x00 TTL=64 ID=35423 DF PROTO=TCP SPT=58613 DPT=80 WINDOW=86 RES=0x00 ACK URGP=0 

Hmmm... skb->sk->sk_socket is really NULL ?

Are you sure it's not skb->sk->sk_socket->file which is NULL ?

In this case, you might need to use sock_i_uid() / sock_i_ino() as a
fallback ? (expensive because they take a rwlock)




--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] Prevent reading uninitialized memory with socket filters
From: Eric Dumazet @ 2010-11-10  5:53 UTC (permalink / raw)
  To: David Miller; +Cc: drosenberg, netdev, stable, security
In-Reply-To: <20101109.212838.193698340.davem@davemloft.net>

Le mardi 09 novembre 2010 à 21:28 -0800, David Miller a écrit :
> From: Dan Rosenberg <drosenberg@vsecurity.com>
> Date: Tue, 09 Nov 2010 17:28:44 -0500
> 
> > The "mem" array used as scratch space for socket filters is not
> > initialized, allowing unprivileged users to leak kernel stack bytes.
> > 
> > Signed-off-by: Dan Rosenberg <drosenberg@vsecurity.com>
> 
> Prove it.

And once done, add the checks in sk_chk_filter() ?

Allow a load of mem[X] only if a prior store of mem[X] is proven.




^ permalink raw reply

* Re: [PATCH net-2.6 1/3] vlan: Add function to retrieve EtherType from vlan packets.
From: Stephen Hemminger @ 2010-11-10  5:54 UTC (permalink / raw)
  To: Jesse Gross; +Cc: David Miller, netdev, Hao Zheng
In-Reply-To: <1289351344-14340-1-git-send-email-jesse@nicira.com>

On Tue,  9 Nov 2010 17:09:02 -0800
Jesse Gross <jesse@nicira.com> wrote:

> From: Hao Zheng <hzheng@nicira.com>
> 
> Depending on how a packet is vlan tagged (i.e. hardware accelerated or
> not), the encapsulated protocol is stored in different locations.  This
> provides a consistent method of accessing that protocol, which is needed
> by drivers, security checks, etc.
> 
> Signed-off-by: Hao Zheng <hzheng@nicira.com>
> Signed-off-by: Jesse Gross <jesse@nicira.com>
> ---
>  include/linux/if_vlan.h |   20 ++++++++++++++++++++
>  1 files changed, 20 insertions(+), 0 deletions(-)
> 
> diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
> index c2f3a72..ee06c52 100644
> --- a/include/linux/if_vlan.h
> +++ b/include/linux/if_vlan.h
> @@ -339,6 +339,26 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci)
>  	}
>  }
>  
> +/**
> + * vlan_get_protocol - get protocol EtherType.
> + * @skb: skbuff to query
> + *
> + * Returns the EtherType of the packet, regardless of whether it is
> + * vlan encapsulated (normal or hardware accelerated) or not.
> + */
> +static inline __be16 vlan_get_protocol(struct sk_buff *skb)
> +{
> +	__be16 protocol = 0;
> +
> +	if (vlan_tx_tag_present(skb) ||
> +	     skb->protocol != cpu_to_be16(ETH_P_8021Q))
> +		protocol = skb->protocol;
> +	else if (likely(pskb_may_pull(skb, VLAN_ETH_HLEN)))
> +		protocol = ((const struct vlan_ethhdr *)skb->data)->
> +			   h_vlan_encapsulated_proto;
> +
> +	return protocol;
> +}

This calls pskb_may_pull, which modifies the skb data
offsets and therefore could invalidate any caller's pointers
to the ip header or other fields.
Therefore you will need to audit all callers of this function!

Also, your code doesn't handle the case of too small a frame (VLAN header only).



-- 

^ permalink raw reply

* Re: [PATCH] net/dst: dst_dev_event() called after other notifiers
From: Eric Dumazet @ 2010-11-10  5:57 UTC (permalink / raw)
  To: David Miller; +Cc: greearb, netdev
In-Reply-To: <20101109.114853.193732360.davem@davemloft.net>

Le mardi 09 novembre 2010 à 11:48 -0800, David Miller a écrit :
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Tue, 09 Nov 2010 20:37:55 +0100
> 
> > [PATCH] net/dst: dst_dev_event() called after other notifiers
> 
> Nice, applied.
> 
> However, I had to apply this by hand:
> 
> >  static struct notifier_block dst_dev_notifier = {
> >  	.notifier_call  = dst_dev_event,
> > +	.priority = -10, /* must be called after other network notifiers */
> >  };
> 
> The character after ".notifier_call" in my tree is a TAB character but
> in your patch it is a sequence of spaces.  This isn't looking like the
> usual email corruption, because the leading TAB characters on these
> lines are properly there.
> 
> Please figure out why this happened so that it doesn't repeat in
> future patches :-)
> 

I am very sorry David, I had to run yesterday night and did a stupid
hand editing right before doing so. It was a human error, not a tool
error. Next time, I'll delay the patch to next day :)

Thanks !



^ permalink raw reply

* Re: warnings in 2.6.37-rc1+
From: Eric Dumazet @ 2010-11-10  5:59 UTC (permalink / raw)
  To: Norbert Preining; +Cc: linux-kernel, netdev
In-Reply-To: <20101110054948.GA16612@gamma.logic.tuwien.ac.at>

Le mercredi 10 novembre 2010 à 14:49 +0900, Norbert Preining a écrit :
> Hi all
> 
> (please keep in Cc, thanks)
> 
> [ 1592.320059] ------------[ cut here ]------------
> [ 1592.320077] WARNING: at net/ipv4/devinet.c:137 in_dev_finish_destroy+0x3d/0x6e()
> [ 1592.320083] Hardware name: VGN-Z11VN_B
> [ 1592.320088] Modules linked in: vboxnetadp vboxnetflt sco bnep rfcomm l2cap crc16 binfmt_misc dm_crypt dm_mod isofs btrfs zlib_deflate crc32c libcrc32c vfat fat hso fuse vboxdrv loop uinput snd_hda_codec_realtek snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi arc4 snd_rawmidi snd_seq_midi_event snd_seq iwlagn iwlcore mac80211 snd_timer btusb firewire_ohci firewire_core bluetooth snd_seq_device tpm_infineon sony_laptop snd soundcore crc_itu_t cfg80211 rfkill joydev snd_page_alloc
> [ 1592.320207] Pid: 0, comm: kworker/0:0 Tainted: P        W   2.6.37-rc1+ #3
> [ 1592.320213] Call Trace:
> [ 1592.320218]  <IRQ>  [<ffffffff81035075>] warn_slowpath_common+0x80/0x98
> [ 1592.320236]  [<ffffffff810350a2>] warn_slowpath_null+0x15/0x17
> [ 1592.320245]  [<ffffffff81352ede>] in_dev_finish_destroy+0x3d/0x6e
> [ 1592.320257]  [<ffffffff8132b5f7>] ipv4_dst_destroy+0x53/0x58
> [ 1592.320266]  [<ffffffff81314c26>] dst_destroy+0x78/0xd6
> [ 1592.320275]  [<ffffffff8132b340>] dst_free+0x1a/0x29
> [ 1592.320283]  [<ffffffff8132b358>] dst_rcu_free+0x9/0xb
> [ 1592.320292]  [<ffffffff8107b5fd>] __rcu_process_callbacks+0x173/0x265
> [ 1592.320301]  [<ffffffff8107b72e>] rcu_process_callbacks+0x3f/0x60
> [ 1592.320310]  [<ffffffff81039e1c>] __do_softirq+0x8f/0x140
> [ 1592.320322]  [<ffffffff810563fa>] ? tick_program_event+0x21/0x23
> [ 1592.320331]  [<ffffffff8100304c>] call_softirq+0x1c/0x28
> [ 1592.320339]  [<ffffffff81004c23>] do_softirq+0x33/0x68
> [ 1592.320347]  [<ffffffff8103a036>] irq_exit+0x36/0x8b
> [ 1592.320358]  [<ffffffff81019699>] smp_apic_timer_interrupt+0x88/0x96
> [ 1592.320366]  [<ffffffff81002b13>] apic_timer_interrupt+0x13/0x20
> [ 1592.320371]  <EOI>  [<ffffffff811ae0ef>] ? acpi_idle_enter_simple+0xc8/0xfa
> [ 1592.320389]  [<ffffffff811ae0ea>] ? acpi_idle_enter_simple+0xc3/0xfa
> [ 1592.320401]  [<ffffffff812e053c>] cpuidle_idle_call+0x9e/0xd6
> [ 1592.320408]  [<ffffffff81001484>] cpu_idle+0x56/0x9c
> [ 1592.320418]  [<ffffffff813816c8>] start_secondary+0x199/0x19d
> [ 1592.320426] ---[ end trace 5f7d0c35de1972f1 ]---
> 
> followed by a strange message:
> [ 1592.320431] Freeing alive in_device ffff88013e49b200
> 
> happened several times, starting after a wake up from suspend to ram.
> 
> Best wishes

Should be solved by a patch David will send to Linus in next round.

http://git.kernel.org/?p=linux/kernel/git/davem/net-2.6.git;a=commitdiff;h=18943d292facbc70e6a36fc62399ae833f64671b

Thanks





^ permalink raw reply

* Re: [Patch] Limit sysctl_tcp_mem and sysctl_udp_mem initializers to prevent integer overflows.
From: Robin Holt @ 2010-11-10  6:15 UTC (permalink / raw)
  To: eric.dumazet
  Cc: David Miller, holt, akpm, w, linux-kernel, netdev, kuznet, pekkas,
	jmorris, yoshfuji, kaber
In-Reply-To: <20101005.145055.63017205.davem@davemloft.net>

On Tue, Oct 05, 2010 at 02:50:55PM -0700, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Sat, 02 Oct 2010 15:22:16 +0200
> 
> > [PATCH] net: avoid limits overflow
> > 
> > Robin Holt tried to boot a 16TB machine and found some limits were
> > reached : sysctl_tcp_mem[2], sysctl_udp_mem[2]
> > 
> > We can switch infrastructure to use long "instead" of "int", now
> > atomic_long_t primitives are available for free.
> > 
> > Reported-by: Robin Holt <holt@sgi.com>
> > Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> 
> Eric please resubmit this when the sysctl fix is resolved.

It looks like the sysctl fix is upstream.  Has this been resubmitted
and I missed it?

Robin

^ permalink raw reply

* Re: [PATCH] Fix CAN info leak/minor heap overflow
From: Oliver Hartkopp @ 2010-11-10  6:52 UTC (permalink / raw)
  To: David Miller; +Cc: urs, netdev, drosenberg, security, torvalds
In-Reply-To: <20101109.090523.189685701.davem@davemloft.net>

On 09.11.2010 18:05, David Miller wrote:
> From: Oliver Hartkopp <socketcan@hartkopp.net>
> Date: Tue, 09 Nov 2010 08:52:21 +0100
> 
>> Once this patch is applied (and the procfs layout is changed anyway), i'd also
>> like to send a patch from my backlog that would extend the procfs output for
>> can-bcm with an additional drop counter.
> 
> I find this kind of discussion extremely disappointing.
> 
> All of this stuff you CAN guys do with procfs files and version
> strings is completely wrong and bogus.
> 
> Once you create a procfs file layout, you're basically stuck and you
> can at best only reasonably add new fields at the end, you can't
> really change existing fields.
> 
> And sysfs would have been a lot more appropriate, you could use
> attributes for each value you want to export and then just add new
> sysfs attributes when you want to export new values which has very
> clear semantics and backwards compatability implications.

I admit that with today's knowledge I would have done things differently.
But the network layer information bits have been always exposed in /proc/net
as it was in 2003 when we started the implementation on a 2.4.x kernel.
There are netdriver infos in sysfs but no netlayer entries.

From my point of view the only option is to improve the current
situation, which the posted patch does:

- remove kernel addresses that were only relevant at implementation time
- allow AF_CAN protocols to provide their own information due to their needs
- provide inode numbers that can be found in procfs at several places
  => improvements for developers in userspace & kernelspace

The patch has been discussed on SocketCAN ML and the filter entries have not
been identified as a problem for userspace tools. E.g. /proc/net/can/stats is
one of the entries that's used by userspace tools.

IMHO the patch improves the historic situation and fixes the useless leakage
of kernel addresses. Please consider applying these procfs changes.

Best regards,
Oliver

^ permalink raw reply

* Re: [PATCH net-2.6 1/3] vlan: Add function to retrieve EtherType from vlan packets.
From: Jesse Gross @ 2010-11-10  7:18 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev, Hao Zheng
In-Reply-To: <20101109215412.38412cff@nehalam>

On Tue, Nov 9, 2010 at 9:54 PM, Stephen Hemminger <shemminger@vyatta.com> wrote:
> On Tue,  9 Nov 2010 17:09:02 -0800
> Jesse Gross <jesse@nicira.com> wrote:
>
>> From: Hao Zheng <hzheng@nicira.com>
>>
>> Depending on how a packet is vlan tagged (i.e. hardware accelerated or
>> not), the encapsulated protocol is stored in different locations.  This
>> provides a consistent method of accessing that protocol, which is needed
>> by drivers, security checks, etc.
>>
>> Signed-off-by: Hao Zheng <hzheng@nicira.com>
>> Signed-off-by: Jesse Gross <jesse@nicira.com>
>> ---
>>  include/linux/if_vlan.h |   20 ++++++++++++++++++++
>>  1 files changed, 20 insertions(+), 0 deletions(-)
>>
>> diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
>> index c2f3a72..ee06c52 100644
>> --- a/include/linux/if_vlan.h
>> +++ b/include/linux/if_vlan.h
>> @@ -339,6 +339,26 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci)
>>       }
>>  }
>>
>> +/**
>> + * vlan_get_protocol - get protocol EtherType.
>> + * @skb: skbuff to query
>> + *
>> + * Returns the EtherType of the packet, regardless of whether it is
>> + * vlan encapsulated (normal or hardware accelerated) or not.
>> + */
>> +static inline __be16 vlan_get_protocol(struct sk_buff *skb)
>> +{
>> +     __be16 protocol = 0;
>> +
>> +     if (vlan_tx_tag_present(skb) ||
>> +          skb->protocol != cpu_to_be16(ETH_P_8021Q))
>> +             protocol = skb->protocol;
>> +     else if (likely(pskb_may_pull(skb, VLAN_ETH_HLEN)))
>> +             protocol = ((const struct vlan_ethhdr *)skb->data)->
>> +                        h_vlan_encapsulated_proto;
>> +
>> +     return protocol;
>> +}
>
> This this calls pskb_may_pull, which modifies the skb data
> offsets and therefore could invalidate any callers pointers
> to ip header or other fields.
> Therefore you will need to audit all callers of this function!

That's a good point.  I switched it to use skb_header_pointer()
instead, which is probably more efficient anyways and avoids the
potential for a problem.

>
> Also, your code doesn't handle the case of too small a frame (VLAN header only).

The goal is to get equivalence to checking skb->protocol, except to
handle vlan accelerated vs non-accelerated consistently.  In this
case, the caller would need to check the length of the protocol header
as appropriate.  If the packet claims to be a vlan frame and the
length is less than the size of a vlan header then we'll return 0,
which should be sufficient to avoid any protocol processing.

Thanks.

^ permalink raw reply

* Re: [PATCH] Prevent reading uninitialized memory with socket filters
From: Eric Dumazet @ 2010-11-10  7:22 UTC (permalink / raw)
  To: David Miller; +Cc: drosenberg, netdev, stable, security
In-Reply-To: <1289368423.2700.17.camel@edumazet-laptop>

Le mercredi 10 novembre 2010 à 06:53 +0100, Eric Dumazet a écrit :
> Le mardi 09 novembre 2010 à 21:28 -0800, David Miller a écrit :
> > From: Dan Rosenberg <drosenberg@vsecurity.com>
> > Date: Tue, 09 Nov 2010 17:28:44 -0500
> > 
> > > The "mem" array used as scratch space for socket filters is not
> > > initialized, allowing unprivileged users to leak kernel stack bytes.
> > > 
> > > Signed-off-by: Dan Rosenberg <drosenberg@vsecurity.com>
> > 
> > Prove it.
> 
> And once done, add the checks in sk_chk_filter() ?
> 
> Allow a load of mem[X] only if a prior store of mem[X] is proven.
> 
> 

This seems complex, and might fail on some valid filters.

What about the following patch then ?

[PATCH] filter: make sure filters dont read uninitialized memory

There is a possibility malicious users can get limited information about
uninitialized stack mem array. Even if sk_run_filter() result is bound
to packet length (0 .. 65535), we could imagine this can be used by
hostile user.

Initializing the mem[] array, as Dan Rosenberg suggested in his patch, is
expensive since most filters don't even use this array.

It's hard to do the filter validation in sk_chk_filter(), because of
the jumps. This might be done later.

In this patch, I use a bitmap (a single long var) so that only filters
using mem[] loads/stores pay the price of added security checks.

For other filters, additional cost is a single instruction.

Reported-by: Dan Rosenberg <drosenberg@vsecurity.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/core/filter.c |   10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 7beaec3..4d84dc2 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -117,10 +117,12 @@ unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int
 	u32 A = 0;			/* Accumulator */
 	u32 X = 0;			/* Index Register */
 	u32 mem[BPF_MEMWORDS];		/* Scratch Memory Store */
+	unsigned long memvalid = 0;
 	u32 tmp;
 	int k;
 	int pc;
 
+	BUILD_BUG_ON(BPF_MEMWORDS > BITS_PER_LONG);
 	/*
 	 * Process array of filter instructions.
 	 */
@@ -264,10 +266,12 @@ load_b:
 			X = fentry->k;
 			continue;
 		case BPF_S_LD_MEM:
-			A = mem[fentry->k];
+			A = (memvalid & (1UL << fentry->k)) ?
+				mem[fentry->k] : 0;
 			continue;
 		case BPF_S_LDX_MEM:
-			X = mem[fentry->k];
+			X = (memvalid & (1UL << fentry->k)) ?
+				mem[fentry->k] : 0;
 			continue;
 		case BPF_S_MISC_TAX:
 			X = A;
@@ -280,9 +284,11 @@ load_b:
 		case BPF_S_RET_A:
 			return A;
 		case BPF_S_ST:
+			memvalid |= 1UL << fentry->k;
 			mem[fentry->k] = A;
 			continue;
 		case BPF_S_STX:
+			memvalid |= 1UL << fentry->k;
 			mem[fentry->k] = X;
 			continue;
 		default:



^ permalink raw reply related

* Re: Loopback performance from kernel 2.6.12 to 2.6.37
From: Jesper Dangaard Brouer @ 2010-11-10  8:49 UTC (permalink / raw)
  To: Xose Vazquez Perez; +Cc: netdev
In-Reply-To: <4CD9BE9C.30003@gmail.com>

On Tue, 2010-11-09 at 22:35 +0100, Xose Vazquez Perez wrote:
> Jesper Dangaard Brouer wrote:
> 
> > To fix this I added "-q 0" to netcat.  Thus my working commands are:
> > 
> >  netcat -l -p 9999 >/dev/null &
> >  time dd if=/dev/zero bs=1M count=10000 | netcat -q0 127.0.0.1 9999
> > 
> > Running this on my "big" 10G testlab system, Dual Xeon 5550 2.67GHz,
> > kernel version 2.6.32-5-amd64 (which I usually don't use)
> > The results are 7.487 sec:
> 
> netcat flavor ?

Debian package netcat-traditional
 netcat version [v1.10-38]

From "aptitude show netcat-traditional":
 This is the "classic" netcat, written by *Hobbit*. It lacks many
features found in netcat-openbsd.

Didn't know there were that many flavors...

> http://nc110.sourceforge.net/
> http://nmap.org/ncat/
> http://www.dest-unreach.org/socat/
> http://cryptcat.sourceforge.net/
> http://netcat.sourceforge.net/
> http://www.openbsd.org/cgi-bin/cvsweb/src/usr.bin/nc/


-- 
Med venlig hilsen / Best regards
  Jesper Brouer
  ComX Networks A/S
  Linux Network Kernel Developer
  Cand. Scient Datalog / MSc.CS
  Author of http://adsl-optimizer.dk
  LinkedIn: http://www.linkedin.com/in/brouer


^ permalink raw reply

* Re: [PATCH v15 00/17] Provide a zero-copy method on KVM virtio-net.
From: xiaohui.xin @ 2010-11-10  9:23 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, herbert, jdike, davem; +Cc: Xin Xiaohui
In-Reply-To: <20101109.091516.112581012.davem@davemloft.net>

From: Xin Xiaohui <xiaohui.xin@intel.com>

>2) The idea to key off of skb->dev in skb_release_data() is
>   fundamentally flawed since many actions can change skb->dev on you,
>   which will end up causing a leak of your external data areas.

How about this one? If the destructor_arg is not a good candidate,
then I have to add an apparent field in shinfo.

Thanks
Xiaohui

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 10ba67d..ad4636e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -199,14 +199,15 @@ struct skb_shared_info {
 	struct sk_buff	*frag_list;
 	struct skb_shared_hwtstamps hwtstamps;
 
+	/* Intermediate layers must ensure that destructor_arg
+	 * remains valid until skb destructor */
+	void *		destructor_arg;
+
 	/*
 	 * Warning : all fields before dataref are cleared in __alloc_skb()
 	 */
 	atomic_t	dataref;
 
-	/* Intermediate layers must ensure that destructor_arg
-	 * remains valid until skb destructor */
-	void *		destructor_arg;
 	/* must be last field, see pskb_expand_head() */
 	skb_frag_t	frags[MAX_SKB_FRAGS];
 };
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c83b421..eb040f4 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -343,6 +343,13 @@ static void skb_release_data(struct sk_buff *skb)
 		if (skb_has_frags(skb))
 			skb_drop_fraglist(skb);
 
+		if (skb_shinfo(skb)->destructor_arg) {
+			struct skb_ext_page *ext_page =
+				skb_shinfo(skb)->destructor_arg;
+			if (ext_page->dtor)
+				ext_page->dtor(ext_page);
+		}
+
 		kfree(skb->head);
 	}
 }
-- 
1.7.3


^ permalink raw reply related

* [PATCH] net: avoid limits overflow
From: Eric Dumazet @ 2010-11-10  9:24 UTC (permalink / raw)
  To: Robin Holt
  Cc: David Miller, akpm, w, linux-kernel, netdev, kuznet, pekkas,
	jmorris, yoshfuji, kaber
In-Reply-To: <20101110061507.GF4330@sgi.com>

Le mercredi 10 novembre 2010 à 00:15 -0600, Robin Holt a écrit :
> On Tue, Oct 05, 2010 at 02:50:55PM -0700, David Miller wrote:
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Sat, 02 Oct 2010 15:22:16 +0200
> > 
> > > [PATCH] net: avoid limits overflow
> > > 
> > > Robin Holt tried to boot a 16TB machine and found some limits were
> > > reached : sysctl_tcp_mem[2], sysctl_udp_mem[2]
> > > 
> > > We can switch infrastructure to use long "instead" of "int", now
> > > atomic_long_t primitives are available for free.
> > > 
> > > Reported-by: Robin Holt <holt@sgi.com>
> > > Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> > 
> > Eric please resubmit this when the sysctl fix is resolved.
> 
> It looks like the sysctl fix is upstream.  Has this been resubmitted
> and I missed it?
> 
> Robin

I believe it was in Andrew's mm tree for a while, and Andrew sent it to
David on the 21st of October; I am not sure what happened.

Here it is again for the latest linux-2.6 tree, thanks for the heads-up!

[PATCH] net: avoid limits overflow

Robin Holt tried to boot a 16TB machine and found some limits were
reached : sysctl_tcp_mem[2], sysctl_udp_mem[2]

We can switch infrastructure to use long "instead" of "int", now
atomic_long_t primitives are available for free.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Reported-by: Robin Holt <holt@sgi.com>
Reviewed-by: Robin Holt <holt@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/net/dn.h               |    2 +-
 include/net/sock.h             |    4 ++--
 include/net/tcp.h              |    6 +++---
 include/net/udp.h              |    4 ++--
 net/core/sock.c                |   14 +++++++-------
 net/decnet/af_decnet.c         |    2 +-
 net/decnet/sysctl_net_decnet.c |    4 ++--
 net/ipv4/proc.c                |    8 ++++----
 net/ipv4/sysctl_net_ipv4.c     |    5 ++---
 net/ipv4/tcp.c                 |    4 ++--
 net/ipv4/tcp_input.c           |   11 +++++++----
 net/ipv4/udp.c                 |    4 ++--
 net/sctp/protocol.c            |    2 +-
 net/sctp/socket.c              |    4 ++--
 net/sctp/sysctl.c              |    4 ++--
 15 files changed, 40 insertions(+), 38 deletions(-)

diff --git a/include/net/dn.h b/include/net/dn.h
index e5469f7..a514a3c 100644
--- a/include/net/dn.h
+++ b/include/net/dn.h
@@ -225,7 +225,7 @@ extern int decnet_di_count;
 extern int decnet_dr_count;
 extern int decnet_no_fc_max_cwnd;
 
-extern int sysctl_decnet_mem[3];
+extern long sysctl_decnet_mem[3];
 extern int sysctl_decnet_wmem[3];
 extern int sysctl_decnet_rmem[3];
 
diff --git a/include/net/sock.h b/include/net/sock.h
index c7a7362..a6338d0 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -762,7 +762,7 @@ struct proto {
 
 	/* Memory pressure */
 	void			(*enter_memory_pressure)(struct sock *sk);
-	atomic_t		*memory_allocated;	/* Current allocated memory. */
+	atomic_long_t		*memory_allocated;	/* Current allocated memory. */
 	struct percpu_counter	*sockets_allocated;	/* Current number of sockets. */
 	/*
 	 * Pressure flag: try to collapse.
@@ -771,7 +771,7 @@ struct proto {
 	 * is strict, actions are advisory and have some latency.
 	 */
 	int			*memory_pressure;
-	int			*sysctl_mem;
+	long			*sysctl_mem;
 	int			*sysctl_wmem;
 	int			*sysctl_rmem;
 	int			max_header;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 4fee042..e36c874 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -224,7 +224,7 @@ extern int sysctl_tcp_fack;
 extern int sysctl_tcp_reordering;
 extern int sysctl_tcp_ecn;
 extern int sysctl_tcp_dsack;
-extern int sysctl_tcp_mem[3];
+extern long sysctl_tcp_mem[3];
 extern int sysctl_tcp_wmem[3];
 extern int sysctl_tcp_rmem[3];
 extern int sysctl_tcp_app_win;
@@ -247,7 +247,7 @@ extern int sysctl_tcp_cookie_size;
 extern int sysctl_tcp_thin_linear_timeouts;
 extern int sysctl_tcp_thin_dupack;
 
-extern atomic_t tcp_memory_allocated;
+extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
 extern int tcp_memory_pressure;
 
@@ -280,7 +280,7 @@ static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
 	}
 
 	if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
-	    atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])
+	    atomic_long_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])
 		return true;
 	return false;
 }
diff --git a/include/net/udp.h b/include/net/udp.h
index 200b828..bb967dd 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -105,10 +105,10 @@ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table,
 
 extern struct proto udp_prot;
 
-extern atomic_t udp_memory_allocated;
+extern atomic_long_t udp_memory_allocated;
 
 /* sysctl variables for udp */
-extern int sysctl_udp_mem[3];
+extern long sysctl_udp_mem[3];
 extern int sysctl_udp_rmem_min;
 extern int sysctl_udp_wmem_min;
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 3eed542..fb60801 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1653,10 +1653,10 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
 {
 	struct proto *prot = sk->sk_prot;
 	int amt = sk_mem_pages(size);
-	int allocated;
+	long allocated;
 
 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
-	allocated = atomic_add_return(amt, prot->memory_allocated);
+	allocated = atomic_long_add_return(amt, prot->memory_allocated);
 
 	/* Under limit. */
 	if (allocated <= prot->sysctl_mem[0]) {
@@ -1714,7 +1714,7 @@ suppress_allocation:
 
 	/* Alas. Undo changes. */
 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
-	atomic_sub(amt, prot->memory_allocated);
+	atomic_long_sub(amt, prot->memory_allocated);
 	return 0;
 }
 EXPORT_SYMBOL(__sk_mem_schedule);
@@ -1727,12 +1727,12 @@ void __sk_mem_reclaim(struct sock *sk)
 {
 	struct proto *prot = sk->sk_prot;
 
-	atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
+	atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
 		   prot->memory_allocated);
 	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
 
 	if (prot->memory_pressure && *prot->memory_pressure &&
-	    (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
+	    (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0]))
 		*prot->memory_pressure = 0;
 }
 EXPORT_SYMBOL(__sk_mem_reclaim);
@@ -2452,12 +2452,12 @@ static char proto_method_implemented(const void *method)
 
 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
 {
-	seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
+	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
 		   proto->name,
 		   proto->obj_size,
 		   sock_prot_inuse_get(seq_file_net(seq), proto),
-		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
+		   proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L,
 		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
 		   proto->max_header,
 		   proto->slab == NULL ? "no" : "yes",
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index d6b93d1..a76b78d 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -155,7 +155,7 @@ static const struct proto_ops dn_proto_ops;
 static DEFINE_RWLOCK(dn_hash_lock);
 static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE];
 static struct hlist_head dn_wild_sk;
-static atomic_t decnet_memory_allocated;
+static atomic_long_t decnet_memory_allocated;
 
 static int __dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen, int flags);
 static int __dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen, int flags);
diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c
index be3eb8e..28f8b5e 100644
--- a/net/decnet/sysctl_net_decnet.c
+++ b/net/decnet/sysctl_net_decnet.c
@@ -38,7 +38,7 @@ int decnet_log_martians = 1;
 int decnet_no_fc_max_cwnd = NSP_MIN_WINDOW;
 
 /* Reasonable defaults, I hope, based on tcp's defaults */
-int sysctl_decnet_mem[3] = { 768 << 3, 1024 << 3, 1536 << 3 };
+long sysctl_decnet_mem[3] = { 768 << 3, 1024 << 3, 1536 << 3 };
 int sysctl_decnet_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
 int sysctl_decnet_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
 
@@ -324,7 +324,7 @@ static ctl_table dn_table[] = {
 		.data = &sysctl_decnet_mem,
 		.maxlen = sizeof(sysctl_decnet_mem),
 		.mode = 0644,
-		.proc_handler = proc_dointvec,
+		.proc_handler = proc_doulongvec_minmax
 	},
 	{
 		.procname = "decnet_rmem",
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 4ae1f20..1b48eb1 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -59,13 +59,13 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
 	local_bh_enable();
 
 	socket_seq_show(seq);
-	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n",
+	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
 		   sock_prot_inuse_get(net, &tcp_prot), orphans,
 		   tcp_death_row.tw_count, sockets,
-		   atomic_read(&tcp_memory_allocated));
-	seq_printf(seq, "UDP: inuse %d mem %d\n",
+		   atomic_long_read(&tcp_memory_allocated));
+	seq_printf(seq, "UDP: inuse %d mem %ld\n",
 		   sock_prot_inuse_get(net, &udp_prot),
-		   atomic_read(&udp_memory_allocated));
+		   atomic_long_read(&udp_memory_allocated));
 	seq_printf(seq, "UDPLITE: inuse %d\n",
 		   sock_prot_inuse_get(net, &udplite_prot));
 	seq_printf(seq, "RAW: inuse %d\n",
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d96c1da..e91911d 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -398,7 +398,7 @@ static struct ctl_table ipv4_table[] = {
 		.data		= &sysctl_tcp_mem,
 		.maxlen		= sizeof(sysctl_tcp_mem),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= proc_doulongvec_minmax
 	},
 	{
 		.procname	= "tcp_wmem",
@@ -602,8 +602,7 @@ static struct ctl_table ipv4_table[] = {
 		.data		= &sysctl_udp_mem,
 		.maxlen		= sizeof(sysctl_udp_mem),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero
+		.proc_handler	= proc_doulongvec_minmax,
 	},
 	{
 		.procname	= "udp_rmem_min",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1664a05..245603c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -282,7 +282,7 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
 struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);
 
-int sysctl_tcp_mem[3] __read_mostly;
+long sysctl_tcp_mem[3] __read_mostly;
 int sysctl_tcp_wmem[3] __read_mostly;
 int sysctl_tcp_rmem[3] __read_mostly;
 
@@ -290,7 +290,7 @@ EXPORT_SYMBOL(sysctl_tcp_mem);
 EXPORT_SYMBOL(sysctl_tcp_rmem);
 EXPORT_SYMBOL(sysctl_tcp_wmem);
 
-atomic_t tcp_memory_allocated;	/* Current allocated memory. */
+atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
 EXPORT_SYMBOL(tcp_memory_allocated);
 
 /*
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3357f69..6d8ab1c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -259,8 +259,11 @@ static void tcp_fixup_sndbuf(struct sock *sk)
 	int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 +
 		     sizeof(struct sk_buff);
 
-	if (sk->sk_sndbuf < 3 * sndmem)
-		sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]);
+	if (sk->sk_sndbuf < 3 * sndmem) {
+		sk->sk_sndbuf = 3 * sndmem;
+		if (sk->sk_sndbuf > sysctl_tcp_wmem[2])
+			sk->sk_sndbuf = sysctl_tcp_wmem[2];
+	}
 }
 
 /* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
@@ -396,7 +399,7 @@ static void tcp_clamp_window(struct sock *sk)
 	if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
 	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
 	    !tcp_memory_pressure &&
-	    atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
+	    atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
 		sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
 				    sysctl_tcp_rmem[2]);
 	}
@@ -4861,7 +4864,7 @@ static int tcp_should_expand_sndbuf(struct sock *sk)
 		return 0;
 
 	/* If we are under soft global TCP memory pressure, do not expand.  */
-	if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
+	if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
 		return 0;
 
 	/* If we filled the congestion window, do not expand.  */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 28cb2d7..5e0a3a5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -110,7 +110,7 @@
 struct udp_table udp_table __read_mostly;
 EXPORT_SYMBOL(udp_table);
 
-int sysctl_udp_mem[3] __read_mostly;
+long sysctl_udp_mem[3] __read_mostly;
 EXPORT_SYMBOL(sysctl_udp_mem);
 
 int sysctl_udp_rmem_min __read_mostly;
@@ -119,7 +119,7 @@ EXPORT_SYMBOL(sysctl_udp_rmem_min);
 int sysctl_udp_wmem_min __read_mostly;
 EXPORT_SYMBOL(sysctl_udp_wmem_min);
 
-atomic_t udp_memory_allocated;
+atomic_long_t udp_memory_allocated;
 EXPORT_SYMBOL(udp_memory_allocated);
 
 #define MAX_UDP_PORTS 65536
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 1ef29c7..e58f947 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -92,7 +92,7 @@ static struct sctp_af *sctp_af_v6_specific;
 struct kmem_cache *sctp_chunk_cachep __read_mostly;
 struct kmem_cache *sctp_bucket_cachep __read_mostly;
 
-int sysctl_sctp_mem[3];
+long sysctl_sctp_mem[3];
 int sysctl_sctp_rmem[3];
 int sysctl_sctp_wmem[3];
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index e34ca9c..6bd5543 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -111,12 +111,12 @@ static void sctp_sock_migrate(struct sock *, struct sock *,
 static char *sctp_hmac_alg = SCTP_COOKIE_HMAC_ALG;
 
 extern struct kmem_cache *sctp_bucket_cachep;
-extern int sysctl_sctp_mem[3];
+extern long sysctl_sctp_mem[3];
 extern int sysctl_sctp_rmem[3];
 extern int sysctl_sctp_wmem[3];
 
 static int sctp_memory_pressure;
-static atomic_t sctp_memory_allocated;
+static atomic_long_t sctp_memory_allocated;
 struct percpu_counter sctp_sockets_allocated;
 
 static void sctp_enter_memory_pressure(struct sock *sk)
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 832590b..50cb57f 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -54,7 +54,7 @@ static int sack_timer_max = 500;
 static int addr_scope_max = 3; /* check sctp_scope_policy_t in include/net/sctp/constants.h for max entries */
 static int rwnd_scale_max = 16;
 
-extern int sysctl_sctp_mem[3];
+extern long sysctl_sctp_mem[3];
 extern int sysctl_sctp_rmem[3];
 extern int sysctl_sctp_wmem[3];
 
@@ -203,7 +203,7 @@ static ctl_table sctp_table[] = {
 		.data		= &sysctl_sctp_mem,
 		.maxlen		= sizeof(sysctl_sctp_mem),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_doulongvec_minmax
 	},
 	{
 		.procname	= "sctp_rmem",



^ permalink raw reply related

* Re: [PATCH 0/2] net: Changes in queue allocation and freeing
From: Eric Dumazet @ 2010-11-10 10:41 UTC (permalink / raw)
  To: Tom Herbert; +Cc: davem, netdev
In-Reply-To: <alpine.DEB.1.00.1011091240470.18565@pokey.mtv.corp.google.com>

Le mardi 09 novembre 2010 à 12:47 -0800, Tom Herbert a écrit :
> Changes to both RX and TX queue allocation.  In both cases allocate
> in alloc_netdev_mq and free in free_netdev.  For RX the reference
> counting also changed, the device reference count can now be used.

Oh well :)

Are these preliminary patches, so that XPS also doesn't need the "reference
counts specific to TX queues"? ;)




^ permalink raw reply

* Re: sk->sk_socket seems to disappear before connection termination
From: Jan Engelhardt @ 2010-11-10 10:53 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Howells, Netfilter Developer Mailing List, netdev,
	Rafał Maj
In-Reply-To: <1289368037.2700.14.camel@edumazet-laptop>

On Wednesday 2010-11-10 06:47, Eric Dumazet wrote:
>Le mercredi 10 novembre 2010 à 02:09 +0100, Jan Engelhardt a écrit :
>> Hi,
>> 
>> Rafał reported this to us on IRC, paraphrasing what has been observed:
>> 
>> Using a simple rule like `iptables -A OUTPUT -p tcp --dport 80 -j LOG 
>> --log-uid`, one can observe on creating a connection and terminating
>> it that the trailing packets have skb->sk->sk_socket == NULL.
>> Is this intended? Is the socket not retained until after TCP has
>> sent out the closing exchange?
>> 
>> As I can reproduce:
>> 
>> $ telnet 134.76.13.21 80
>> Trying 134.76.13.21...
>> Connected to 134.76.13.21.
>> Escape character is '^]'.
>> ^]
>> telnet> ^D
>> Connection closed.
>> 
>> [491419.500978] IN= OUT=tun0 SRC=134.76.2.163 DST=134.76.13.21 LEN=60 TOS=0x10 PREC=0x00 TTL=64 ID=35420 DF PROTO=TCP SPT=58613 DPT=80 WINDOW=5488 RES=0x00 SYN URGP=0 UID=25121 GID=100 
>> [491419.511533] IN= OUT=tun0 SRC=134.76.2.163 DST=134.76.13.21 LEN=52 TOS=0x10 PREC=0x00 TTL=64 ID=35421 DF PROTO=TCP SPT=58613 DPT=80 WINDOW=86 RES=0x00 ACK URGP=0 UID=25121 GID=100 
>> [491420.052182] IN= OUT=tun0 SRC=134.76.2.163 DST=134.76.13.21 LEN=52 TOS=0x10 PREC=0x00 TTL=64 ID=35422 DF PROTO=TCP SPT=58613 DPT=80 WINDOW=86 RES=0x00 ACK FIN URGP=0 UID=25121 GID=100 
>> [491420.063619] IN= OUT=tun0 SRC=134.76.2.163 DST=134.76.13.21 LEN=52 TOS=0x10 PREC=0x00 TTL=64 ID=35423 DF PROTO=TCP SPT=58613 DPT=80 WINDOW=86 RES=0x00 ACK URGP=0 
>
>Hmmm... skb->sk->sk_socket is really NULL ?
>Are you sure it's not skb->sk->sk_socket->file which is NULL ?

I am certain of it, having augmented ipt_LOG/xt_LOGMARK temporarily by 
appropriate printks.

>In this case, you might need to use sock_i_uid() / sock_i_ino() as a
>fallback ? (expensive because they take a rwlock)

No, sock_i_uid also uses sk->sk_socket. What is interesting though is 
that sock_i_uid uses SOCK_INODE(sk->sk_socket)->i_uid, but xt_owner uses 
sk->sk_socket->file->f_cred->fsuid. Would you have an idea as to why 
that is?
Dave Howells (cced) did the last change on it.

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [RFC 0/3] MPEG2/TS drop analyzer iptables match extension
From: Jesper Dangaard Brouer @ 2010-11-10 11:02 UTC (permalink / raw)
  To: Jan Engelhardt; +Cc: Netfilter Developers, Eric Dumazet, netdev, Solvik Blum
In-Reply-To: <alpine.LNX.2.01.1011041121170.16551@obet.zrqbmnf.qr>


On Thu, 4 Nov 2010, Jan Engelhardt wrote:
> On Thursday 2010-11-04 10:20, Jesper Dangaard Brouer wrote:
>> On Thu, 4 Nov 2010, Jan Engelhardt wrote:
>>>
>>> This now lives in the mp2t branch (since NFWS already actually) of xt-a,
>>> and I have taken the liberty to start updating it to higher standards.
>>> Please watch that branch, as I don't have any MPEG equipment around me
>>> to do runtime tests.
>>
>> Jan, I would actually like to maintain the source via my own git tree. And I
>> would gladly accept your patches against that tree.
>
> I do not mind who is hosting what parts, as git repos can be
> transferred easily, but I strongly suggest not to decouple xt_mp2t
> from (any clone of) the xtables-addons structure base, because doing
> so would bring you back to square one with regard to maintenance.
>
> I recognize you may dislike splitting up the IPTV codebase, so I
> propose that you make use of submodules, and have an Xt-a clone as
> one submodule. That would allow merging in both directions.

Well, I'm ready to maintain the module myself.  I know we talked (during 
Netfilter Workshop) about putting the module into the Xtables-addons 
git-tree, because the project had stalled.

But things have changed! - I now have some people/developers interested in 
the project, thus I'm taking an active maintainer role instead.

My plan is to keep the module "compatible" with the Xtables-addons build 
system.  That's why I make my module use your compat_* system from xt-a.
This way you can more easily copy the module into your tree, if I fail to 
maintain the module...

I have looked at your improvements in the "mp2t" branch of your tree. I'll 
integrate those changes/improvements in my tree, and give you the author 
credits in my tree.

(ps. this email got delayed on my postponed email outbox, sorry.  Have 
already had some offlist discussions with Jan)

Cheers,
   Jesper Brouer

--
-------------------------------------------------------------------
MSc. Master of Computer Science
Dept. of Computer Science, University of Copenhagen
Author of http://www.adsl-optimizer.dk
-------------------------------------------------------------------

^ permalink raw reply

* Re: [PATCH] Prevent reading uninitialized memory with socket filters
From: Dan Rosenberg @ 2010-11-10 11:12 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, stable, security
In-Reply-To: <20101109.212838.193698340.davem@davemloft.net>


> 
> Prove it.

I hope this was a joke.  In either case, my reply is a joke:

http://lists.grok.org.uk/pipermail/full-disclosure/2010-November/077321.html


^ permalink raw reply

* Re: Loopback performance from kernel 2.6.12 to 2.6.37
From: Jesper Dangaard Brouer @ 2010-11-10 11:24 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, acme
In-Reply-To: <1289313516.17448.28.camel@traveldev.cxnet.dk>

On Tue, 2010-11-09 at 15:38 +0100, Jesper Dangaard Brouer wrote:
> On Tue, 2010-11-09 at 15:16 +0100, Jesper Dangaard Brouer wrote:
> > On Tue, 2010-11-09 at 14:59 +0100, Jesper Dangaard Brouer wrote:
> > > On Mon, 2010-11-08 at 16:06 +0100, Eric Dumazet wrote:
> > > ...
> > 
> > To fix this I added "-q 0" to netcat.  Thus my working commands are:
> > 
> >  netcat -l -p 9999 >/dev/null &
> >  time dd if=/dev/zero bs=1M count=10000 | netcat -q0 127.0.0.1 9999
> > 
> > Running this on my "big" 10G testlab system, Dual Xeon 5550 2.67GHz,
> > kernel version 2.6.32-5-amd64 (which I usually don't use)
> > The results are 7.487 sec
> 
> Using kernel 2.6.35.8-comx01+ (which is 35-stable with some minor
> patches of my own) on the same type of hardware (our preprod server).
> The result is 12 sec.
> 
> time dd if=/dev/zero bs=1M count=10000 | netcat -q0 127.0.0.1 9999
> 10000+0 records in
> 10000+0 records out
> 10485760000 bytes (10 GB) copied, 12,0805 s, 868 MB/s
> 
> real    0m12.082s
> user    0m0.311s
> sys     0m15.896s

On the same system I can get better performance IF I pin the processes on
different CPUs. BUT the trick here is I choose CPUs with different "core
id", thus I avoid the HT CPUs in the system (hint look in /proc/cpuinfo
for choosing the CPUs).

Commands:
 taskset 16 netcat -lv -p 9999 >/dev/null &
 time taskset 1 dd if=/dev/zero bs=1M count=10000 | taskset 4 netcat -q0 127.0.0.1 9999

Result:
 10485760000 bytes (10 GB) copied, 8,74021 s, 1,2 GB/s
 real    0m8.742s
 user    0m0.208s
 sys     0m11.426s

So, perhaps the Core i7 has a problem with the HT CPUs with this
workload?

Forcing dd and netcat on the same HT CPU gives a result of approx 18
sec!

Commands:
 taskset 16 netcat -lv -p 9999 >/dev/null
 time taskset 1 dd if=/dev/zero bs=1M count=10000 | taskset 2 netcat -q0 127.0.0.1 9999

Result:
 10485760000 bytes (10 GB) copied, 18,6575 s, 562 MB/s
 real    0m18.659s
 user    0m0.341s
 sys     0m18.969s


> BUT perf top reveals that its probably related to the function
> 'find_busiest_group' ... any kernel config hints how I get rid of that?

The 'find_busiest_group' seems to be an artifact of "perf top", if I use
"perf record" then the 'find_busiest_group' function disappears.  Which
is kind of strange, as 'find_busiest_group' seems to be related to
sched_fair.c.

perf --version
perf version 2.6.35.7.1.g60d9c

-- 
Med venlig hilsen / Best regards
  Jesper Brouer
  ComX Networks A/S
  Linux Network Kernel Developer
  Cand. Scient Datalog / MSc.CS
  Author of http://adsl-optimizer.dk
  LinkedIn: http://www.linkedin.com/in/brouer


^ permalink raw reply

* Re: [PATCH 3/3] net: tipc: fix information leak to userland
From: walter harms @ 2010-11-10 11:58 UTC (permalink / raw)
  To: Vasiliy Kulikov
  Cc: David Miller, kernel-janitors, jon.maloy, allan.stephens,
	tipc-discussion, netdev, linux-kernel
In-Reply-To: <20101109203317.GA24933@albatros>



Am 09.11.2010 21:33, schrieb Vasiliy Kulikov:
> On Tue, Nov 09, 2010 at 09:26 -0800, David Miller wrote:
>> From: Vasiliy Kulikov <segooon@gmail.com>
>> Date: Sun, 31 Oct 2010 20:10:32 +0300
>>
>>> Structure sockaddr_tipc is copied to userland with padding bytes after
>>> "id" field in union field "name" uninitialized.  It leads to leaking of
>>> contents of kernel stack memory.  We have to initialize them to zero.
>>>
>>> Signed-off-by: Vasiliy Kulikov <segooon@gmail.com>
>>
>> Applied.
>>
>> Patches #1 and #2 were given feedback which I need you to integrate
>> and submit new patches based upon, thanks.
> 
> About #2:
> 
> I still think that this:
> 
>     if (dev)
>         strncpy(uaddr->sa_data, dev->name, 14);
>     else
>         memset(uaddr->sa_data, 0, 14);
> 
> is better than this:
> 
>     memset(uaddr->sa_data, 0, 14);
>     dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
>     if (dev)
>         strlcpy(uaddr->sa_data, dev->name, 15);
> 
> Isn't it?  Explicitly filling with zero on the same "if" level is
> slightly easier to read and understand.
> 

No problem with me. Since I came up with the idea, a simple explanation:
IMHO the pattern clear/if/copy is more robust.

NTL the core problem was that sizeof sa_data is 14 while dev->name is IFNAMESZ=15.


re,
 wh

^ permalink raw reply

* Re: [PATCH] ucc_geth: Fix hung tasks.
From: Joakim Tjernlund @ 2010-11-10 12:05 UTC (permalink / raw)
  Cc: Anton Vorontsov, netdev
In-Reply-To: <1289211819-21746-1-git-send-email-Joakim.Tjernlund@transmode.se>

Ping?

Even though this patch didn't solve my hang, it still fixes a real bug.

     Jocke

Joakim Tjernlund <Joakim.Tjernlund@transmode.se> wrote on 2010/11/08 11:23:39:

> From: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
> To: linuxppc-dev@lists.ozlabs.org, netdev@vger.kernel.org, Anton Vorontsov <avorontsov@ru.mvista.com>
> Cc: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
> Date: 2010/11/08 11:23
> Subject: [PATCH] ucc_geth: Fix hung tasks.
>
> We noticed a few hangs like this:
>
> INFO: task ifconfig:572 blocked for more than 120 seconds.
> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> ifconfig      D 0ff65760     0   572    369 0x00000000
> Call Trace:
> [c6157be0] [c6008460] 0xc6008460 (unreliable)
> [c6157ca0] [c0008608] __switch_to+0x4c/0x6c
> [c6157cb0] [c028fecc] schedule+0x184/0x310
> [c6157ce0] [c0290e54] __mutex_lock_slowpath+0xa4/0x150
> [c6157d20] [c0290c48] mutex_lock+0x44/0x48
> [c6157d30] [c01aba74] phy_stop+0x20/0x70
> [c6157d40] [c01aef40] ucc_geth_stop+0x30/0x98
> [c6157d60] [c01b18fc] ucc_geth_close+0x9c/0xdc
> [c6157d80] [c01db0cc] __dev_close+0xa0/0xd0
> [c6157d90] [c01deddc] __dev_change_flags+0x8c/0x148
> [c6157db0] [c01def54] dev_change_flags+0x1c/0x64
> [c6157dd0] [c0237ac8] devinet_ioctl+0x678/0x784
> [c6157e50] [c0239a58] inet_ioctl+0xb0/0xbc
> [c6157e60] [c01cafa8] sock_ioctl+0x174/0x2a0
> [c6157e80] [c009a16c] vfs_ioctl+0xcc/0xe0
> [c6157ea0] [c009a998] do_vfs_ioctl+0xc4/0x79c
> [c6157f10] [c009b0b0] sys_ioctl+0x40/0x74
> [c6157f40] [c00117c4] ret_from_syscall+0x0/0x38
>
> I THINK this is due to a missing cancel_work_sync in the driver
> although we cannot be sure. I found this by comparing
> ucc_geth with gianfar.
>
> Signed-off-by: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
> ---
>  drivers/net/ucc_geth.c |    1 +
>  1 files changed, 1 insertions(+), 0 deletions(-)
>
> diff --git a/drivers/net/ucc_geth.c b/drivers/net/ucc_geth.c
> index 97f9f7d..6647ed7 100644
> --- a/drivers/net/ucc_geth.c
> +++ b/drivers/net/ucc_geth.c
> @@ -3556,6 +3556,7 @@ static int ucc_geth_close(struct net_device *dev)
>
>     napi_disable(&ugeth->napi);
>
> +   cancel_work_sync(&ugeth->timeout_work);
>     ucc_geth_stop(ugeth);
>
>     free_irq(ugeth->ug_info->uf_info.irq, ugeth->ndev);
> --
> 1.7.2.2
>


^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox