netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH]  802.1Q VLAN
@ 2004-10-22 21:07 Ben Greear
  2004-10-22 21:46 ` Francois Romieu
  0 siblings, 1 reply; 22+ messages in thread
From: Ben Greear @ 2004-10-22 21:07 UTC (permalink / raw)
  To: 'netdev@oss.sgi.com', Linux 802.1Q VLAN

Signed off by:  Ben Greear <greearb@candelatech.com>

This patch brings the 2.6.9 802.1Q VLAN code up to date with 2.4.27.  It also
includes better return logic from the hard_start_xmit hook to allow
back pressure up the networking stack.

Comments welcome.

Thanks,
Ben

--- linux-2.6.9/net/8021q/vlan_dev.c	2004-10-18 14:55:07.000000000 -0700
+++ linux-2.6.9.p4s/net/8021q/vlan_dev.c	2004-10-22 12:14:24.000000000 -0700
@@ -1,4 +1,4 @@
-/*
+/* -*- linux-c -*-
   * INET		802.1Q VLAN
   *		Ethernet-type device handling.
   *
@@ -484,13 +484,32 @@
  	       veth->h_vlan_proto, veth->h_vlan_TCI, veth->h_vlan_encapsulated_proto);
  #endif

-	stats->tx_packets++; /* for statics only */
-	stats->tx_bytes += skb->len;
-
  	skb->dev = VLAN_DEV_INFO(dev)->real_dev;
-	dev_queue_xmit(skb);

-	return 0;
+	{
+		/* Please note, dev_queue_xmit consumes the pkt regardless of the
+		 * error value.  So, will copy the skb first and free if successful.
+		 */
+		struct sk_buff* skb2 = skb_get(skb);
+		int rv = dev_queue_xmit(skb2);
+		if (rv != 0) {
+			/* The skb memory should still be valid since we made a copy,
+			 * so can return error code here.
+			 */
+			return rv;
+		}
+		else {
+			/* Was success, need to free the skb reference since we bumped up the
+			 * user count above.
+			 */
+
+			stats->tx_packets++; /* for statics only */
+			stats->tx_bytes += skb->len;
+
+			kfree_skb(skb);
+			return 0;
+		}
+	}
  }

  int vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -622,7 +641,57 @@
  	return -EINVAL;
  }

+
+int vlan_dev_get_realdev_name(const char *dev_name, char* result)
+{
+	struct net_device *dev = dev_get_by_name(dev_name);
+	int rv = 0;
+	if (dev) {
+		if (dev->priv_flags & IFF_802_1Q_VLAN) {
+			strncpy(result, VLAN_DEV_INFO(dev)->real_dev->name, 23);
+			dev_put(dev);
+			rv = 0;
+		} else {
+			/*printk(KERN_ERR
+			  "%s: %s is not a vlan device, priv_flags: %hX.\n",
+			  __FUNCTION__, dev->name, dev->priv_flags);*/
+			dev_put(dev);
+			rv = -EINVAL;
+		}
+	} else {
+		/* printk(KERN_ERR       "%s: Could not find device: %s\n",
+		   __FUNCTION__, dev_name); */
+		rv = -ENODEV;
+	}
+	return rv;
+}
+
+int vlan_dev_get_vid(const char *dev_name, unsigned short* result)
+{
+	struct net_device *dev = dev_get_by_name(dev_name);
+	int rv = 0;
+	if (dev) {
+		if (dev->priv_flags & IFF_802_1Q_VLAN) {
+			*result = VLAN_DEV_INFO(dev)->vlan_id;
+			dev_put(dev);
+			rv = 0;
+		} else {
+			/*printk(KERN_ERR
+			  "%s: %s is not a vlan device, priv_flags: %hX.\n",
+			  __FUNCTION__, dev->name, dev->priv_flags);*/
+			dev_put(dev);
+			rv = -EINVAL;
+		}
+	} else {
+		/* printk(KERN_ERR       "%s: Could not find device: %s\n",
+		   __FUNCTION__, dev_name);*/
+		rv = -ENODEV;
+	}
+	return rv;
+}
+
+
  int vlan_dev_set_mac_address(struct net_device *dev, void *addr_struct_p)
  {
  	struct sockaddr *addr = (struct sockaddr *)(addr_struct_p);
--- linux-2.6.9/net/8021q/vlan.c	2004-10-18 14:54:55.000000000 -0700
+++ linux-2.6.9.p4s/net/8021q/vlan.c	2004-10-22 12:14:24.000000000 -0700
@@ -1,4 +1,4 @@
-/*
+/* -*- linux-c -*-
   * INET		802.1Q VLAN
   *		Ethernet-type device handling.
   *
@@ -646,15 +646,9 @@
  static int vlan_ioctl_handler(void __user *arg)
  {
  	int err = 0;
+	unsigned short vid = 0;
  	struct vlan_ioctl_args args;

-	/* everything here needs root permissions, except aguably the
-	 * hack ioctls for sending packets.  However, I know _I_ don't
-	 * want users running that on my network! --BLG
-	 */
-	if (!capable(CAP_NET_ADMIN))
-		return -EPERM;
-
  	if (copy_from_user(&args, arg, sizeof(struct vlan_ioctl_args)))
  		return -EFAULT;

@@ -668,24 +662,32 @@

  	switch (args.cmd) {
  	case SET_VLAN_INGRESS_PRIORITY_CMD:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
  		err = vlan_dev_set_ingress_priority(args.device1,
  						    args.u.skb_priority,
  						    args.vlan_qos);
  		break;

  	case SET_VLAN_EGRESS_PRIORITY_CMD:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
  		err = vlan_dev_set_egress_priority(args.device1,
  						   args.u.skb_priority,
  						   args.vlan_qos);
  		break;

  	case SET_VLAN_FLAG_CMD:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
  		err = vlan_dev_set_vlan_flag(args.device1,
  					     args.u.flag,
  					     args.vlan_qos);
  		break;

  	case SET_VLAN_NAME_TYPE_CMD:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
  		if ((args.u.name_type >= 0) &&
  		    (args.u.name_type < VLAN_NAME_TYPE_HIGHEST)) {
  			vlan_name_type = args.u.name_type;
@@ -695,17 +697,9 @@
  		}
  		break;

-		/* TODO:  Figure out how to pass info back...
-		   case GET_VLAN_INGRESS_PRIORITY_IOCTL:
-		   err = vlan_dev_get_ingress_priority(args);
-		   break;
-
-		   case GET_VLAN_EGRESS_PRIORITY_IOCTL:
-		   err = vlan_dev_get_egress_priority(args);
-		   break;
-		*/
-
  	case ADD_VLAN_CMD:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
  		/* we have been given the name of the Ethernet Device we want to
  		 * talk to:  args.dev1	 We also have the
  		 * VLAN ID:  args.u.VID
@@ -718,13 +712,52 @@
  		break;

  	case DEL_VLAN_CMD:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
  		/* Here, the args.dev1 is the actual VLAN we want
  		 * to get rid of.
  		 */
  		err = unregister_vlan_device(args.device1);
  		break;

+	case GET_VLAN_INGRESS_PRIORITY_CMD:
+		/* TODO:  Implement
+		   err = vlan_dev_get_ingress_priority(args);
+		   if (copy_to_user((void*)arg, &args,
+		        sizeof(struct vlan_ioctl_args))) {
+		        err = -EFAULT;
+		   }
+		*/
+		err = -EINVAL;
+		break;
+	case GET_VLAN_EGRESS_PRIORITY_CMD:
+		/* TODO:  Implement
+		   err = vlan_dev_get_egress_priority(args.device1, &(args.args);
+		   if (copy_to_user((void*)arg, &args,
+		        sizeof(struct vlan_ioctl_args))) {
+		        err = -EFAULT;
+		   }
+		*/
+		err = -EINVAL;
+		break;
+	case GET_VLAN_REALDEV_NAME_CMD:
+		err = vlan_dev_get_realdev_name(args.device1, args.u.device2);
+		if (copy_to_user((void*)arg, &args,
+				 sizeof(struct vlan_ioctl_args))) {
+			err = -EFAULT;
+		}
+		break;
+
+	case GET_VLAN_VID_CMD:
+		err = vlan_dev_get_vid(args.device1, &vid);
+		args.u.VID = vid;
+		if (copy_to_user((void*)arg, &args,
+				 sizeof(struct vlan_ioctl_args))) {
+                      err = -EFAULT;
+		}
+		break;
+
  	default:
  		/* pass on to underlying device instead?? */
  		printk(VLAN_DBG "%s: Unknown VLAN CMD: %x \n",
--- linux-2.6.9/net/8021q/vlan.h	2004-10-18 14:54:37.000000000 -0700
+++ linux-2.6.9.p4s/net/8021q/vlan.h	2004-10-22 12:14:24.000000000 -0700
@@ -66,7 +66,9 @@
  int vlan_dev_set_ingress_priority(char* dev_name, __u32 skb_prio, short vlan_prio);
  int vlan_dev_set_egress_priority(char* dev_name, __u32 skb_prio, short vlan_prio);
  int vlan_dev_set_vlan_flag(char* dev_name, __u32 flag, short flag_val);
+int vlan_dev_get_realdev_name(const char* dev_name, char* result);
+int vlan_dev_get_vid(const char* dev_name, unsigned short* result);
  void vlan_dev_set_multicast_list(struct net_device *vlan_dev);

  #endif /* !(__BEN_VLAN_802_1Q_INC__) */
--- linux-2.6.9/include/linux/if_vlan.h	2004-10-18 14:53:43.000000000 -0700
+++ linux-2.6.9.p4s/include/linux/if_vlan.h	2004-10-22 12:14:24.000000000 -0700
@@ -366,7 +366,9 @@
  	GET_VLAN_INGRESS_PRIORITY_CMD,
  	GET_VLAN_EGRESS_PRIORITY_CMD,
  	SET_VLAN_NAME_TYPE_CMD,
-	SET_VLAN_FLAG_CMD
+	SET_VLAN_FLAG_CMD,
+        GET_VLAN_REALDEV_NAME_CMD, /* If this works, you know it's a VLAN device, btw */
+        GET_VLAN_VID_CMD /* Get the VID of this VLAN (specified by name) */
  };

  enum vlan_name_types {



-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH]  802.1Q VLAN
  2004-10-22 21:07 [PATCH] 802.1Q VLAN Ben Greear
@ 2004-10-22 21:46 ` Francois Romieu
  2004-10-22 22:09   ` Ben Greear
  0 siblings, 1 reply; 22+ messages in thread
From: Francois Romieu @ 2004-10-22 21:46 UTC (permalink / raw)
  To: Ben Greear; +Cc: 'netdev@oss.sgi.com', Linux 802.1Q VLAN

Minor nits below.

[...]
> @@ -484,13 +484,32 @@
>  	       veth->h_vlan_proto, veth->h_vlan_TCI, 
>  	       veth->h_vlan_encapsulated_proto);
>  #endif
> 
> -	stats->tx_packets++; /* for statics only */
> -	stats->tx_bytes += skb->len;
> -
>  	skb->dev = VLAN_DEV_INFO(dev)->real_dev;
> -	dev_queue_xmit(skb);
> 
> -	return 0;
> +	{
> +		/* Please note, dev_queue_xmit consumes the pkt regardless of the
> +		 * error value.  So, will copy the skb first and free if successful.
> +		 */
> +		struct sk_buff* skb2 = skb_get(skb);
> +		int rv = dev_queue_xmit(skb2);
> +		if (rv != 0) {
> +			/* The skb memory should still be valid since we made a copy,
> +			 * so can return error code here.
> +			 */
> +			return rv;
> +		}
> +		else {
> +			/* Was success, need to free the skb reference since we bumped up the
> +			 * user count above.
> +			 */
> +
> +			stats->tx_packets++; /* for statics only */
> +			stats->tx_bytes += skb->len;
> +
> +			kfree_skb(skb);
> +			return 0;
> +		}
> +	}


Why not use a single return point, say:

		struct sk_buff *skb2 = skb_get(skb);
		int rv = dev_queue_xmit(skb2);

		if (!rv) {
			/* 
			 * Was success, need to free the skb reference since 
			 * we bumped up the user count above.
			 */

			stats->tx_packets++; /* for statics only */
			stats->tx_bytes += skb->len;

			kfree_skb(skb);
		}
		return rv;

vlan_dev_get_realdev_name() and vlan_dev_get_vid() can be tidy up 
a bit (factoring dev_put() or handling debug code differently for instance).

[...]
> --- linux-2.6.9/include/linux/if_vlan.h	2004-10-18 
> 14:53:43.000000000 -0700
> +++ linux-2.6.9.p4s/include/linux/if_vlan.h	2004-10-22 
> 12:14:24.000000000 -0700
> @@ -366,7 +366,9 @@
>  	GET_VLAN_INGRESS_PRIORITY_CMD,
>  	GET_VLAN_EGRESS_PRIORITY_CMD,
>  	SET_VLAN_NAME_TYPE_CMD,
> -	SET_VLAN_FLAG_CMD
> +	SET_VLAN_FLAG_CMD,
> +        GET_VLAN_REALDEV_NAME_CMD, /* If this works, you know it's a VLAN device, btw */
> +        GET_VLAN_VID_CMD /* Get the VID of this VLAN (specified by name) */
>  };

Whitespace attacks.


My 0.02 U+20AC

--
Ueimor

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH]  802.1Q VLAN
  2004-10-22 21:46 ` Francois Romieu
@ 2004-10-22 22:09   ` Ben Greear
  2004-10-23  0:24     ` Francois Romieu
  2004-10-25 20:51     ` Ben Greear
  0 siblings, 2 replies; 22+ messages in thread
From: Ben Greear @ 2004-10-22 22:09 UTC (permalink / raw)
  To: Francois Romieu; +Cc: 'netdev@oss.sgi.com', Linux 802.1Q VLAN

Francois Romieu wrote:
> Minor nits below.
> 
> [...]
> 
>>@@ -484,13 +484,32 @@
>> 	       veth->h_vlan_proto, veth->h_vlan_TCI, 
>> 	       veth->h_vlan_encapsulated_proto);
>> #endif
>>
>>-	stats->tx_packets++; /* for statics only */
>>-	stats->tx_bytes += skb->len;
>>-
>> 	skb->dev = VLAN_DEV_INFO(dev)->real_dev;
>>-	dev_queue_xmit(skb);
>>
>>-	return 0;
>>+	{
>>+		/* Please note, dev_queue_xmit consumes the pkt regardless of the
>>+		 * error value.  So, will copy the skb first and free if successful.
>>+		 */
>>+		struct sk_buff* skb2 = skb_get(skb);
>>+		int rv = dev_queue_xmit(skb2);
>>+		if (rv != 0) {
>>+			/* The skb memory should still be valid since we made a copy,
>>+			 * so can return error code here.
>>+			 */
>>+			return rv;
>>+		}
>>+		else {
>>+			/* Was success, need to free the skb reference since we bumped up the
>>+			 * user count above.
>>+			 */
>>+
>>+			stats->tx_packets++; /* for statics only */
>>+			stats->tx_bytes += skb->len;
>>+
>>+			kfree_skb(skb);
>>+			return 0;
>>+		}
>>+	}
> 
> 
> 
> Why not use a single return point, say:
> 
> 		struct sk_buff *skb2 = skb_get(skb);
> 		int rv = dev_queue_xmit(skb2);
> 
> 		if (!rv) {
> 			/* 
> 			 * Was success, need to free the skb reference since 
> 			 * we bumped up the user count above.
> 			 */
> 
> 			stats->tx_packets++; /* for statics only */
> 			stats->tx_bytes += skb->len;
> 
> 			kfree_skb(skb);
> 		}
> 		return rv;

That should be OK.  I think I was worried that I would have to further
translate the error codes.  There appears to be no documentation on what
valid return values are for the dev_queue_xmit or the hard_start_xmit
method...  I think someone should add comments to the netdevice.h
file to specify the proper return codes (maybe even return an enum).

In my own code, I have this patch to dev.c.  I think it is correct,
but I could be wrong:

@@ -1253,6 +1272,17 @@
   *	A negative errno code is returned on a failure. A success does not
   *	guarantee the frame will be transmitted as it may be dropped due
   *	to congestion or traffic shaping.
+ *
+ * -----------------------------------------------------------------------------------
+ *      I notice this method can also return errors from the queue disciplines,
+ *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
+ *      be positive.
+ *
+ *      Regardless of the return value, the skb is consumed, so it is currently
+ *      impossible to retry a send to this method.  This implies that virtual devices,
+ *      such as VLANs, can ONLY return 0 from their hard_start_xmit method, making
+ *      it difficult to apply pressure back up the stack.
+ *          --Ben
   */

  int dev_queue_xmit(struct sk_buff *skb)


> vlan_dev_get_realdev_name() and vlan_dev_get_vid() can be tidy up 
> a bit (factoring dev_put() or handling debug code differently for instance).

Agreed, will fix this and the white-space issue.

Ben


-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH]  802.1Q VLAN
  2004-10-22 22:09   ` Ben Greear
@ 2004-10-23  0:24     ` Francois Romieu
  2004-10-25 20:51     ` Ben Greear
  1 sibling, 0 replies; 22+ messages in thread
From: Francois Romieu @ 2004-10-23  0:24 UTC (permalink / raw)
  To: Ben Greear; +Cc: 'netdev@oss.sgi.com'

(vlan cc: bounces)

Ben Greear <greearb@candelatech.com> :
[...]
> @@ -1253,6 +1272,17 @@
>   *	A negative errno code is returned on a failure. A success does not
>   *	guarantee the frame will be transmitted as it may be dropped due
>   *	to congestion or traffic shaping.
> + *
> + * 
> -----------------------------------------------------------------------------------
> + *      I notice this method can also return errors from the queue disciplines,
> + *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
> + *      be positive.
> + *
> + *      Regardless of the return value, the skb is consumed, so it is currently
> + *      impossible to retry a send to this method.  This implies that virtual devices,
> + *      such as VLANs, can ONLY return 0 from their hard_start_xmit method, making
> + *      it difficult to apply pressure back up the stack.
> + *          --Ben
>   */

Afaikr it is not true anymore with the previous code as the congestion/error
status is propagated and the extra skb_get() balances the kfree_skb() on error
in dev_queue_xmit. So I'd say that vlan_dev_hard_start_xmit() seems to behave
like a normal start_xmit() instead.

--
Ueimor

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH]  802.1Q VLAN
  2004-10-22 22:09   ` Ben Greear
  2004-10-23  0:24     ` Francois Romieu
@ 2004-10-25 20:51     ` Ben Greear
  2004-10-25 23:56       ` Ben Greear
  2004-10-28 23:40       ` Tommy Christensen
  1 sibling, 2 replies; 22+ messages in thread
From: Ben Greear @ 2004-10-25 20:51 UTC (permalink / raw)
  To: 'netdev@oss.sgi.com'; +Cc: Linux 802.1Q VLAN, Francois Romieu

[-- Attachment #1: Type: text/plain, Size: 362 bytes --]


Ok, I believe I have fixed all the suggestions offered by Francois (thanks!).

Please see what you think of this updated patch.  The changes have
been compile tested...and will pass some data in just a few minutes to
make sure nothing funny happens...

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com


[-- Attachment #2: vlan.patch --]
[-- Type: text/plain, Size: 6730 bytes --]

--- linux-2.6.9/net/8021q/vlan_dev.c	2004-10-18 14:55:07.000000000 -0700
+++ linux-2.6.9.p4s/net/8021q/vlan_dev.c	2004-10-25 13:38:32.779294920 -0700
@@ -1,4 +1,4 @@
-/*
+/* -*- linux-c -*-
  * INET		802.1Q VLAN
  *		Ethernet-type device handling.
  *
@@ -484,13 +484,26 @@
 	       veth->h_vlan_proto, veth->h_vlan_TCI, veth->h_vlan_encapsulated_proto);
 #endif
 
-	stats->tx_packets++; /* for statics only */
-	stats->tx_bytes += skb->len;
-
 	skb->dev = VLAN_DEV_INFO(dev)->real_dev;
-	dev_queue_xmit(skb);
 
-	return 0;
+	{
+		/* Please note, dev_queue_xmit consumes the pkt regardless of the
+		 * error value.  So, will copy the skb first and free if successful.
+		 */
+		struct sk_buff* skb2 = skb_get(skb);
+		int rv = dev_queue_xmit(skb2);
+		if (rv == 0) {
+			/* Was success, need to free the skb reference since we bumped up the
+			 * user count above.
+			 */
+
+			stats->tx_packets++; /* for statics only */
+			stats->tx_bytes += skb->len;
+
+			kfree_skb(skb);
+		}
+		return rv;
+	}
 }
 
 int vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -622,7 +635,45 @@
 	return -EINVAL;
 }
 
+
+int vlan_dev_get_realdev_name(const char *dev_name, char* result)
+{
+	struct net_device *dev = dev_get_by_name(dev_name);
+	int rv = 0;
+	if (dev) {
+		if (dev->priv_flags & IFF_802_1Q_VLAN) {
+			strncpy(result, VLAN_DEV_INFO(dev)->real_dev->name, 23);
+			rv = 0;
+		} else {
+			rv = -EINVAL;
+		}
+		dev_put(dev);
+	} else {
+		rv = -ENODEV;
+	}
+	return rv;
+}
+
+int vlan_dev_get_vid(const char *dev_name, unsigned short* result)
+{
+	struct net_device *dev = dev_get_by_name(dev_name);
+	int rv = 0;
+	if (dev) {
+		if (dev->priv_flags & IFF_802_1Q_VLAN) {
+			*result = VLAN_DEV_INFO(dev)->vlan_id;
+			rv = 0;
+		} else {
+			rv = -EINVAL;
+		}
+		dev_put(dev);
+	} else {
+		rv = -ENODEV;
+	}
+	return rv;
+}
+
+
 int vlan_dev_set_mac_address(struct net_device *dev, void *addr_struct_p)
 {
 	struct sockaddr *addr = (struct sockaddr *)(addr_struct_p);
--- linux-2.6.9/net/8021q/vlan.c	2004-10-18 14:54:55.000000000 -0700
+++ linux-2.6.9.p4s/net/8021q/vlan.c	2004-10-22 12:14:24.000000000 -0700
@@ -1,4 +1,4 @@
-/*
+/* -*- linux-c -*-
  * INET		802.1Q VLAN
  *		Ethernet-type device handling.
  *
@@ -646,15 +646,9 @@
 static int vlan_ioctl_handler(void __user *arg)
 {
 	int err = 0;
+	unsigned short vid = 0;
 	struct vlan_ioctl_args args;
 
-	/* everything here needs root permissions, except aguably the
-	 * hack ioctls for sending packets.  However, I know _I_ don't
-	 * want users running that on my network! --BLG
-	 */
-	if (!capable(CAP_NET_ADMIN))
-		return -EPERM;
-
 	if (copy_from_user(&args, arg, sizeof(struct vlan_ioctl_args)))
 		return -EFAULT;
 
@@ -668,24 +662,32 @@
 
 	switch (args.cmd) {
 	case SET_VLAN_INGRESS_PRIORITY_CMD:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
 		err = vlan_dev_set_ingress_priority(args.device1,
 						    args.u.skb_priority,
 						    args.vlan_qos);
 		break;
 
 	case SET_VLAN_EGRESS_PRIORITY_CMD:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
 		err = vlan_dev_set_egress_priority(args.device1,
 						   args.u.skb_priority,
 						   args.vlan_qos);
 		break;
 
 	case SET_VLAN_FLAG_CMD:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
 		err = vlan_dev_set_vlan_flag(args.device1,
 					     args.u.flag,
 					     args.vlan_qos);
 		break;
 
 	case SET_VLAN_NAME_TYPE_CMD:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
 		if ((args.u.name_type >= 0) &&
 		    (args.u.name_type < VLAN_NAME_TYPE_HIGHEST)) {
 			vlan_name_type = args.u.name_type;
@@ -695,17 +697,9 @@
 		}
 		break;
 
-		/* TODO:  Figure out how to pass info back...
-		   case GET_VLAN_INGRESS_PRIORITY_IOCTL:
-		   err = vlan_dev_get_ingress_priority(args);
-		   break;
-
-		   case GET_VLAN_EGRESS_PRIORITY_IOCTL:
-		   err = vlan_dev_get_egress_priority(args);
-		   break;
-		*/
-
 	case ADD_VLAN_CMD:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
 		/* we have been given the name of the Ethernet Device we want to
 		 * talk to:  args.dev1	 We also have the
 		 * VLAN ID:  args.u.VID
@@ -718,13 +712,52 @@
 		break;
 
 	case DEL_VLAN_CMD:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
 		/* Here, the args.dev1 is the actual VLAN we want
 		 * to get rid of.
 		 */
 		err = unregister_vlan_device(args.device1);
 		break;
 
+	case GET_VLAN_INGRESS_PRIORITY_CMD:
+		/* TODO:  Implement
+		   err = vlan_dev_get_ingress_priority(args);
+		   if (copy_to_user((void*)arg, &args,
+		        sizeof(struct vlan_ioctl_args))) {
+		        err = -EFAULT;
+		   }
+		*/
+		err = -EINVAL;
+		break;
+	case GET_VLAN_EGRESS_PRIORITY_CMD:
+		/* TODO:  Implement
+		   err = vlan_dev_get_egress_priority(args.device1, &(args.args);
+		   if (copy_to_user((void*)arg, &args,
+		        sizeof(struct vlan_ioctl_args))) {
+		        err = -EFAULT;
+		   }
+		*/
+		err = -EINVAL;
+		break;
+	case GET_VLAN_REALDEV_NAME_CMD:
+		err = vlan_dev_get_realdev_name(args.device1, args.u.device2);
+		if (copy_to_user((void*)arg, &args,
+				 sizeof(struct vlan_ioctl_args))) {
+			err = -EFAULT;
+		}
+		break;
+
+	case GET_VLAN_VID_CMD:
+		err = vlan_dev_get_vid(args.device1, &vid);
+		args.u.VID = vid;
+		if (copy_to_user((void*)arg, &args,
+				 sizeof(struct vlan_ioctl_args))) {
+                      err = -EFAULT;
+		}
+		break;
+
 	default:
 		/* pass on to underlying device instead?? */
 		printk(VLAN_DBG "%s: Unknown VLAN CMD: %x \n",
--- linux-2.6.9/net/8021q/vlan.h	2004-10-18 14:54:37.000000000 -0700
+++ linux-2.6.9.p4s/net/8021q/vlan.h	2004-10-22 12:14:24.000000000 -0700
@@ -66,7 +66,9 @@
 int vlan_dev_set_ingress_priority(char* dev_name, __u32 skb_prio, short vlan_prio);
 int vlan_dev_set_egress_priority(char* dev_name, __u32 skb_prio, short vlan_prio);
 int vlan_dev_set_vlan_flag(char* dev_name, __u32 flag, short flag_val);
+int vlan_dev_get_realdev_name(const char* dev_name, char* result);
+int vlan_dev_get_vid(const char* dev_name, unsigned short* result);
 void vlan_dev_set_multicast_list(struct net_device *vlan_dev);
 
 #endif /* !(__BEN_VLAN_802_1Q_INC__) */
--- linux-2.6.9/include/linux/if_vlan.h	2004-10-18 14:53:43.000000000 -0700
+++ linux-2.6.9.p4s/include/linux/if_vlan.h	2004-10-22 15:05:52.000000000 -0700
@@ -1,4 +1,4 @@
-/*
+/* -*- linux-c -*-
  * VLAN		An implementation of 802.1Q VLAN tagging.
  *
  * Authors:	Ben Greear <greearb@candelatech.com>
@@ -366,7 +366,9 @@
 	GET_VLAN_INGRESS_PRIORITY_CMD,
 	GET_VLAN_EGRESS_PRIORITY_CMD,
 	SET_VLAN_NAME_TYPE_CMD,
-	SET_VLAN_FLAG_CMD
+	SET_VLAN_FLAG_CMD,
+	GET_VLAN_REALDEV_NAME_CMD, /* If this works, you know it's a VLAN device, btw */
+	GET_VLAN_VID_CMD /* Get the VID of this VLAN (specified by name) */
 };
 
 enum vlan_name_types {

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH]  802.1Q VLAN
  2004-10-25 20:51     ` Ben Greear
@ 2004-10-25 23:56       ` Ben Greear
  2004-10-27  1:02         ` David S. Miller
  2004-10-27 23:49         ` David S. Miller
  2004-10-28 23:40       ` Tommy Christensen
  1 sibling, 2 replies; 22+ messages in thread
From: Ben Greear @ 2004-10-25 23:56 UTC (permalink / raw)
  To: 'netdev@oss.sgi.com'; +Cc: David S. Miller

Ben Greear wrote:
> 
> Ok, I believe I have fixed all the suggestions offered by Francois 
> (thanks!).
> 
> Please see what you think of this updated patch.  The changes have
> been compile tested...and will pass some data in just a few minutes to
> make sure nothing funny happens...
> 
> Thanks,
> Ben

I've been beating on it with pktgen for a while, and it seems stable.

DaveM, please consider accepting this patch (it's in a previous email to netdev,
just ask if you want it re-sent.)

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH]  802.1Q VLAN
  2004-10-25 23:56       ` Ben Greear
@ 2004-10-27  1:02         ` David S. Miller
  2004-10-27 23:49         ` David S. Miller
  1 sibling, 0 replies; 22+ messages in thread
From: David S. Miller @ 2004-10-27  1:02 UTC (permalink / raw)
  To: Ben Greear; +Cc: netdev, davem

On Mon, 25 Oct 2004 16:56:10 -0700
Ben Greear <greearb@candelatech.com> wrote:

> DaveM, please consider accepting this patch (it's in a previous email to netdev,
> just ask if you want it re-sent.)

I'll review this either tonight or tomorrow afternoon.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH]  802.1Q VLAN
  2004-10-25 23:56       ` Ben Greear
  2004-10-27  1:02         ` David S. Miller
@ 2004-10-27 23:49         ` David S. Miller
  2004-10-28  1:28           ` Ben Greear
  1 sibling, 1 reply; 22+ messages in thread
From: David S. Miller @ 2004-10-27 23:49 UTC (permalink / raw)
  To: Ben Greear; +Cc: netdev

On Mon, 25 Oct 2004 16:56:10 -0700
Ben Greear <greearb@candelatech.com> wrote:

> DaveM, please consider accepting this patch (it's in a previous email to netdev,
> just ask if you want it re-sent.)

Applied, but I had to apply it by hand.  It gave a large reject.
For some reason, patch barfed at the end of the vlan_dev.c parts
and refused to interpreter the parts of the diffs for the other
files.

Did you try to apply that patch to a tree before sending it off?

Anyways, I put it in by hand nevertheless.

Also, what is up with the magic "23" constant in the get realdev
name function strncpy() call?

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH]  802.1Q VLAN
  2004-10-27 23:49         ` David S. Miller
@ 2004-10-28  1:28           ` Ben Greear
  2004-10-28  4:42             ` David S. Miller
  0 siblings, 1 reply; 22+ messages in thread
From: Ben Greear @ 2004-10-28  1:28 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev

David S. Miller wrote:
> On Mon, 25 Oct 2004 16:56:10 -0700
> Ben Greear <greearb@candelatech.com> wrote:
> 
> 
>>DaveM, please consider accepting this patch (it's in a previous email to netdev,
>>just ask if you want it re-sent.)
> 
> 
> Applied, but I had to apply it by hand.  It gave a large reject.
> For some reason, patch barfed at the end of the vlan_dev.c parts
> and refused to interpreter the parts of the diffs for the other
> files.
> 
> Did you try to apply that patch to a tree before sending it off?
> 
> Anyways, I put it in by hand nevertheless.

I appreciate it.  I had edited the original patch, removing some
diffs of files that were not vlan related.  I guess that doesn't
work so well.  Will be more careful next time.

> 
> Also, what is up with the magic "23" constant in the get realdev
> name function strncpy() call?

Well, the fields are 24-bytes long, as defined in the include/linux/if_vlan.h
file.  I could change it to a #define and/or use sizeof if you prefer.

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH]  802.1Q VLAN
  2004-10-28  1:28           ` Ben Greear
@ 2004-10-28  4:42             ` David S. Miller
  0 siblings, 0 replies; 22+ messages in thread
From: David S. Miller @ 2004-10-28  4:42 UTC (permalink / raw)
  To: Ben Greear; +Cc: netdev

On Wed, 27 Oct 2004 18:28:21 -0700
Ben Greear <greearb@candelatech.com> wrote:

> > Also, what is up with the magic "23" constant in the get realdev
> > name function strncpy() call?
> 
> Well, the fields are 24-bytes long, as defined in the include/linux/if_vlan.h
> file.  I could change it to a #define and/or use sizeof if you prefer.

You could also use "sizeof(obj->member)".

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH]  802.1Q VLAN
  2004-10-28 23:40       ` Tommy Christensen
@ 2004-10-28 23:35         ` David S. Miller
  2004-10-29  0:23         ` Ben Greear
  1 sibling, 0 replies; 22+ messages in thread
From: David S. Miller @ 2004-10-28 23:35 UTC (permalink / raw)
  To: Tommy Christensen; +Cc: greearb, netdev, vlan, romieu, davem

On Fri, 29 Oct 2004 01:40:59 +0200
Tommy Christensen <tommy.christensen@tpack.net> wrote:

> Dave, I think this part of the patch should be reverted until someone comes up with a
> general scheme for flow_control handling through virtual devices.

It's certainly more correct, I think, than the code which
was there previously.

But in a way you're right, this is inconsistent with how IP and all
sorts of other tunnels behave, and if we change things we should
do it across the board.

So I will revert this part of his changes for now.

Thanks.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH]  802.1Q VLAN
  2004-10-25 20:51     ` Ben Greear
  2004-10-25 23:56       ` Ben Greear
@ 2004-10-28 23:40       ` Tommy Christensen
  2004-10-28 23:35         ` David S. Miller
  2004-10-29  0:23         ` Ben Greear
  1 sibling, 2 replies; 22+ messages in thread
From: Tommy Christensen @ 2004-10-28 23:40 UTC (permalink / raw)
  To: Ben Greear
  Cc: 'netdev@oss.sgi.com', Linux 802.1Q VLAN, Francois Romieu,
	David S. Miller

Ben Greear wrote:
> 
> --- linux-2.6.9/net/8021q/vlan_dev.c	2004-10-18 14:55:07.000000000 -0700
> +++ linux-2.6.9.p4s/net/8021q/vlan_dev.c	2004-10-25 13:38:32.779294920 -0700
> @@ -1,4 +1,4 @@
> -/*
> +/* -*- linux-c -*-
>   * INET		802.1Q VLAN
>   *		Ethernet-type device handling.
>   *
> @@ -484,13 +484,26 @@
>  	       veth->h_vlan_proto, veth->h_vlan_TCI, veth->h_vlan_encapsulated_proto);
>  #endif
>  
> -	stats->tx_packets++; /* for statics only */
> -	stats->tx_bytes += skb->len;
> -
>  	skb->dev = VLAN_DEV_INFO(dev)->real_dev;
> -	dev_queue_xmit(skb);
>  
> -	return 0;
> +	{
> +		/* Please note, dev_queue_xmit consumes the pkt regardless of the
> +		 * error value.  So, will copy the skb first and free if successful.
> +		 */
> +		struct sk_buff* skb2 = skb_get(skb);
> +		int rv = dev_queue_xmit(skb2);
> +		if (rv == 0) {
> +			/* Was success, need to free the skb reference since we bumped up the
> +			 * user count above.
> +			 */
> +
> +			stats->tx_packets++; /* for statics only */
> +			stats->tx_bytes += skb->len;
> +
> +			kfree_skb(skb);
> +		}
> +		return rv;
> +	}
>  }
>  
>  int vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)

Hi Ben,

I am not happy with this change to vlan_dev_hard_start_xmit().

I certainly appreciate the idea of avoiding flow-control "black-holes", but that would
take more than just this change. You have to consider (at least) these issues:

  o It is considered an error if a queue-less device returns anything but zero from its
    hard_start_xmit() function (see dev_queue_xmit()).

  o So, lets add a tx queue to it. Sure, that would be nice. Now we can even do shaping
    and other fancy stuff. But then how do we manage netif_queue_stopped? Especially
    restarting the queue could be tricky.

  o But couldn't we skip netif_stop_queue() and just return NETDEV_TX_BUSY when congested?
    No, that would make the qdisc system "busy-retry" untill it succeeds. BAD.

  o It is unsafe to pass a shared skb to dev_queue_xmit() unless you control all the
    references yourself. (It will likely be enqueued on a list.)

And specifically for this patch:

  o The skb could be freed (replaced) in __vlan_put_tag(), so you cannot tell the caller
    to hang on to it.

  o If rv is NET_XMIT_CN (and probably also rv < 0) you have to return 0, in order to
    make the caller forget about this skb.


Dave, I think this part of the patch should be reverted until someone comes up with a
general scheme for flow_control handling through virtual devices.


-Tommy

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH]  802.1Q VLAN
  2004-10-28 23:40       ` Tommy Christensen
  2004-10-28 23:35         ` David S. Miller
@ 2004-10-29  0:23         ` Ben Greear
  2004-10-29  0:38           ` Krzysztof Halasa
  2004-10-29  8:29           ` Tommy Christensen
  1 sibling, 2 replies; 22+ messages in thread
From: Ben Greear @ 2004-10-29  0:23 UTC (permalink / raw)
  To: Tommy Christensen
  Cc: 'netdev@oss.sgi.com', Linux 802.1Q VLAN, Francois Romieu,
	David S. Miller

Tommy Christensen wrote:
> Ben Greear wrote:
> 

> Hi Ben,
> 
> I am not happy with this change to vlan_dev_hard_start_xmit().
> 
> I certainly appreciate the idea of avoiding flow-control "black-holes", 
> but that would
> take more than just this change. You have to consider (at least) these 
> issues:
> 
>  o It is considered an error if a queue-less device returns anything but 
> zero from its
>    hard_start_xmit() function (see dev_queue_xmit()).

This certainly was not clear to me.  The comments in dev_queue_xmit are
wrong about the return value (failure cases can be > zero too).  Are
there other errors or ommissions there?

What sorts of things go wrong if you do return an error here when you don't
have a queue?

>  o So, lets add a tx queue to it. Sure, that would be nice. Now we can 
> even do shaping
>    and other fancy stuff. But then how do we manage netif_queue_stopped? 
> Especially
>    restarting the queue could be tricky.

Right... it would probably be an O(N) thing to wake the queues for all virtual
devices on a physical device, and we certainly don't want to do that
often.  Maybe if you only tried to wake the blocked queues (ie, kept a list
of just blocked queues), then that would be less painful on average,
but the worst-case is still bad.

>  o But couldn't we skip netif_stop_queue() and just return 
> NETDEV_TX_BUSY when congested?
>    No, that would make the qdisc system "busy-retry" untill it succeeds. 
> BAD.
> 
>  o It is unsafe to pass a shared skb to dev_queue_xmit() unless you 
> control all the
>    references yourself. (It will likely be enqueued on a list.)

Since we either free the duplicate copy, or pass it to the queue and forget
about it, this last point does not matter in the patch I submitted, right?


> And specifically for this patch:
> 
>  o The skb could be freed (replaced) in __vlan_put_tag(), so you cannot 
> tell the caller
>    to hang on to it.

Yep, that is quite nasty...I had not noticed.  If I kept a copy of the original
pointer (using skb_get() to bump the reference) passed in,
that would fix this particular problem?

>  o If rv is NET_XMIT_CN (and probably also rv < 0) you have to return 0, 
> in order to
>    make the caller forget about this skb.

Is there a complete list of what return codes are possible?  Maybe we could make
it return an enum instead of an integer so we can more easily track these
sorts of things down??

Thanks for noticing!
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH]  802.1Q VLAN
  2004-10-29  0:23         ` Ben Greear
@ 2004-10-29  0:38           ` Krzysztof Halasa
  2004-10-29  8:29           ` Tommy Christensen
  1 sibling, 0 replies; 22+ messages in thread
From: Krzysztof Halasa @ 2004-10-29  0:38 UTC (permalink / raw)
  To: Ben Greear
  Cc: Tommy Christensen, 'netdev@oss.sgi.com',
	Linux 802.1Q VLAN, Francois Romieu, David S. Miller

Ben Greear <greearb@candelatech.com> writes:

> Right... it would probably be an O(N) thing to wake the queues for all virtual
> devices on a physical device, and we certainly don't want to do that
> often.  Maybe if you only tried to wake the blocked queues (ie, kept a list
> of just blocked queues), then that would be less painful on average,
> but the worst-case is still bad.

Not sure if we need multiple queues. I think one queue for one physical
device (a queue shared by all logical subdevices) would be enough
in this case.

Not sure how to do it, either. The semantics should be changed perhaps.


The same issue with Frame Relay logical devices.
-- 
Krzysztof Halasa

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH]  802.1Q VLAN
  2004-10-29  0:23         ` Ben Greear
  2004-10-29  0:38           ` Krzysztof Halasa
@ 2004-10-29  8:29           ` Tommy Christensen
  2004-10-29 17:45             ` Ben Greear
  1 sibling, 1 reply; 22+ messages in thread
From: Tommy Christensen @ 2004-10-29  8:29 UTC (permalink / raw)
  To: Ben Greear
  Cc: 'netdev@oss.sgi.com', Linux 802.1Q VLAN, Francois Romieu,
	David S. Miller

On Fri, 2004-10-29 at 02:23, Ben Greear wrote:
> >  o It is considered an error if a queue-less device returns anything but 
> > zero from its
> >    hard_start_xmit() function (see dev_queue_xmit()).
> 
> This certainly was not clear to me.  The comments in dev_queue_xmit are
> wrong about the return value (failure cases can be > zero too).  Are
> there other errors or ommissions there?

A return value > zero doesn't mean failure. It indicates congestion.

> What sorts of things go wrong if you do return an error here when you don't
> have a queue?

It is interpreted as a tx failure rather than congestion. So it doesn't
help the upper layers like you wanted it to.
And it spews out an error message.

> 
> >  o So, lets add a tx queue to it. Sure, that would be nice. Now we can 
> > even do shaping
> >    and other fancy stuff. But then how do we manage netif_queue_stopped? 
> > Especially
> >    restarting the queue could be tricky.
> 
> Right... it would probably be an O(N) thing to wake the queues for all virtual
> devices on a physical device, and we certainly don't want to do that
> often.  Maybe if you only tried to wake the blocked queues (ie, kept a list
> of just blocked queues), then that would be less painful on average,
> but the worst-case is still bad.

Yeah, we probably would need some sort of notification from the
qdisc of the underlying device when it can accept packets again.

> >  o But couldn't we skip netif_stop_queue() and just return 
> > NETDEV_TX_BUSY when congested?
> >    No, that would make the qdisc system "busy-retry" untill it succeeds. 
> > BAD.
> > 
> >  o It is unsafe to pass a shared skb to dev_queue_xmit() unless you 
> > control all the
> >    references yourself. (It will likely be enqueued on a list.)
> 
> Since we either free the duplicate copy, or pass it to the queue and forget
> about it, this last point does not matter in the patch I submitted, right?

Yes. This is the right way to do it. *Unless* the skb is already shared
when you receive it (e.g. from pktgen).

> > And specifically for this patch:
> > 
> >  o The skb could be freed (replaced) in __vlan_put_tag(), so you cannot 
> > tell the caller
> >    to hang on to it.
> 
> Yep, that is quite nasty...I had not noticed.  If I kept a copy of the original
> pointer (using skb_get() to bump the reference) passed in,
> that would fix this particular problem?

Yes, I would think so.

> >  o If rv is NET_XMIT_CN (and probably also rv < 0) you have to return 0, 
> > in order to
> >    make the caller forget about this skb.
> 
> Is there a complete list of what return codes are possible?  Maybe we could make
> it return an enum instead of an integer so we can more easily track these
> sorts of things down??

They are listed in netdevice.h - NET_XMIT_SUCCESS etc., and the usual
negative errno's.

> Thanks for noticing!
> Ben

-Tommy

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH]  802.1Q VLAN
  2004-10-29  8:29           ` Tommy Christensen
@ 2004-10-29 17:45             ` Ben Greear
  2004-10-29 23:37               ` Tommy Christensen
  0 siblings, 1 reply; 22+ messages in thread
From: Ben Greear @ 2004-10-29 17:45 UTC (permalink / raw)
  To: Tommy Christensen
  Cc: 'netdev@oss.sgi.com', Linux 802.1Q VLAN, Francois Romieu,
	David S. Miller

Tommy Christensen wrote:
> On Fri, 2004-10-29 at 02:23, Ben Greear wrote:
> 
>>> o It is considered an error if a queue-less device returns anything but 
>>>zero from its
>>>   hard_start_xmit() function (see dev_queue_xmit()).
>>
>>This certainly was not clear to me.  The comments in dev_queue_xmit are
>>wrong about the return value (failure cases can be > zero too).  Are
>>there other errors or ommissions there?
> 
> 
> A return value > zero doesn't mean failure. It indicates congestion.

Ok, but the skb is always deleted by the net_queue_xmit code if the
return is not zero?  The difference between a hard-start-xmit failure
on eth0 when the hardware-queue is full and having a rate-limiting
queue drop a packet is virtually identical to me....

>>What sorts of things go wrong if you do return an error here when you don't
>>have a queue?
> 
> It is interpreted as a tx failure rather than congestion. So it doesn't
> help the upper layers like you wanted it to.
> And it spews out an error message.

The e1000 and probably other NICs have failed hard_start_xmit for a long
time, and they are some of the most stable and high-performance NICs.
So, the upper layers must be handling it OK some how or another.

Can you point me to some code that takes a different action based on the
return values of dev_queue_xmit?  That may help me understand better.

>>> o So, lets add a tx queue to it. Sure, that would be nice. Now we can 
>>>even do shaping
>>>   and other fancy stuff. But then how do we manage netif_queue_stopped? 
>>>Especially
>>>   restarting the queue could be tricky.
>>
>>Right... it would probably be an O(N) thing to wake the queues for all virtual
>>devices on a physical device, and we certainly don't want to do that
>>often.  Maybe if you only tried to wake the blocked queues (ie, kept a list
>>of just blocked queues), then that would be less painful on average,
>>but the worst-case is still bad.
> 
> Yeah, we probably would need some sort of notification from the
> qdisc of the underlying device when it can accept packets again.

I did something like this for my non-busy-spin pktgen re-write and it
works fine with both VLANs and physical devices.  I just hooked
directly into this code in netdevice.h:

static inline void netif_wake_queue(struct net_device *dev)
{
#ifdef CONFIG_NETPOLL_TRAP
	if (netpoll_trap())
		return;
#endif
	if (test_and_clear_bit(__LINK_STATE_XOFF, &dev->state)) {
		__netif_schedule(dev);

                 if (dev->notify_queue_woken) {
                    dev->notify_queue_woken(dev);
                 }
         }
}

pktgen registers this hook on the physical device when it starts generating on
the physical device or any VLANs attached to it.  To make a scheme like this work
in general, we'd probably need a chain of callbacks instead of a single method
pointer...


>>> o But couldn't we skip netif_stop_queue() and just return 
>>>NETDEV_TX_BUSY when congested?
>>>   No, that would make the qdisc system "busy-retry" untill it succeeds. 
>>>BAD.
>>>
>>> o It is unsafe to pass a shared skb to dev_queue_xmit() unless you 
>>>control all the
>>>   references yourself. (It will likely be enqueued on a list.)
>>
>>Since we either free the duplicate copy, or pass it to the queue and forget
>>about it, this last point does not matter in the patch I submitted, right?
> 
> 
> Yes. This is the right way to do it. *Unless* the skb is already shared
> when you receive it (e.g. from pktgen).

You can't send shared skbs regardless, because the vlan Xmit changes the skb->dev at least, so
you just have to set the multi-skb setting in pktgen to 0 so that it does not
share when using VLANs.

>>>And specifically for this patch:
>>>
>>> o The skb could be freed (replaced) in __vlan_put_tag(), so you cannot 
>>>tell the caller
>>>   to hang on to it.
>>
>>Yep, that is quite nasty...I had not noticed.  If I kept a copy of the original
>>pointer (using skb_get() to bump the reference) passed in,
>>that would fix this particular problem?
> 
> 
> Yes, I would think so.

I will test this change and send a follow-up patch if it proves stable.

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH]  802.1Q VLAN
  2004-10-29 17:45             ` Ben Greear
@ 2004-10-29 23:37               ` Tommy Christensen
  2004-10-29 23:56                 ` Ben Greear
  2004-10-30  0:05                 ` Ben Greear
  0 siblings, 2 replies; 22+ messages in thread
From: Tommy Christensen @ 2004-10-29 23:37 UTC (permalink / raw)
  To: Ben Greear
  Cc: 'netdev@oss.sgi.com', Linux 802.1Q VLAN, Francois Romieu,
	David S. Miller

Ben Greear wrote:
> Tommy Christensen wrote:
> 
>> On Fri, 2004-10-29 at 02:23, Ben Greear wrote:
>>
>>>> o It is considered an error if a queue-less device returns anything 
>>>> but zero from its
>>>>   hard_start_xmit() function (see dev_queue_xmit()).
>>>
>>>
>>> This certainly was not clear to me.  The comments in dev_queue_xmit are
>>> wrong about the return value (failure cases can be > zero too).  Are
>>> there other errors or ommissions there?
>>
>>
>> A return value > zero doesn't mean failure. It indicates congestion.
> 
> 
> Ok, but the skb is always deleted by the net_queue_xmit code if the
> return is not zero?  The difference between a hard-start-xmit failure
> on eth0 when the hardware-queue is full and having a rate-limiting
> queue drop a packet is virtually identical to me....

For a virtual device: yes, dev_queue_xmit() drops the skb. What else
could it do with it? The semantic is that dev_queue_xmit always consumes
skb's given to it.

A physical device will have a qdisc attached to it, so you don't get to
see that the hardware queue is full. qdisc handles this case for you by
retrying the transmission later. This is not (yet) congestion.
OTOH if qdisc doesn't have room for a new skb in its *software* queue,
the skb is dropped and congestion is reported upwards the stack.

>>> What sorts of things go wrong if you do return an error here when you 
>>> don't
>>> have a queue?
>>
>>
>> It is interpreted as a tx failure rather than congestion. So it doesn't
>> help the upper layers like you wanted it to.
>> And it spews out an error message.
> 
> 
> The e1000 and probably other NICs have failed hard_start_xmit for a long
> time, and they are some of the most stable and high-performance NICs.
> So, the upper layers must be handling it OK some how or another.

Yes, this is perfectly valid for real devices. It is handled by the
qdisc system - specifically qdisc_restart().

> Can you point me to some code that takes a different action based on the
> return values of dev_queue_xmit?  That may help me understand better.

This is hard to track due to indirect function calls, but take a look
at tcp_transmit_skb(). It ultimately calls dev_queue_xmit().

>>>> o So, lets add a tx queue to it. Sure, that would be nice. Now we 
>>>> can even do shaping
>>>>   and other fancy stuff. But then how do we manage 
>>>> netif_queue_stopped? Especially
>>>>   restarting the queue could be tricky.
>>>
>>>
>>> Right... it would probably be an O(N) thing to wake the queues for 
>>> all virtual
>>> devices on a physical device, and we certainly don't want to do that
>>> often.  Maybe if you only tried to wake the blocked queues (ie, kept 
>>> a list
>>> of just blocked queues), then that would be less painful on average,
>>> but the worst-case is still bad.
>>
>>
>> Yeah, we probably would need some sort of notification from the
>> qdisc of the underlying device when it can accept packets again.
> 
> 
> I did something like this for my non-busy-spin pktgen re-write and it
> works fine with both VLANs and physical devices.  I just hooked
> directly into this code in netdevice.h:
> 
> static inline void netif_wake_queue(struct net_device *dev)
> {
> #ifdef CONFIG_NETPOLL_TRAP
>     if (netpoll_trap())
>         return;
> #endif
>     if (test_and_clear_bit(__LINK_STATE_XOFF, &dev->state)) {
>         __netif_schedule(dev);
> 
>                 if (dev->notify_queue_woken) {
>                    dev->notify_queue_woken(dev);
>                 }
>         }
> }
> 
> pktgen registers this hook on the physical device when it starts 
> generating on
> the physical device or any VLANs attached to it.  To make a scheme like 
> this work
> in general, we'd probably need a chain of callbacks instead of a single 
> method
> pointer...

Nice. This idea is definitely worth persuing. However, ideally we
would want to be notified when the *qdisc* queue opens up - this
is our "tx ring buffer".

>>>> o But couldn't we skip netif_stop_queue() and just return 
>>>> NETDEV_TX_BUSY when congested?
>>>>   No, that would make the qdisc system "busy-retry" untill it 
>>>> succeeds. BAD.
>>>>
>>>> o It is unsafe to pass a shared skb to dev_queue_xmit() unless you 
>>>> control all the
>>>>   references yourself. (It will likely be enqueued on a list.)
>>>
>>>
>>> Since we either free the duplicate copy, or pass it to the queue and 
>>> forget
>>> about it, this last point does not matter in the patch I submitted, 
>>> right?
>>
>>
>>
>> Yes. This is the right way to do it. *Unless* the skb is already shared
>> when you receive it (e.g. from pktgen).
> 
> 
> You can't send shared skbs regardless, because the vlan Xmit changes the 
> skb->dev at least, so
> you just have to set the multi-skb setting in pktgen to 0 so that it 
> does not
> share when using VLANs.

By sheer accident, this would actually work! Nevertheless, the code
should obviously handle this correctly (whatever that means?!).


-Tommy

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH]  802.1Q VLAN
  2004-10-29 23:37               ` Tommy Christensen
@ 2004-10-29 23:56                 ` Ben Greear
  2004-10-30  0:05                 ` Ben Greear
  1 sibling, 0 replies; 22+ messages in thread
From: Ben Greear @ 2004-10-29 23:56 UTC (permalink / raw)
  To: Tommy Christensen
  Cc: 'netdev@oss.sgi.com', Linux 802.1Q VLAN, Francois Romieu,
	David S. Miller

Tommy Christensen wrote:
> Ben Greear wrote:
> 

>> You can't send shared skbs regardless, because the vlan Xmit changes 
>> the skb->dev at least, so
>> you just have to set the multi-skb setting in pktgen to 0 so that it 
>> does not
>> share when using VLANs.
> 
> 
> By sheer accident, this would actually work! Nevertheless, the code
> should obviously handle this correctly (whatever that means?!).

It definately crashed on my system when I tried it, so I don't think
it actually works.  Think about a SMP system where pktgen is re-sending
the same pkt while the other CPU is handling the previous send..or something
like that.  The fact that skb->dev is changing cannot be healthy.

 From what I can tell, a net-devices hard_start_xmit method must either
return 0 and consume the skb, or return a non-zero value and not
consume the skb.  Since we can detect immediate drops due to the
dev_queue_xmit call failing, I don't see how it can hurt anything to
preserve the skb and return the error code.  Code that cares about retransmitting
can, and if it doesn't, it can just delete the skb.

I believe this is the same as the case where the e1000 does not
show netif_queue_stopped() but still returns failure when you
try the hard_start_xmit.  I know that this case will probably
eventually be fixed, but the fact that it *does* work leads me to
believe I can get away with what I'm trying to do with VLANs.

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH]  802.1Q VLAN
  2004-10-29 23:37               ` Tommy Christensen
  2004-10-29 23:56                 ` Ben Greear
@ 2004-10-30  0:05                 ` Ben Greear
  2004-10-30  0:31                   ` Tommy Christensen
  1 sibling, 1 reply; 22+ messages in thread
From: Ben Greear @ 2004-10-30  0:05 UTC (permalink / raw)
  To: Tommy Christensen
  Cc: 'netdev@oss.sgi.com', Linux 802.1Q VLAN, Francois Romieu,
	David S. Miller

Tommy Christensen wrote:
> Ben Greear wrote:
> 
>> Tommy Christensen wrote:

>>> A return value > zero doesn't mean failure. It indicates congestion.
>>
>>
>>
>> Ok, but the skb is always deleted by the net_queue_xmit code if the
>> return is not zero?  The difference between a hard-start-xmit failure
>> on eth0 when the hardware-queue is full and having a rate-limiting
>> queue drop a packet is virtually identical to me....
> 
> 
> For a virtual device: yes, dev_queue_xmit() drops the skb. What else
> could it do with it? The semantic is that dev_queue_xmit always consumes
> skb's given to it.
> 
> A physical device will have a qdisc attached to it, so you don't get to
> see that the hardware queue is full. qdisc handles this case for you by
> retrying the transmission later. This is not (yet) congestion.
> OTOH if qdisc doesn't have room for a new skb in its *software* queue,
> the skb is dropped and congestion is reported upwards the stack.

Can't you also add a queue to a VLAN device?

eth1.1009 Link encap:Ethernet  HWaddr 00:07:E9:1F:CE:02
           inet addr:172.100.1.109  Bcast:172.100.1.255  Mask:255.255.255.0
           UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
           RX packets:0 errors:0 dropped:0 overruns:0 frame:0
           TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
           collisions:0 txqueuelen:2000
           RX bytes:0 (0.0 b)  TX bytes:0 (0.0 b)

I see fairly high latency when I overdrive the network, but it does seem to
work just fine.


>> pktgen registers this hook on the physical device when it starts 
>> generating on
>> the physical device or any VLANs attached to it.  To make a scheme 
>> like this work
>> in general, we'd probably need a chain of callbacks instead of a 
>> single method
>> pointer...
> 
> 
> Nice. This idea is definitely worth persuing. However, ideally we
> would want to be notified when the *qdisc* queue opens up - this
> is our "tx ring buffer".

Maybe the qdisc could automatically flush what it could to lower
level devices/queues whenever it was asked to enqueue a packet?
This way, waking the writers could automatically wake the various
queues under the writers.

That could be happening already, and might explain why my pktgen hacks
work.

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH]  802.1Q VLAN
  2004-10-30  0:05                 ` Ben Greear
@ 2004-10-30  0:31                   ` Tommy Christensen
  2004-11-01 18:58                     ` Ben Greear
  0 siblings, 1 reply; 22+ messages in thread
From: Tommy Christensen @ 2004-10-30  0:31 UTC (permalink / raw)
  To: Ben Greear
  Cc: 'netdev@oss.sgi.com', Linux 802.1Q VLAN, Francois Romieu,
	David S. Miller

Ben Greear wrote:
> Can't you also add a queue to a VLAN device?

Sure you can. This is the best solution, if you have a way of
waking up the queue.

>> Nice. This idea is definitely worth persuing. However, ideally we
>> would want to be notified when the *qdisc* queue opens up - this
>> is our "tx ring buffer".
> 
> 
> Maybe the qdisc could automatically flush what it could to lower
> level devices/queues whenever it was asked to enqueue a packet?
> This way, waking the writers could automatically wake the various
> queues under the writers.
> 
> That could be happening already, and might explain why my pktgen hacks
> work.

Yes, this is what qdisc does. It just isn't good enough to have to
wait for the next packet, IMO.


-Tommy

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH]  802.1Q VLAN
  2004-10-30  0:31                   ` Tommy Christensen
@ 2004-11-01 18:58                     ` Ben Greear
  2004-11-01 23:08                       ` Tommy Christensen
  0 siblings, 1 reply; 22+ messages in thread
From: Ben Greear @ 2004-11-01 18:58 UTC (permalink / raw)
  To: Tommy Christensen
  Cc: 'netdev@oss.sgi.com', Linux 802.1Q VLAN, Francois Romieu,
	David S. Miller


How does this look?  I think it should fix the problem with
having a new skb created in the __vlan_put_tag() method.


int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct net_device_stats *stats = vlan_dev_get_stats(dev);
	struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data);

	/* Please note, dev_queue_xmit consumes the pkt regardless of the
	 * return value.  So, will copy the skb first and free if successful.
	 */
	struct sk_buff* skb2 = skb_get(skb);

	/* Handle non-VLAN frames if they are sent to us, for example by DHCP.
	 *
	 * NOTE: THIS ASSUMES DIX ETHERNET, SPECIFICALLY NOT SUPPORTING
	 * OTHER THINGS LIKE FDDI/TokenRing/802.3 SNAPs...
	 */

	if (veth->h_vlan_proto != __constant_htons(ETH_P_8021Q)) {
		int orig_headroom = skb_headroom(skb);
		unsigned short veth_TCI;

		/* This is not a VLAN frame...but we can fix that! */
		VLAN_DEV_INFO(dev)->cnt_encap_on_xmit++;

#ifdef VLAN_DEBUG
		printk(VLAN_DBG "%s: proto to encap: 0x%hx (hbo)\n",
			__FUNCTION__, htons(veth->h_vlan_proto));
#endif
		/* Construct the second two bytes. This field looks something
		 * like:
		 * usr_priority: 3 bits	 (high bits)
		 * CFI		 1 bit
		 * VLAN ID	 12 bits (low bits)
		 */
		veth_TCI = VLAN_DEV_INFO(dev)->vlan_id;
		veth_TCI |= vlan_dev_get_egress_qos_mask(dev, skb);

		skb = __vlan_put_tag(skb, veth_TCI);
		if (!skb) {
			stats->tx_dropped++;
			/* Free the extra copy, assuming this is a non-recoverable
			 * issue and we don't want calling code to retry.
			 */
			kfree_skb(skb2);
			return 0;
		}

		if (orig_headroom < VLAN_HLEN) {
			VLAN_DEV_INFO(dev)->cnt_inc_headroom_on_tx++;
		}
	}

#ifdef VLAN_DEBUG
	printk(VLAN_DBG "%s: about to send skb: %p to dev: %s\n",
		__FUNCTION__, skb, skb->dev->name);
	printk(VLAN_DBG "  %2hx.%2hx.%2hx.%2xh.%2hx.%2hx %2hx.%2hx.%2hx.%2hx.%2hx.%2hx %4hx %4hx %4hx\n",
	       veth->h_dest[0], veth->h_dest[1], veth->h_dest[2], veth->h_dest[3], veth->h_dest[4], veth->h_dest[5],
	       veth->h_source[0], veth->h_source[1], veth->h_source[2], veth->h_source[3], veth->h_source[4], veth->h_source[5],
	       veth->h_vlan_proto, veth->h_vlan_TCI, veth->h_vlan_encapsulated_proto);
#endif

	skb->dev = VLAN_DEV_INFO(dev)->real_dev;

	{
		int rv = dev_queue_xmit(skb);
		if (rv == 0) {
			/* Was success, need to free the skb reference since
			 * we bumped up the user count above.  If there was an
			 * error instead, then the skb2 will not be freed, and so
			 * the calling code will be able to re-send it.
			 */

			stats->tx_packets++; /* for statics only */
			stats->tx_bytes += skb2->len;

			kfree_skb(skb2);
		}
		return rv;
	}
}


-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH]  802.1Q VLAN
  2004-11-01 18:58                     ` Ben Greear
@ 2004-11-01 23:08                       ` Tommy Christensen
  0 siblings, 0 replies; 22+ messages in thread
From: Tommy Christensen @ 2004-11-01 23:08 UTC (permalink / raw)
  To: Ben Greear
  Cc: 'netdev@oss.sgi.com', Linux 802.1Q VLAN, Francois Romieu,
	David S. Miller

Ben Greear wrote:
> How does this look?  I think it should fix the problem with
> having a new skb created in the __vlan_put_tag() method.

Yes, this seems to be handled correctly now.

Two nitpickings:
  - veth should be reassigned after calling __vlan_put_tag
  - sample skb->len before calling dev_queue_xmit and use that to
    update stats->tx_bytes (it can be different from skb2->len)

And then there's the return values ...

This is a hard_start_xmit() method, so we should try to be
consistent with real device drivers. The only defined return
values are: NETDEV_TX_OK and NETDEV_TX_BUSY.
(There is also Andi's NETDEV_TX_LOCKED, which we can ignore here).

Furthermore transmission shouldn't be retried in case of failure,
only on congestion does this make sense.

So my suggestion for the last part of the function is:

	len = skb->len;
	rv = dev_queue_xmit(skb);
	if (rv < 0) {
		stats->tx_dropped++;
		kfree_skb(skb2);
		ret = NETDEV_TX_OK;
	} else if (rv == NET_XMIT_SUCCESS || rv == NET_XMIT_CN) {
		stats->tx_packets++;
		stats->tx_bytes += len;
		kfree_skb(skb2);
		ret = NETDEV_TX_OK;
	} else {
		/* The device below us is congested */
		ret = NETDEV_TX_BUSY;
	}

	return ret;


-Tommy

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2004-11-01 23:08 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-10-22 21:07 [PATCH] 802.1Q VLAN Ben Greear
2004-10-22 21:46 ` Francois Romieu
2004-10-22 22:09   ` Ben Greear
2004-10-23  0:24     ` Francois Romieu
2004-10-25 20:51     ` Ben Greear
2004-10-25 23:56       ` Ben Greear
2004-10-27  1:02         ` David S. Miller
2004-10-27 23:49         ` David S. Miller
2004-10-28  1:28           ` Ben Greear
2004-10-28  4:42             ` David S. Miller
2004-10-28 23:40       ` Tommy Christensen
2004-10-28 23:35         ` David S. Miller
2004-10-29  0:23         ` Ben Greear
2004-10-29  0:38           ` Krzysztof Halasa
2004-10-29  8:29           ` Tommy Christensen
2004-10-29 17:45             ` Ben Greear
2004-10-29 23:37               ` Tommy Christensen
2004-10-29 23:56                 ` Ben Greear
2004-10-30  0:05                 ` Ben Greear
2004-10-30  0:31                   ` Tommy Christensen
2004-11-01 18:58                     ` Ben Greear
2004-11-01 23:08                       ` Tommy Christensen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).