* [RFC PATCH] net, tun: remove the flow cache @ 2013-12-17 7:26 Zhi Yong Wu 2013-12-17 7:47 ` Stephen Hemminger ` (2 more replies) 0 siblings, 3 replies; 12+ messages in thread From: Zhi Yong Wu @ 2013-12-17 7:26 UTC (permalink / raw) To: netdev; +Cc: linux-kernel, davem, mst, jasowang, Zhi Yong Wu From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> The flow cache is an extremely broken concept, and it usually brings up growth issues and DoS attacks, so this patch is trying to remove it from the tuntap driver, and insteadly use a simpler way for its flow control. Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> --- drivers/net/tun.c | 208 +++-------------------------------------------------- 1 files changed, 10 insertions(+), 198 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 7c8343a..7c27fdc 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -32,12 +32,15 @@ * * Daniel Podlejski <underley@underley.eu.org> * Modifications for 2.3.99-pre5 kernel. + * + * Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> + * Remove the flow cache. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define DRV_NAME "tun" -#define DRV_VERSION "1.6" +#define DRV_VERSION "1.7" #define DRV_DESCRIPTION "Universal TUN/TAP device driver" #define DRV_COPYRIGHT "(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>" @@ -146,18 +149,6 @@ struct tun_file { struct tun_struct *detached; }; -struct tun_flow_entry { - struct hlist_node hash_link; - struct rcu_head rcu; - struct tun_struct *tun; - - u32 rxhash; - int queue_index; - unsigned long updated; -}; - -#define TUN_NUM_FLOW_ENTRIES 1024 - /* Since the socket were moved to tun_file, to preserve the behavior of persist * device, socket filter, sndbuf and vnet header size were restore when the * file were attached to a persist device. @@ -184,163 +175,11 @@ struct tun_struct { int debug; #endif spinlock_t lock; - struct hlist_head flows[TUN_NUM_FLOW_ENTRIES]; - struct timer_list flow_gc_timer; - unsigned long ageing_time; unsigned int numdisabled; struct list_head disabled; void *security; - u32 flow_count; }; -static inline u32 tun_hashfn(u32 rxhash) -{ - return rxhash & 0x3ff; -} - -static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash) -{ - struct tun_flow_entry *e; - - hlist_for_each_entry_rcu(e, head, hash_link) { - if (e->rxhash == rxhash) - return e; - } - return NULL; -} - -static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun, - struct hlist_head *head, - u32 rxhash, u16 queue_index) -{ - struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC); - - if (e) { - tun_debug(KERN_INFO, tun, "create flow: hash %u index %u\n", - rxhash, queue_index); - e->updated = jiffies; - e->rxhash = rxhash; - e->queue_index = queue_index; - e->tun = tun; - hlist_add_head_rcu(&e->hash_link, head); - ++tun->flow_count; - } - return e; -} - -static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e) -{ - tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n", - e->rxhash, e->queue_index); - hlist_del_rcu(&e->hash_link); - kfree_rcu(e, rcu); - --tun->flow_count; -} - -static void tun_flow_flush(struct tun_struct *tun) -{ - int i; - - spin_lock_bh(&tun->lock); - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { - struct tun_flow_entry *e; - struct hlist_node *n; - - hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) - tun_flow_delete(tun, e); - } - spin_unlock_bh(&tun->lock); -} - -static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index) -{ - int i; - - spin_lock_bh(&tun->lock); - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { - struct tun_flow_entry *e; - struct hlist_node *n; - - hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { - if (e->queue_index == queue_index) - tun_flow_delete(tun, e); - } - } - spin_unlock_bh(&tun->lock); -} - -static void tun_flow_cleanup(unsigned long data) -{ - struct tun_struct *tun = (struct tun_struct *)data; - unsigned long delay = tun->ageing_time; - unsigned long next_timer = jiffies + delay; - unsigned long count = 0; - int i; - - tun_debug(KERN_INFO, tun, "tun_flow_cleanup\n"); - - spin_lock_bh(&tun->lock); - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { - struct tun_flow_entry *e; - struct hlist_node *n; - - hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { - unsigned long this_timer; - count++; - this_timer = e->updated + delay; - if (time_before_eq(this_timer, jiffies)) - tun_flow_delete(tun, e); - else if (time_before(this_timer, next_timer)) - next_timer = this_timer; - } - } - - if (count) - mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer)); - spin_unlock_bh(&tun->lock); -} - -static void tun_flow_update(struct tun_struct *tun, u32 rxhash, - struct tun_file *tfile) -{ - struct hlist_head *head; - struct tun_flow_entry *e; - unsigned long delay = tun->ageing_time; - u16 queue_index = tfile->queue_index; - - if (!rxhash) - return; - else - head = &tun->flows[tun_hashfn(rxhash)]; - - rcu_read_lock(); - - /* We may get a very small possibility of OOO during switching, not - * worth to optimize.*/ - if (tun->numqueues == 1 || tfile->detached) - goto unlock; - - e = tun_flow_find(head, rxhash); - if (likely(e)) { - /* TODO: keep queueing to old queue until it's empty? */ - e->queue_index = queue_index; - e->updated = jiffies; - } else { - spin_lock_bh(&tun->lock); - if (!tun_flow_find(head, rxhash) && - tun->flow_count < MAX_TAP_FLOWS) - tun_flow_create(tun, head, rxhash, queue_index); - - if (!timer_pending(&tun->flow_gc_timer)) - mod_timer(&tun->flow_gc_timer, - round_jiffies_up(jiffies + delay)); - spin_unlock_bh(&tun->lock); - } - -unlock: - rcu_read_unlock(); -} - /* We try to identify a flow through its rxhash first. The reason that * we do not check rxq no. is becuase some cards(e.g 82599), chooses * the rxq based on the txq where the last packet of the flow comes. As @@ -351,7 +190,6 @@ unlock: static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb) { struct tun_struct *tun = netdev_priv(dev); - struct tun_flow_entry *e; u32 txq = 0; u32 numqueues = 0; @@ -360,12 +198,11 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb) txq = skb_get_rxhash(skb); if (txq) { - e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); - if (e) - txq = e->queue_index; - else + txq = skb_get_queue_mapping(skb); + if (!txq) { /* use multiply and shift instead of expensive divide */ txq = ((u64)txq * numqueues) >> 32; + } } else if (likely(skb_rx_queue_recorded(skb))) { txq = skb_get_rx_queue(skb); while (unlikely(txq >= numqueues)) @@ -439,7 +276,6 @@ static void __tun_detach(struct tun_file *tfile, bool clean) tun_disable_queue(tun, tfile); synchronize_net(); - tun_flow_delete_by_queue(tun, tun->numqueues + 1); /* Drop read queue */ tun_queue_purge(tfile); tun_set_real_num_queues(tun); @@ -858,25 +694,6 @@ static const struct net_device_ops tap_netdev_ops = { #endif }; -static void tun_flow_init(struct tun_struct *tun) -{ - int i; - - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) - INIT_HLIST_HEAD(&tun->flows[i]); - - tun->ageing_time = TUN_FLOW_EXPIRE; - setup_timer(&tun->flow_gc_timer, tun_flow_cleanup, (unsigned long)tun); - mod_timer(&tun->flow_gc_timer, - round_jiffies_up(jiffies + tun->ageing_time)); -} - -static void tun_flow_uninit(struct tun_struct *tun) -{ - del_timer_sync(&tun->flow_gc_timer); - tun_flow_flush(tun); -} - /* Initialize net device. */ static void tun_net_init(struct net_device *dev) { @@ -986,7 +803,6 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, int copylen; bool zerocopy = false; int err; - u32 rxhash; if (!(tun->flags & TUN_NO_PI)) { if (len < sizeof(pi)) @@ -1146,13 +962,12 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, skb_reset_network_header(skb); skb_probe_transport_header(skb, 0); - rxhash = skb_get_rxhash(skb); + skb_set_queue_mapping(skb, tfile->queue_index); netif_rx_ni(skb); tun->dev->stats.rx_packets++; tun->dev->stats.rx_bytes += len; - tun_flow_update(tun, rxhash, tfile); return total_len; } @@ -1370,7 +1185,6 @@ static void tun_free_netdev(struct net_device *dev) struct tun_struct *tun = netdev_priv(dev); BUG_ON(!(list_empty(&tun->disabled))); - tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); free_netdev(dev); } @@ -1644,7 +1458,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) goto err_free_dev; tun_net_init(dev); - tun_flow_init(tun); dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | @@ -1655,7 +1468,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) INIT_LIST_HEAD(&tun->disabled); err = tun_attach(tun, file, false); if (err < 0) - goto err_free_flow; + goto err_free_security; err = register_netdevice(tun->dev); if (err < 0) @@ -1705,8 +1518,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) err_detach: tun_detach_all(dev); -err_free_flow: - tun_flow_uninit(tun); +err_free_security: security_tun_dev_free_security(tun->security); err_free_dev: free_netdev(dev); -- 1.7.6.5 ^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [RFC PATCH] net, tun: remove the flow cache 2013-12-17 7:26 [RFC PATCH] net, tun: remove the flow cache Zhi Yong Wu @ 2013-12-17 7:47 ` Stephen Hemminger 2013-12-17 8:13 ` Zhi Yong Wu 2013-12-17 8:49 ` Jason Wang 2013-12-18 4:06 ` Tom Herbert 2 siblings, 1 reply; 12+ messages in thread From: Stephen Hemminger @ 2013-12-17 7:47 UTC (permalink / raw) To: Zhi Yong Wu; +Cc: netdev, linux-kernel, davem, mst, jasowang, Zhi Yong Wu On Tue, 17 Dec 2013 15:26:22 +0800 Zhi Yong Wu <zwu.kernel@gmail.com> wrote: > From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> > > The flow cache is an extremely broken concept, and it usually brings up > growth issues and DoS attacks, so this patch is trying to remove it from > the tuntap driver, and insteadly use a simpler way for its flow control. > > Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> > --- > drivers/net/tun.c | 208 +++-------------------------------------------------- > 1 files changed, 10 insertions(+), 198 deletions(-) > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index 7c8343a..7c27fdc 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -32,12 +32,15 @@ > * > * Daniel Podlejski <underley@underley.eu.org> > * Modifications for 2.3.99-pre5 kernel. > + * > + * Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> > + * Remove the flow cache. > */ I agree with your patch, but please don't add to the comment changelog. These are all historical. The kernel development process has not used them for 5+ years. Can we get kernel janitors to just remove them, or would that step on too many early developers toes by removing credit? ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [RFC PATCH] net, tun: remove the flow cache 2013-12-17 7:47 ` Stephen Hemminger @ 2013-12-17 8:13 ` Zhi Yong Wu 0 siblings, 0 replies; 12+ messages in thread From: Zhi Yong Wu @ 2013-12-17 8:13 UTC (permalink / raw) To: Stephen Hemminger; +Cc: Zhi Yong Wu, netdev, linux-kernel, davem, mst, jasowang On Mon, 2013-12-16 at 23:47 -0800, Stephen Hemminger wrote: > On Tue, 17 Dec 2013 15:26:22 +0800 > Zhi Yong Wu <zwu.kernel@gmail.com> wrote: > > > From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> > > > > The flow cache is an extremely broken concept, and it usually brings up > > growth issues and DoS attacks, so this patch is trying to remove it from > > the tuntap driver, and insteadly use a simpler way for its flow control. > > > > Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> > > --- > > drivers/net/tun.c | 208 +++-------------------------------------------------- > > 1 files changed, 10 insertions(+), 198 deletions(-) > > > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > > index 7c8343a..7c27fdc 100644 > > --- a/drivers/net/tun.c > > +++ b/drivers/net/tun.c > > @@ -32,12 +32,15 @@ > > * > > * Daniel Podlejski <underley@underley.eu.org> > > * Modifications for 2.3.99-pre5 kernel. > > + * > > + * Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> > > + * Remove the flow cache. > > */ > > I agree with your patch, but please don't add to the comment changelog. > These are all historical. The kernel development process has not used > them for 5+ years. > > Can we get kernel janitors to just remove them, or would that step > on too many early developers toes by removing credit? I thought that it is a big code change, and need to add some changelog for this, but you seem to have a big argue. :) I don't object to removing my comment in its changelog if other guys also agree with you. > -- Regards, Zhi Yong Wu ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [RFC PATCH] net, tun: remove the flow cache 2013-12-17 7:26 [RFC PATCH] net, tun: remove the flow cache Zhi Yong Wu 2013-12-17 7:47 ` Stephen Hemminger @ 2013-12-17 8:49 ` Jason Wang 2013-12-17 9:13 ` Zhi Yong Wu 2013-12-18 4:06 ` Tom Herbert 2 siblings, 1 reply; 12+ messages in thread From: Jason Wang @ 2013-12-17 8:49 UTC (permalink / raw) To: Zhi Yong Wu, netdev; +Cc: linux-kernel, davem, mst, Zhi Yong Wu On 12/17/2013 03:26 PM, Zhi Yong Wu wrote: > From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> > > The flow cache is an extremely broken concept, and it usually brings up > growth issues and DoS attacks, so this patch is trying to remove it from > the tuntap driver, and insteadly use a simpler way for its flow control. NACK. This single function revert does not make sense to me. Since: - You in fact removes the flow steering function in tun. We definitely need something like this to unbreak the TCP performance in a multiqueue guest. Please have a look at the virtio-net driver / virtio sepc for more information. - The total number of flow caches were limited to 4096, so there's no DoS or growth issue. - The only issue is scalability, but fixing this is not easy. We can just use arrays/indirection table like RSS instead of hash buckets, it saves some time in linear search but has other issues like collision - I've also had a RFC of using aRFS in the past, it also has several drawbacks such as busy looping in the networking hotspot. So in conclusion, we need flow steering in tun, just removing current method does not help. The proper way is to expose several different methods to user and let user to choose the preferable mechanism like packet. > > Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> > --- > drivers/net/tun.c | 208 +++-------------------------------------------------- > 1 files changed, 10 insertions(+), 198 deletions(-) > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index 7c8343a..7c27fdc 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -32,12 +32,15 @@ > * > * Daniel Podlejski <underley@underley.eu.org> > * Modifications for 2.3.99-pre5 kernel. > + * > + * Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> > + * Remove the flow cache. > */ > > #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt > > #define DRV_NAME "tun" > -#define DRV_VERSION "1.6" > +#define DRV_VERSION "1.7" > #define DRV_DESCRIPTION "Universal TUN/TAP device driver" > #define DRV_COPYRIGHT "(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>" > > @@ -146,18 +149,6 @@ struct tun_file { > struct tun_struct *detached; > }; > > -struct tun_flow_entry { > - struct hlist_node hash_link; > - struct rcu_head rcu; > - struct tun_struct *tun; > - > - u32 rxhash; > - int queue_index; > - unsigned long updated; > -}; > - > -#define TUN_NUM_FLOW_ENTRIES 1024 > - > /* Since the socket were moved to tun_file, to preserve the behavior of persist > * device, socket filter, sndbuf and vnet header size were restore when the > * file were attached to a persist device. > @@ -184,163 +175,11 @@ struct tun_struct { > int debug; > #endif > spinlock_t lock; > - struct hlist_head flows[TUN_NUM_FLOW_ENTRIES]; > - struct timer_list flow_gc_timer; > - unsigned long ageing_time; > unsigned int numdisabled; > struct list_head disabled; > void *security; > - u32 flow_count; > }; > > -static inline u32 tun_hashfn(u32 rxhash) > -{ > - return rxhash & 0x3ff; > -} > - > -static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash) > -{ > - struct tun_flow_entry *e; > - > - hlist_for_each_entry_rcu(e, head, hash_link) { > - if (e->rxhash == rxhash) > - return e; > - } > - return NULL; > -} > - > -static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun, > - struct hlist_head *head, > - u32 rxhash, u16 queue_index) > -{ > - struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC); > - > - if (e) { > - tun_debug(KERN_INFO, tun, "create flow: hash %u index %u\n", > - rxhash, queue_index); > - e->updated = jiffies; > - e->rxhash = rxhash; > - e->queue_index = queue_index; > - e->tun = tun; > - hlist_add_head_rcu(&e->hash_link, head); > - ++tun->flow_count; > - } > - return e; > -} > - > -static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e) > -{ > - tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n", > - e->rxhash, e->queue_index); > - hlist_del_rcu(&e->hash_link); > - kfree_rcu(e, rcu); > - --tun->flow_count; > -} > - > -static void tun_flow_flush(struct tun_struct *tun) > -{ > - int i; > - > - spin_lock_bh(&tun->lock); > - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { > - struct tun_flow_entry *e; > - struct hlist_node *n; > - > - hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) > - tun_flow_delete(tun, e); > - } > - spin_unlock_bh(&tun->lock); > -} > - > -static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index) > -{ > - int i; > - > - spin_lock_bh(&tun->lock); > - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { > - struct tun_flow_entry *e; > - struct hlist_node *n; > - > - hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { > - if (e->queue_index == queue_index) > - tun_flow_delete(tun, e); > - } > - } > - spin_unlock_bh(&tun->lock); > -} > - > -static void tun_flow_cleanup(unsigned long data) > -{ > - struct tun_struct *tun = (struct tun_struct *)data; > - unsigned long delay = tun->ageing_time; > - unsigned long next_timer = jiffies + delay; > - unsigned long count = 0; > - int i; > - > - tun_debug(KERN_INFO, tun, "tun_flow_cleanup\n"); > - > - spin_lock_bh(&tun->lock); > - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { > - struct tun_flow_entry *e; > - struct hlist_node *n; > - > - hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { > - unsigned long this_timer; > - count++; > - this_timer = e->updated + delay; > - if (time_before_eq(this_timer, jiffies)) > - tun_flow_delete(tun, e); > - else if (time_before(this_timer, next_timer)) > - next_timer = this_timer; > - } > - } > - > - if (count) > - mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer)); > - spin_unlock_bh(&tun->lock); > -} > - > -static void tun_flow_update(struct tun_struct *tun, u32 rxhash, > - struct tun_file *tfile) > -{ > - struct hlist_head *head; > - struct tun_flow_entry *e; > - unsigned long delay = tun->ageing_time; > - u16 queue_index = tfile->queue_index; > - > - if (!rxhash) > - return; > - else > - head = &tun->flows[tun_hashfn(rxhash)]; > - > - rcu_read_lock(); > - > - /* We may get a very small possibility of OOO during switching, not > - * worth to optimize.*/ > - if (tun->numqueues == 1 || tfile->detached) > - goto unlock; > - > - e = tun_flow_find(head, rxhash); > - if (likely(e)) { > - /* TODO: keep queueing to old queue until it's empty? */ > - e->queue_index = queue_index; > - e->updated = jiffies; > - } else { > - spin_lock_bh(&tun->lock); > - if (!tun_flow_find(head, rxhash) && > - tun->flow_count < MAX_TAP_FLOWS) > - tun_flow_create(tun, head, rxhash, queue_index); > - > - if (!timer_pending(&tun->flow_gc_timer)) > - mod_timer(&tun->flow_gc_timer, > - round_jiffies_up(jiffies + delay)); > - spin_unlock_bh(&tun->lock); > - } > - > -unlock: > - rcu_read_unlock(); > -} > - > /* We try to identify a flow through its rxhash first. The reason that > * we do not check rxq no. is becuase some cards(e.g 82599), chooses > * the rxq based on the txq where the last packet of the flow comes. As > @@ -351,7 +190,6 @@ unlock: > static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb) > { > struct tun_struct *tun = netdev_priv(dev); > - struct tun_flow_entry *e; > u32 txq = 0; > u32 numqueues = 0; > > @@ -360,12 +198,11 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb) > > txq = skb_get_rxhash(skb); > if (txq) { > - e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); > - if (e) > - txq = e->queue_index; > - else > + txq = skb_get_queue_mapping(skb); > + if (!txq) { > /* use multiply and shift instead of expensive divide */ > txq = ((u64)txq * numqueues) >> 32; > + } > } else if (likely(skb_rx_queue_recorded(skb))) { > txq = skb_get_rx_queue(skb); > while (unlikely(txq >= numqueues)) > @@ -439,7 +276,6 @@ static void __tun_detach(struct tun_file *tfile, bool clean) > tun_disable_queue(tun, tfile); > > synchronize_net(); > - tun_flow_delete_by_queue(tun, tun->numqueues + 1); > /* Drop read queue */ > tun_queue_purge(tfile); > tun_set_real_num_queues(tun); > @@ -858,25 +694,6 @@ static const struct net_device_ops tap_netdev_ops = { > #endif > }; > > -static void tun_flow_init(struct tun_struct *tun) > -{ > - int i; > - > - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) > - INIT_HLIST_HEAD(&tun->flows[i]); > - > - tun->ageing_time = TUN_FLOW_EXPIRE; > - setup_timer(&tun->flow_gc_timer, tun_flow_cleanup, (unsigned long)tun); > - mod_timer(&tun->flow_gc_timer, > - round_jiffies_up(jiffies + tun->ageing_time)); > -} > - > -static void tun_flow_uninit(struct tun_struct *tun) > -{ > - del_timer_sync(&tun->flow_gc_timer); > - tun_flow_flush(tun); > -} > - > /* Initialize net device. */ > static void tun_net_init(struct net_device *dev) > { > @@ -986,7 +803,6 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, > int copylen; > bool zerocopy = false; > int err; > - u32 rxhash; > > if (!(tun->flags & TUN_NO_PI)) { > if (len < sizeof(pi)) > @@ -1146,13 +962,12 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, > skb_reset_network_header(skb); > skb_probe_transport_header(skb, 0); > > - rxhash = skb_get_rxhash(skb); > + skb_set_queue_mapping(skb, tfile->queue_index); > netif_rx_ni(skb); > > tun->dev->stats.rx_packets++; > tun->dev->stats.rx_bytes += len; > > - tun_flow_update(tun, rxhash, tfile); > return total_len; > } > > @@ -1370,7 +1185,6 @@ static void tun_free_netdev(struct net_device *dev) > struct tun_struct *tun = netdev_priv(dev); > > BUG_ON(!(list_empty(&tun->disabled))); > - tun_flow_uninit(tun); > security_tun_dev_free_security(tun->security); > free_netdev(dev); > } > @@ -1644,7 +1458,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) > goto err_free_dev; > > tun_net_init(dev); > - tun_flow_init(tun); > > dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | > TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | > @@ -1655,7 +1468,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) > INIT_LIST_HEAD(&tun->disabled); > err = tun_attach(tun, file, false); > if (err < 0) > - goto err_free_flow; > + goto err_free_security; > > err = register_netdevice(tun->dev); > if (err < 0) > @@ -1705,8 +1518,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) > > err_detach: > tun_detach_all(dev); > -err_free_flow: > - tun_flow_uninit(tun); > +err_free_security: > security_tun_dev_free_security(tun->security); > err_free_dev: > free_netdev(dev); ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [RFC PATCH] net, tun: remove the flow cache 2013-12-17 8:49 ` Jason Wang @ 2013-12-17 9:13 ` Zhi Yong Wu 2013-12-17 10:05 ` Jason Wang 0 siblings, 1 reply; 12+ messages in thread From: Zhi Yong Wu @ 2013-12-17 9:13 UTC (permalink / raw) To: Jason Wang Cc: netdev, linux-kernel mlist, David S. Miller, Michael S. Tsirkin, Zhi Yong Wu On Tue, Dec 17, 2013 at 4:49 PM, Jason Wang <jasowang@redhat.com> wrote: > On 12/17/2013 03:26 PM, Zhi Yong Wu wrote: >> From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> >> >> The flow cache is an extremely broken concept, and it usually brings up >> growth issues and DoS attacks, so this patch is trying to remove it from >> the tuntap driver, and insteadly use a simpler way for its flow control. > > NACK. > > This single function revert does not make sense to me. Since: IIRC, the tuntap flow cache is only used to save the mapping of skb packet <-> queue index. My idea only save the queue index in skb_buff early when skb buffer is filled, not in flow cache as the current code. This method is actually more simpler and completely doesn't need any flow cache. > > - You in fact removes the flow steering function in tun. We definitely > need something like this to unbreak the TCP performance in a multiqueue I don't think it will downgrade the TCP perf even in mq guest, but my idea maybe has better TCP perf, because it doesn't have any cache table lookup, etc. > guest. Please have a look at the virtio-net driver / virtio sepc for > more information. > - The total number of flow caches were limited to 4096, so there's no > DoS or growth issue. Can you check why the ipv4 routing cache is removed? maybe i miss something, if yes, pls correct me. :) > - The only issue is scalability, but fixing this is not easy. We can > just use arrays/indirection table like RSS instead of hash buckets, it > saves some time in linear search but has other issues like collision > - I've also had a RFC of using aRFS in the past, it also has several > drawbacks such as busy looping in the networking hotspot. > > So in conclusion, we need flow steering in tun, just removing current > method does not help. The proper way is to expose several different > methods to user and let user to choose the preferable mechanism like > packet. By the way, let us look at what other networking guys think of this, such as MST, dave, etc. :) > >> >> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> >> --- >> drivers/net/tun.c | 208 +++-------------------------------------------------- >> 1 files changed, 10 insertions(+), 198 deletions(-) >> >> diff --git a/drivers/net/tun.c b/drivers/net/tun.c >> index 7c8343a..7c27fdc 100644 >> --- a/drivers/net/tun.c >> +++ b/drivers/net/tun.c >> @@ -32,12 +32,15 @@ >> * >> * Daniel Podlejski <underley@underley.eu.org> >> * Modifications for 2.3.99-pre5 kernel. >> + * >> + * Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> >> + * Remove the flow cache. >> */ >> >> #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt >> >> #define DRV_NAME "tun" >> -#define DRV_VERSION "1.6" >> +#define DRV_VERSION "1.7" >> #define DRV_DESCRIPTION "Universal TUN/TAP device driver" >> #define DRV_COPYRIGHT "(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>" >> >> @@ -146,18 +149,6 @@ struct tun_file { >> struct tun_struct *detached; >> }; >> >> -struct tun_flow_entry { >> - struct hlist_node hash_link; >> - struct rcu_head rcu; >> - struct tun_struct *tun; >> - >> - u32 rxhash; >> - int queue_index; >> - unsigned long updated; >> -}; >> - >> -#define TUN_NUM_FLOW_ENTRIES 1024 >> - >> /* Since the socket were moved to tun_file, to preserve the behavior of persist >> * device, socket filter, sndbuf and vnet header size were restore when the >> * file were attached to a persist device. >> @@ -184,163 +175,11 @@ struct tun_struct { >> int debug; >> #endif >> spinlock_t lock; >> - struct hlist_head flows[TUN_NUM_FLOW_ENTRIES]; >> - struct timer_list flow_gc_timer; >> - unsigned long ageing_time; >> unsigned int numdisabled; >> struct list_head disabled; >> void *security; >> - u32 flow_count; >> }; >> >> -static inline u32 tun_hashfn(u32 rxhash) >> -{ >> - return rxhash & 0x3ff; >> -} >> - >> -static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash) >> -{ >> - struct tun_flow_entry *e; >> - >> - hlist_for_each_entry_rcu(e, head, hash_link) { >> - if (e->rxhash == rxhash) >> - return e; >> - } >> - return NULL; >> -} >> - >> -static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun, >> - struct hlist_head *head, >> - u32 rxhash, u16 queue_index) >> -{ >> - struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC); >> - >> - if (e) { >> - tun_debug(KERN_INFO, tun, "create flow: hash %u index %u\n", >> - rxhash, queue_index); >> - e->updated = jiffies; >> - e->rxhash = rxhash; >> - e->queue_index = queue_index; >> - e->tun = tun; >> - hlist_add_head_rcu(&e->hash_link, head); >> - ++tun->flow_count; >> - } >> - return e; >> -} >> - >> -static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e) >> -{ >> - tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n", >> - e->rxhash, e->queue_index); >> - hlist_del_rcu(&e->hash_link); >> - kfree_rcu(e, rcu); >> - --tun->flow_count; >> -} >> - >> -static void tun_flow_flush(struct tun_struct *tun) >> -{ >> - int i; >> - >> - spin_lock_bh(&tun->lock); >> - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { >> - struct tun_flow_entry *e; >> - struct hlist_node *n; >> - >> - hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) >> - tun_flow_delete(tun, e); >> - } >> - spin_unlock_bh(&tun->lock); >> -} >> - >> -static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index) >> -{ >> - int i; >> - >> - spin_lock_bh(&tun->lock); >> - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { >> - struct tun_flow_entry *e; >> - struct hlist_node *n; >> - >> - hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { >> - if (e->queue_index == queue_index) >> - tun_flow_delete(tun, e); >> - } >> - } >> - spin_unlock_bh(&tun->lock); >> -} >> - >> -static void tun_flow_cleanup(unsigned long data) >> -{ >> - struct tun_struct *tun = (struct tun_struct *)data; >> - unsigned long delay = tun->ageing_time; >> - unsigned long next_timer = jiffies + delay; >> - unsigned long count = 0; >> - int i; >> - >> - tun_debug(KERN_INFO, tun, "tun_flow_cleanup\n"); >> - >> - spin_lock_bh(&tun->lock); >> - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { >> - struct tun_flow_entry *e; >> - struct hlist_node *n; >> - >> - hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { >> - unsigned long this_timer; >> - count++; >> - this_timer = e->updated + delay; >> - if (time_before_eq(this_timer, jiffies)) >> - tun_flow_delete(tun, e); >> - else if (time_before(this_timer, next_timer)) >> - next_timer = this_timer; >> - } >> - } >> - >> - if (count) >> - mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer)); >> - spin_unlock_bh(&tun->lock); >> -} >> - >> -static void tun_flow_update(struct tun_struct *tun, u32 rxhash, >> - struct tun_file *tfile) >> -{ >> - struct hlist_head *head; >> - struct tun_flow_entry *e; >> - unsigned long delay = tun->ageing_time; >> - u16 queue_index = tfile->queue_index; >> - >> - if (!rxhash) >> - return; >> - else >> - head = &tun->flows[tun_hashfn(rxhash)]; >> - >> - rcu_read_lock(); >> - >> - /* We may get a very small possibility of OOO during switching, not >> - * worth to optimize.*/ >> - if (tun->numqueues == 1 || tfile->detached) >> - goto unlock; >> - >> - e = tun_flow_find(head, rxhash); >> - if (likely(e)) { >> - /* TODO: keep queueing to old queue until it's empty? */ >> - e->queue_index = queue_index; >> - e->updated = jiffies; >> - } else { >> - spin_lock_bh(&tun->lock); >> - if (!tun_flow_find(head, rxhash) && >> - tun->flow_count < MAX_TAP_FLOWS) >> - tun_flow_create(tun, head, rxhash, queue_index); >> - >> - if (!timer_pending(&tun->flow_gc_timer)) >> - mod_timer(&tun->flow_gc_timer, >> - round_jiffies_up(jiffies + delay)); >> - spin_unlock_bh(&tun->lock); >> - } >> - >> -unlock: >> - rcu_read_unlock(); >> -} >> - >> /* We try to identify a flow through its rxhash first. The reason that >> * we do not check rxq no. is becuase some cards(e.g 82599), chooses >> * the rxq based on the txq where the last packet of the flow comes. As >> @@ -351,7 +190,6 @@ unlock: >> static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb) >> { >> struct tun_struct *tun = netdev_priv(dev); >> - struct tun_flow_entry *e; >> u32 txq = 0; >> u32 numqueues = 0; >> >> @@ -360,12 +198,11 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb) >> >> txq = skb_get_rxhash(skb); >> if (txq) { >> - e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); >> - if (e) >> - txq = e->queue_index; >> - else >> + txq = skb_get_queue_mapping(skb); >> + if (!txq) { >> /* use multiply and shift instead of expensive divide */ >> txq = ((u64)txq * numqueues) >> 32; >> + } >> } else if (likely(skb_rx_queue_recorded(skb))) { >> txq = skb_get_rx_queue(skb); >> while (unlikely(txq >= numqueues)) >> @@ -439,7 +276,6 @@ static void __tun_detach(struct tun_file *tfile, bool clean) >> tun_disable_queue(tun, tfile); >> >> synchronize_net(); >> - tun_flow_delete_by_queue(tun, tun->numqueues + 1); >> /* Drop read queue */ >> tun_queue_purge(tfile); >> tun_set_real_num_queues(tun); >> @@ -858,25 +694,6 @@ static const struct net_device_ops tap_netdev_ops = { >> #endif >> }; >> >> -static void tun_flow_init(struct tun_struct *tun) >> -{ >> - int i; >> - >> - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) >> - INIT_HLIST_HEAD(&tun->flows[i]); >> - >> - tun->ageing_time = TUN_FLOW_EXPIRE; >> - setup_timer(&tun->flow_gc_timer, tun_flow_cleanup, (unsigned long)tun); >> - mod_timer(&tun->flow_gc_timer, >> - round_jiffies_up(jiffies + tun->ageing_time)); >> -} >> - >> -static void tun_flow_uninit(struct tun_struct *tun) >> -{ >> - del_timer_sync(&tun->flow_gc_timer); >> - tun_flow_flush(tun); >> -} >> - >> /* Initialize net device. */ >> static void tun_net_init(struct net_device *dev) >> { >> @@ -986,7 +803,6 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, >> int copylen; >> bool zerocopy = false; >> int err; >> - u32 rxhash; >> >> if (!(tun->flags & TUN_NO_PI)) { >> if (len < sizeof(pi)) >> @@ -1146,13 +962,12 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, >> skb_reset_network_header(skb); >> skb_probe_transport_header(skb, 0); >> >> - rxhash = skb_get_rxhash(skb); >> + skb_set_queue_mapping(skb, tfile->queue_index); >> netif_rx_ni(skb); >> >> tun->dev->stats.rx_packets++; >> tun->dev->stats.rx_bytes += len; >> >> - tun_flow_update(tun, rxhash, tfile); >> return total_len; >> } >> >> @@ -1370,7 +1185,6 @@ static void tun_free_netdev(struct net_device *dev) >> struct tun_struct *tun = netdev_priv(dev); >> >> BUG_ON(!(list_empty(&tun->disabled))); >> - tun_flow_uninit(tun); >> security_tun_dev_free_security(tun->security); >> free_netdev(dev); >> } >> @@ -1644,7 +1458,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) >> goto err_free_dev; >> >> tun_net_init(dev); >> - tun_flow_init(tun); >> >> dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | >> TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | >> @@ -1655,7 +1468,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) >> INIT_LIST_HEAD(&tun->disabled); >> err = tun_attach(tun, file, false); >> if (err < 0) >> - goto err_free_flow; >> + goto err_free_security; >> >> err = register_netdevice(tun->dev); >> if (err < 0) >> @@ -1705,8 +1518,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) >> >> err_detach: >> tun_detach_all(dev); >> -err_free_flow: >> - tun_flow_uninit(tun); >> +err_free_security: >> security_tun_dev_free_security(tun->security); >> err_free_dev: >> free_netdev(dev); > -- Regards, Zhi Yong Wu ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [RFC PATCH] net, tun: remove the flow cache 2013-12-17 9:13 ` Zhi Yong Wu @ 2013-12-17 10:05 ` Jason Wang 2013-12-18 2:08 ` Zhi Yong Wu 0 siblings, 1 reply; 12+ messages in thread From: Jason Wang @ 2013-12-17 10:05 UTC (permalink / raw) To: Zhi Yong Wu Cc: netdev, linux-kernel mlist, David S. Miller, Michael S. Tsirkin, Zhi Yong Wu On 12/17/2013 05:13 PM, Zhi Yong Wu wrote: > On Tue, Dec 17, 2013 at 4:49 PM, Jason Wang <jasowang@redhat.com> wrote: >> > On 12/17/2013 03:26 PM, Zhi Yong Wu wrote: >>> >> From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> >>> >> >>> >> The flow cache is an extremely broken concept, and it usually brings up >>> >> growth issues and DoS attacks, so this patch is trying to remove it from >>> >> the tuntap driver, and insteadly use a simpler way for its flow control. >> > >> > NACK. >> > >> > This single function revert does not make sense to me. Since: > IIRC, the tuntap flow cache is only used to save the mapping of skb > packet <-> queue index. My idea only save the queue index in skb_buff > early when skb buffer is filled, not in flow cache as the current > code. This method is actually more simpler and completely doesn't need > any flow cache. Nope. Flow caches record the flow to queues mapping like what most multiqueue nic does. The only difference is tun record it silently while most nic needs driver to tell the mapping. What your patch does is: - set the queue mapping of skb during tun_get_user(). But most drivers using XPS or processor id to select the real tx queue. So the real txq depends on the cpu that vhost or qemu is running. This setting does not have any effect in fact. - the queue mapping of skb were fetched during tun_select_queue(). This value is usually set by a multiqueue nic to record which hardware rxq was this packet came. Can you explain how your patch works exactly? >> > >> > - You in fact removes the flow steering function in tun. We definitely >> > need something like this to unbreak the TCP performance in a multiqueue > I don't think it will downgrade the TCP perf even in mq guest, but my > idea maybe has better TCP perf, because it doesn't have any cache > table lookup, etc. Did you test and compare the performance numbers? Did you run profiler to see how much does the lookup cost? >> > guest. Please have a look at the virtio-net driver / virtio sepc for >> > more information. >> > - The total number of flow caches were limited to 4096, so there's no >> > DoS or growth issue. > Can you check why the ipv4 routing cache is removed? maybe i miss > something, if yes, pls correct me. :) The main differences is that the flow caches were best effort. Tun can not store all flows to queue mapping, and even a hardware nic can not do this. If a packet misses the flow cache, it's safe to distribute it randomly or through another method. So the limitation just work. Could you please explain the DoS or growth issue you meet here? >> > - The only issue is scalability, but fixing this is not easy. We can >> > just use arrays/indirection table like RSS instead of hash buckets, it >> > saves some time in linear search but has other issues like collision >> > - I've also had a RFC of using aRFS in the past, it also has several >> > drawbacks such as busy looping in the networking hotspot. >> > >> > So in conclusion, we need flow steering in tun, just removing current >> > method does not help. The proper way is to expose several different >> > methods to user and let user to choose the preferable mechanism like >> > packet. > By the way, let us look at what other networking guys think of this, > such as MST, dave, etc. :) > Of course. ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [RFC PATCH] net, tun: remove the flow cache 2013-12-17 10:05 ` Jason Wang @ 2013-12-18 2:08 ` Zhi Yong Wu 2013-12-23 7:00 ` Jason Wang 0 siblings, 1 reply; 12+ messages in thread From: Zhi Yong Wu @ 2013-12-18 2:08 UTC (permalink / raw) To: Jason Wang Cc: netdev, linux-kernel mlist, David S. Miller, Michael S. Tsirkin, Zhi Yong Wu On Tue, Dec 17, 2013 at 6:05 PM, Jason Wang <jasowang@redhat.com> wrote: > On 12/17/2013 05:13 PM, Zhi Yong Wu wrote: >> On Tue, Dec 17, 2013 at 4:49 PM, Jason Wang <jasowang@redhat.com> wrote: >>> > On 12/17/2013 03:26 PM, Zhi Yong Wu wrote: >>>> >> From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> >>>> >> >>>> >> The flow cache is an extremely broken concept, and it usually brings up >>>> >> growth issues and DoS attacks, so this patch is trying to remove it from >>>> >> the tuntap driver, and insteadly use a simpler way for its flow control. >>> > >>> > NACK. >>> > >>> > This single function revert does not make sense to me. Since: >> IIRC, the tuntap flow cache is only used to save the mapping of skb >> packet <-> queue index. My idea only save the queue index in skb_buff >> early when skb buffer is filled, not in flow cache as the current >> code. This method is actually more simpler and completely doesn't need >> any flow cache. > > Nope. Flow caches record the flow to queues mapping like what most > multiqueue nic does. The only difference is tun record it silently while > most nic needs driver to tell the mapping. Just check virtio specs, i seem to miss the fact that flow cache enable packet steering in mq mode, thanks for your comments. But i have some concerns about some of your comments. > > What your patch does is: > - set the queue mapping of skb during tun_get_user(). But most drivers > using XPS or processor id to select the real tx queue. So the real txq > depends on the cpu that vhost or qemu is running. This setting does not Doesn't those drivers invoke netdev_pick_tx() or its counterpart to select real tx queue? e.g. tun_select_queue(). or can you say it with an example? Moreover, how do those drivers know which cpu vhost or qemu is running on? > have any effect in fact. > - the queue mapping of skb were fetched during tun_select_queue(). This > value is usually set by a multiqueue nic to record which hardware rxq > was this packet came. ah? Can you let me know where a mq nic controller set it? > > Can you explain how your patch works exactly? You have understood it. >>> > >>> > - You in fact removes the flow steering function in tun. We definitely >>> > need something like this to unbreak the TCP performance in a multiqueue >> I don't think it will downgrade the TCP perf even in mq guest, but my >> idea maybe has better TCP perf, because it doesn't have any cache >> table lookup, etc. > > Did you test and compare the performance numbers? Did you run profiler > to see how much does the lookup cost? No, As i jus said above, i miss that flow cache can enable packet steering. But Did you do related perf testing? To be honest, i am wondering how much perf the packet steering can improve. Actually it also injects a lot of cache lookup cost. >>> > guest. Please have a look at the virtio-net driver / virtio sepc for >>> > more information. >>> > - The total number of flow caches were limited to 4096, so there's no >>> > DoS or growth issue. >> Can you check why the ipv4 routing cache is removed? maybe i miss >> something, if yes, pls correct me. :) > > The main differences is that the flow caches were best effort. Tun can > not store all flows to queue mapping, and even a hardware nic can not do > this. If a packet misses the flow cache, it's safe to distribute it > randomly or through another method. So the limitation just work. Exactly, we can know this from tun_select_queue(). > > Could you please explain the DoS or growth issue you meet here? >>> > - The only issue is scalability, but fixing this is not easy. We can >>> > just use arrays/indirection table like RSS instead of hash buckets, it >>> > saves some time in linear search but has other issues like collision >>> > - I've also had a RFC of using aRFS in the past, it also has several >>> > drawbacks such as busy looping in the networking hotspot. >>> > >>> > So in conclusion, we need flow steering in tun, just removing current >>> > method does not help. The proper way is to expose several different >>> > methods to user and let user to choose the preferable mechanism like >>> > packet. >> By the way, let us look at what other networking guys think of this, >> such as MST, dave, etc. :) >> > > Of course. -- Regards, Zhi Yong Wu ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [RFC PATCH] net, tun: remove the flow cache 2013-12-18 2:08 ` Zhi Yong Wu @ 2013-12-23 7:00 ` Jason Wang 0 siblings, 0 replies; 12+ messages in thread From: Jason Wang @ 2013-12-23 7:00 UTC (permalink / raw) To: Zhi Yong Wu Cc: netdev, linux-kernel mlist, David S. Miller, Michael S. Tsirkin, Zhi Yong Wu On 12/18/2013 10:08 AM, Zhi Yong Wu wrote: > On Tue, Dec 17, 2013 at 6:05 PM, Jason Wang <jasowang@redhat.com> wrote: >> On 12/17/2013 05:13 PM, Zhi Yong Wu wrote: >>> On Tue, Dec 17, 2013 at 4:49 PM, Jason Wang <jasowang@redhat.com> wrote: >>>>> On 12/17/2013 03:26 PM, Zhi Yong Wu wrote: >>>>>>> From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> >>>>>>> >>>>>>> The flow cache is an extremely broken concept, and it usually brings up >>>>>>> growth issues and DoS attacks, so this patch is trying to remove it from >>>>>>> the tuntap driver, and insteadly use a simpler way for its flow control. >>>>> NACK. >>>>> >>>>> This single function revert does not make sense to me. Since: >>> IIRC, the tuntap flow cache is only used to save the mapping of skb >>> packet <-> queue index. My idea only save the queue index in skb_buff >>> early when skb buffer is filled, not in flow cache as the current >>> code. This method is actually more simpler and completely doesn't need >>> any flow cache. >> Nope. Flow caches record the flow to queues mapping like what most >> multiqueue nic does. The only difference is tun record it silently while >> most nic needs driver to tell the mapping. > Just check virtio specs, i seem to miss the fact that flow cache > enable packet steering in mq mode, thanks for your comments. But i > have some concerns about some of your comments. >> What your patch does is: >> - set the queue mapping of skb during tun_get_user(). But most drivers >> using XPS or processor id to select the real tx queue. So the real txq >> depends on the cpu that vhost or qemu is running. This setting does not > Doesn't those drivers invoke netdev_pick_tx() or its counterpart to > select real tx queue? e.g. tun_select_queue(). or can you say it with > an example? See __netdev_pick_tx() which will call get_xps_queues() to let XPS to choose txq if there's no ndo_select_queue() method. Even if some driver who has its own implementation, it will also call __netdev_pick_tx() for ordinary traffic. Few drivers just using processor id to select txq. See tile_net_select_queue(). Some drivers does this implicitly through XPS, see ixgbe and virtio-net. So the value you intend to "teach" the hardware nic may not work for most of the cases. > Moreover, how do those drivers know which cpu vhost or qemu is running on? It does not know, but it can use processor id which is normally where qemu/vhost is running. >> have any effect in fact. >> - the queue mapping of skb were fetched during tun_select_queue(). This >> value is usually set by a multiqueue nic to record which hardware rxq >> was this packet came. > ah? Can you let me know where a mq nic controller set it? Almost all multiqueue nic set this when it receives a packet. One example is ixgbe_process_skb_fields(). >> Can you explain how your patch works exactly? > You have understood it. >>>>> - You in fact removes the flow steering function in tun. We definitely >>>>> need something like this to unbreak the TCP performance in a multiqueue >>> I don't think it will downgrade the TCP perf even in mq guest, but my >>> idea maybe has better TCP perf, because it doesn't have any cache >>> table lookup, etc. >> Did you test and compare the performance numbers? Did you run profiler >> to see how much does the lookup cost? > No, As i jus said above, i miss that flow cache can enable packet > steering. But Did you do related perf testing? To be honest, i am > wondering how much perf the packet steering can improve. Actually it > also injects a lot of cache lookup cost. Not a lot unless there's a unbalance distribution of the hash bucket which is very rare in the common case. Without it, you may get a very huge regression even on a single stream tcp netperf test. Flow caches guarantees packets of a single stream was not processed by more than one vhost threads/vcpus which is very important for TCP performance. >>>>> guest. Please have a look at the virtio-net driver / virtio sepc for >>>>> more information. >>>>> - The total number of flow caches were limited to 4096, so there's no >>>>> DoS or growth issue. >>> Can you check why the ipv4 routing cache is removed? maybe i miss >>> something, if yes, pls correct me. :) >> The main differences is that the flow caches were best effort. Tun can >> not store all flows to queue mapping, and even a hardware nic can not do >> this. If a packet misses the flow cache, it's safe to distribute it >> randomly or through another method. So the limitation just work. > Exactly, we can know this from tun_select_queue(). >> Could you please explain the DoS or growth issue you meet here? >>>>> - The only issue is scalability, but fixing this is not easy. We can >>>>> just use arrays/indirection table like RSS instead of hash buckets, it >>>>> saves some time in linear search but has other issues like collision >>>>> - I've also had a RFC of using aRFS in the past, it also has several >>>>> drawbacks such as busy looping in the networking hotspot. >>>>> >>>>> So in conclusion, we need flow steering in tun, just removing current >>>>> method does not help. The proper way is to expose several different >>>>> methods to user and let user to choose the preferable mechanism like >>>>> packet. >>> By the way, let us look at what other networking guys think of this, >>> such as MST, dave, etc. :) >>> >> Of course. > > ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [RFC PATCH] net, tun: remove the flow cache 2013-12-17 7:26 [RFC PATCH] net, tun: remove the flow cache Zhi Yong Wu 2013-12-17 7:47 ` Stephen Hemminger 2013-12-17 8:49 ` Jason Wang @ 2013-12-18 4:06 ` Tom Herbert 2013-12-18 4:37 ` Zhi Yong Wu 2 siblings, 1 reply; 12+ messages in thread From: Tom Herbert @ 2013-12-18 4:06 UTC (permalink / raw) To: Zhi Yong Wu Cc: Linux Netdev List, LKML, David Miller, Michael S. Tsirkin, Jason Wang, Zhi Yong Wu On Mon, Dec 16, 2013 at 11:26 PM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote: > From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> > > The flow cache is an extremely broken concept, and it usually brings up > growth issues and DoS attacks, so this patch is trying to remove it from > the tuntap driver, and insteadly use a simpler way for its flow control. > Yes , in it's current state it's broken. But maybe we can try to fix it instead of arbitrarily removing it. Please see my patches on plumbing RFS into tuntap which may start to make it useful. Tom > Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> > --- > drivers/net/tun.c | 208 +++-------------------------------------------------- > 1 files changed, 10 insertions(+), 198 deletions(-) > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index 7c8343a..7c27fdc 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -32,12 +32,15 @@ > * > * Daniel Podlejski <underley@underley.eu.org> > * Modifications for 2.3.99-pre5 kernel. > + * > + * Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> > + * Remove the flow cache. > */ > > #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt > > #define DRV_NAME "tun" > -#define DRV_VERSION "1.6" > +#define DRV_VERSION "1.7" > #define DRV_DESCRIPTION "Universal TUN/TAP device driver" > #define DRV_COPYRIGHT "(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>" > > @@ -146,18 +149,6 @@ struct tun_file { > struct tun_struct *detached; > }; > > -struct tun_flow_entry { > - struct hlist_node hash_link; > - struct rcu_head rcu; > - struct tun_struct *tun; > - > - u32 rxhash; > - int queue_index; > - unsigned long updated; > -}; > - > -#define TUN_NUM_FLOW_ENTRIES 1024 > - > /* Since the socket were moved to tun_file, to preserve the behavior of persist > * device, socket filter, sndbuf and vnet header size were restore when the > * file were attached to a persist device. > @@ -184,163 +175,11 @@ struct tun_struct { > int debug; > #endif > spinlock_t lock; > - struct hlist_head flows[TUN_NUM_FLOW_ENTRIES]; > - struct timer_list flow_gc_timer; > - unsigned long ageing_time; > unsigned int numdisabled; > struct list_head disabled; > void *security; > - u32 flow_count; > }; > > -static inline u32 tun_hashfn(u32 rxhash) > -{ > - return rxhash & 0x3ff; > -} > - > -static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash) > -{ > - struct tun_flow_entry *e; > - > - hlist_for_each_entry_rcu(e, head, hash_link) { > - if (e->rxhash == rxhash) > - return e; > - } > - return NULL; > -} > - > -static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun, > - struct hlist_head *head, > - u32 rxhash, u16 queue_index) > -{ > - struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC); > - > - if (e) { > - tun_debug(KERN_INFO, tun, "create flow: hash %u index %u\n", > - rxhash, queue_index); > - e->updated = jiffies; > - e->rxhash = rxhash; > - e->queue_index = queue_index; > - e->tun = tun; > - hlist_add_head_rcu(&e->hash_link, head); > - ++tun->flow_count; > - } > - return e; > -} > - > -static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e) > -{ > - tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n", > - e->rxhash, e->queue_index); > - hlist_del_rcu(&e->hash_link); > - kfree_rcu(e, rcu); > - --tun->flow_count; > -} > - > -static void tun_flow_flush(struct tun_struct *tun) > -{ > - int i; > - > - spin_lock_bh(&tun->lock); > - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { > - struct tun_flow_entry *e; > - struct hlist_node *n; > - > - hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) > - tun_flow_delete(tun, e); > - } > - spin_unlock_bh(&tun->lock); > -} > - > -static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index) > -{ > - int i; > - > - spin_lock_bh(&tun->lock); > - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { > - struct tun_flow_entry *e; > - struct hlist_node *n; > - > - hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { > - if (e->queue_index == queue_index) > - tun_flow_delete(tun, e); > - } > - } > - spin_unlock_bh(&tun->lock); > -} > - > -static void tun_flow_cleanup(unsigned long data) > -{ > - struct tun_struct *tun = (struct tun_struct *)data; > - unsigned long delay = tun->ageing_time; > - unsigned long next_timer = jiffies + delay; > - unsigned long count = 0; > - int i; > - > - tun_debug(KERN_INFO, tun, "tun_flow_cleanup\n"); > - > - spin_lock_bh(&tun->lock); > - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { > - struct tun_flow_entry *e; > - struct hlist_node *n; > - > - hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { > - unsigned long this_timer; > - count++; > - this_timer = e->updated + delay; > - if (time_before_eq(this_timer, jiffies)) > - tun_flow_delete(tun, e); > - else if (time_before(this_timer, next_timer)) > - next_timer = this_timer; > - } > - } > - > - if (count) > - mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer)); > - spin_unlock_bh(&tun->lock); > -} > - > -static void tun_flow_update(struct tun_struct *tun, u32 rxhash, > - struct tun_file *tfile) > -{ > - struct hlist_head *head; > - struct tun_flow_entry *e; > - unsigned long delay = tun->ageing_time; > - u16 queue_index = tfile->queue_index; > - > - if (!rxhash) > - return; > - else > - head = &tun->flows[tun_hashfn(rxhash)]; > - > - rcu_read_lock(); > - > - /* We may get a very small possibility of OOO during switching, not > - * worth to optimize.*/ > - if (tun->numqueues == 1 || tfile->detached) > - goto unlock; > - > - e = tun_flow_find(head, rxhash); > - if (likely(e)) { > - /* TODO: keep queueing to old queue until it's empty? */ > - e->queue_index = queue_index; > - e->updated = jiffies; > - } else { > - spin_lock_bh(&tun->lock); > - if (!tun_flow_find(head, rxhash) && > - tun->flow_count < MAX_TAP_FLOWS) > - tun_flow_create(tun, head, rxhash, queue_index); > - > - if (!timer_pending(&tun->flow_gc_timer)) > - mod_timer(&tun->flow_gc_timer, > - round_jiffies_up(jiffies + delay)); > - spin_unlock_bh(&tun->lock); > - } > - > -unlock: > - rcu_read_unlock(); > -} > - > /* We try to identify a flow through its rxhash first. The reason that > * we do not check rxq no. is becuase some cards(e.g 82599), chooses > * the rxq based on the txq where the last packet of the flow comes. As > @@ -351,7 +190,6 @@ unlock: > static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb) > { > struct tun_struct *tun = netdev_priv(dev); > - struct tun_flow_entry *e; > u32 txq = 0; > u32 numqueues = 0; > > @@ -360,12 +198,11 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb) > > txq = skb_get_rxhash(skb); > if (txq) { > - e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); > - if (e) > - txq = e->queue_index; > - else > + txq = skb_get_queue_mapping(skb); > + if (!txq) { > /* use multiply and shift instead of expensive divide */ > txq = ((u64)txq * numqueues) >> 32; > + } > } else if (likely(skb_rx_queue_recorded(skb))) { > txq = skb_get_rx_queue(skb); > while (unlikely(txq >= numqueues)) > @@ -439,7 +276,6 @@ static void __tun_detach(struct tun_file *tfile, bool clean) > tun_disable_queue(tun, tfile); > > synchronize_net(); > - tun_flow_delete_by_queue(tun, tun->numqueues + 1); > /* Drop read queue */ > tun_queue_purge(tfile); > tun_set_real_num_queues(tun); > @@ -858,25 +694,6 @@ static const struct net_device_ops tap_netdev_ops = { > #endif > }; > > -static void tun_flow_init(struct tun_struct *tun) > -{ > - int i; > - > - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) > - INIT_HLIST_HEAD(&tun->flows[i]); > - > - tun->ageing_time = TUN_FLOW_EXPIRE; > - setup_timer(&tun->flow_gc_timer, tun_flow_cleanup, (unsigned long)tun); > - mod_timer(&tun->flow_gc_timer, > - round_jiffies_up(jiffies + tun->ageing_time)); > -} > - > -static void tun_flow_uninit(struct tun_struct *tun) > -{ > - del_timer_sync(&tun->flow_gc_timer); > - tun_flow_flush(tun); > -} > - > /* Initialize net device. */ > static void tun_net_init(struct net_device *dev) > { > @@ -986,7 +803,6 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, > int copylen; > bool zerocopy = false; > int err; > - u32 rxhash; > > if (!(tun->flags & TUN_NO_PI)) { > if (len < sizeof(pi)) > @@ -1146,13 +962,12 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, > skb_reset_network_header(skb); > skb_probe_transport_header(skb, 0); > > - rxhash = skb_get_rxhash(skb); > + skb_set_queue_mapping(skb, tfile->queue_index); > netif_rx_ni(skb); > > tun->dev->stats.rx_packets++; > tun->dev->stats.rx_bytes += len; > > - tun_flow_update(tun, rxhash, tfile); > return total_len; > } > > @@ -1370,7 +1185,6 @@ static void tun_free_netdev(struct net_device *dev) > struct tun_struct *tun = netdev_priv(dev); > > BUG_ON(!(list_empty(&tun->disabled))); > - tun_flow_uninit(tun); > security_tun_dev_free_security(tun->security); > free_netdev(dev); > } > @@ -1644,7 +1458,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) > goto err_free_dev; > > tun_net_init(dev); > - tun_flow_init(tun); > > dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | > TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | > @@ -1655,7 +1468,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) > INIT_LIST_HEAD(&tun->disabled); > err = tun_attach(tun, file, false); > if (err < 0) > - goto err_free_flow; > + goto err_free_security; > > err = register_netdevice(tun->dev); > if (err < 0) > @@ -1705,8 +1518,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) > > err_detach: > tun_detach_all(dev); > -err_free_flow: > - tun_flow_uninit(tun); > +err_free_security: > security_tun_dev_free_security(tun->security); > err_free_dev: > free_netdev(dev); > -- > 1.7.6.5 > > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [RFC PATCH] net, tun: remove the flow cache 2013-12-18 4:06 ` Tom Herbert @ 2013-12-18 4:37 ` Zhi Yong Wu 2013-12-18 4:58 ` Tom Herbert 0 siblings, 1 reply; 12+ messages in thread From: Zhi Yong Wu @ 2013-12-18 4:37 UTC (permalink / raw) To: Tom Herbert Cc: Linux Netdev List, LKML, David Miller, Michael S. Tsirkin, Jason Wang, Zhi Yong Wu HI, Tom, On Wed, Dec 18, 2013 at 12:06 PM, Tom Herbert <therbert@google.com> wrote: > On Mon, Dec 16, 2013 at 11:26 PM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote: >> From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> >> >> The flow cache is an extremely broken concept, and it usually brings up >> growth issues and DoS attacks, so this patch is trying to remove it from >> the tuntap driver, and insteadly use a simpler way for its flow control. >> > Yes , in it's current state it's broken. But maybe we can try to fix > it instead of arbitrarily removing it. Please see my patches on > plumbing RFS into tuntap which may start to make it useful. Do you mean you patch [5/5] tun: Added support for RFS on tun flows? Sorry, can you say with more details? > > Tom > >> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> >> --- >> drivers/net/tun.c | 208 +++-------------------------------------------------- >> 1 files changed, 10 insertions(+), 198 deletions(-) >> >> diff --git a/drivers/net/tun.c b/drivers/net/tun.c >> index 7c8343a..7c27fdc 100644 >> --- a/drivers/net/tun.c >> +++ b/drivers/net/tun.c >> @@ -32,12 +32,15 @@ >> * >> * Daniel Podlejski <underley@underley.eu.org> >> * Modifications for 2.3.99-pre5 kernel. >> + * >> + * Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> >> + * Remove the flow cache. >> */ >> >> #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt >> >> #define DRV_NAME "tun" >> -#define DRV_VERSION "1.6" >> +#define DRV_VERSION "1.7" >> #define DRV_DESCRIPTION "Universal TUN/TAP device driver" >> #define DRV_COPYRIGHT "(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>" >> >> @@ -146,18 +149,6 @@ struct tun_file { >> struct tun_struct *detached; >> }; >> >> -struct tun_flow_entry { >> - struct hlist_node hash_link; >> - struct rcu_head rcu; >> - struct tun_struct *tun; >> - >> - u32 rxhash; >> - int queue_index; >> - unsigned long updated; >> -}; >> - >> -#define TUN_NUM_FLOW_ENTRIES 1024 >> - >> /* Since the socket were moved to tun_file, to preserve the behavior of persist >> * device, socket filter, sndbuf and vnet header size were restore when the >> * file were attached to a persist device. >> @@ -184,163 +175,11 @@ struct tun_struct { >> int debug; >> #endif >> spinlock_t lock; >> - struct hlist_head flows[TUN_NUM_FLOW_ENTRIES]; >> - struct timer_list flow_gc_timer; >> - unsigned long ageing_time; >> unsigned int numdisabled; >> struct list_head disabled; >> void *security; >> - u32 flow_count; >> }; >> >> -static inline u32 tun_hashfn(u32 rxhash) >> -{ >> - return rxhash & 0x3ff; >> -} >> - >> -static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash) >> -{ >> - struct tun_flow_entry *e; >> - >> - hlist_for_each_entry_rcu(e, head, hash_link) { >> - if (e->rxhash == rxhash) >> - return e; >> - } >> - return NULL; >> -} >> - >> -static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun, >> - struct hlist_head *head, >> - u32 rxhash, u16 queue_index) >> -{ >> - struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC); >> - >> - if (e) { >> - tun_debug(KERN_INFO, tun, "create flow: hash %u index %u\n", >> - rxhash, queue_index); >> - e->updated = jiffies; >> - e->rxhash = rxhash; >> - e->queue_index = queue_index; >> - e->tun = tun; >> - hlist_add_head_rcu(&e->hash_link, head); >> - ++tun->flow_count; >> - } >> - return e; >> -} >> - >> -static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e) >> -{ >> - tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n", >> - e->rxhash, e->queue_index); >> - hlist_del_rcu(&e->hash_link); >> - kfree_rcu(e, rcu); >> - --tun->flow_count; >> -} >> - >> -static void tun_flow_flush(struct tun_struct *tun) >> -{ >> - int i; >> - >> - spin_lock_bh(&tun->lock); >> - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { >> - struct tun_flow_entry *e; >> - struct hlist_node *n; >> - >> - hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) >> - tun_flow_delete(tun, e); >> - } >> - spin_unlock_bh(&tun->lock); >> -} >> - >> -static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index) >> -{ >> - int i; >> - >> - spin_lock_bh(&tun->lock); >> - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { >> - struct tun_flow_entry *e; >> - struct hlist_node *n; >> - >> - hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { >> - if (e->queue_index == queue_index) >> - tun_flow_delete(tun, e); >> - } >> - } >> - spin_unlock_bh(&tun->lock); >> -} >> - >> -static void tun_flow_cleanup(unsigned long data) >> -{ >> - struct tun_struct *tun = (struct tun_struct *)data; >> - unsigned long delay = tun->ageing_time; >> - unsigned long next_timer = jiffies + delay; >> - unsigned long count = 0; >> - int i; >> - >> - tun_debug(KERN_INFO, tun, "tun_flow_cleanup\n"); >> - >> - spin_lock_bh(&tun->lock); >> - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { >> - struct tun_flow_entry *e; >> - struct hlist_node *n; >> - >> - hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { >> - unsigned long this_timer; >> - count++; >> - this_timer = e->updated + delay; >> - if (time_before_eq(this_timer, jiffies)) >> - tun_flow_delete(tun, e); >> - else if (time_before(this_timer, next_timer)) >> - next_timer = this_timer; >> - } >> - } >> - >> - if (count) >> - mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer)); >> - spin_unlock_bh(&tun->lock); >> -} >> - >> -static void tun_flow_update(struct tun_struct *tun, u32 rxhash, >> - struct tun_file *tfile) >> -{ >> - struct hlist_head *head; >> - struct tun_flow_entry *e; >> - unsigned long delay = tun->ageing_time; >> - u16 queue_index = tfile->queue_index; >> - >> - if (!rxhash) >> - return; >> - else >> - head = &tun->flows[tun_hashfn(rxhash)]; >> - >> - rcu_read_lock(); >> - >> - /* We may get a very small possibility of OOO during switching, not >> - * worth to optimize.*/ >> - if (tun->numqueues == 1 || tfile->detached) >> - goto unlock; >> - >> - e = tun_flow_find(head, rxhash); >> - if (likely(e)) { >> - /* TODO: keep queueing to old queue until it's empty? */ >> - e->queue_index = queue_index; >> - e->updated = jiffies; >> - } else { >> - spin_lock_bh(&tun->lock); >> - if (!tun_flow_find(head, rxhash) && >> - tun->flow_count < MAX_TAP_FLOWS) >> - tun_flow_create(tun, head, rxhash, queue_index); >> - >> - if (!timer_pending(&tun->flow_gc_timer)) >> - mod_timer(&tun->flow_gc_timer, >> - round_jiffies_up(jiffies + delay)); >> - spin_unlock_bh(&tun->lock); >> - } >> - >> -unlock: >> - rcu_read_unlock(); >> -} >> - >> /* We try to identify a flow through its rxhash first. The reason that >> * we do not check rxq no. is becuase some cards(e.g 82599), chooses >> * the rxq based on the txq where the last packet of the flow comes. As >> @@ -351,7 +190,6 @@ unlock: >> static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb) >> { >> struct tun_struct *tun = netdev_priv(dev); >> - struct tun_flow_entry *e; >> u32 txq = 0; >> u32 numqueues = 0; >> >> @@ -360,12 +198,11 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb) >> >> txq = skb_get_rxhash(skb); >> if (txq) { >> - e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); >> - if (e) >> - txq = e->queue_index; >> - else >> + txq = skb_get_queue_mapping(skb); >> + if (!txq) { >> /* use multiply and shift instead of expensive divide */ >> txq = ((u64)txq * numqueues) >> 32; >> + } >> } else if (likely(skb_rx_queue_recorded(skb))) { >> txq = skb_get_rx_queue(skb); >> while (unlikely(txq >= numqueues)) >> @@ -439,7 +276,6 @@ static void __tun_detach(struct tun_file *tfile, bool clean) >> tun_disable_queue(tun, tfile); >> >> synchronize_net(); >> - tun_flow_delete_by_queue(tun, tun->numqueues + 1); >> /* Drop read queue */ >> tun_queue_purge(tfile); >> tun_set_real_num_queues(tun); >> @@ -858,25 +694,6 @@ static const struct net_device_ops tap_netdev_ops = { >> #endif >> }; >> >> -static void tun_flow_init(struct tun_struct *tun) >> -{ >> - int i; >> - >> - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) >> - INIT_HLIST_HEAD(&tun->flows[i]); >> - >> - tun->ageing_time = TUN_FLOW_EXPIRE; >> - setup_timer(&tun->flow_gc_timer, tun_flow_cleanup, (unsigned long)tun); >> - mod_timer(&tun->flow_gc_timer, >> - round_jiffies_up(jiffies + tun->ageing_time)); >> -} >> - >> -static void tun_flow_uninit(struct tun_struct *tun) >> -{ >> - del_timer_sync(&tun->flow_gc_timer); >> - tun_flow_flush(tun); >> -} >> - >> /* Initialize net device. */ >> static void tun_net_init(struct net_device *dev) >> { >> @@ -986,7 +803,6 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, >> int copylen; >> bool zerocopy = false; >> int err; >> - u32 rxhash; >> >> if (!(tun->flags & TUN_NO_PI)) { >> if (len < sizeof(pi)) >> @@ -1146,13 +962,12 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, >> skb_reset_network_header(skb); >> skb_probe_transport_header(skb, 0); >> >> - rxhash = skb_get_rxhash(skb); >> + skb_set_queue_mapping(skb, tfile->queue_index); >> netif_rx_ni(skb); >> >> tun->dev->stats.rx_packets++; >> tun->dev->stats.rx_bytes += len; >> >> - tun_flow_update(tun, rxhash, tfile); >> return total_len; >> } >> >> @@ -1370,7 +1185,6 @@ static void tun_free_netdev(struct net_device *dev) >> struct tun_struct *tun = netdev_priv(dev); >> >> BUG_ON(!(list_empty(&tun->disabled))); >> - tun_flow_uninit(tun); >> security_tun_dev_free_security(tun->security); >> free_netdev(dev); >> } >> @@ -1644,7 +1458,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) >> goto err_free_dev; >> >> tun_net_init(dev); >> - tun_flow_init(tun); >> >> dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | >> TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | >> @@ -1655,7 +1468,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) >> INIT_LIST_HEAD(&tun->disabled); >> err = tun_attach(tun, file, false); >> if (err < 0) >> - goto err_free_flow; >> + goto err_free_security; >> >> err = register_netdevice(tun->dev); >> if (err < 0) >> @@ -1705,8 +1518,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) >> >> err_detach: >> tun_detach_all(dev); >> -err_free_flow: >> - tun_flow_uninit(tun); >> +err_free_security: >> security_tun_dev_free_security(tun->security); >> err_free_dev: >> free_netdev(dev); >> -- >> 1.7.6.5 >> >> -- >> To unsubscribe from this list: send the line "unsubscribe netdev" in >> the body of a message to majordomo@vger.kernel.org >> More majordomo info at http://vger.kernel.org/majordomo-info.html -- Regards, Zhi Yong Wu ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [RFC PATCH] net, tun: remove the flow cache 2013-12-18 4:37 ` Zhi Yong Wu @ 2013-12-18 4:58 ` Tom Herbert 2013-12-18 5:23 ` Zhi Yong Wu 0 siblings, 1 reply; 12+ messages in thread From: Tom Herbert @ 2013-12-18 4:58 UTC (permalink / raw) To: Zhi Yong Wu Cc: Linux Netdev List, LKML, David Miller, Michael S. Tsirkin, Jason Wang, Zhi Yong Wu >> Yes , in it's current state it's broken. But maybe we can try to fix >> it instead of arbitrarily removing it. Please see my patches on >> plumbing RFS into tuntap which may start to make it useful. > Do you mean you patch [5/5] tun: Added support for RFS on tun flows? > Sorry, can you say with more details? Correct. It was RFC since I didn't have a good way to test, if you do please try it and see if there's any effect. We should also be able to do something similar for KVM guests, either doing the flow lookup on each packet from the guest, or use aRFS interface from the guest driver for end to end RFS (more exciting prospect). We are finding that guest to driver accelerations like this (and tso, lro) are quite important in getting virtual networking performance up. > >> >> Tom >> >>> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> >>> --- >>> drivers/net/tun.c | 208 +++-------------------------------------------------- >>> 1 files changed, 10 insertions(+), 198 deletions(-) >>> >>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c >>> index 7c8343a..7c27fdc 100644 >>> --- a/drivers/net/tun.c >>> +++ b/drivers/net/tun.c >>> @@ -32,12 +32,15 @@ >>> * >>> * Daniel Podlejski <underley@underley.eu.org> >>> * Modifications for 2.3.99-pre5 kernel. >>> + * >>> + * Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> >>> + * Remove the flow cache. >>> */ >>> >>> #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt >>> >>> #define DRV_NAME "tun" >>> -#define DRV_VERSION "1.6" >>> +#define DRV_VERSION "1.7" >>> #define DRV_DESCRIPTION "Universal TUN/TAP device driver" >>> #define DRV_COPYRIGHT "(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>" >>> >>> @@ -146,18 +149,6 @@ struct tun_file { >>> struct tun_struct *detached; >>> }; >>> >>> -struct tun_flow_entry { >>> - struct hlist_node hash_link; >>> - struct rcu_head rcu; >>> - struct tun_struct *tun; >>> - >>> - u32 rxhash; >>> - int queue_index; >>> - unsigned long updated; >>> -}; >>> - >>> -#define TUN_NUM_FLOW_ENTRIES 1024 >>> - >>> /* Since the socket were moved to tun_file, to preserve the behavior of persist >>> * device, socket filter, sndbuf and vnet header size were restore when the >>> * file were attached to a persist device. >>> @@ -184,163 +175,11 @@ struct tun_struct { >>> int debug; >>> #endif >>> spinlock_t lock; >>> - struct hlist_head flows[TUN_NUM_FLOW_ENTRIES]; >>> - struct timer_list flow_gc_timer; >>> - unsigned long ageing_time; >>> unsigned int numdisabled; >>> struct list_head disabled; >>> void *security; >>> - u32 flow_count; >>> }; >>> >>> -static inline u32 tun_hashfn(u32 rxhash) >>> -{ >>> - return rxhash & 0x3ff; >>> -} >>> - >>> -static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash) >>> -{ >>> - struct tun_flow_entry *e; >>> - >>> - hlist_for_each_entry_rcu(e, head, hash_link) { >>> - if (e->rxhash == rxhash) >>> - return e; >>> - } >>> - return NULL; >>> -} >>> - >>> -static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun, >>> - struct hlist_head *head, >>> - u32 rxhash, u16 queue_index) >>> -{ >>> - struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC); >>> - >>> - if (e) { >>> - tun_debug(KERN_INFO, tun, "create flow: hash %u index %u\n", >>> - rxhash, queue_index); >>> - e->updated = jiffies; >>> - e->rxhash = rxhash; >>> - e->queue_index = queue_index; >>> - e->tun = tun; >>> - hlist_add_head_rcu(&e->hash_link, head); >>> - ++tun->flow_count; >>> - } >>> - return e; >>> -} >>> - >>> -static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e) >>> -{ >>> - tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n", >>> - e->rxhash, e->queue_index); >>> - hlist_del_rcu(&e->hash_link); >>> - kfree_rcu(e, rcu); >>> - --tun->flow_count; >>> -} >>> - >>> -static void tun_flow_flush(struct tun_struct *tun) >>> -{ >>> - int i; >>> - >>> - spin_lock_bh(&tun->lock); >>> - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { >>> - struct tun_flow_entry *e; >>> - struct hlist_node *n; >>> - >>> - hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) >>> - tun_flow_delete(tun, e); >>> - } >>> - spin_unlock_bh(&tun->lock); >>> -} >>> - >>> -static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index) >>> -{ >>> - int i; >>> - >>> - spin_lock_bh(&tun->lock); >>> - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { >>> - struct tun_flow_entry *e; >>> - struct hlist_node *n; >>> - >>> - hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { >>> - if (e->queue_index == queue_index) >>> - tun_flow_delete(tun, e); >>> - } >>> - } >>> - spin_unlock_bh(&tun->lock); >>> -} >>> - >>> -static void tun_flow_cleanup(unsigned long data) >>> -{ >>> - struct tun_struct *tun = (struct tun_struct *)data; >>> - unsigned long delay = tun->ageing_time; >>> - unsigned long next_timer = jiffies + delay; >>> - unsigned long count = 0; >>> - int i; >>> - >>> - tun_debug(KERN_INFO, tun, "tun_flow_cleanup\n"); >>> - >>> - spin_lock_bh(&tun->lock); >>> - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { >>> - struct tun_flow_entry *e; >>> - struct hlist_node *n; >>> - >>> - hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { >>> - unsigned long this_timer; >>> - count++; >>> - this_timer = e->updated + delay; >>> - if (time_before_eq(this_timer, jiffies)) >>> - tun_flow_delete(tun, e); >>> - else if (time_before(this_timer, next_timer)) >>> - next_timer = this_timer; >>> - } >>> - } >>> - >>> - if (count) >>> - mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer)); >>> - spin_unlock_bh(&tun->lock); >>> -} >>> - >>> -static void tun_flow_update(struct tun_struct *tun, u32 rxhash, >>> - struct tun_file *tfile) >>> -{ >>> - struct hlist_head *head; >>> - struct tun_flow_entry *e; >>> - unsigned long delay = tun->ageing_time; >>> - u16 queue_index = tfile->queue_index; >>> - >>> - if (!rxhash) >>> - return; >>> - else >>> - head = &tun->flows[tun_hashfn(rxhash)]; >>> - >>> - rcu_read_lock(); >>> - >>> - /* We may get a very small possibility of OOO during switching, not >>> - * worth to optimize.*/ >>> - if (tun->numqueues == 1 || tfile->detached) >>> - goto unlock; >>> - >>> - e = tun_flow_find(head, rxhash); >>> - if (likely(e)) { >>> - /* TODO: keep queueing to old queue until it's empty? */ >>> - e->queue_index = queue_index; >>> - e->updated = jiffies; >>> - } else { >>> - spin_lock_bh(&tun->lock); >>> - if (!tun_flow_find(head, rxhash) && >>> - tun->flow_count < MAX_TAP_FLOWS) >>> - tun_flow_create(tun, head, rxhash, queue_index); >>> - >>> - if (!timer_pending(&tun->flow_gc_timer)) >>> - mod_timer(&tun->flow_gc_timer, >>> - round_jiffies_up(jiffies + delay)); >>> - spin_unlock_bh(&tun->lock); >>> - } >>> - >>> -unlock: >>> - rcu_read_unlock(); >>> -} >>> - >>> /* We try to identify a flow through its rxhash first. The reason that >>> * we do not check rxq no. is becuase some cards(e.g 82599), chooses >>> * the rxq based on the txq where the last packet of the flow comes. As >>> @@ -351,7 +190,6 @@ unlock: >>> static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb) >>> { >>> struct tun_struct *tun = netdev_priv(dev); >>> - struct tun_flow_entry *e; >>> u32 txq = 0; >>> u32 numqueues = 0; >>> >>> @@ -360,12 +198,11 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb) >>> >>> txq = skb_get_rxhash(skb); >>> if (txq) { >>> - e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); >>> - if (e) >>> - txq = e->queue_index; >>> - else >>> + txq = skb_get_queue_mapping(skb); >>> + if (!txq) { >>> /* use multiply and shift instead of expensive divide */ >>> txq = ((u64)txq * numqueues) >> 32; >>> + } >>> } else if (likely(skb_rx_queue_recorded(skb))) { >>> txq = skb_get_rx_queue(skb); >>> while (unlikely(txq >= numqueues)) >>> @@ -439,7 +276,6 @@ static void __tun_detach(struct tun_file *tfile, bool clean) >>> tun_disable_queue(tun, tfile); >>> >>> synchronize_net(); >>> - tun_flow_delete_by_queue(tun, tun->numqueues + 1); >>> /* Drop read queue */ >>> tun_queue_purge(tfile); >>> tun_set_real_num_queues(tun); >>> @@ -858,25 +694,6 @@ static const struct net_device_ops tap_netdev_ops = { >>> #endif >>> }; >>> >>> -static void tun_flow_init(struct tun_struct *tun) >>> -{ >>> - int i; >>> - >>> - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) >>> - INIT_HLIST_HEAD(&tun->flows[i]); >>> - >>> - tun->ageing_time = TUN_FLOW_EXPIRE; >>> - setup_timer(&tun->flow_gc_timer, tun_flow_cleanup, (unsigned long)tun); >>> - mod_timer(&tun->flow_gc_timer, >>> - round_jiffies_up(jiffies + tun->ageing_time)); >>> -} >>> - >>> -static void tun_flow_uninit(struct tun_struct *tun) >>> -{ >>> - del_timer_sync(&tun->flow_gc_timer); >>> - tun_flow_flush(tun); >>> -} >>> - >>> /* Initialize net device. */ >>> static void tun_net_init(struct net_device *dev) >>> { >>> @@ -986,7 +803,6 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, >>> int copylen; >>> bool zerocopy = false; >>> int err; >>> - u32 rxhash; >>> >>> if (!(tun->flags & TUN_NO_PI)) { >>> if (len < sizeof(pi)) >>> @@ -1146,13 +962,12 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, >>> skb_reset_network_header(skb); >>> skb_probe_transport_header(skb, 0); >>> >>> - rxhash = skb_get_rxhash(skb); >>> + skb_set_queue_mapping(skb, tfile->queue_index); >>> netif_rx_ni(skb); >>> >>> tun->dev->stats.rx_packets++; >>> tun->dev->stats.rx_bytes += len; >>> >>> - tun_flow_update(tun, rxhash, tfile); >>> return total_len; >>> } >>> >>> @@ -1370,7 +1185,6 @@ static void tun_free_netdev(struct net_device *dev) >>> struct tun_struct *tun = netdev_priv(dev); >>> >>> BUG_ON(!(list_empty(&tun->disabled))); >>> - tun_flow_uninit(tun); >>> security_tun_dev_free_security(tun->security); >>> free_netdev(dev); >>> } >>> @@ -1644,7 +1458,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) >>> goto err_free_dev; >>> >>> tun_net_init(dev); >>> - tun_flow_init(tun); >>> >>> dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | >>> TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | >>> @@ -1655,7 +1468,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) >>> INIT_LIST_HEAD(&tun->disabled); >>> err = tun_attach(tun, file, false); >>> if (err < 0) >>> - goto err_free_flow; >>> + goto err_free_security; >>> >>> err = register_netdevice(tun->dev); >>> if (err < 0) >>> @@ -1705,8 +1518,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) >>> >>> err_detach: >>> tun_detach_all(dev); >>> -err_free_flow: >>> - tun_flow_uninit(tun); >>> +err_free_security: >>> security_tun_dev_free_security(tun->security); >>> err_free_dev: >>> free_netdev(dev); >>> -- >>> 1.7.6.5 >>> >>> -- >>> To unsubscribe from this list: send the line "unsubscribe netdev" in >>> the body of a message to majordomo@vger.kernel.org >>> More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > -- > Regards, > > Zhi Yong Wu ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [RFC PATCH] net, tun: remove the flow cache 2013-12-18 4:58 ` Tom Herbert @ 2013-12-18 5:23 ` Zhi Yong Wu 0 siblings, 0 replies; 12+ messages in thread From: Zhi Yong Wu @ 2013-12-18 5:23 UTC (permalink / raw) To: Tom Herbert Cc: Linux Netdev List, LKML, David Miller, Michael S. Tsirkin, Jason Wang, Zhi Yong Wu On Wed, Dec 18, 2013 at 12:58 PM, Tom Herbert <therbert@google.com> wrote: >>> Yes , in it's current state it's broken. But maybe we can try to fix >>> it instead of arbitrarily removing it. Please see my patches on >>> plumbing RFS into tuntap which may start to make it useful. >> Do you mean you patch [5/5] tun: Added support for RFS on tun flows? >> Sorry, can you say with more details? > > Correct. It was RFC since I didn't have a good way to test, if you do > please try it and see if there's any effect. We should also be able to Interesting, i will try to dig it. Sorry, i don't understand why you can't test. Does it require some special hardware support? or other facilities? > do something similar for KVM guests, either doing the flow lookup on > each packet from the guest, or use aRFS interface from the guest > driver for end to end RFS (more exciting prospect). We are finding which two ends do you mean? > that guest to driver accelerations like this (and tso, lro) are quite Sorry, i got a bit confused, the driver here mean "virtio_net" or tuntap driver? > important in getting virtual networking performance up. > >> >>> >>> Tom >>> >>>> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> >>>> --- >>>> drivers/net/tun.c | 208 +++-------------------------------------------------- >>>> 1 files changed, 10 insertions(+), 198 deletions(-) >>>> >>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c >>>> index 7c8343a..7c27fdc 100644 >>>> --- a/drivers/net/tun.c >>>> +++ b/drivers/net/tun.c >>>> @@ -32,12 +32,15 @@ >>>> * >>>> * Daniel Podlejski <underley@underley.eu.org> >>>> * Modifications for 2.3.99-pre5 kernel. >>>> + * >>>> + * Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> >>>> + * Remove the flow cache. >>>> */ >>>> >>>> #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt >>>> >>>> #define DRV_NAME "tun" >>>> -#define DRV_VERSION "1.6" >>>> +#define DRV_VERSION "1.7" >>>> #define DRV_DESCRIPTION "Universal TUN/TAP device driver" >>>> #define DRV_COPYRIGHT "(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>" >>>> >>>> @@ -146,18 +149,6 @@ struct tun_file { >>>> struct tun_struct *detached; >>>> }; >>>> >>>> -struct tun_flow_entry { >>>> - struct hlist_node hash_link; >>>> - struct rcu_head rcu; >>>> - struct tun_struct *tun; >>>> - >>>> - u32 rxhash; >>>> - int queue_index; >>>> - unsigned long updated; >>>> -}; >>>> - >>>> -#define TUN_NUM_FLOW_ENTRIES 1024 >>>> - >>>> /* Since the socket were moved to tun_file, to preserve the behavior of persist >>>> * device, socket filter, sndbuf and vnet header size were restore when the >>>> * file were attached to a persist device. >>>> @@ -184,163 +175,11 @@ struct tun_struct { >>>> int debug; >>>> #endif >>>> spinlock_t lock; >>>> - struct hlist_head flows[TUN_NUM_FLOW_ENTRIES]; >>>> - struct timer_list flow_gc_timer; >>>> - unsigned long ageing_time; >>>> unsigned int numdisabled; >>>> struct list_head disabled; >>>> void *security; >>>> - u32 flow_count; >>>> }; >>>> >>>> -static inline u32 tun_hashfn(u32 rxhash) >>>> -{ >>>> - return rxhash & 0x3ff; >>>> -} >>>> - >>>> -static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash) >>>> -{ >>>> - struct tun_flow_entry *e; >>>> - >>>> - hlist_for_each_entry_rcu(e, head, hash_link) { >>>> - if (e->rxhash == rxhash) >>>> - return e; >>>> - } >>>> - return NULL; >>>> -} >>>> - >>>> -static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun, >>>> - struct hlist_head *head, >>>> - u32 rxhash, u16 queue_index) >>>> -{ >>>> - struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC); >>>> - >>>> - if (e) { >>>> - tun_debug(KERN_INFO, tun, "create flow: hash %u index %u\n", >>>> - rxhash, queue_index); >>>> - e->updated = jiffies; >>>> - e->rxhash = rxhash; >>>> - e->queue_index = queue_index; >>>> - e->tun = tun; >>>> - hlist_add_head_rcu(&e->hash_link, head); >>>> - ++tun->flow_count; >>>> - } >>>> - return e; >>>> -} >>>> - >>>> -static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e) >>>> -{ >>>> - tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n", >>>> - e->rxhash, e->queue_index); >>>> - hlist_del_rcu(&e->hash_link); >>>> - kfree_rcu(e, rcu); >>>> - --tun->flow_count; >>>> -} >>>> - >>>> -static void tun_flow_flush(struct tun_struct *tun) >>>> -{ >>>> - int i; >>>> - >>>> - spin_lock_bh(&tun->lock); >>>> - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { >>>> - struct tun_flow_entry *e; >>>> - struct hlist_node *n; >>>> - >>>> - hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) >>>> - tun_flow_delete(tun, e); >>>> - } >>>> - spin_unlock_bh(&tun->lock); >>>> -} >>>> - >>>> -static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index) >>>> -{ >>>> - int i; >>>> - >>>> - spin_lock_bh(&tun->lock); >>>> - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { >>>> - struct tun_flow_entry *e; >>>> - struct hlist_node *n; >>>> - >>>> - hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { >>>> - if (e->queue_index == queue_index) >>>> - tun_flow_delete(tun, e); >>>> - } >>>> - } >>>> - spin_unlock_bh(&tun->lock); >>>> -} >>>> - >>>> -static void tun_flow_cleanup(unsigned long data) >>>> -{ >>>> - struct tun_struct *tun = (struct tun_struct *)data; >>>> - unsigned long delay = tun->ageing_time; >>>> - unsigned long next_timer = jiffies + delay; >>>> - unsigned long count = 0; >>>> - int i; >>>> - >>>> - tun_debug(KERN_INFO, tun, "tun_flow_cleanup\n"); >>>> - >>>> - spin_lock_bh(&tun->lock); >>>> - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { >>>> - struct tun_flow_entry *e; >>>> - struct hlist_node *n; >>>> - >>>> - hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { >>>> - unsigned long this_timer; >>>> - count++; >>>> - this_timer = e->updated + delay; >>>> - if (time_before_eq(this_timer, jiffies)) >>>> - tun_flow_delete(tun, e); >>>> - else if (time_before(this_timer, next_timer)) >>>> - next_timer = this_timer; >>>> - } >>>> - } >>>> - >>>> - if (count) >>>> - mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer)); >>>> - spin_unlock_bh(&tun->lock); >>>> -} >>>> - >>>> -static void tun_flow_update(struct tun_struct *tun, u32 rxhash, >>>> - struct tun_file *tfile) >>>> -{ >>>> - struct hlist_head *head; >>>> - struct tun_flow_entry *e; >>>> - unsigned long delay = tun->ageing_time; >>>> - u16 queue_index = tfile->queue_index; >>>> - >>>> - if (!rxhash) >>>> - return; >>>> - else >>>> - head = &tun->flows[tun_hashfn(rxhash)]; >>>> - >>>> - rcu_read_lock(); >>>> - >>>> - /* We may get a very small possibility of OOO during switching, not >>>> - * worth to optimize.*/ >>>> - if (tun->numqueues == 1 || tfile->detached) >>>> - goto unlock; >>>> - >>>> - e = tun_flow_find(head, rxhash); >>>> - if (likely(e)) { >>>> - /* TODO: keep queueing to old queue until it's empty? */ >>>> - e->queue_index = queue_index; >>>> - e->updated = jiffies; >>>> - } else { >>>> - spin_lock_bh(&tun->lock); >>>> - if (!tun_flow_find(head, rxhash) && >>>> - tun->flow_count < MAX_TAP_FLOWS) >>>> - tun_flow_create(tun, head, rxhash, queue_index); >>>> - >>>> - if (!timer_pending(&tun->flow_gc_timer)) >>>> - mod_timer(&tun->flow_gc_timer, >>>> - round_jiffies_up(jiffies + delay)); >>>> - spin_unlock_bh(&tun->lock); >>>> - } >>>> - >>>> -unlock: >>>> - rcu_read_unlock(); >>>> -} >>>> - >>>> /* We try to identify a flow through its rxhash first. The reason that >>>> * we do not check rxq no. is becuase some cards(e.g 82599), chooses >>>> * the rxq based on the txq where the last packet of the flow comes. As >>>> @@ -351,7 +190,6 @@ unlock: >>>> static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb) >>>> { >>>> struct tun_struct *tun = netdev_priv(dev); >>>> - struct tun_flow_entry *e; >>>> u32 txq = 0; >>>> u32 numqueues = 0; >>>> >>>> @@ -360,12 +198,11 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb) >>>> >>>> txq = skb_get_rxhash(skb); >>>> if (txq) { >>>> - e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); >>>> - if (e) >>>> - txq = e->queue_index; >>>> - else >>>> + txq = skb_get_queue_mapping(skb); >>>> + if (!txq) { >>>> /* use multiply and shift instead of expensive divide */ >>>> txq = ((u64)txq * numqueues) >> 32; >>>> + } >>>> } else if (likely(skb_rx_queue_recorded(skb))) { >>>> txq = skb_get_rx_queue(skb); >>>> while (unlikely(txq >= numqueues)) >>>> @@ -439,7 +276,6 @@ static void __tun_detach(struct tun_file *tfile, bool clean) >>>> tun_disable_queue(tun, tfile); >>>> >>>> synchronize_net(); >>>> - tun_flow_delete_by_queue(tun, tun->numqueues + 1); >>>> /* Drop read queue */ >>>> tun_queue_purge(tfile); >>>> tun_set_real_num_queues(tun); >>>> @@ -858,25 +694,6 @@ static const struct net_device_ops tap_netdev_ops = { >>>> #endif >>>> }; >>>> >>>> -static void tun_flow_init(struct tun_struct *tun) >>>> -{ >>>> - int i; >>>> - >>>> - for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) >>>> - INIT_HLIST_HEAD(&tun->flows[i]); >>>> - >>>> - tun->ageing_time = TUN_FLOW_EXPIRE; >>>> - setup_timer(&tun->flow_gc_timer, tun_flow_cleanup, (unsigned long)tun); >>>> - mod_timer(&tun->flow_gc_timer, >>>> - round_jiffies_up(jiffies + tun->ageing_time)); >>>> -} >>>> - >>>> -static void tun_flow_uninit(struct tun_struct *tun) >>>> -{ >>>> - del_timer_sync(&tun->flow_gc_timer); >>>> - tun_flow_flush(tun); >>>> -} >>>> - >>>> /* Initialize net device. */ >>>> static void tun_net_init(struct net_device *dev) >>>> { >>>> @@ -986,7 +803,6 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, >>>> int copylen; >>>> bool zerocopy = false; >>>> int err; >>>> - u32 rxhash; >>>> >>>> if (!(tun->flags & TUN_NO_PI)) { >>>> if (len < sizeof(pi)) >>>> @@ -1146,13 +962,12 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, >>>> skb_reset_network_header(skb); >>>> skb_probe_transport_header(skb, 0); >>>> >>>> - rxhash = skb_get_rxhash(skb); >>>> + skb_set_queue_mapping(skb, tfile->queue_index); >>>> netif_rx_ni(skb); >>>> >>>> tun->dev->stats.rx_packets++; >>>> tun->dev->stats.rx_bytes += len; >>>> >>>> - tun_flow_update(tun, rxhash, tfile); >>>> return total_len; >>>> } >>>> >>>> @@ -1370,7 +1185,6 @@ static void tun_free_netdev(struct net_device *dev) >>>> struct tun_struct *tun = netdev_priv(dev); >>>> >>>> BUG_ON(!(list_empty(&tun->disabled))); >>>> - tun_flow_uninit(tun); >>>> security_tun_dev_free_security(tun->security); >>>> free_netdev(dev); >>>> } >>>> @@ -1644,7 +1458,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) >>>> goto err_free_dev; >>>> >>>> tun_net_init(dev); >>>> - tun_flow_init(tun); >>>> >>>> dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | >>>> TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | >>>> @@ -1655,7 +1468,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) >>>> INIT_LIST_HEAD(&tun->disabled); >>>> err = tun_attach(tun, file, false); >>>> if (err < 0) >>>> - goto err_free_flow; >>>> + goto err_free_security; >>>> >>>> err = register_netdevice(tun->dev); >>>> if (err < 0) >>>> @@ -1705,8 +1518,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) >>>> >>>> err_detach: >>>> tun_detach_all(dev); >>>> -err_free_flow: >>>> - tun_flow_uninit(tun); >>>> +err_free_security: >>>> security_tun_dev_free_security(tun->security); >>>> err_free_dev: >>>> free_netdev(dev); >>>> -- >>>> 1.7.6.5 >>>> >>>> -- >>>> To unsubscribe from this list: send the line "unsubscribe netdev" in >>>> the body of a message to majordomo@vger.kernel.org >>>> More majordomo info at http://vger.kernel.org/majordomo-info.html >> >> >> >> -- >> Regards, >> >> Zhi Yong Wu -- Regards, Zhi Yong Wu ^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2013-12-23 7:00 UTC | newest] Thread overview: 12+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2013-12-17 7:26 [RFC PATCH] net, tun: remove the flow cache Zhi Yong Wu 2013-12-17 7:47 ` Stephen Hemminger 2013-12-17 8:13 ` Zhi Yong Wu 2013-12-17 8:49 ` Jason Wang 2013-12-17 9:13 ` Zhi Yong Wu 2013-12-17 10:05 ` Jason Wang 2013-12-18 2:08 ` Zhi Yong Wu 2013-12-23 7:00 ` Jason Wang 2013-12-18 4:06 ` Tom Herbert 2013-12-18 4:37 ` Zhi Yong Wu 2013-12-18 4:58 ` Tom Herbert 2013-12-18 5:23 ` Zhi Yong Wu
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).