* Re: [PATCH v3] prism54: isl_38xx: Replace 'struct timeval'
From: Johannes Berg @ 2016-04-13 8:38 UTC (permalink / raw)
To: Tina Ruchandani, Arnd Bergmann
Cc: y2038, netdev, linux-wireless, linux-kernel
In-Reply-To: <20160413060916.GA21184@localhost>
> The patch was build-tested / debugged by removing the
> "if VERBOSE > SHOW_ERROR_MESSAGES" guards.
Stands to reason that we should just remove the (more or less) dead
code, since I don't think anyone really ever touches this driver any
more or will ever again ...
johannes
_______________________________________________
Y2038 mailing list
Y2038@lists.linaro.org
https://lists.linaro.org/mailman/listinfo/y2038
^ permalink raw reply
* [PATCH net-next] tun: use per cpu variables for stats accounting
From: Paolo Abeni @ 2016-04-13 8:52 UTC (permalink / raw)
To: netdev
Cc: David S. Miller, Michael S. Tsirkin, Hannes Frederic Sowa,
Eric W. Biederman, Greg Kurz, Jason Wang
Currently the tun device accounting uses dev->stats without applying any
kind of protection, regardless that accounting happens in preemptible
process context.
This patch move the tun stats to a per cpu data structure, and protect
the updates with u64_stats_update_begin()/u64_stats_update_end() or
this_cpu_inc according to the stat type. The per cpu stats are
aggregated by the newly added ndo_get_stats64 ops.
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
drivers/net/tun.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 83 insertions(+), 12 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index a746616..faf9297 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -131,6 +131,17 @@ struct tap_filter {
#define TUN_FLOW_EXPIRE (3 * HZ)
+struct tun_pcpu_stats {
+ u64 rx_packets;
+ u64 rx_bytes;
+ u64 tx_packets;
+ u64 tx_bytes;
+ struct u64_stats_sync syncp;
+ u32 rx_dropped;
+ u32 tx_dropped;
+ u32 rx_frame_errors;
+};
+
/* A tun_file connects an open character device to a tuntap netdevice. It
* also contains all socket related structures (except sock_fprog and tap_filter)
* to serve as one transmit queue for tuntap device. The sock_fprog and
@@ -205,6 +216,7 @@ struct tun_struct {
struct list_head disabled;
void *security;
u32 flow_count;
+ struct tun_pcpu_stats __percpu *pcpu_stats;
};
#ifdef CONFIG_TUN_VNET_CROSS_LE
@@ -886,7 +898,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
return NETDEV_TX_OK;
drop:
- dev->stats.tx_dropped++;
+ this_cpu_inc(tun->pcpu_stats->tx_dropped);
skb_tx_error(skb);
kfree_skb(skb);
rcu_read_unlock();
@@ -949,6 +961,43 @@ static void tun_set_headroom(struct net_device *dev, int new_hr)
tun->align = new_hr;
}
+static struct rtnl_link_stats64 *
+tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
+{
+ u32 rx_dropped = 0, tx_dropped = 0, rx_frame_errors = 0;
+ struct tun_struct *tun = netdev_priv(dev);
+ struct tun_pcpu_stats *p;
+ int i;
+
+ for_each_possible_cpu(i) {
+ u64 rxpackets, rxbytes, txpackets, txbytes;
+ unsigned int start;
+
+ p = per_cpu_ptr(tun->pcpu_stats, i);
+ do {
+ start = u64_stats_fetch_begin(&p->syncp);
+ rxpackets = p->rx_packets;
+ rxbytes = p->rx_bytes;
+ txpackets = p->tx_packets;
+ txbytes = p->tx_bytes;
+ } while (u64_stats_fetch_retry(&p->syncp, start));
+
+ stats->rx_packets += rxpackets;
+ stats->rx_bytes += rxbytes;
+ stats->tx_packets += txpackets;
+ stats->tx_bytes += txbytes;
+
+ /* u32 counters */
+ rx_dropped += p->rx_dropped;
+ rx_frame_errors += p->rx_frame_errors;
+ tx_dropped += p->tx_dropped;
+ }
+ stats->rx_dropped = rx_dropped;
+ stats->rx_frame_errors = rx_frame_errors;
+ stats->tx_dropped = tx_dropped;
+ return stats;
+}
+
static const struct net_device_ops tun_netdev_ops = {
.ndo_uninit = tun_net_uninit,
.ndo_open = tun_net_open,
@@ -961,6 +1010,7 @@ static const struct net_device_ops tun_netdev_ops = {
.ndo_poll_controller = tun_poll_controller,
#endif
.ndo_set_rx_headroom = tun_set_headroom,
+ .ndo_get_stats64 = tun_net_get_stats64,
};
static const struct net_device_ops tap_netdev_ops = {
@@ -979,6 +1029,7 @@ static const struct net_device_ops tap_netdev_ops = {
#endif
.ndo_features_check = passthru_features_check,
.ndo_set_rx_headroom = tun_set_headroom,
+ .ndo_get_stats64 = tun_net_get_stats64,
};
static void tun_flow_init(struct tun_struct *tun)
@@ -1103,6 +1154,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
size_t total_len = iov_iter_count(from);
size_t len = total_len, align = tun->align, linear;
struct virtio_net_hdr gso = { 0 };
+ struct tun_pcpu_stats *stats;
int good_linear;
int copylen;
bool zerocopy = false;
@@ -1177,7 +1229,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
if (IS_ERR(skb)) {
if (PTR_ERR(skb) != -EAGAIN)
- tun->dev->stats.rx_dropped++;
+ this_cpu_inc(tun->pcpu_stats->rx_dropped);
return PTR_ERR(skb);
}
@@ -1192,7 +1244,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
}
if (err) {
- tun->dev->stats.rx_dropped++;
+ this_cpu_inc(tun->pcpu_stats->rx_dropped);
kfree_skb(skb);
return -EFAULT;
}
@@ -1200,7 +1252,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
if (gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
if (!skb_partial_csum_set(skb, tun16_to_cpu(tun, gso.csum_start),
tun16_to_cpu(tun, gso.csum_offset))) {
- tun->dev->stats.rx_frame_errors++;
+ this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
kfree_skb(skb);
return -EINVAL;
}
@@ -1217,7 +1269,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
pi.proto = htons(ETH_P_IPV6);
break;
default:
- tun->dev->stats.rx_dropped++;
+ this_cpu_inc(tun->pcpu_stats->rx_dropped);
kfree_skb(skb);
return -EINVAL;
}
@@ -1245,7 +1297,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
break;
default:
- tun->dev->stats.rx_frame_errors++;
+ this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
kfree_skb(skb);
return -EINVAL;
}
@@ -1255,7 +1307,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
skb_shinfo(skb)->gso_size = tun16_to_cpu(tun, gso.gso_size);
if (skb_shinfo(skb)->gso_size == 0) {
- tun->dev->stats.rx_frame_errors++;
+ this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
kfree_skb(skb);
return -EINVAL;
}
@@ -1278,8 +1330,12 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
rxhash = skb_get_hash(skb);
netif_rx_ni(skb);
- tun->dev->stats.rx_packets++;
- tun->dev->stats.rx_bytes += len;
+ stats = get_cpu_ptr(tun->pcpu_stats);
+ u64_stats_update_begin(&stats->syncp);
+ stats->rx_packets++;
+ stats->rx_bytes += len;
+ u64_stats_update_end(&stats->syncp);
+ put_cpu_ptr(stats);
tun_flow_update(tun, rxhash, tfile);
return total_len;
@@ -1308,6 +1364,7 @@ static ssize_t tun_put_user(struct tun_struct *tun,
struct iov_iter *iter)
{
struct tun_pi pi = { 0, skb->protocol };
+ struct tun_pcpu_stats *stats;
ssize_t total;
int vlan_offset = 0;
int vlan_hlen = 0;
@@ -1408,8 +1465,13 @@ static ssize_t tun_put_user(struct tun_struct *tun,
skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset);
done:
- tun->dev->stats.tx_packets++;
- tun->dev->stats.tx_bytes += skb->len + vlan_hlen;
+ /* caller is in process context, */
+ stats = get_cpu_ptr(tun->pcpu_stats);
+ u64_stats_update_begin(&stats->syncp);
+ stats->tx_packets++;
+ stats->tx_bytes += skb->len + vlan_hlen;
+ u64_stats_update_end(&stats->syncp);
+ put_cpu_ptr(tun->pcpu_stats);
return total;
}
@@ -1467,6 +1529,7 @@ static void tun_free_netdev(struct net_device *dev)
struct tun_struct *tun = netdev_priv(dev);
BUG_ON(!(list_empty(&tun->disabled)));
+ free_percpu(tun->pcpu_stats);
tun_flow_uninit(tun);
security_tun_dev_free_security(tun->security);
free_netdev(dev);
@@ -1715,11 +1778,17 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
tun->filter_attached = false;
tun->sndbuf = tfile->socket.sk->sk_sndbuf;
+ tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats);
+ if (!tun->pcpu_stats) {
+ err = -ENOMEM;
+ goto err_free_dev;
+ }
+
spin_lock_init(&tun->lock);
err = security_tun_dev_alloc_security(&tun->security);
if (err < 0)
- goto err_free_dev;
+ goto err_free_stat;
tun_net_init(dev);
tun_flow_init(tun);
@@ -1763,6 +1832,8 @@ err_detach:
err_free_flow:
tun_flow_uninit(tun);
security_tun_dev_free_security(tun->security);
+err_free_stat:
+ free_percpu(tun->pcpu_stats);
err_free_dev:
free_netdev(dev);
return err;
--
1.8.3.1
^ permalink raw reply related
* [PATCH RFC 1/2] tun: don't require serialization lock on tx
From: Paolo Abeni @ 2016-04-13 9:04 UTC (permalink / raw)
To: netdev
Cc: David S. Miller, Michael S. Tsirkin, Hannes Frederic Sowa,
Eric W. Biederman, Greg Kurz, Jason Wang
In-Reply-To: <cover.1460393493.git.pabeni@redhat.com>
The current tun_net_xmit() implementation don't need any external
lock since it relay on rcu protection for the tun data structure
and on socket queue lock for skb queuing.
This patch set the NETIF_F_LLTX feature bit in the tun device, so
that on xmit, in absence of qdisc, no serialization lock is acquired
by the caller.
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
drivers/net/tun.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index faf9297..42992dc 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1796,7 +1796,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
NETIF_F_HW_VLAN_STAG_TX;
- dev->features = dev->hw_features;
+ dev->features = dev->hw_features | NETIF_F_LLTX;
dev->vlan_features = dev->features &
~(NETIF_F_HW_VLAN_CTAG_TX |
NETIF_F_HW_VLAN_STAG_TX);
--
1.8.3.1
^ permalink raw reply related
* [PATCH RFC 2/2] tun: don't set a default qdisc
From: Paolo Abeni @ 2016-04-13 9:04 UTC (permalink / raw)
To: netdev
Cc: David S. Miller, Michael S. Tsirkin, Hannes Frederic Sowa,
Eric W. Biederman, Greg Kurz, Jason Wang
In-Reply-To: <cover.1460393493.git.pabeni@redhat.com>
This patch disables the default qdisc by explicitly setting the
IFF_NO_QUEUE private flag so that now the tun xmit path do not
require any lock by default.
The default qdisc was first removed as a side effect of commit
f84bb1eac027 ("net: fix IFF_NO_QUEUE for drivers using alloc_netdev")
and recently restored with commit 016adb7260f4 ("tuntap: restore default qdisc")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
drivers/net/tun.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 42992dc..0a0b63c 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1081,6 +1081,7 @@ static void tun_net_init(struct net_device *dev)
break;
}
+ dev->priv_flags |= IFF_NO_QUEUE;
}
/* Character device part */
--
1.8.3.1
^ permalink raw reply related
* [PATCH RFC 0/2] tun: lockless xmit
From: Paolo Abeni @ 2016-04-13 9:04 UTC (permalink / raw)
To: netdev
Cc: David S. Miller, Michael S. Tsirkin, Hannes Frederic Sowa,
Eric W. Biederman, Greg Kurz, Jason Wang
This patch series try to remove the need for any lock in the tun device
xmit path, significantly improving the forwarding performance when multiple
processes are accessing the tun device (i.e. in a nic->bridge->tun->vm scenario).
The lockless xmit is obtained explicitly setting the NETIF_F_LLTX feature bit
and removing the default qdisc.
Unlikely most virtual devices, the tun driver has featured a default qdisc
for a long period, but it already lost such feature in linux 4.3.
Paolo Abeni (2):
tun: don't require serialization lock on tx
tun: don't set a default qdisc
drivers/net/tun.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--
1.8.3.1
^ permalink raw reply
* Re: [PATCH RFC 1/2] tun: don't require serialization lock on tx
From: Michael S. Tsirkin @ 2016-04-13 9:41 UTC (permalink / raw)
To: Paolo Abeni
Cc: netdev, David S. Miller, Hannes Frederic Sowa, Eric W. Biederman,
Greg Kurz, Jason Wang, Christian Borntraeger, Herbert Xu
In-Reply-To: <425e8f69ffd8fe315ef9d3cc678519c7060fb2e0.1460393493.git.pabeni@redhat.com>
On Wed, Apr 13, 2016 at 11:04:46AM +0200, Paolo Abeni wrote:
> The current tun_net_xmit() implementation don't need any external
> lock since it relay on rcu protection for the tun data structure
> and on socket queue lock for skb queuing.
>
> This patch set the NETIF_F_LLTX feature bit in the tun device, so
> that on xmit, in absence of qdisc, no serialization lock is acquired
> by the caller.
>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> drivers/net/tun.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index faf9297..42992dc 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1796,7 +1796,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
> dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
> TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
> NETIF_F_HW_VLAN_STAG_TX;
> - dev->features = dev->hw_features;
> + dev->features = dev->hw_features | NETIF_F_LLTX;
the documentation says:
NETIF_F_LLTX_BIT, /* LockLess TX - deprecated. Please */
/* do not use LLTX in new drivers */
git log gives this explanation:
> 1) unnecessary;
> 2) causes problems with AF_PACKET seeing things twice.
> dev->vlan_features = dev->features &
> ~(NETIF_F_HW_VLAN_CTAG_TX |
> NETIF_F_HW_VLAN_STAG_TX);
> --
> 1.8.3.1
^ permalink raw reply
* Re: [PATCH v3] prism54: isl_38xx: Replace 'struct timeval'
From: Arend Van Spriel @ 2016-04-13 9:45 UTC (permalink / raw)
To: Johannes Berg, Tina Ruchandani, Arnd Bergmann
Cc: y2038, netdev, linux-wireless, linux-kernel
In-Reply-To: <1460536706.3057.3.camel@sipsolutions.net>
On 13-4-2016 10:38, Johannes Berg wrote:
>
>> The patch was build-tested / debugged by removing the
>> "if VERBOSE > SHOW_ERROR_MESSAGES" guards.
>
> Stands to reason that we should just remove the (more or less) dead
> code, since I don't think anyone really ever touches this driver any
> more or will ever again ...
It does bring back memories from my Intersil/Globespan/Conexant day(s),
but not sentimental enough to touch the prism54 driver.
Gr. AvS
> johannes
> --
> To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
_______________________________________________
Y2038 mailing list
Y2038@lists.linaro.org
https://lists.linaro.org/mailman/listinfo/y2038
^ permalink raw reply
* Re: [PATCH RFC 1/2] tun: don't require serialization lock on tx
From: Hannes Frederic Sowa @ 2016-04-13 9:48 UTC (permalink / raw)
To: Michael S. Tsirkin, Paolo Abeni
Cc: netdev, David S. Miller, Eric W. Biederman, Greg Kurz, Jason Wang,
Christian Borntraeger, Herbert Xu
In-Reply-To: <20160413123828-mutt-send-email-mst@redhat.com>
On 13.04.2016 11:41, Michael S. Tsirkin wrote:
> On Wed, Apr 13, 2016 at 11:04:46AM +0200, Paolo Abeni wrote:
>> The current tun_net_xmit() implementation don't need any external
>> lock since it relay on rcu protection for the tun data structure
>> and on socket queue lock for skb queuing.
>>
>> This patch set the NETIF_F_LLTX feature bit in the tun device, so
>> that on xmit, in absence of qdisc, no serialization lock is acquired
>> by the caller.
>>
>> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
>> ---
>> drivers/net/tun.c | 2 +-
>> 1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index faf9297..42992dc 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -1796,7 +1796,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>> dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
>> TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
>> NETIF_F_HW_VLAN_STAG_TX;
>> - dev->features = dev->hw_features;
>> + dev->features = dev->hw_features | NETIF_F_LLTX;
>
> the documentation says:
> NETIF_F_LLTX_BIT, /* LockLess TX - deprecated. Please */
> /* do not use LLTX in new drivers */
In networking/netdev-features.txt
* LLTX driver (deprecated for hardware drivers)
NETIF_F_LLTX should be set in drivers that implement their own locking in
transmit path or don't need locking at all (e.g. software tunnels).
In ndo_start_xmit, it is recommended to use a try_lock and return
NETDEV_TX_LOCKED when the spin lock fails. The locking should also properly
protect against other callbacks (the rules you need to find out).
Don't use it for new drivers.
I think this is documentation is correct and it is only deprecated for
hardware drivers.
Bye,
Hannes
^ permalink raw reply
* [PATCH 2/3] io-mapping: Specify mapping size for io_mapping_map_wc()
From: Chris Wilson @ 2016-04-13 9:52 UTC (permalink / raw)
To: intel-gfx
Cc: David Airlie, netdev, Yishai Hadas, linux-kernel,
Peter Zijlstra (Intel), dri-devel, linux-rdma, Daniel Vetter,
Dan Williams, Ingo Molnar, David Hildenbrand
In-Reply-To: <1460541160-1955-1-git-send-email-chris@chris-wilson.co.uk>
The ioremap() hidden behind the io_mapping_map_wc() convenience helper
can be used for remapping multiple pages. Extend the helper so that
future callers can use it for larger ranges.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Daniel Vetter <daniel.vetter@intel.com>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: David Airlie <airlied@linux.ie>
Cc: Yishai Hadas <yishaih@mellanox.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Cc: David Hildenbrand <dahi@linux.vnet.ibm.com>
Cc: intel-gfx@lists.freedesktop.org
Cc: dri-devel@lists.freedesktop.org
Cc: netdev@vger.kernel.org
Cc: linux-rdma@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
drivers/gpu/drm/i915/intel_overlay.c | 3 ++-
drivers/net/ethernet/mellanox/mlx4/pd.c | 4 +++-
include/linux/io-mapping.h | 10 +++++++---
3 files changed, 12 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
index e487ff18b42f..5bba2cf62a26 100644
--- a/drivers/gpu/drm/i915/intel_overlay.c
+++ b/drivers/gpu/drm/i915/intel_overlay.c
@@ -198,7 +198,8 @@ intel_overlay_map_regs(struct intel_overlay *overlay)
regs = (struct overlay_registers __iomem *)overlay->reg_bo->phys_handle->vaddr;
else
regs = io_mapping_map_wc(ggtt->mappable,
- overlay->flip_addr);
+ overlay->flip_addr,
+ PAGE_SIZE);
return regs;
}
diff --git a/drivers/net/ethernet/mellanox/mlx4/pd.c b/drivers/net/ethernet/mellanox/mlx4/pd.c
index b3cc3ab63799..6fc156a3918d 100644
--- a/drivers/net/ethernet/mellanox/mlx4/pd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/pd.c
@@ -205,7 +205,9 @@ int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf, int node)
goto free_uar;
}
- uar->bf_map = io_mapping_map_wc(priv->bf_mapping, uar->index << PAGE_SHIFT);
+ uar->bf_map = io_mapping_map_wc(priv->bf_mapping,
+ uar->index << PAGE_SHIFT,
+ PAGE_SIZE);
if (!uar->bf_map) {
err = -ENOMEM;
goto unamp_uar;
diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h
index e399029b68c5..645ad06b5d52 100644
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -100,14 +100,16 @@ io_mapping_unmap_atomic(void __iomem *vaddr)
}
static inline void __iomem *
-io_mapping_map_wc(struct io_mapping *mapping, unsigned long offset)
+io_mapping_map_wc(struct io_mapping *mapping,
+ unsigned long offset,
+ unsigned long size)
{
resource_size_t phys_addr;
BUG_ON(offset >= mapping->size);
phys_addr = mapping->base + offset;
- return ioremap_wc(phys_addr, PAGE_SIZE);
+ return ioremap_wc(phys_addr, size);
}
static inline void
@@ -155,7 +157,9 @@ io_mapping_unmap_atomic(void __iomem *vaddr)
/* Non-atomic map/unmap */
static inline void __iomem *
-io_mapping_map_wc(struct io_mapping *mapping, unsigned long offset)
+io_mapping_map_wc(struct io_mapping *mapping,
+ unsigned long offset,
+ unsigned long size)
{
return ((char __force __iomem *) mapping) + offset;
}
--
2.8.0.rc3
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related
* Re: [PATCH RFC 2/2] tun: don't set a default qdisc
From: Michael S. Tsirkin @ 2016-04-13 10:26 UTC (permalink / raw)
To: Paolo Abeni
Cc: netdev, David S. Miller, Hannes Frederic Sowa, Eric W. Biederman,
Greg Kurz, Jason Wang
In-Reply-To: <1a216fb6cade7dcd7d2c1d4ce113a1ee6e3fdeb6.1460393493.git.pabeni@redhat.com>
On Wed, Apr 13, 2016 at 11:04:47AM +0200, Paolo Abeni wrote:
> This patch disables the default qdisc by explicitly setting the
> IFF_NO_QUEUE private flag so that now the tun xmit path do not
> require any lock by default.
>
> The default qdisc was first removed as a side effect of commit
> f84bb1eac027 ("net: fix IFF_NO_QUEUE for drivers using alloc_netdev")
> and recently restored with commit 016adb7260f4 ("tuntap: restore default qdisc")
>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
I wonder about this back and forth.
Jason, do you see a workload where the default qdisc
is preferable?
> ---
> drivers/net/tun.c | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 42992dc..0a0b63c 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1081,6 +1081,7 @@ static void tun_net_init(struct net_device *dev)
>
> break;
> }
> + dev->priv_flags |= IFF_NO_QUEUE;
> }
>
> /* Character device part */
> --
> 1.8.3.1
^ permalink raw reply
* Re: [RFC PATCH v2 5/5] Add sample for adding simple drop program to link
From: Jamal Hadi Salim @ 2016-04-13 10:40 UTC (permalink / raw)
To: Brenden Blanco
Cc: davem, netdev, tom, alexei.starovoitov, ogerlitz, daniel, brouer,
eric.dumazet, ecree, john.fastabend, tgraf, johannes,
eranlinuxmellanox, lorenzo
In-Reply-To: <20160410183810.GA18749@gmail.com>
On 16-04-10 02:38 PM, Brenden Blanco wrote:
>> I always go for the lowest hanging fruit.
> Which to me is the 60% time spent above the driver level as shown above.
[..]
>> It seemed it was the driver path in your case. When we removed
>> the driver overhead (as demoed at the tc workshop in netdev11) we saw
>> __netif_receive_skb_core() at the top of the profile.
>> So in this case seems it was mlx4_en_process_rx_cq() - thats why i
>> was saying the bottleneck is the driver.
> I wouldn't call it a bottleneck when the time spent is additive,
> aka run-to-completion.
The driver is a bottleneck regardless. It is probably the DMA interfaces
and lots of cacheline misses. So the first thing to
fix is whats at the top of the profile if you wanb
The fact you are dropping earlier is in itself an improvement
as long as you dont try to be too fancy.
> Of course the second perf report is on the same machine as the commit
> message. That was generated fresh for this email thread. All of the
> numbers I've quoted come from the same single-sender/single-receiver
> setup. I did also revert the change the in mlx4 driver and there was no
> change in the tc numbers.
Ok, i misunderstood then because you hinted Daniel had seen those
numbers. If you please also add that to your commit numbers.
cheers,
jamal
^ permalink raw reply
* Re: [PATCH RFC 0/2] tun: lockless xmit
From: Michael S. Tsirkin @ 2016-04-13 11:08 UTC (permalink / raw)
To: Paolo Abeni
Cc: netdev, David S. Miller, Hannes Frederic Sowa, Eric W. Biederman,
Greg Kurz, Jason Wang
In-Reply-To: <cover.1460393493.git.pabeni@redhat.com>
On Wed, Apr 13, 2016 at 11:04:45AM +0200, Paolo Abeni wrote:
> This patch series try to remove the need for any lock in the tun device
> xmit path, significantly improving the forwarding performance when multiple
> processes are accessing the tun device (i.e. in a nic->bridge->tun->vm scenario).
>
> The lockless xmit is obtained explicitly setting the NETIF_F_LLTX feature bit
> and removing the default qdisc.
>
> Unlikely most virtual devices, the tun driver has featured a default qdisc
> for a long period, but it already lost such feature in linux 4.3.
Thanks - I think it's a good idea to reduce the
lock contention there.
But I think it's unfortunate that it requires
bypassing the qdisc completely: this means
that anyone trying to do traffic shaping will
get back the contention.
Can we solve the lock contention for qdisc?
E.g. add a small lockless queue in front of it,
whoever has the qdisc lock would be
responsible for moving things from there to qdisc
proper.
Thoughts? Is there a chance this might work reasonably well?
> Paolo Abeni (2):
> tun: don't require serialization lock on tx
> tun: don't set a default qdisc
>
> drivers/net/tun.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> --
> 1.8.3.1
^ permalink raw reply
* [RFC] Is it a bug for nfs on udp6 mode or kernel?
From: Ding Tianhong @ 2016-04-13 11:28 UTC (permalink / raw)
To: linux-nfs-approval, tom, Netdev, linux-kernel@vger.kernel.org,
dhowells, viro, chuck.lever
Hi everyone:
I have met this problem when I try to test udp6 for nfs connection, my environment is:
Server:
kernel: 4.1.15
IP:xxxx::36/64
MTU:1500
Setting: /etc/exports:/home/nfs *(rw,sync,no_subtree_check,no_root_squash)
Client:
kernel: 4.1.18
IP:xxxx::90/64
MTU:1500
command: mount -t nfs -o vers=3,proto=udp6,rsize=4096,wsize=4096 [xxxx::36]:/home/nfs /home/tmp
I check the nfs parameter configuration, it looks fine and could work well for proto=tcp6.
Then I have mount correctly and try to run the command "ls", it hang.
When I use the rsize=1024 and wsize=1024 to mount, the problem disappeared, so I guess it is the problem for GSO or GRO for UDP。
Then I try to debug the problem, first I tcpdump the package from cline to server, and found that
the client have send readdirplus message to server correctly, and then the Server send a 4k package
to client(the big package will frag to 4 package by GSO), till now it looks fine, and the Client Nic could
receive the 4 skb then send to upper stack to ipv6 and udp, I found the incoming 4 package has been merged
to one and send to upper stack just like sunrpc, but I try to open the rpc_debug, it looks that the rpc could
not receive message.
I built a simple demo to test the udp stack, use the client socket to send big package to server socket, it work well,
so I think the udp is fine, maybe the bug is in sunrpc.
The test is very simple, does any body met the same problem like me, thanks for any suggestion.
Ding
^ permalink raw reply
* Re: [RFC PATCH 0/2] selinux: avoid nf hooks overhead when not needed
From: Paolo Abeni @ 2016-04-13 11:57 UTC (permalink / raw)
To: Casey Schaufler
Cc: Paul Moore, Florian Westphal, linux-security-module,
David S. Miller, James Morris, Andreas Gruenbacher,
Stephen Smalley, netdev, selinux
In-Reply-To: <570CFEBA.7090001@schaufler-ca.com>
On Tue, 2016-04-12 at 06:57 -0700, Casey Schaufler wrote:
> On 4/12/2016 1:52 AM, Paolo Abeni wrote:
> > On Thu, 2016-04-07 at 14:55 -0400, Paul Moore wrote:
> >> On Thursday, April 07, 2016 01:45:32 AM Florian Westphal wrote:
> >>> Paul Moore <paul@paul-moore.com> wrote:
> >>>> On Wed, Apr 6, 2016 at 6:14 PM, Florian Westphal <fw@strlen.de> wrote:
> >>>>> netfilter hooks are per namespace -- so there is hook unregister when
> >>>>> netns is destroyed.
> >>>> Looking around, I see the global and per-namespace registration
> >>>> functions (nf_register_hook and nf_register_net_hook, respectively),
> >>>> but I'm looking to see if/how newly created namespace inherit
> >>>> netfilter hooks from the init network namespace ... if you can create
> >>>> a network namespace and dodge the SELinux hooks, that isn't a good
> >>>> thing from a SELinux point of view, although it might be a plus
> >>>> depending on where you view Paolo's original patches ;)
> >>> Heh :-)
> >>>
> >>> If you use nf_register_net_hook, the hook is only registered in the
> >>> namespace.
> >>>
> >>> If you use nf_register_hook, the hook is put on a global list and
> >>> registed in all existing namespaces.
> >>>
> >>> New namespaces will have the hook added as well (see
> >>> netfilter_net_init -> nf_register_hook_list in netfilter/core.c )
> >>>
> >>> Since nf_register_hook is used it should be impossible to get a netns
> >>> that doesn't call these hooks.
> >> Great, thanks.
> >>
> >>>>> Do you think it makes sense to rework the patch to delay registering
> >>>>> of the netfiler hooks until the system is in a state where they're
> >>>>> needed, without the 'unregister' aspect?
> >>>> I would need to see the patch to say for certain, but in principle
> >>>> that seems perfectly reasonable and I think would satisfy both the
> >>>> netdev and SELinux camps - good suggestion. My main goal is to drop
> >>>> the selinux_nf_ip_init() entirely so it can't be used as a ROP gadget.
> >>>>
> >>>> We might even be able to trim the secmark_active and peerlbl_active
> >>>> checks in the SELinux netfilter hooks (an earlier attempt at
> >>>> optimization; contrary to popular belief, I do care about SELinux
> >>>> performance), although that would mean that enabling the network
> >>>> access controls would be one way ... I guess you can disregard that
> >>>> last bit, I'm thinking aloud again.
> >>> One way is fine I think.
> >> Yes, just disregard my second paragraph above.
> >>
> >>>>> Ideally this would even be per netns -- in perfect world we would
> >>>>> be able to make it so that a new netns are created with an empty
> >>>>> hook list.
> >>>> In general SELinux doesn't care about namespaces, for reasons that are
> >>>> sorta beyond the scope of this conversation, so I would like to stick
> >>>> to a all or nothing approach to enabling the SELinux netfilter hooks
> >>>> across namespaces. Perhaps we can revisit this at a later time, but
> >>>> let's keep it simple right now.
> >>> Okay, I'd prefer to stick to your recommendation anyway wrt. to selinux
> >>> (Casey, I read your comment regarding smack. Noted, we don't want to
> >>> break smack either...)
> >>>
> >>> I think that in this case the entire question is:
> >>>
> >>> In your experience, how likely is a config where selinux is enabled BUT the
> >>> hooks are not needed (i.e., where we hit the
> >>>
> >>> if (!selinux_policycap_netpeer)
> >>> return NF_ACCEPT;
> >>>
> >>> if (!secmark_active && !peerlbl_active)
> >>> return NF_ACCEPT;
> >>>
> >>> tests inside the hooks)? If such setups are uncommon we should just
> >>> drop this idea or at least put it on the back burner until the more
> >>> expensive netfilter hooks (conntrack, cough) are out of the way.
> >> A few years ago I would have said that it is relatively uncommon for admins to
> >> enable the SELinux network access controls; it was typically just
> >> government/intelligence agencies who had very strict access control
> >> requirements and represented a small portion of SELinux users. However, over
> >> the past few years I've been fielding more and more questions from admins/devs
> >> in the virtualization space who are interested in some of these capabilities;
> >> it isn't clear to me how many of these people are switching it on, but there
> >> is definitely more interest than I have seen in the past and the interested is
> >> centered around some rather common use cases.
> >>
> >> So, to summarize, I don't know ;)
> >>
> >> If you've got bigger sources of overhead, my opinion would be to go tackle
> >> those first. Perhaps I can even find the time to work on the
> >> SELinux/netfilter stuff while you are off slaying the bigger dragons, no
> >> promises at the moment.
> > Double checking if I got the above correctly.
> >
> > Will be ok if we post a v2 version of this series, removing the hooks
> > de-registration bits, but preserving the selinux nf-hooks and
> > socket_sock_rcv_skb() on-demand/delayed registration ?
>
> Imagine that I have two security modules that control sockets.
> The work I'm knee deep in will allow this. If adding hooks after
> the init phase is allowed you have to face the possibility that
> blob sizes (in this case sock->sk_security) may change. That
> requires checking on every hook that uses blobs to determine
> whether the blob has data for all the modules using it. I know
> that that is a simple matter of arithmetic, but it's additional
> overhead on every hook call. It also makes creating kmem caches
> for security blobs much more difficult. Another performance
> optimization that becomes unavailable.
I got your point.
Without seeing the code, I wonder if the above scenario could be covered
always allocating a blob large enough for all concurrent security
modules, i.e. each security module always declares/requests/allocates
space into the blobs regardless of it does not have registered (yet)
some security hooks, trading memory usage for performance.
Paolo
^ permalink raw reply
* [PATCH net-next] net/hsr: Added support for HSR v1
From: Peter Heise @ 2016-04-13 11:52 UTC (permalink / raw)
To: arvid.brodin, davem, hannes, sd, henrik, nikolay, tgraf, linville,
gospo, dsa, eranbe, ast, netdev, peter.heise
This patch adds support for the newer version 1 of the HSR
networking standard. Version 0 is still default and the new
version has to be selected via iproute2.
Main changes are in the supervision frame handling and its
ethertype field.
Signed-off-by: Peter Heise <peter.heise@airbus.com>
---
include/uapi/linux/if_ether.h | 1 +
include/uapi/linux/if_link.h | 1 +
net/hsr/Kconfig | 5 +--
net/hsr/hsr_device.c | 80 +++++++++++++++++++++++++------------------
net/hsr/hsr_device.h | 2 +-
net/hsr/hsr_forward.c | 43 +++++++++++++++++------
net/hsr/hsr_framereg.c | 30 ++++++++++------
net/hsr/hsr_main.h | 13 ++++++-
net/hsr/hsr_netlink.c | 10 ++++--
net/hsr/hsr_slave.c | 4 ++-
10 files changed, 126 insertions(+), 63 deletions(-)
diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index 4a93051..cec849a 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -92,6 +92,7 @@
#define ETH_P_TDLS 0x890D /* TDLS */
#define ETH_P_FIP 0x8914 /* FCoE Initialization Protocol */
#define ETH_P_80221 0x8917 /* IEEE 802.21 Media Independent Handover Protocol */
+#define ETH_P_HSR 0x892F /* IEC 62439-3 HSRv1 */
#define ETH_P_LOOPBACK 0x9000 /* Ethernet loopback packet, per IEEE 802.3 */
#define ETH_P_QINQ1 0x9100 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */
#define ETH_P_QINQ2 0x9200 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 9427f17..bb3a90b 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -773,6 +773,7 @@ enum {
IFLA_HSR_SLAVE1,
IFLA_HSR_SLAVE2,
IFLA_HSR_MULTICAST_SPEC, /* Last byte of supervision addr */
+ IFLA_HSR_VERSION, /* HSR version */
IFLA_HSR_SUPERVISION_ADDR, /* Supervision frame multicast addr */
IFLA_HSR_SEQ_NR,
__IFLA_HSR_MAX,
diff --git a/net/hsr/Kconfig b/net/hsr/Kconfig
index 0d3d709..4b683fd 100644
--- a/net/hsr/Kconfig
+++ b/net/hsr/Kconfig
@@ -18,8 +18,9 @@ config HSR
earlier.
This code is a "best effort" to comply with the HSR standard as
- described in IEC 62439-3:2010 (HSRv0), but no compliancy tests have
- been made.
+ described in IEC 62439-3:2010 (HSRv0) and IEC 62439-3:2012 (HSRv1),
+ but no compliancy tests have been made. Use iproute2 to select
+ the version you desire.
You need to perform any and all necessary tests yourself before
relying on this code in a safety critical system!
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index c7d1adc..386cbce 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -90,7 +90,8 @@ static void hsr_check_announce(struct net_device *hsr_dev,
hsr = netdev_priv(hsr_dev);
- if ((hsr_dev->operstate == IF_OPER_UP) && (old_operstate != IF_OPER_UP)) {
+ if ((hsr_dev->operstate == IF_OPER_UP)
+ && (old_operstate != IF_OPER_UP)) {
/* Went up */
hsr->announce_count = 0;
hsr->announce_timer.expires = jiffies +
@@ -250,31 +251,22 @@ static const struct header_ops hsr_header_ops = {
.parse = eth_header_parse,
};
-
-/* HSR:2010 supervision frames should be padded so that the whole frame,
- * including headers and FCS, is 64 bytes (without VLAN).
- */
-static int hsr_pad(int size)
-{
- const int min_size = ETH_ZLEN - HSR_HLEN - ETH_HLEN;
-
- if (size >= min_size)
- return size;
- return min_size;
-}
-
-static void send_hsr_supervision_frame(struct hsr_port *master, u8 type)
+static void send_hsr_supervision_frame(struct hsr_port *master,
+ u8 type, u8 hsrVer)
{
struct sk_buff *skb;
int hlen, tlen;
+ struct hsr_tag *hsr_tag;
struct hsr_sup_tag *hsr_stag;
struct hsr_sup_payload *hsr_sp;
unsigned long irqflags;
hlen = LL_RESERVED_SPACE(master->dev);
tlen = master->dev->needed_tailroom;
- skb = alloc_skb(hsr_pad(sizeof(struct hsr_sup_payload)) + hlen + tlen,
- GFP_ATOMIC);
+ skb = dev_alloc_skb(
+ sizeof(struct hsr_tag) +
+ sizeof(struct hsr_sup_tag) +
+ sizeof(struct hsr_sup_payload) + hlen + tlen);
if (skb == NULL)
return;
@@ -282,32 +274,48 @@ static void send_hsr_supervision_frame(struct hsr_port *master, u8 type)
skb_reserve(skb, hlen);
skb->dev = master->dev;
- skb->protocol = htons(ETH_P_PRP);
+ skb->protocol = htons(hsrVer ? ETH_P_HSR : ETH_P_PRP);
skb->priority = TC_PRIO_CONTROL;
- if (dev_hard_header(skb, skb->dev, ETH_P_PRP,
+ if (dev_hard_header(skb, skb->dev, (hsrVer ? ETH_P_HSR : ETH_P_PRP),
master->hsr->sup_multicast_addr,
skb->dev->dev_addr, skb->len) <= 0)
goto out;
skb_reset_mac_header(skb);
- hsr_stag = (typeof(hsr_stag)) skb_put(skb, sizeof(*hsr_stag));
+ if (hsrVer > 0) {
+ hsr_tag = (typeof(hsr_tag)) skb_put(skb, sizeof(struct hsr_tag));
+ hsr_tag->encap_proto = htons(ETH_P_PRP);
+ set_hsr_tag_LSDU_size(hsr_tag, HSR_V1_SUP_LSDUSIZE);
+ }
- set_hsr_stag_path(hsr_stag, 0xf);
- set_hsr_stag_HSR_Ver(hsr_stag, 0);
+ hsr_stag = (typeof(hsr_stag)) skb_put(skb, sizeof(struct hsr_sup_tag));
+ set_hsr_stag_path(hsr_stag, (hsrVer ? 0x0 : 0xf));
+ set_hsr_stag_HSR_Ver(hsr_stag, hsrVer);
+ /* From HSRv1 on we have separate supervision sequence numbers. */
spin_lock_irqsave(&master->hsr->seqnr_lock, irqflags);
- hsr_stag->sequence_nr = htons(master->hsr->sequence_nr);
- master->hsr->sequence_nr++;
+ if (hsrVer > 0) {
+ hsr_stag->sequence_nr = htons(master->hsr->sup_sequence_nr);
+ hsr_tag->sequence_nr = htons(master->hsr->sequence_nr);
+ master->hsr->sup_sequence_nr++;
+ master->hsr->sequence_nr++;
+ } else {
+ hsr_stag->sequence_nr = htons(master->hsr->sequence_nr);
+ master->hsr->sequence_nr++;
+ }
spin_unlock_irqrestore(&master->hsr->seqnr_lock, irqflags);
hsr_stag->HSR_TLV_Type = type;
- hsr_stag->HSR_TLV_Length = 12;
+ /* TODO: Why 12 in HSRv0? */
+ hsr_stag->HSR_TLV_Length = hsrVer ? sizeof(struct hsr_sup_payload) : 12;
/* Payload: MacAddressA */
- hsr_sp = (typeof(hsr_sp)) skb_put(skb, sizeof(*hsr_sp));
+ hsr_sp = (typeof(hsr_sp)) skb_put(skb, sizeof(struct hsr_sup_payload));
ether_addr_copy(hsr_sp->MacAddressA, master->dev->dev_addr);
+ skb_put_padto(skb, ETH_ZLEN + HSR_HLEN);
+
hsr_forward_skb(skb, master);
return;
@@ -329,19 +337,20 @@ static void hsr_announce(unsigned long data)
rcu_read_lock();
master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
- if (hsr->announce_count < 3) {
- send_hsr_supervision_frame(master, HSR_TLV_ANNOUNCE);
+ if (hsr->announce_count < 3 && hsr->protVersion == 0) {
+ send_hsr_supervision_frame(master, HSR_TLV_ANNOUNCE,
+ hsr->protVersion);
hsr->announce_count++;
- } else {
- send_hsr_supervision_frame(master, HSR_TLV_LIFE_CHECK);
- }
- if (hsr->announce_count < 3)
hsr->announce_timer.expires = jiffies +
msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL);
- else
+ } else {
+ send_hsr_supervision_frame(master, HSR_TLV_LIFE_CHECK,
+ hsr->protVersion);
+
hsr->announce_timer.expires = jiffies +
msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL);
+ }
if (is_admin_up(master->dev))
add_timer(&hsr->announce_timer);
@@ -428,7 +437,7 @@ static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = {
};
int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
- unsigned char multicast_spec)
+ unsigned char multicast_spec, u8 protocol_version)
{
struct hsr_priv *hsr;
struct hsr_port *port;
@@ -450,6 +459,7 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
spin_lock_init(&hsr->seqnr_lock);
/* Overflow soon to find bugs easier: */
hsr->sequence_nr = HSR_SEQNR_START;
+ hsr->sup_sequence_nr = HSR_SUP_SEQNR_START;
init_timer(&hsr->announce_timer);
hsr->announce_timer.function = hsr_announce;
@@ -462,6 +472,8 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
ether_addr_copy(hsr->sup_multicast_addr, def_multicast_addr);
hsr->sup_multicast_addr[ETH_ALEN - 1] = multicast_spec;
+ hsr->protVersion = protocol_version;
+
/* FIXME: should I modify the value of these?
*
* - hsr_dev->flags - i.e.
diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h
index 108a5d5..9975e31 100644
--- a/net/hsr/hsr_device.h
+++ b/net/hsr/hsr_device.h
@@ -17,7 +17,7 @@
void hsr_dev_setup(struct net_device *dev);
int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
- unsigned char multicast_spec);
+ unsigned char multicast_spec, u8 protocol_version);
void hsr_check_carrier_and_operstate(struct hsr_priv *hsr);
bool is_hsr_master(struct net_device *dev);
int hsr_get_max_mtu(struct hsr_priv *hsr);
diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c
index 7871ed6..5ee1d43 100644
--- a/net/hsr/hsr_forward.c
+++ b/net/hsr/hsr_forward.c
@@ -50,21 +50,40 @@ struct hsr_frame_info {
*/
static bool is_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb)
{
- struct hsr_ethhdr_sp *hdr;
+ struct ethhdr *ethHdr;
+ struct hsr_sup_tag *hsrSupTag;
+ struct hsrv1_ethhdr_sp *hsrV1Hdr;
WARN_ON_ONCE(!skb_mac_header_was_set(skb));
- hdr = (struct hsr_ethhdr_sp *) skb_mac_header(skb);
+ ethHdr = (struct ethhdr *) skb_mac_header(skb);
- if (!ether_addr_equal(hdr->ethhdr.h_dest,
+ /* Correct addr? */
+ if (!ether_addr_equal(ethHdr->h_dest,
hsr->sup_multicast_addr))
return false;
- if (get_hsr_stag_path(&hdr->hsr_sup) != 0x0f)
+ /* Correct ether type?. */
+ if (!(ethHdr->h_proto == htons(ETH_P_PRP)
+ || ethHdr->h_proto == htons(ETH_P_HSR)))
return false;
- if ((hdr->hsr_sup.HSR_TLV_Type != HSR_TLV_ANNOUNCE) &&
- (hdr->hsr_sup.HSR_TLV_Type != HSR_TLV_LIFE_CHECK))
+
+ /* Get the supervision header from correct location. */
+ if (ethHdr->h_proto == htons(ETH_P_HSR)) { /* Okay HSRv1. */
+ hsrV1Hdr = (struct hsrv1_ethhdr_sp *) skb_mac_header(skb);
+ if (hsrV1Hdr->hsr.encap_proto != htons(ETH_P_PRP))
+ return false;
+
+ hsrSupTag = &hsrV1Hdr->hsr_sup;
+ } else {
+ hsrSupTag = &((struct hsrv0_ethhdr_sp *) skb_mac_header(skb))->hsr_sup;
+ }
+
+ if ((hsrSupTag->HSR_TLV_Type != HSR_TLV_ANNOUNCE) &&
+ (hsrSupTag->HSR_TLV_Type != HSR_TLV_LIFE_CHECK))
return false;
- if (hdr->hsr_sup.HSR_TLV_Length != 12)
+ if ((hsrSupTag->HSR_TLV_Length != 12) &&
+ (hsrSupTag->HSR_TLV_Length !=
+ sizeof(struct hsr_sup_payload)))
return false;
return true;
@@ -110,7 +129,7 @@ static struct sk_buff *frame_get_stripped_skb(struct hsr_frame_info *frame,
static void hsr_fill_tag(struct sk_buff *skb, struct hsr_frame_info *frame,
- struct hsr_port *port)
+ struct hsr_port *port, u8 protoVersion)
{
struct hsr_ethhdr *hsr_ethhdr;
int lane_id;
@@ -131,7 +150,8 @@ static void hsr_fill_tag(struct sk_buff *skb, struct hsr_frame_info *frame,
set_hsr_tag_LSDU_size(&hsr_ethhdr->hsr_tag, lsdu_size);
hsr_ethhdr->hsr_tag.sequence_nr = htons(frame->sequence_nr);
hsr_ethhdr->hsr_tag.encap_proto = hsr_ethhdr->ethhdr.h_proto;
- hsr_ethhdr->ethhdr.h_proto = htons(ETH_P_PRP);
+ hsr_ethhdr->ethhdr.h_proto = htons(protoVersion ?
+ ETH_P_HSR : ETH_P_PRP);
}
static struct sk_buff *create_tagged_skb(struct sk_buff *skb_o,
@@ -160,7 +180,7 @@ static struct sk_buff *create_tagged_skb(struct sk_buff *skb_o,
memmove(dst, src, movelen);
skb_reset_mac_header(skb);
- hsr_fill_tag(skb, frame, port);
+ hsr_fill_tag(skb, frame, port, port->hsr->protVersion);
return skb;
}
@@ -320,7 +340,8 @@ static int hsr_fill_frame_info(struct hsr_frame_info *frame,
/* FIXME: */
WARN_ONCE(1, "HSR: VLAN not yet supported");
}
- if (ethhdr->h_proto == htons(ETH_P_PRP)) {
+ if (ethhdr->h_proto == htons(ETH_P_PRP)
+ || ethhdr->h_proto == htons(ETH_P_HSR)) {
frame->skb_std = NULL;
frame->skb_hsr = skb;
frame->sequence_nr = hsr_get_skb_sequence_nr(skb);
diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c
index bace124..7ea9258 100644
--- a/net/hsr/hsr_framereg.c
+++ b/net/hsr/hsr_framereg.c
@@ -177,17 +177,17 @@ struct hsr_node *hsr_get_node(struct list_head *node_db, struct sk_buff *skb,
return node;
}
- if (!is_sup)
- return NULL; /* Only supervision frame may create node entry */
+ /* Everyone may create a node entry, connected node to a HSR device. */
- if (ethhdr->h_proto == htons(ETH_P_PRP)) {
+ if (ethhdr->h_proto == htons(ETH_P_PRP)
+ || ethhdr->h_proto == htons(ETH_P_HSR)) {
/* Use the existing sequence_nr from the tag as starting point
* for filtering duplicate frames.
*/
seq_out = hsr_get_skb_sequence_nr(skb) - 1;
} else {
WARN_ONCE(1, "%s: Non-HSR frame\n", __func__);
- seq_out = 0;
+ seq_out = HSR_SEQNR_START;
}
return hsr_add_node(node_db, ethhdr->h_source, seq_out);
@@ -200,17 +200,25 @@ struct hsr_node *hsr_get_node(struct list_head *node_db, struct sk_buff *skb,
void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr,
struct hsr_port *port_rcv)
{
+ struct ethhdr *ethhdr;
struct hsr_node *node_real;
struct hsr_sup_payload *hsr_sp;
struct list_head *node_db;
int i;
- skb_pull(skb, sizeof(struct hsr_ethhdr_sp));
- hsr_sp = (struct hsr_sup_payload *) skb->data;
+ ethhdr = (struct ethhdr *) skb_mac_header(skb);
- if (ether_addr_equal(eth_hdr(skb)->h_source, hsr_sp->MacAddressA))
- /* Not sent from MacAddressB of a PICS_SUBS capable node */
- goto done;
+ /* Leave the ethernet header. */
+ skb_pull(skb, sizeof(struct ethhdr));
+
+ /* And leave the HSR tag. */
+ if (ethhdr->h_proto == htons(ETH_P_HSR))
+ skb_pull(skb, sizeof(struct hsr_tag));
+
+ /* And leave the HSR sup tag. */
+ skb_pull(skb, sizeof(struct hsr_sup_tag));
+
+ hsr_sp = (struct hsr_sup_payload *) skb->data;
/* Merge node_curr (registered on MacAddressB) into node_real */
node_db = &port_rcv->hsr->node_db;
@@ -225,7 +233,7 @@ void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr,
/* Node has already been merged */
goto done;
- ether_addr_copy(node_real->MacAddressB, eth_hdr(skb)->h_source);
+ ether_addr_copy(node_real->MacAddressB, ethhdr->h_source);
for (i = 0; i < HSR_PT_PORTS; i++) {
if (!node_curr->time_in_stale[i] &&
time_after(node_curr->time_in[i], node_real->time_in[i])) {
@@ -241,7 +249,7 @@ void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr,
kfree_rcu(node_curr, rcu_head);
done:
- skb_push(skb, sizeof(struct hsr_ethhdr_sp));
+ skb_push(skb, sizeof(struct hsrv1_ethhdr_sp));
}
diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h
index 5a9c699..9b9909e 100644
--- a/net/hsr/hsr_main.h
+++ b/net/hsr/hsr_main.h
@@ -30,6 +30,7 @@
*/
#define MAX_SLAVE_DIFF 3000 /* ms */
#define HSR_SEQNR_START (USHRT_MAX - 1024)
+#define HSR_SUP_SEQNR_START (HSR_SEQNR_START / 2)
/* How often shall we check for broken ring and remove node entries older than
@@ -58,6 +59,8 @@ struct hsr_tag {
#define HSR_HLEN 6
+#define HSR_V1_SUP_LSDUSIZE 52
+
/* The helper functions below assumes that 'path' occupies the 4 most
* significant bits of the 16-bit field shared by 'path' and 'LSDU_size' (or
* equivalently, the 4 most significant bits of HSR tag byte 14).
@@ -131,8 +134,14 @@ static inline void set_hsr_stag_HSR_Ver(struct hsr_sup_tag *hst, u16 HSR_Ver)
set_hsr_tag_LSDU_size((struct hsr_tag *) hst, HSR_Ver);
}
-struct hsr_ethhdr_sp {
+struct hsrv0_ethhdr_sp {
+ struct ethhdr ethhdr;
+ struct hsr_sup_tag hsr_sup;
+} __packed;
+
+struct hsrv1_ethhdr_sp {
struct ethhdr ethhdr;
+ struct hsr_tag hsr;
struct hsr_sup_tag hsr_sup;
} __packed;
@@ -162,6 +171,8 @@ struct hsr_priv {
struct timer_list prune_timer;
int announce_count;
u16 sequence_nr;
+ u16 sup_sequence_nr; /* For HSRv1 separate seq_nr for supervision */
+ u8 protVersion; /* Indicate if HSRv0 or HSRv1. */
spinlock_t seqnr_lock; /* locking for sequence_nr */
unsigned char sup_multicast_addr[ETH_ALEN];
};
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
index a2c7e4c..5425d87 100644
--- a/net/hsr/hsr_netlink.c
+++ b/net/hsr/hsr_netlink.c
@@ -23,6 +23,7 @@ static const struct nla_policy hsr_policy[IFLA_HSR_MAX + 1] = {
[IFLA_HSR_SLAVE1] = { .type = NLA_U32 },
[IFLA_HSR_SLAVE2] = { .type = NLA_U32 },
[IFLA_HSR_MULTICAST_SPEC] = { .type = NLA_U8 },
+ [IFLA_HSR_VERSION] = { .type = NLA_U8 },
[IFLA_HSR_SUPERVISION_ADDR] = { .type = NLA_BINARY, .len = ETH_ALEN },
[IFLA_HSR_SEQ_NR] = { .type = NLA_U16 },
};
@@ -35,7 +36,7 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
struct net_device *link[2];
- unsigned char multicast_spec;
+ unsigned char multicast_spec, hsr_version;
if (!data) {
netdev_info(dev, "HSR: No slave devices specified\n");
@@ -62,7 +63,12 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev,
else
multicast_spec = nla_get_u8(data[IFLA_HSR_MULTICAST_SPEC]);
- return hsr_dev_finalize(dev, link, multicast_spec);
+ if (!data[IFLA_HSR_VERSION])
+ hsr_version = 0;
+ else
+ hsr_version = nla_get_u8(data[IFLA_HSR_VERSION]);
+
+ return hsr_dev_finalize(dev, link, multicast_spec, hsr_version);
}
static int hsr_fill_info(struct sk_buff *skb, const struct net_device *dev)
diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c
index 7d37366..f5b6038 100644
--- a/net/hsr/hsr_slave.c
+++ b/net/hsr/hsr_slave.c
@@ -22,6 +22,7 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb)
{
struct sk_buff *skb = *pskb;
struct hsr_port *port;
+ u16 protocol;
if (!skb_mac_header_was_set(skb)) {
WARN_ONCE(1, "%s: skb invalid", __func__);
@@ -37,7 +38,8 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb)
goto finish_consume;
}
- if (eth_hdr(skb)->h_proto != htons(ETH_P_PRP))
+ protocol = eth_hdr(skb)->h_proto;
+ if (protocol != htons(ETH_P_PRP) && protocol != htons(ETH_P_HSR))
goto finish_pass;
skb_push(skb, ETH_HLEN);
--
2.5.0
^ permalink raw reply related
* [RESEND PATCH net-next] phy: keep the BCMR_LOOPBACK bit while setup forced mode
From: Weidong Wang @ 2016-04-13 11:59 UTC (permalink / raw)
To: David Miller, f.fainelli; +Cc: netdev, linux-kernel
When tested the PHY SGMII Loopback,:
1.set the LOOPBACK bit,
2.set the autoneg to AUTONEG_DISABLE, it calls the
genphy_setup_forced which will clear the bit.
So just keep the LOOPBACK bit while setup forced mode.
Signed-off-by: Weidong Wang <wangweidong1@huawei.com>
---
drivers/net/phy/phy_device.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index e551f3a..8da4b80 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1124,7 +1124,9 @@ static int genphy_config_advert(struct phy_device *phydev)
int genphy_setup_forced(struct phy_device *phydev)
{
int ctl = 0;
+ int val = phy_read(phydev, MII_BMCR);
+ ctl |= val & BMCR_LOOPBACK;
phydev->pause = 0;
phydev->asym_pause = 0;
-- 2.7.0
^ permalink raw reply related
* Re: [PATCH net-next v2 1/2] rtnetlink: add new RTM_GETSTATS message to dump link stats
From: Jamal Hadi Salim @ 2016-04-13 12:11 UTC (permalink / raw)
To: Thomas Graf, roopa; +Cc: netdev, davem, Nikolay Aleksandrov
In-Reply-To: <20160412132151.GA16560@pox.localdomain>
On 16-04-12 09:21 AM, Thomas Graf wrote:
> On 04/11/16 at 08:53pm, roopa wrote:
>> Top level stats attributes can be netdev or global attributes: We can include string "LINK" in
>> the names of all stats belonging to a netdev to make it easier to recognize the netdev stats (example):
>> IFLA_STATS_LINK64, (netdev)
>> IFLA_STATS_LINK_INET6, (netdev)
>> IFLA_STATS_TCP, (non-netdev, global tcp stats)
>
> This is fine as well. It means that we cant mix netdev and non-netdev
> stats or stats for multiple netdevs in the same request which would
> not be the case if you nest it and have a top level attribute which
> is a list of requests. That may be borderline to overengineering
> though so I'm fine this as well.
Well - using a subheader which has ifindex on it for non-netdev stats
seems wrong then.
cheers,
jamal
^ permalink raw reply
* Re: [PATCH RFC 0/2] tun: lockless xmit
From: Eric Dumazet @ 2016-04-13 12:50 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Paolo Abeni, netdev, David S. Miller, Hannes Frederic Sowa,
Eric W. Biederman, Greg Kurz, Jason Wang
In-Reply-To: <20160413132816-mutt-send-email-mst@redhat.com>
On Wed, 2016-04-13 at 14:08 +0300, Michael S. Tsirkin wrote:
> On Wed, Apr 13, 2016 at 11:04:45AM +0200, Paolo Abeni wrote:
> > This patch series try to remove the need for any lock in the tun device
> > xmit path, significantly improving the forwarding performance when multiple
> > processes are accessing the tun device (i.e. in a nic->bridge->tun->vm scenario).
> >
> > The lockless xmit is obtained explicitly setting the NETIF_F_LLTX feature bit
> > and removing the default qdisc.
> >
> > Unlikely most virtual devices, the tun driver has featured a default qdisc
> > for a long period, but it already lost such feature in linux 4.3.
>
> Thanks - I think it's a good idea to reduce the
> lock contention there.
>
> But I think it's unfortunate that it requires
> bypassing the qdisc completely: this means
> that anyone trying to do traffic shaping will
> get back the contention.
>
> Can we solve the lock contention for qdisc?
> E.g. add a small lockless queue in front of it,
> whoever has the qdisc lock would be
> responsible for moving things from there to qdisc
> proper.
>
> Thoughts? Is there a chance this might work reasonably well?
Adding any new queue in front of qdisc is problematic :
- Adds a new buffer, with extra latencies.
- If you want to implement priorities properly for X COS, you need X
queues.
- Who is going to service this extra buffer and feed the qdisc ?
- If the innocent guy is RT thread, maybe the extra latency will hurt.
- Adding another set of atomic ops.
We have such a schem here at Google (called holdq), but it was a
nightmare to tune.
We never tried to upstream this beast, it is kind of ugly, and were
expecting something better. Problem is : If you use HTB on a bonding
device, you want still to properly use MQ on the slaves.
HTB queue. 20 netperf generating UDP packets
lpaa23:~# ./super_netperf 20 -H lpaa24 -t UDP_STREAM -l 3000 -- -m 100 &
[1] 181993
With the holdq feature turned on : about 1 Mpps
lpaa23:~# sar -n DEV 1 10|grep eth0|grep Average
Average: eth0 28.50 999071.60 3.07 138542.64 0.00
0.00 0.60
holdq turned off : about 620 Kpps
lpaa23:~# sar -n DEV 1 10|grep eth0|grep Average
Average: eth0 39.00 617765.40 4.73 85667.42 0.00
0.00 0.90
^ permalink raw reply
* Re: [PATCH RFC 1/2] tun: don't require serialization lock on tx
From: Eric Dumazet @ 2016-04-13 12:52 UTC (permalink / raw)
To: Paolo Abeni
Cc: netdev, David S. Miller, Michael S. Tsirkin, Hannes Frederic Sowa,
Eric W. Biederman, Greg Kurz, Jason Wang
In-Reply-To: <425e8f69ffd8fe315ef9d3cc678519c7060fb2e0.1460393493.git.pabeni@redhat.com>
On Wed, 2016-04-13 at 11:04 +0200, Paolo Abeni wrote:
> The current tun_net_xmit() implementation don't need any external
> lock since it relay on rcu protection for the tun data structure
> and on socket queue lock for skb queuing.
>
> This patch set the NETIF_F_LLTX feature bit in the tun device, so
> that on xmit, in absence of qdisc, no serialization lock is acquired
> by the caller.
>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> drivers/net/tun.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index faf9297..42992dc 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1796,7 +1796,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
> dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
> TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
> NETIF_F_HW_VLAN_STAG_TX;
> - dev->features = dev->hw_features;
> + dev->features = dev->hw_features | NETIF_F_LLTX;
> dev->vlan_features = dev->features &
> ~(NETIF_F_HW_VLAN_CTAG_TX |
> NETIF_F_HW_VLAN_STAG_TX);
Acked-by: Eric Dumazet <edumazet@google.com>
^ permalink raw reply
* Re: [PATCH RFC 0/2] tun: lockless xmit
From: Michael S. Tsirkin @ 2016-04-13 12:56 UTC (permalink / raw)
To: Eric Dumazet
Cc: Paolo Abeni, netdev, David S. Miller, Hannes Frederic Sowa,
Eric W. Biederman, Greg Kurz, Jason Wang
In-Reply-To: <1460551817.10638.7.camel@edumazet-glaptop3.roam.corp.google.com>
On Wed, Apr 13, 2016 at 05:50:17AM -0700, Eric Dumazet wrote:
> On Wed, 2016-04-13 at 14:08 +0300, Michael S. Tsirkin wrote:
> > On Wed, Apr 13, 2016 at 11:04:45AM +0200, Paolo Abeni wrote:
> > > This patch series try to remove the need for any lock in the tun device
> > > xmit path, significantly improving the forwarding performance when multiple
> > > processes are accessing the tun device (i.e. in a nic->bridge->tun->vm scenario).
> > >
> > > The lockless xmit is obtained explicitly setting the NETIF_F_LLTX feature bit
> > > and removing the default qdisc.
> > >
> > > Unlikely most virtual devices, the tun driver has featured a default qdisc
> > > for a long period, but it already lost such feature in linux 4.3.
> >
> > Thanks - I think it's a good idea to reduce the
> > lock contention there.
> >
> > But I think it's unfortunate that it requires
> > bypassing the qdisc completely: this means
> > that anyone trying to do traffic shaping will
> > get back the contention.
> >
> > Can we solve the lock contention for qdisc?
> > E.g. add a small lockless queue in front of it,
> > whoever has the qdisc lock would be
> > responsible for moving things from there to qdisc
> > proper.
> >
> > Thoughts? Is there a chance this might work reasonably well?
>
> Adding any new queue in front of qdisc is problematic :
> - Adds a new buffer, with extra latencies.
Only where lock contention would previously occur, right?
> - If you want to implement priorities properly for X COS, you need X
> queues.
This definitely needs thought.
> - Who is going to service this extra buffer and feed the qdisc ?
The way I see it - whoever has the lock, at unlock time.
> - If the innocent guy is RT thread, maybe the extra latency will hurt.
Again - more than a lock?
> - Adding another set of atomic ops.
That's likely true. Use some per-cpu trick instead?
> We have such a schem here at Google (called holdq), but it was a
> nightmare to tune.
>
> We never tried to upstream this beast, it is kind of ugly, and were
> expecting something better. Problem is : If you use HTB on a bonding
> device, you want still to properly use MQ on the slaves.
>
> HTB queue. 20 netperf generating UDP packets
> lpaa23:~# ./super_netperf 20 -H lpaa24 -t UDP_STREAM -l 3000 -- -m 100 &
> [1] 181993
>
>
> With the holdq feature turned on : about 1 Mpps
>
> lpaa23:~# sar -n DEV 1 10|grep eth0|grep Average
> Average: eth0 28.50 999071.60 3.07 138542.64 0.00
> 0.00 0.60
>
> holdq turned off : about 620 Kpps
>
> lpaa23:~# sar -n DEV 1 10|grep eth0|grep Average
> Average: eth0 39.00 617765.40 4.73 85667.42 0.00
> 0.00 0.90
>
^ permalink raw reply
* Re: [PATCH RFC 1/2] tun: don't require serialization lock on tx
From: Michael S. Tsirkin @ 2016-04-13 12:57 UTC (permalink / raw)
To: Hannes Frederic Sowa
Cc: Paolo Abeni, netdev, David S. Miller, Eric W. Biederman,
Greg Kurz, Jason Wang, Christian Borntraeger, Herbert Xu
In-Reply-To: <570E15D9.4000308@stressinduktion.org>
On Wed, Apr 13, 2016 at 11:48:09AM +0200, Hannes Frederic Sowa wrote:
> On 13.04.2016 11:41, Michael S. Tsirkin wrote:
> >On Wed, Apr 13, 2016 at 11:04:46AM +0200, Paolo Abeni wrote:
> >>The current tun_net_xmit() implementation don't need any external
> >>lock since it relay on rcu protection for the tun data structure
> >>and on socket queue lock for skb queuing.
> >>
> >>This patch set the NETIF_F_LLTX feature bit in the tun device, so
> >>that on xmit, in absence of qdisc, no serialization lock is acquired
> >>by the caller.
> >>
> >>Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> >>---
> >> drivers/net/tun.c | 2 +-
> >> 1 file changed, 1 insertion(+), 1 deletion(-)
> >>
> >>diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> >>index faf9297..42992dc 100644
> >>--- a/drivers/net/tun.c
> >>+++ b/drivers/net/tun.c
> >>@@ -1796,7 +1796,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
> >> dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
> >> TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
> >> NETIF_F_HW_VLAN_STAG_TX;
> >>- dev->features = dev->hw_features;
> >>+ dev->features = dev->hw_features | NETIF_F_LLTX;
> >
> >the documentation says:
> > NETIF_F_LLTX_BIT, /* LockLess TX - deprecated. Please */
> > /* do not use LLTX in new drivers */
>
> In networking/netdev-features.txt
>
> * LLTX driver (deprecated for hardware drivers)
>
> NETIF_F_LLTX should be set in drivers that implement their own locking in
> transmit path or don't need locking at all (e.g. software tunnels).
> In ndo_start_xmit, it is recommended to use a try_lock and return
> NETDEV_TX_LOCKED when the spin lock fails. The locking should also properly
> protect against other callbacks (the rules you need to find out).
>
> Don't use it for new drivers.
>
> I think this is documentation is correct and it is only deprecated for
> hardware drivers.
>
> Bye,
> Hannes
Fine, but what's the AF_PACKET duplication that Herbert Xu
reported with NETIF_F_LLTX? Does anyone remember?
--
MST
^ permalink raw reply
* Re: [PATCH RFC 0/2] tun: lockless xmit
From: Eric Dumazet @ 2016-04-13 13:09 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Paolo Abeni, netdev, David S. Miller, Hannes Frederic Sowa,
Eric W. Biederman, Greg Kurz, Jason Wang
In-Reply-To: <20160413155146-mutt-send-email-mst@redhat.com>
On Wed, 2016-04-13 at 15:56 +0300, Michael S. Tsirkin wrote:
> On Wed, Apr 13, 2016 at 05:50:17AM -0700, Eric Dumazet wrote:
> > On Wed, 2016-04-13 at 14:08 +0300, Michael S. Tsirkin wrote:
> > > On Wed, Apr 13, 2016 at 11:04:45AM +0200, Paolo Abeni wrote:
> > > > This patch series try to remove the need for any lock in the tun device
> > > > xmit path, significantly improving the forwarding performance when multiple
> > > > processes are accessing the tun device (i.e. in a nic->bridge->tun->vm scenario).
> > > >
> > > > The lockless xmit is obtained explicitly setting the NETIF_F_LLTX feature bit
> > > > and removing the default qdisc.
> > > >
> > > > Unlikely most virtual devices, the tun driver has featured a default qdisc
> > > > for a long period, but it already lost such feature in linux 4.3.
> > >
> > > Thanks - I think it's a good idea to reduce the
> > > lock contention there.
> > >
> > > But I think it's unfortunate that it requires
> > > bypassing the qdisc completely: this means
> > > that anyone trying to do traffic shaping will
> > > get back the contention.
> > >
> > > Can we solve the lock contention for qdisc?
> > > E.g. add a small lockless queue in front of it,
> > > whoever has the qdisc lock would be
> > > responsible for moving things from there to qdisc
> > > proper.
> > >
> > > Thoughts? Is there a chance this might work reasonably well?
> >
> > Adding any new queue in front of qdisc is problematic :
> > - Adds a new buffer, with extra latencies.
>
> Only where lock contention would previously occur, right?
>
> > - If you want to implement priorities properly for X COS, you need X
> > queues.
>
> This definitely needs thought.
>
> > - Who is going to service this extra buffer and feed the qdisc ?
>
> The way I see it - whoever has the lock, at unlock time.
>
> > - If the innocent guy is RT thread, maybe the extra latency will hurt.
>
> Again - more than a lock?
Way more. HTB is slow as hell.
Remember the qdisc dequeue is already a big problem in itself.
Adding another layer can practically double the latencies.
>
> > - Adding another set of atomic ops.
>
> That's likely true. Use some per-cpu trick instead?
We tried that, and we got miserable production incidents...
You really need to convince John Fastabend to work full time on the real
thing, not on another queue in front of the existing qdisc.
^ permalink raw reply
* Re: [PATCH RFC 0/2] tun: lockless xmit
From: Michael S. Tsirkin @ 2016-04-13 13:17 UTC (permalink / raw)
To: Eric Dumazet
Cc: Paolo Abeni, netdev, David S. Miller, Hannes Frederic Sowa,
Eric W. Biederman, Greg Kurz, Jason Wang
In-Reply-To: <1460552966.10638.12.camel@edumazet-glaptop3.roam.corp.google.com>
On Wed, Apr 13, 2016 at 06:09:26AM -0700, Eric Dumazet wrote:
> You really need to convince John Fastabend to work full time on the real
> thing
Meaning making all qdiscs themselves lockless? With complex policies
like codel I can see how that might be challenging ...
--
MST
^ permalink raw reply
* Re: [PATCH RFC 1/2] tun: don't require serialization lock on tx
From: Eric Dumazet @ 2016-04-13 13:27 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Hannes Frederic Sowa, Paolo Abeni, netdev, David S. Miller,
Eric W. Biederman, Greg Kurz, Jason Wang, Christian Borntraeger,
Herbert Xu
In-Reply-To: <20160413155654-mutt-send-email-mst@redhat.com>
I. On Wed, 2016-04-13 at 15:57 +0300, Michael S. Tsirkin wrote:
> Fine, but what's the AF_PACKET duplication that Herbert Xu
> reported with NETIF_F_LLTX? Does anyone remember?
Really a lot of virtual drivers use NETIF_F_LLTX these days.
Duplication is more likely to happen with a qdisc, when a packet is
requeued if a driver returns NETDEV_TX_BUSY
^ permalink raw reply
* Re: [PATCH net-next 2/2] net: exit busy loop when another process is runnable
From: Peter Zijlstra @ 2016-04-13 13:28 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Ingo Molnar, Mike Galbraith, Jason Wang, davem, netdev,
linux-kernel, Ingo Molnar
In-Reply-To: <20160411182111-mutt-send-email-mst@redhat.com>
On Mon, Apr 11, 2016 at 07:31:57PM +0300, Michael S. Tsirkin wrote:
> +static bool expected_to_run_fair(struct cfs_rq *cfs_rq, s64 t)
> +{
> + struct sched_entity *left;
> + struct sched_entity *curr = cfs_rq->curr;
> +
> + if (!curr || !curr->on_rq)
> + return false;
> +
> + left = __pick_first_entity(cfs_rq);
> + if (!left)
> + return true;
> +
> + return (s64)(curr->vruntime + calc_delta_fair(t, curr) -
> + left->vruntime) < 0;
> +}
>
> The reason it seems easier is because that way we can reuse
> calc_delta_fair and don't have to do the reverse translation
> from vruntime to nsec.
>
> And I guess if we do this with interrupts disabled, and only poke
> at the current CPU's rq, we know first entity
> won't go away so we don't need locks?
Nope, not true. Current isn't actually in the tree, and any other task
is subject to being moved at any time.
Even if current was in the tree, there is no guarantee it is
->rb_leftmost; imagine a task being migrated in that has a smaller
vruntime.
So this really cannot work without locks :/
I've not thought about the actual problem you're trying to solve; but I
figured I'd let you know this before you continue down this path.
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox