* Re: [PATCH net-next] ibmvnic: Improve output for unsupported stats
From: Andrew Lunn @ 2017-10-02 21:57 UTC (permalink / raw)
To: John Allen; +Cc: netdev, Thomas Falcon
In-Reply-To: <6ce14d1a-11cc-37be-e6ca-b9ec4afb01a8@linux.vnet.ibm.com>
On Mon, Oct 02, 2017 at 03:31:39PM -0500, John Allen wrote:
> The vnic server can report -1 in the event that a given statistic is not
> supported. Currently, the -1 value is implicitly cast to an unsigned
> integer and appears through the ethtool -S output as a very large number.
> This patch improves this behavior by reporting 0 in the event that a
> given statistic is not supported.
Hi John
If it does not exist, why not skip it altogether?
ibmvnic_get_sset_count() could walk the list and count the valid ones.
ibmvnic_get_strings() could only return the name of the valid onces.
Andrew
^ permalink raw reply
* [PATCH v3 net-next] net: core: decouple ifalias get/set from rtnl lock
From: Florian Westphal @ 2017-10-02 21:50 UTC (permalink / raw)
To: netdev; +Cc: edumazet, Florian Westphal
Device alias can be set by either rtnetlink (rtnl is held) or sysfs.
rtnetlink hold the rtnl mutex, sysfs acquires it for this purpose.
Add an extra mutex for it and use rcu to protect concurrent accesses.
This allows the sysfs path to not take rtnl and would later allow
to not hold it when dumping ifalias.
Based on suggestion from Eric Dumazet.
Signed-off-by: Florian Westphal <fw@strlen.de>
---
Changes since v2:
- use rcu_swap_protected
Change since v1:
- don't use a sequence lock, instead use kmalloc+kfree_rcu
- add comment for dev_get_alias
- add comment why we use kfree and not kfree_rcu
to free dev->alias during netdevice destruction
include/linux/netdevice.h | 8 +++++++-
net/core/dev.c | 52 +++++++++++++++++++++++++++++++++++------------
net/core/net-sysfs.c | 17 ++++++++--------
net/core/rtnetlink.c | 13 ++++++++++--
4 files changed, 65 insertions(+), 25 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e1d6ef130611..d04424cfffba 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -826,6 +826,11 @@ struct xfrmdev_ops {
};
#endif
+struct dev_ifalias {
+ struct rcu_head rcuhead;
+ char ifalias[];
+};
+
/*
* This structure defines the management hooks for network devices.
* The following hooks can be defined; unless noted otherwise, they are
@@ -1632,7 +1637,7 @@ enum netdev_priv_flags {
struct net_device {
char name[IFNAMSIZ];
struct hlist_node name_hlist;
- char *ifalias;
+ struct dev_ifalias __rcu *ifalias;
/*
* I/O specific fields
* FIXME: Merge these and struct ifmap into one
@@ -3275,6 +3280,7 @@ void __dev_notify_flags(struct net_device *, unsigned int old_flags,
unsigned int gchanges);
int dev_change_name(struct net_device *, const char *);
int dev_set_alias(struct net_device *, const char *, size_t);
+int dev_get_alias(const struct net_device *, char *, size_t);
int dev_change_net_namespace(struct net_device *, struct net *, const char *);
int __dev_set_mtu(struct net_device *, int);
int dev_set_mtu(struct net_device *, int);
diff --git a/net/core/dev.c b/net/core/dev.c
index e350c768d4b5..1770097cfd86 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -188,6 +188,8 @@ static struct napi_struct *napi_by_id(unsigned int napi_id);
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
+static DEFINE_MUTEX(ifalias_mutex);
+
/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);
@@ -1265,29 +1267,53 @@ int dev_change_name(struct net_device *dev, const char *newname)
*/
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
- char *new_ifalias;
-
- ASSERT_RTNL();
+ struct dev_ifalias *new_alias = NULL;
if (len >= IFALIASZ)
return -EINVAL;
- if (!len) {
- kfree(dev->ifalias);
- dev->ifalias = NULL;
- return 0;
+ if (len) {
+ new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
+ if (!new_alias)
+ return -ENOMEM;
+
+ memcpy(new_alias->ifalias, alias, len);
+ new_alias->ifalias[len] = 0;
}
- new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
- if (!new_ifalias)
- return -ENOMEM;
- dev->ifalias = new_ifalias;
- memcpy(dev->ifalias, alias, len);
- dev->ifalias[len] = 0;
+ mutex_lock(&ifalias_mutex);
+ rcu_swap_protected(dev->ifalias, new_alias,
+ mutex_is_locked(&ifalias_mutex));
+ mutex_unlock(&ifalias_mutex);
+
+ if (new_alias)
+ kfree_rcu(new_alias, rcuhead);
return len;
}
+/**
+ * dev_get_alias - get ifalias of a device
+ * @dev: device
+ * @alias: buffer to store name of ifalias
+ * @len: size of buffer
+ *
+ * get ifalias for a device. Caller must make sure dev cannot go
+ * away, e.g. rcu read lock or own a reference count to device.
+ */
+int dev_get_alias(const struct net_device *dev, char *name, size_t len)
+{
+ const struct dev_ifalias *alias;
+ int ret = 0;
+
+ rcu_read_lock();
+ alias = rcu_dereference(dev->ifalias);
+ if (alias)
+ ret = snprintf(name, len, "%s", alias->ifalias);
+ rcu_read_unlock();
+
+ return ret;
+}
/**
* netdev_features_change - device changes features
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 927a6dcbad96..51d5836d8fb9 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -391,10 +391,7 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
if (len > 0 && buf[len - 1] == '\n')
--count;
- if (!rtnl_trylock())
- return restart_syscall();
ret = dev_set_alias(netdev, buf, count);
- rtnl_unlock();
return ret < 0 ? ret : len;
}
@@ -403,13 +400,12 @@ static ssize_t ifalias_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
const struct net_device *netdev = to_net_dev(dev);
+ char tmp[IFALIASZ];
ssize_t ret = 0;
- if (!rtnl_trylock())
- return restart_syscall();
- if (netdev->ifalias)
- ret = sprintf(buf, "%s\n", netdev->ifalias);
- rtnl_unlock();
+ ret = dev_get_alias(netdev, tmp, sizeof(tmp));
+ if (ret > 0)
+ ret = sprintf(buf, "%s\n", tmp);
return ret;
}
static DEVICE_ATTR_RW(ifalias);
@@ -1488,7 +1484,10 @@ static void netdev_release(struct device *d)
BUG_ON(dev->reg_state != NETREG_RELEASED);
- kfree(dev->ifalias);
+ /* no need to wait for rcu grace period:
+ * device is dead and about to be freed.
+ */
+ kfree(rcu_access_pointer(dev->ifalias));
netdev_freemem(dev);
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e6955da0d58d..3961f87cdc76 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1366,6 +1366,16 @@ static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev)
return nla_put_u32(skb, IFLA_LINK, ifindex);
}
+static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ char buf[IFALIASZ];
+ int ret;
+
+ ret = dev_get_alias(dev, buf, sizeof(buf));
+ return ret > 0 ? nla_put_string(skb, IFLA_IFALIAS, buf) : 0;
+}
+
static int rtnl_fill_link_netnsid(struct sk_buff *skb,
const struct net_device *dev)
{
@@ -1425,8 +1435,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) ||
(dev->qdisc &&
nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) ||
- (dev->ifalias &&
- nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) ||
+ nla_put_ifalias(skb, dev) ||
nla_put_u32(skb, IFLA_CARRIER_CHANGES,
atomic_read(&dev->carrier_changes)) ||
nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down))
--
2.13.6
^ permalink raw reply related
* Re: [RFC net-next 0/5] TSN: Add qdisc-based config interfaces for traffic shapers
From: Levi Pearson @ 2017-10-02 21:48 UTC (permalink / raw)
To: Rodney Cummings
Cc: Linux Kernel Network Developers, Vinicius Costa Gomes,
Henrik Austad, richardcochran@gmail.com,
jesus.sanchez-palencia@intel.com, andre.guedes@intel.com
In-Reply-To: <DM2PR0401MB138982C6B22DF8154A82DB39927D0@DM2PR0401MB1389.namprd04.prod.outlook.com>
Hi Rodney,
On Mon, Oct 2, 2017 at 1:40 PM, Rodney Cummings <rodney.cummings@ni.com> wrote:
> It's a shame that someone built such hardware. Speaking as a manufacturer
> of daisy-chainable products for industrial/automotive applications, I
> wouldn't use that hardware in my products. The whole point of scheduling
> is to obtain close-to-optimal network determinism. If the hardware
> doesn't schedule properly, the product is probably better off using CBS.
I should note that the hardware I'm using was not designed to be a TSN
endpoint. It just happens that the hardware, designed for other
reasons, happens to have hardware support for Qbv and thus makes it an
interesting and inexpensive platform on which to do Qbv experiments.
Nevertheless, I believe the right combination of driver support for
the hardware shaping and appropriate software in the network stack
would allow it to schedule things fairly precisely in the
application-centric manner you've described. I will ultimately defer
to your judgement on that, though, and spend some time with the Qcc
draft to see where I may be wrong.
> It would be great if all of the user-level application code was scheduled
> as tightly as the network, but the reality is that we aren't there yet.
> Also, even if the end-station has a separately timed RT Linux thread for
> each stream, the order of those threads can vary. Therefore, when the
> end-station (SoC) has more than one talker, the frames for each talker
> can be written into the Qbv queue at any time, in any order.
> There is no scheduling of frames (i.e. streams)... only the queue.
Frames are presented to *the kernel* in any order. The qdisc
infrastructure provides mechanisms by which other scheduling policy
can be added between presentation to the kernel and presentation to
hardware queues.
Even with i210-style LaunchTime functionality, this must also be
provided. The hardware cannot re-order time-scheduled frames, and
reviewers of the SO_TXTIME proposal rejected the idea of having the
driver try to re-order scheduled frames already submitted to the
hardware-visible descriptor chains, and rightly so I think.
So far, I have thought of 3 places this ordering constraint might be provided:
1. Built into each driver with offload support for SO_TXTIME
2. A qdisc module that will re-order scheduled skbuffs within a certain window
3. Left to userspace to coordinate
I think 1 is particularly bad due to duplication of code. 3 is the
easy default from the point of view of the kernel, but not attractive
from the point of view of precise scheduling and low latency to
egress. Unless someone has a better idea, I think 2 would be the most
appropriate choice. I can think of a number of ways one might be
designed, but I have not thought them through fully yet.
> From the perspective of the CNC, that means that a Qbv-only end-station is
> sloppy. In contrast, an end-station using per-stream scheduling
> (e.g. i210 with LaunchTime) is precise, because each frame has a known time.
> It's true that P802.1Qcc supports both, so the CNC might support both.
> Nevertheless, the CNC is likely to compute a network-wide schedule
> that optimizes for the per-stream end-stations, and locates the windows
> for the Qbv-only end-stations in a slower interval.
Because out-of-order submission of frames to LaunchTime hardware would
result in the frame that is enqueued later but scheduled earlier only
transmitting once the later-scheduled frame completes, enqueuing
frames immediately with LaunchTime information results in even *worse*
sloppiness if two applications don't always submit their frames in the
correct order to the kernel during a cycle. If your queue is simply
gated at the time of the first scheduled launch, the frame that should
have been first will only be late by the length of the queued-first
frame. If they both have precise launch times, the frame that should
have been first will have to wait until the normal launch time of the
later-scheduled frame plus the time it takes for it to egress!
In either case, re-ordering is required if it is not controlled for in
userspace. In other words, I am not disagreeing with you about the
insufficiency of a Qbv shaper for an endpoint! I am just pointing out
that neither hardware LaunchTime support nor hardware Qbv window
support are sufficient by themselves. Nor, for that case, is the CBS
offload support sufficient to meet multi-stream requirements from SRP.
All of these provide a low-layer mechanism by which appropriate
scheduling of a multi-stream or multi-application endpoint can be
enhanced, but none of them provide the complete story by themselves.
> I realize that we are only doing CBS for now. I guess my main point is that
> if/when we add scheduling, it cannot be limited to Qbv. Per-stream scheduling
> is essential, because that is the assumption for most CNC designs that
> I'm aware of.
I understand and completely agree. I have believed that per-stream
shaping/scheduling in some form is essential from the beginning.
Likewise, CBS is not sufficient in itself for multi-stream systems in
which multiple applications provide streams scheduled for the same
traffic class. But CBS and SO_TXTIME are good first steps; they
provides hooks for drivers to finally support the mechanisms provided
by hardware, and also in the case of SO_TXTIME a hook on which further
time-sensitive enhancements to the network stack can be hung, such as
qdiscs that can store and release frames based on scheduled launch
times. When these are in place, I think stream-based scheduling via
qdisc would be a reasonable next step; perhaps a common design could
be found to work both for media streams over CBS and industrial
streams scheduled by launch time.
Levi
^ permalink raw reply
* Re: [PATCH v2 net-next] net: core: decouple ifalias get/set from rtnl lock
From: Florian Westphal @ 2017-10-02 21:47 UTC (permalink / raw)
To: Florian Westphal; +Cc: netdev
In-Reply-To: <20171002212942.14521-1-fw@strlen.de>
Florian Westphal <fw@strlen.de> wrote:
> + mutex_lock(&ifalias_mutex);
> +
> + old = rcu_dereference_protected(dev->ifalias,
> + mutex_is_locked(&ifalias_mutex));
> + if (len) {
> + memcpy(new_alias->ifalias, alias, len);
> + new_alias->ifalias[len] = 0;
> + }
This can be done outside of the lock, so this can be
> +
> + rcu_assign_pointer(dev->ifalias, new_alias);
rcu_swap_protected().
I'll send a v3.
^ permalink raw reply
* RE: [net-next 11/15] i40evf: Enable VF to request an alternate queue allocation
From: Brady, Alan @ 2017-10-02 21:35 UTC (permalink / raw)
To: Yuval Mintz, Kirsher, Jeffrey T, davem@davemloft.net
Cc: netdev@vger.kernel.org, nhorman@redhat.com, sassmann@redhat.com,
jogreene@redhat.com
In-Reply-To: <AM0PR0502MB368318470E6010AB9D6C07E1BF7D0@AM0PR0502MB3683.eurprd05.prod.outlook.com>
> -----Original Message-----
> From: Yuval Mintz [mailto:yuvalm@mellanox.com]
> Sent: Monday, October 2, 2017 1:51 PM
> To: Kirsher, Jeffrey T <jeffrey.t.kirsher@intel.com>; davem@davemloft.net
> Cc: Brady, Alan <alan.brady@intel.com>; netdev@vger.kernel.org;
> nhorman@redhat.com; sassmann@redhat.com; jogreene@redhat.com
> Subject: RE: [net-next 11/15] i40evf: Enable VF to request an alternate queue
> allocation
>
> > + case VIRTCHNL_OP_REQUEST_QUEUES: {
> > + struct virtchnl_vf_res_request *vfres =
> > + (struct virtchnl_vf_res_request *)msg;
> > + if (vfres->num_queue_pairs == adapter->num_req_queues)
> > {
> > + adapter->flags |=
> > I40EVF_FLAG_REINIT_ITR_NEEDED;
> > + i40evf_schedule_reset(adapter);
> > + } else {
> > + dev_info(&adapter->pdev->dev,
> > + "Requested %d queues, PF can support
> > %d\n",
> > + adapter->num_req_queues,
> > + vfres->num_queue_pairs);
> > + adapter->num_req_queues = 0;
> > + }
> > + }
> > + break;
>
> Something is odd about your parenthesis.
>
> > default:
> > if (adapter->current_op && (v_opcode != adapter-
> > >current_op))
> > dev_warn(&adapter->pdev->dev, "Expected
> response %d from PF,
> > received %d\n",
> > --
> > 2.14.2
I'm not sure which parentheses you are referring to. Perhaps you mean the double curly braces? The outer set of braces are creating a block in the case statement so we can declare a variable specific to the case statement. The inner brace is an if/else statement block. They are both consistent with brace usage/alignment in the rest of the file and adding a new line between them generates a GCC warning.
-Alan
^ permalink raw reply
* Re: [PATCH net-next] vhost_net: do not stall on zerocopy depletion
From: Willem de Bruijn @ 2017-10-02 21:34 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Network Development, David Miller, Jason Wang, Koichiro Den,
virtualization, Willem de Bruijn
In-Reply-To: <CAF=yD-KotdpHs96GomMKR-BqG3Gyrvo+to0sk2=a6E5BKjgpkg@mail.gmail.com>
On Fri, Sep 29, 2017 at 9:25 PM, Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
> On Fri, Sep 29, 2017 at 3:38 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
>> On Wed, Sep 27, 2017 at 08:25:56PM -0400, Willem de Bruijn wrote:
>>> From: Willem de Bruijn <willemb@google.com>
>>>
>>> Vhost-net has a hard limit on the number of zerocopy skbs in flight.
>>> When reached, transmission stalls. Stalls cause latency, as well as
>>> head-of-line blocking of other flows that do not use zerocopy.
>>>
>>> Instead of stalling, revert to copy-based transmission.
>>>
>>> Tested by sending two udp flows from guest to host, one with payload
>>> of VHOST_GOODCOPY_LEN, the other too small for zerocopy (1B). The
>>> large flow is redirected to a netem instance with 1MBps rate limit
>>> and deep 1000 entry queue.
>>>
>>> modprobe ifb
>>> ip link set dev ifb0 up
>>> tc qdisc add dev ifb0 root netem limit 1000 rate 1MBit
>>>
>>> tc qdisc add dev tap0 ingress
>>> tc filter add dev tap0 parent ffff: protocol ip \
>>> u32 match ip dport 8000 0xffff \
>>> action mirred egress redirect dev ifb0
>>>
>>> Before the delay, both flows process around 80K pps. With the delay,
>>> before this patch, both process around 400. After this patch, the
>>> large flow is still rate limited, while the small reverts to its
>>> original rate. See also discussion in the first link, below.
>>>
>>> The limit in vhost_exceeds_maxpend must be carefully chosen. When
>>> vq->num >> 1, the flows remain correlated. This value happens to
>>> correspond to VHOST_MAX_PENDING for vq->num == 256. Allow smaller
>>> fractions and ensure correctness also for much smaller values of
>>> vq->num, by testing the min() of both explicitly. See also the
>>> discussion in the second link below.
>>>
>>> Link:http://lkml.kernel.org/r/CAF=yD-+Wk9sc9dXMUq1+x_hh=3ThTXa6BnZkygP3tgVpjbp93g@mail.gmail.com
>>> Link:http://lkml.kernel.org/r/20170819064129.27272-1-den@klaipeden.com
>>> Signed-off-by: Willem de Bruijn <willemb@google.com>
>>
>> I'd like to see the effect on the non rate limited case though.
>> If guest is quick won't we have lots of copies then?
>
> Yes, but not significantly more than without this patch.
>
> I ran 1, 10 and 100 flow tcp_stream throughput tests from a sender
> in the guest to a receiver in the host.
>
> To answer the other benchmark question first, I did not see anything
> noteworthy when increasing vq->num from 256 to 1024.
>
> With 1 and 10 flows without this patch all packets use zerocopy.
> With the patch, less than 1% eschews zerocopy.
>
> With 100 flows, even without this patch, 90+% of packets are copied.
> Some zerocopy packets from vhost_net fail this test in tun.c
>
> if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS)
>
> Generating packets with up to 21 frags. I'm not sure yet why or
> what the fraction of these packets is.
This seems to be a mix of page alignment and compound pages.
The iov_len is always well below the maximum, but frags exceed
page size and can start high in the initial page.
tun_get_user: num_pages=21 max=17 iov_len=6 len=65226
0: p_off=3264 len=6888
1: p_off=1960 len=16384
2: p_off=1960 len=6232
3: p_off=0 len=10152
4: p_off=1960 len=16384
5: p_off=1960 len=9120
> But this in turn can
> disable zcopy_used in vhost_net_tx_select_zcopy for a
> larger share of packets:
>
> return !net->tx_flush &&
> net->tx_packets / 64 >= net->tx_zcopy_err;
>
Testing iov_iter_npages() in handle_tx to inform zcopy_used allows
skipping these without turning off zerocopy for all other packets.
After implementing that, tx_zcopy_err drops to zero, but only around
40% of packets use zerocopy.
^ permalink raw reply
* Question about "prevent dst uses after free" and WARNING in nf_xfrm_me_harder / refcnt / 4.13.3
From: Denys Fedoryshchenko @ 2017-10-02 21:33 UTC (permalink / raw)
To: Eric Dumazet, Linux Kernel Network Developers
Hi,
I'm running now 4.13.3, is this patch required for 4.13 as well?
(it doesnt apply cleanly, as in 4.13 tcp_prequeue use
skb_dst_force_safe, so i just renamed it there to skb_dst_force )
This is what i get on PPPoE BRAS on this kernel, patch applied
(no idea if its related to patch, but just mentioning i applied it, as
it's not vanilla 4.13.3)
[ 7858.579600] ------------[ cut here ]------------
[ 7858.579818] WARNING: CPU: 2 PID: 0 at ./include/net/dst.h:254
nf_xfrm_me_harder+0x61/0xec [nf_nat]
[ 7858.580160] Modules linked in: cls_fw act_police cls_u32 sch_ingress
sch_htb pppoe pppox ppp_generic slhc netconsole configfs coretemp
nf_nat_pptp nf_nat_proto_gre nf_conntrack_pptp nf_conntrack_proto_gre
tun xt_REDIRECT nf_nat_redirect xt_nat xt_TCPMSS ipt_REJECT
nf_reject_ipv4 xt_set ts_bm xt_string xt_connmark xt_DSCP xt_mark
xt_tcpudp ip_set_hash_net ip_set_hash_ip ip_set nfnetlink iptable_mangle
iptable_filter iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4
nf_nat nf_conntrack ip_tables x_tables 8021q garp mrp stp llc ixgbe dca
[ 7858.581255] CPU: 2 PID: 0 Comm: swapper/2 Not tainted
4.13.3-build-0133 #27
[ 7858.581456] Hardware name: HP ProLiant DL320e Gen8 v2, BIOS P80
04/02/2015
[ 7858.581659] task: ffff880434e6a700 task.stack: ffffc90001904000
[ 7858.581862] RIP: 0010:nf_xfrm_me_harder+0x61/0xec [nf_nat]
[ 7858.582061] RSP: 0018:ffff880436483bc0 EFLAGS: 00010246
[ 7858.582259] RAX: 0000000000000000 RBX: ffffffff822df000 RCX:
ffff8803ee9028ce
[ 7858.582461] RDX: 0000000000000014 RSI: ffff88041cd82900 RDI:
ffff880436483bf8
[ 7858.582661] RBP: ffff880436483c20 R08: ffffffff81e0b400 R09:
00000000b9160000
[ 7858.582865] R10: ffff8803ee9028e8 R11: 0000000000000000 R12:
ffff880401e92100
[ 7858.583068] R13: 0000000000000001 R14: ffffffff822df000 R15:
ffff88042e280078
[ 7858.583269] FS: 0000000000000000(0000) GS:ffff880436480000(0000)
knlGS:0000000000000000
[ 7858.583608] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 7858.583809] CR2: 00007f9b2886fc9c CR3: 0000000429223000 CR4:
00000000001406e0
[ 7858.584013] Call Trace:
[ 7858.584209] <IRQ>
[ 7858.584408] ? nf_nat_ipv4_fn+0x12e/0x189 [nf_nat_ipv4]
[ 7858.584605] nf_nat_ipv4_out+0xb6/0xd3 [nf_nat_ipv4]
[ 7858.584807] iptable_nat_ipv4_out+0x15/0x17 [iptable_nat]
[ 7858.585010] nf_hook_slow+0x2a/0x9a
[ 7858.585209] ip_output+0x96/0xb4
[ 7858.585410] ? ip_fragment.constprop.5+0x7c/0x7c
[ 7858.585610] ip_forward_finish+0x5b/0x60
[ 7858.585811] ip_forward+0x36d/0x37a
[ 7858.586010] ? ip_frag_mem+0x11/0x11
[ 7858.586207] ip_rcv_finish+0x2f9/0x304
[ 7858.586406] ip_rcv+0x32a/0x337
[ 7858.586604] ? ip_local_deliver_finish+0x1bb/0x1bb
[ 7858.586808] __netif_receive_skb_core+0x4f0/0x847
[ 7858.587009] __netif_receive_skb+0x18/0x5a
[ 7858.587208] ? __netif_receive_skb+0x18/0x5a
[ 7858.587407] process_backlog+0xa4/0x127
[ 7858.587606] net_rx_action+0x11e/0x2d8
[ 7858.587811] ? sched_clock_cpu+0x15/0x9b
[ 7858.588013] __do_softirq+0xe7/0x23a
[ 7858.588210] irq_exit+0x52/0x93
[ 7858.588408] smp_call_function_single_interrupt+0x33/0x35
[ 7858.588610] call_function_single_interrupt+0x83/0x90
[ 7858.588811] RIP: 0010:mwait_idle+0x93/0x13c
[ 7858.589007] RSP: 0018:ffffc90001907eb0 EFLAGS: 00000246 ORIG_RAX:
ffffffffffffff04
[ 7858.589347] RAX: 0000000000000000 RBX: ffff880434e6a700 RCX:
0000000000000000
[ 7858.589548] RDX: 0000000000000000 RSI: 0000000000000000 RDI:
0000000000000000
[ 7858.589750] RBP: ffffc90001907ec0 R08: 0000000000000000 R09:
0000000000000001
[ 7858.589952] R10: ffffc90001907e58 R11: 000000000000024d R12:
0000000000000002
[ 7858.590149] R13: 0000000000000000 R14: ffff880434e6a700 R15:
ffff880434e6a700
[ 7858.590347] </IRQ>
[ 7858.590541] arch_cpu_idle+0xf/0x11
[ 7858.590738] default_idle_call+0x25/0x27
[ 7858.590938] do_idle+0xb8/0x150
[ 7858.591133] cpu_startup_entry+0x1f/0x21
[ 7858.591332] start_secondary+0xe8/0xeb
[ 7858.591531] secondary_startup_64+0x9f/0x9f
[ 7858.591729] Code: 83 7e 48 00 74 07 48 8b b6 80 01 00 00 8b 86 80 00
00 00 85 c0 74 14 8d 50 01 f0 0f b1 96 80 00 00 00 0f 94 c2 84 d2 75 04
eb e8 <0f> ff 49 8b 4c 24 18 48 8d 55 a0 45 31 c0 48 89 df e8 d9 de 95
[ 7858.592239] ---[ end trace c089174999ff4fc3 ]---
[ 7858.592448] dst_release: dst:ffff88041cd82900 refcnt:-1
[ 8139.130003] igb 0000:07:00.0 eth0: igb: eth0 NIC Link is Down
[ 8139.130309] igb 0000:07:00.0 eth0: Reset adapter
[ 8164.431523] igb 0000:07:00.0 eth0: igb: eth0 NIC Link is Up 1000 Mbps
Full Duplex, Flow Control: RX/TX
[ 9149.190518] perf: interrupt took too long (3132 > 3128), lowering
kernel.perf_event_max_sample_rate to 63000
[17205.528640] ------------[ cut here ]------------
[17205.528855] WARNING: CPU: 0 PID: 0 at ./include/net/dst.h:254
nf_xfrm_me_harder+0x61/0xec [nf_nat]
[17205.529197] Modules linked in: cls_fw act_police cls_u32 sch_ingress
sch_htb pppoe pppox ppp_generic slhc netconsole configfs coretemp
nf_nat_pptp nf_nat_proto_gre nf_conntrack_pptp nf_conntrack_proto_gre
tun xt_REDIRECT nf_nat_redirect xt_nat xt_TCPMSS ipt_REJECT
nf_reject_ipv4 xt_set ts_bm xt_string xt_connmark xt_DSCP xt_mark
xt_tcpudp ip_set_hash_net ip_set_hash_ip ip_set nfnetlink iptable_mangle
iptable_filter iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4
nf_nat nf_conntrack ip_tables x_tables 8021q garp mrp stp llc ixgbe dca
[17205.530294] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G W
4.13.3-build-0133 #27
[17205.530632] Hardware name: HP ProLiant DL320e Gen8 v2, BIOS P80
04/02/2015
[17205.530834] task: ffffffff8220e480 task.stack: ffffffff82200000
[17205.531033] RIP: 0010:nf_xfrm_me_harder+0x61/0xec [nf_nat]
[17205.531232] RSP: 0018:ffff880436403bc0 EFLAGS: 00010246
[17205.531434] RAX: 0000000000000000 RBX: ffffffff822df000 RCX:
ffff8803f5fba0ce
[17205.531636] RDX: 0000000000000014 RSI: ffff8804041ae100 RDI:
ffff880436403bf8
[17205.531836] RBP: ffff880436403c20 R08: ffffffff81e0b400 R09:
0000000033d10000
[17205.532035] R10: ffff8803f5fba0e8 R11: 0000000000000000 R12:
ffff88041e7a3500
[17205.532235] R13: 0000000000000001 R14: ffffffff822df000 R15:
ffff88042e280078
[17205.532435] FS: 0000000000000000(0000) GS:ffff880436400000(0000)
knlGS:0000000000000000
[17205.532775] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[17205.532974] CR2: 00007f9b2c6b52b8 CR3: 0000000429223000 CR4:
00000000001406f0
[17205.533170] Call Trace:
[17205.533361] <IRQ>
[17205.533555] ? nf_nat_ipv4_fn+0x12e/0x189 [nf_nat_ipv4]
[17205.533754] nf_nat_ipv4_out+0xb6/0xd3 [nf_nat_ipv4]
[17205.533953] iptable_nat_ipv4_out+0x15/0x17 [iptable_nat]
[17205.534151] nf_hook_slow+0x2a/0x9a
[17205.534344] ip_output+0x96/0xb4
[17205.534539] ? ip_fragment.constprop.5+0x7c/0x7c
[17205.534738] ip_forward_finish+0x5b/0x60
[17205.534939] ip_forward+0x36d/0x37a
[17205.535137] ? ip_frag_mem+0x11/0x11
[17205.535337] ip_rcv_finish+0x2f9/0x304
[17205.535537] ip_rcv+0x32a/0x337
[17205.535732] ? ip_local_deliver_finish+0x1bb/0x1bb
[17205.535935] __netif_receive_skb_core+0x4f0/0x847
[17205.536135] __netif_receive_skb+0x18/0x5a
[17205.536332] ? __netif_receive_skb+0x18/0x5a
[17205.536533] process_backlog+0xa4/0x127
[17205.536731] net_rx_action+0x11e/0x2d8
[17205.536934] ? sched_clock_cpu+0x15/0x9b
[17205.537134] __do_softirq+0xe7/0x23a
[17205.537331] irq_exit+0x52/0x93
[17205.537530] smp_call_function_single_interrupt+0x33/0x35
[17205.537730] call_function_single_interrupt+0x83/0x90
[17205.537934] RIP: 0010:mwait_idle+0x93/0x13c
[17205.538131] RSP: 0018:ffffffff82203e28 EFLAGS: 00000246 ORIG_RAX:
ffffffffffffff04
[17205.538469] RAX: 0000000000000000 RBX: ffffffff8220e480 RCX:
0000000000000000
[17205.538668] RDX: 0000000000000000 RSI: 0000000000000000 RDI:
0000000000000000
[17205.538871] RBP: ffffffff82203e38 R08: 0000000000000000 R09:
0000000000000001
[17205.539071] R10: ffffffff82203dd0 R11: 000000000000002a R12:
0000000000000000
[17205.539271] R13: 0000000000000000 R14: ffffffff8220e480 R15:
ffffffff8220e480
[17205.539472] </IRQ>
[17205.539670] arch_cpu_idle+0xf/0x11
[17205.539869] default_idle_call+0x25/0x27
[17205.540068] do_idle+0xb8/0x150
[17205.540266] cpu_startup_entry+0x1f/0x21
[17205.540465] rest_init+0xb5/0xb7
[17205.540665] start_kernel+0x3b0/0x3bd
[17205.540864] x86_64_start_reservations+0x2a/0x2c
[17205.541063] x86_64_start_kernel+0x16a/0x178
[17205.541262] secondary_startup_64+0x9f/0x9f
[17205.541458] Code: 83 7e 48 00 74 07 48 8b b6 80 01 00 00 8b 86 80 00
00 00 85 c0 74 14 8d 50 01 f0 0f b1 96 80 00 00 00 0f 94 c2 84 d2 75 04
eb e8 <0f> ff 49 8b 4c 24 18 48 8d 55 a0 45 31 c0 48 89 df e8 d9 de 95
[17205.541964] ---[ end trace c089174999ff4fc4 ]---
[17205.542165] dst_release: dst:ffff8804041ae100 refcnt:-1
^ permalink raw reply
* [PATCH v2 net-next] net: core: decouple ifalias get/set from rtnl lock
From: Florian Westphal @ 2017-10-02 21:29 UTC (permalink / raw)
To: netdev; +Cc: Florian Westphal
Device alias can be set by either rtnetlink (rtnl is held) or sysfs.
rtnetlink hold the rtnl mutex, sysfs acquires it for this purpose.
Add an extra mutex for it and use rcu to protect concurrent accesses.
This allows the sysfs path to not take rtnl and would later allow
to not hold it when dumping ifalias.
Based on suggestion from Eric Dumazet.
Signed-off-by: Florian Westphal <fw@strlen.de>
---
Change since v1:
- don't use a sequence lock, instead use kmalloc+kfree_rcu.
include/linux/netdevice.h | 8 ++++++-
net/core/dev.c | 58 ++++++++++++++++++++++++++++++++++++-----------
net/core/net-sysfs.c | 17 +++++++-------
net/core/rtnetlink.c | 13 +++++++++--
4 files changed, 71 insertions(+), 25 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e1d6ef130611..d04424cfffba 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -826,6 +826,11 @@ struct xfrmdev_ops {
};
#endif
+struct dev_ifalias {
+ struct rcu_head rcuhead;
+ char ifalias[];
+};
+
/*
* This structure defines the management hooks for network devices.
* The following hooks can be defined; unless noted otherwise, they are
@@ -1632,7 +1637,7 @@ enum netdev_priv_flags {
struct net_device {
char name[IFNAMSIZ];
struct hlist_node name_hlist;
- char *ifalias;
+ struct dev_ifalias __rcu *ifalias;
/*
* I/O specific fields
* FIXME: Merge these and struct ifmap into one
@@ -3275,6 +3280,7 @@ void __dev_notify_flags(struct net_device *, unsigned int old_flags,
unsigned int gchanges);
int dev_change_name(struct net_device *, const char *);
int dev_set_alias(struct net_device *, const char *, size_t);
+int dev_get_alias(const struct net_device *, char *, size_t);
int dev_change_net_namespace(struct net_device *, struct net *, const char *);
int __dev_set_mtu(struct net_device *, int);
int dev_set_mtu(struct net_device *, int);
diff --git a/net/core/dev.c b/net/core/dev.c
index e350c768d4b5..cadd7218a6ae 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -188,6 +188,8 @@ static struct napi_struct *napi_by_id(unsigned int napi_id);
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
+static DEFINE_MUTEX(ifalias_mutex);
+
/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);
@@ -1265,29 +1267,59 @@ int dev_change_name(struct net_device *dev, const char *newname)
*/
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
- char *new_ifalias;
-
- ASSERT_RTNL();
+ struct dev_ifalias *new_alias, *old;
if (len >= IFALIASZ)
return -EINVAL;
- if (!len) {
- kfree(dev->ifalias);
- dev->ifalias = NULL;
- return 0;
+ if (len) {
+ new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
+ if (!new_alias)
+ return -ENOMEM;
+ } else {
+ new_alias = NULL;
}
- new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
- if (!new_ifalias)
- return -ENOMEM;
- dev->ifalias = new_ifalias;
- memcpy(dev->ifalias, alias, len);
- dev->ifalias[len] = 0;
+ mutex_lock(&ifalias_mutex);
+
+ old = rcu_dereference_protected(dev->ifalias,
+ mutex_is_locked(&ifalias_mutex));
+ if (len) {
+ memcpy(new_alias->ifalias, alias, len);
+ new_alias->ifalias[len] = 0;
+ }
+
+ rcu_assign_pointer(dev->ifalias, new_alias);
+ mutex_unlock(&ifalias_mutex);
+
+ if (old)
+ kfree_rcu(old, rcuhead);
return len;
}
+/**
+ * dev_get_alias - get ifalias of a device
+ * @dev: device
+ * @alias: buffer to store name of ifalias
+ * @len: size of buffer
+ *
+ * get ifalias for a device. Caller must make sure dev cannot go
+ * away, e.g. rcu read lock or own a reference count to device.
+ */
+int dev_get_alias(const struct net_device *dev, char *name, size_t len)
+{
+ const struct dev_ifalias *alias;
+ int ret = 0;
+
+ rcu_read_lock();
+ alias = rcu_dereference(dev->ifalias);
+ if (alias)
+ ret = snprintf(name, len, "%s", alias->ifalias);
+ rcu_read_unlock();
+
+ return ret;
+}
/**
* netdev_features_change - device changes features
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 927a6dcbad96..51d5836d8fb9 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -391,10 +391,7 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
if (len > 0 && buf[len - 1] == '\n')
--count;
- if (!rtnl_trylock())
- return restart_syscall();
ret = dev_set_alias(netdev, buf, count);
- rtnl_unlock();
return ret < 0 ? ret : len;
}
@@ -403,13 +400,12 @@ static ssize_t ifalias_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
const struct net_device *netdev = to_net_dev(dev);
+ char tmp[IFALIASZ];
ssize_t ret = 0;
- if (!rtnl_trylock())
- return restart_syscall();
- if (netdev->ifalias)
- ret = sprintf(buf, "%s\n", netdev->ifalias);
- rtnl_unlock();
+ ret = dev_get_alias(netdev, tmp, sizeof(tmp));
+ if (ret > 0)
+ ret = sprintf(buf, "%s\n", tmp);
return ret;
}
static DEVICE_ATTR_RW(ifalias);
@@ -1488,7 +1484,10 @@ static void netdev_release(struct device *d)
BUG_ON(dev->reg_state != NETREG_RELEASED);
- kfree(dev->ifalias);
+ /* no need to wait for rcu grace period:
+ * device is dead and about to be freed.
+ */
+ kfree(rcu_access_pointer(dev->ifalias));
netdev_freemem(dev);
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e6955da0d58d..3961f87cdc76 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1366,6 +1366,16 @@ static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev)
return nla_put_u32(skb, IFLA_LINK, ifindex);
}
+static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ char buf[IFALIASZ];
+ int ret;
+
+ ret = dev_get_alias(dev, buf, sizeof(buf));
+ return ret > 0 ? nla_put_string(skb, IFLA_IFALIAS, buf) : 0;
+}
+
static int rtnl_fill_link_netnsid(struct sk_buff *skb,
const struct net_device *dev)
{
@@ -1425,8 +1435,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) ||
(dev->qdisc &&
nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) ||
- (dev->ifalias &&
- nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) ||
+ nla_put_ifalias(skb, dev) ||
nla_put_u32(skb, IFLA_CARRIER_CHANGES,
atomic_read(&dev->carrier_changes)) ||
nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down))
--
2.13.6
^ permalink raw reply related
* RE: [net-next 11/15] i40evf: Enable VF to request an alternate queue allocation
From: Yuval Mintz @ 2017-10-02 20:50 UTC (permalink / raw)
To: Jeff Kirsher, davem@davemloft.net
Cc: Alan Brady, netdev@vger.kernel.org, nhorman@redhat.com,
sassmann@redhat.com, jogreene@redhat.com
In-Reply-To: <20171002194852.71970-12-jeffrey.t.kirsher@intel.com>
> + case VIRTCHNL_OP_REQUEST_QUEUES: {
> + struct virtchnl_vf_res_request *vfres =
> + (struct virtchnl_vf_res_request *)msg;
> + if (vfres->num_queue_pairs == adapter->num_req_queues)
> {
> + adapter->flags |=
> I40EVF_FLAG_REINIT_ITR_NEEDED;
> + i40evf_schedule_reset(adapter);
> + } else {
> + dev_info(&adapter->pdev->dev,
> + "Requested %d queues, PF can support
> %d\n",
> + adapter->num_req_queues,
> + vfres->num_queue_pairs);
> + adapter->num_req_queues = 0;
> + }
> + }
> + break;
Something is odd about your parenthesis.
> default:
> if (adapter->current_op && (v_opcode != adapter-
> >current_op))
> dev_warn(&adapter->pdev->dev, "Expected
> response %d from PF, received %d\n",
> --
> 2.14.2
^ permalink raw reply
* RE: [PATCH net-next 01/10] net/smc: add missing dev_put
From: Parav Pandit @ 2017-10-02 20:39 UTC (permalink / raw)
To: Parav Pandit, Ursula Braun, davem@davemloft.net
Cc: netdev@vger.kernel.org, linux-rdma@vger.kernel.org,
linux-s390@vger.kernel.org, jwi@linux.vnet.ibm.com,
schwidefsky@de.ibm.com, heiko.carstens@de.ibm.com,
raspl@linux.vnet.ibm.com
In-Reply-To: <VI1PR0502MB30089D06B9C95D682E21511BD17D0@VI1PR0502MB3008.eurprd05.prod.outlook.com>
> -----Original Message-----
> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-
> owner@vger.kernel.org] On Behalf Of Parav Pandit
> Sent: Monday, October 02, 2017 3:36 PM
> To: Ursula Braun <ubraun@linux.vnet.ibm.com>; davem@davemloft.net
> Cc: netdev@vger.kernel.org; linux-rdma@vger.kernel.org; linux-
> s390@vger.kernel.org; jwi@linux.vnet.ibm.com; schwidefsky@de.ibm.com;
> heiko.carstens@de.ibm.com; raspl@linux.vnet.ibm.com
> Subject: RE: [PATCH net-next 01/10] net/smc: add missing dev_put
>
> Hi Ursula, Dave, Hans,
>
> > -----Original Message-----
> > From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-
> > owner@vger.kernel.org] On Behalf Of Ursula Braun
> > Sent: Wednesday, September 20, 2017 6:58 AM
> > To: davem@davemloft.net
> > Cc: netdev@vger.kernel.org; linux-rdma@vger.kernel.org; linux-
> > s390@vger.kernel.org; jwi@linux.vnet.ibm.com; schwidefsky@de.ibm.com;
> > heiko.carstens@de.ibm.com; raspl@linux.vnet.ibm.com;
> > ubraun@linux.vnet.ibm.com
> > Subject: [PATCH net-next 01/10] net/smc: add missing dev_put
> >
> > From: Hans Wippel <hwippel@linux.vnet.ibm.com>
> >
> > In the infiniband part, SMC currently uses get_netdev which calls
> > dev_hold on the returned net device. However, the SMC code never calls
> > dev_put on that net device resulting in a wrong reference count.
> >
> > This patch adds a dev_put after the usage of the net device to fix the issue.
> >
> > Signed-off-by: Hans Wippel <hwippel@linux.vnet.ibm.com>
> > Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
> > ---
> > net/smc/smc_ib.c | 1 +
> > 1 file changed, 1 insertion(+)
> >
> > diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index
> > 547e0e113b17..0b5852299158 100644
> > --- a/net/smc/smc_ib.c
> > +++ b/net/smc/smc_ib.c
> > @@ -380,6 +380,7 @@ static int smc_ib_fill_gid_and_mac(struct
> > smc_ib_device *smcibdev, u8 ibport)
> > ndev = smcibdev->ibdev->get_netdev(smcibdev->ibdev, ibport);
> > if (ndev) {
> > memcpy(&smcibdev->mac, ndev->dev_addr, ETH_ALEN);
> > + dev_put(ndev);
>
> I am sorry for providing late comments. smc_ib_fill_gid_and_mac() is not coded
> correctly.
> Few fixes are needed.
> 1. ULP such as SMC should not open code/deference any function pointer like
> get_netdev() of the IB device.
> 2. Replace ib_query_gid(..., NULL)
> With
> ib_query_gid(..., gid_attr);
>
> Use gid_attr.ndev to get the MAC address.
> Do dev_put(gid_attr.ndev);
>
> Code should look like below,
>
> struct ib_gid_attr gid_attr;
>
> rc = ib_query_gid(..., &gid_attr);
> if (rc || !gid_addr.ndev)
> return -ENODEV;
> else
> memcpy(smcibdev->mac, ndev->dev_addr, ETH_ALEN);
> dev_put(gid_addr.ndev);
> --
Also,
smc_link_determine_gid() doesn't do dev_put(gattr.ndev) in for loop.
Please fix it as well.
^ permalink raw reply
* Re: [PATCH net-next 2/2] flow_dissector: dissect tunnel info
From: Tom Herbert @ 2017-10-02 20:37 UTC (permalink / raw)
To: Simon Horman
Cc: David Miller, Jiri Pirko, Jamal Hadi Salim, Cong Wang,
Linux Kernel Network Developers, oss-drivers
In-Reply-To: <1506933676-20121-3-git-send-email-simon.horman@netronome.com>
On Mon, Oct 2, 2017 at 1:41 AM, Simon Horman <simon.horman@netronome.com> wrote:
> Move dissection of tunnel info from the flower classifier to the flow
> dissector where all other dissection occurs. This should not have any
> behavioural affect on other users of the flow dissector.
>
> Signed-off-by: Simon Horman <simon.horman@netronome.com>
> Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> ---
> net/core/flow_dissector.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++
> net/sched/cls_flower.c | 25 ------------
> 2 files changed, 100 insertions(+), 25 deletions(-)
>
> diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
> index 0a977373d003..1f5caafb4492 100644
> --- a/net/core/flow_dissector.c
> +++ b/net/core/flow_dissector.c
> @@ -5,6 +5,7 @@
> #include <linux/ipv6.h>
> #include <linux/if_vlan.h>
> #include <net/dsa.h>
> +#include <net/dst_metadata.h>
> #include <net/ip.h>
> #include <net/ipv6.h>
> #include <net/gre.h>
> @@ -115,6 +116,102 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
> }
> EXPORT_SYMBOL(__skb_flow_get_ports);
>
> +static void
> +skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
> + struct flow_dissector *flow_dissector,
> + void *target_container)
> +{
> + struct flow_dissector_key_control *ctrl;
> +
> + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL))
> + return;
> +
> + ctrl = skb_flow_dissector_target(flow_dissector,
> + FLOW_DISSECTOR_KEY_ENC_CONTROL,
> + target_container);
> + ctrl->addr_type = type;
> +}
> +
> +static void
> +__skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
> + struct flow_dissector *flow_dissector,
> + void *target_container)
> +{
> + struct ip_tunnel_info *info;
> + struct ip_tunnel_key *key;
> +
> + /* A quick check to see if there might be something to do. */
> + if (!dissector_uses_key(flow_dissector,
> + FLOW_DISSECTOR_KEY_ENC_KEYID) &&
> + !dissector_uses_key(flow_dissector,
> + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) &&
> + !dissector_uses_key(flow_dissector,
> + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) &&
> + !dissector_uses_key(flow_dissector,
> + FLOW_DISSECTOR_KEY_ENC_CONTROL) &&
> + !dissector_uses_key(flow_dissector,
> + FLOW_DISSECTOR_KEY_ENC_PORTS))
> + return;
> +
> + info = skb_tunnel_info(skb);
> + if (!info)
> + return;
> +
> + key = &info->key;
> +
> + switch (ip_tunnel_info_af(info)) {
> + case AF_INET:
> + skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV4_ADDRS,
> + flow_dissector,
> + target_container);
> + if (dissector_uses_key(flow_dissector,
> + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) {
> + struct flow_dissector_key_ipv4_addrs *ipv4;
> +
> + ipv4 = skb_flow_dissector_target(flow_dissector,
> + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS,
> + target_container);
> + ipv4->src = key->u.ipv4.src;
> + ipv4->dst = key->u.ipv4.dst;
> + }
> + break;
> + case AF_INET6:
> + skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV6_ADDRS,
> + flow_dissector,
> + target_container);
> + if (dissector_uses_key(flow_dissector,
> + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) {
> + struct flow_dissector_key_ipv6_addrs *ipv6;
> +
> + ipv6 = skb_flow_dissector_target(flow_dissector,
> + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS,
> + target_container);
> + ipv6->src = key->u.ipv6.src;
> + ipv6->dst = key->u.ipv6.dst;
Simon,
I think I'm missing something fundamental here. This code is
populating flow dissector keys not based on the contents of the packet
like rest of the flow dissector, but on external meta data related to
the packet which I believe is constant during the whole flow
dissection. Why can't this be handled by the caller? Also, if I read
this correctly, this code could be called multiple times and it seems
like it does the exact same thing in each call.
Tom
> + }
> + break;
> + }
> +
> + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) {
> + struct flow_dissector_key_keyid *keyid;
> +
> + keyid = skb_flow_dissector_target(flow_dissector,
> + FLOW_DISSECTOR_KEY_ENC_KEYID,
> + target_container);
> + keyid->keyid = tunnel_id_to_key32(key->tun_id);
> + }
> +
> + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) {
> + struct flow_dissector_key_ports *tp;
> +
> + tp = skb_flow_dissector_target(flow_dissector,
> + FLOW_DISSECTOR_KEY_ENC_PORTS,
> + target_container);
> + tp->src = key->tp_src;
> + tp->dst = key->tp_dst;
> + }
> +}
> +
> static enum flow_dissect_ret
> __skb_flow_dissect_mpls(const struct sk_buff *skb,
> struct flow_dissector *flow_dissector,
> @@ -478,6 +575,9 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
> FLOW_DISSECTOR_KEY_BASIC,
> target_container);
>
> + __skb_flow_dissect_tunnel_info(skb, flow_dissector,
> + target_container);
> +
> if (dissector_uses_key(flow_dissector,
> FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
> struct ethhdr *eth = eth_hdr(skb);
> diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
> index d230cb4c8094..db831ac708f6 100644
> --- a/net/sched/cls_flower.c
> +++ b/net/sched/cls_flower.c
> @@ -152,37 +152,12 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
> struct cls_fl_filter *f;
> struct fl_flow_key skb_key;
> struct fl_flow_key skb_mkey;
> - struct ip_tunnel_info *info;
>
> if (!atomic_read(&head->ht.nelems))
> return -1;
>
> fl_clear_masked_range(&skb_key, &head->mask);
>
> - info = skb_tunnel_info(skb);
> - if (info) {
> - struct ip_tunnel_key *key = &info->key;
> -
> - switch (ip_tunnel_info_af(info)) {
> - case AF_INET:
> - skb_key.enc_control.addr_type =
> - FLOW_DISSECTOR_KEY_IPV4_ADDRS;
> - skb_key.enc_ipv4.src = key->u.ipv4.src;
> - skb_key.enc_ipv4.dst = key->u.ipv4.dst;
> - break;
> - case AF_INET6:
> - skb_key.enc_control.addr_type =
> - FLOW_DISSECTOR_KEY_IPV6_ADDRS;
> - skb_key.enc_ipv6.src = key->u.ipv6.src;
> - skb_key.enc_ipv6.dst = key->u.ipv6.dst;
> - break;
> - }
> -
> - skb_key.enc_key_id.keyid = tunnel_id_to_key32(key->tun_id);
> - skb_key.enc_tp.src = key->tp_src;
> - skb_key.enc_tp.dst = key->tp_dst;
> - }
> -
> skb_key.indev_ifindex = skb->skb_iif;
> /* skb_flow_dissect() does not set n_proto in case an unknown protocol,
> * so do it rather here.
> --
> 2.1.4
>
^ permalink raw reply
* RE: [PATCH net-next 01/10] net/smc: add missing dev_put
From: Parav Pandit @ 2017-10-02 20:36 UTC (permalink / raw)
To: Ursula Braun, davem@davemloft.net
Cc: netdev@vger.kernel.org, linux-rdma@vger.kernel.org,
linux-s390@vger.kernel.org, jwi@linux.vnet.ibm.com,
schwidefsky@de.ibm.com, heiko.carstens@de.ibm.com,
raspl@linux.vnet.ibm.com
In-Reply-To: <20170920115813.63745-2-ubraun@linux.vnet.ibm.com>
Hi Ursula, Dave, Hans,
> -----Original Message-----
> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-
> owner@vger.kernel.org] On Behalf Of Ursula Braun
> Sent: Wednesday, September 20, 2017 6:58 AM
> To: davem@davemloft.net
> Cc: netdev@vger.kernel.org; linux-rdma@vger.kernel.org; linux-
> s390@vger.kernel.org; jwi@linux.vnet.ibm.com; schwidefsky@de.ibm.com;
> heiko.carstens@de.ibm.com; raspl@linux.vnet.ibm.com;
> ubraun@linux.vnet.ibm.com
> Subject: [PATCH net-next 01/10] net/smc: add missing dev_put
>
> From: Hans Wippel <hwippel@linux.vnet.ibm.com>
>
> In the infiniband part, SMC currently uses get_netdev which calls dev_hold on
> the returned net device. However, the SMC code never calls dev_put on that net
> device resulting in a wrong reference count.
>
> This patch adds a dev_put after the usage of the net device to fix the issue.
>
> Signed-off-by: Hans Wippel <hwippel@linux.vnet.ibm.com>
> Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
> ---
> net/smc/smc_ib.c | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index
> 547e0e113b17..0b5852299158 100644
> --- a/net/smc/smc_ib.c
> +++ b/net/smc/smc_ib.c
> @@ -380,6 +380,7 @@ static int smc_ib_fill_gid_and_mac(struct smc_ib_device
> *smcibdev, u8 ibport)
> ndev = smcibdev->ibdev->get_netdev(smcibdev->ibdev, ibport);
> if (ndev) {
> memcpy(&smcibdev->mac, ndev->dev_addr, ETH_ALEN);
> + dev_put(ndev);
I am sorry for providing late comments. smc_ib_fill_gid_and_mac() is not coded correctly.
Few fixes are needed.
1. ULP such as SMC should not open code/deference any function pointer like get_netdev() of the IB device.
2. Replace ib_query_gid(..., NULL)
With
ib_query_gid(..., gid_attr);
Use gid_attr.ndev to get the MAC address.
Do dev_put(gid_attr.ndev);
Code should look like below,
struct ib_gid_attr gid_attr;
rc = ib_query_gid(..., &gid_attr);
if (rc || !gid_addr.ndev)
return -ENODEV;
else
memcpy(smcibdev->mac, ndev->dev_addr, ETH_ALEN);
dev_put(gid_addr.ndev);
^ permalink raw reply
* [PATCH net-next] ibmvnic: Improve output for unsupported stats
From: John Allen @ 2017-10-02 20:31 UTC (permalink / raw)
To: netdev, Thomas Falcon
The vnic server can report -1 in the event that a given statistic is not
supported. Currently, the -1 value is implicitly cast to an unsigned
integer and appears through the ethtool -S output as a very large number.
This patch improves this behavior by reporting 0 in the event that a
given statistic is not supported.
Signed-off-by: John Allen <jallen@linux.vnet.ibm.com>
---
diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index cb8182f..b8ad2db 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1862,6 +1862,7 @@ static void ibmvnic_get_ethtool_stats(struct net_device *dev,
{
struct ibmvnic_adapter *adapter = netdev_priv(dev);
union ibmvnic_crq crq;
+ u64 stat;
int i, j;
memset(&crq, 0, sizeof(crq));
@@ -1876,9 +1877,11 @@ static void ibmvnic_get_ethtool_stats(struct net_device *dev,
ibmvnic_send_crq(adapter, &crq);
wait_for_completion(&adapter->stats_done);
- for (i = 0; i < ARRAY_SIZE(ibmvnic_stats); i++)
- data[i] = be64_to_cpu(IBMVNIC_GET_STAT(adapter,
- ibmvnic_stats[i].offset));
+ for (i = 0; i < ARRAY_SIZE(ibmvnic_stats); i++) {
+ stat = be64_to_cpu(IBMVNIC_GET_STAT(adapter,
+ ibmvnic_stats[i].offset));
+ data[i] = stat == -1 ? 0 : stat;
+ }
for (j = 0; j < adapter->req_tx_queues; j++) {
data[i] = adapter->tx_stats_buffers[j].packets;
^ permalink raw reply related
* Re: [PATCH net] RDS: IB: Limit the scope of has_fr/has_fmr variables
From: Santosh Shilimkar @ 2017-10-02 20:30 UTC (permalink / raw)
To: David Miller, avinash.repaka; +Cc: netdev, linux-rdma, rds-devel, linux-kernel
In-Reply-To: <20171001.225655.1261950191787562705.davem@davemloft.net>
On 10/1/2017 10:56 PM, David Miller wrote:
> From: David Miller <davem@davemloft.net>
> Date: Sun, 01 Oct 2017 22:54:19 -0700 (PDT)
>
>> From: Avinash Repaka <avinash.repaka@oracle.com>
>> Date: Fri, 29 Sep 2017 18:13:50 -0700
>>
>>> This patch fixes the scope of has_fr and has_fmr variables as they are
>>> needed only in rds_ib_add_one().
>>>
>>> Signed-off-by: Avinash Repaka <avinash.repaka@oracle.com>
>>
>> Applied.
>
> Actually, reverted, this breaks the build.
>
> net/rds/rdma_transport.c:38:10: fatal error: ib.h: No such file or directory
> #include "ib.h"
>
> Although I can't see how in the world this patch is causing such
> an error.
>
Weird indeed. Will sort this out with Avinash. Thanks Dave.
Regards,
Santosh
^ permalink raw reply
* Re: [PATCH iproute2] iproute: build more easily on Android
From: enh @ 2017-10-02 20:23 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: Lorenzo Colitti, netdev
In-Reply-To: <20171002103611.17ccb01f@xeon-e3>
On Mon, Oct 2, 2017 at 10:36 AM, Stephen Hemminger
<stephen@networkplumber.org> wrote:
> On Tue, 3 Oct 2017 02:03:37 +0900
> Lorenzo Colitti <lorenzo@google.com> wrote:
>
>> iproute2 contains a bunch of kernel headers, including uapi ones.
>> Android's libc uses uapi headers almost directly, and uses a
>> script to fix kernel types that don't match what userspace
>> expects.
>>
>> For example: https://issuetracker.google.com/36987220 reports
>> that our struct ip_mreq_source contains "__be32 imr_multiaddr"
>> rather than "struct in_addr imr_multiaddr". The script addresses
>> this by replacing the uapi struct definition with a #include
>> <bits/ip_mreq.h> which contains the traditional userspace
>> definition.
>>
>> Unfortunately, when we compile iproute2, this definition
>> conflicts with the one in iproute2's linux/in.h.
>>
>> Historically we've just solved this problem by running "git rm"
>> on all the iproute2 include/linux headers that break Android's
>> libc. However, deleting the files in this way makes it harder to
>> keep up with upstream, because every upstream change to
>> an include file causes a merge conflict with the delete.
>>
>> This patch fixes the problem by moving the iproute2 linux headers
>> from include/linux to include/uapi/linux.
>>
>> Tested: compiles on ubuntu trusty (glibc)
>>
>> Signed-off-by: Elliott Hughes <enh@google.com>
>> Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
>
> Rather than moving everything, why not make kernel headers directory
> configurable as part of the configure script setup process.
the problem is that C libraries with their our own uapi headers still
need your app-specific headers. to build iproute2 we need to put
iproute2's include/ on our include path, but then the fact that your
different uapi headers are *under* that directory causes the conflict.
separating your headers from your copy of the uapi headers is a
necessary part of any fix.
--
Elliott Hughes - http://who/enh - http://jessies.org/~enh/
Android native code/tools questions? Mail me/drop by/add me as a reviewer.
^ permalink raw reply
* Re: [oss-drivers] Re: [PATCH net-next 2/2] flow_dissector: dissect tunnel info
From: Simon Horman @ 2017-10-02 20:04 UTC (permalink / raw)
To: Tom Herbert
Cc: David Miller, Jiri Pirko, Jamal Hadi Salim, Cong Wang,
Linux Kernel Network Developers, oss-drivers
In-Reply-To: <CALx6S35DpMWdLVWKSoRdsfROM7=A2p9pLi7n2fMtmqwhSEh5wQ@mail.gmail.com>
On Mon, Oct 02, 2017 at 12:36:33PM -0700, Tom Herbert wrote:
> On Mon, Oct 2, 2017 at 1:41 AM, Simon Horman <simon.horman@netronome.com> wrote:
> > Move dissection of tunnel info from the flower classifier to the flow
> > dissector where all other dissection occurs. This should not have any
> > behavioural affect on other users of the flow dissector.
...
> > +static void
> > +__skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
> > + struct flow_dissector *flow_dissector,
> > + void *target_container)
> > +{
> > + struct ip_tunnel_info *info;
> > + struct ip_tunnel_key *key;
> > +
> > + /* A quick check to see if there might be something to do. */
> > + if (!dissector_uses_key(flow_dissector,
> > + FLOW_DISSECTOR_KEY_ENC_KEYID) &&
> > + !dissector_uses_key(flow_dissector,
> > + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) &&
> > + !dissector_uses_key(flow_dissector,
> > + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) &&
> > + !dissector_uses_key(flow_dissector,
> > + FLOW_DISSECTOR_KEY_ENC_CONTROL) &&
> > + !dissector_uses_key(flow_dissector,
> > + FLOW_DISSECTOR_KEY_ENC_PORTS))
> > + return;
>
> This complex conditional is now in the path of every call to flow
> dissector regardless of whether a classifier is enabled or tunnels
> are. What does the assembly show in terms of number of branches? Can
> we at least get this down to one check (might be a use case for
> FLOW_DISSECTOR_F_FLOWER ;-) ), or even better use the static key when
> encap or is enabled?
Hi Tom,
it appears to me (a little to my surprise but I did check before
posting) that the compiler turns the above into a single comparison.
$ gcc --version
gcc (Debian 4.9.2-10) 4.9.2
Copyright (C) 2014 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
^ permalink raw reply
* [net-next 03/15] i40e: Fix a potential NULL pointer dereference
From: Jeff Kirsher @ 2017-10-02 19:48 UTC (permalink / raw)
To: davem; +Cc: Christophe JAILLET, netdev, nhorman, sassmann, jogreene,
Jeff Kirsher
In-Reply-To: <20171002194852.71970-1-jeffrey.t.kirsher@intel.com>
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
If 'kzalloc()' fails, a NULL pointer will be dereferenced.
Return an error code (-ENOMEM) instead.
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index e6b95e1e1a33..9e3667fc7f6a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -423,6 +423,9 @@ static int i40e_config_iwarp_qvlist(struct i40e_vf *vf,
(sizeof(struct virtchnl_iwarp_qv_info) *
(qvlist_info->num_vectors - 1));
vf->qvlist_info = kzalloc(size, GFP_KERNEL);
+ if (!vf->qvlist_info)
+ return -ENOMEM;
+
vf->qvlist_info->num_vectors = qvlist_info->num_vectors;
msix_vf = pf->hw.func_caps.num_msix_vectors_vf;
--
2.14.2
^ permalink raw reply related
* [net-next 06/15] i40e: use admin queue for setting LEDs behavior
From: Jeff Kirsher @ 2017-10-02 19:48 UTC (permalink / raw)
To: davem; +Cc: Mariusz Stachura, netdev, nhorman, sassmann, jogreene,
Jeff Kirsher
In-Reply-To: <20171002194852.71970-1-jeffrey.t.kirsher@intel.com>
From: Mariusz Stachura <mariusz.stachura@intel.com>
Instead of accessing register directly, use newly added AQC in
order to blink LEDs. Introduce and utilize a new flag to prevent
excessive API version checking.
Signed-off-by: Mariusz Stachura <mariusz.stachura@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
drivers/net/ethernet/intel/i40e/i40e_adminq.c | 6 ++
drivers/net/ethernet/intel/i40e/i40e_common.c | 115 ++++++++++++++++++++------
drivers/net/ethernet/intel/i40evf/i40e_type.h | 2 +
3 files changed, 99 insertions(+), 24 deletions(-)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq.c b/drivers/net/ethernet/intel/i40e/i40e_adminq.c
index ba04988e0598..08f63226105a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq.c
@@ -607,6 +607,12 @@ i40e_status i40e_init_adminq(struct i40e_hw *hw)
&oem_lo);
hw->nvm.oem_ver = ((u32)oem_hi << 16) | oem_lo;
+ if (hw->mac.type == I40E_MAC_XL710 &&
+ hw->aq.api_maj_ver == I40E_FW_API_VERSION_MAJOR &&
+ hw->aq.api_min_ver >= I40E_MINOR_VER_GET_LINK_INFO_XL710) {
+ hw->flags |= I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE;
+ }
+
if (hw->aq.api_maj_ver > I40E_FW_API_VERSION_MAJOR) {
ret_code = I40E_ERR_FIRMWARE_API_VERSION;
goto init_adminq_free_arq;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c
index fada03799850..a4838779de5d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -4836,6 +4836,74 @@ i40e_status i40e_blink_phy_link_led(struct i40e_hw *hw,
return status;
}
+/**
+ * i40e_led_get_reg - read LED register
+ * @hw: pointer to the HW structure
+ * @led_addr: LED register address
+ * @reg_val: read register value
+ **/
+static enum i40e_status_code i40e_led_get_reg(struct i40e_hw *hw, u16 led_addr,
+ u32 *reg_val)
+{
+ enum i40e_status_code status;
+ u8 phy_addr = 0;
+ u8 port_num;
+ u32 i;
+
+ *reg_val = 0;
+ if (hw->flags & I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE) {
+ status =
+ i40e_aq_get_phy_register(hw,
+ I40E_AQ_PHY_REG_ACCESS_EXTERNAL,
+ I40E_PHY_COM_REG_PAGE,
+ I40E_PHY_LED_PROV_REG_1,
+ reg_val, NULL);
+ } else {
+ i = rd32(hw, I40E_PFGEN_PORTNUM);
+ port_num = (u8)(i & I40E_PFGEN_PORTNUM_PORT_NUM_MASK);
+ phy_addr = i40e_get_phy_address(hw, port_num);
+ status = i40e_read_phy_register_clause45(hw,
+ I40E_PHY_COM_REG_PAGE,
+ led_addr, phy_addr,
+ (u16 *)reg_val);
+ }
+ return status;
+}
+
+/**
+ * i40e_led_set_reg - write LED register
+ * @hw: pointer to the HW structure
+ * @led_addr: LED register address
+ * @reg_val: register value to write
+ **/
+static enum i40e_status_code i40e_led_set_reg(struct i40e_hw *hw, u16 led_addr,
+ u32 reg_val)
+{
+ enum i40e_status_code status;
+ u8 phy_addr = 0;
+ u8 port_num;
+ u32 i;
+
+ if (hw->flags & I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE) {
+ status =
+ i40e_aq_set_phy_register(hw,
+ I40E_AQ_PHY_REG_ACCESS_EXTERNAL,
+ I40E_PHY_COM_REG_PAGE,
+ I40E_PHY_LED_PROV_REG_1,
+ reg_val, NULL);
+ } else {
+ i = rd32(hw, I40E_PFGEN_PORTNUM);
+ port_num = (u8)(i & I40E_PFGEN_PORTNUM_PORT_NUM_MASK);
+ phy_addr = i40e_get_phy_address(hw, port_num);
+ status = i40e_write_phy_register_clause45(hw,
+ I40E_PHY_COM_REG_PAGE,
+ led_addr, phy_addr,
+ (u16)reg_val);
+ }
+
+ return status;
+}
+
/**
* i40e_led_get_phy - return current on/off mode
* @hw: pointer to the hw struct
@@ -4853,7 +4921,19 @@ i40e_status i40e_led_get_phy(struct i40e_hw *hw, u16 *led_addr,
u16 temp_addr;
u8 port_num;
u32 i;
-
+ u32 reg_val_aq;
+
+ if (hw->flags & I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE) {
+ status =
+ i40e_aq_get_phy_register(hw,
+ I40E_AQ_PHY_REG_ACCESS_EXTERNAL,
+ I40E_PHY_COM_REG_PAGE,
+ I40E_PHY_LED_PROV_REG_1,
+ ®_val_aq, NULL);
+ if (status == I40E_SUCCESS)
+ *val = (u16)reg_val_aq;
+ return status;
+ }
temp_addr = I40E_PHY_LED_PROV_REG_1;
i = rd32(hw, I40E_PFGEN_PORTNUM);
port_num = (u8)(i & I40E_PFGEN_PORTNUM_PORT_NUM_MASK);
@@ -4888,51 +4968,38 @@ i40e_status i40e_led_set_phy(struct i40e_hw *hw, bool on,
u16 led_addr, u32 mode)
{
i40e_status status = 0;
- u16 led_ctl = 0;
- u16 led_reg = 0;
- u8 phy_addr = 0;
- u8 port_num;
- u32 i;
+ u32 led_ctl = 0;
+ u32 led_reg = 0;
- i = rd32(hw, I40E_PFGEN_PORTNUM);
- port_num = (u8)(i & I40E_PFGEN_PORTNUM_PORT_NUM_MASK);
- phy_addr = i40e_get_phy_address(hw, port_num);
- status = i40e_read_phy_register_clause45(hw, I40E_PHY_COM_REG_PAGE,
- led_addr, phy_addr, &led_reg);
+ status = i40e_led_get_reg(hw, led_addr, &led_reg);
if (status)
return status;
led_ctl = led_reg;
if (led_reg & I40E_PHY_LED_LINK_MODE_MASK) {
led_reg = 0;
- status = i40e_write_phy_register_clause45(hw,
- I40E_PHY_COM_REG_PAGE,
- led_addr, phy_addr,
- led_reg);
+ status = i40e_led_set_reg(hw, led_addr, led_reg);
if (status)
return status;
}
- status = i40e_read_phy_register_clause45(hw, I40E_PHY_COM_REG_PAGE,
- led_addr, phy_addr, &led_reg);
+ status = i40e_led_get_reg(hw, led_addr, &led_reg);
if (status)
goto restore_config;
if (on)
led_reg = I40E_PHY_LED_MANUAL_ON;
else
led_reg = 0;
- status = i40e_write_phy_register_clause45(hw, I40E_PHY_COM_REG_PAGE,
- led_addr, phy_addr, led_reg);
+
+ status = i40e_led_set_reg(hw, led_addr, led_reg);
if (status)
goto restore_config;
if (mode & I40E_PHY_LED_MODE_ORIG) {
led_ctl = (mode & I40E_PHY_LED_MODE_MASK);
- status = i40e_write_phy_register_clause45(hw,
- I40E_PHY_COM_REG_PAGE,
- led_addr, phy_addr, led_ctl);
+ status = i40e_led_set_reg(hw, led_addr, led_ctl);
}
return status;
+
restore_config:
- status = i40e_write_phy_register_clause45(hw, I40E_PHY_COM_REG_PAGE,
- led_addr, phy_addr, led_ctl);
+ status = i40e_led_set_reg(hw, led_addr, led_ctl);
return status;
}
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_type.h b/drivers/net/ethernet/intel/i40evf/i40e_type.h
index b53584e3d580..48eacf5e73e4 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_type.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_type.h
@@ -568,6 +568,8 @@ struct i40e_hw {
/* LLDP/DCBX Status */
u16 dcbx_status;
+#define I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE BIT_ULL(2)
+
/* DCBX info */
struct i40e_dcbx_config local_dcbx_config; /* Oper/Local Cfg */
struct i40e_dcbx_config remote_dcbx_config; /* Peer Cfg */
--
2.14.2
^ permalink raw reply related
* [net-next 09/15] i40e: make use of i40e_vc_disable_vf
From: Jeff Kirsher @ 2017-10-02 19:48 UTC (permalink / raw)
To: davem; +Cc: Jacob Keller, netdev, nhorman, sassmann, jogreene, Jeff Kirsher
In-Reply-To: <20171002194852.71970-1-jeffrey.t.kirsher@intel.com>
From: Jacob Keller <jacob.e.keller@intel.com>
Replace i40e_vc_notify_vf_reset and i40e_reset_vf with a call to
i40e_vc_disable_vf which does this exact thing. This matches similar
code patterns throughout the driver.
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index 70a79864177a..94ee243f110e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -3388,8 +3388,7 @@ int i40e_ndo_set_vf_trust(struct net_device *netdev, int vf_id, bool setting)
goto out;
vf->trusted = setting;
- i40e_vc_notify_vf_reset(vf);
- i40e_reset_vf(vf, false);
+ i40e_vc_disable_vf(vf);
dev_info(&pf->pdev->dev, "VF %u is now %strusted\n",
vf_id, setting ? "" : "un");
out:
--
2.14.2
^ permalink raw reply related
* [net-next 07/15] i40e: don't hold spinlock while resetting VF
From: Jeff Kirsher @ 2017-10-02 19:48 UTC (permalink / raw)
To: davem; +Cc: Jacob Keller, netdev, nhorman, sassmann, jogreene, Jeff Kirsher
In-Reply-To: <20171002194852.71970-1-jeffrey.t.kirsher@intel.com>
From: Jacob Keller <jacob.e.keller@intel.com>
When we refactored handling of the PVID in commit 9af52f60b2d9
("i40e: use (add|rm)_vlan_all_mac helper functions when changing PVID")
we introduced a scheduling while atomic regression.
This occurred because we now held the spinlock across a call to
i40e_reset_vf(), which results in a usleep_range() call that triggers
a scheduling while atomic bug. This was rare as it only occurred if the
user configured a VLAN on a VF and also attempted to reconfigure the VF
from the host system with a port VLAN.
We do need to hold the lock while calling i40e_is_vsi_in_vlan(), but we
should not be holding it while we reset the VF.
We'll fix this by introducing a separate helper function
i40e_vsi_has_vlans which checks whether we have a PVID and whether the
VSI has configured VLANs. This helper function will manage its own need
for the mac_filter_hash_lock.
Then, we can move the acquiring of the spinlock until after we reset the
VF, which ensures that we do not sleep while holding the lock.
Using a separate function like this makes the code more clear and is
easier to read than attempting to release and re-acquire the spinlock
when we reset the VF.
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 36 +++++++++++++++++++---
1 file changed, 32 insertions(+), 4 deletions(-)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index 9e3667fc7f6a..53ead127b293 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -2925,6 +2925,34 @@ int i40e_ndo_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac)
return ret;
}
+/**
+ * i40e_vsi_has_vlans - True if VSI has configured VLANs
+ * @vsi: pointer to the vsi
+ *
+ * Check if a VSI has configured any VLANs. False if we have a port VLAN or if
+ * we have no configured VLANs. Do not call while holding the
+ * mac_filter_hash_lock.
+ */
+static bool i40e_vsi_has_vlans(struct i40e_vsi *vsi)
+{
+ bool have_vlans;
+
+ /* If we have a port VLAN, then the VSI cannot have any VLANs
+ * configured, as all MAC/VLAN filters will be assigned to the PVID.
+ */
+ if (vsi->info.pvid)
+ return false;
+
+ /* Since we don't have a PVID, we know that if the device is in VLAN
+ * mode it must be because of a VLAN filter configured on this VSI.
+ */
+ spin_lock_bh(&vsi->mac_filter_hash_lock);
+ have_vlans = i40e_is_vsi_in_vlan(vsi);
+ spin_unlock_bh(&vsi->mac_filter_hash_lock);
+
+ return have_vlans;
+}
+
/**
* i40e_ndo_set_vf_port_vlan
* @netdev: network interface device structure
@@ -2977,10 +3005,7 @@ int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id,
/* duplicate request, so just return success */
goto error_pvid;
- /* Locked once because multiple functions below iterate list */
- spin_lock_bh(&vsi->mac_filter_hash_lock);
-
- if (le16_to_cpu(vsi->info.pvid) == 0 && i40e_is_vsi_in_vlan(vsi)) {
+ if (i40e_vsi_has_vlans(vsi)) {
dev_err(&pf->pdev->dev,
"VF %d has already configured VLAN filters and the administrator is requesting a port VLAN override.\nPlease unload and reload the VF driver for this change to take effect.\n",
vf_id);
@@ -2993,6 +3018,9 @@ int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id,
vsi = pf->vsi[vf->lan_vsi_idx];
}
+ /* Locked once because multiple functions below iterate list */
+ spin_lock_bh(&vsi->mac_filter_hash_lock);
+
/* Check for condition where there was already a port VLAN ID
* filter set and now it is being deleted by setting it to zero.
* Additionally check for the condition where there was a port
--
2.14.2
^ permalink raw reply related
* [net-next 12/15] i40e: make i40evf_map_rings_to_vectors void
From: Jeff Kirsher @ 2017-10-02 19:48 UTC (permalink / raw)
To: davem; +Cc: Mitch Williams, netdev, nhorman, sassmann, jogreene, Jeff Kirsher
In-Reply-To: <20171002194852.71970-1-jeffrey.t.kirsher@intel.com>
From: Mitch Williams <mitch.a.williams@intel.com>
This function cannot fail, so why is it returning a value? And why are
we checking it? Why shouldn't we just make it void? Why is this commit
message made up of only questions?
Signed-off-by: Mitch Williams <mitch.a.williams@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
drivers/net/ethernet/intel/i40evf/i40evf_main.c | 9 ++-------
1 file changed, 2 insertions(+), 7 deletions(-)
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
index 8c513ce84345..f2f1e754c2ce 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
@@ -430,12 +430,11 @@ i40evf_map_vector_to_txq(struct i40evf_adapter *adapter, int v_idx, int t_idx)
* group the rings as "efficiently" as possible. You would add new
* mapping configurations in here.
**/
-static int i40evf_map_rings_to_vectors(struct i40evf_adapter *adapter)
+static void i40evf_map_rings_to_vectors(struct i40evf_adapter *adapter)
{
int rings_remaining = adapter->num_active_queues;
int ridx = 0, vidx = 0;
int q_vectors;
- int err = 0;
q_vectors = adapter->num_msix_vectors - NONQ_VECS;
@@ -451,8 +450,6 @@ static int i40evf_map_rings_to_vectors(struct i40evf_adapter *adapter)
}
adapter->aq_required |= I40EVF_FLAG_AQ_MAP_VECTORS;
-
- return err;
}
#ifdef CONFIG_NET_POLL_CONTROLLER
@@ -1578,9 +1575,7 @@ static int i40evf_reinit_interrupt_scheme(struct i40evf_adapter *adapter)
set_bit(__I40E_VSI_DOWN, adapter->vsi.state);
- err = i40evf_map_rings_to_vectors(adapter);
- if (err)
- goto err;
+ i40evf_map_rings_to_vectors(adapter);
if (RSS_AQ(adapter))
adapter->aq_required |= I40EVF_FLAG_AQ_CONFIGURE_RSS;
--
2.14.2
^ permalink raw reply related
* [net-next 15/15] i40e: Stop dropping 802.1ad tags - eth proto 0x88a8
From: Jeff Kirsher @ 2017-10-02 19:48 UTC (permalink / raw)
To: davem; +Cc: Scott Peterson, netdev, nhorman, sassmann, jogreene, Jeff Kirsher
In-Reply-To: <20171002194852.71970-1-jeffrey.t.kirsher@intel.com>
From: Scott Peterson <scott.d.peterson@intel.com>
Enable i40e to pass traffic with VLAN tags using the 802.1ad ethernet
protocol ID (0x88a8).
This requires NIC firmware providing version 1.7 of the API. With
older NIC firmware 802.1ad tagged packets will continue to be dropped.
No VLAN offloads nor RSS are supported for 802.1ad VLANs.
Signed-off-by: Scott Peterson <scott.d.peterson@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
drivers/net/ethernet/intel/i40e/i40e_adminq.c | 6 ++++++
drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h | 17 ++++++++++++++++-
drivers/net/ethernet/intel/i40e/i40e_common.c | 6 +++++-
drivers/net/ethernet/intel/i40e/i40e_main.c | 7 +++++++
drivers/net/ethernet/intel/i40e/i40e_type.h | 6 ++++++
drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h | 17 ++++++++++++++++-
drivers/net/ethernet/intel/i40evf/i40e_type.h | 6 ++++++
7 files changed, 62 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq.c b/drivers/net/ethernet/intel/i40e/i40e_adminq.c
index 08f63226105a..9dcb2a961197 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq.c
@@ -613,6 +613,12 @@ i40e_status i40e_init_adminq(struct i40e_hw *hw)
hw->flags |= I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE;
}
+ /* The ability to RX (not drop) 802.1ad frames was added in API 1.7 */
+ if (hw->aq.api_maj_ver > 1 ||
+ (hw->aq.api_maj_ver == 1 &&
+ hw->aq.api_min_ver >= 7))
+ hw->flags |= I40E_HW_FLAG_802_1AD_CAPABLE;
+
if (hw->aq.api_maj_ver > I40E_FW_API_VERSION_MAJOR) {
ret_code = I40E_ERR_FIRMWARE_API_VERSION;
goto init_adminq_free_arq;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
index ed7bbe14bc6e..4c85ea9cd89a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
@@ -775,7 +775,22 @@ struct i40e_aqc_set_switch_config {
#define I40E_AQ_SET_SWITCH_CFG_PROMISC 0x0001
#define I40E_AQ_SET_SWITCH_CFG_L2_FILTER 0x0002
__le16 valid_flags;
- u8 reserved[12];
+ /* The ethertype in switch_tag is dropped on ingress and used
+ * internally by the switch. Set this to zero for the default
+ * of 0x88a8 (802.1ad). Should be zero for firmware API
+ * versions lower than 1.7.
+ */
+ __le16 switch_tag;
+ /* The ethertypes in first_tag and second_tag are used to
+ * match the outer and inner VLAN tags (respectively) when HW
+ * double VLAN tagging is enabled via the set port parameters
+ * AQ command. Otherwise these are both ignored. Set them to
+ * zero for their defaults of 0x8100 (802.1Q). Should be zero
+ * for firmware API versions lower than 1.7.
+ */
+ __le16 first_tag;
+ __le16 second_tag;
+ u8 reserved[6];
};
I40E_CHECK_CMD_LENGTH(i40e_aqc_set_switch_config);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c
index a4838779de5d..60542beda7ad 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -2402,7 +2402,11 @@ enum i40e_status_code i40e_aq_set_switch_config(struct i40e_hw *hw,
i40e_aqc_opc_set_switch_config);
scfg->flags = cpu_to_le16(flags);
scfg->valid_flags = cpu_to_le16(valid_flags);
-
+ if (hw->flags & I40E_HW_FLAG_802_1AD_CAPABLE) {
+ scfg->switch_tag = cpu_to_le16(hw->switch_tag);
+ scfg->first_tag = cpu_to_le16(hw->first_tag);
+ scfg->second_tag = cpu_to_le16(hw->second_tag);
+ }
status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
return status;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 387f0863f794..3f9e89b054ec 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -11361,6 +11361,13 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
hw->bus.bus_id = pdev->bus->number;
pf->instance = pfs_found;
+ /* Select something other than the 802.1ad ethertype for the
+ * switch to use internally and drop on ingress.
+ */
+ hw->switch_tag = 0xffff;
+ hw->first_tag = ETH_P_8021AD;
+ hw->second_tag = ETH_P_8021Q;
+
INIT_LIST_HEAD(&pf->l3_flex_pit_list);
INIT_LIST_HEAD(&pf->l4_flex_pit_list);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h b/drivers/net/ethernet/intel/i40e/i40e_type.h
index 8b0b9f826b7f..4b32b1d38a66 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_type.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_type.h
@@ -610,9 +610,15 @@ struct i40e_hw {
struct i40e_dcbx_config desired_dcbx_config; /* CEE Desired Cfg */
#define I40E_HW_FLAG_AQ_SRCTL_ACCESS_ENABLE BIT_ULL(0)
+#define I40E_HW_FLAG_802_1AD_CAPABLE BIT_ULL(1)
#define I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE BIT_ULL(2)
u64 flags;
+ /* Used in set switch config AQ command */
+ u16 switch_tag;
+ u16 first_tag;
+ u16 second_tag;
+
/* debug mask */
u32 debug_mask;
char err_str[16];
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h
index eee7ece42b39..ed5602f4bbcd 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h
@@ -771,7 +771,22 @@ struct i40e_aqc_set_switch_config {
#define I40E_AQ_SET_SWITCH_CFG_PROMISC 0x0001
#define I40E_AQ_SET_SWITCH_CFG_L2_FILTER 0x0002
__le16 valid_flags;
- u8 reserved[12];
+ /* The ethertype in switch_tag is dropped on ingress and used
+ * internally by the switch. Set this to zero for the default
+ * of 0x88a8 (802.1ad). Should be zero for firmware API
+ * versions lower than 1.7.
+ */
+ __le16 switch_tag;
+ /* The ethertypes in first_tag and second_tag are used to
+ * match the outer and inner VLAN tags (respectively) when HW
+ * double VLAN tagging is enabled via the set port parameters
+ * AQ command. Otherwise these are both ignored. Set them to
+ * zero for their defaults of 0x8100 (802.1Q). Should be zero
+ * for firmware API versions lower than 1.7.
+ */
+ __le16 first_tag;
+ __le16 second_tag;
+ u8 reserved[6];
};
I40E_CHECK_CMD_LENGTH(i40e_aqc_set_switch_config);
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_type.h b/drivers/net/ethernet/intel/i40evf/i40e_type.h
index 48eacf5e73e4..9364b67fff9c 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_type.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_type.h
@@ -568,6 +568,7 @@ struct i40e_hw {
/* LLDP/DCBX Status */
u16 dcbx_status;
+#define I40E_HW_FLAG_802_1AD_CAPABLE BIT_ULL(1)
#define I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE BIT_ULL(2)
/* DCBX info */
@@ -575,6 +576,11 @@ struct i40e_hw {
struct i40e_dcbx_config remote_dcbx_config; /* Peer Cfg */
struct i40e_dcbx_config desired_dcbx_config; /* CEE Desired Cfg */
+ /* Used in set switch config AQ command */
+ u16 switch_tag;
+ u16 first_tag;
+ u16 second_tag;
+
/* debug mask */
u32 debug_mask;
char err_str[16];
--
2.14.2
^ permalink raw reply related
* [net-next 11/15] i40evf: Enable VF to request an alternate queue allocation
From: Jeff Kirsher @ 2017-10-02 19:48 UTC (permalink / raw)
To: davem; +Cc: Alan Brady, netdev, nhorman, sassmann, jogreene, Jeff Kirsher
In-Reply-To: <20171002194852.71970-1-jeffrey.t.kirsher@intel.com>
From: Alan Brady <alan.brady@intel.com>
Currently the VF gets a default number of allocated queues from HW on
init and it could choose to enable or disable those allocated queues.
This makes it such that the VF can request more or less underlying
allocated queues from the PF.
First the VF negotiates the number of queues it wants that can be
supported by the PF and if successful asks for a reset. During reset
the PF will reallocate the HW queues for the VF and will then remap the
new queues.
Signed-off-by: Alan Brady <alan.brady@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
drivers/net/ethernet/intel/i40evf/i40evf.h | 4 +
drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c | 38 ++++++++-
drivers/net/ethernet/intel/i40evf/i40evf_main.c | 94 ++++++++++++++++++++--
.../net/ethernet/intel/i40evf/i40evf_virtchnl.c | 44 +++++++++-
4 files changed, 173 insertions(+), 7 deletions(-)
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf.h b/drivers/net/ethernet/intel/i40evf/i40evf.h
index 82f69031e5cd..5982362c5643 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf.h
+++ b/drivers/net/ethernet/intel/i40evf/i40evf.h
@@ -102,6 +102,7 @@ struct i40e_vsi {
#define I40E_TX_CTXTDESC(R, i) \
(&(((struct i40e_tx_context_desc *)((R)->desc))[i]))
#define MAX_QUEUES 16
+#define I40EVF_MAX_REQ_QUEUES 4
#define I40EVF_HKEY_ARRAY_SIZE ((I40E_VFQF_HKEY_MAX_INDEX + 1) * 4)
#define I40EVF_HLUT_ARRAY_SIZE ((I40E_VFQF_HLUT_MAX_INDEX + 1) * 4)
@@ -200,6 +201,7 @@ struct i40evf_adapter {
struct list_head vlan_filter_list;
char misc_vector_name[IFNAMSIZ + 9];
int num_active_queues;
+ int num_req_queues;
/* TX */
struct i40e_ring *tx_rings;
@@ -235,6 +237,7 @@ struct i40evf_adapter {
#define I40EVF_FLAG_PROMISC_ON BIT(18)
#define I40EVF_FLAG_ALLMULTI_ON BIT(19)
#define I40EVF_FLAG_LEGACY_RX BIT(20)
+#define I40EVF_FLAG_REINIT_ITR_NEEDED BIT(21)
/* duplicates for common code */
#define I40E_FLAG_DCB_ENABLED 0
#define I40E_FLAG_RX_CSUM_ENABLED I40EVF_FLAG_RX_CSUM_ENABLED
@@ -349,6 +352,7 @@ void i40evf_deconfigure_queues(struct i40evf_adapter *adapter);
void i40evf_enable_queues(struct i40evf_adapter *adapter);
void i40evf_disable_queues(struct i40evf_adapter *adapter);
void i40evf_map_queues(struct i40evf_adapter *adapter);
+int i40evf_request_queues(struct i40evf_adapter *adapter, int num);
void i40evf_add_ether_addrs(struct i40evf_adapter *adapter);
void i40evf_del_ether_addrs(struct i40evf_adapter *adapter);
void i40evf_add_vlans(struct i40evf_adapter *adapter);
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c b/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c
index 65874d6b3ab9..da006fa3fec1 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c
@@ -669,7 +669,7 @@ static void i40evf_get_channels(struct net_device *netdev,
struct i40evf_adapter *adapter = netdev_priv(netdev);
/* Report maximum channels */
- ch->max_combined = adapter->num_active_queues;
+ ch->max_combined = I40EVF_MAX_REQ_QUEUES;
ch->max_other = NONQ_VECS;
ch->other_count = NONQ_VECS;
@@ -677,6 +677,41 @@ static void i40evf_get_channels(struct net_device *netdev,
ch->combined_count = adapter->num_active_queues;
}
+/**
+ * i40evf_set_channels: set the new channel count
+ * @netdev: network interface device structure
+ * @ch: channel information structure
+ *
+ * Negotiate a new number of channels with the PF then do a reset. During
+ * reset we'll realloc queues and fix the RSS table. Returns 0 on success,
+ * negative on failure.
+ **/
+static int i40evf_set_channels(struct net_device *netdev,
+ struct ethtool_channels *ch)
+{
+ struct i40evf_adapter *adapter = netdev_priv(netdev);
+ int num_req = ch->combined_count;
+
+ if (num_req != adapter->num_active_queues &&
+ !(adapter->vf_res->vf_cap_flags &
+ VIRTCHNL_VF_OFFLOAD_REQ_QUEUES)) {
+ dev_info(&adapter->pdev->dev, "PF is not capable of queue negotiation.\n");
+ return -EINVAL;
+ }
+
+ /* All of these should have already been checked by ethtool before this
+ * even gets to us, but just to be sure.
+ */
+ if (num_req <= 0 || num_req > I40EVF_MAX_REQ_QUEUES)
+ return -EINVAL;
+
+ if (ch->rx_count || ch->tx_count || ch->other_count != NONQ_VECS)
+ return -EINVAL;
+
+ adapter->num_req_queues = num_req;
+ return i40evf_request_queues(adapter, num_req);
+}
+
/**
* i40evf_get_rxfh_key_size - get the RSS hash key size
* @netdev: network interface device structure
@@ -785,6 +820,7 @@ static const struct ethtool_ops i40evf_ethtool_ops = {
.get_rxfh = i40evf_get_rxfh,
.set_rxfh = i40evf_set_rxfh,
.get_channels = i40evf_get_channels,
+ .set_channels = i40evf_set_channels,
.get_rxfh_key_size = i40evf_get_rxfh_key_size,
.get_link_ksettings = i40evf_get_link_ksettings,
};
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
index 69ef6c1d5364..8c513ce84345 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
@@ -1189,9 +1189,18 @@ static int i40evf_alloc_queues(struct i40evf_adapter *adapter)
{
int i, num_active_queues;
- num_active_queues = min_t(int,
- adapter->vsi_res->num_queue_pairs,
- (int)(num_online_cpus()));
+ /* If we're in reset reallocating queues we don't actually know yet for
+ * certain the PF gave us the number of queues we asked for but we'll
+ * assume it did. Once basic reset is finished we'll confirm once we
+ * start negotiating config with PF.
+ */
+ if (adapter->num_req_queues)
+ num_active_queues = adapter->num_req_queues;
+ else
+ num_active_queues = min_t(int,
+ adapter->vsi_res->num_queue_pairs,
+ (int)(num_online_cpus()));
+
adapter->tx_rings = kcalloc(num_active_queues,
sizeof(struct i40e_ring), GFP_KERNEL);
@@ -1539,6 +1548,48 @@ static void i40evf_free_rss(struct i40evf_adapter *adapter)
adapter->rss_lut = NULL;
}
+/**
+ * i40evf_reinit_interrupt_scheme - Reallocate queues and vectors
+ * @adapter: board private structure
+ *
+ * Returns 0 on success, negative on failure
+ **/
+static int i40evf_reinit_interrupt_scheme(struct i40evf_adapter *adapter)
+{
+ struct net_device *netdev = adapter->netdev;
+ int err;
+
+ if (netif_running(netdev))
+ i40evf_free_traffic_irqs(adapter);
+ i40evf_free_misc_irq(adapter);
+ i40evf_reset_interrupt_capability(adapter);
+ i40evf_free_q_vectors(adapter);
+ i40evf_free_queues(adapter);
+
+ err = i40evf_init_interrupt_scheme(adapter);
+ if (err)
+ goto err;
+
+ netif_tx_stop_all_queues(netdev);
+
+ err = i40evf_request_misc_irq(adapter);
+ if (err)
+ goto err;
+
+ set_bit(__I40E_VSI_DOWN, adapter->vsi.state);
+
+ err = i40evf_map_rings_to_vectors(adapter);
+ if (err)
+ goto err;
+
+ if (RSS_AQ(adapter))
+ adapter->aq_required |= I40EVF_FLAG_AQ_CONFIGURE_RSS;
+ else
+ err = i40evf_init_rss(adapter);
+err:
+ return err;
+}
+
/**
* i40evf_watchdog_timer - Periodic call-back timer
* @data: pointer to adapter disguised as unsigned long
@@ -1885,8 +1936,15 @@ static void i40evf_reset_task(struct work_struct *work)
if (err)
dev_info(&adapter->pdev->dev, "Failed to init adminq: %d\n",
err);
+ adapter->aq_required = 0;
- adapter->aq_required = I40EVF_FLAG_AQ_GET_CONFIG;
+ if (adapter->flags & I40EVF_FLAG_REINIT_ITR_NEEDED) {
+ err = i40evf_reinit_interrupt_scheme(adapter);
+ if (err)
+ goto reset_err;
+ }
+
+ adapter->aq_required |= I40EVF_FLAG_AQ_GET_CONFIG;
adapter->aq_required |= I40EVF_FLAG_AQ_MAP_VECTORS;
/* re-add all MAC filters */
@@ -1916,6 +1974,15 @@ static void i40evf_reset_task(struct work_struct *work)
if (err)
goto reset_err;
+ if (adapter->flags & I40EVF_FLAG_REINIT_ITR_NEEDED) {
+ err = i40evf_request_traffic_irqs(adapter,
+ netdev->name);
+ if (err)
+ goto reset_err;
+
+ adapter->flags &= ~I40EVF_FLAG_REINIT_ITR_NEEDED;
+ }
+
i40evf_configure(adapter);
i40evf_up_complete(adapter);
@@ -2431,9 +2498,9 @@ static int i40evf_check_reset_complete(struct i40e_hw *hw)
int i40evf_process_config(struct i40evf_adapter *adapter)
{
struct virtchnl_vf_resource *vfres = adapter->vf_res;
+ int i, num_req_queues = adapter->num_req_queues;
struct net_device *netdev = adapter->netdev;
struct i40e_vsi *vsi = &adapter->vsi;
- int i;
netdev_features_t hw_enc_features;
netdev_features_t hw_features;
@@ -2447,6 +2514,23 @@ int i40evf_process_config(struct i40evf_adapter *adapter)
return -ENODEV;
}
+ if (num_req_queues &&
+ num_req_queues != adapter->vsi_res->num_queue_pairs) {
+ /* Problem. The PF gave us fewer queues than what we had
+ * negotiated in our request. Need a reset to see if we can't
+ * get back to a working state.
+ */
+ dev_err(&adapter->pdev->dev,
+ "Requested %d queues, but PF only gave us %d.\n",
+ num_req_queues,
+ adapter->vsi_res->num_queue_pairs);
+ adapter->flags |= I40EVF_FLAG_REINIT_ITR_NEEDED;
+ adapter->num_req_queues = adapter->vsi_res->num_queue_pairs;
+ i40evf_schedule_reset(adapter);
+ return -ENODEV;
+ }
+ adapter->num_req_queues = 0;
+
hw_enc_features = NETIF_F_SG |
NETIF_F_IP_CSUM |
NETIF_F_IPV6_CSUM |
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c b/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c
index 2bb0fe00361f..2bb81c39d85f 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c
@@ -160,7 +160,8 @@ int i40evf_send_vf_config_msg(struct i40evf_adapter *adapter)
VIRTCHNL_VF_OFFLOAD_WB_ON_ITR |
VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2 |
VIRTCHNL_VF_OFFLOAD_ENCAP |
- VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM;
+ VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM |
+ VIRTCHNL_VF_OFFLOAD_REQ_QUEUES;
adapter->current_op = VIRTCHNL_OP_GET_VF_RESOURCES;
adapter->aq_required &= ~I40EVF_FLAG_AQ_GET_CONFIG;
@@ -384,6 +385,32 @@ void i40evf_map_queues(struct i40evf_adapter *adapter)
kfree(vimi);
}
+/**
+ * i40evf_request_queues
+ * @adapter: adapter structure
+ * @num: number of requested queues
+ *
+ * We get a default number of queues from the PF. This enables us to request a
+ * different number. Returns 0 on success, negative on failure
+ **/
+int i40evf_request_queues(struct i40evf_adapter *adapter, int num)
+{
+ struct virtchnl_vf_res_request vfres;
+
+ if (adapter->current_op != VIRTCHNL_OP_UNKNOWN) {
+ /* bail because we already have a command pending */
+ dev_err(&adapter->pdev->dev, "Cannot request queues, command %d pending\n",
+ adapter->current_op);
+ return -EBUSY;
+ }
+
+ vfres.num_queue_pairs = num;
+
+ adapter->current_op = VIRTCHNL_OP_REQUEST_QUEUES;
+ return i40evf_send_pf_msg(adapter, VIRTCHNL_OP_REQUEST_QUEUES,
+ (u8 *)&vfres, sizeof(vfres));
+}
+
/**
* i40evf_add_ether_addrs
* @adapter: adapter structure
@@ -1068,6 +1095,21 @@ void i40evf_virtchnl_completion(struct i40evf_adapter *adapter,
"Invalid message %d from PF\n", v_opcode);
}
break;
+ case VIRTCHNL_OP_REQUEST_QUEUES: {
+ struct virtchnl_vf_res_request *vfres =
+ (struct virtchnl_vf_res_request *)msg;
+ if (vfres->num_queue_pairs == adapter->num_req_queues) {
+ adapter->flags |= I40EVF_FLAG_REINIT_ITR_NEEDED;
+ i40evf_schedule_reset(adapter);
+ } else {
+ dev_info(&adapter->pdev->dev,
+ "Requested %d queues, PF can support %d\n",
+ adapter->num_req_queues,
+ vfres->num_queue_pairs);
+ adapter->num_req_queues = 0;
+ }
+ }
+ break;
default:
if (adapter->current_op && (v_opcode != adapter->current_op))
dev_warn(&adapter->pdev->dev, "Expected response %d from PF, received %d\n",
--
2.14.2
^ permalink raw reply related
* [net-next 08/15] i40e: drop i40e_pf *pf from i40e_vc_disable_vf()
From: Jeff Kirsher @ 2017-10-02 19:48 UTC (permalink / raw)
To: davem; +Cc: Jacob Keller, netdev, nhorman, sassmann, jogreene, Jeff Kirsher
In-Reply-To: <20171002194852.71970-1-jeffrey.t.kirsher@intel.com>
From: Jacob Keller <jacob.e.keller@intel.com>
It's never used, and the vf structure could get back to the PF if
necessary. Lets just drop the extra unneeded parameter.
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index 53ead127b293..70a79864177a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -154,12 +154,11 @@ void i40e_vc_notify_vf_reset(struct i40e_vf *vf)
/**
* i40e_vc_disable_vf
- * @pf: pointer to the PF info
* @vf: pointer to the VF info
*
* Disable the VF through a SW reset
**/
-static inline void i40e_vc_disable_vf(struct i40e_pf *pf, struct i40e_vf *vf)
+static inline void i40e_vc_disable_vf(struct i40e_vf *vf)
{
i40e_vc_notify_vf_reset(vf);
i40e_reset_vf(vf, false);
@@ -2918,7 +2917,7 @@ int i40e_ndo_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac)
}
/* Force the VF driver stop so it has to reload with new MAC address */
- i40e_vc_disable_vf(pf, vf);
+ i40e_vc_disable_vf(vf);
dev_info(&pf->pdev->dev, "Reload the VF driver to make this change effective.\n");
error_param:
@@ -3013,7 +3012,7 @@ int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id,
* the right thing by reconfiguring his network correctly
* and then reloading the VF driver.
*/
- i40e_vc_disable_vf(pf, vf);
+ i40e_vc_disable_vf(vf);
/* During reset the VF got a new VSI, so refresh the pointer. */
vsi = pf->vsi[vf->lan_vsi_idx];
}
--
2.14.2
^ permalink raw reply related
* [net-next 14/15] i40e: fix client notify of VF reset
From: Jeff Kirsher @ 2017-10-02 19:48 UTC (permalink / raw)
To: davem; +Cc: Alan Brady, netdev, nhorman, sassmann, jogreene, Jeff Kirsher
In-Reply-To: <20171002194852.71970-1-jeffrey.t.kirsher@intel.com>
From: Alan Brady <alan.brady@intel.com>
Currently there is a bug in which the PF driver fails to inform clients
of a VF reset which then causes clients to leak resources. The bug
exists because we were incorrectly checking the I40E_VF_STATE_PRE_ENABLE
bit.
When a VF is first init we go through a reset to initialize variables
and allocate resources but we don't want to inform clients of this first
reset since the client isn't fully enabled yet so we set a state bit
signifying we're in a "pre-enabled" client state. During the first
reset we should be clearing the bit, allowing all following resets to
notify the client of the reset when the bit is not set. This patch
fixes the issue by negating the 'test_and_clear_bit' check to accurately
reflect the behavior we want.
Signed-off-by: Alan Brady <alan.brady@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index 989a65d60ac9..04568137e029 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -1050,8 +1050,8 @@ static void i40e_cleanup_reset_vf(struct i40e_vf *vf)
set_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states);
clear_bit(I40E_VF_STATE_DISABLED, &vf->vf_states);
/* Do not notify the client during VF init */
- if (test_and_clear_bit(I40E_VF_STATE_PRE_ENABLE,
- &vf->vf_states))
+ if (!test_and_clear_bit(I40E_VF_STATE_PRE_ENABLE,
+ &vf->vf_states))
i40e_notify_client_of_vf_reset(pf, abs_vf_id);
vf->num_vlan = 0;
}
--
2.14.2
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox