* Re: [PATCH net-next] openvswitch: add ct_clear action
From: Eric Garver @ 2017-10-10 15:09 UTC (permalink / raw)
To: Joe Stringer; +Cc: Pravin Shelar, Linux Kernel Network Developers, ovs dev
In-Reply-To: <CAPWQB7FgKYe4Ax08NzW97-WGmriC7j9YhxQF9QtuQZwMjA00bQ@mail.gmail.com>
On Tue, Oct 10, 2017 at 05:33:48AM -0700, Joe Stringer wrote:
> On 9 October 2017 at 21:41, Pravin Shelar <pshelar@ovn.org> wrote:
> > On Fri, Oct 6, 2017 at 9:44 AM, Eric Garver <e@erig.me> wrote:
> >> This adds a ct_clear action for clearing conntrack state. ct_clear is
> >> currently implemented in OVS userspace, but is not backed by an action
> >> in the kernel datapath. This is useful for flows that may modify a
> >> packet tuple after a ct lookup has already occurred.
> >>
> >> Signed-off-by: Eric Garver <e@erig.me>
> > Patch mostly looks good. I have following comments.
> >
> >> ---
> >> include/uapi/linux/openvswitch.h | 2 ++
> >> net/openvswitch/actions.c | 5 +++++
> >> net/openvswitch/conntrack.c | 12 ++++++++++++
> >> net/openvswitch/conntrack.h | 7 +++++++
> >> net/openvswitch/flow_netlink.c | 5 +++++
> >> 5 files changed, 31 insertions(+)
> >>
> >> diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
> >> index 156ee4cab82e..1b6e510e2cc6 100644
> >> --- a/include/uapi/linux/openvswitch.h
> >> +++ b/include/uapi/linux/openvswitch.h
> >> @@ -806,6 +806,7 @@ struct ovs_action_push_eth {
> >> * packet.
> >> * @OVS_ACTION_ATTR_POP_ETH: Pop the outermost Ethernet header off the
> >> * packet.
> >> + * @OVS_ACTION_ATTR_CT_CLEAR: Clear conntrack state from the packet.
> >> *
> >> * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all
> >> * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
> >> @@ -835,6 +836,7 @@ enum ovs_action_attr {
> >> OVS_ACTION_ATTR_TRUNC, /* u32 struct ovs_action_trunc. */
> >> OVS_ACTION_ATTR_PUSH_ETH, /* struct ovs_action_push_eth. */
> >> OVS_ACTION_ATTR_POP_ETH, /* No argument. */
> >> + OVS_ACTION_ATTR_CT_CLEAR, /* No argument. */
> >>
> >> __OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted
> >> * from userspace. */
> >> diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
> >> index a54a556fcdb5..db9c7f2e662b 100644
> >> --- a/net/openvswitch/actions.c
> >> +++ b/net/openvswitch/actions.c
> >> @@ -1203,6 +1203,10 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
> >> return err == -EINPROGRESS ? 0 : err;
> >> break;
> >>
> >> + case OVS_ACTION_ATTR_CT_CLEAR:
> >> + err = ovs_ct_clear(skb, key);
> >> + break;
> >> +
> >> case OVS_ACTION_ATTR_PUSH_ETH:
> >> err = push_eth(skb, key, nla_data(a));
> >> break;
> >> @@ -1210,6 +1214,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
> >> case OVS_ACTION_ATTR_POP_ETH:
> >> err = pop_eth(skb, key);
> >> break;
> >> +
> >> }
> > Unrelated change.
> >
> >>
> >> if (unlikely(err)) {
> >> diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
> >> index d558e882ca0c..f9b73c726ad7 100644
> >> --- a/net/openvswitch/conntrack.c
> >> +++ b/net/openvswitch/conntrack.c
> >> @@ -1129,6 +1129,18 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
> >> return err;
> >> }
> >>
> >> +int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key)
> >> +{
> >> + if (skb_nfct(skb)) {
> >> + nf_conntrack_put(skb_nfct(skb));
> >> + nf_ct_set(skb, NULL, 0);
> > Can the new conntract state be appropriate? may be IP_CT_UNTRACKED?
> >
> >> + }
> >> +
> >> + ovs_ct_fill_key(skb, key);
> >> +
> > I do not see need to refill the key if there is no skb-nf-ct.
>
> Really this is trying to just zero the CT key fields, but reuses
> existing functions, right? This means that subsequent upcalls, for
Right.
> instance, won't have the outdated view of the CT state from the
> previous lookup (that was prior to the ct_clear). I'd expect these key
> fields to be cleared.
I assumed Pravin was saying that we don't need to clear them if there is
no conntrack state. They should already be zero.
^ permalink raw reply
* [PATCH v4 net-next] rtnetlink: bridge: use ext_ack instead of printk
From: Florian Westphal @ 2017-10-10 15:10 UTC (permalink / raw)
To: netdev; +Cc: Florian Westphal, David Ahern
We can now piggyback error strings to userspace via extended acks
rather than using printk.
Before:
bridge fdb add 01:02:03:04:05:06 dev br0 vlan 4095
RTNETLINK answers: Invalid argument
After:
bridge fdb add 01:02:03:04:05:06 dev br0 vlan 4095
Error: invalid vlan id.
v3: drop 'RTM_' prefixes, suggested by David Ahern, they
are not useful, the add/del in bridge command line is enough.
Also reword error in response to malformed/bad vlan id attribute
size.
Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
change since v3: forgot to remove "RTM_SETLINK:" prefix in error message.
net/core/rtnetlink.c | 28 ++++++++++++++--------------
1 file changed, 14 insertions(+), 14 deletions(-)
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e84d108cfee4..6a09f3d575af 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3066,21 +3066,21 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm,
}
EXPORT_SYMBOL(ndo_dflt_fdb_add);
-static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid)
+static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid,
+ struct netlink_ext_ack *extack)
{
u16 vid = 0;
if (vlan_attr) {
if (nla_len(vlan_attr) != sizeof(u16)) {
- pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan\n");
+ NL_SET_ERR_MSG(extack, "invalid vlan attribute size");
return -EINVAL;
}
vid = nla_get_u16(vlan_attr);
if (!vid || vid >= VLAN_VID_MASK) {
- pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan id %d\n",
- vid);
+ NL_SET_ERR_MSG(extack, "invalid vlan id");
return -EINVAL;
}
}
@@ -3105,24 +3105,24 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
ndm = nlmsg_data(nlh);
if (ndm->ndm_ifindex == 0) {
- pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid ifindex\n");
+ NL_SET_ERR_MSG(extack, "invalid ifindex");
return -EINVAL;
}
dev = __dev_get_by_index(net, ndm->ndm_ifindex);
if (dev == NULL) {
- pr_info("PF_BRIDGE: RTM_NEWNEIGH with unknown ifindex\n");
+ NL_SET_ERR_MSG(extack, "unknown ifindex");
return -ENODEV;
}
if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
- pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid address\n");
+ NL_SET_ERR_MSG(extack, "invalid address");
return -EINVAL;
}
addr = nla_data(tb[NDA_LLADDR]);
- err = fdb_vid_parse(tb[NDA_VLAN], &vid);
+ err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack);
if (err)
return err;
@@ -3209,24 +3209,24 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
ndm = nlmsg_data(nlh);
if (ndm->ndm_ifindex == 0) {
- pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid ifindex\n");
+ NL_SET_ERR_MSG(extack, "invalid ifindex");
return -EINVAL;
}
dev = __dev_get_by_index(net, ndm->ndm_ifindex);
if (dev == NULL) {
- pr_info("PF_BRIDGE: RTM_DELNEIGH with unknown ifindex\n");
+ NL_SET_ERR_MSG(extack, "unknown ifindex");
return -ENODEV;
}
if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
- pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid address\n");
+ NL_SET_ERR_MSG(extack, "invalid address");
return -EINVAL;
}
addr = nla_data(tb[NDA_LLADDR]);
- err = fdb_vid_parse(tb[NDA_VLAN], &vid);
+ err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack);
if (err)
return err;
@@ -3666,7 +3666,7 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
dev = __dev_get_by_index(net, ifm->ifi_index);
if (!dev) {
- pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n");
+ NL_SET_ERR_MSG(extack, "unknown ifindex");
return -ENODEV;
}
@@ -3741,7 +3741,7 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
dev = __dev_get_by_index(net, ifm->ifi_index);
if (!dev) {
- pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n");
+ NL_SET_ERR_MSG(extack, "unknown ifindex");
return -ENODEV;
}
--
2.13.6
^ permalink raw reply related
* Re: [PATCH v4 net-next] rtnetlink: bridge: use ext_ack instead of printk
From: David Ahern @ 2017-10-10 15:11 UTC (permalink / raw)
To: Florian Westphal, netdev
In-Reply-To: <20171010151004.20056-1-fw@strlen.de>
On 10/10/17 9:10 AM, Florian Westphal wrote:
> We can now piggyback error strings to userspace via extended acks
> rather than using printk.
>
> Before:
> bridge fdb add 01:02:03:04:05:06 dev br0 vlan 4095
> RTNETLINK answers: Invalid argument
>
> After:
> bridge fdb add 01:02:03:04:05:06 dev br0 vlan 4095
> Error: invalid vlan id.
>
> v3: drop 'RTM_' prefixes, suggested by David Ahern, they
> are not useful, the add/del in bridge command line is enough.
>
> Also reword error in response to malformed/bad vlan id attribute
> size.
>
> Cc: David Ahern <dsahern@gmail.com>
> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
> change since v3: forgot to remove "RTM_SETLINK:" prefix in error message.
>
> net/core/rtnetlink.c | 28 ++++++++++++++--------------
> 1 file changed, 14 insertions(+), 14 deletions(-)
Reviewed-by: David Ahern <dsahern@gmail.com>
^ permalink raw reply
* RE: [patch net-next 2/4] net: sched: introduce per-egress action device callbacks
From: David Laight @ 2017-10-10 15:12 UTC (permalink / raw)
To: 'Jiri Pirko'
Cc: netdev@vger.kernel.org, davem@davemloft.net, jhs@mojatatu.com,
xiyou.wangcong@gmail.com, saeedm@mellanox.com,
matanb@mellanox.com, leonro@mellanox.com, mlxsw@mellanox.com
In-Reply-To: <20171010143152.GG2033@nanopsycho>
From: Jiri Pirko
> Sent: 10 October 2017 15:32
> To: David Laight
> Cc: netdev@vger.kernel.org; davem@davemloft.net; jhs@mojatatu.com; xiyou.wangcong@gmail.com;
> saeedm@mellanox.com; matanb@mellanox.com; leonro@mellanox.com; mlxsw@mellanox.com
> Subject: Re: [patch net-next 2/4] net: sched: introduce per-egress action device callbacks
>
> Tue, Oct 10, 2017 at 03:31:59PM CEST, David.Laight@ACULAB.COM wrote:
> >From: Jiri Pirko
> >> Sent: 10 October 2017 08:30
> >> Introduce infrastructure that allows drivers to register callbacks that
> >> are called whenever tc would offload inserted rule and specified device
> >> acts as tc action egress device.
> >
> >How does a driver safely unregister a callback?
> >(to avoid a race with the callback being called.)
> >
> >Usually this requires a callback in the context that makes the
> >notification callbacks indicating that no more such callbacks
> >will be made.
>
> rtnl is your answer. It is being held during register/unregister/cb
Do you mean 'acquired during register/unregister' and 'held across the
callback' ?
So the unregister sleeps (or spins?) until any callbacks complete?
So the driver mustn't hold any locks (etc) across the unregister that
it acquires in the callback.
That ought to be noted somewhere.
David
^ permalink raw reply
* RE: [PATCH v2 net-next 1/2] net: dsa: lan9303: Move tag setup to new lan9303_setup_tagging
From: Woojung.Huh @ 2017-10-10 15:14 UTC (permalink / raw)
To: privat, andrew, vivien.didelot, f.fainelli, netdev, linux-kernel
In-Reply-To: <20171010124953.386-2-privat@egil-hjelmeland.no>
> +/* forward special tagged packets from port 0 to port 1 *or* port 2 */
> +static int lan9303_setup_tagging(struct lan9303 *chip)
> +{
> + int ret;
> + u32 val;
> + /* enable defining the destination port via special VLAN tagging
> + * for port 0
> + */
> + ret = lan9303_write_switch_reg(chip,
> LAN9303_SWE_INGRESS_PORT_TYPE,
> +
> LAN9303_SWE_INGRESS_PORT_TYPE_VLAN);
> + if (ret)
> + return ret;
> +
> + /* tag incoming packets at port 1 and 2 on their way to port 0 to be
> + * able to discover their source port
> + */
> + val = LAN9303_BM_EGRSS_PORT_TYPE_SPECIAL_TAG_PORT0;
> + return lan9303_write_switch_reg(chip,
> LAN9303_BM_EGRSS_PORT_TYPE, val);
Specific reason to use val then using LAN9303_BM_EGRSS_PORT_TYPE_SPECIAL_TAG_PORT0
like previous line?
> @@ -644,6 +648,10 @@ static int lan9303_setup(struct dsa_switch *ds)
> return -EINVAL;
> }
>
> + ret = lan9303_setup_tagging(chip);
> + if (ret)
> + dev_err(chip->dev, "failed to setup port tagging %d\n", ret);
> +
Still move on when error happens?
> ret = lan9303_separate_ports(chip);
> if (ret)
> dev_err(chip->dev, "failed to separate ports %d\n", ret);
> --
> 2.11.0
- Woojung
^ permalink raw reply
* [net 0/2][pull request] Intel Wired LAN Driver Updates 2017-10-10
From: Jeff Kirsher @ 2017-10-10 15:14 UTC (permalink / raw)
To: davem; +Cc: Jeff Kirsher, netdev, nhorman, sassmann, jogreene
This series contains updates to i40e only.
Stefano Brivio fixes the grammar in a function header comment.
Alex fixes a memory leak where we were not correctly placing the pages
from buffers that had been used to return a filter programming status
back on the ring.
The following are changes since commit 529a86e063e9ff625c4ff247d8aa17d8072444fb:
Merge branch 'ppc-bundle' (bundle from Michael Ellerman)
and are available in the git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/net-queue 40GbE
Alexander Duyck (1):
i40e: Fix memory leak related filter programming status
Stefano Brivio (1):
i40e: Fix comment about locking for __i40e_read_nvm_word()
drivers/net/ethernet/intel/i40e/i40e_nvm.c | 2 +-
drivers/net/ethernet/intel/i40e/i40e_txrx.c | 63 ++++++++++++++++-------------
2 files changed, 37 insertions(+), 28 deletions(-)
--
2.14.2
^ permalink raw reply
* [net 1/2] i40e: Fix comment about locking for __i40e_read_nvm_word()
From: Jeff Kirsher @ 2017-10-10 15:14 UTC (permalink / raw)
To: davem; +Cc: Stefano Brivio, netdev, nhorman, sassmann, jogreene, Jeff Kirsher
In-Reply-To: <20171010151416.43149-1-jeffrey.t.kirsher@intel.com>
From: Stefano Brivio <sbrivio@redhat.com>
Caller needs to acquire the lock. Called functions will not.
Fixes: 09f79fd49d94 ("i40e: avoid NVM acquire deadlock during NVM update")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
drivers/net/ethernet/intel/i40e/i40e_nvm.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_nvm.c b/drivers/net/ethernet/intel/i40e/i40e_nvm.c
index 57505b1df98d..d591b3e6bd7c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_nvm.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_nvm.c
@@ -298,7 +298,7 @@ static i40e_status i40e_read_nvm_word_aq(struct i40e_hw *hw, u16 offset,
}
/**
- * __i40e_read_nvm_word - Reads nvm word, assumes called does the locking
+ * __i40e_read_nvm_word - Reads nvm word, assumes caller does the locking
* @hw: pointer to the HW structure
* @offset: offset of the Shadow RAM word to read (0x000000 - 0x001FFF)
* @data: word read from the Shadow RAM
--
2.14.2
^ permalink raw reply related
* [net 2/2] i40e: Fix memory leak related filter programming status
From: Jeff Kirsher @ 2017-10-10 15:14 UTC (permalink / raw)
To: davem; +Cc: Alexander Duyck, netdev, nhorman, sassmann, jogreene,
Jeff Kirsher
In-Reply-To: <20171010151416.43149-1-jeffrey.t.kirsher@intel.com>
From: Alexander Duyck <alexander.h.duyck@intel.com>
It looks like we weren't correctly placing the pages from buffers that had
been used to return a filter programming status back on the ring. As a
result they were being overwritten and tracking of the pages was lost.
This change works to correct that by incorporating part of
i40e_put_rx_buffer into the programming status handler code. As a result we
should now be correctly placing the pages for those buffers on the
re-allocation list instead of letting them stay in place.
Fixes: 0e626ff7ccbf ("i40e: Fix support for flow director programming status")
Reported-by: Anders K. Pedersen <akp@cohaesio.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Anders K Pedersen <akp@cohaesio.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
drivers/net/ethernet/intel/i40e/i40e_txrx.c | 63 ++++++++++++++++-------------
1 file changed, 36 insertions(+), 27 deletions(-)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 1519dfb851d0..2756131495f0 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -1037,6 +1037,32 @@ static bool i40e_set_new_dynamic_itr(struct i40e_ring_container *rc)
return false;
}
+/**
+ * i40e_reuse_rx_page - page flip buffer and store it back on the ring
+ * @rx_ring: rx descriptor ring to store buffers on
+ * @old_buff: donor buffer to have page reused
+ *
+ * Synchronizes page for reuse by the adapter
+ **/
+static void i40e_reuse_rx_page(struct i40e_ring *rx_ring,
+ struct i40e_rx_buffer *old_buff)
+{
+ struct i40e_rx_buffer *new_buff;
+ u16 nta = rx_ring->next_to_alloc;
+
+ new_buff = &rx_ring->rx_bi[nta];
+
+ /* update, and store next to alloc */
+ nta++;
+ rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
+
+ /* transfer page from old buffer to new buffer */
+ new_buff->dma = old_buff->dma;
+ new_buff->page = old_buff->page;
+ new_buff->page_offset = old_buff->page_offset;
+ new_buff->pagecnt_bias = old_buff->pagecnt_bias;
+}
+
/**
* i40e_rx_is_programming_status - check for programming status descriptor
* @qw: qword representing status_error_len in CPU ordering
@@ -1071,15 +1097,24 @@ static void i40e_clean_programming_status(struct i40e_ring *rx_ring,
union i40e_rx_desc *rx_desc,
u64 qw)
{
- u32 ntc = rx_ring->next_to_clean + 1;
+ struct i40e_rx_buffer *rx_buffer;
+ u32 ntc = rx_ring->next_to_clean;
u8 id;
/* fetch, update, and store next to clean */
+ rx_buffer = &rx_ring->rx_bi[ntc++];
ntc = (ntc < rx_ring->count) ? ntc : 0;
rx_ring->next_to_clean = ntc;
prefetch(I40E_RX_DESC(rx_ring, ntc));
+ /* place unused page back on the ring */
+ i40e_reuse_rx_page(rx_ring, rx_buffer);
+ rx_ring->rx_stats.page_reuse_count++;
+
+ /* clear contents of buffer_info */
+ rx_buffer->page = NULL;
+
id = (qw & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >>
I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT;
@@ -1638,32 +1673,6 @@ static bool i40e_cleanup_headers(struct i40e_ring *rx_ring, struct sk_buff *skb,
return false;
}
-/**
- * i40e_reuse_rx_page - page flip buffer and store it back on the ring
- * @rx_ring: rx descriptor ring to store buffers on
- * @old_buff: donor buffer to have page reused
- *
- * Synchronizes page for reuse by the adapter
- **/
-static void i40e_reuse_rx_page(struct i40e_ring *rx_ring,
- struct i40e_rx_buffer *old_buff)
-{
- struct i40e_rx_buffer *new_buff;
- u16 nta = rx_ring->next_to_alloc;
-
- new_buff = &rx_ring->rx_bi[nta];
-
- /* update, and store next to alloc */
- nta++;
- rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
-
- /* transfer page from old buffer to new buffer */
- new_buff->dma = old_buff->dma;
- new_buff->page = old_buff->page;
- new_buff->page_offset = old_buff->page_offset;
- new_buff->pagecnt_bias = old_buff->pagecnt_bias;
-}
-
/**
* i40e_page_is_reusable - check if any reuse is possible
* @page: page struct to check
--
2.14.2
^ permalink raw reply related
* Re: RIF/VRF overflow in spectrum and reporting errors back to user
From: David Ahern @ 2017-10-10 15:23 UTC (permalink / raw)
To: Ido Schimmel; +Cc: Ido Schimmel, Jiri Pirko, netdev@vger.kernel.org
In-Reply-To: <20171009093110.GA5193@shredder.mtl.com>
On 10/9/17 3:31 AM, Ido Schimmel wrote:
> Hi David,
>
> On Sun, Oct 08, 2017 at 02:10:33PM -0600, David Ahern wrote:
>> Jiri / Ido:
>>
>> I am looking at adding user messages for spectrum failures related to
>> RIF and VRF overflow coming from the inetaddr and inet6addr notifier
>> paths. The key is that if the notifiers fail the address add needs to
>> fail and an error reported to the user as to what happened.
>
> Thanks for working on this. Very nice idea!
>
>> Earlier this year 3ad7d2468f79f added in_validator_info and
>> in6_validator_info as a way for the notifiers to fail adding an address.
>> Adding support to spectrum for that notifier is complicated by the fact
>> that the validator notifier and address notifiers will come in back to
>> back for the NETDEV_UP case. Ignoring NETDEV_UP in
>> mlxsw_sp_inetaddr_event seems ok for IPv6 but not clear for IPv4 since
>> the NETDEV_UP case is emitted on an address delete that involves a
>> promotion. Handling the back to back NETDEV_UP is complicated since
>> functions invoked by __mlxsw_sp_inetaddr_event can take multiple
>> references. Specifically, in mlxsw_sp_port_vlan_router_join():
>> fid = rif->ops->fid_get(rif);
>>
>> Can NETDEV_UP be ignored for the inetaddr notifier if it is handled by
>> the validator notitifer?
>
> Yes. The case where we get a NETDEV_DOWN for an address delete and then
> a NETDEV_UP for a promotion is basically a NOP from the driver's
> perspective. When the NETDEV_DOWN is received, the RIF isn't destroyed
> because the address list isn't empty (there's an address to be
> promoted). When the NETDEV_UP is received, it's ignored because we
> already have a RIF.
You lost me on the RIF. Looking at the chain:
mlxsw_sp_inet6addr_event_work or mlxsw_sp_inetaddr_event
- __mlxsw_sp_inetaddr_event
+ mlxsw_sp_inetaddr_vlan_event
* mlxsw_sp_inetaddr_port_vlan_event
- NETDEV_UP: mlxsw_sp_port_vlan_router_join
mlxsw_sp_port_vlan_router_join does the rif lookup and if it exists
calls fid_get() which takes a reference. I read that to mean
back-to-back NETDEV_UP notifiers (the address validator and then the
address notifier) would lead to a reference count leak.
Based on your address delete comment, I take the IPv4 solution to be
adding the validator notifier to spectrum and then ignoring NETDEV_UP in
mlxsw_sp_inetaddr_event. That means IPv4 inetaddr work is done for the
validator notifier while NETDEV_DOWN is done through the inetaddr notifier.
>
> Regarding IPv6, it's a bit more complicated actually, since we do the
> actual work in a workqueue, as the notification chain is atomic. I
> believe this is because the notifier can be called from softirq in
> response to RA packets.
>
> However, this case isn't interesting for mlxsw, as the fact that you
> process an RA packet suggests you already have a link-local address and
> thus a RIF. Plus, the kernel won't even process such packets in our case
> as you most likely have forwarding enabled (unless you tweaked accept_ra
> for some reason).
>
> Looking at ipvlan (the only user of inet6addr_validator_chain), I see
> that it ignores this specific case and returns NOTIFY_DONE. Maybe we can
> move this notification chain to be blocking and not call it in response
> to RA packets seeing that all its users ignore it?
Seems reasonable to me.
I have it coded. Let me test and send an rfc.
^ permalink raw reply
* Re: [PATCH RFC 0/3] tun zerocopy stats
From: Willem de Bruijn @ 2017-10-10 15:29 UTC (permalink / raw)
To: David Miller
Cc: Network Development, Michael S. Tsirkin, Jason Wang,
Willem de Bruijn
In-Reply-To: <20171009.205228.714368596112967819.davem@davemloft.net>
On Mon, Oct 9, 2017 at 11:52 PM, David Miller <davem@davemloft.net> wrote:
> From: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
> Date: Fri, 6 Oct 2017 18:25:13 -0400
>
>> From: Willem de Bruijn <willemb@google.com>
>>
>> Add zerocopy transfer statistics to the vhost_net/tun zerocopy path.
>>
>> I've been using this to verify recent changes to zerocopy tuning [1].
>> Sharing more widely, as it may be useful in similar future work.
>>
>> Use ethtool stats as interface, as these are defined per device
>> driver and can easily be extended.
>>
>> Make the zerocopy release callback take an extra hop through the tun
>> driver to allow the driver to increment its counters.
>>
>> Care must be taken to avoid adding an alloc/free to this hot path.
>> Since the caller already must allocate a ubuf_info, make it allocate
>> two at a time and grant one to the tun device.
>>
>> 1/3: introduce ethtool stats (`ethtool -S $DEV`) for tun devices
>> 2/3: add zerocopy tx and tx_err counters
>> 3/3: convert vhost_net to pass a pair of ubuf_info to tun
>>
>> [1] http://patchwork.ozlabs.org/patch/822613/
>
> This looks mostly fine to me, but I don't know enough about how vhost
> and tap interact to tell whether this makes sense to upstream.
Thanks for taking a look. The need for monitoring these stats has
come up in a couple of patch evaluation discussions, so I wanted
to share at least one implementation to get the data.
Because the choice to use zerocopy is based on heuristics and
there is a cost if it mispredicts, I think we even want to being able
to continuously monitor this in production.
The implementation is probably not ready for that as is.
> What are the runtime costs for these new statistics?
It currently doubles the size of the ubuf_info memory pool. That can be
fixed, as the current size is UIO_MAXIOV (1024), but the number of
zerocopy packets in flight is bound by VHOST_MAX_PEND (128).
It also adds an indirect function call to call to each zerocopy skb free
path, though.
If there is a way to expose these stats through vhost_net directly,
instead of through tun, that may be better. But I did not see a
suitable interface. Perhaps debugfs.
^ permalink raw reply
* Re: [PATCH v2 net-next 1/2] net: dsa: lan9303: Move tag setup to new lan9303_setup_tagging
From: Egil Hjelmeland @ 2017-10-10 15:30 UTC (permalink / raw)
To: Woojung.Huh, andrew, vivien.didelot, f.fainelli, netdev,
linux-kernel
In-Reply-To: <9235D6609DB808459E95D78E17F2E43D40B48C3D@CHN-SV-EXMX02.mchp-main.com>
On 10. okt. 2017 17:14, Woojung.Huh@microchip.com wrote:
>> +/* forward special tagged packets from port 0 to port 1 *or* port 2 */
>> +static int lan9303_setup_tagging(struct lan9303 *chip)
>> +{
>> + int ret;
>> + u32 val;
>> + /* enable defining the destination port via special VLAN tagging
>> + * for port 0
>> + */
>> + ret = lan9303_write_switch_reg(chip,
>> LAN9303_SWE_INGRESS_PORT_TYPE,
>> +
>> LAN9303_SWE_INGRESS_PORT_TYPE_VLAN);
>> + if (ret)
>> + return ret;
>> +
>> + /* tag incoming packets at port 1 and 2 on their way to port 0 to be
>> + * able to discover their source port
>> + */
>> + val = LAN9303_BM_EGRSS_PORT_TYPE_SPECIAL_TAG_PORT0;
>> + return lan9303_write_switch_reg(chip,
>> LAN9303_BM_EGRSS_PORT_TYPE, val);
> Specific reason to use val then using LAN9303_BM_EGRSS_PORT_TYPE_SPECIAL_TAG_PORT0
> like previous line?
>
Specific reason was to please a reviewer that did not like my
indenting in first version. I did not agree with him, but since
nobody else spoke up, I changed the code.
>> @@ -644,6 +648,10 @@ static int lan9303_setup(struct dsa_switch *ds)
>> return -EINVAL;
>> }
>>
>> + ret = lan9303_setup_tagging(chip);
>> + if (ret)
>> + dev_err(chip->dev, "failed to setup port tagging %d\n", ret);
>> +
> Still move on when error happens?
>
Good question. I just followed the pattern from the original function,
which was not made by me. Actually I did once reflect on whether this
was the correct way. Perhaps it could be argued that it is better to
allow the device to come up, so the problem can be investigated?
>> ret = lan9303_separate_ports(chip);
>> if (ret)
>> dev_err(chip->dev, "failed to separate ports %d\n", ret);
>> --
>> 2.11.0
>
> - Woojung
>
^ permalink raw reply
* Re: [patch net-next 2/4] net: sched: introduce per-egress action device callbacks
From: Jiri Pirko @ 2017-10-10 15:39 UTC (permalink / raw)
To: David Laight
Cc: netdev@vger.kernel.org, davem@davemloft.net, jhs@mojatatu.com,
xiyou.wangcong@gmail.com, saeedm@mellanox.com,
matanb@mellanox.com, leonro@mellanox.com, mlxsw@mellanox.com
In-Reply-To: <063D6719AE5E284EB5DD2968C1650D6DD008F06E@AcuExch.aculab.com>
Tue, Oct 10, 2017 at 05:12:34PM CEST, David.Laight@ACULAB.COM wrote:
>From: Jiri Pirko
>> Sent: 10 October 2017 15:32
>> To: David Laight
>> Cc: netdev@vger.kernel.org; davem@davemloft.net; jhs@mojatatu.com; xiyou.wangcong@gmail.com;
>> saeedm@mellanox.com; matanb@mellanox.com; leonro@mellanox.com; mlxsw@mellanox.com
>> Subject: Re: [patch net-next 2/4] net: sched: introduce per-egress action device callbacks
>>
>> Tue, Oct 10, 2017 at 03:31:59PM CEST, David.Laight@ACULAB.COM wrote:
>> >From: Jiri Pirko
>> >> Sent: 10 October 2017 08:30
>> >> Introduce infrastructure that allows drivers to register callbacks that
>> >> are called whenever tc would offload inserted rule and specified device
>> >> acts as tc action egress device.
>> >
>> >How does a driver safely unregister a callback?
>> >(to avoid a race with the callback being called.)
>> >
>> >Usually this requires a callback in the context that makes the
>> >notification callbacks indicating that no more such callbacks
>> >will be made.
>>
>> rtnl is your answer. It is being held during register/unregister/cb
>
>Do you mean 'acquired during register/unregister' and 'held across the
>callback' ?
>
>So the unregister sleeps (or spins?) until any callbacks complete?
>So the driver mustn't hold any locks (etc) across the unregister that
>it acquires in the callback.
>That ought to be noted somewhere.
You actually have a point. I don't take rtnl for reg/unreg as I suppose
to. Will fix.
>
> David
>
^ permalink raw reply
* Re: RIF/VRF overflow in spectrum and reporting errors back to user
From: Ido Schimmel @ 2017-10-10 15:47 UTC (permalink / raw)
To: David Ahern; +Cc: Ido Schimmel, Jiri Pirko, netdev@vger.kernel.org
In-Reply-To: <779c2ba6-fd29-b260-f008-b41a607acd41@gmail.com>
On Tue, Oct 10, 2017 at 09:23:48AM -0600, David Ahern wrote:
> On 10/9/17 3:31 AM, Ido Schimmel wrote:
> >> Can NETDEV_UP be ignored for the inetaddr notifier if it is handled by
> >> the validator notitifer?
> >
> > Yes. The case where we get a NETDEV_DOWN for an address delete and then
> > a NETDEV_UP for a promotion is basically a NOP from the driver's
> > perspective. When the NETDEV_DOWN is received, the RIF isn't destroyed
> > because the address list isn't empty (there's an address to be
> > promoted). When the NETDEV_UP is received, it's ignored because we
> > already have a RIF.
>
> You lost me on the RIF. Looking at the chain:
>
> mlxsw_sp_inet6addr_event_work or mlxsw_sp_inetaddr_event
> - __mlxsw_sp_inetaddr_event
> + mlxsw_sp_inetaddr_vlan_event
> * mlxsw_sp_inetaddr_port_vlan_event
> - NETDEV_UP: mlxsw_sp_port_vlan_router_join
>
> mlxsw_sp_port_vlan_router_join does the rif lookup and if it exists
> calls fid_get() which takes a reference. I read that to mean
> back-to-back NETDEV_UP notifiers (the address validator and then the
> address notifier) would lead to a reference count leak.
>
> Based on your address delete comment, I take the IPv4 solution to be
> adding the validator notifier to spectrum and then ignoring NETDEV_UP in
> mlxsw_sp_inetaddr_event. That means IPv4 inetaddr work is done for the
> validator notifier while NETDEV_DOWN is done through the inetaddr notifier.
Exactly. The only NETDEV_UP we "miss" is the one sent for the promoted
address in the inetaddr chain, but it's irrelevant because when we got
the preceding NETDEV_DOWN for the deleted primary address we didn't
destroy the RIF as the address list wasn't empty (see
mlxsw_sp_rif_should_config() which is called by both top functions in
your call chain).
> > Regarding IPv6, it's a bit more complicated actually, since we do the
> > actual work in a workqueue, as the notification chain is atomic. I
> > believe this is because the notifier can be called from softirq in
> > response to RA packets.
> >
> > However, this case isn't interesting for mlxsw, as the fact that you
> > process an RA packet suggests you already have a link-local address and
> > thus a RIF. Plus, the kernel won't even process such packets in our case
> > as you most likely have forwarding enabled (unless you tweaked accept_ra
> > for some reason).
> >
> > Looking at ipvlan (the only user of inet6addr_validator_chain), I see
> > that it ignores this specific case and returns NOTIFY_DONE. Maybe we can
> > move this notification chain to be blocking and not call it in response
> > to RA packets seeing that all its users ignore it?
>
> Seems reasonable to me.
>
> I have it coded. Let me test and send an rfc.
Great. Looking forward to it.
^ permalink raw reply
* RE: [PATCH v2 net-next 1/2] net: dsa: lan9303: Move tag setup to new lan9303_setup_tagging
From: Woojung.Huh @ 2017-10-10 15:51 UTC (permalink / raw)
To: privat, andrew, vivien.didelot, f.fainelli, netdev, linux-kernel
In-Reply-To: <2d03fb99-fd1c-9a0b-3274-eda605c3eb2d@egil-hjelmeland.no>
> > Specific reason to use val then using
> LAN9303_BM_EGRSS_PORT_TYPE_SPECIAL_TAG_PORT0
> > like previous line?
> >
> Specific reason was to please a reviewer that did not like my
> indenting in first version. I did not agree with him, but since
> nobody else spoke up, I changed the code.
Got it. Missed previous patch/comment.
> >> @@ -644,6 +648,10 @@ static int lan9303_setup(struct dsa_switch *ds)
> >> return -EINVAL;
> >> }
> >>
> >> + ret = lan9303_setup_tagging(chip);
> >> + if (ret)
> >> + dev_err(chip->dev, "failed to setup port tagging %d\n", ret);
> >> +
> > Still move on when error happens?
> >
> Good question. I just followed the pattern from the original function,
> which was not made by me. Actually I did once reflect on whether this
> was the correct way. Perhaps it could be argued that it is better to
> allow the device to come up, so the problem can be investigated?
Maybe depends on severity of setting?
BTW, lan9303_setup() still returns ZERO at the end?
Thanks.
Woojung
^ permalink raw reply
* Re: [PATCH v2 net-next 1/2] net: dsa: lan9303: Move tag setup to new lan9303_setup_tagging
From: Vivien Didelot @ 2017-10-10 15:51 UTC (permalink / raw)
To: Egil Hjelmeland, Woojung.Huh, andrew, f.fainelli, netdev,
linux-kernel
In-Reply-To: <2d03fb99-fd1c-9a0b-3274-eda605c3eb2d@egil-hjelmeland.no>
Hi Egil,
Egil Hjelmeland <privat@egil-hjelmeland.no> writes:
>>> + val = LAN9303_BM_EGRSS_PORT_TYPE_SPECIAL_TAG_PORT0;
>>> + return lan9303_write_switch_reg(chip,
>>> LAN9303_BM_EGRSS_PORT_TYPE, val);
>> Specific reason to use val then using LAN9303_BM_EGRSS_PORT_TYPE_SPECIAL_TAG_PORT0
>> like previous line?
>>
> Specific reason was to please a reviewer that did not like my
> indenting in first version. I did not agree with him, but since
> nobody else spoke up, I changed the code.
Your indentation was broken and did not respect the Kernel Coding
Style. Using a temporary variable here ensures you respect both the
80-char limit and the vertical alignment on opening parenthesis.
You'd like to use ./scripts/checkpatch.pl before submitting patches.
Thanks,
Vivien
^ permalink raw reply
* Re: [PATCH RFC tip/core/rcu 14/15] netfilter: Remove now-redundant smp_read_barrier_depends()
From: Paul E. McKenney @ 2017-10-10 15:56 UTC (permalink / raw)
To: Peter Zijlstra
Cc: linux-kernel, mingo, torvalds, mark.rutland, dhowells, linux-arch,
will.deacon, Pablo Neira Ayuso, Jozsef Kadlecsik,
Florian Westphal, David S. Miller, netfilter-devel, coreteam,
netdev
In-Reply-To: <20171010084334.nbyhryiwyrl6km4u@hirez.programming.kicks-ass.net>
On Tue, Oct 10, 2017 at 10:43:34AM +0200, Peter Zijlstra wrote:
> On Mon, Oct 09, 2017 at 05:22:48PM -0700, Paul E. McKenney wrote:
> > READ_ONCE() now implies smp_read_barrier_depends(), which means that
> > the instances in arpt_do_table(), ipt_do_table(), and ip6t_do_table()
> > are now redundant. This commit removes them and adjusts the comments.
>
> Similar to the previous patch, the lack of READ_ONCE() in the original
> code is a pre-existing bug. It would allow the compiler to tear the load
> and observe a composite of two difference pointer values, or reload the
> private pointer and result in table_base and jumpstacl being part of
> different objects.
>
> It would be good to point out this actually fixes a bug in the code.
Assuming that these changes actually fixed something, agreed. ;-)
Thanx, Paul
^ permalink raw reply
* [PATCH net 0/2] nfp: fix ethtool stats and page allocation
From: Jakub Kicinski @ 2017-10-10 16:16 UTC (permalink / raw)
To: netdev; +Cc: oss-drivers, Jakub Kicinski
Hi!
Two fixes for net. First one makes sure we handle gather of stats on
32bit machines correctly (ouch). The second fix solves a potential
NULL-deref if we fail to allocate a page with XDP running.
I used Fixes: tags pointing to where the bug was introduced, but for
patch 1 it has been in the driver "for ever" and fix won't backport
cleanly beyond commit 325945ede6d4 ("nfp: split software and hardware
vNIC statistics") which is in net.
Jakub Kicinski (2):
nfp: fix ethtool stats gather retry
nfp: handle page allocation failures
drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 20 ++++++++++++++------
drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c | 8 +++++---
2 files changed, 19 insertions(+), 9 deletions(-)
--
2.14.1
^ permalink raw reply
* [PATCH net 1/2] nfp: fix ethtool stats gather retry
From: Jakub Kicinski @ 2017-10-10 16:16 UTC (permalink / raw)
To: netdev; +Cc: oss-drivers, Jakub Kicinski
In-Reply-To: <20171010161623.23838-1-jakub.kicinski@netronome.com>
The while loop fetching 64 bit ethtool statistics may have
to retry multiple times, it shouldn't modify the outside state.
Fixes: 4c3523623dc0 ("net: add driver for Netronome NFP4000/NFP6000 NIC VFs")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
---
drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
index 07969f06df10..dc016dfec64d 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
@@ -464,7 +464,7 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data)
do {
start = u64_stats_fetch_begin(&nn->r_vecs[i].rx_sync);
- *data++ = nn->r_vecs[i].rx_pkts;
+ data[0] = nn->r_vecs[i].rx_pkts;
tmp[0] = nn->r_vecs[i].hw_csum_rx_ok;
tmp[1] = nn->r_vecs[i].hw_csum_rx_inner_ok;
tmp[2] = nn->r_vecs[i].hw_csum_rx_error;
@@ -472,14 +472,16 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device *netdev, u64 *data)
do {
start = u64_stats_fetch_begin(&nn->r_vecs[i].tx_sync);
- *data++ = nn->r_vecs[i].tx_pkts;
- *data++ = nn->r_vecs[i].tx_busy;
+ data[1] = nn->r_vecs[i].tx_pkts;
+ data[2] = nn->r_vecs[i].tx_busy;
tmp[3] = nn->r_vecs[i].hw_csum_tx;
tmp[4] = nn->r_vecs[i].hw_csum_tx_inner;
tmp[5] = nn->r_vecs[i].tx_gather;
tmp[6] = nn->r_vecs[i].tx_lso;
} while (u64_stats_fetch_retry(&nn->r_vecs[i].tx_sync, start));
+ data += 3;
+
for (j = 0; j < NN_ET_RVEC_GATHER_STATS; j++)
gathered_stats[j] += tmp[j];
}
--
2.14.1
^ permalink raw reply related
* [PATCH net 2/2] nfp: handle page allocation failures
From: Jakub Kicinski @ 2017-10-10 16:16 UTC (permalink / raw)
To: netdev; +Cc: oss-drivers, Jakub Kicinski
In-Reply-To: <20171010161623.23838-1-jakub.kicinski@netronome.com>
page_address() does not handle NULL argument gracefully,
make sure we NULL-check the page pointer before passing it
to page_address().
Fixes: ecd63a0217d5 ("nfp: add XDP support in the driver")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
---
drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 20 ++++++++++++++------
1 file changed, 14 insertions(+), 6 deletions(-)
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 1c0187f0af51..e118b5f23996 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1180,10 +1180,14 @@ static void *nfp_net_rx_alloc_one(struct nfp_net_dp *dp, dma_addr_t *dma_addr)
{
void *frag;
- if (!dp->xdp_prog)
+ if (!dp->xdp_prog) {
frag = netdev_alloc_frag(dp->fl_bufsz);
- else
- frag = page_address(alloc_page(GFP_KERNEL | __GFP_COLD));
+ } else {
+ struct page *page;
+
+ page = alloc_page(GFP_KERNEL | __GFP_COLD);
+ frag = page ? page_address(page) : NULL;
+ }
if (!frag) {
nn_dp_warn(dp, "Failed to alloc receive page frag\n");
return NULL;
@@ -1203,10 +1207,14 @@ static void *nfp_net_napi_alloc_one(struct nfp_net_dp *dp, dma_addr_t *dma_addr)
{
void *frag;
- if (!dp->xdp_prog)
+ if (!dp->xdp_prog) {
frag = napi_alloc_frag(dp->fl_bufsz);
- else
- frag = page_address(alloc_page(GFP_ATOMIC | __GFP_COLD));
+ } else {
+ struct page *page;
+
+ page = alloc_page(GFP_ATOMIC | __GFP_COLD);
+ frag = page ? page_address(page) : NULL;
+ }
if (!frag) {
nn_dp_warn(dp, "Failed to alloc receive page frag\n");
return NULL;
--
2.14.1
^ permalink raw reply related
* Re: [RFC net 1/1] net: sched: act: fix rcu race in dump
From: Cong Wang @ 2017-10-10 16:40 UTC (permalink / raw)
To: Alexander Aring
Cc: Jamal Hadi Salim, Jiri Pirko, Linux Kernel Network Developers,
kurup.manish, Brenda Butler
In-Reply-To: <20171010123218.5251-2-aring@mojatatu.com>
On Tue, Oct 10, 2017 at 5:32 AM, Alexander Aring <aring@mojatatu.com> wrote:
> This patch fixes an issue with kfree_rcu which is not protected by RTNL
> lock. It could be that the current assigned rcu pointer will be freed by
> kfree_rcu while dump callback is running.
Why? kfree_rcu() respects existing readers, so why this could happen?
>
> To prevent this, we call rcu_synchronize at first. Then we are sure all
> latest rcu functions e.g. rcu_assign_pointer and kfree_rcu in init are
> done. After rcu_synchronize we dereference under RTNL lock which is also
> held in init function, which means no other rcu_assign_pointer or
> kfree_rcu will occur.
If you really want to wait for kfree_rcu(), rcu_barrier() is the one
instead of rcu_synchronize(). Just FYI.
>
> To call rcu_synchronize will also prevent weird behaviours by doing over
> netlink:
>
> - set params A
> - set params B
> - dump params
> \--> will dump params A
What's wrong with this? Existing readers could still read old data,
which is _perfectly_ fine as long as we don't free the old data before
they are gone.
^ permalink raw reply
* [RFC net-next 0/4] mlxsw: spectrum_router: Add extack messages for RIF and VRF overflow
From: David Ahern @ 2017-10-10 16:41 UTC (permalink / raw)
To: netdev; +Cc: jiri, idosch, kjlx, David Ahern
Currently, exceeding the number of VRF instances or the number of router
interfaces either fails with a non-intuitive EBUSY:
$ ip li set swp1s1.6 vrf vrf-1s1-6 up
RTNETLINK answers: Device or resource busy
or fails silently (IPv6) since the checks are done in a work queue. This
set adds support for the address validator notifier to spectrum which
allows ext-ack based messages to be returned on failure.
To make that happen the IPv6 version needs to be converted from atomic
to blocking (patch 1), and then support for extack needs to be added
to the notifier (patch 2). Patches 3 and 4 add the validator notifier
to spectrum and then plumb the extack argument.
With this set, VRF overflows fail with:
$ ip li set swp1s1.6 vrf vrf-1s1-6 up
Error: spectrum: Exceeded number of supported VRF.
and RIF overflows fail with:
$ ip addr add dev swp1s2.191 10.12.191.1/24
Error: spectrum: Exceeded number of supported router interfaces.
David Ahern (4):
net: ipv6: Make inet6addr_validator a blocking notifier
net: Add extack to validator_info structs used for address notifier
mlxsw: spectrum: router: Add support for address validator notifier
mlxsw: spectrum_router: Add extack message for RIF and VRF overflow
drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 10 ++
drivers/net/ethernet/mellanox/mlxsw/spectrum.h | 4 +
.../net/ethernet/mellanox/mlxsw/spectrum_router.c | 163 +++++++++++++++------
drivers/net/ipvlan/ipvlan_main.c | 10 +-
include/linux/inetdevice.h | 1 +
include/net/addrconf.h | 1 +
net/ipv4/devinet.c | 8 +-
net/ipv6/addrconf.c | 51 ++++---
net/ipv6/addrconf_core.c | 9 +-
9 files changed, 184 insertions(+), 73 deletions(-)
--
2.1.4
^ permalink raw reply
* [RFC net-next 1/4] net: ipv6: Make inet6addr_validator a blocking notifier
From: David Ahern @ 2017-10-10 16:41 UTC (permalink / raw)
To: netdev; +Cc: jiri, idosch, kjlx, David Ahern
In-Reply-To: <1507653665-20540-1-git-send-email-dsahern@gmail.com>
inet6addr_validator chain was added by commit 3ad7d2468f79f ("Ipvlan
should return an error when an address is already in use") to allow
address validation before changes are committed and to be able to
fail the address change with an error back to the user. The address
validation is not done for addresses received from router
advertisements.
Handling RAs in softirq context is the only reason for the notifier
chain to be atomic versus blocking. Since the only current user, ipvlan,
of the validator chain ignores softirq context, the notifier can be made
blocking and simply not invoked for softirq path.
The blocking option is needed by spectrum for example to validate
resources for an adding an address to an interface.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
net/ipv6/addrconf.c | 24 +++++++++++++++---------
net/ipv6/addrconf_core.c | 9 +++++----
2 files changed, 20 insertions(+), 13 deletions(-)
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index d9f6226694eb..632cf4b26277 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -963,7 +963,6 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
struct net *net = dev_net(idev->dev);
struct inet6_ifaddr *ifa = NULL;
struct rt6_info *rt;
- struct in6_validator_info i6vi;
unsigned int hash;
int err = 0;
int addr_type = ipv6_addr_type(addr);
@@ -988,16 +987,23 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
goto out2;
}
- i6vi.i6vi_addr = *addr;
- i6vi.i6vi_dev = idev;
- rcu_read_unlock_bh();
+ /* validator notifier needs to be blocking;
+ * do not call in softirq context
+ */
+ if (!in_softirq()) {
+ struct in6_validator_info i6vi = {
+ .i6vi_addr = *addr,
+ .i6vi_dev = idev,
+ };
- err = inet6addr_validator_notifier_call_chain(NETDEV_UP, &i6vi);
+ rcu_read_unlock_bh();
+ err = inet6addr_validator_notifier_call_chain(NETDEV_UP, &i6vi);
+ rcu_read_lock_bh();
- rcu_read_lock_bh();
- err = notifier_to_errno(err);
- if (err)
- goto out2;
+ err = notifier_to_errno(err);
+ if (err)
+ goto out2;
+ }
spin_lock(&addrconf_hash_lock);
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 9e3488d50b15..32b564dfd02a 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -88,7 +88,7 @@ int __ipv6_addr_type(const struct in6_addr *addr)
EXPORT_SYMBOL(__ipv6_addr_type);
static ATOMIC_NOTIFIER_HEAD(inet6addr_chain);
-static ATOMIC_NOTIFIER_HEAD(inet6addr_validator_chain);
+static BLOCKING_NOTIFIER_HEAD(inet6addr_validator_chain);
int register_inet6addr_notifier(struct notifier_block *nb)
{
@@ -110,19 +110,20 @@ EXPORT_SYMBOL(inet6addr_notifier_call_chain);
int register_inet6addr_validator_notifier(struct notifier_block *nb)
{
- return atomic_notifier_chain_register(&inet6addr_validator_chain, nb);
+ return blocking_notifier_chain_register(&inet6addr_validator_chain, nb);
}
EXPORT_SYMBOL(register_inet6addr_validator_notifier);
int unregister_inet6addr_validator_notifier(struct notifier_block *nb)
{
- return atomic_notifier_chain_unregister(&inet6addr_validator_chain, nb);
+ return blocking_notifier_chain_unregister(&inet6addr_validator_chain,
+ nb);
}
EXPORT_SYMBOL(unregister_inet6addr_validator_notifier);
int inet6addr_validator_notifier_call_chain(unsigned long val, void *v)
{
- return atomic_notifier_call_chain(&inet6addr_validator_chain, val, v);
+ return blocking_notifier_call_chain(&inet6addr_validator_chain, val, v);
}
EXPORT_SYMBOL(inet6addr_validator_notifier_call_chain);
--
2.1.4
^ permalink raw reply related
* [RFC net-next 2/4] net: Add extack to validator_info structs used for address notifier
From: David Ahern @ 2017-10-10 16:41 UTC (permalink / raw)
To: netdev; +Cc: jiri, idosch, kjlx, David Ahern
In-Reply-To: <1507653665-20540-1-git-send-email-dsahern@gmail.com>
Add extack to in_validator_info and in6_validator_info. Update the one
user of each, ipvlan, to return an error message for failures.
Only manual configuration of an address is plumbed in the IPv6 code path.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
drivers/net/ipvlan/ipvlan_main.c | 10 ++++++++--
include/linux/inetdevice.h | 1 +
include/net/addrconf.h | 1 +
net/ipv4/devinet.c | 8 +++++---
net/ipv6/addrconf.c | 23 +++++++++++++----------
5 files changed, 28 insertions(+), 15 deletions(-)
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index 57c3856bab05..56a868415ba2 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -846,8 +846,11 @@ static int ipvlan_addr6_validator_event(struct notifier_block *unused,
switch (event) {
case NETDEV_UP:
- if (ipvlan_addr_busy(ipvlan->port, &i6vi->i6vi_addr, true))
+ if (ipvlan_addr_busy(ipvlan->port, &i6vi->i6vi_addr, true)) {
+ NL_SET_ERR_MSG(i6vi->extack,
+ "Address already assigned to an ipvlan device");
return notifier_from_errno(-EADDRINUSE);
+ }
break;
}
@@ -916,8 +919,11 @@ static int ipvlan_addr4_validator_event(struct notifier_block *unused,
switch (event) {
case NETDEV_UP:
- if (ipvlan_addr_busy(ipvlan->port, &ivi->ivi_addr, false))
+ if (ipvlan_addr_busy(ipvlan->port, &ivi->ivi_addr, false)) {
+ NL_SET_ERR_MSG(ivi->extack,
+ "Address already assigned to an ipvlan device");
return notifier_from_errno(-EADDRINUSE);
+ }
break;
}
diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 751d051f0bc7..681dff30940b 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -154,6 +154,7 @@ struct in_ifaddr {
struct in_validator_info {
__be32 ivi_addr;
struct in_device *ivi_dev;
+ struct netlink_ext_ack *extack;
};
int register_inetaddr_notifier(struct notifier_block *nb);
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 87981cd63180..b8b16437c6d5 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -55,6 +55,7 @@ struct prefix_info {
struct in6_validator_info {
struct in6_addr i6vi_addr;
struct inet6_dev *i6vi_dev;
+ struct netlink_ext_ack *extack;
};
#define IN6_ADDR_HSIZE_SHIFT 4
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 7ce22a2c07ce..0118698cd623 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -444,7 +444,7 @@ static void check_lifetime(struct work_struct *work);
static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime);
static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
- u32 portid)
+ u32 portid, struct netlink_ext_ack *extack)
{
struct in_device *in_dev = ifa->ifa_dev;
struct in_ifaddr *ifa1, **ifap, **last_primary;
@@ -489,6 +489,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
*/
ivi.ivi_addr = ifa->ifa_address;
ivi.ivi_dev = ifa->ifa_dev;
+ ivi.extack = extack;
ret = blocking_notifier_call_chain(&inetaddr_validator_chain,
NETDEV_UP, &ivi);
ret = notifier_to_errno(ret);
@@ -521,7 +522,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
static int inet_insert_ifa(struct in_ifaddr *ifa)
{
- return __inet_insert_ifa(ifa, NULL, 0);
+ return __inet_insert_ifa(ifa, NULL, 0, NULL);
}
static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
@@ -902,7 +903,8 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
return ret;
}
}
- return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid);
+ return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid,
+ extack);
} else {
inet_free_ifa(ifa);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 632cf4b26277..0bad4a800f73 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -958,7 +958,8 @@ static u32 inet6_addr_hash(const struct in6_addr *addr)
static struct inet6_ifaddr *
ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
const struct in6_addr *peer_addr, int pfxlen,
- int scope, u32 flags, u32 valid_lft, u32 prefered_lft)
+ int scope, u32 flags, u32 valid_lft, u32 prefered_lft,
+ struct netlink_ext_ack *extack)
{
struct net *net = dev_net(idev->dev);
struct inet6_ifaddr *ifa = NULL;
@@ -994,6 +995,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
struct in6_validator_info i6vi = {
.i6vi_addr = *addr,
.i6vi_dev = idev,
+ .extack = extack,
};
rcu_read_unlock_bh();
@@ -1336,7 +1338,7 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *i
ift = ipv6_add_addr(idev, &addr, NULL, tmp_plen,
ipv6_addr_scope(&addr), addr_flags,
- tmp_valid_lft, tmp_prefered_lft);
+ tmp_valid_lft, tmp_prefered_lft, NULL);
if (IS_ERR(ift)) {
in6_ifa_put(ifp);
in6_dev_put(idev);
@@ -2020,7 +2022,7 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp)
ifp2 = ipv6_add_addr(idev, &new_addr, NULL, pfxlen,
scope, flags, valid_lft,
- preferred_lft);
+ preferred_lft, NULL);
if (IS_ERR(ifp2))
goto lock_errdad;
@@ -2478,7 +2480,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
pinfo->prefix_len,
addr_type&IPV6_ADDR_SCOPE_MASK,
addr_flags, valid_lft,
- prefered_lft);
+ prefered_lft, NULL);
if (IS_ERR_OR_NULL(ifp))
return -1;
@@ -2788,7 +2790,8 @@ static int inet6_addr_add(struct net *net, int ifindex,
const struct in6_addr *pfx,
const struct in6_addr *peer_pfx,
unsigned int plen, __u32 ifa_flags,
- __u32 prefered_lft, __u32 valid_lft)
+ __u32 prefered_lft, __u32 valid_lft,
+ struct netlink_ext_ack *extack)
{
struct inet6_ifaddr *ifp;
struct inet6_dev *idev;
@@ -2847,7 +2850,7 @@ static int inet6_addr_add(struct net *net, int ifindex,
}
ifp = ipv6_add_addr(idev, pfx, peer_pfx, plen, scope, ifa_flags,
- valid_lft, prefered_lft);
+ valid_lft, prefered_lft, extack);
if (!IS_ERR(ifp)) {
if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) {
@@ -2932,7 +2935,7 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg)
rtnl_lock();
err = inet6_addr_add(net, ireq.ifr6_ifindex, &ireq.ifr6_addr, NULL,
ireq.ifr6_prefixlen, IFA_F_PERMANENT,
- INFINITY_LIFE_TIME, INFINITY_LIFE_TIME);
+ INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, NULL);
rtnl_unlock();
return err;
}
@@ -2962,7 +2965,7 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
ifp = ipv6_add_addr(idev, addr, NULL, plen,
scope, IFA_F_PERMANENT,
- INFINITY_LIFE_TIME, INFINITY_LIFE_TIME);
+ INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, NULL);
if (!IS_ERR(ifp)) {
spin_lock_bh(&ifp->lock);
ifp->flags &= ~IFA_F_TENTATIVE;
@@ -3062,7 +3065,7 @@ void addrconf_add_linklocal(struct inet6_dev *idev,
#endif
ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags,
- INFINITY_LIFE_TIME, INFINITY_LIFE_TIME);
+ INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, NULL);
if (!IS_ERR(ifp)) {
addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, 0, 0);
addrconf_dad_start(ifp);
@@ -4565,7 +4568,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
*/
return inet6_addr_add(net, ifm->ifa_index, pfx, peer_pfx,
ifm->ifa_prefixlen, ifa_flags,
- preferred_lft, valid_lft);
+ preferred_lft, valid_lft, extack);
}
if (nlh->nlmsg_flags & NLM_F_EXCL ||
--
2.1.4
^ permalink raw reply related
* [RFC net-next 3/4] mlxsw: spectrum: router: Add support for address validator notifier
From: David Ahern @ 2017-10-10 16:41 UTC (permalink / raw)
To: netdev; +Cc: jiri, idosch, kjlx, David Ahern
In-Reply-To: <1507653665-20540-1-git-send-email-dsahern@gmail.com>
Add support for inetaddr_validator and inet6addr_validator. The
notifiers provide a means for validating ipv4 and ipv6 addresses
before the addresses are installed and on failure the error
is propagated back to the user.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 10 ++++
drivers/net/ethernet/mellanox/mlxsw/spectrum.h | 4 ++
.../net/ethernet/mellanox/mlxsw/spectrum_router.c | 53 ++++++++++++++++++++++
3 files changed, 67 insertions(+)
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 321988ac57cc..da4ee91235be 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -4505,11 +4505,19 @@ static struct notifier_block mlxsw_sp_netdevice_nb __read_mostly = {
.notifier_call = mlxsw_sp_netdevice_event,
};
+static struct notifier_block mlxsw_sp_inetaddr_valid_nb __read_mostly = {
+ .notifier_call = mlxsw_sp_inetaddr_valid_event,
+};
+
static struct notifier_block mlxsw_sp_inetaddr_nb __read_mostly = {
.notifier_call = mlxsw_sp_inetaddr_event,
.priority = 10, /* Must be called before FIB notifier block */
};
+static struct notifier_block mlxsw_sp_inet6addr_valid_nb __read_mostly = {
+ .notifier_call = mlxsw_sp_inet6addr_valid_event,
+};
+
static struct notifier_block mlxsw_sp_inet6addr_nb __read_mostly = {
.notifier_call = mlxsw_sp_inet6addr_event,
};
@@ -4533,7 +4541,9 @@ static int __init mlxsw_sp_module_init(void)
int err;
register_netdevice_notifier(&mlxsw_sp_netdevice_nb);
+ register_inetaddr_validator_notifier(&mlxsw_sp_inetaddr_valid_nb);
register_inetaddr_notifier(&mlxsw_sp_inetaddr_nb);
+ register_inet6addr_validator_notifier(&mlxsw_sp_inet6addr_valid_nb);
register_inet6addr_notifier(&mlxsw_sp_inet6addr_nb);
register_netevent_notifier(&mlxsw_sp_router_netevent_nb);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index 8e45183dc9bb..4865a6f58c83 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -390,8 +390,12 @@ int mlxsw_sp_router_netevent_event(struct notifier_block *unused,
int mlxsw_sp_netdevice_router_port_event(struct net_device *dev);
int mlxsw_sp_inetaddr_event(struct notifier_block *unused,
unsigned long event, void *ptr);
+int mlxsw_sp_inetaddr_valid_event(struct notifier_block *unused,
+ unsigned long event, void *ptr);
int mlxsw_sp_inet6addr_event(struct notifier_block *unused,
unsigned long event, void *ptr);
+int mlxsw_sp_inet6addr_valid_event(struct notifier_block *unused,
+ unsigned long event, void *ptr);
int mlxsw_sp_netdevice_vrf_event(struct net_device *l3_dev, unsigned long event,
struct netdev_notifier_changeupper_info *info);
void
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 6a356f4b99a3..7d53fdf2c0a8 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -5656,6 +5656,33 @@ int mlxsw_sp_inetaddr_event(struct notifier_block *unused,
struct mlxsw_sp_rif *rif;
int err = 0;
+ /* NETDEV_UP event is handled by mlxsw_sp_inetaddr_valid_event */
+ if (event == NETDEV_UP)
+ goto out;
+
+ mlxsw_sp = mlxsw_sp_lower_get(dev);
+ if (!mlxsw_sp)
+ goto out;
+
+ rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev);
+ if (!mlxsw_sp_rif_should_config(rif, dev, event))
+ goto out;
+
+ err = __mlxsw_sp_inetaddr_event(dev, event);
+out:
+ return notifier_from_errno(err);
+}
+
+/* only expected to be called for event == NETDEV_UP */
+int mlxsw_sp_inetaddr_valid_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct in_validator_info *ivi = (struct in_validator_info *)ptr;
+ struct net_device *dev = ivi->ivi_dev->dev;
+ struct mlxsw_sp *mlxsw_sp;
+ struct mlxsw_sp_rif *rif;
+ int err = 0;
+
mlxsw_sp = mlxsw_sp_lower_get(dev);
if (!mlxsw_sp)
goto out;
@@ -5708,6 +5735,10 @@ int mlxsw_sp_inet6addr_event(struct notifier_block *unused,
struct mlxsw_sp_inet6addr_event_work *inet6addr_work;
struct net_device *dev = if6->idev->dev;
+ /* NETDEV_UP event is handled by mlxsw_sp_inet6addr_valid_event */
+ if (event == NETDEV_UP)
+ return NOTIFY_DONE;
+
if (!mlxsw_sp_port_dev_lower_find_rcu(dev))
return NOTIFY_DONE;
@@ -5724,6 +5755,28 @@ int mlxsw_sp_inet6addr_event(struct notifier_block *unused,
return NOTIFY_DONE;
}
+int mlxsw_sp_inet6addr_valid_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct in6_validator_info *i6vi = (struct in6_validator_info *)ptr;
+ struct net_device *dev = i6vi->i6vi_dev->dev;
+ struct mlxsw_sp *mlxsw_sp;
+ struct mlxsw_sp_rif *rif;
+ int err = 0;
+
+ mlxsw_sp = mlxsw_sp_lower_get(dev);
+ if (!mlxsw_sp)
+ goto out;
+
+ rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev);
+ if (!mlxsw_sp_rif_should_config(rif, dev, event))
+ goto out;
+
+ err = __mlxsw_sp_inetaddr_event(dev, event);
+out:
+ return notifier_from_errno(err);
+}
+
static int mlxsw_sp_rif_edit(struct mlxsw_sp *mlxsw_sp, u16 rif_index,
const char *mac, int mtu)
{
--
2.1.4
^ permalink raw reply related
* [RFC net-next 4/4] mlxsw: spectrum_router: Add extack message for RIF and VRF overflow
From: David Ahern @ 2017-10-10 16:41 UTC (permalink / raw)
To: netdev; +Cc: jiri, idosch, kjlx, David Ahern
In-Reply-To: <1507653665-20540-1-git-send-email-dsahern@gmail.com>
Add extack argument down to mlxsw_sp_rif_create and mlxsw_sp_vr_create
to set an error message on RIF or VR overflow. Now an overflow of
either resource the use gets an informative message as opposed to
failing with EBUSY.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
.../net/ethernet/mellanox/mlxsw/spectrum_router.c | 114 +++++++++++++--------
1 file changed, 69 insertions(+), 45 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 7d53fdf2c0a8..ec4d313b9eca 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -731,14 +731,17 @@ static struct mlxsw_sp_fib *mlxsw_sp_vr_fib(const struct mlxsw_sp_vr *vr,
}
static struct mlxsw_sp_vr *mlxsw_sp_vr_create(struct mlxsw_sp *mlxsw_sp,
- u32 tb_id)
+ u32 tb_id,
+ struct netlink_ext_ack *extack)
{
struct mlxsw_sp_vr *vr;
int err;
vr = mlxsw_sp_vr_find_unused(mlxsw_sp);
- if (!vr)
+ if (!vr) {
+ NL_SET_ERR_MSG(extack, "spectrum: Exceeded number of supported VRF");
return ERR_PTR(-EBUSY);
+ }
vr->fib4 = mlxsw_sp_fib_create(vr, MLXSW_SP_L3_PROTO_IPV4);
if (IS_ERR(vr->fib4))
return ERR_CAST(vr->fib4);
@@ -775,14 +778,15 @@ static void mlxsw_sp_vr_destroy(struct mlxsw_sp_vr *vr)
vr->fib4 = NULL;
}
-static struct mlxsw_sp_vr *mlxsw_sp_vr_get(struct mlxsw_sp *mlxsw_sp, u32 tb_id)
+static struct mlxsw_sp_vr *mlxsw_sp_vr_get(struct mlxsw_sp *mlxsw_sp, u32 tb_id,
+ struct netlink_ext_ack *extack)
{
struct mlxsw_sp_vr *vr;
tb_id = mlxsw_sp_fix_tb_id(tb_id);
vr = mlxsw_sp_vr_find(mlxsw_sp, tb_id);
if (!vr)
- vr = mlxsw_sp_vr_create(mlxsw_sp, tb_id);
+ vr = mlxsw_sp_vr_create(mlxsw_sp, tb_id, extack);
return vr;
}
@@ -948,7 +952,8 @@ static u32 mlxsw_sp_ipip_dev_ul_tb_id(const struct net_device *ol_dev)
static struct mlxsw_sp_rif *
mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp,
- const struct mlxsw_sp_rif_params *params);
+ const struct mlxsw_sp_rif_params *params,
+ struct netlink_ext_ack *extack);
static struct mlxsw_sp_rif_ipip_lb *
mlxsw_sp_ipip_ol_ipip_lb_create(struct mlxsw_sp *mlxsw_sp,
@@ -966,7 +971,7 @@ mlxsw_sp_ipip_ol_ipip_lb_create(struct mlxsw_sp *mlxsw_sp,
.lb_config = ipip_ops->ol_loopback_config(mlxsw_sp, ol_dev),
};
- rif = mlxsw_sp_rif_create(mlxsw_sp, &lb_params.common);
+ rif = mlxsw_sp_rif_create(mlxsw_sp, &lb_params.common, NULL);
if (IS_ERR(rif))
return ERR_CAST(rif);
return container_of(rif, struct mlxsw_sp_rif_ipip_lb, common);
@@ -3711,7 +3716,7 @@ mlxsw_sp_fib_node_get(struct mlxsw_sp *mlxsw_sp, u32 tb_id, const void *addr,
struct mlxsw_sp_vr *vr;
int err;
- vr = mlxsw_sp_vr_get(mlxsw_sp, tb_id);
+ vr = mlxsw_sp_vr_get(mlxsw_sp, tb_id, NULL);
if (IS_ERR(vr))
return ERR_CAST(vr);
fib = mlxsw_sp_vr_fib(vr, proto);
@@ -4750,7 +4755,7 @@ static int mlxsw_sp_router_fibmr_add(struct mlxsw_sp *mlxsw_sp,
if (mlxsw_sp->router->aborted)
return 0;
- vr = mlxsw_sp_vr_get(mlxsw_sp, men_info->tb_id);
+ vr = mlxsw_sp_vr_get(mlxsw_sp, men_info->tb_id, NULL);
if (IS_ERR(vr))
return PTR_ERR(vr);
@@ -4783,7 +4788,7 @@ mlxsw_sp_router_fibmr_vif_add(struct mlxsw_sp *mlxsw_sp,
if (mlxsw_sp->router->aborted)
return 0;
- vr = mlxsw_sp_vr_get(mlxsw_sp, ven_info->tb_id);
+ vr = mlxsw_sp_vr_get(mlxsw_sp, ven_info->tb_id, NULL);
if (IS_ERR(vr))
return PTR_ERR(vr);
@@ -5346,7 +5351,8 @@ const struct net_device *mlxsw_sp_rif_dev(const struct mlxsw_sp_rif *rif)
static struct mlxsw_sp_rif *
mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp,
- const struct mlxsw_sp_rif_params *params)
+ const struct mlxsw_sp_rif_params *params,
+ struct netlink_ext_ack *extack)
{
u32 tb_id = l3mdev_fib_table(params->dev);
const struct mlxsw_sp_rif_ops *ops;
@@ -5360,14 +5366,16 @@ mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp,
type = mlxsw_sp_dev_rif_type(mlxsw_sp, params->dev);
ops = mlxsw_sp->router->rif_ops_arr[type];
- vr = mlxsw_sp_vr_get(mlxsw_sp, tb_id ? : RT_TABLE_MAIN);
+ vr = mlxsw_sp_vr_get(mlxsw_sp, tb_id ? : RT_TABLE_MAIN, extack);
if (IS_ERR(vr))
return ERR_CAST(vr);
vr->rif_count++;
err = mlxsw_sp_rif_index_alloc(mlxsw_sp, &rif_index);
- if (err)
+ if (err) {
+ NL_SET_ERR_MSG(extack, "spectrum: Exceeded number of supported router interfaces");
goto err_rif_index_alloc;
+ }
rif = mlxsw_sp_rif_alloc(ops->rif_size, rif_index, vr->id, params->dev);
if (!rif) {
@@ -5454,7 +5462,8 @@ mlxsw_sp_rif_subport_params_init(struct mlxsw_sp_rif_params *params,
static int
mlxsw_sp_port_vlan_router_join(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan,
- struct net_device *l3_dev)
+ struct net_device *l3_dev,
+ struct netlink_ext_ack *extack)
{
struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp_port_vlan->mlxsw_sp_port;
struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
@@ -5470,7 +5479,7 @@ mlxsw_sp_port_vlan_router_join(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan,
};
mlxsw_sp_rif_subport_params_init(¶ms, mlxsw_sp_port_vlan);
- rif = mlxsw_sp_rif_create(mlxsw_sp, ¶ms);
+ rif = mlxsw_sp_rif_create(mlxsw_sp, ¶ms, extack);
if (IS_ERR(rif))
return PTR_ERR(rif);
}
@@ -5525,7 +5534,8 @@ mlxsw_sp_port_vlan_router_leave(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan)
static int mlxsw_sp_inetaddr_port_vlan_event(struct net_device *l3_dev,
struct net_device *port_dev,
- unsigned long event, u16 vid)
+ unsigned long event, u16 vid,
+ struct netlink_ext_ack *extack)
{
struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(port_dev);
struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
@@ -5537,7 +5547,7 @@ static int mlxsw_sp_inetaddr_port_vlan_event(struct net_device *l3_dev,
switch (event) {
case NETDEV_UP:
return mlxsw_sp_port_vlan_router_join(mlxsw_sp_port_vlan,
- l3_dev);
+ l3_dev, extack);
case NETDEV_DOWN:
mlxsw_sp_port_vlan_router_leave(mlxsw_sp_port_vlan);
break;
@@ -5547,19 +5557,22 @@ static int mlxsw_sp_inetaddr_port_vlan_event(struct net_device *l3_dev,
}
static int mlxsw_sp_inetaddr_port_event(struct net_device *port_dev,
- unsigned long event)
+ unsigned long event,
+ struct netlink_ext_ack *extack)
{
if (netif_is_bridge_port(port_dev) ||
netif_is_lag_port(port_dev) ||
netif_is_ovs_port(port_dev))
return 0;
- return mlxsw_sp_inetaddr_port_vlan_event(port_dev, port_dev, event, 1);
+ return mlxsw_sp_inetaddr_port_vlan_event(port_dev, port_dev, event, 1,
+ extack);
}
static int __mlxsw_sp_inetaddr_lag_event(struct net_device *l3_dev,
struct net_device *lag_dev,
- unsigned long event, u16 vid)
+ unsigned long event, u16 vid,
+ struct netlink_ext_ack *extack)
{
struct net_device *port_dev;
struct list_head *iter;
@@ -5569,7 +5582,8 @@ static int __mlxsw_sp_inetaddr_lag_event(struct net_device *l3_dev,
if (mlxsw_sp_port_dev_check(port_dev)) {
err = mlxsw_sp_inetaddr_port_vlan_event(l3_dev,
port_dev,
- event, vid);
+ event, vid,
+ extack);
if (err)
return err;
}
@@ -5579,16 +5593,19 @@ static int __mlxsw_sp_inetaddr_lag_event(struct net_device *l3_dev,
}
static int mlxsw_sp_inetaddr_lag_event(struct net_device *lag_dev,
- unsigned long event)
+ unsigned long event,
+ struct netlink_ext_ack *extack)
{
if (netif_is_bridge_port(lag_dev))
return 0;
- return __mlxsw_sp_inetaddr_lag_event(lag_dev, lag_dev, event, 1);
+ return __mlxsw_sp_inetaddr_lag_event(lag_dev, lag_dev, event, 1,
+ extack);
}
static int mlxsw_sp_inetaddr_bridge_event(struct net_device *l3_dev,
- unsigned long event)
+ unsigned long event,
+ struct netlink_ext_ack *extack)
{
struct mlxsw_sp *mlxsw_sp = mlxsw_sp_lower_get(l3_dev);
struct mlxsw_sp_rif_params params = {
@@ -5598,7 +5615,7 @@ static int mlxsw_sp_inetaddr_bridge_event(struct net_device *l3_dev,
switch (event) {
case NETDEV_UP:
- rif = mlxsw_sp_rif_create(mlxsw_sp, ¶ms);
+ rif = mlxsw_sp_rif_create(mlxsw_sp, ¶ms, extack);
if (IS_ERR(rif))
return PTR_ERR(rif);
break;
@@ -5612,7 +5629,8 @@ static int mlxsw_sp_inetaddr_bridge_event(struct net_device *l3_dev,
}
static int mlxsw_sp_inetaddr_vlan_event(struct net_device *vlan_dev,
- unsigned long event)
+ unsigned long event,
+ struct netlink_ext_ack *extack)
{
struct net_device *real_dev = vlan_dev_real_dev(vlan_dev);
u16 vid = vlan_dev_vlan_id(vlan_dev);
@@ -5622,27 +5640,28 @@ static int mlxsw_sp_inetaddr_vlan_event(struct net_device *vlan_dev,
if (mlxsw_sp_port_dev_check(real_dev))
return mlxsw_sp_inetaddr_port_vlan_event(vlan_dev, real_dev,
- event, vid);
+ event, vid, extack);
else if (netif_is_lag_master(real_dev))
return __mlxsw_sp_inetaddr_lag_event(vlan_dev, real_dev, event,
- vid);
+ vid, extack);
else if (netif_is_bridge_master(real_dev) && br_vlan_enabled(real_dev))
- return mlxsw_sp_inetaddr_bridge_event(vlan_dev, event);
+ return mlxsw_sp_inetaddr_bridge_event(vlan_dev, event, extack);
return 0;
}
static int __mlxsw_sp_inetaddr_event(struct net_device *dev,
- unsigned long event)
+ unsigned long event,
+ struct netlink_ext_ack *extack)
{
if (mlxsw_sp_port_dev_check(dev))
- return mlxsw_sp_inetaddr_port_event(dev, event);
+ return mlxsw_sp_inetaddr_port_event(dev, event, extack);
else if (netif_is_lag_master(dev))
- return mlxsw_sp_inetaddr_lag_event(dev, event);
+ return mlxsw_sp_inetaddr_lag_event(dev, event, extack);
else if (netif_is_bridge_master(dev))
- return mlxsw_sp_inetaddr_bridge_event(dev, event);
+ return mlxsw_sp_inetaddr_bridge_event(dev, event, extack);
else if (is_vlan_dev(dev))
- return mlxsw_sp_inetaddr_vlan_event(dev, event);
+ return mlxsw_sp_inetaddr_vlan_event(dev, event, extack);
else
return 0;
}
@@ -5668,7 +5687,7 @@ int mlxsw_sp_inetaddr_event(struct notifier_block *unused,
if (!mlxsw_sp_rif_should_config(rif, dev, event))
goto out;
- err = __mlxsw_sp_inetaddr_event(dev, event);
+ err = __mlxsw_sp_inetaddr_event(dev, event, NULL);
out:
return notifier_from_errno(err);
}
@@ -5691,7 +5710,7 @@ int mlxsw_sp_inetaddr_valid_event(struct notifier_block *unused,
if (!mlxsw_sp_rif_should_config(rif, dev, event))
goto out;
- err = __mlxsw_sp_inetaddr_event(dev, event);
+ err = __mlxsw_sp_inetaddr_event(dev, event, ivi->extack);
out:
return notifier_from_errno(err);
}
@@ -5720,7 +5739,7 @@ static void mlxsw_sp_inet6addr_event_work(struct work_struct *work)
if (!mlxsw_sp_rif_should_config(rif, dev, event))
goto out;
- __mlxsw_sp_inetaddr_event(dev, event);
+ __mlxsw_sp_inetaddr_event(dev, event, NULL);
out:
rtnl_unlock();
dev_put(dev);
@@ -5772,7 +5791,7 @@ int mlxsw_sp_inet6addr_valid_event(struct notifier_block *unused,
if (!mlxsw_sp_rif_should_config(rif, dev, event))
goto out;
- err = __mlxsw_sp_inetaddr_event(dev, event);
+ err = __mlxsw_sp_inetaddr_event(dev, event, i6vi->extack);
out:
return notifier_from_errno(err);
}
@@ -5849,7 +5868,8 @@ int mlxsw_sp_netdevice_router_port_event(struct net_device *dev)
}
static int mlxsw_sp_port_vrf_join(struct mlxsw_sp *mlxsw_sp,
- struct net_device *l3_dev)
+ struct net_device *l3_dev,
+ struct netlink_ext_ack *extack)
{
struct mlxsw_sp_rif *rif;
@@ -5858,9 +5878,9 @@ static int mlxsw_sp_port_vrf_join(struct mlxsw_sp *mlxsw_sp,
*/
rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, l3_dev);
if (rif)
- __mlxsw_sp_inetaddr_event(l3_dev, NETDEV_DOWN);
+ __mlxsw_sp_inetaddr_event(l3_dev, NETDEV_DOWN, extack);
- return __mlxsw_sp_inetaddr_event(l3_dev, NETDEV_UP);
+ return __mlxsw_sp_inetaddr_event(l3_dev, NETDEV_UP, extack);
}
static void mlxsw_sp_port_vrf_leave(struct mlxsw_sp *mlxsw_sp,
@@ -5871,7 +5891,7 @@ static void mlxsw_sp_port_vrf_leave(struct mlxsw_sp *mlxsw_sp,
rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, l3_dev);
if (!rif)
return;
- __mlxsw_sp_inetaddr_event(l3_dev, NETDEV_DOWN);
+ __mlxsw_sp_inetaddr_event(l3_dev, NETDEV_DOWN, NULL);
}
int mlxsw_sp_netdevice_vrf_event(struct net_device *l3_dev, unsigned long event,
@@ -5887,10 +5907,14 @@ int mlxsw_sp_netdevice_vrf_event(struct net_device *l3_dev, unsigned long event,
case NETDEV_PRECHANGEUPPER:
return 0;
case NETDEV_CHANGEUPPER:
- if (info->linking)
- err = mlxsw_sp_port_vrf_join(mlxsw_sp, l3_dev);
- else
+ if (info->linking) {
+ struct netlink_ext_ack *extack;
+
+ extack = netdev_notifier_info_to_extack(&info->info);
+ err = mlxsw_sp_port_vrf_join(mlxsw_sp, l3_dev, extack);
+ } else {
mlxsw_sp_port_vrf_leave(mlxsw_sp, l3_dev);
+ }
break;
}
@@ -6197,7 +6221,7 @@ mlxsw_sp_rif_ipip_lb_configure(struct mlxsw_sp_rif *rif)
struct mlxsw_sp_vr *ul_vr;
int err;
- ul_vr = mlxsw_sp_vr_get(mlxsw_sp, ul_tb_id);
+ ul_vr = mlxsw_sp_vr_get(mlxsw_sp, ul_tb_id, NULL);
if (IS_ERR(ul_vr))
return PTR_ERR(ul_vr);
--
2.1.4
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox