Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 5/8] ibmvnic: Handle error case when setting link state
From: Thomas Falcon @ 2018-05-23 18:37 UTC (permalink / raw)
  To: netdev; +Cc: nfont, jallen, linuxppc-dev, Thomas Falcon
In-Reply-To: <1527100682-23099-1-git-send-email-tlfalcon@linux.vnet.ibm.com>

If setting the link state is not successful, print a warning
with the resulting return code and return it to be handled
by the caller.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index f1f744e..b1bbd5b 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -929,6 +929,10 @@ static int set_link_state(struct ibmvnic_adapter *adapter, u8 link_state)
 			/* Partuial success, delay and re-send */
 			mdelay(1000);
 			resend = true;
+		} else if (adapter->init_done_rc) {
+			netdev_warn(netdev, "Unable to set link state, rc=%d\n",
+				    adapter->init_done_rc);
+			return adapter->init_done_rc;
 		}
 	} while (resend);
 
-- 
2.7.5

^ permalink raw reply related

* [PATCH net-next 6/8] ibmvnic: Create separate initialization routine for resets
From: Thomas Falcon @ 2018-05-23 18:38 UTC (permalink / raw)
  To: netdev; +Cc: nfont, jallen, linuxppc-dev, Thomas Falcon
In-Reply-To: <1527100682-23099-1-git-send-email-tlfalcon@linux.vnet.ibm.com>

Instead of having one initialization routine for all cases, create
a separate, simpler function for standard initialization, such as during
device probe. Use the original initialization function to handle
device reset scenarios. The goal of this patch is to avoid having
a single, cluttered init function to handle all possible
scenarios.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 48 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index b1bbd5b..f26e1f8 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -116,6 +116,7 @@ static void send_cap_queries(struct ibmvnic_adapter *adapter);
 static int init_sub_crqs(struct ibmvnic_adapter *);
 static int init_sub_crq_irqs(struct ibmvnic_adapter *adapter);
 static int ibmvnic_init(struct ibmvnic_adapter *);
+static int ibmvnic_reset_init(struct ibmvnic_adapter *);
 static void release_crq_queue(struct ibmvnic_adapter *);
 static int __ibmvnic_set_mac(struct net_device *netdev, struct sockaddr *p);
 static int init_crq_queue(struct ibmvnic_adapter *adapter);
@@ -1807,7 +1808,7 @@ static int do_reset(struct ibmvnic_adapter *adapter,
 			return rc;
 		}
 
-		rc = ibmvnic_init(adapter);
+		rc = ibmvnic_reset_init(adapter);
 		if (rc)
 			return IBMVNIC_INIT_FAILED;
 
@@ -4571,7 +4572,7 @@ static int init_crq_queue(struct ibmvnic_adapter *adapter)
 	return retrc;
 }
 
-static int ibmvnic_init(struct ibmvnic_adapter *adapter)
+static int ibmvnic_reset_init(struct ibmvnic_adapter *adapter)
 {
 	struct device *dev = &adapter->vdev->dev;
 	unsigned long timeout = msecs_to_jiffies(30000);
@@ -4630,6 +4631,49 @@ static int ibmvnic_init(struct ibmvnic_adapter *adapter)
 	return rc;
 }
 
+static int ibmvnic_init(struct ibmvnic_adapter *adapter)
+{
+	struct device *dev = &adapter->vdev->dev;
+	unsigned long timeout = msecs_to_jiffies(30000);
+	int rc;
+
+	adapter->from_passive_init = false;
+
+	init_completion(&adapter->init_done);
+	adapter->init_done_rc = 0;
+	ibmvnic_send_crq_init(adapter);
+	if (!wait_for_completion_timeout(&adapter->init_done, timeout)) {
+		dev_err(dev, "Initialization sequence timed out\n");
+		return -1;
+	}
+
+	if (adapter->init_done_rc) {
+		release_crq_queue(adapter);
+		return adapter->init_done_rc;
+	}
+
+	if (adapter->from_passive_init) {
+		adapter->state = VNIC_OPEN;
+		adapter->from_passive_init = false;
+		return -1;
+	}
+
+	rc = init_sub_crqs(adapter);
+	if (rc) {
+		dev_err(dev, "Initialization of sub crqs failed\n");
+		release_crq_queue(adapter);
+		return rc;
+	}
+
+	rc = init_sub_crq_irqs(adapter);
+	if (rc) {
+		dev_err(dev, "Failed to initialize sub crq irqs\n");
+		release_crq_queue(adapter);
+	}
+
+	return rc;
+}
+
 static struct device_attribute dev_attr_failover;
 
 static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
-- 
2.7.5

^ permalink raw reply related

* [PATCH net-next 8/8] ibmvnic: Introduce hard reset recovery
From: Thomas Falcon @ 2018-05-23 18:38 UTC (permalink / raw)
  To: netdev; +Cc: nfont, jallen, linuxppc-dev, Thomas Falcon
In-Reply-To: <1527100682-23099-1-git-send-email-tlfalcon@linux.vnet.ibm.com>

Introduce a recovery hard reset to handle reset failure as a result of
a change of device context following a transport event, such as a
backing device failover or partition migration. These operations reset
the device context to its initial state. If this occurs during a reset,
any initialization commands are likely to fail with an invalid state
error as backing device firmware requests reinitialization.

When this happens, make one more attempt by performing a hard reset,
which frees any resources currently allocated and performs device
initialization. If a transport event occurs during a device reset, a
flag is set which will trigger a new hard reset following the
completion of the current reset event.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 101 +++++++++++++++++++++++++++++++++++--
 drivers/net/ethernet/ibm/ibmvnic.h |   1 +
 2 files changed, 98 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index ee51deb..09f8e6b 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1878,6 +1878,85 @@ static int do_reset(struct ibmvnic_adapter *adapter,
 	return 0;
 }
 
+static int do_hard_reset(struct ibmvnic_adapter *adapter,
+			 struct ibmvnic_rwi *rwi, u32 reset_state)
+{
+	struct net_device *netdev = adapter->netdev;
+	int rc;
+
+	netdev_dbg(adapter->netdev, "Hard resetting driver (%d)\n",
+		   rwi->reset_reason);
+
+	netif_carrier_off(netdev);
+	adapter->reset_reason = rwi->reset_reason;
+
+	ibmvnic_cleanup(netdev);
+	release_resources(adapter);
+	release_sub_crqs(adapter, 0);
+	release_crq_queue(adapter);
+
+	/* remove the closed state so when we call open it appears
+	 * we are coming from the probed state.
+	 */
+	adapter->state = VNIC_PROBED;
+
+	rc = init_crq_queue(adapter);
+	if (rc) {
+		netdev_err(adapter->netdev,
+			   "Couldn't initialize crq. rc=%d\n", rc);
+		return rc;
+	}
+
+	rc = ibmvnic_init(adapter);
+	if (rc)
+		return rc;
+
+	/* If the adapter was in PROBE state prior to the reset,
+	 * exit here.
+	 */
+	if (reset_state == VNIC_PROBED)
+		return 0;
+
+	rc = ibmvnic_login(netdev);
+	if (rc) {
+		adapter->state = VNIC_PROBED;
+		return 0;
+	}
+	/* netif_set_real_num_xx_queues needs to take rtnl lock here
+	 * unless wait_for_reset is set, in which case the rtnl lock
+	 * has already been taken before initializing the reset
+	 */
+	if (!adapter->wait_for_reset) {
+		rtnl_lock();
+		rc = init_resources(adapter);
+		rtnl_unlock();
+	} else {
+		rc = init_resources(adapter);
+	}
+	if (rc)
+		return rc;
+
+	ibmvnic_disable_irqs(adapter);
+	adapter->state = VNIC_CLOSED;
+
+	if (reset_state == VNIC_CLOSED)
+		return 0;
+
+	rc = __ibmvnic_open(netdev);
+	if (rc) {
+		if (list_empty(&adapter->rwi_list))
+			adapter->state = VNIC_CLOSED;
+		else
+			adapter->state = reset_state;
+
+		return 0;
+	}
+
+	netif_carrier_on(netdev);
+
+	return 0;
+}
+
 static struct ibmvnic_rwi *get_next_rwi(struct ibmvnic_adapter *adapter)
 {
 	struct ibmvnic_rwi *rwi;
@@ -1923,9 +2002,15 @@ static void __ibmvnic_reset(struct work_struct *work)
 
 	rwi = get_next_rwi(adapter);
 	while (rwi) {
-		rc = do_reset(adapter, rwi, reset_state);
+		if (adapter->force_reset_recovery) {
+			adapter->force_reset_recovery = false;
+			rc = do_hard_reset(adapter, rwi, reset_state);
+		} else {
+			rc = do_reset(adapter, rwi, reset_state);
+		}
 		kfree(rwi);
-		if (rc && rc != IBMVNIC_INIT_FAILED)
+		if (rc && rc != IBMVNIC_INIT_FAILED &&
+		    !adapter->force_reset_recovery)
 			break;
 
 		rwi = get_next_rwi(adapter);
@@ -1951,9 +2036,9 @@ static void __ibmvnic_reset(struct work_struct *work)
 static int ibmvnic_reset(struct ibmvnic_adapter *adapter,
 			 enum ibmvnic_reset_reason reason)
 {
+	struct list_head *entry, *tmp_entry;
 	struct ibmvnic_rwi *rwi, *tmp;
 	struct net_device *netdev = adapter->netdev;
-	struct list_head *entry;
 	int ret;
 
 	if (adapter->state == VNIC_REMOVING ||
@@ -1989,7 +2074,13 @@ static int ibmvnic_reset(struct ibmvnic_adapter *adapter,
 		ret = ENOMEM;
 		goto err;
 	}
-
+	/* if we just received a transport event,
+	 * flush reset queue and process this reset
+	 */
+	if (adapter->force_reset_recovery && !list_empty(&adapter->rwi_list)) {
+		list_for_each_safe(entry, tmp_entry, &adapter->rwi_list)
+			list_del(entry);
+	}
 	rwi->reset_reason = reason;
 	list_add_tail(&rwi->list, &adapter->rwi_list);
 	mutex_unlock(&adapter->rwi_lock);
@@ -4271,6 +4362,8 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq,
 	case IBMVNIC_CRQ_XPORT_EVENT:
 		netif_carrier_off(netdev);
 		adapter->crq.active = false;
+		if (adapter->resetting)
+			adapter->force_reset_recovery = true;
 		if (gen_crq->cmd == IBMVNIC_PARTITION_MIGRATED) {
 			dev_info(dev, "Migrated, re-enabling adapter\n");
 			ibmvnic_reset(adapter, VNIC_RESET_MOBILITY);
diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h
index edfc312..f9fb780 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.h
+++ b/drivers/net/ethernet/ibm/ibmvnic.h
@@ -1109,6 +1109,7 @@ struct ibmvnic_adapter {
 
 	bool mac_change_pending;
 	bool failover_pending;
+	bool force_reset_recovery;
 
 	struct ibmvnic_tunables desired;
 	struct ibmvnic_tunables fallback;
-- 
2.7.5

^ permalink raw reply related

* [PATCH net-next 7/8] ibmvnic: Set resetting state at earliest possible point
From: Thomas Falcon @ 2018-05-23 18:38 UTC (permalink / raw)
  To: netdev; +Cc: nfont, jallen, linuxppc-dev, Thomas Falcon
In-Reply-To: <1527100682-23099-1-git-send-email-tlfalcon@linux.vnet.ibm.com>

Set device resetting state at the earliest possible point: as soon as a
reset is successfully scheduled. The reset state is toggled off when
all resets have been processed to completion.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index f26e1f8..ee51deb 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1919,7 +1919,6 @@ static void __ibmvnic_reset(struct work_struct *work)
 	netdev = adapter->netdev;
 
 	mutex_lock(&adapter->reset_lock);
-	adapter->resetting = true;
 	reset_state = adapter->state;
 
 	rwi = get_next_rwi(adapter);
@@ -1994,7 +1993,7 @@ static int ibmvnic_reset(struct ibmvnic_adapter *adapter,
 	rwi->reset_reason = reason;
 	list_add_tail(&rwi->list, &adapter->rwi_list);
 	mutex_unlock(&adapter->rwi_lock);
-
+	adapter->resetting = true;
 	netdev_dbg(adapter->netdev, "Scheduling reset (reason %d)\n", reason);
 	schedule_work(&adapter->ibmvnic_reset);
 
-- 
2.7.5

^ permalink raw reply related

* [PATCH iproute2] ip route: Print expires as signed int
From: dsahern @ 2018-05-23 18:50 UTC (permalink / raw)
  To: netdev, stephen; +Cc: David Ahern

From: David Ahern <dsahern@gmail.com>

rta_expires is a signed int; print it as one.

Fixes: 663c3cb23103f ("iproute: implement JSON and color output")
Signed-off-by: David Ahern <dsahern@gmail.com>
---
 ip/iproute.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ip/iproute.c b/ip/iproute.c
index 56dd9f25e38e..cbc43e2b691a 100644
--- a/ip/iproute.c
+++ b/ip/iproute.c
@@ -463,8 +463,8 @@ static void print_rta_cacheinfo(FILE *fp, const struct rta_cacheinfo *ci)
 		hz = get_user_hz();
 
 	if (ci->rta_expires != 0)
-		print_uint(PRINT_ANY, "expires",
-			   "expires %usec ", ci->rta_expires/hz);
+		print_int(PRINT_ANY, "expires",
+			   "expires %dsec ", ci->rta_expires/hz);
 	if (ci->rta_error != 0)
 		print_uint(PRINT_ANY, "error",
 			   "error %u ", ci->rta_error);
-- 
2.11.0

^ permalink raw reply related

* Re: [PATCH][V2] net: vxge: fix spelling mistake in macro VXGE_HW_ERR_PRIVILAGED_OPEARATION
From: David Miller @ 2018-05-23 18:50 UTC (permalink / raw)
  To: colin.king; +Cc: jdmason, netdev, kernel-janitors, linux-kernel
In-Reply-To: <20180522161809.30299-1-colin.king@canonical.com>

From: Colin King <colin.king@canonical.com>
Date: Tue, 22 May 2018 17:18:09 +0100

> From: Colin Ian King <colin.king@canonical.com>
> 
> Rename VXGE_HW_ERR_PRIVILAGED_OPEARATION to VXGE_HW_ERR_PRIVILEGED_OPERATION
> to fix spelling mistake.
> 
> Signed-off-by: Colin Ian King <colin.king@canonical.com>
> ---
> V2: PRIVILAGED -> PRIVILEGED, thanks to Edward Cree for spotting that mistake

Applied to net-next, thanks Colin.

^ permalink raw reply

* [PATCH iproute2-next] ip route: print RTA_CACHEINFO if it exists
From: dsahern @ 2018-05-23 18:53 UTC (permalink / raw)
  To: netdev, stephen; +Cc: David Ahern

From: David Ahern <dsahern@gmail.com>

RTA_CACHEINFO can be sent for non-cloned routes. If the attribute is
present, print it. This allows route dumps to print, for example, expires
times, which can exist on FIB entries.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 ip/iproute.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/ip/iproute.c b/ip/iproute.c
index 56dd9f25e38e..647c170b1d10 100644
--- a/ip/iproute.c
+++ b/ip/iproute.c
@@ -899,17 +899,13 @@ int print_route(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
 			   rta_getattr_u32(tb[RTA_UID]));
 
 	if (r->rtm_family == AF_INET) {
-		if (r->rtm_flags & RTM_F_CLONED) {
-			print_cache_flags(fp, r->rtm_flags);
+		print_cache_flags(fp, r->rtm_flags);
 
-			if (tb[RTA_CACHEINFO])
-				print_rta_cacheinfo(fp, RTA_DATA(tb[RTA_CACHEINFO]));
-		}
+		if (tb[RTA_CACHEINFO])
+			print_rta_cacheinfo(fp, RTA_DATA(tb[RTA_CACHEINFO]));
 	} else if (r->rtm_family == AF_INET6) {
-		if (r->rtm_flags & RTM_F_CLONED) {
-			if (tb[RTA_CACHEINFO])
-				print_rta_cacheinfo(fp, RTA_DATA(tb[RTA_CACHEINFO]));
-		}
+		if (tb[RTA_CACHEINFO])
+			print_rta_cacheinfo(fp, RTA_DATA(tb[RTA_CACHEINFO]));
 	}
 
 	if (tb[RTA_METRICS])
-- 
2.11.0

^ permalink raw reply related

* Re: [PATCH net] ibmvnic: Only do H_EOI for mobility events
From: David Miller @ 2018-05-23 18:54 UTC (permalink / raw)
  To: nfont; +Cc: netdev, jallen, tlfalcon
In-Reply-To: <152700607013.58815.10070919641026174502.stgit@ltcalpine2-lp14.aus.stglabs.ibm.com>

From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Tue, 22 May 2018 11:21:10 -0500

> When enabling the sub-CRQ IRQ a previous update sent a H_EOI prior
> to the enablement to clear any pending interrupts that may be present
> across a partition migration. This fixed a firmware bug where a
> migration could erroneously indicate that a H_EOI was pending.
> 
> The H_EOI should only be sent when enabling during a mobility
> event though. Doing so at other times could be wrong and can produce
> extra driver output when IRQs are enabled when doing TX completion.
> 
> Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>

Applied, thanks.

^ permalink raw reply

* Re: [PATCH net-next] hv_netvsc: Add handlers for ethtool get/set msg level
From: David Miller @ 2018-05-23 18:59 UTC (permalink / raw)
  To: haiyangz, haiyangz
  Cc: netdev, kys, sthemmin, olaf, vkuznets, devel, linux-kernel
In-Reply-To: <20180522182934.31515-1-haiyangz@linuxonhyperv.com>

From: Haiyang Zhang <haiyangz@linuxonhyperv.com>
Date: Tue, 22 May 2018 11:29:34 -0700

> From: Haiyang Zhang <haiyangz@microsoft.com>
> 
> The handlers for ethtool get/set msg level are missing from netvsc.
> This patch adds them.
> 
> Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>

Applied.

^ permalink raw reply

* Re: [net-next PATCH v2 1/4] net: Refactor XPS for CPUs and Rx queues
From: Nambiar, Amritha @ 2018-05-23 18:59 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Linux Kernel Network Developers, David S. Miller, Alexander Duyck,
	Sridhar Samudrala, Eric Dumazet, Hannes Frederic Sowa
In-Reply-To: <CALx6S37VqLRGUXD5OCbXCL4Fheb7e+JjtDPa8vvXM7bWBWNw-w@mail.gmail.com>

On 5/17/2018 9:08 PM, Tom Herbert wrote:
> On Tue, May 15, 2018 at 6:26 PM, Amritha Nambiar
> <amritha.nambiar@intel.com> wrote:
>> Refactor XPS code to support Tx queue selection based on
>> CPU map or Rx queue map.
>>
>> Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
>> ---
>>  include/linux/cpumask.h   |   11 ++
>>  include/linux/netdevice.h |   72 +++++++++++++++-
>>  net/core/dev.c            |  208 +++++++++++++++++++++++++++++----------------
>>  net/core/net-sysfs.c      |    4 -
>>  4 files changed, 215 insertions(+), 80 deletions(-)
>>
>> diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
>> index bf53d89..57f20a0 100644
>> --- a/include/linux/cpumask.h
>> +++ b/include/linux/cpumask.h
>> @@ -115,12 +115,17 @@ extern struct cpumask __cpu_active_mask;
>>  #define cpu_active(cpu)                ((cpu) == 0)
>>  #endif
>>
>> -/* verify cpu argument to cpumask_* operators */
>> -static inline unsigned int cpumask_check(unsigned int cpu)
>> +static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits)
>>  {
>>  #ifdef CONFIG_DEBUG_PER_CPU_MAPS
>> -       WARN_ON_ONCE(cpu >= nr_cpumask_bits);
>> +       WARN_ON_ONCE(cpu >= bits);
>>  #endif /* CONFIG_DEBUG_PER_CPU_MAPS */
>> +}
>> +
>> +/* verify cpu argument to cpumask_* operators */
>> +static inline unsigned int cpumask_check(unsigned int cpu)
>> +{
>> +       cpu_max_bits_warn(cpu, nr_cpumask_bits);
>>         return cpu;
>>  }
>>
>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> index 03ed492..c2eeb36 100644
>> --- a/include/linux/netdevice.h
>> +++ b/include/linux/netdevice.h
>> @@ -730,10 +730,21 @@ struct xps_map {
>>   */
>>  struct xps_dev_maps {
>>         struct rcu_head rcu;
>> -       struct xps_map __rcu *cpu_map[0];
>> +       struct xps_map __rcu *attr_map[0];
>>  };
>> -#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) +         \
>> +
>> +#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) +     \
>>         (nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))
>> +
>> +#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\
>> +       (_rxqs * (_tcs) * sizeof(struct xps_map *)))
>> +
>> +enum xps_map_type {
>> +       XPS_MAP_RXQS,
>> +       XPS_MAP_CPUS,
>> +       __XPS_MAP_MAX
>> +};
>> +
>>  #endif /* CONFIG_XPS */
>>
>>  #define TC_MAX_QUEUE   16
>> @@ -1891,7 +1902,7 @@ struct net_device {
>>         int                     watchdog_timeo;
>>
>>  #ifdef CONFIG_XPS
>> -       struct xps_dev_maps __rcu *xps_maps;
>> +       struct xps_dev_maps __rcu *xps_maps[__XPS_MAP_MAX];
>>  #endif
>>  #ifdef CONFIG_NET_CLS_ACT
>>         struct mini_Qdisc __rcu *miniq_egress;
>> @@ -3229,6 +3240,61 @@ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
>>  #ifdef CONFIG_XPS
>>  int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>                         u16 index);
>> +int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
>> +                         u16 index, enum xps_map_type type);
>> +
>> +static inline bool attr_test_mask(unsigned long j, const unsigned long *mask,
>> +                                 unsigned int nr_bits)
>> +{
>> +       cpu_max_bits_warn(j, nr_bits);
>> +       return test_bit(j, mask);
>> +}
>> +
>> +static inline bool attr_test_online(unsigned long j,
>> +                                   const unsigned long *online_mask,
>> +                                   unsigned int nr_bits)
>> +{
>> +       cpu_max_bits_warn(j, nr_bits);
>> +
>> +       if (online_mask)
>> +               return test_bit(j, online_mask);
>> +
>> +       if (j >= 0 && j < nr_bits)
>> +               return true;
>> +
>> +       return false;
>> +}
>> +
>> +static inline unsigned int attrmask_next(int n, const unsigned long *srcp,
>> +                                        unsigned int nr_bits)
>> +{
>> +       /* -1 is a legal arg here. */
>> +       if (n != -1)
>> +               cpu_max_bits_warn(n, nr_bits);
>> +
>> +       if (srcp)
>> +               return find_next_bit(srcp, nr_bits, n + 1);
>> +
>> +       return n + 1;
>> +}
>> +
>> +static inline int attrmask_next_and(int n, const unsigned long *src1p,
>> +                                   const unsigned long *src2p,
>> +                                   unsigned int nr_bits)
>> +{
>> +       /* -1 is a legal arg here. */
>> +       if (n != -1)
>> +               cpu_max_bits_warn(n, nr_bits);
>> +
>> +       if (src1p && src2p)
>> +               return find_next_and_bit(src1p, src2p, nr_bits, n + 1);
>> +       else if (src1p)
>> +               return find_next_bit(src1p, nr_bits, n + 1);
>> +       else if (src2p)
>> +               return find_next_bit(src2p, nr_bits, n + 1);
>> +
>> +       return n + 1;
>> +}
>>  #else
>>  static inline int netif_set_xps_queue(struct net_device *dev,
>>                                       const struct cpumask *mask,
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index 9f43901..7e5dfdb 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -2092,7 +2092,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
>>         int pos;
>>
>>         if (dev_maps)
>> -               map = xmap_dereference(dev_maps->cpu_map[tci]);
>> +               map = xmap_dereference(dev_maps->attr_map[tci]);
>>         if (!map)
>>                 return false;
>>
>> @@ -2105,7 +2105,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
>>                         break;
>>                 }
>>
>> -               RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
>> +               RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
>>                 kfree_rcu(map, rcu);
>>                 return false;
>>         }
>> @@ -2125,7 +2125,7 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
>>                 int i, j;
>>
>>                 for (i = count, j = offset; i--; j++) {
>> -                       if (!remove_xps_queue(dev_maps, cpu, j))
>> +                       if (!remove_xps_queue(dev_maps, tci, j))
>>                                 break;
>>                 }
>>
>> @@ -2138,30 +2138,47 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
>>  static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
>>                                    u16 count)
>>  {
>> +       const unsigned long *possible_mask = NULL;
>> +       enum xps_map_type type = XPS_MAP_RXQS;
>>         struct xps_dev_maps *dev_maps;
>> -       int cpu, i;
>>         bool active = false;
>> +       unsigned int nr_ids;
>> +       int i, j;
>>
>>         mutex_lock(&xps_map_mutex);
>> -       dev_maps = xmap_dereference(dev->xps_maps);
>>
>> -       if (!dev_maps)
>> -               goto out_no_maps;
>> +       while (type < __XPS_MAP_MAX) {
>> +               dev_maps = xmap_dereference(dev->xps_maps[type]);
>> +               if (!dev_maps)
>> +                       goto out_no_maps;
>> +
>> +               if (type == XPS_MAP_CPUS) {
>> +                       if (num_possible_cpus() > 1)
>> +                               possible_mask = cpumask_bits(cpu_possible_mask);
>> +                       nr_ids = nr_cpu_ids;
>> +               } else if (type == XPS_MAP_RXQS) {
>> +                       nr_ids = dev->num_rx_queues;
>> +               }
> type is an enum so this should be a switch

Will fix in v3.

> 
>>
>> -       for_each_possible_cpu(cpu)
>> -               active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
>> -                                              offset, count);
>> +               for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
>> +                    j < nr_ids;)
>> +                       active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
>> +                                                      count);
>> +               if (!active) {
>> +                       RCU_INIT_POINTER(dev->xps_maps[type], NULL);
>> +                       kfree_rcu(dev_maps, rcu);
>> +               }
>>
>> -       if (!active) {
>> -               RCU_INIT_POINTER(dev->xps_maps, NULL);
>> -               kfree_rcu(dev_maps, rcu);
>> +               if (type == XPS_MAP_CPUS) {
>> +                       for (i = offset + (count - 1); count--; i--)
>> +                               netdev_queue_numa_node_write(
>> +                                       netdev_get_tx_queue(dev, i),
>> +                                                           NUMA_NO_NODE);
>> +               }
>> +out_no_maps:
>> +               type++;
>>         }
>>
>> -       for (i = offset + (count - 1); count--; i--)
>> -               netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
>> -                                            NUMA_NO_NODE);
>> -
>> -out_no_maps:
>>         mutex_unlock(&xps_map_mutex);
>>  }
>>
>> @@ -2170,11 +2187,11 @@ static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
>>         netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
>>  }
>>
>> -static struct xps_map *expand_xps_map(struct xps_map *map,
>> -                                     int cpu, u16 index)
>> +static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
>> +                                     u16 index, enum xps_map_type type)
>>  {
>> -       struct xps_map *new_map;
>>         int alloc_len = XPS_MIN_MAP_ALLOC;
>> +       struct xps_map *new_map = NULL;
>>         int i, pos;
>>
>>         for (pos = 0; map && pos < map->len; pos++) {
>> @@ -2183,7 +2200,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
>>                 return map;
>>         }
>>
>> -       /* Need to add queue to this CPU's existing map */
>> +       /* Need to add tx-queue to this CPU's/rx-queue's existing map */
>>         if (map) {
>>                 if (pos < map->alloc_len)
>>                         return map;
>> @@ -2191,9 +2208,14 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
>>                 alloc_len = map->alloc_len * 2;
>>         }
>>
>> -       /* Need to allocate new map to store queue on this CPU's map */
>> -       new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
>> -                              cpu_to_node(cpu));
>> +       /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
>> +        *  map
>> +        */
>> +       if (type == XPS_MAP_RXQS)
>> +               new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
>> +       else if (type == XPS_MAP_CPUS)
>> +               new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
>> +                                      cpu_to_node(attr_index));
> switch here also

Will fix in v3.

> 
>>         if (!new_map)
>>                 return NULL;
>>
>> @@ -2205,14 +2227,16 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
>>         return new_map;
>>  }
>>
>> -int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>> -                       u16 index)
>> +int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
>> +                         u16 index, enum xps_map_type type)
>>  {
>> +       const unsigned long *online_mask = NULL, *possible_mask = NULL;
>>         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
>> -       int i, cpu, tci, numa_node_id = -2;
>> +       int i, j, tci, numa_node_id = -2;
>>         int maps_sz, num_tc = 1, tc = 0;
>>         struct xps_map *map, *new_map;
>>         bool active = false;
>> +       unsigned int nr_ids;
>>
>>         if (dev->num_tc) {
>>                 num_tc = dev->num_tc;
>> @@ -2221,16 +2245,33 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>                         return -EINVAL;
>>         }
>>
>> -       maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
>> +       switch (type) {
>> +       case XPS_MAP_RXQS:
>> +               maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
>> +               dev_maps = xmap_dereference(dev->xps_maps[XPS_MAP_RXQS]);
>> +               nr_ids = dev->num_rx_queues;
>> +               break;
>> +       case XPS_MAP_CPUS:
>> +               maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
>> +               if (num_possible_cpus() > 1) {
>> +                       online_mask = cpumask_bits(cpu_online_mask);
>> +                       possible_mask = cpumask_bits(cpu_possible_mask);
>> +               }
>> +               dev_maps = xmap_dereference(dev->xps_maps[XPS_MAP_CPUS]);
>> +               nr_ids = nr_cpu_ids;
>> +               break;
>> +       default:
>> +               return -EINVAL;
>> +       }
>> +
>>         if (maps_sz < L1_CACHE_BYTES)
>>                 maps_sz = L1_CACHE_BYTES;
>>
>>         mutex_lock(&xps_map_mutex);
>>
>> -       dev_maps = xmap_dereference(dev->xps_maps);
>> -
>>         /* allocate memory for queue storage */
>> -       for_each_cpu_and(cpu, cpu_online_mask, mask) {
>> +       for (j = -1; j = attrmask_next_and(j, online_mask, mask, nr_ids),
>> +            j < nr_ids;) {
>>                 if (!new_dev_maps)
>>                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
>>                 if (!new_dev_maps) {
>> @@ -2238,73 +2279,81 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>                         return -ENOMEM;
>>                 }
>>
>> -               tci = cpu * num_tc + tc;
>> -               map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
>> +               tci = j * num_tc + tc;
>> +               map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
>>                                  NULL;
>>
>> -               map = expand_xps_map(map, cpu, index);
>> +               map = expand_xps_map(map, j, index, type);
>>                 if (!map)
>>                         goto error;
>>
>> -               RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
>> +               RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
>>         }
>>
>>         if (!new_dev_maps)
>>                 goto out_no_new_maps;
>>
>> -       for_each_possible_cpu(cpu) {
>> +       for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
>> +            j < nr_ids;) {
>>                 /* copy maps belonging to foreign traffic classes */
>> -               for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
>> +               for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
>>                         /* fill in the new device map from the old device map */
>> -                       map = xmap_dereference(dev_maps->cpu_map[tci]);
>> -                       RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
>> +                       map = xmap_dereference(dev_maps->attr_map[tci]);
>> +                       RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
>>                 }
>>
>>                 /* We need to explicitly update tci as prevous loop
>>                  * could break out early if dev_maps is NULL.
>>                  */
>> -               tci = cpu * num_tc + tc;
>> +               tci = j * num_tc + tc;
>>
>> -               if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
>> -                       /* add queue to CPU maps */
>> +               if (attr_test_mask(j, mask, nr_ids) &&
>> +                   attr_test_online(j, online_mask, nr_ids)) {
>> +                       /* add tx-queue to CPU/rx-queue maps */
>>                         int pos = 0;
>>
>> -                       map = xmap_dereference(new_dev_maps->cpu_map[tci]);
>> +                       map = xmap_dereference(new_dev_maps->attr_map[tci]);
>>                         while ((pos < map->len) && (map->queues[pos] != index))
>>                                 pos++;
>>
>>                         if (pos == map->len)
>>                                 map->queues[map->len++] = index;
>>  #ifdef CONFIG_NUMA
>> -                       if (numa_node_id == -2)
>> -                               numa_node_id = cpu_to_node(cpu);
>> -                       else if (numa_node_id != cpu_to_node(cpu))
>> -                               numa_node_id = -1;
>> +                       if (type == XPS_MAP_CPUS) {
>> +                               if (numa_node_id == -2)
>> +                                       numa_node_id = cpu_to_node(j);
>> +                               else if (numa_node_id != cpu_to_node(j))
>> +                                       numa_node_id = -1;
>> +                       }
>>  #endif
>>                 } else if (dev_maps) {
>>                         /* fill in the new device map from the old device map */
>> -                       map = xmap_dereference(dev_maps->cpu_map[tci]);
>> -                       RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
>> +                       map = xmap_dereference(dev_maps->attr_map[tci]);
>> +                       RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
>>                 }
>>
>>                 /* copy maps belonging to foreign traffic classes */
>>                 for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
>>                         /* fill in the new device map from the old device map */
>> -                       map = xmap_dereference(dev_maps->cpu_map[tci]);
>> -                       RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
>> +                       map = xmap_dereference(dev_maps->attr_map[tci]);
>> +                       RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
>>                 }
>>         }
>>
>> -       rcu_assign_pointer(dev->xps_maps, new_dev_maps);
>> +       if (type == XPS_MAP_RXQS)
>> +               rcu_assign_pointer(dev->xps_maps[XPS_MAP_RXQS], new_dev_maps);
>> +       else if (type == XPS_MAP_CPUS)
>> +               rcu_assign_pointer(dev->xps_maps[XPS_MAP_CPUS], new_dev_maps);
>>
>>         /* Cleanup old maps */
>>         if (!dev_maps)
>>                 goto out_no_old_maps;
>>
>> -       for_each_possible_cpu(cpu) {
>> -               for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
>> -                       new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
>> -                       map = xmap_dereference(dev_maps->cpu_map[tci]);
>> +       for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
>> +            j < nr_ids;) {
>> +               for (i = num_tc, tci = j * num_tc; i--; tci++) {
>> +                       new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
>> +                       map = xmap_dereference(dev_maps->attr_map[tci]);
>>                         if (map && map != new_map)
>>                                 kfree_rcu(map, rcu);
>>                 }
>> @@ -2317,19 +2366,23 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>         active = true;
>>
>>  out_no_new_maps:
>> -       /* update Tx queue numa node */
>> -       netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
>> -                                    (numa_node_id >= 0) ? numa_node_id :
>> -                                    NUMA_NO_NODE);
>> +       if (type == XPS_MAP_CPUS) {
>> +               /* update Tx queue numa node */
>> +               netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
>> +                                            (numa_node_id >= 0) ?
>> +                                            numa_node_id : NUMA_NO_NODE);
>> +       }
>>
>>         if (!dev_maps)
>>                 goto out_no_maps;
>>
>> -       /* removes queue from unused CPUs */
>> -       for_each_possible_cpu(cpu) {
>> -               for (i = tc, tci = cpu * num_tc; i--; tci++)
>> +       /* removes tx-queue from unused CPUs/rx-queues */
>> +       for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
>> +            j < nr_ids;) {
>> +               for (i = tc, tci = j * num_tc; i--; tci++)
>>                         active |= remove_xps_queue(dev_maps, tci, index);
>> -               if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
>> +               if (!attr_test_mask(j, mask, nr_ids) ||
>> +                   !attr_test_online(j, online_mask, nr_ids))
>>                         active |= remove_xps_queue(dev_maps, tci, index);
>>                 for (i = num_tc - tc, tci++; --i; tci++)
>>                         active |= remove_xps_queue(dev_maps, tci, index);
>> @@ -2337,7 +2390,10 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>
>>         /* free map if not active */
>>         if (!active) {
>> -               RCU_INIT_POINTER(dev->xps_maps, NULL);
>> +               if (type == XPS_MAP_RXQS)
>> +                       RCU_INIT_POINTER(dev->xps_maps[XPS_MAP_RXQS], NULL);
>> +               else if (type == XPS_MAP_CPUS)
>> +                       RCU_INIT_POINTER(dev->xps_maps[XPS_MAP_CPUS], NULL);
>>                 kfree_rcu(dev_maps, rcu);
>>         }
>>
>> @@ -2347,11 +2403,12 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>         return 0;
>>  error:
>>         /* remove any maps that we added */
>> -       for_each_possible_cpu(cpu) {
>> -               for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
>> -                       new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
>> +       for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
>> +            j < nr_ids;) {
>> +               for (i = num_tc, tci = j * num_tc; i--; tci++) {
>> +                       new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
>>                         map = dev_maps ?
>> -                             xmap_dereference(dev_maps->cpu_map[tci]) :
>> +                             xmap_dereference(dev_maps->attr_map[tci]) :
>>                               NULL;
>>                         if (new_map && new_map != map)
>>                                 kfree(new_map);
>> @@ -2363,6 +2420,13 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>         kfree(new_dev_maps);
>>         return -ENOMEM;
>>  }
>> +
>> +int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>> +                       u16 index)
>> +{
>> +       return __netif_set_xps_queue(dev, cpumask_bits(mask), index,
>> +                                    XPS_MAP_CPUS);
>> +}
>>  EXPORT_SYMBOL(netif_set_xps_queue);
>>
>>  #endif
>> @@ -3402,7 +3466,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>>         int queue_index = -1;
>>
>>         rcu_read_lock();
>> -       dev_maps = rcu_dereference(dev->xps_maps);
>> +       dev_maps = rcu_dereference(dev->xps_maps[XPS_MAP_CPUS]);
>>         if (dev_maps) {
>>                 unsigned int tci = skb->sender_cpu - 1;
>>
>> @@ -3411,7 +3475,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>>                         tci += netdev_get_prio_tc_map(dev, skb->priority);
>>                 }
>>
>> -               map = rcu_dereference(dev_maps->cpu_map[tci]);
>> +               map = rcu_dereference(dev_maps->attr_map[tci]);
>>                 if (map) {
>>                         if (map->len == 1)
>>                                 queue_index = map->queues[0];
>> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
>> index c476f07..d7abd33 100644
>> --- a/net/core/net-sysfs.c
>> +++ b/net/core/net-sysfs.c
>> @@ -1227,13 +1227,13 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
>>         }
>>
>>         rcu_read_lock();
>> -       dev_maps = rcu_dereference(dev->xps_maps);
>> +       dev_maps = rcu_dereference(dev->xps_maps[XPS_MAP_CPUS]);
>>         if (dev_maps) {
>>                 for_each_possible_cpu(cpu) {
>>                         int i, tci = cpu * num_tc + tc;
>>                         struct xps_map *map;
>>
>> -                       map = rcu_dereference(dev_maps->cpu_map[tci]);
>> +                       map = rcu_dereference(dev_maps->attr_map[tci]);
>>                         if (!map)
>>                                 continue;
>>
>>

^ permalink raw reply

* Re: [net-next] i40iw/i40e: Remove link dependency on i40e
From: David Miller @ 2018-05-23 19:00 UTC (permalink / raw)
  To: jeffrey.t.kirsher
  Cc: dledford, jgg, sindhu.devale, netdev, linux-rdma, nhorman,
	sassmann, jogreene, shiraz.saleem
In-Reply-To: <20180522203831.20624-1-jeffrey.t.kirsher@intel.com>

From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Tue, 22 May 2018 13:38:31 -0700

> From: Sindhu Devale <sindhu.devale@intel.com>
> 
> Currently i40iw is dependent on i40e symbols
> i40e_register_client and i40e_unregister_client due to
> which i40iw cannot be loaded without i40e being loaded.
> 
> This patch allows RDMA driver to build and load without
> linking to LAN driver and without LAN driver being loaded
> first. Once the LAN driver is loaded, the RDMA driver
> is notified through the netdevice notifiers to register
> as client to the LAN driver. Add function pointers to IDC
> register/unregister in the private VSI structure. This
> allows a RDMA driver to build without linking to i40e.
> 
> Signed-off-by: Sindhu Devale <sindhu.devale@intel.com>
> Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
> Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>

If two drivers depend upon each other, and a change to one can create
an incompatibility with the other, by definition they must be upgraded
together.

This doesn't even get into recompiling or anything like that, it's a
simple fact of life.

I'm not applying this, sorry.

^ permalink raw reply

* Re: [PATCH V4 8/8] dt-bindings: stm32: add compatible for syscon
From: Rob Herring @ 2018-05-23 19:03 UTC (permalink / raw)
  To: Christophe Roullier
  Cc: mark.rutland, mcoquelin.stm32, alexandre.torgue, peppe.cavallaro,
	devicetree, linux-arm-kernel, netdev, andrew
In-Reply-To: <1527090479-5263-9-git-send-email-christophe.roullier@st.com>

On Wed, May 23, 2018 at 05:47:59PM +0200, Christophe Roullier wrote:
> This patch describes syscon DT bindings.
> 
> Signed-off-by: Christophe Roullier <christophe.roullier@st.com>
> ---
>  Documentation/devicetree/bindings/arm/stm32.txt            | 10 ----------
>  .../devicetree/bindings/arm/stm32/stm32-syscon.txt         | 14 ++++++++++++++
>  Documentation/devicetree/bindings/arm/stm32/stm32.txt      | 10 ++++++++++
>  3 files changed, 24 insertions(+), 10 deletions(-)
>  delete mode 100644 Documentation/devicetree/bindings/arm/stm32.txt
>  create mode 100644 Documentation/devicetree/bindings/arm/stm32/stm32-syscon.txt
>  create mode 100644 Documentation/devicetree/bindings/arm/stm32/stm32.txt

In the future, use the -M option so file moves don't show any diff.

Reviewed-by: Rob Herring <robh@kernel.org>

^ permalink raw reply

* Re: [PATCH net] net: ipv4: add missing RTA_TABLE to rtm_ipv4_policy
From: David Miller @ 2018-05-23 19:04 UTC (permalink / raw)
  To: roopa; +Cc: netdev, eric.dumazet
In-Reply-To: <1527021891-6837-1-git-send-email-roopa@cumulusnetworks.com>

From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Tue, 22 May 2018 13:44:51 -0700

> From: Roopa Prabhu <roopa@cumulusnetworks.com>
> 
> Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>

Applied and queued up for -stable.

Please provide an appropriate Fixes: tag next time.

^ permalink raw reply

* Re: [PATCH net-next v5 0/3] fib rule selftest
From: David Miller @ 2018-05-23 19:14 UTC (permalink / raw)
  To: roopa; +Cc: netdev, nikolay, dsa, idosch, eric.dumazet
In-Reply-To: <1527023009-13609-1-git-send-email-roopa@cumulusnetworks.com>

From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Tue, 22 May 2018 14:03:26 -0700

> From: Roopa Prabhu <roopa@cumulusnetworks.com>
> 
> This series adds a new test to test fib rules.
> ip route get is used to test fib rule matches.
> This series also extends ip route get to match on
> sport and dport to test recent support of sport
> and dport fib rule match.
 ...

Looks good, series applied, thanks.

^ permalink raw reply

* Re: [PATCH net] net: phy: broadcom: Fix auxiliary control register reads
From: David Miller @ 2018-05-23 19:19 UTC (permalink / raw)
  To: f.fainelli; +Cc: netdev, jon.mason, andrew, linux-kernel
In-Reply-To: <20180522232227.6355-1-f.fainelli@gmail.com>

From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 22 May 2018 16:22:26 -0700

> We are currently doing auxiliary control register reads with the shadow
> register value 0b111 (0x7) which incidentally is also the selector value
> that should be present in bits [2:0]. Fix this by using the appropriate
> selector mask which is defined (MII_BCM54XX_AUXCTL_SHDWSEL_MASK).
> 
> This does not have a functional impact yet because we always access the
> MII_BCM54XX_AUXCTL_SHDWSEL_MISC (0x7) register in the current code.
> This might change at some point though.
> 
> Fixes: 5b4e29005123 ("net: phy: broadcom: add bcm54xx_auxctl_read")
> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
> ---
> David, please queue this for -stable as well, thank you!

Applied and queued up for -stable.

^ permalink raw reply

* Re: [net-next PATCH v2 2/4] net: Enable Tx queue selection based on Rx queues
From: Nambiar, Amritha @ 2018-05-23 19:19 UTC (permalink / raw)
  To: Willem de Bruijn, Tom Herbert
  Cc: Linux Kernel Network Developers, David S. Miller, Alexander Duyck,
	Sridhar Samudrala, Eric Dumazet, Hannes Frederic Sowa
In-Reply-To: <CAF=yD-JXpiJwxM_mHvAgJ6qhsgq4uOZYbsMBVvcOmZawbueayQ@mail.gmail.com>

On 5/19/2018 1:13 PM, Willem de Bruijn wrote:
> On Fri, May 18, 2018 at 12:03 AM, Tom Herbert <tom@herbertland.com> wrote:
>> On Tue, May 15, 2018 at 6:26 PM, Amritha Nambiar
>> <amritha.nambiar@intel.com> wrote:
>>> This patch adds support to pick Tx queue based on the Rx queue map
>>> configuration set by the admin through the sysfs attribute
>>> for each Tx queue. If the user configuration for receive
>>> queue map does not apply, then the Tx queue selection falls back
>>> to CPU map based selection and finally to hashing.
>>>
>>> Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
>>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>>> ---
>>>  include/net/sock.h       |   18 ++++++++++++++++++
>>>  net/core/dev.c           |   36 +++++++++++++++++++++++++++++-------
>>>  net/core/sock.c          |    5 +++++
>>>  net/ipv4/tcp_input.c     |    7 +++++++
>>>  net/ipv4/tcp_ipv4.c      |    1 +
>>>  net/ipv4/tcp_minisocks.c |    1 +
>>>  6 files changed, 61 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/include/net/sock.h b/include/net/sock.h
>>> index 4f7c584..0613f63 100644
>>> --- a/include/net/sock.h
>>> +++ b/include/net/sock.h
>>> @@ -139,6 +139,8 @@ typedef __u64 __bitwise __addrpair;
>>>   *     @skc_node: main hash linkage for various protocol lookup tables
>>>   *     @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
>>>   *     @skc_tx_queue_mapping: tx queue number for this connection
>>> + *     @skc_rx_queue_mapping: rx queue number for this connection
>>> + *     @skc_rx_ifindex: rx ifindex for this connection
>>>   *     @skc_flags: place holder for sk_flags
>>>   *             %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
>>>   *             %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
>>> @@ -215,6 +217,10 @@ struct sock_common {
>>>                 struct hlist_nulls_node skc_nulls_node;
>>>         };
>>>         int                     skc_tx_queue_mapping;
>>> +#ifdef CONFIG_XPS
>>> +       int                     skc_rx_queue_mapping;
>>> +       int                     skc_rx_ifindex;
>>
>> Isn't this increasing size of sock_common for a narrow use case functionality?
> 
> You can get the device from the already recorded sk_napi_id.
> Sadly, not the queue number as far as I can see.
> 
I plan to not have the ifindex cached in the sock_common, but retain the
rx_queue only. This way, it'll look similar to skb_tx_hash where
rx_queue recorded is used and if not, fall through to flow hash
calculation. Likewise, we use the rx_queue mapped and fall through to
CPU map on failures.
> 
>>> +static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff *skb)
>>> +{
>>> +#ifdef CONFIG_XPS
>>> +       sk->sk_rx_ifindex = skb->skb_iif;
>>> +       sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
>>> +#endif
>>> +}
>>> +
> 
> Instead of adding this function and calls to it in many locations in
> the stack, you can expand sk_mark_napi_id.
> 
> Also, it is not clear why this should be called in locations where
> sk_mark_napi_id is not.
> 
Makes sense, I will add this as part of sk_mark_napi_id.

> 
>>> +static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>>> +{
>>> +#ifdef CONFIG_XPS
>>> +       enum xps_map_type i = XPS_MAP_RXQS;
>>> +       struct xps_dev_maps *dev_maps;
>>> +       struct sock *sk = skb->sk;
>>> +       int queue_index = -1;
>>> +       unsigned int tci = 0;
>>> +
>>> +       if (sk && sk->sk_rx_queue_mapping <= dev->real_num_rx_queues &&
>>> +           dev->ifindex == sk->sk_rx_ifindex)
>>> +               tci = sk->sk_rx_queue_mapping;
>>> +
>>> +       rcu_read_lock();
>>> +       while (queue_index < 0 && i < __XPS_MAP_MAX) {
>>> +               if (i == XPS_MAP_CPUS)
>>
>> This while loop typifies exactly why I don't think the XPS maps should
>> be an array.
> 
> +1
> 
Okay, I will change this to two maps with separate pointers.

^ permalink raw reply

* aio poll and a new in-kernel poll API V13
From: Christoph Hellwig @ 2018-05-23 19:19 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-api,
	linux-kernel

Hi all,

this series adds support for the IOCB_CMD_POLL operation to poll for the
readiness of file descriptors using the aio subsystem.  The API is based
on patches that existed in RHAS2.1 and RHEL3, which means it already is
supported by libaio.  To implement the poll support efficiently new
methods to poll are introduced in struct file_operations:  get_poll_head
and poll_mask.  The first one returns a wait_queue_head to wait on
(lifetime is bound by the file), and the second does a non-blocking
check for the POLL* events.  This allows aio poll to work without
any additional context switches, unlike epoll.

This series sits on top of the aio-fsync series that also includes
support for io_pgetevents.

The changes were sponsored by Scylladb, and improve performance
of the seastar framework up to 10%, while also removing the need
for a privileged SCHED_FIFO epoll listener thread.

    git://git.infradead.org/users/hch/vfs.git aio-poll.13

Gitweb:

    http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/aio-poll.13

Libaio changes:

    https://pagure.io/libaio.git io-poll

Seastar changes (not updated for the new io_pgetevents ABI yet):

    https://github.com/avikivity/seastar/commits/aio

Changes since v12:
 - remove iocb from ki_list only after ki_cancel has completed
 - fix __poll_t annotations
 - turn __poll_t sparse checkin on by default
 - call fput after aio_complete
 - only add the iocb to active_reqs if we wait for it

Changes since v11:
 - simplify cancellation by completing poll requests from a workqueue
   if we can't take the ctx_lock

Changes since v10:
 - fixed a mismerge that let a sock_rps_record_flow sneak into
   tcp_poll_mask
 - remove the now unused struct proto_ops get_poll_head method

Changes since v9:
 - add to the delayed_cancel_reqs earlier to avoid a race
 - get rid of POLL_TO_PTR magic

Changes since v8:
 - make delayed cancellation conditional again
 - add a cancel_kiocb file operation to split delayed vs normal cancel

Changes since v7:
 - make delayed cancellation safe and unconditional

Changes since v6:
 - reworked cancellation

Changes since v5:
 - small changelog updates
 - rebased on top of the aio-fsync changes

Changes since v4:
 - rebased on top of Linux 4.16-rc4

Changes since v3:
 - remove the pre-sleep ->poll_mask call in vfs_poll,
   allow ->get_poll_head to return POLL* values.

Changes since v2:
 - removed a double initialization
 - new vfs_get_poll_head helper
 - document that ->get_poll_head can return NULL
 - call ->poll_mask before sleeping
 - various ACKs
 - add conversion of random to ->poll_mask
 - add conversion of af_alg to ->poll_mask
 - lacking ->poll_mask support now returns -EINVAL for IOCB_CMD_POLL
 - reshuffled the series so that prep patches and everything not
   requiring the new in-kernel poll API is in the beginning

Changes since v1:
 - handle the NULL ->poll case in vfs_poll
 - dropped the file argument to the ->poll_mask socket operation
 - replace the ->pre_poll socket operation with ->get_poll_head as
   in the file operations

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply

* [PATCH 01/33] fix io_destroy()/aio_complete() race
From: Christoph Hellwig @ 2018-05-23 19:19 UTC (permalink / raw)
  To: viro
  Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-api,
	linux-kernel, stable
In-Reply-To: <20180523192022.1703-1-hch@lst.de>

From: Al Viro <viro@zeniv.linux.org.uk>

If io_destroy() gets to cancelling everything that can be cancelled and
gets to kiocb_cancel() calling the function the driver has left in
->ki_cancel, it becomes vulnerable to a race with IO completion.  At that
point req is already taken off the list and aio_complete() does *NOT* spin
until we (in free_ioctx_users()) release ->ctx_lock.  As a result, it
proceeds to kiocb_free(), freeing req just as it gets passed to ->ki_cancel().

Fix is simple - remove from the list after the call of kiocb_cancel().  All
instances of ->ki_cancel() already have to cope with being called with
the iocb still on the list - that's what happens in io_cancel(2).

Cc: stable@kernel.org
Fixes: 0460fef2a921 "aio: use cancellation list lazily"
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/aio.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 755d3f57bcc8..1c383bb44b2d 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -639,9 +639,8 @@ static void free_ioctx_users(struct percpu_ref *ref)
 	while (!list_empty(&ctx->active_reqs)) {
 		req = list_first_entry(&ctx->active_reqs,
 				       struct aio_kiocb, ki_list);
-
-		list_del_init(&req->ki_list);
 		kiocb_cancel(req);
+		list_del_init(&req->ki_list);
 	}
 
 	spin_unlock_irq(&ctx->ctx_lock);
-- 
2.17.0

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH 02/33] uapi: turn __poll_t sparse checkin on by default
From: Christoph Hellwig @ 2018-05-23 19:19 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-api,
	linux-kernel
In-Reply-To: <20180523192022.1703-1-hch@lst.de>

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/uapi/linux/types.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/include/uapi/linux/types.h b/include/uapi/linux/types.h
index cd4f0b897a48..2fce8b6876e9 100644
--- a/include/uapi/linux/types.h
+++ b/include/uapi/linux/types.h
@@ -49,11 +49,7 @@ typedef __u32 __bitwise __wsum;
 #define __aligned_be64 __be64 __attribute__((aligned(8)))
 #define __aligned_le64 __le64 __attribute__((aligned(8)))
 
-#ifdef __CHECK_POLL
 typedef unsigned __bitwise __poll_t;
-#else
-typedef unsigned __poll_t;
-#endif
 
 #endif /*  __ASSEMBLY__ */
 #endif /* _UAPI_LINUX_TYPES_H */
-- 
2.17.0

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH 03/33] fs: unexport poll_schedule_timeout
From: Christoph Hellwig @ 2018-05-23 19:19 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-api,
	linux-kernel
In-Reply-To: <20180523192022.1703-1-hch@lst.de>

No users outside of select.c.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/select.c          | 3 +--
 include/linux/poll.h | 2 --
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/select.c b/fs/select.c
index ba879c51288f..a87f396f0313 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -233,7 +233,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 	add_wait_queue(wait_address, &entry->wait);
 }
 
-int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
+static int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
 			  ktime_t *expires, unsigned long slack)
 {
 	int rc = -EINTR;
@@ -258,7 +258,6 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
 
 	return rc;
 }
-EXPORT_SYMBOL(poll_schedule_timeout);
 
 /**
  * poll_select_set_timeout - helper function to setup the timeout value
diff --git a/include/linux/poll.h b/include/linux/poll.h
index f45ebd017eaa..a3576da63377 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -96,8 +96,6 @@ struct poll_wqueues {
 
 extern void poll_initwait(struct poll_wqueues *pwq);
 extern void poll_freewait(struct poll_wqueues *pwq);
-extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
-				 ktime_t *expires, unsigned long slack);
 extern u64 select_estimate_accuracy(struct timespec64 *tv);
 
 #define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1)
-- 
2.17.0

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH 04/33] fs: cleanup do_pollfd
From: Christoph Hellwig @ 2018-05-23 19:19 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-api,
	linux-kernel
In-Reply-To: <20180523192022.1703-1-hch@lst.de>

Use straightline code with failure handling gotos instead of a lot
of nested conditionals.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/select.c | 48 +++++++++++++++++++++++-------------------------
 1 file changed, 23 insertions(+), 25 deletions(-)

diff --git a/fs/select.c b/fs/select.c
index a87f396f0313..25da26253485 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -812,34 +812,32 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
 				     bool *can_busy_poll,
 				     __poll_t busy_flag)
 {
-	__poll_t mask;
-	int fd;
-
-	mask = 0;
-	fd = pollfd->fd;
-	if (fd >= 0) {
-		struct fd f = fdget(fd);
-		mask = EPOLLNVAL;
-		if (f.file) {
-			/* userland u16 ->events contains POLL... bitmap */
-			__poll_t filter = demangle_poll(pollfd->events) |
-						EPOLLERR | EPOLLHUP;
-			mask = DEFAULT_POLLMASK;
-			if (f.file->f_op->poll) {
-				pwait->_key = filter;
-				pwait->_key |= busy_flag;
-				mask = f.file->f_op->poll(f.file, pwait);
-				if (mask & busy_flag)
-					*can_busy_poll = true;
-			}
-			/* Mask out unneeded events. */
-			mask &= filter;
-			fdput(f);
-		}
+	int fd = pollfd->fd;
+	__poll_t mask = 0, filter;
+	struct fd f;
+
+	if (fd < 0)
+		goto out;
+	mask = EPOLLNVAL;
+	f = fdget(fd);
+	if (!f.file)
+		goto out;
+
+	/* userland u16 ->events contains POLL... bitmap */
+	filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP;
+	mask = DEFAULT_POLLMASK;
+	if (f.file->f_op->poll) {
+		pwait->_key = filter | busy_flag;
+		mask = f.file->f_op->poll(f.file, pwait);
+		if (mask & busy_flag)
+			*can_busy_poll = true;
 	}
+	mask &= filter;		/* Mask out unneeded events. */
+	fdput(f);
+
+out:
 	/* ... and so does ->revents */
 	pollfd->revents = mangle_poll(mask);
-
 	return mask;
 }
 
-- 
2.17.0

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH 05/33] fs: update documentation to mention __poll_t and match the code
From: Christoph Hellwig @ 2018-05-23 19:19 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-api,
	linux-kernel
In-Reply-To: <20180523192022.1703-1-hch@lst.de>

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/filesystems/Locking | 2 +-
 Documentation/filesystems/vfs.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 75d2d57e2c44..220bba28f72b 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -439,7 +439,7 @@ prototypes:
 	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
 	int (*iterate) (struct file *, struct dir_context *);
-	unsigned int (*poll) (struct file *, struct poll_table_struct *);
+	__poll_t (*poll) (struct file *, struct poll_table_struct *);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 5fd325df59e2..f608180ad59d 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -856,7 +856,7 @@ struct file_operations {
 	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
 	int (*iterate) (struct file *, struct dir_context *);
-	unsigned int (*poll) (struct file *, struct poll_table_struct *);
+	__poll_t (*poll) (struct file *, struct poll_table_struct *);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
-- 
2.17.0

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH 06/33] fs: add new vfs_poll and file_can_poll helpers
From: Christoph Hellwig @ 2018-05-23 19:19 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-api,
	linux-kernel
In-Reply-To: <20180523192022.1703-1-hch@lst.de>

These abstract out calls to the poll method in preparation for changes
in how we poll.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 drivers/staging/comedi/drivers/serial2002.c |  4 ++--
 drivers/vfio/virqfd.c                       |  2 +-
 drivers/vhost/vhost.c                       |  2 +-
 fs/eventpoll.c                              |  5 ++---
 fs/select.c                                 | 23 +++++++--------------
 include/linux/poll.h                        | 12 +++++++++++
 mm/memcontrol.c                             |  2 +-
 net/9p/trans_fd.c                           | 18 ++++------------
 virt/kvm/eventfd.c                          |  2 +-
 9 files changed, 32 insertions(+), 38 deletions(-)

diff --git a/drivers/staging/comedi/drivers/serial2002.c b/drivers/staging/comedi/drivers/serial2002.c
index b3f3b4a201af..5471b2212a62 100644
--- a/drivers/staging/comedi/drivers/serial2002.c
+++ b/drivers/staging/comedi/drivers/serial2002.c
@@ -113,7 +113,7 @@ static void serial2002_tty_read_poll_wait(struct file *f, int timeout)
 		long elapsed;
 		__poll_t mask;
 
-		mask = f->f_op->poll(f, &table.pt);
+		mask = vfs_poll(f, &table.pt);
 		if (mask & (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN |
 			    EPOLLHUP | EPOLLERR)) {
 			break;
@@ -136,7 +136,7 @@ static int serial2002_tty_read(struct file *f, int timeout)
 
 	result = -1;
 	if (!IS_ERR(f)) {
-		if (f->f_op->poll) {
+		if (file_can_poll(f)) {
 			serial2002_tty_read_poll_wait(f, timeout);
 
 			if (kernel_read(f, &ch, 1, &pos) == 1)
diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c
index 085700f1be10..2a1be859ee71 100644
--- a/drivers/vfio/virqfd.c
+++ b/drivers/vfio/virqfd.c
@@ -166,7 +166,7 @@ int vfio_virqfd_enable(void *opaque,
 	init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup);
 	init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc);
 
-	events = irqfd.file->f_op->poll(irqfd.file, &virqfd->pt);
+	events = vfs_poll(irqfd.file, &virqfd->pt);
 
 	/*
 	 * Check if there was an event already pending on the eventfd
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index f3bd8e941224..f6022881f147 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -208,7 +208,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file *file)
 	if (poll->wqh)
 		return 0;
 
-	mask = file->f_op->poll(file, &poll->table);
+	mask = vfs_poll(file, &poll->table);
 	if (mask)
 		vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask));
 	if (mask & EPOLLERR) {
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 602ca4285b2e..67db22fe99c5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -884,8 +884,7 @@ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
 
 	pt->_key = epi->event.events;
 	if (!is_file_epoll(epi->ffd.file))
-		return epi->ffd.file->f_op->poll(epi->ffd.file, pt) &
-		       epi->event.events;
+		return vfs_poll(epi->ffd.file, pt) & epi->event.events;
 
 	ep = epi->ffd.file->private_data;
 	poll_wait(epi->ffd.file, &ep->poll_wait, pt);
@@ -2025,7 +2024,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 
 	/* The target file descriptor must support poll */
 	error = -EPERM;
-	if (!tf.file->f_op->poll)
+	if (!file_can_poll(tf.file))
 		goto error_tgt_fput;
 
 	/* Check if EPOLLWAKEUP is allowed */
diff --git a/fs/select.c b/fs/select.c
index 25da26253485..e30def680b2e 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -502,14 +502,10 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
 					continue;
 				f = fdget(i);
 				if (f.file) {
-					const struct file_operations *f_op;
-					f_op = f.file->f_op;
-					mask = DEFAULT_POLLMASK;
-					if (f_op->poll) {
-						wait_key_set(wait, in, out,
-							     bit, busy_flag);
-						mask = (*f_op->poll)(f.file, wait);
-					}
+					wait_key_set(wait, in, out, bit,
+						     busy_flag);
+					mask = vfs_poll(f.file, wait);
+
 					fdput(f);
 					if ((mask & POLLIN_SET) && (in & bit)) {
 						res_in |= bit;
@@ -825,13 +821,10 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
 
 	/* userland u16 ->events contains POLL... bitmap */
 	filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP;
-	mask = DEFAULT_POLLMASK;
-	if (f.file->f_op->poll) {
-		pwait->_key = filter | busy_flag;
-		mask = f.file->f_op->poll(f.file, pwait);
-		if (mask & busy_flag)
-			*can_busy_poll = true;
-	}
+	pwait->_key = filter | busy_flag;
+	mask = vfs_poll(f.file, pwait);
+	if (mask & busy_flag)
+		*can_busy_poll = true;
 	mask &= filter;		/* Mask out unneeded events. */
 	fdput(f);
 
diff --git a/include/linux/poll.h b/include/linux/poll.h
index a3576da63377..7e0fdcf905d2 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -74,6 +74,18 @@ static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
 	pt->_key   = ~(__poll_t)0; /* all events enabled */
 }
 
+static inline bool file_can_poll(struct file *file)
+{
+	return file->f_op->poll;
+}
+
+static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
+{
+	if (unlikely(!file->f_op->poll))
+		return DEFAULT_POLLMASK;
+	return file->f_op->poll(file, pt);
+}
+
 struct poll_table_entry {
 	struct file *filp;
 	__poll_t key;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2bd3df3d101a..1695f38630f1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3849,7 +3849,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
 	if (ret)
 		goto out_put_css;
 
-	efile.file->f_op->poll(efile.file, &event->pt);
+	vfs_poll(efile.file, &event->pt);
 
 	spin_lock(&memcg->event_list_lock);
 	list_add(&event->list, &memcg->event_list);
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 848969fe7979..588bf88c3305 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -231,7 +231,7 @@ static void p9_conn_cancel(struct p9_conn *m, int err)
 static __poll_t
 p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt, int *err)
 {
-	__poll_t ret, n;
+	__poll_t ret;
 	struct p9_trans_fd *ts = NULL;
 
 	if (client && client->status == Connected)
@@ -243,19 +243,9 @@ p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt, int *err)
 		return EPOLLERR;
 	}
 
-	if (!ts->rd->f_op->poll)
-		ret = DEFAULT_POLLMASK;
-	else
-		ret = ts->rd->f_op->poll(ts->rd, pt);
-
-	if (ts->rd != ts->wr) {
-		if (!ts->wr->f_op->poll)
-			n = DEFAULT_POLLMASK;
-		else
-			n = ts->wr->f_op->poll(ts->wr, pt);
-		ret = (ret & ~EPOLLOUT) | (n & ~EPOLLIN);
-	}
-
+	ret = vfs_poll(ts->rd, pt);
+	if (ts->rd != ts->wr)
+		ret = (ret & ~EPOLLOUT) | (vfs_poll(ts->wr, pt) & ~EPOLLIN);
 	return ret;
 }
 
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 6e865e8b5b10..90d30fbe95ae 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -397,7 +397,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 	 * Check if there was an event already pending on the eventfd
 	 * before we registered, and trigger it as if we didn't miss it.
 	 */
-	events = f.file->f_op->poll(f.file, &irqfd->pt);
+	events = vfs_poll(f.file, &irqfd->pt);
 
 	if (events & EPOLLIN)
 		schedule_work(&irqfd->inject);
-- 
2.17.0

^ permalink raw reply related

* [PATCH 07/33] fs: introduce new ->get_poll_head and ->poll_mask methods
From: Christoph Hellwig @ 2018-05-23 19:19 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-api,
	linux-kernel
In-Reply-To: <20180523192022.1703-1-hch@lst.de>

->get_poll_head returns the waitqueue that the poll operation is going
to sleep on.  Note that this means we can only use a single waitqueue
for the poll, unlike some current drivers that use two waitqueues for
different events.  But now that we have keyed wakeups and heavily use
those for poll there aren't that many good reasons left to keep the
multiple waitqueues, and if there are any, ->poll is still around; the
driver just won't support aio poll.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 Documentation/filesystems/Locking |  7 ++++++-
 Documentation/filesystems/vfs.txt | 13 +++++++++++++
 fs/select.c                       | 23 +++++++++++++++++++++++
 include/linux/fs.h                |  2 ++
 include/linux/poll.h              | 12 ++++++------
 5 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 220bba28f72b..6d227f9d7bd9 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -440,6 +440,8 @@ prototypes:
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
 	int (*iterate) (struct file *, struct dir_context *);
 	__poll_t (*poll) (struct file *, struct poll_table_struct *);
+	struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t);
+	__poll_t (*poll_mask) (struct file *, __poll_t);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
@@ -470,7 +472,7 @@ prototypes:
 };
 
 locking rules:
-	All may block.
+	All except for ->poll_mask may block.
 
 ->llseek() locking has moved from llseek to the individual llseek
 implementations.  If your fs is not using generic_file_llseek, you
@@ -498,6 +500,9 @@ in sys_read() and friends.
 the lease within the individual filesystem to record the result of the
 operation
 
+->poll_mask can be called with or without the waitqueue lock for the waitqueue
+returned from ->get_poll_head.
+
 --------------------------- dquot_operations -------------------------------
 prototypes:
 	int (*write_dquot) (struct dquot *);
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index f608180ad59d..829a7b7857a4 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -857,6 +857,8 @@ struct file_operations {
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
 	int (*iterate) (struct file *, struct dir_context *);
 	__poll_t (*poll) (struct file *, struct poll_table_struct *);
+	struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t);
+	__poll_t (*poll_mask) (struct file *, __poll_t);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
@@ -901,6 +903,17 @@ otherwise noted.
 	activity on this file and (optionally) go to sleep until there
 	is activity. Called by the select(2) and poll(2) system calls
 
+  get_poll_head: Returns the struct wait_queue_head that callers can
+  wait on.  Callers need to check the returned events using ->poll_mask
+  once woken.  Can return NULL to indicate polling is not supported,
+  or any error code using the ERR_PTR convention to indicate that a
+  grave error occurred and ->poll_mask shall not be called.
+
+  poll_mask: return the mask of EPOLL* values describing the file descriptor
+  state.  Called either before going to sleep on the waitqueue returned by
+  get_poll_head, or after it has been woken.  If ->get_poll_head and
+  ->poll_mask are implemented, ->poll does not need to be implemented.
+
   unlocked_ioctl: called by the ioctl(2) system call.
 
   compat_ioctl: called by the ioctl(2) system call when 32 bit system calls
diff --git a/fs/select.c b/fs/select.c
index e30def680b2e..bc3cc0f98896 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -34,6 +34,29 @@
 
 #include <linux/uaccess.h>
 
+__poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
+{
+	if (file->f_op->poll) {
+		return file->f_op->poll(file, pt);
+	} else if (file_has_poll_mask(file)) {
+		unsigned int events = poll_requested_events(pt);
+		struct wait_queue_head *head;
+
+		if (pt && pt->_qproc) {
+			head = file->f_op->get_poll_head(file, events);
+			if (!head)
+				return DEFAULT_POLLMASK;
+			if (IS_ERR(head))
+				return EPOLLERR;
+			pt->_qproc(file, head, pt);
+		}
+
+		return file->f_op->poll_mask(file, events);
+	} else {
+		return DEFAULT_POLLMASK;
+	}
+}
+EXPORT_SYMBOL_GPL(vfs_poll);
 
 /*
  * Estimate expected accuracy in ns from a timeval.
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7f07977bdfd7..d467bd7b35b7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1711,6 +1711,8 @@ struct file_operations {
 	int (*iterate) (struct file *, struct dir_context *);
 	int (*iterate_shared) (struct file *, struct dir_context *);
 	__poll_t (*poll) (struct file *, struct poll_table_struct *);
+	struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t);
+	__poll_t (*poll_mask) (struct file *, __poll_t);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
diff --git a/include/linux/poll.h b/include/linux/poll.h
index 7e0fdcf905d2..fdf86b4cbc71 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -74,18 +74,18 @@ static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
 	pt->_key   = ~(__poll_t)0; /* all events enabled */
 }
 
-static inline bool file_can_poll(struct file *file)
+static inline bool file_has_poll_mask(struct file *file)
 {
-	return file->f_op->poll;
+	return file->f_op->get_poll_head && file->f_op->poll_mask;
 }
 
-static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
+static inline bool file_can_poll(struct file *file)
 {
-	if (unlikely(!file->f_op->poll))
-		return DEFAULT_POLLMASK;
-	return file->f_op->poll(file, pt);
+	return file->f_op->poll || file_has_poll_mask(file);
 }
 
+__poll_t vfs_poll(struct file *file, struct poll_table_struct *pt);
+
 struct poll_table_entry {
 	struct file *filp;
 	__poll_t key;
-- 
2.17.0

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH 08/33] aio: simplify KIOCB_KEY handling
From: Christoph Hellwig @ 2018-05-23 19:19 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-api,
	linux-kernel
In-Reply-To: <20180523192022.1703-1-hch@lst.de>

No need to pass the key field to lookup_kiocb to compare it with KIOCB_KEY,
as we can do that right after retrieving it from userspace.  Also move the
KIOCB_KEY definition to aio.c as it is an internal value not used by any
other place in the kernel.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/aio.c            | 14 +++++++-------
 include/linux/aio.h |  2 --
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 1c383bb44b2d..50a90e5581ed 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -46,6 +46,8 @@
 
 #include "internal.h"
 
+#define KIOCB_KEY		0
+
 #define AIO_RING_MAGIC			0xa10a10a1
 #define AIO_RING_COMPAT_FEATURES	1
 #define AIO_RING_INCOMPAT_FEATURES	0
@@ -1811,15 +1813,12 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
  *	Finds a given iocb for cancellation.
  */
 static struct aio_kiocb *
-lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, u32 key)
+lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb)
 {
 	struct aio_kiocb *kiocb;
 
 	assert_spin_locked(&ctx->ctx_lock);
 
-	if (key != KIOCB_KEY)
-		return NULL;
-
 	/* TODO: use a hash or array, this sucks. */
 	list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
 		if (kiocb->ki_user_iocb == iocb)
@@ -1846,9 +1845,10 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 	u32 key;
 	int ret;
 
-	ret = get_user(key, &iocb->aio_key);
-	if (unlikely(ret))
+	if (unlikely(get_user(key, &iocb->aio_key)))
 		return -EFAULT;
+	if (unlikely(key != KIOCB_KEY))
+		return -EINVAL;
 
 	ctx = lookup_ioctx(ctx_id);
 	if (unlikely(!ctx))
@@ -1856,7 +1856,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 
 	spin_lock_irq(&ctx->ctx_lock);
 
-	kiocb = lookup_kiocb(ctx, iocb, key);
+	kiocb = lookup_kiocb(ctx, iocb);
 	if (kiocb)
 		ret = kiocb_cancel(kiocb);
 	else
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 9d8aabecfe2d..b83e68dd006f 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -8,8 +8,6 @@ struct kioctx;
 struct kiocb;
 struct mm_struct;
 
-#define KIOCB_KEY		0
-
 typedef int (kiocb_cancel_fn)(struct kiocb *);
 
 /* prototypes */
-- 
2.17.0

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox