Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 05/13] timer: Remove init_timer_deferrable() in favor of timer_setup()
From: Kees Cook @ 2017-10-04 23:26 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Kees Cook, Benjamin Herrenschmidt, Michael Ellerman,
	Sebastian Reichel, Harish Patil, Manish Chopra, Kalle Valo,
	linuxppc-dev, netdev, linux-wireless, Andrew Morton,
	Arnd Bergmann, Chris Metcalf, Geert Uytterhoeven,
	Greg Kroah-Hartman, Guenter Roeck, Heiko Carstens,
	James E.J. Bottomley, John Stultz, Julian 
In-Reply-To: <1507159627-127660-1-git-send-email-keescook@chromium.org>

This refactors the only users of init_timer_deferrable() to use
the new timer_setup() and from_timer(), and removes the definition of
init_timer_deferrable().

Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Sebastian Reichel <sre@kernel.org>
Cc: Harish Patil <harish.patil@cavium.com>
Cc: Manish Chopra <manish.chopra@cavium.com>
Cc: Kalle Valo <kvalo@qca.qualcomm.com>
Cc: linuxppc-dev@lists.ozlabs.org
Cc: netdev@vger.kernel.org
Cc: linux-wireless@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 arch/powerpc/mm/numa.c                       | 12 +++++------
 drivers/hsi/clients/ssi_protocol.c           | 32 ++++++++++++++++------------
 drivers/net/ethernet/qlogic/qlge/qlge_main.c | 11 ++++------
 drivers/net/vxlan.c                          |  8 +++----
 drivers/net/wireless/ath/ath6kl/recovery.c   |  9 ++++----
 include/linux/timer.h                        |  2 --
 6 files changed, 34 insertions(+), 40 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index b95c584ce19d..f9b6107d6854 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1453,7 +1453,7 @@ static void topology_schedule_update(void)
 	schedule_work(&topology_work);
 }
 
-static void topology_timer_fn(unsigned long ignored)
+static void topology_timer_fn(struct timer_list *unused)
 {
 	if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask))
 		topology_schedule_update();
@@ -1463,14 +1463,11 @@ static void topology_timer_fn(unsigned long ignored)
 		reset_topology_timer();
 	}
 }
-static struct timer_list topology_timer =
-	TIMER_INITIALIZER(topology_timer_fn, 0, 0);
+static struct timer_list topology_timer;
 
 static void reset_topology_timer(void)
 {
-	topology_timer.data = 0;
-	topology_timer.expires = jiffies + 60 * HZ;
-	mod_timer(&topology_timer, topology_timer.expires);
+	mod_timer(&topology_timer, jiffies + 60 * HZ);
 }
 
 #ifdef CONFIG_SMP
@@ -1530,7 +1527,8 @@ int start_topology_update(void)
 			prrn_enabled = 0;
 			vphn_enabled = 1;
 			setup_cpu_associativity_change_counters();
-			init_timer_deferrable(&topology_timer);
+			timer_setup(&topology_timer, topology_timer_fn,
+				    TIMER_DEFERRABLE);
 			reset_topology_timer();
 		}
 	}
diff --git a/drivers/hsi/clients/ssi_protocol.c b/drivers/hsi/clients/ssi_protocol.c
index 93d28c0ec8bf..67af03d3aeb3 100644
--- a/drivers/hsi/clients/ssi_protocol.c
+++ b/drivers/hsi/clients/ssi_protocol.c
@@ -464,10 +464,10 @@ static void ssip_error(struct hsi_client *cl)
 	hsi_async_read(cl, msg);
 }
 
-static void ssip_keep_alive(unsigned long data)
+static void ssip_keep_alive(struct timer_list *t)
 {
-	struct hsi_client *cl = (struct hsi_client *)data;
-	struct ssi_protocol *ssi = hsi_client_drvdata(cl);
+	struct ssi_protocol *ssi = from_timer(ssi, t, keep_alive);
+	struct hsi_client *cl = ssi->cl;
 
 	dev_dbg(&cl->device, "Keep alive kick in: m(%d) r(%d) s(%d)\n",
 		ssi->main_state, ssi->recv_state, ssi->send_state);
@@ -490,9 +490,19 @@ static void ssip_keep_alive(unsigned long data)
 	spin_unlock(&ssi->lock);
 }
 
-static void ssip_wd(unsigned long data)
+static void ssip_rx_wd(struct timer_list *t)
+{
+	struct ssi_protocol *ssi = from_timer(ssi, t, rx_wd);
+	struct hsi_client *cl = ssi->cl;
+
+	dev_err(&cl->device, "Watchdog trigerred\n");
+	ssip_error(cl);
+}
+
+static void ssip_tx_wd(unsigned long data)
 {
-	struct hsi_client *cl = (struct hsi_client *)data;
+	struct ssi_protocol *ssi = from_timer(ssi, t, tx_wd);
+	struct hsi_client *cl = ssi->cl;
 
 	dev_err(&cl->device, "Watchdog trigerred\n");
 	ssip_error(cl);
@@ -1084,15 +1094,9 @@ static int ssi_protocol_probe(struct device *dev)
 	}
 
 	spin_lock_init(&ssi->lock);
-	init_timer_deferrable(&ssi->rx_wd);
-	init_timer_deferrable(&ssi->tx_wd);
-	init_timer(&ssi->keep_alive);
-	ssi->rx_wd.data = (unsigned long)cl;
-	ssi->rx_wd.function = ssip_wd;
-	ssi->tx_wd.data = (unsigned long)cl;
-	ssi->tx_wd.function = ssip_wd;
-	ssi->keep_alive.data = (unsigned long)cl;
-	ssi->keep_alive.function = ssip_keep_alive;
+	timer_setup(&ssi->rx_wd, ssip_rx_wd, TIMER_DEFERRABLE);
+	timer_setup(&ssi->tx_wd, ssip_tx_wd, TIMER_DEFERRABLE);
+	timer_setup(&ssi->keep_alive, ssip_keep_alive, 0);
 	INIT_LIST_HEAD(&ssi->txqueue);
 	INIT_LIST_HEAD(&ssi->cmdqueue);
 	atomic_set(&ssi->tx_usecnt, 0);
diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c b/drivers/net/ethernet/qlogic/qlge/qlge_main.c
index 9feec7009443..29fea74bff2e 100644
--- a/drivers/net/ethernet/qlogic/qlge/qlge_main.c
+++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c
@@ -4725,9 +4725,9 @@ static const struct net_device_ops qlge_netdev_ops = {
 	.ndo_vlan_rx_kill_vid	= qlge_vlan_rx_kill_vid,
 };
 
-static void ql_timer(unsigned long data)
+static void ql_timer(struct timer_list *t)
 {
-	struct ql_adapter *qdev = (struct ql_adapter *)data;
+	struct ql_adapter *qdev = from_timer(qdev, t, timer);
 	u32 var = 0;
 
 	var = ql_read32(qdev, STS);
@@ -4806,11 +4806,8 @@ static int qlge_probe(struct pci_dev *pdev,
 	/* Start up the timer to trigger EEH if
 	 * the bus goes dead
 	 */
-	init_timer_deferrable(&qdev->timer);
-	qdev->timer.data = (unsigned long)qdev;
-	qdev->timer.function = ql_timer;
-	qdev->timer.expires = jiffies + (5*HZ);
-	add_timer(&qdev->timer);
+	timer_setup(&qdev->timer, ql_timer, TIMER_DEFERRABLE);
+	mod_timer(&qdev->timer, jiffies + (5*HZ));
 	ql_link_off(qdev);
 	ql_display_dev_info(ndev);
 	atomic_set(&qdev->lb_count, 0);
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index d7c49cf1d5e9..3247d2feda07 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2325,9 +2325,9 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
 }
 
 /* Walk the forwarding table and purge stale entries */
-static void vxlan_cleanup(unsigned long arg)
+static void vxlan_cleanup(struct timer_list *t)
 {
-	struct vxlan_dev *vxlan = (struct vxlan_dev *) arg;
+	struct vxlan_dev *vxlan = from_timer(vxlan, t, age_timer);
 	unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
 	unsigned int h;
 
@@ -2647,9 +2647,7 @@ static void vxlan_setup(struct net_device *dev)
 	INIT_LIST_HEAD(&vxlan->next);
 	spin_lock_init(&vxlan->hash_lock);
 
-	init_timer_deferrable(&vxlan->age_timer);
-	vxlan->age_timer.function = vxlan_cleanup;
-	vxlan->age_timer.data = (unsigned long) vxlan;
+	timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE);
 
 	vxlan->dev = dev;
 
diff --git a/drivers/net/wireless/ath/ath6kl/recovery.c b/drivers/net/wireless/ath/ath6kl/recovery.c
index 3a8d5e97dc8e..c09e40c9010f 100644
--- a/drivers/net/wireless/ath/ath6kl/recovery.c
+++ b/drivers/net/wireless/ath/ath6kl/recovery.c
@@ -60,9 +60,9 @@ void ath6kl_recovery_hb_event(struct ath6kl *ar, u32 cookie)
 		ar->fw_recovery.hb_pending = false;
 }
 
-static void ath6kl_recovery_hb_timer(unsigned long data)
+static void ath6kl_recovery_hb_timer(struct timer_list *t)
 {
-	struct ath6kl *ar = (struct ath6kl *) data;
+	struct ath6kl *ar = from_timer(ar, t, fw_recovery.hb_timer);
 	int err;
 
 	if (test_bit(RECOVERY_CLEANUP, &ar->flag) ||
@@ -104,9 +104,8 @@ void ath6kl_recovery_init(struct ath6kl *ar)
 	recovery->seq_num = 0;
 	recovery->hb_misscnt = 0;
 	ar->fw_recovery.hb_pending = false;
-	ar->fw_recovery.hb_timer.function = ath6kl_recovery_hb_timer;
-	ar->fw_recovery.hb_timer.data = (unsigned long) ar;
-	init_timer_deferrable(&ar->fw_recovery.hb_timer);
+	timer_setup(&ar->fw_recovery.hb_timer, ath6kl_recovery_hb_timer,
+		    TIMER_DEFERRABLE);
 
 	if (ar->fw_recovery.hb_poll)
 		mod_timer(&ar->fw_recovery.hb_timer, jiffies +
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 9da903562ed4..10cc45ca5803 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -128,8 +128,6 @@ static inline void init_timer_on_stack_key(struct timer_list *timer,
 
 #define init_timer(timer)						\
 	__init_timer((timer), 0)
-#define init_timer_deferrable(timer)					\
-	__init_timer((timer), TIMER_DEFERRABLE)
 
 #define __setup_timer(_timer, _fn, _data, _flags)			\
 	do {								\
-- 
2.7.4

^ permalink raw reply related

* [PATCH 04/13] timer: Remove init_timer_pinned() in favor of timer_setup()
From: Kees Cook @ 2017-10-04 23:26 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Kees Cook, Chris Metcalf, netdev, Andrew Morton, Arnd Bergmann,
	Benjamin Herrenschmidt, Geert Uytterhoeven, Greg Kroah-Hartman,
	Guenter Roeck, Harish Patil, Heiko Carstens, James E.J. Bottomley,
	John Stultz, Julian Wiedmann, Kalle Valo, Lai Jiangshan,
	Len Brown, Manish Chopra, Mark Gross
In-Reply-To: <1507159627-127660-1-git-send-email-keescook@chromium.org>

This refactors the only user of init_timer_pinned() to use the new
timer_setup() and from_timer(), and drops the definition of
init_timer_pinned().

Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: netdev@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 drivers/net/ethernet/tile/tilepro.c | 9 ++++-----
 include/linux/timer.h               | 2 --
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/tile/tilepro.c b/drivers/net/ethernet/tile/tilepro.c
index 49ccee4b9aec..56d06282fbde 100644
--- a/drivers/net/ethernet/tile/tilepro.c
+++ b/drivers/net/ethernet/tile/tilepro.c
@@ -608,9 +608,9 @@ static void tile_net_schedule_egress_timer(struct tile_net_cpu *info)
  * ISSUE: Maybe instead track number of expected completions, and free
  * only that many, resetting to zero if "pending" is ever false.
  */
-static void tile_net_handle_egress_timer(unsigned long arg)
+static void tile_net_handle_egress_timer(struct timer_list *t)
 {
-	struct tile_net_cpu *info = (struct tile_net_cpu *)arg;
+	struct tile_net_cpu *info = from_timer(info, t, egress_timer);
 	struct net_device *dev = info->napi.dev;
 
 	/* The timer is no longer scheduled. */
@@ -1004,9 +1004,8 @@ static void tile_net_register(void *dev_ptr)
 		BUG();
 
 	/* Initialize the egress timer. */
-	init_timer_pinned(&info->egress_timer);
-	info->egress_timer.data = (long)info;
-	info->egress_timer.function = tile_net_handle_egress_timer;
+	timer_setup(&info->egress_timer, tile_net_handle_egress_timer,
+		    TIMER_PINNED);
 
 	u64_stats_init(&info->stats.syncp);
 
diff --git a/include/linux/timer.h b/include/linux/timer.h
index b10c4bdc6fbd..9da903562ed4 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -128,8 +128,6 @@ static inline void init_timer_on_stack_key(struct timer_list *timer,
 
 #define init_timer(timer)						\
 	__init_timer((timer), 0)
-#define init_timer_pinned(timer)					\
-	__init_timer((timer), TIMER_PINNED)
 #define init_timer_deferrable(timer)					\
 	__init_timer((timer), TIMER_DEFERRABLE)
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH 03/13] timer: Remove init_timer_on_stack() in favor of timer_setup_on_stack()
From: Kees Cook @ 2017-10-04 23:26 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Kees Cook, Rafael J. Wysocki, Pavel Machek, Len Brown,
	Greg Kroah-Hartman, Stefan Richter, Sudip Mukherjee,
	Martin Schwidefsky, Heiko Carstens, Julian Wiedmann, Ursula Braun,
	Michael Reed, James E.J. Bottomley, Martin K. Petersen, linux-pm,
	linux1394-devel, linux-s390, linux-scsi, Andrew Morton,
	Arnd Bergmann
In-Reply-To: <1507159627-127660-1-git-send-email-keescook@chromium.org>

Remove uses of init_timer_on_stack() with open-coded function and data
assignments that could be expressed using timer_setup_on_stack(). Several
timers were removed from the stack entirely since there was a one-to-one
mapping of parent structure to timer; those are switched to using
timer_setup() instead. All related callbacks were adjusted to use
from_timer().

Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Len Brown <len.brown@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Stefan Richter <stefanr@s5r6.in-berlin.de>
Cc: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Cc: Ursula Braun <ubraun@linux.vnet.ibm.com>
Cc: Michael Reed <mdr@sgi.com>
Cc: "James E.J. Bottomley" <jejb@linux.vnet.ibm.com>
Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-pm@vger.kernel.org
Cc: linux1394-devel@lists.sourceforge.net
Cc: linux-s390@vger.kernel.org
Cc: linux-scsi@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 drivers/base/power/main.c           |  8 +++-----
 drivers/firewire/core-transaction.c | 10 +++++-----
 drivers/parport/ieee1284.c          | 21 +++++++--------------
 drivers/s390/char/tape.h            |  1 +
 drivers/s390/char/tape_std.c        | 18 ++++++------------
 drivers/s390/net/lcs.c              | 16 ++++++----------
 drivers/s390/net/lcs.h              |  1 +
 drivers/scsi/qla1280.c              | 14 +++++---------
 drivers/scsi/qla1280.h              |  1 +
 include/linux/parport.h             |  1 +
 include/linux/timer.h               |  2 --
 11 files changed, 36 insertions(+), 57 deletions(-)

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 770b1539a083..ae47b2ec84b4 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -478,9 +478,9 @@ struct dpm_watchdog {
  * There's not much we can do here to recover so panic() to
  * capture a crash-dump in pstore.
  */
-static void dpm_watchdog_handler(unsigned long data)
+static void dpm_watchdog_handler(struct timer_list *t)
 {
-	struct dpm_watchdog *wd = (void *)data;
+	struct dpm_watchdog *wd = from_timer(wd, t, timer);
 
 	dev_emerg(wd->dev, "**** DPM device timeout ****\n");
 	show_stack(wd->tsk, NULL);
@@ -500,11 +500,9 @@ static void dpm_watchdog_set(struct dpm_watchdog *wd, struct device *dev)
 	wd->dev = dev;
 	wd->tsk = current;
 
-	init_timer_on_stack(timer);
+	timer_setup_on_stack(timer, dpm_watchdog_handler, 0);
 	/* use same timeout value for both suspend and resume */
 	timer->expires = jiffies + HZ * CONFIG_DPM_WATCHDOG_TIMEOUT;
-	timer->function = dpm_watchdog_handler;
-	timer->data = (unsigned long)wd;
 	add_timer(timer);
 }
 
diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c
index d6a09b9cd8cc..4372f9e4b0da 100644
--- a/drivers/firewire/core-transaction.c
+++ b/drivers/firewire/core-transaction.c
@@ -137,9 +137,9 @@ int fw_cancel_transaction(struct fw_card *card,
 }
 EXPORT_SYMBOL(fw_cancel_transaction);
 
-static void split_transaction_timeout_callback(unsigned long data)
+static void split_transaction_timeout_callback(struct timer_list *timer)
 {
-	struct fw_transaction *t = (struct fw_transaction *)data;
+	struct fw_transaction *t = from_timer(t, timer, split_timeout_timer);
 	struct fw_card *card = t->card;
 	unsigned long flags;
 
@@ -373,8 +373,8 @@ void fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode,
 	t->tlabel = tlabel;
 	t->card = card;
 	t->is_split_transaction = false;
-	setup_timer(&t->split_timeout_timer,
-		    split_transaction_timeout_callback, (unsigned long)t);
+	timer_setup(&t->split_timeout_timer,
+		    split_transaction_timeout_callback, 0);
 	t->callback = callback;
 	t->callback_data = callback_data;
 
@@ -423,7 +423,7 @@ int fw_run_transaction(struct fw_card *card, int tcode, int destination_id,
 	struct transaction_callback_data d;
 	struct fw_transaction t;
 
-	init_timer_on_stack(&t.split_timeout_timer);
+	timer_setup_on_stack(&t.split_timeout_timer, NULL, 0);
 	init_completion(&d.done);
 	d.payload = payload;
 	fw_send_request(card, &t, tcode, destination_id, generation, speed,
diff --git a/drivers/parport/ieee1284.c b/drivers/parport/ieee1284.c
index 74cc6dd982d2..2d1a5c737c6e 100644
--- a/drivers/parport/ieee1284.c
+++ b/drivers/parport/ieee1284.c
@@ -44,10 +44,11 @@ static void parport_ieee1284_wakeup (struct parport *port)
 	up (&port->physport->ieee1284.irq);
 }
 
-static struct parport *port_from_cookie[PARPORT_MAX];
-static void timeout_waiting_on_port (unsigned long cookie)
+static void timeout_waiting_on_port (struct timer_list *t)
 {
-	parport_ieee1284_wakeup (port_from_cookie[cookie % PARPORT_MAX]);
+	struct parport *port = from_timer(port, t, timer);
+
+	parport_ieee1284_wakeup (port);
 }
 
 /**
@@ -69,27 +70,19 @@ static void timeout_waiting_on_port (unsigned long cookie)
 int parport_wait_event (struct parport *port, signed long timeout)
 {
 	int ret;
-	struct timer_list timer;
 
 	if (!port->physport->cad->timeout)
 		/* Zero timeout is special, and we can't down() the
 		   semaphore. */
 		return 1;
 
-	init_timer_on_stack(&timer);
-	timer.expires = jiffies + timeout;
-	timer.function = timeout_waiting_on_port;
-	port_from_cookie[port->number % PARPORT_MAX] = port;
-	timer.data = port->number;
-
-	add_timer (&timer);
+	timer_setup(&port->timer, timeout_waiting_on_port, 0);
+	mod_timer(&port->timer, jiffies + timeout);
 	ret = down_interruptible (&port->physport->ieee1284.irq);
-	if (!del_timer_sync(&timer) && !ret)
+	if (!del_timer_sync(&port->timer) && !ret)
 		/* Timed out. */
 		ret = 1;
 
-	destroy_timer_on_stack(&timer);
-
 	return ret;
 }
 
diff --git a/drivers/s390/char/tape.h b/drivers/s390/char/tape.h
index ea664dd4f56d..52fbcd9c3cf8 100644
--- a/drivers/s390/char/tape.h
+++ b/drivers/s390/char/tape.h
@@ -128,6 +128,7 @@ struct tape_request {
 	int options;			/* options for execution. */
 	int retries;			/* retry counter for error recovery. */
 	int rescnt;			/* residual count from devstat. */
+	struct timer_list timer;	/* timer for std_assign_timeout(). */
 
 	/* Callback for delivering final status. */
 	void (*callback)(struct tape_request *, void *);
diff --git a/drivers/s390/char/tape_std.c b/drivers/s390/char/tape_std.c
index 3478e19ae194..cd204abdc0bc 100644
--- a/drivers/s390/char/tape_std.c
+++ b/drivers/s390/char/tape_std.c
@@ -32,14 +32,12 @@
  * tape_std_assign
  */
 static void
-tape_std_assign_timeout(unsigned long data)
+tape_std_assign_timeout(struct timer_list *t)
 {
-	struct tape_request *	request;
-	struct tape_device *	device;
+	struct tape_request *	request = from_timer(request, t, timer);
+	struct tape_device *	device = request->device;
 	int rc;
 
-	request = (struct tape_request *) data;
-	device = request->device;
 	BUG_ON(!device);
 
 	DBF_EVENT(3, "%08x: Assignment timeout. Device busy.\n",
@@ -70,16 +68,12 @@ tape_std_assign(struct tape_device *device)
 	 * to another host (actually this shouldn't happen but it does).
 	 * So we set up a timeout for this call.
 	 */
-	init_timer_on_stack(&timeout);
-	timeout.function = tape_std_assign_timeout;
-	timeout.data     = (unsigned long) request;
-	timeout.expires  = jiffies + 2 * HZ;
-	add_timer(&timeout);
+	timer_setup(&request->timer, tape_std_assign_timeout, 0);
+	mod_timer(&timeout, jiffies + 2 * HZ);
 
 	rc = tape_do_io_interruptible(device, request);
 
-	del_timer_sync(&timeout);
-	destroy_timer_on_stack(&timeout);
+	del_timer_sync(&request->timer);
 
 	if (rc != 0) {
 		DBF_EVENT(3, "%08x: assign failed - device might be busy\n",
diff --git a/drivers/s390/net/lcs.c b/drivers/s390/net/lcs.c
index d01b5c2a7760..21bba406d5be 100644
--- a/drivers/s390/net/lcs.c
+++ b/drivers/s390/net/lcs.c
@@ -834,9 +834,10 @@ lcs_notify_lancmd_waiters(struct lcs_card *card, struct lcs_cmd *cmd)
  * Emit buffer of a lan command.
  */
 static void
-lcs_lancmd_timeout(unsigned long data)
+lcs_lancmd_timeout(struct timer_list *t)
 {
-	struct lcs_reply *reply, *list_reply, *r;
+	struct lcs_reply *reply = from_timer(reply, t, timer);
+	struct lcs_reply *list_reply, *r;
 	unsigned long flags;
 
 	LCS_DBF_TEXT(4, trace, "timeout");
@@ -864,7 +865,6 @@ lcs_send_lancmd(struct lcs_card *card, struct lcs_buffer *buffer,
 {
 	struct lcs_reply *reply;
 	struct lcs_cmd *cmd;
-	struct timer_list timer;
 	unsigned long flags;
 	int rc;
 
@@ -885,14 +885,10 @@ lcs_send_lancmd(struct lcs_card *card, struct lcs_buffer *buffer,
 	rc = lcs_ready_buffer(&card->write, buffer);
 	if (rc)
 		return rc;
-	init_timer_on_stack(&timer);
-	timer.function = lcs_lancmd_timeout;
-	timer.data = (unsigned long) reply;
-	timer.expires = jiffies + HZ*card->lancmd_timeout;
-	add_timer(&timer);
+	timer_setup(&reply->timer, lcs_lancmd_timeout, 0);
+	mod_timer(&reply->timer, jiffies + HZ * card->lancmd_timeout);
 	wait_event(reply->wait_q, reply->received);
-	del_timer_sync(&timer);
-	destroy_timer_on_stack(&timer);
+	del_timer_sync(&reply->timer);
 	LCS_DBF_TEXT_(4, trace, "rc:%d",reply->rc);
 	rc = reply->rc;
 	lcs_put_reply(reply);
diff --git a/drivers/s390/net/lcs.h b/drivers/s390/net/lcs.h
index 150fcb4cebc3..d44fb8d9378f 100644
--- a/drivers/s390/net/lcs.h
+++ b/drivers/s390/net/lcs.h
@@ -275,6 +275,7 @@ struct lcs_reply {
 	void (*callback)(struct lcs_card *, struct lcs_cmd *);
 	wait_queue_head_t wait_q;
 	struct lcs_card *card;
+	struct timer_list timer;
 	int received;
 	int rc;
 };
diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c
index 8a29fb09db14..390775d5c918 100644
--- a/drivers/scsi/qla1280.c
+++ b/drivers/scsi/qla1280.c
@@ -758,9 +758,9 @@ enum action {
 };
 
 
-static void qla1280_mailbox_timeout(unsigned long __data)
+static void qla1280_mailbox_timeout(struct timer_list *t)
 {
-	struct scsi_qla_host *ha = (struct scsi_qla_host *)__data;
+	struct scsi_qla_host *ha = from_timer(ha, t, mailbox_timer);
 	struct device_reg __iomem *reg;
 	reg = ha->iobase;
 
@@ -2465,7 +2465,6 @@ qla1280_mailbox_command(struct scsi_qla_host *ha, uint8_t mr, uint16_t *mb)
 	uint16_t __iomem *mptr;
 	uint16_t data;
 	DECLARE_COMPLETION_ONSTACK(wait);
-	struct timer_list timer;
 
 	ENTER("qla1280_mailbox_command");
 
@@ -2494,18 +2493,15 @@ qla1280_mailbox_command(struct scsi_qla_host *ha, uint8_t mr, uint16_t *mb)
 	/* Issue set host interrupt command. */
 
 	/* set up a timer just in case we're really jammed */
-	init_timer_on_stack(&timer);
-	timer.expires = jiffies + 20*HZ;
-	timer.data = (unsigned long)ha;
-	timer.function = qla1280_mailbox_timeout;
-	add_timer(&timer);
+	timer_setup(&ha->mailbox_timer, qla1280_mailbox_timeout, 0);
+	mod_timer(&ha->mailbox_timer, jiffies + 20 * HZ);
 
 	spin_unlock_irq(ha->host->host_lock);
 	WRT_REG_WORD(&reg->host_cmd, HC_SET_HOST_INT);
 	data = qla1280_debounce_register(&reg->istatus);
 
 	wait_for_completion(&wait);
-	del_timer_sync(&timer);
+	del_timer_sync(&ha->mailbox_timer);
 
 	spin_lock_irq(ha->host->host_lock);
 
diff --git a/drivers/scsi/qla1280.h b/drivers/scsi/qla1280.h
index 834884b9eed5..1522aca2c8c8 100644
--- a/drivers/scsi/qla1280.h
+++ b/drivers/scsi/qla1280.h
@@ -1055,6 +1055,7 @@ struct scsi_qla_host {
 	struct list_head done_q;	/* Done queue */
 
 	struct completion *mailbox_wait;
+	struct timer_list mailbox_timer;
 
 	volatile struct {
 		uint32_t online:1;			/* 0 */
diff --git a/include/linux/parport.h b/include/linux/parport.h
index 58e3c64c6b49..397607a0c0eb 100644
--- a/include/linux/parport.h
+++ b/include/linux/parport.h
@@ -225,6 +225,7 @@ struct parport {
 	struct pardevice *waittail;
 
 	struct list_head list;
+	struct timer_list timer;
 	unsigned int flags;
 
 	void *sysctl_table;
diff --git a/include/linux/timer.h b/include/linux/timer.h
index d11e819a86e2..b10c4bdc6fbd 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -132,8 +132,6 @@ static inline void init_timer_on_stack_key(struct timer_list *timer,
 	__init_timer((timer), TIMER_PINNED)
 #define init_timer_deferrable(timer)					\
 	__init_timer((timer), TIMER_DEFERRABLE)
-#define init_timer_on_stack(timer)					\
-	__init_timer_on_stack((timer), 0)
 
 #define __setup_timer(_timer, _fn, _data, _flags)			\
 	do {								\
-- 
2.7.4

^ permalink raw reply related

* [PATCH 02/13] timer: Remove init_timer_pinned_deferrable() in favor of timer_setup()
From: Kees Cook @ 2017-10-04 23:26 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Kees Cook, Rafael J. Wysocki, Viresh Kumar,
	Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	linux-pm, linuxppc-dev, Andrew Morton, Arnd Bergmann,
	Chris Metcalf, Geert Uytterhoeven, Greg Kroah-Hartman,
	Guenter Roeck, Harish Patil, Heiko Carstens, James E.J. Bottomley,
	John Stultz, Julian Wiedmann
In-Reply-To: <1507159627-127660-1-git-send-email-keescook@chromium.org>

This refactors the only user of init_timer_pinned_deferrable() to use the
new timer_setup() and from_timer(). Adds a pointer back to the policy,
and drops the definition of init_timer_pinned_deferrable().

Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-pm@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 drivers/cpufreq/powernv-cpufreq.c | 13 +++++++------
 include/linux/timer.h             |  2 --
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index 3ff5160451b4..b6d7c4c98d0a 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -90,6 +90,7 @@ struct global_pstate_info {
 	int last_gpstate_idx;
 	spinlock_t gpstate_lock;
 	struct timer_list timer;
+	struct cpufreq_policy *policy;
 };
 
 static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];
@@ -625,10 +626,10 @@ static inline void  queue_gpstate_timer(struct global_pstate_info *gpstates)
  * according quadratic equation. Queues a new timer if it is still not equal
  * to local pstate
  */
-void gpstate_timer_handler(unsigned long data)
+void gpstate_timer_handler(struct timer_list *t)
 {
-	struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
-	struct global_pstate_info *gpstates = policy->driver_data;
+	struct global_pstate_info *gpstates = from_timer(gpstates, t, timer);
+	struct cpufreq_policy *policy = gpstates->policy;
 	int gpstate_idx, lpstate_idx;
 	unsigned long val;
 	unsigned int time_diff = jiffies_to_msecs(jiffies)
@@ -800,9 +801,9 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	policy->driver_data = gpstates;
 
 	/* initialize timer */
-	init_timer_pinned_deferrable(&gpstates->timer);
-	gpstates->timer.data = (unsigned long)policy;
-	gpstates->timer.function = gpstate_timer_handler;
+	gpstates->policy = policy;
+	timer_setup(&gpstates->timer, gpstate_timer_handler,
+		    TIMER_PINNED | TIMER_DEFERRABLE);
 	gpstates->timer.expires = jiffies +
 				msecs_to_jiffies(GPSTATE_TIMER_INTERVAL);
 	spin_lock_init(&gpstates->gpstate_lock);
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 5ef5c9e41a09..d11e819a86e2 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -132,8 +132,6 @@ static inline void init_timer_on_stack_key(struct timer_list *timer,
 	__init_timer((timer), TIMER_PINNED)
 #define init_timer_deferrable(timer)					\
 	__init_timer((timer), TIMER_DEFERRABLE)
-#define init_timer_pinned_deferrable(timer)				\
-	__init_timer((timer), TIMER_DEFERRABLE | TIMER_PINNED)
 #define init_timer_on_stack(timer)					\
 	__init_timer_on_stack((timer), 0)
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH 01/13] timer: Convert schedule_timeout() to use from_timer()
From: Kees Cook @ 2017-10-04 23:26 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Kees Cook, John Stultz, Stephen Boyd, Andrew Morton,
	Arnd Bergmann, Benjamin Herrenschmidt, Chris Metcalf,
	Geert Uytterhoeven, Greg Kroah-Hartman, Guenter Roeck,
	Harish Patil, Heiko Carstens, James E.J. Bottomley,
	Julian Wiedmann, Kalle Valo, Lai Jiangshan, Len Brown,
	Manish Chopra, Mark Gross <mark.gross
In-Reply-To: <1507159627-127660-1-git-send-email-keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>

In preparation for unconditionally passing the struct timer_list pointer to
all timer callbacks, switch to using the new from_timer() helper and passing
the timer pointer explicitly. Since this special timer is on the stack, it
needs to have a wrapper structure to carry state once .data is eliminated.

Cc: John Stultz <john.stultz-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>
Cc: Thomas Gleixner <tglx-hfZtesqFncYOwBW4kG4KsQ@public.gmane.org>
Cc: Stephen Boyd <sboyd-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
Signed-off-by: Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
---
 include/linux/timer.h |  8 ++++++++
 kernel/time/timer.c   | 26 +++++++++++++++++++-------
 2 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 6383c528b148..5ef5c9e41a09 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -179,6 +179,14 @@ static inline void timer_setup(struct timer_list *timer,
 		      (TIMER_DATA_TYPE)timer, flags);
 }
 
+static inline void timer_setup_on_stack(struct timer_list *timer,
+			       void (*callback)(struct timer_list *),
+			       unsigned int flags)
+{
+	__setup_timer_on_stack(timer, (TIMER_FUNC_TYPE)callback,
+			       (TIMER_DATA_TYPE)timer, flags);
+}
+
 #define from_timer(var, callback_timer, timer_fieldname) \
 	container_of(callback_timer, typeof(*var), timer_fieldname)
 
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index f2674a056c26..38613ced2324 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1668,9 +1668,20 @@ void run_local_timers(void)
 	raise_softirq(TIMER_SOFTIRQ);
 }
 
-static void process_timeout(unsigned long __data)
+/*
+ * Since schedule_timeout()'s timer is defined on the stack, it must store
+ * the target task on the stack as well.
+ */
+struct process_timer {
+	struct timer_list timer;
+	struct task_struct *task;
+};
+
+static void process_timeout(struct timer_list *t)
 {
-	wake_up_process((struct task_struct *)__data);
+	struct process_timer *timeout = from_timer(timeout, t, timer);
+
+	wake_up_process(timeout->task);
 }
 
 /**
@@ -1704,7 +1715,7 @@ static void process_timeout(unsigned long __data)
  */
 signed long __sched schedule_timeout(signed long timeout)
 {
-	struct timer_list timer;
+	struct process_timer timer;
 	unsigned long expire;
 
 	switch (timeout)
@@ -1738,13 +1749,14 @@ signed long __sched schedule_timeout(signed long timeout)
 
 	expire = timeout + jiffies;
 
-	setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
-	__mod_timer(&timer, expire, false);
+	timer.task = current;
+	timer_setup_on_stack(&timer.timer, process_timeout, 0);
+	__mod_timer(&timer.timer, expire, false);
 	schedule();
-	del_singleshot_timer_sync(&timer);
+	del_singleshot_timer_sync(&timer.timer);
 
 	/* Remove the timer from the object tracker */
-	destroy_timer_on_stack(&timer);
+	destroy_timer_on_stack(&timer.timer);
 
 	timeout = expire - jiffies;
 
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-watchdog" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH 00/13] timer: Start conversion to timer_setup()
From: Kees Cook @ 2017-10-04 23:26 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Kees Cook, Andrew Morton, Arnd Bergmann, Benjamin Herrenschmidt,
	Chris Metcalf, Geert Uytterhoeven, Greg Kroah-Hartman,
	Guenter Roeck, Harish Patil, Heiko Carstens, James E.J. Bottomley,
	John Stultz, Julian Wiedmann, Kalle Valo, Lai Jiangshan,
	Len Brown, Manish Chopra, Mark Gross,
	"Martin K. Petersen" <ma

Hi,

This is the first of many timer infrastructure cleanups to simplify the
timer API[1]. All of these patches are expected to land via the timer
tree, so Acks (or corrections) appreciated.

These patches refactor various users of timer API that are NOT just using
init_timer() or setup_timer() (which is the vast majority of users,
and are being converted separately). These changes are focused on the
lesser-used init_timer_*(), TIMER_*INITIALIZER(), and DEFINE_TIMER()
methods of preparing a timer.

Thanks!

-Kees

[1] https://git.kernel.org/linus/686fef928bba6be13cabe639f154af7d72b63120

^ permalink raw reply

* Re: [PATCH] nfp: convert nfp_eth_set_bit_config() into a macro
From: Jakub Kicinski @ 2017-10-04 23:25 UTC (permalink / raw)
  To: Matthias Kaehlcke
  Cc: Joe Perches, David S . Miller, Simon Horman, Dirk van der Merwe,
	oss-drivers, netdev, linux-kernel, Renato Golin, Manoj Gupta,
	Guenter Roeck, Doug Anderson
In-Reply-To: <20171004231649.GP173745@google.com>

On Wed, 4 Oct 2017 16:16:49 -0700, Matthias Kaehlcke wrote:
> > > Thanks for the suggestion. This seems a viable alternative if David
> > > and the NFP owners can live without the extra checking provided by
> > > __BF_FIELD_CHECK.  
> > 
> > The reason the __BF_FIELD_CHECK refuses to compile non-constant masks
> > is that it will require runtime ffs on the mask, which is potentially
> > costly.  I would also feel quite stupid adding those macros to the nfp
> > driver, given that I specifically created the bitfield.h header to not
> > have to reimplement these in every driver I write/maintain.  
> 
> That makes sense, thanks for providing more context.
> 
> > Can you please test the patch I provided in the other reply?  
> 
> With this patch there are no errors when building the kernel with
> clang.

Cool, thanks for checking!  I will run it through full tests and queue
for upstreaming :)

^ permalink raw reply

* Re: [PATCH] nfp: convert nfp_eth_set_bit_config() into a macro
From: Matthias Kaehlcke @ 2017-10-04 23:16 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Joe Perches, David S . Miller, Simon Horman, Dirk van der Merwe,
	oss-drivers, netdev, linux-kernel, Renato Golin, Manoj Gupta,
	Guenter Roeck, Doug Anderson
In-Reply-To: <20171004152203.2a4f564d@cakuba.netronome.com>

Hi Jakub,

El Wed, Oct 04, 2017 at 03:22:03PM -0700 Jakub Kicinski ha dit:

> On Wed, 4 Oct 2017 11:49:57 -0700, Matthias Kaehlcke wrote:
> > Hi Joe,
> > 
> > El Wed, Oct 04, 2017 at 11:07:19AM -0700 Joe Perches ha dit:
> > 
> > > On Tue, 2017-10-03 at 13:05 -0700, Matthias Kaehlcke wrote:  
> > > > nfp_eth_set_bit_config() is marked as __always_inline to allow gcc to
> > > > identify the 'mask' parameter as known to be constant at compile time,
> > > > which is required to use the FIELD_GET() macro.
> > > > 
> > > > The forced inlining does the trick for gcc, but for kernel builds with
> > > > clang it results in undefined symbols:  
> > > 
> > > Can't you use local different FIELD_PREP/FIELD_GET macros
> > > with a different name without the BUILD_BUG tests?
> > > 
> > > i.e.:
> > > 
> > > #define NFP_FIELD_PREP(_mask, _val)				\
> > > ({								\
> > > 	((typeof(_mask))(_val) << __bf_shf(_mask)) & (_mask);	\
> > > })
> > > 
> > > #define NFP_FIELD_GET(_mask, _reg)				\
> > > ({								\
> > > 	(typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask));	\
> > > })
> > > 
> > > Then the __always_inline can be removed from
> > > nfp_eth_set_bit_config too.  
> > 
> > Thanks for the suggestion. This seems a viable alternative if David
> > and the NFP owners can live without the extra checking provided by
> > __BF_FIELD_CHECK.
> 
> The reason the __BF_FIELD_CHECK refuses to compile non-constant masks
> is that it will require runtime ffs on the mask, which is potentially
> costly.  I would also feel quite stupid adding those macros to the nfp
> driver, given that I specifically created the bitfield.h header to not
> have to reimplement these in every driver I write/maintain.

That makes sense, thanks for providing more context.

> Can you please test the patch I provided in the other reply?

With this patch there are no errors when building the kernel with
clang.

Thanks!

Matthias

^ permalink raw reply

* Re: [PATCH v2 net-next 0/8] bpf: muli prog support for cgroup-bpf
From: David Miller @ 2017-10-04 23:05 UTC (permalink / raw)
  To: ast; +Cc: daniel, tj, dsa, netdev, kernel-team
In-Reply-To: <20171003055028.1294791-1-ast@fb.com>

From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 2 Oct 2017 22:50:20 -0700

> v1->v2:
> - fixed accidentally swapped two lines which caused static_key not going to zero
> - addressed Martin's feedback and changed prog_query to be consistent
>   with verifier output: return -enospc and fill supplied buffer instead
>   of just returning -enospc when buffer is too small to fit all prog_ids
> 
> v1:
> cgroup-bpf use cases are getting more advanced and running only
> one program per cgroup is no longer enough. Therefore introduce
> support for attaching multiple programs per cgroup and running
> a set of effective programs.
> 
> These patches introduce BPF_F_ALLOW_MULTI flag for BPF_PROG_ATTACH cmd.
> The default is still NONE and behavior of BPF_F_ALLOW_OVERRIDE flag
> is unchanged.
> The difference between three possible flags for BPF_PROG_ATTACH command:
> - NONE(default): No further bpf programs allowed in the subtree.
> - BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program,
>   the program in this cgroup yields to sub-cgroup program.
> - BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program,
>   that cgroup program gets run in addition to the program in this cgroup.
> 
> Most of the logic is in patch 1. Even when cgroup doesn't have
> any programs attached its set of effective programs can be non-empty.
> To quickly execute them and avoid penalizing cgroups without
> any effective programs introduce 'struct bpf_prog_array'
> which has an optimization for cgroups with zero effective programs.
> 
> Patch 2 introduces BPF_PROG_QUERY command for introspection
> Patch 3 makes verifier more strict for cgroup-bpf program types.
> Patch 4+ are tests.
> 
> More details in individual patches

Looks good, series applied, thanks!

^ permalink raw reply

* Re: [PATCH net-next] openvswitch: Add erspan tunnel support.
From: William Tu @ 2017-10-04 23:02 UTC (permalink / raw)
  To: Pravin Shelar; +Cc: Linux Kernel Network Developers
In-Reply-To: <CAOrHB_DNkfnUty=QzO5d6SAyscLXBP-teE_j0RvDaJiMHqRaaw@mail.gmail.com>

On Wed, Oct 4, 2017 at 2:31 PM, Pravin Shelar <pshelar@ovn.org> wrote:
> On Wed, Oct 4, 2017 at 5:02 AM, William Tu <u9012063@gmail.com> wrote:
>> Add type II erspan vport implementation.  Since erspan protocol is
>> on top of the GRE header, the implementation is extended from the
>> existing gre implementation.
>>
>> Signed-off-by: William Tu <u9012063@gmail.com>
>> Cc: Pravin B Shelar <pshelar@ovn.org>
>
> Why are you adding ERSPAN support to compat code. Isn't this supported
> over OVS netlink-rtnl (dpif-netlink-rtnl)?

You're right. I will remove it and submit v2. Thanks for the review.

William

^ permalink raw reply

* Re: [PATCH] PCI: Check/Set ARI capability before setting numVFs
From: Bjorn Helgaas @ 2017-10-04 23:01 UTC (permalink / raw)
  To: Tony Nguyen
  Cc: linux-pci, intel-wired-lan, linux-kernel, netdev, bhelgaas,
	Alexander Duyck, Emil Tantilov
In-Reply-To: <20171004155258.35634-1-anthony.l.nguyen@intel.com>

On Wed, Oct 04, 2017 at 08:52:58AM -0700, Tony Nguyen wrote:
> This fixes a bug that can occur if an AER error is encountered while SRIOV
> devices are present.
> 
> This issue was seen by doing the following. Inject an AER error to a device
> that has SRIOV devices.  After the device has recovered, remove the driver.
> Reload the driver and enable SRIOV which causes the following crash to
> occur:
> 
> kernel BUG at drivers/pci/iov.c:157!
> invalid opcode: 0000 [#1] SMP
> CPU: 36 PID: 2295 Comm: bash Not tainted 4.14.0-rc1+ #74
> Hardware name: Supermicro X9DAi/X9DAi, BIOS 3.0a 04/29/2014
> task: ffff9fa41cd45a00 task.stack: ffffb4b2036e8000
> RIP: 0010:pci_iov_add_virtfn+0x2eb/0x350
> RSP: 0018:ffffb4b2036ebcb8 EFLAGS: 00010286
> RAX: 00000000fffffff0 RBX: ffff9fa42c1c8800 RCX: ffff9fa421ce2388
> RDX: 00000000df900000 RSI: ffff9fa8214fb388 RDI: 00000000df903fff
> RBP: ffffb4b2036ebd18 R08: ffff9fa421ce23b8 R09: ffffb4b2036ebc2c
> R10: ffff9fa42c1a5548 R11: 000000000000058e R12: ffff9fa8214fb000
> R13: ffff9fa42c1a5000 R14: ffff9fa8214fb388 R15: 0000000000000000
> FS:  00007f60724b6700(0000) GS:ffff9fa82f300000(0000)
> knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 0000559eca8b0f40 CR3: 0000000864146000 CR4: 00000000001606e0
> Call Trace:
>  pci_enable_sriov+0x353/0x440
>  ixgbe_pci_sriov_configure+0xd5/0x1f0 [ixgbe]
>  sriov_numvfs_store+0xf7/0x170
>  dev_attr_store+0x18/0x30
>  sysfs_kf_write+0x37/0x40
>  kernfs_fop_write+0x120/0x1b0
>  __vfs_write+0x37/0x170
>  ? __alloc_fd+0x3f/0x170
>  ? set_close_on_exec+0x30/0x70
>  vfs_write+0xb5/0x1a0
>  SyS_write+0x55/0xc0
>  entry_SYSCALL_64_fastpath+0x1a/0xa5
> RIP: 0033:0x7f6071bafc20
> RSP: 002b:00007ffe7d42ba48 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
> RAX: ffffffffffffffda RBX: 0000559eca8b0f30 RCX: 00007f6071bafc20
> RDX: 0000000000000002 RSI: 0000559eca961f60 RDI: 0000000000000001
> RBP: 00007f6071e78ae0 R08: 00007f6071e7a740 R09: 00007f60724b6700
> R10: 0000000000000073 R11: 0000000000000246 R12: 0000000000000000
> R13: 0000000000000000 R14: 0000000000000000 R15: 0000559eca892170
> RIP: pci_iov_add_virtfn+0x2eb/0x350 RSP: ffffb4b2036ebcb8
> 
> This occurs since during AER recovery the ARI Capable Hierarchy bit,
> which can affect the values for First VF Offset and VF Stride, is not set
> until after pci_iov_set_numvfs() is called.  

Can you elaborate on where exactly this happens?  The only place we
explicitly set PCI_SRIOV_CTRL_ARI is in sriov_init(), which is only
called at enumeration-time.  So I'm guessing you're talking about this
path:

  ixgbe_io_slot_reset
    pci_restore_state
      pci_restore_iov_state
	sriov_restore_state
	  pci_iov_set_numvfs

where we don't set PCI_SRIOV_CTRL_ARI at all.  The fact that you say
PCI_SRIOV_CTRL_ARI isn't set until *after* pci_iov_set_numvfs() is
called suggests that it is being set *somewhere*, but I don't know
where.

> This can cause the iov
> structure to be populated with values that are incorrect if the bit is
> later set.   Check and set this bit, if needed, before calling
> pci_iov_set_numvfs() so that the values being populated properly take
> the ARI bit into account.
> 
> CC: Alexander Duyck <alexander.h.duyck@intel.com>
> CC: Emil Tantilov <emil.s.tantilov@intel.com>
> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
> ---
>  drivers/pci/iov.c | 4 ++++
>  1 file changed, 4 insertions(+)
> 
> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> index 7492a65..a8896c7 100644
> --- a/drivers/pci/iov.c
> +++ b/drivers/pci/iov.c
> @@ -497,6 +497,10 @@ static void sriov_restore_state(struct pci_dev *dev)
>  	if (ctrl & PCI_SRIOV_CTRL_VFE)
>  		return;
>  
> +	if ((iov->ctrl & PCI_SRIOV_CTRL_ARI) && !(ctrl & PCI_SRIOV_CTRL_ARI))
> +		pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL,
> +				      ctrl | PCI_SRIOV_CTRL_ARI);
> +
>  	for (i = PCI_IOV_RESOURCES; i <= PCI_IOV_RESOURCE_END; i++)
>  		pci_update_resource(dev, i);
>  
> -- 
> 2.9.5
> 

^ permalink raw reply

* Re: [PATCH net-next v6 0/4] bpf: add two helpers to read perf event enabled/running time
From: David Miller @ 2017-10-04 23:00 UTC (permalink / raw)
  To: yhs; +Cc: peterz, rostedt, ast, daniel, netdev, kernel-team
In-Reply-To: <20171002224218.3181418-1-yhs@fb.com>

From: Yonghong Song <yhs@fb.com>
Date: Mon, 2 Oct 2017 15:42:14 -0700

> [Dave, Peter,
> 
>  Previous communication shows that this patch may potentially have
>  merge conflict with upcoming tip changes in the next merge window.
> 
>  Could you advise how this patch should proceed?
> 
>  Thanks!
> ]

Indeed, Peter how do you want to handle this?

Thanks.

^ permalink raw reply

* Re: [PATCH v2 4/5] VSOCK: add sock_diag interface
From: David Miller @ 2017-10-04 22:58 UTC (permalink / raw)
  To: stefanha; +Cc: netdev, jhansen, decui
In-Reply-To: <20171004163716.3964-5-stefanha@redhat.com>

From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Wed,  4 Oct 2017 12:37:15 -0400

> This patch adds the sock_diag interface for querying sockets from
> userspace.  Tools like ss(8) and netstat(8) can use this interface to
> list open sockets.
> 
> The userspace ABI is defined in <linux/vm_sockets_diag.h> and includes
> netlink request and response structs.  The request can query sockets
> based on their sk_state (e.g. listening sockets only) and the response
> contains socket information fields including the local/remote addresses,
> inode number, etc.
> 
> This patch does not dump VMCI pending sockets because I have only tested
> the virtio transport, which does not use pending sockets.  Support can
> be added later by extending vsock_diag_dump() if needed by VMCI users.
> 
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>

Please post new feature patches against net-next.

> diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile
> index 09fc2eb29dc8..e5dbf153aff0 100644
> --- a/net/vmw_vsock/Makefile
> +++ b/net/vmw_vsock/Makefile
> @@ -1,10 +1,13 @@
>  obj-$(CONFIG_VSOCKETS) += vsock.o
> +obj-$(CONFIG_VSOCKETS_DIAG) += vsock_diag.o
>  obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o
>  obj-$(CONFIG_VIRTIO_VSOCKETS) += vmw_vsock_virtio_transport.o
>  obj-$(CONFIG_VIRTIO_VSOCKETS_COMMON) += vmw_vsock_virtio_transport_common.o
>  

This hunk fails to apply to the net-next tree, the context looks
different.

Thanks.

^ permalink raw reply

* Re: [PATCH net-next v4 1/3] bridge: add new BR_NEIGH_SUPPRESS port flag to suppress arp and nd flood
From: David Miller @ 2017-10-04 22:52 UTC (permalink / raw)
  To: roopa; +Cc: netdev, nikolay, stephen, bridge
In-Reply-To: <1507093953-59929-2-git-send-email-roopa@cumulusnetworks.com>

From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Tue,  3 Oct 2017 22:12:31 -0700

> BR_ARP_PROXY flag but has a few semantic differences to conform

This should be "BR_PROXYARP".

Otherwise this series looks fine to me, but I see there will be
a v5 respin.

^ permalink raw reply

* RE: [PATCH 2/3 v2] net: phy: DP83822 initial driver submission
From: Woojung.Huh @ 2017-10-04 22:44 UTC (permalink / raw)
  To: dmurphy, andrew, f.fainelli; +Cc: netdev, afd
In-Reply-To: <20171004182031.13794-2-dmurphy@ti.com>

> +static int dp83822_suspend(struct phy_device *phydev)
> +{
> +	int value;
> +
> +	mutex_lock(&phydev->lock);
> +	value = phy_read_mmd(phydev, DP83822_DEVADDR,
> MII_DP83822_WOL_CFG);
> +	mutex_unlock(&phydev->lock);
Would we need mutex to access phy_read_mmd()?
phy_read_mmd() has mdio_lock for indirect access.

> +	if (!(value & DP83822_WOL_EN))
> +		genphy_suspend(phydev);
> +
> +	return 0;
> +}
> +
> +static int dp83822_resume(struct phy_device *phydev)
> +{
> +	int value;
> +
> +	genphy_resume(phydev);
> +
> +	mutex_lock(&phydev->lock);
> +	value = phy_read_mmd(phydev, DP83822_DEVADDR,
> MII_DP83822_WOL_CFG);
> +
> +	phy_write_mmd(phydev, DP83822_DEVADDR,
> MII_DP83822_WOL_CFG, value |
> +		      DP83822_WOL_CLR_INDICATION);
> +
> +	mutex_unlock(&phydev->lock);
Same here.

Woojung

^ permalink raw reply

* Re: [RFC] bpf: remove global verifier state
From: Alexei Starovoitov @ 2017-10-04 22:40 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Daniel Borkmann, Eric Dumazet, dsahern, netdev, oss-drivers,
	david.beckett
In-Reply-To: <20171004145744.27c08d4a@cakuba.netronome.com>

On Wed, Oct 04, 2017 at 02:57:44PM -0700, Jakub Kicinski wrote:
> On Wed, 04 Oct 2017 21:13:47 +0200, Daniel Borkmann wrote:
> > On 10/04/2017 05:43 AM, Alexei Starovoitov wrote:
> > > On Tue, Oct 03, 2017 at 08:24:06PM -0700, Eric Dumazet wrote:  
> > >> On Tue, 2017-10-03 at 19:52 -0700, Alexei Starovoitov wrote:
> > >>  
> > >>> yep. looks great.
> > >>> Please test it and submit officially :)
> > >>> The commit aafe6ae9cee3 ("bpf: dynamically allocate digest scratch buffer")
> > >>> fixed the other case where we were relying on the above mutex.
> > >>> The only other spot to be adjusted is to add spin_lock/mutex or DO_ONCE() to
> > >>> bpf_get_skb_set_tunnel_proto() to protect md_dst init.
> > >>> imo that would be it.
> > >>> Daniel, anything else comes to mind?  
> > 
> > Yes, this should be all. DO_ONCE() for the tunnel proto seems a
> > good choice.
> 
> Hm.  I actually did:
> 
> if (!dst) {
> 	tmp = alloc();
> 	if (!tmp)
> 		return;
> 	if (cmpxchg(&dst, NULL, tmp))
> 		free(tmp);
> }
> 
> I don't like how DO_ONCE() doesn't handle errors from the init
> function :(

yeah. good point.
Above looks good to me.

^ permalink raw reply

* Re: [PATCH] nfp: convert nfp_eth_set_bit_config() into a macro
From: Jakub Kicinski @ 2017-10-04 22:22 UTC (permalink / raw)
  To: Matthias Kaehlcke
  Cc: Joe Perches, David S . Miller, Simon Horman, Dirk van der Merwe,
	oss-drivers, netdev, linux-kernel, Renato Golin, Manoj Gupta,
	Guenter Roeck, Doug Anderson
In-Reply-To: <20171004184957.GO173745@google.com>

On Wed, 4 Oct 2017 11:49:57 -0700, Matthias Kaehlcke wrote:
> Hi Joe,
> 
> El Wed, Oct 04, 2017 at 11:07:19AM -0700 Joe Perches ha dit:
> 
> > On Tue, 2017-10-03 at 13:05 -0700, Matthias Kaehlcke wrote:  
> > > nfp_eth_set_bit_config() is marked as __always_inline to allow gcc to
> > > identify the 'mask' parameter as known to be constant at compile time,
> > > which is required to use the FIELD_GET() macro.
> > > 
> > > The forced inlining does the trick for gcc, but for kernel builds with
> > > clang it results in undefined symbols:  
> > 
> > Can't you use local different FIELD_PREP/FIELD_GET macros
> > with a different name without the BUILD_BUG tests?
> > 
> > i.e.:
> > 
> > #define NFP_FIELD_PREP(_mask, _val)				\
> > ({								\
> > 	((typeof(_mask))(_val) << __bf_shf(_mask)) & (_mask);	\
> > })
> > 
> > #define NFP_FIELD_GET(_mask, _reg)				\
> > ({								\
> > 	(typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask));	\
> > })
> > 
> > Then the __always_inline can be removed from
> > nfp_eth_set_bit_config too.  
> 
> Thanks for the suggestion. This seems a viable alternative if David
> and the NFP owners can live without the extra checking provided by
> __BF_FIELD_CHECK.

The reason the __BF_FIELD_CHECK refuses to compile non-constant masks
is that it will require runtime ffs on the mask, which is potentially
costly.  I would also feel quite stupid adding those macros to the nfp
driver, given that I specifically created the bitfield.h header to not
have to reimplement these in every driver I write/maintain.

Can you please test the patch I provided in the other reply?

^ permalink raw reply

* Re: [RFC] bpf: remove global verifier state
From: Jakub Kicinski @ 2017-10-04 21:57 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Alexei Starovoitov, Eric Dumazet, dsahern, netdev, oss-drivers,
	david.beckett
In-Reply-To: <59D532EB.4000104@iogearbox.net>

On Wed, 04 Oct 2017 21:13:47 +0200, Daniel Borkmann wrote:
> On 10/04/2017 05:43 AM, Alexei Starovoitov wrote:
> > On Tue, Oct 03, 2017 at 08:24:06PM -0700, Eric Dumazet wrote:  
> >> On Tue, 2017-10-03 at 19:52 -0700, Alexei Starovoitov wrote:
> >>  
> >>> yep. looks great.
> >>> Please test it and submit officially :)
> >>> The commit aafe6ae9cee3 ("bpf: dynamically allocate digest scratch buffer")
> >>> fixed the other case where we were relying on the above mutex.
> >>> The only other spot to be adjusted is to add spin_lock/mutex or DO_ONCE() to
> >>> bpf_get_skb_set_tunnel_proto() to protect md_dst init.
> >>> imo that would be it.
> >>> Daniel, anything else comes to mind?  
> 
> Yes, this should be all. DO_ONCE() for the tunnel proto seems a
> good choice.

Hm.  I actually did:

if (!dst) {
	tmp = alloc();
	if (!tmp)
		return;
	if (cmpxchg(&dst, NULL, tmp))
		free(tmp);
}

I don't like how DO_ONCE() doesn't handle errors from the init
function :(

> >> 16 MB of log (unswappable kernel memory) per active checker.
> >>
> >> We might offer a way to oom hosts.  
> >
> > right. good point!
> > we need to switch to continuous copy_to_user() after a page or so.
> > Can even do it after every vscnprintf()
> > but page at a time is probably faster.  
> 
> Also worst case upper limits on verification side for holding state
> aside from the log would need to be checked in terms of how much mem
> we end up holding that is not accounted against any process (and not
> really "rate-limited" anymore once we drop the mutex).

^ permalink raw reply

* Re: [PATCH net-next] openvswitch: Add erspan tunnel support.
From: Pravin Shelar @ 2017-10-04 21:31 UTC (permalink / raw)
  To: William Tu; +Cc: Linux Kernel Network Developers
In-Reply-To: <1507118559-13774-1-git-send-email-u9012063@gmail.com>

On Wed, Oct 4, 2017 at 5:02 AM, William Tu <u9012063@gmail.com> wrote:
> Add type II erspan vport implementation.  Since erspan protocol is
> on top of the GRE header, the implementation is extended from the
> existing gre implementation.
>
> Signed-off-by: William Tu <u9012063@gmail.com>
> Cc: Pravin B Shelar <pshelar@ovn.org>

Why are you adding ERSPAN support to compat code? Isn't this supported
over OVS netlink-rtnl (dpif-netlink-rtnl)?

^ permalink raw reply

* Re: [PATCH net-next 0/3] A own subdirectory for shared TCP code
From: Andrew Lunn @ 2017-10-04 20:27 UTC (permalink / raw)
  To: Richard Siegfried; +Cc: David Miller, netdev
In-Reply-To: <d7237d11-ec83-0ef6-d201-da8b99c94b88@systemli.org>

On Wed, Oct 04, 2017 at 08:54:17PM +0200, Richard Siegfried wrote:
> On 04/10/17 01:03, David Miller wrote:
> > As someone who has to do backports regularly to -stable, there is no way
> > I am applying this.
> > 
> > Sorry.
> Okay, I see.
> 
> Is grouping files into subdirectories something generally
> unwanted/unlikely to be applied or is this specific to TCP / networking?
> 
> Because there are several other places in the source tree where I would
> like to group things.

Hi Richard

It is generally unwanted.

Have you tried back porting patches when the directory structure has
changed? Files have moved around? It makes it a lot harder to
do. Meaning patches are going to be back ported less often. Fixes
which could be security relevant might not get back ported, etc.

Kernel 4.4 is going to be supported until 2022. So moving files around
is going to make Greg Kroah-Hartman life more difficult for the next 5
years.

	Andrew

^ permalink raw reply

* RE
From: Stefan E Persson @ 2017-10-04 20:05 UTC (permalink / raw)


Your email has been randonly selected for my foundation Donation reply  
to email: eriling.persson089@swedenmail.com for more details

^ permalink raw reply

* Re: [PATCH net-next v3 3/3] tools: bpftool: add documentation
From: Jakub Kicinski @ 2017-10-04 20:02 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: netdev, alexei.starovoitov, daniel, dsahern, oss-drivers,
	David Beckett, linux-doc@vger.kernel.org
In-Reply-To: <20171004203642.699f5f1a@redhat.com>

On Wed, 4 Oct 2017 20:36:42 +0200, Jesper Dangaard Brouer wrote:
> On Wed,  4 Oct 2017 08:40:32 -0700
> Jakub Kicinski <jakub.kicinski@netronome.com> wrote:
> 
> > Add documentation for bpftool.  Separate files for each subcommand.
> > Use rst format.  Documentation is compiled into man pages using
> > rst2man.
> > 
> > Signed-off-by: David Beckett <david.beckett@netronome.com>
> > Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> > Acked-by: Alexei Starovoitov <ast@kernel.org>
> > Acked-by: Daniel Borkmann <daniel@iogearbox.net>
> > ---
> >  tools/bpf/bpftool/Documentation/Makefile         |  34 +++++++
> >  tools/bpf/bpftool/Documentation/bpftool-map.txt  | 110 +++++++++++++++++++++++
> >  tools/bpf/bpftool/Documentation/bpftool-prog.txt |  79 ++++++++++++++++
> >  tools/bpf/bpftool/Documentation/bpftool.txt      |  34 +++++++  
> 
> RST-format files are usually called .rst and not .txt
> 
> This is useful when people happen to browse the code via github, then they get formatted nicely e.g.:
>  https://github.com/torvalds/linux/blob/master/samples/bpf/README.rst

I was following perf's example.  Are perf's docs not RST?

^ permalink raw reply

* [PATCH net-next 3/3] tcp: a small refactor of RACK loss detection
From: Yuchung Cheng @ 2017-10-04 20:00 UTC (permalink / raw)
  To: davem; +Cc: netdev, Yuchung Cheng, Neal Cardwell, Eric Dumazet
In-Reply-To: <20171004200000.39257-1-ycheng@google.com>

Refactor the RACK loop to improve readability and speed up the checks.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp_recovery.c | 40 ++++++++++++++++++----------------------
 1 file changed, 18 insertions(+), 22 deletions(-)

diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 8aa56caefde8..cda6074a429a 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -61,32 +61,28 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 	list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
 				 tcp_tsorted_anchor) {
 		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+		s32 remaining;
 
-		if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
-					tp->rack.end_seq, scb->end_seq)) {
-			/* Step 3 in draft-cheng-tcpm-rack-00.txt:
-			 * A packet is lost if its elapsed time is beyond
-			 * the recent RTT plus the reordering window.
-			 */
-			u32 elapsed = tcp_stamp_us_delta(tp->tcp_mstamp,
-							 skb->skb_mstamp);
-			s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed;
-
-			if (remaining < 0) {
-				tcp_rack_mark_skb_lost(sk, skb);
-				list_del_init(&skb->tcp_tsorted_anchor);
-				continue;
-			}
-
-			/* Skip ones marked lost but not yet retransmitted */
-			if ((scb->sacked & TCPCB_LOST) &&
-			    !(scb->sacked & TCPCB_SACKED_RETRANS))
-				continue;
+		/* Skip ones marked lost but not yet retransmitted */
+		if ((scb->sacked & TCPCB_LOST) &&
+		    !(scb->sacked & TCPCB_SACKED_RETRANS))
+			continue;
 
+		if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
+					 tp->rack.end_seq, scb->end_seq))
+			break;
+
+		/* A packet is lost if it has not been s/acked beyond
+		 * the recent RTT plus the reordering window.
+		 */
+		remaining = tp->rack.rtt_us + reo_wnd -
+			    tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
+		if (remaining < 0) {
+			tcp_rack_mark_skb_lost(sk, skb);
+			list_del_init(&skb->tcp_tsorted_anchor);
+		} else {
 			/* Record maximum wait time (+1 to avoid 0) */
 			*reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);
-		} else {
-			break;
 		}
 	}
 }
-- 
2.14.2.920.gcf0c67979c-goog

^ permalink raw reply related

* [PATCH net-next 2/3] tcp: more efficient RACK loss detection
From: Yuchung Cheng @ 2017-10-04 19:59 UTC (permalink / raw)
  To: davem; +Cc: netdev, Yuchung Cheng, Neal Cardwell, Eric Dumazet
In-Reply-To: <20171004200000.39257-1-ycheng@google.com>

Use the new time-ordered list to speed up RACK. The detection
logic is identical. But since the list is chronologically ordered
by skb_mstamp and contains only skbs not yet acked or sacked,
RACK can abort the loop upon hitting skbs that were sent more
recently. On YouTube servers this patch reduces the iterations on
write queue by 40x. The improvement is even bigger with large
BDP networks.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp_recovery.c | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 449cd914d58e..8aa56caefde8 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -45,7 +45,7 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
 static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb;
+	struct sk_buff *skb, *n;
 	u32 reo_wnd;
 
 	*reo_timeout = 0;
@@ -58,17 +58,10 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 	if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
 		reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
 
-	tcp_for_write_queue(skb, sk) {
+	list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
+				 tcp_tsorted_anchor) {
 		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
 
-		if (skb == tcp_send_head(sk))
-			break;
-
-		/* Skip ones already (s)acked */
-		if (!after(scb->end_seq, tp->snd_una) ||
-		    scb->sacked & TCPCB_SACKED_ACKED)
-			continue;
-
 		if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
 					tp->rack.end_seq, scb->end_seq)) {
 			/* Step 3 in draft-cheng-tcpm-rack-00.txt:
@@ -81,6 +74,7 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 
 			if (remaining < 0) {
 				tcp_rack_mark_skb_lost(sk, skb);
+				list_del_init(&skb->tcp_tsorted_anchor);
 				continue;
 			}
 
@@ -91,11 +85,7 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 
 			/* Record maximum wait time (+1 to avoid 0) */
 			*reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);
-
-		} else if (!(scb->sacked & TCPCB_RETRANS)) {
-			/* Original data are sent sequentially so stop early
-			 * b/c the rest are all sent after rack_sent
-			 */
+		} else {
 			break;
 		}
 	}
-- 
2.14.2.920.gcf0c67979c-goog

^ permalink raw reply related

* [PATCH net-next 1/3] tcp: new list for sent but unacked skbs for RACK recovery
From: Yuchung Cheng @ 2017-10-04 19:59 UTC (permalink / raw)
  To: davem; +Cc: netdev, Eric Dumazet, Yuchung Cheng, Neal Cardwell
In-Reply-To: <20171004200000.39257-1-ycheng@google.com>

From: Eric Dumazet <edumazet@google.com>

This patch adds a new queue (list) that tracks the sent but not yet
acked or SACKed skbs for a TCP connection. The list is chronologically
ordered by skb->skb_mstamp (the head is the oldest sent skb).

This list will be used to optimize TCP Rack recovery, which checks
an skb's timestamp to judge if it has been lost and needs to be
retransmitted. Since TCP write queue is ordered by sequence instead
of sent time, RACK has to scan over the write queue to catch all
eligible packets to detect lost retransmission, and iterates through
SACKed skbs repeatedly.

Special care for rare events:
1. TCP repair fakes skb transmission so the send queue needs to be adjusted
2. SACK reneging would require re-inserting SACKed skbs into the
   send queue. For now I believe it's not worth the complexity to
   make RACK work perfectly on SACK reneging, so we do nothing here.
3. Fast Open: currently for non-TFO, send-queue correctly queues
   the pure SYN packet. For TFO which queues a pure SYN and
   then a data packet, send-queue only queues the data packet but
   not the pure SYN due to the structure of TFO code. This is okay
   because the SYN receiver would never respond with a SACK on a
   missing SYN (i.e. SYN is never fast-retransmitted by SACK/RACK).

In order to not grow sk_buff, we use a union for the new list and
_skb_refdst/destructor fields. This is a bit complicated because
we need to make sure _skb_refdst and destructor are properly zeroed
before skb is cloned/copied at transmit, and before being freed.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
---
 include/linux/skbuff.h   | 11 +++++++++--
 include/linux/tcp.h      |  1 +
 include/net/tcp.h        | 24 +++++++++++++++++++++++-
 net/ipv4/tcp.c           |  2 ++
 net/ipv4/tcp_input.c     |  9 +++++++--
 net/ipv4/tcp_minisocks.c |  1 +
 net/ipv4/tcp_output.c    | 42 +++++++++++++++++++++++++++++++-----------
 7 files changed, 74 insertions(+), 16 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ada821466e88..01a985937867 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -617,6 +617,7 @@ typedef unsigned char *sk_buff_data_t;
  *	@nf_trace: netfilter packet trace flag
  *	@protocol: Packet protocol from driver
  *	@destructor: Destruct function
+ *	@tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue)
  *	@_nfct: Associated connection, if any (with nfctinfo bits)
  *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
  *	@skb_iif: ifindex of device we arrived on
@@ -686,8 +687,14 @@ struct sk_buff {
 	 */
 	char			cb[48] __aligned(8);
 
-	unsigned long		_skb_refdst;
-	void			(*destructor)(struct sk_buff *skb);
+	union {
+		struct {
+			unsigned long	_skb_refdst;
+			void		(*destructor)(struct sk_buff *skb);
+		};
+		struct list_head	tcp_tsorted_anchor;
+	};
+
 #ifdef CONFIG_XFRM
 	struct	sec_path	*sp;
 #endif
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 4aa40ef02d32..1d2c44e09e31 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -191,6 +191,7 @@ struct tcp_sock {
 	u32	tsoffset;	/* timestamp offset */
 
 	struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
+	struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
 
 	u32	snd_wl1;	/* Sequence for window update		*/
 	u32	snd_wnd;	/* The window we expect to receive	*/
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6d25d8305054..c39bcc222c9b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1588,14 +1588,34 @@ enum tcp_chrono {
 void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type);
 void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type);
 
+/* This helper is needed, because skb->tcp_tsorted_anchor uses
+ * the same memory storage than skb->destructor/_skb_refdst
+ */
+static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb)
+{
+	skb->destructor = NULL;
+	skb->_skb_refdst = 0UL;
+}
+
+#define tcp_skb_tsorted_save(skb) {		\
+	unsigned long _save = skb->_skb_refdst;	\
+	skb->_skb_refdst = 0UL;
+
+#define tcp_skb_tsorted_restore(skb)		\
+	skb->_skb_refdst = _save;		\
+}
+
 /* write queue abstraction */
 static inline void tcp_write_queue_purge(struct sock *sk)
 {
 	struct sk_buff *skb;
 
 	tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
-	while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL)
+	while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
+		tcp_skb_tsorted_anchor_cleanup(skb);
 		sk_wmem_free_skb(sk, skb);
+	}
+	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
 	sk_mem_reclaim(sk);
 	tcp_clear_all_retrans_hints(tcp_sk(sk));
 }
@@ -1710,6 +1730,8 @@ static inline void tcp_insert_write_queue_before(struct sk_buff *new,
 
 static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
 {
+	list_del(&skb->tcp_tsorted_anchor);
+	tcp_skb_tsorted_anchor_cleanup(skb);
 	__skb_unlink(skb, &sk->sk_write_queue);
 }
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 23225c98d287..6d25008be84b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -415,6 +415,7 @@ void tcp_init_sock(struct sock *sk)
 	tp->out_of_order_queue = RB_ROOT;
 	tcp_init_xmit_timers(sk);
 	INIT_LIST_HEAD(&tp->tsq_node);
+	INIT_LIST_HEAD(&tp->tsorted_sent_queue);
 
 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
 	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
@@ -869,6 +870,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
 			 * available to the caller, no more, no less.
 			 */
 			skb->reserved_tailroom = skb->end - skb->tail - size;
+			INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
 			return skb;
 		}
 		__kfree_skb(skb);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index db9bb46b5776..f0402bd9fd7e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1593,6 +1593,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
 						tcp_skb_pcount(skb),
 						skb->skb_mstamp);
 			tcp_rate_skb_delivered(sk, skb, state->rate);
+			if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+				list_del_init(&skb->tcp_tsorted_anchor);
 
 			if (!before(TCP_SKB_CB(skb)->seq,
 				    tcp_highest_sack_seq(tp)))
@@ -3054,8 +3056,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
 
 	shinfo = skb_shinfo(skb);
 	if (!before(shinfo->tskey, prior_snd_una) &&
-	    before(shinfo->tskey, tcp_sk(sk)->snd_una))
-		__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+	    before(shinfo->tskey, tcp_sk(sk)->snd_una)) {
+		tcp_skb_tsorted_save(skb) {
+			__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+		} tcp_skb_tsorted_restore(skb);
+	}
 }
 
 /* Remove acknowledged frames from the retransmission queue. If our packet
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 188a6f31356d..2341b9f857b6 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -446,6 +446,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 		newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
 
 		INIT_LIST_HEAD(&newtp->tsq_node);
+		INIT_LIST_HEAD(&newtp->tsorted_sent_queue);
 
 		tcp_init_wl(newtp, treq->rcv_isn);
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0bc9e46a5369..8162e2880178 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -971,6 +971,12 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
 		      HRTIMER_MODE_ABS_PINNED);
 }
 
+static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	skb->skb_mstamp = tp->tcp_mstamp;
+	list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
+}
+
 /* This routine actually transmits TCP packets queued in by
  * tcp_do_sendmsg().  This is used by both the initial
  * transmission and possible later retransmissions.
@@ -1003,10 +1009,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 		TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
 			- tp->snd_una;
 		oskb = skb;
-		if (unlikely(skb_cloned(skb)))
-			skb = pskb_copy(skb, gfp_mask);
-		else
-			skb = skb_clone(skb, gfp_mask);
+
+		tcp_skb_tsorted_save(oskb) {
+			if (unlikely(skb_cloned(oskb)))
+				skb = pskb_copy(oskb, gfp_mask);
+			else
+				skb = skb_clone(oskb, gfp_mask);
+		} tcp_skb_tsorted_restore(oskb);
+
 		if (unlikely(!skb))
 			return -ENOBUFS;
 	}
@@ -1127,7 +1137,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 		err = net_xmit_eval(err);
 	}
 	if (!err && oskb) {
-		oskb->skb_mstamp = tp->tcp_mstamp;
+		tcp_update_skb_after_send(tp, oskb);
 		tcp_rate_skb_sent(sk, oskb);
 	}
 	return err;
@@ -1328,6 +1338,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 	/* Link BUFF into the send queue. */
 	__skb_header_release(buff);
 	tcp_insert_write_queue_after(skb, buff, sk);
+	list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
 
 	return 0;
 }
@@ -2260,7 +2271,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 
 		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
 			/* "skb_mstamp" is used as a start point for the retransmit timer */
-			skb->skb_mstamp = tp->tcp_mstamp;
+			tcp_update_skb_after_send(tp, skb);
 			goto repair; /* Skip network transmission */
 		}
 
@@ -2838,11 +2849,14 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 		     skb_headroom(skb) >= 0xFFFF)) {
 		struct sk_buff *nskb;
 
-		nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
-		err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
-			     -ENOBUFS;
+		tcp_skb_tsorted_save(skb) {
+			nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
+			err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
+				     -ENOBUFS;
+		} tcp_skb_tsorted_restore(skb);
+
 		if (!err)
-			skb->skb_mstamp = tp->tcp_mstamp;
+			tcp_update_skb_after_send(tp, skb);
 	} else {
 		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 	}
@@ -3023,6 +3037,7 @@ void tcp_send_fin(struct sock *sk)
 				goto coalesce;
 			return;
 		}
+		INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
 		skb_reserve(skb, MAX_TCP_HEADER);
 		sk_forced_mem_schedule(sk, skb->truesize);
 		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
@@ -3078,9 +3093,14 @@ int tcp_send_synack(struct sock *sk)
 	}
 	if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
 		if (skb_cloned(skb)) {
-			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
+			struct sk_buff *nskb;
+
+			tcp_skb_tsorted_save(skb) {
+				nskb = skb_copy(skb, GFP_ATOMIC);
+			} tcp_skb_tsorted_restore(skb);
 			if (!nskb)
 				return -ENOMEM;
+			INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
 			tcp_unlink_write_queue(skb, sk);
 			__skb_header_release(nskb);
 			__tcp_add_write_queue_head(sk, nskb);
-- 
2.14.2.920.gcf0c67979c-goog

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox