Netdev List

Netdev List
 help / color / mirror / Atom feed

* [TSN RFC v2 5/9] Add TSN header for the driver
From: henrik @ 2016-12-16 17:59 UTC (permalink / raw)
  To: linux-kernel
  Cc: Richard Cochran, henrik, Henrik Austad, linux-media, alsa-devel,
	netdev, David S. Miller
In-Reply-To: <1481911153-549-1-git-send-email-henrik@austad.us>

From: Henrik Austad <haustad@cisco.com>

This defines the general TSN headers for network packets, the
shim-interface and the central 'tsn_list' structure.

Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Henrik Austad <haustad@cisco.com>
---
 include/linux/tsn.h | 952 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 952 insertions(+)
 create mode 100644 include/linux/tsn.h

diff --git a/include/linux/tsn.h b/include/linux/tsn.h
new file mode 100644
index 0000000..9123b25
--- /dev/null
+++ b/include/linux/tsn.h
@@ -0,0 +1,952 @@
+/*   TSN - Time Sensitive Networking
+ *
+ *   Copyright (C) 2016- Henrik Austad <haustad@cisco.com>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ */
+#ifndef _TSN_H
+#define _TSN_H
+#include <linux/list.h>
+#include <linux/kthread.h>
+#include <linux/configfs.h>
+#include <linux/hrtimer.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/spinlock.h>
+
+/*
+ * List of current subtype fields in the common header of AVTPDU
+ *
+ * Note: AVTPDU is a remnant of the standards from when it was AVB.
+ *
+ * The list has been updated with the recent values from IEEE 1722, draft 16.
+ */
+enum avtp_subtype {
+	TSN_61883_IIDC = 0,	/* IEC 61883/IIDC Format */
+	TSN_MMA_STREAM,		/* MMA Streams */
+	TSN_AAF,		/* AVTP Audio Format */
+	TSN_CVF,		/* Compressed Video Format */
+	TSN_CRF,		/* Clock Reference Format */
+	TSN_TSCF,		/* Time-Synchronous Control Format */
+	TSN_SVF,		/* SDI Video Format */
+	TSN_RVF,		/* Raw Video Format */
+	/* 0x08 - 0x6D reserved */
+	TSN_AEF_CONTINOUS = 0x6e, /* AES Encrypted Format Continous */
+	TSN_VSF_STREAM,		/* Vendor Specific Format Stream */
+	/* 0x70 - 0x7e reserved */
+	TSN_EF_STREAM = 0x7f,	/* Experimental Format Stream */
+	/* 0x80 - 0x81 reserved */
+	TSN_NTSCF = 0x82,	/* Non Time-Synchronous Control Format */
+	/* 0x83 - 0xed reserved */
+	TSN_ESCF = 0xec,	/* ECC Signed Control Format */
+	TSN_EECF,		/* ECC Encrypted Control Format */
+	TSN_AEF_DISCRETE,	/* AES Encrypted Format Discrete */
+	/* 0xef - 0xf9 reserved */
+	TSN_ADP = 0xfa,		/* AVDECC Discovery Protocol */
+	TSN_AECP,		/* AVDECC Enumeration and Control Protocol */
+	TSN_ACMP,		/* AVDECC Connection Management Protocol */
+	/* 0xfd reserved */
+	TSN_MAAP = 0xfe,	/* MAAP Protocol */
+	TSN_EF_CONTROL,		/* Experimental Format Control */
+};
+
+/* Link-states to help error-recovery detected from irq context.
+ */
+enum link_states {
+	LINK_OFF = 0,
+	LINK_RUNNING,
+	LINK_ERROR,
+};
+
+
+/*		NOTE NOTE NOTE !!
+ * The headers below use bitfields extensively and verifications
+ * are needed when using little-endian vs big-endian systems.
+ */
+
+/* Common part of avtph header
+ *
+ * AVB Transport Protocol Common Header
+ *
+ * Defined in 1722-2011 Sec. 5.2
+ */
+struct avtp_ch {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	/* use avtp_subtype enum.
+	 */
+	u8 subtype:7;
+
+	/* Controlframe: 1
+	 * Dataframe   : 0
+	 */
+	u8 cd:1;
+
+	/* Type specific data, part 1 */
+	u8 tsd_1:4;
+
+	/* In current version of AVB, only 0 is valid, all other values
+	 * are reserved for future versions.
+	 */
+	u8 version:3;
+
+	/* Valid StreamID in frame
+	 *
+	 * ControlData not related to a specific stream should clear
+	 * this (and have stream_id = 0), _all_ other values should set
+	 * this to 1.
+	 */
+	u8 sv:1;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	u8 cd:1;
+	u8 subtype:7;
+	u8 sv:1;
+	u8 version:3;
+	u8 tsd_1:4;
+#else
+#error "Unknown Endianness, cannot determine bitfield ordering"
+#endif
+	/* Type specific data (adjacent to tsd_1, but split due to bitfield) */
+	u16 tsd_2;
+	u64 stream_id;
+
+	/*
+	 * payload by subtype
+	 */
+	u8 pbs[0];
+} __packed;
+
+/* AVTPDU Common Control header format
+ * IEEE 1722#5.3
+ */
+struct avtpc_header {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	u8 subtype:7;
+	u8 cd:1;
+	u8 control_data:4;
+	u8 version:3;
+	u8 sv:1;
+	u16 control_data_length:11;
+	u16 status:5;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	u8 cd:1;
+	u8 subtype:7;
+	u8 sv:1;
+	u8 version:3;
+	u8 control_data:4;
+	u16 status:5;
+	u16 control_data_length:11;
+#else
+#error "Unknown Endianness, cannot determine bitfield ordering"
+#endif
+	u64 stream_id;
+} __packed;
+
+/* AVTP common stream data AVTPDU header format
+ * IEEE 1722#5.4
+ */
+struct avtpdu_header {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	u8 subtype:7;
+	u8 cd:1;
+
+	/* avtp_timestamp valid */
+	u8 tv: 1;
+
+	/* gateway_info valid */
+	u8 gv:1;
+
+	/* reserved */
+	u8 r:1;
+
+	/*
+	 * Media clock Restart toggle
+	 */
+	u8 mr:1;
+
+	u8 version:3;
+
+	/* StreamID valid */
+	u8 sv:1;
+	u8 seqnr;
+
+	/* Timestamp uncertain */
+	u8 tu:1;
+	u8 r2:7;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	u8 cd:1;
+	u8 subtype:7;
+
+	u8 sv:1;
+	u8 version:3;
+	u8 mr:1;
+	u8 r:1;
+	u8 gv:1;
+	u8 tv: 1;
+
+	u8 seqnr;
+	u8 r2:7;
+	u8 tu:1;
+#else
+#error "Unknown Endianness, cannot determine bitfield ordering"
+#endif
+
+	u64 stream_id;
+
+	u32 avtp_timestamp;
+	u32 gateway_info;
+
+	/* Stream Data Length */
+	u16 sd_len;
+
+	/* Protocol specific header, derived from avtp_subtype */
+	u16 psh;
+
+	/* Stream Payload Data 0 to n octets
+	 * n so that total size < MTU
+	 */
+	u8 data[0];
+} __packed;
+
+
+/**
+ * struct tsn_list - The top level container of TSN
+ *
+ * This is what tsn_configfs refers to as 'tier-0'
+ *
+ * @head List of TSN cards
+ * @lock lock protecting global entries
+ * @tsn_subsys Ref to ConfigFS subsystem
+ *
+ * @running: hrtimer is running driving data out
+ * @tsn_timer: hrtimer container
+ * @num_avail Number of available TSN NICs exposed through ConfigFS
+ */
+struct tsn_list {
+	struct list_head head;
+	spinlock_t lock;
+	struct configfs_subsystem tsn_subsys;
+
+	/*
+	 * TSN-timer is running. Not to be confused with the per-link
+	 * disabled flag which indicates if a remote client, like aplay,
+	 * is pushing data to it.
+	 */
+	atomic_t running;
+	struct hrtimer tsn_timer;
+	unsigned int period_ns;
+
+	struct task_struct *tsn_thread;
+	int should_run;
+
+	size_t num_avail;
+};
+
+static inline void tsn_list_lock(struct tsn_list *list)
+{
+	spin_lock(&list->lock);
+}
+static inline void tsn_list_unlock(struct tsn_list *list)
+{
+	spin_unlock(&list->lock);
+}
+
+/**
+ * struct tsn_nic
+ *
+ * Individual TSN-capable NICs, or 'tier-1' struct
+ *
+ * @list linked list of all TSN NICs
+ * @group configfs group
+ * @dev corresponding net_device
+ * @dma_size : size of the DMA buffer
+ * @dma_handle: housekeeping DMA-stuff
+ * @dma_mem : pointer to memory region we're using for DMAing to the NIC
+ * @name Name of NIC (same as name in dev), TO BE REMOVED
+ * @txq Size of Tx-queue. TO BE REMOVED
+ * @rx_registered flag indicating if a handler is registered for the nic
+ * @capable: if the NIC is capable for proper TSN traffic or if it must
+ *	     be emulated in software.
+ *
+ */
+struct tsn_nic {
+	struct list_head list;
+	struct config_group group;
+	struct net_device *dev;
+	struct tsn_list *tsn_list;
+
+	size_t dma_size;
+	dma_addr_t dma_handle;
+	void *dma_mem;
+
+	char *name;
+	int txq;
+	u8 rx_registered:1;
+	u8 capable:1;
+
+	/*
+	 * Any AVTP data stream must set the 802.1Q vlan id and priority
+	 * Code point. This should be obtained from MSRP, default values
+	 * are:
+	 *
+	 * pcp: Class A: 3
+	 *	Class B: 2
+	 *
+	 * See IEEE 802.1Q-2011, Sec 35.2.2.9.3 and table 6-6 in 6.6.2
+	 * for details.
+	 */
+	u8 pcp_a:3;
+	u8 pcp_b:3;
+};
+
+struct tsn_shim_ops;
+/**
+ * tsn_link - Structure describing a single TSN link
+ *
+ */
+struct tsn_link {
+	/* Locks for protecting the link
+	 *
+	 * Due to how we do Rx and Tx, we need different types of locks
+	 * in these settings. A link _cannot_ be both, so even though
+	 * this way of doing it is ugly, it should be safe.
+	 *
+	 * Reader: must disable interrupt as we take the lock in rx-handler (Network bh)
+	 * Talker: must not disable interrupt
+	 */
+	spinlock_t tlock;
+	raw_spinlock_t llock;
+	unsigned long lflags;
+
+	struct config_group group;
+	struct tsn_nic *nic;
+	struct hlist_node node;
+
+	/* The link itself is active, and the tsn_core will treat it as
+	 * an active participant and feed data from it to the
+	 * network. This places some restrictions on what attributes
+	 * (most actually) that can be changed.
+	 *
+	 */
+	atomic_t link_state;
+
+	/* keep track of how many frames we have sent (for debugging) */
+	u64 frames_sent;
+
+	/* timestamp for last frame going in/out over the network, delta
+	 * and avg delta
+	 */
+	u64 ts_net_ns;
+	u64 ts_delta_ns;
+
+	/* simple, exponential smoothing of the time between received
+	 * samples. This is useful for shims that need to calculate an
+	 * offset into the buffer of received data.
+	 * exp_avg = alpha * ts_delta_ns + (1-alpha)exp_avg_{-1}
+	 * avg_delta_ns = ts_delta_ns * alpha_scale + avg_delta_ns * (1 - alpha_scale)
+	 */
+	u64 ts_exp_avg;
+
+	/* alpha_scale is currently in the range 0 - (2^14 - 1) because
+	 * 16384 different values is "probably enough" for a smoothed
+	 * avg.
+	 */
+	u16 ts_exp_alpha;
+
+	/* Pointer to media-specific data.
+	 * e.g. struct avb_chip
+	 */
+	void *media_chip;
+
+	u64 stream_id;
+
+	/*
+	 * The max required size for a _single_ TSN frame.
+	 *
+	 * To be used instead of channels and sample_freq.
+	 */
+	u16 max_payload_size;
+	u16 shim_header_size;
+
+	/*
+	 * Size of buffer (in bytes) to use when handling data to/from
+	 * NIC.
+	 *
+	 * Smaller size will result in client being called more often
+	 * but also provides lower latencies.
+	 */
+	size_t buffer_size;
+	size_t used_buffer_size;
+
+	/* used to keep skb when we overproduce so that we can do a
+	 * somewhat sane backoff.
+	 */
+	struct sk_buff *old_skb;
+
+	/*
+	 * Used when frames are constructed and shipped to the network
+	 * layer. If this is true, 0-frames will be sent insted of data
+	 * from the buffer.
+	 */
+	atomic_t buffer_active;
+
+	/*
+	 * ringbuffer for incoming or outging traffic
+	 * +-----------------------------------+
+	 * |                  ##########       |
+	 * +-----------------------------------+
+	 * ^                  ^         ^      ^
+	 * buffer           tail      head    end
+	 *
+	 * Buffer: start of memory area
+	 * tail: first byte of data in buffer
+	 * head: first unused slot in which to store new data
+	 *
+	 * head,tail is used to represent the position of 'live data' in
+	 * the buffer.
+	 */
+	void *buffer;
+	void *head;
+	void *tail;
+	void *end;
+
+	/* Number of bytes to run refill/drain callbacks */
+	size_t low_water_mark;
+	size_t high_water_mark;
+
+
+	/*
+	 * callback ops.
+	 */
+	struct tsn_shim_ops *ops;
+
+	/*
+	 * EndStation Type
+	 *
+	 * Either Talker or Listener
+	 *
+	 * 1: We are *Talker*, i.e. producing data to send
+	 * 0: We are *Listener*, i.e. we receive data from another ES.
+	 *
+	 * This is for a single link, so even though an end-station can
+	 * be both Talker *and* Listener, a link can only be one.
+	 */
+	u8 estype_talker;
+
+	/*
+	 * Link will use buffer managed by the shim. For this to work,
+	 * the shim must:
+	 *
+	 * - call tsn_use_external_buffer(link, size);
+	 * - provide tsn_shim_buffer_swap(link) in tsn_shim_ops
+	 */
+	u8 external_buffer;
+
+	u8 last_seqnr;
+
+	/*
+	 * Class can be of different classes, currently A or B
+	 *
+	 * ClassA: every 125us
+	 * ClassB: every 250us
+	 *
+	 * This will also affect how large each frame will be and will
+	 * also grab the PCP from the NIC-struct
+	 */
+	enum sr_class class;
+
+	u16 vlan_id;
+
+	u8 remote_mac[6];
+};
+
+static inline void tsn_lock(struct tsn_link *link)
+{
+	if (link->estype_talker)
+		spin_lock(&link->tlock);
+	else
+		raw_spin_lock_irqsave(&link->llock, link->lflags);
+}
+
+static inline void tsn_unlock(struct tsn_link *link)
+{
+	if (link->estype_talker)
+		spin_unlock(&link->tlock);
+	else
+		raw_spin_unlock_irqrestore(&link->llock, link->lflags);
+}
+
+void tsn_lock_init(struct tsn_link *link);
+
+/**
+ * tsn_link_on - make link active
+ *
+ * This cause most of the attributes to be treated read-only since we
+ * will have to re-negotiate with the network if most of these
+ * parameters change.
+ *
+ * Note: this means that the link will be handled by the rx-handler or
+ * the timer callback, but until the link_buffer is set active (via
+ * tsn_lb_on()), actual data is not moved.
+ *
+ * @link: link being set to active
+ */
+static inline void tsn_link_on(struct tsn_link *link)
+{
+	if (link)
+		atomic_set(&link->link_state, LINK_RUNNING);
+}
+
+/**
+ * tsn_link_off - make link inactive
+ *
+ * The link will now be ignored by timer callback or the
+ * rx-handler. Attributes can be mostly freely changed (we assume that
+ * userspace sets values that are negotiated properly).
+ *
+ * @link: link to deactivate
+ */
+static inline void tsn_link_off(struct tsn_link *link)
+{
+	if (link)
+		atomic_set(&link->link_state, LINK_OFF);
+}
+
+static inline int tsn_link_is_off(struct tsn_link *link)
+{
+	if (link)
+		return atomic_read(&link->link_state) == LINK_OFF;
+	return 0;
+}
+/**
+ * tsn_link_err - mark link as having an error.
+ *
+ * Link will be ignored by timer callback after being marked as in error
+ * and will be torn down upon next non-irq handling of link. After being
+ * torn down, it will be marked as 'off'.
+ *
+ *@link: link in error
+ */
+static inline void tsn_link_err(struct tsn_link *link)
+{
+	if (link)
+		atomic_set(&link->link_state, LINK_ERROR);
+}
+static inline int tsn_link_is_err(struct tsn_link *link)
+{
+	if (link)
+		return atomic_read(&link->link_state) == LINK_ERROR;
+	return 0;
+}
+/**
+ * tsn_link_is_on - query link to see if it is active
+ *
+ * Mostly used by tsn_configfs to respect the "read-only" once link is
+ * configured and made active.
+ *
+ * @link active link
+ * @returns 1 if active/on, 0 otherwise
+ */
+static inline int tsn_link_is_on(struct tsn_link *link)
+{
+	if (link)
+		return atomic_read(&link->link_state) == LINK_RUNNING;
+	return 0;
+}
+
+/**
+ * tsn_set_buffer_size - adjust buffersize to match a shim
+ *
+ * This will not allocate (or deallcoate) memory, just adjust how much
+ * of the buffer allocated in tsn_prepare_link is being used. tsn_
+ * expects tsn_clear_buffer_size() to be invoked when stream is closed.
+ */
+int tsn_set_buffer_size(struct tsn_link *link, size_t bsize);
+int tsn_clear_buffer_size(struct tsn_link *link);
+
+/**
+ * tsn_buffer_write write data into the buffer from shim
+ *
+ * This is called from the shim-driver when more data is available and
+ * data needs to be pushed out to the network.
+ *
+ * NOTE: This is used when TSN handles the databuffer. This will not be
+ *	 needed for "shim-hosted" buffers.
+ *
+ * _If_ this function is called when the link is inactive, it will
+ * _enable_ the link (i.e. link will mark the buffer as 'active'). Do
+ * not copy data into the buffer unless you are ready to start sending
+ * frames!
+ *
+ * @link active link
+ * @src	the buffer to copy data from
+ * @bytes bytes to copy
+ * @return bytes copied from link->buffer or negative error
+ */
+int tsn_buffer_write(struct tsn_link *link, void *src, size_t bytes);
+
+
+/**
+ * tsn_buffer_read - read data from link->buffer and give to shim
+ *
+ * When we act as a listener, this is what the shim (should|will) call
+ * to grab data. It typically grabs much more data than the _net
+ * equivalent. It also do not trigger a refill-event the same way
+ * buffer_read_net does.
+ *
+ * @param link current link that holds the buffer
+ * @param buffer the buffer to copy into, must be at least of size bytes
+ * @param bytes number of bytes.
+ *
+ * Note that this routine does NOT CARE about channels, samplesize etc,
+ * it is a _pure_ copy that handles ringbuffer wraps etc.
+ *
+ * This function have side-effects as it will update internal tsn_link
+ * values.
+ *
+ * @return Bytes copied into link->buffer, negative value upon error.
+ */
+int tsn_buffer_read(struct tsn_link *link, void *buffer, size_t bytes);
+
+/**
+ * tsn_lb_enable - TSN Link Buffer Enable
+ *
+ * Mark the link as "buffer-enabled" which will let the core start
+ * shifting data in/out of the buffer instead of ignoring incoming
+ * frames or sending "nullframes".
+ *
+ * This is for the network-end of the tsn-buffer, i.e.
+ * - when enabled frames *from* the network will be inserted into the buffer,
+ * - or frames going *out* will include data from the buffer instead of sending
+ *   null-frames.
+ *
+ * When disabled, data will be zero'd, e.g Tx will send NULL-frames and
+ * Rx will silently drop the frames.
+ *
+ * @link: active link
+ */
+static inline void tsn_lb_enable(struct tsn_link *link)
+{
+	if (link)
+		atomic_set(&link->buffer_active, 1);
+}
+
+/**
+ * tsn_lb_disable - stop using the buffer for the net-side of TSN
+ *
+ * When we close a stream, we do not necessarily tear down the link, and
+ * we need to handle the data in some way.
+ */
+static inline void tsn_lb_disable(struct tsn_link *link)
+{
+	if (link)
+		atomic_set(&link->buffer_active, 0);
+}
+
+/**
+ * tsn_lb() - query if we have disabled pushing of data to/from link-buffer
+ *
+ * @param struct tsn_link *link - active link
+ * @returns 1 if link is enabled
+ */
+static inline int tsn_lb(struct tsn_link *link)
+{
+	if (link)
+		return atomic_read(&link->buffer_active);
+
+	/* if link is NULL; buffer not active */
+	return 0;
+}
+
+/**
+ * tsn_update_net_time - update timestamp for latest activity on the net-side
+ *
+ * This function expects link to be locked, i.e. no-one else is updating
+ * the fields. Typically called from tsn_rx_handler (which grabs tsn_link()).
+ *
+ * @param link active link
+ * @param tim_ns timestamp
+ * @param increment the number if times buffer has been updated
+ *
+ * @returns 0 on success, negative on error.
+ */
+int tsn_update_net_time(struct tsn_link *link, u64 time_ns, int increment);
+
+/**
+ * Shim ops - what tsn_core use when calling back into the shim. All ops
+ * must be reentrant.
+ */
+#define SHIM_NAME_SIZE 32
+struct tsn_shim_ops {
+
+	/* internal linked list used by tsn_core to keep track of all
+	 * shims.
+	 */
+	struct list_head head;
+
+	/**
+	 * name - a unique name identifying this shim
+	 *
+	 * This is what userspace use to indicate to core what SHIM a
+	 * particular link will use. If the name is already present,
+	 * core will reject this name.
+	 */
+	char shim_name[SHIM_NAME_SIZE];
+
+	/**
+	 * probe - callback when a new link of this type is instantiated.
+	 *
+	 * When a new link is brought online, this is called once the
+	 * essential parts of tsn_core has finiesh. Once probe_cb has
+	 * finisehd, the shim _must_ be ready to accept data to/from
+	 * tsn_core. On the other hand, due to the final steps of setup,
+	 * it cannot expect to be called into action immediately after
+	 * probe has finished.
+	 *
+	 * In other words, shim must be ready, but core doesn't have to
+	 *
+	 * @param : a particular link to pass along to the probe-function.
+	 */
+	int (*probe)(struct tsn_link *link);
+
+	/**
+	 * buffer_swap - set a new buffer for the link. [OPTIONAL]
+	 *
+	 * Used when external buffering is enabled.
+	 *
+	 * When called, a new buffer must be returned WITHOUT blocking
+	 * as this will be called from interrupt context.
+	 *
+	 * The buffer returned from the shim must be at least the size
+	 * of used_buffer_size.
+	 *
+	 * @param current link
+	 * @param old_buffer the buffer that are no longer needed
+	 * @param used number of bytes in buffer that has been filled with data.
+	 * @return new buffer to use
+	 */
+	void * (*buffer_swap)(struct tsn_link *link, void *old_buffer,
+			      size_t used);
+
+	/**
+	 * buffer_refill - signal shim that more data is required
+	 * @link Active link
+	 *
+	 * This function should not do anything that can preempt the
+	 * task (kmalloc, sleeping lock) or invoke actions that can take
+	 * a long time to complete.
+	 *
+	 * This will be called from tsn_buffer_read_net() when available
+	 * data in the buffer drops below low_water_mark. It will be
+	 * called with the link-lock *held*
+	 */
+	size_t (*buffer_refill)(struct tsn_link *link);
+
+	/**
+	 * buffer_drain - shim need to copy data from buffer
+	 *
+	 * This will be called from tsn_buffer_write_net() when data in
+	 * the buffer exceeds high_water_mark.
+	 *
+	 * The expected behavior is for the shim to then fill data into
+	 * the buffer via tsn_buffer_write()
+	 */
+	size_t (*buffer_drain)(struct tsn_link *link);
+
+	/**
+	 * media_close - shut down media controller properly
+	 *
+	 * when the link is closed/removed for some reason
+	 * external to the media controller (ALSA soundcard, v4l2 driver
+	 * etc), we call this to clean up.
+	 *
+	 * Normal operation is stopped before media_close is called, but
+	 * all references should be valid. TSN core expects media_close
+	 * to handle any local cleanup, once returned, any references in
+	 * stale tsn_links cannot be trusted.
+	 *
+	 * @link: current link where data is stored
+	 * @returns: 0 upon success, negative on error.
+	 */
+	int (*media_close)(struct tsn_link *link);
+
+	/**
+	 * hdr_size - ask shim how large the header is
+	 *
+	 * Needed when reserving space in skb for transmitting data.
+	 *
+	 * @link: current link where data is stored
+	 * @return: size of header for this shim
+	 */
+	size_t (*hdr_size)(struct tsn_link *link);
+
+	/**
+	 * copy_size - ask client how much from the buffer to include in
+	 *	       the next frame.
+	 *
+	 *	       This is for *outgoing* frames, incoming frames
+	 *	       have 'sd_len' set in the header.
+	 *
+	 *	       Note: copy_size should not return a size larger
+	 *		     than link->max_payload_size
+	 */
+	size_t (*copy_size)(struct tsn_link *link);
+
+	/**
+	 * validate_header - let the shim validate subtype-header
+	 *
+	 * Both psh and data may (or may not) contain headers that need
+	 * validating. This is the responsibility of the shim to
+	 * validate, and ops->valdiate_header() will be called before
+	 * any data is copied from the incoming frame and into the
+	 * buffer.
+	 *
+	 * Important: tsn_core expects validate_header to _not_ alter
+	 * the contents of the frame, and ideally, validate_header could
+	 * be called multiple times and give the same result.
+	 *
+	 * @param: active link owning the new data
+	 * @param: start of data-unit header
+	 *
+	 * This function will be called from interrupt-context and MUST
+	 * NOT take any locks.
+	 */
+	int (*validate_header)(struct tsn_link *link,
+			       struct avtpdu_header *header);
+
+	/**
+	 * assemble_header - add shim-specific headers
+	 *
+	 * This adds the headers required by the current shim after the
+	 * generic 1722-header.
+	 *
+	 * @param: active link
+	 * @param: start of data-unit header
+	 * @param: size of data to send in this frame
+	 * @return void
+	 */
+	void (*assemble_header)(struct tsn_link *link,
+				struct avtpdu_header *header, size_t bytes);
+
+	/**
+	 * get_payload_data - get a pointer to where the data is stored
+	 *
+	 * core will use the pointer (or drop it if NULL is returned)
+	 * and copy header->sd_len bytes of *consecutive* data from the
+	 * target memory and into the buffer memory.
+	 *
+	 * This is called with relevant locks held, from interrupt context.
+	 *
+	 * @param link active link
+	 * @param header header of frame, which contains data
+	 * @returns pointer to memory to copy from
+	 */
+	void * (*get_payload_data)(struct tsn_link *link,
+				   struct avtpdu_header *header);
+};
+/**
+ * tsn_shim_register_ops - register shim-callbacks for a given shim
+ *
+ * @param shim_ops - callbacks. The ops-struct should be kept intact for
+ *		     as long as the driver is running.
+ *
+ *
+ */
+int tsn_shim_register_ops(struct tsn_shim_ops *shim_ops);
+
+/**
+ * tsn_shim_deregister_ops - remove callback for module
+ *
+ * Completely remove shim_ops. This will close any links currently using
+ * this shim. Note: the links will be closed, but _not_ removed.
+ *
+ * @param shim_ops ops associated with this shim
+ */
+void tsn_shim_deregister_ops(struct tsn_shim_ops *shim_ops);
+
+/**
+ * tsn_shim_get_active : return the name of the currently loaded shim
+ *
+ * @param current link
+ * @return name of shim (matches an entry from exported triggers)
+ */
+char *tsn_shim_get_active(struct tsn_link *link);
+
+/**
+ * tsn_shim_find_by_name find shim_ops by name
+ *
+ * @param name of shim
+ * @return shim or NULL if not found/error.
+ */
+struct tsn_shim_ops *tsn_shim_find_by_name(const char *name);
+
+/**
+ * tsn_shim_export_probe_triggers - export a list of registered shims
+ *
+ * @param page to write content into
+ * @returns length of data written to page
+ */
+ssize_t tsn_shim_export_probe_triggers(char *page);
+
+/**
+ * tsn_get_framesize - get the size of the next TSN frame to send
+ *
+ * This will call into the shim to get the next chunk of data to
+ * read. Some sanitychecking is performed, i.e.
+ *
+ * 0 <= size <= max_payload_size
+ *
+ * @param struct tsn_link *link active link
+ * @returns size of frame in bytes or negative on error.
+ */
+static inline size_t tsn_shim_get_framesize(struct tsn_link *link)
+{
+	size_t ret;
+
+	ret = link->ops->copy_size(link);
+	if (ret <= link->max_payload_size)
+		return ret;
+	return link->max_payload_size;
+}
+
+/**
+ * tsn_get_hdr_size - get the size of the shim-specific header size
+ *
+ * The shim will add it's own header to the frame.
+ */
+static inline size_t tsn_shim_get_hdr_size(struct tsn_link *link)
+{
+	size_t ret;
+
+	if (!link || !link->ops->hdr_size)
+		return -EINVAL;
+	ret = link->ops->hdr_size(link);
+	if (ret > link->max_payload_size)
+		return -EINVAL;
+	return ret;
+}
+
+static inline u8 sr_class_to_pcp(struct tsn_nic *nic, enum sr_class class)
+{
+	if (!nic)
+		return 0;
+	switch (class) {
+	case SR_CLASS_A:
+		return nic->pcp_a;
+	case SR_CLASS_B:
+		return nic->pcp_b;
+		/* room for future class C & D */
+	default:
+		pr_err("Unknown class in mapping %d\n", class);
+	}
+	return 0;
+}
+
+#endif	/* _TSN_H */
-- 
2.7.4

^ permalink raw reply related

* [TSN RFC v2 4/9] Adding TSN-driver to Intel I210 controller
From: henrik @ 2016-12-16 17:59 UTC (permalink / raw)
  To: linux-kernel
  Cc: Richard Cochran, henrik, linux-media, alsa-devel, netdev,
	Jeff Kirsher, Jesse Brandeburg, intel-wired-lan, David S. Miller,
	Henrik Austad
In-Reply-To: <1481911153-549-1-git-send-email-henrik@austad.us>

From: Henrik Austad <henrik@austad.us>

This adds support for loading the igb.ko module with tsn
capabilities. This requires a 2-step approach. First enabling TSN in
.config, then load the module with use_tsn=1.

Once enabled and loaded, the controller will be placed in "Qav-mode"
which is when the credit-based shaper is available, 3 of the queues are
removed from regular traffic, max payload is set to 1522 octets (no
jumboframes allowed).

It dumps the registers of interest before and after, so this clutters
kern.log a bit if it is loaded with debug_tsn=1.

Regardless of number of online CPUs, it will enable *all* for Tx-queues as
2 is required for Qav traffic. This has not been tested extensively, so
there may be some instabilities in this.

Improved SR(A|B) accounting:
Use the idleslope-bins to keep track of how much time is reserved for
each class. This can then be used to strip the vlan-tag on the NIC when
the last stream goes (and also allow for reconfiguration of PCP when the
NIC is not sending TSN traffic).

Note: currently this driver is *not* stable, it is still a work in
progress, some points to keep tabs on:
- Set hicred to unlim (for testing this is ok and we avoid some nasty
  calculations)
- once we configure it for TSN, enable credit shaping, do not wait for
  first link to be configured (nobody else should use these queues after
  being configured).
- enable all Tx-/Rx-queues in TSN-mode regardless of num_online_cpus()
- Add 802.1Qav Prio-bit in adapter->flags
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Jesse Brandeburg <jesse.brandeburg@intel.com>
Cc: intel-wired-lan@lists.osuosl.org
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Henrik Austad <haustad@cisco.com>
---
 drivers/net/ethernet/intel/Kconfig        |  18 ++
 drivers/net/ethernet/intel/igb/Makefile   |   2 +-
 drivers/net/ethernet/intel/igb/igb.h      |  26 ++
 drivers/net/ethernet/intel/igb/igb_main.c |  39 ++-
 drivers/net/ethernet/intel/igb/igb_tsn.c  | 468 ++++++++++++++++++++++++++++++
 5 files changed, 550 insertions(+), 3 deletions(-)
 create mode 100644 drivers/net/ethernet/intel/igb/igb_tsn.c

diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig
index c0e1743..d4382b4 100644
--- a/drivers/net/ethernet/intel/Kconfig
+++ b/drivers/net/ethernet/intel/Kconfig
@@ -99,6 +99,24 @@ config IGB
 	  To compile this driver as a module, choose M here. The module
 	  will be called igb.
 
+config IGB_TSN
+       tristate "TSN Support for Intel(R) 82575/82576 i210 Network Controller"
+       depends on IGB && TSN
+	---help---
+	  This driver supports TSN (AVB) on Intel I210 network controllers.
+
+	  When enabled, it will allow the module to be loaded with
+	  "use_tsn" which will initialize the controller to A/V-mode
+	  instead of legacy-mode. This will take 3 of the tx-queues and
+	  place them in 802.1Q QoS mode and enable the credit-based
+	  shaper for 2 of the queues.
+
+	  If built with this option, but not loaded with use_tsn, the
+	  only difference is a slightly larger module, no extra
+	  code paths are called.
+
+	  If unsure, say No
+
 config IGB_HWMON
 	bool "Intel(R) PCI-Express Gigabit adapters HWMON support"
 	default y
diff --git a/drivers/net/ethernet/intel/igb/Makefile b/drivers/net/ethernet/intel/igb/Makefile
index 5bcb2de..1a9b776 100644
--- a/drivers/net/ethernet/intel/igb/Makefile
+++ b/drivers/net/ethernet/intel/igb/Makefile
@@ -33,4 +33,4 @@ obj-$(CONFIG_IGB) += igb.o
 
 igb-objs := igb_main.o igb_ethtool.o e1000_82575.o \
 	    e1000_mac.o e1000_nvm.o e1000_phy.o e1000_mbx.o \
-	    e1000_i210.o igb_ptp.o igb_hwmon.o
+	    e1000_i210.o igb_ptp.o igb_hwmon.o igb_tsn.o
diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index d11093d..474a5b4 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -394,6 +394,7 @@ struct igb_nfc_filter {
 };
 
 /* board specific private data structure */
+
 struct igb_adapter {
 	unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
 
@@ -519,6 +520,17 @@ struct igb_adapter {
 	/* lock for RX network flow classification filter */
 	spinlock_t nfc_lock;
 	bool etype_bitmap[MAX_ETYPE_FILTER];
+
+#if IS_ENABLED(CONFIG_IGB_TSN)
+	/* Reserved BW for class A and B */
+	s32 sra_idleslope_res;
+	s32 srb_idleslope_res;
+	u8 pcp_hi;
+	u8 pcp_lo;
+	u8 tsn_ready:1;
+	u8 tsn_vlan_added:1;
+	u8 res:6;
+#endif	/* IGB_TSN */
 };
 
 /* flags controlling PTP/1588 function */
@@ -540,6 +552,7 @@ struct igb_adapter {
 #define IGB_FLAG_HAS_MSIX		BIT(13)
 #define IGB_FLAG_EEE			BIT(14)
 #define IGB_FLAG_VLAN_PROMISC		BIT(15)
+#define IGB_FLAG_QAV_PRIO		BIT(16)
 
 /* Media Auto Sense */
 #define IGB_MAS_ENABLE_0		0X0001
@@ -603,6 +616,19 @@ void igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, unsigned char *va,
 			 struct sk_buff *skb);
 int igb_ptp_set_ts_config(struct net_device *netdev, struct ifreq *ifr);
 int igb_ptp_get_ts_config(struct net_device *netdev, struct ifreq *ifr);
+/* This should be the only place where we add ifdeffery
+ * to include tsn-stuff or not. Everything else is located in igb_tsn.c
+ */
+#if IS_ENABLED(CONFIG_IGB_TSN)
+void igb_tsn_init(struct igb_adapter *adapter);
+int igb_tsn_capable(struct net_device *netdev);
+int igb_tsn_link_configure(struct net_device *netdev, enum sr_class sr_class,
+			u16 framesize, u16 vid, u8 add_link, u8 pcp_hi, u8 pcp_lo);
+u16 igb_tsn_select_queue(struct net_device *netdev, struct sk_buff *skb,
+			void *accel_priv, select_queue_fallback_t fallback);
+#else
+static inline void igb_tsn_init(struct igb_adapter *adapter) { }
+#endif	/* CONFIG_IGB_TSN */
 void igb_set_flag_queue_pairs(struct igb_adapter *, const u32);
 #ifdef CONFIG_IGB_HWMON
 void igb_sysfs_exit(struct igb_adapter *adapter);
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 9affd7c..0283864 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -256,6 +256,12 @@ static int debug = -1;
 module_param(debug, int, 0);
 MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
 
+static int use_tsn = 0;
+#if IS_ENABLED(CONFIG_IGB_TSN)
+MODULE_PARM_DESC(use_tsn, "use_tsn (0=off, 1=enabled)");
+module_param(use_tsn, int, 0);
+#endif
+
 struct igb_reg_info {
 	u32 ofs;
 	char *name;
@@ -675,6 +681,9 @@ static int __init igb_init_module(void)
 {
 	int ret;
 
+	if (use_tsn != 1)
+		use_tsn = 0;
+
 	pr_info("%s - version %s\n",
 	       igb_driver_string, igb_driver_version);
 	pr_info("%s\n", igb_copyright);
@@ -2161,6 +2170,11 @@ static const struct net_device_ops igb_netdev_ops = {
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= igb_netpoll,
 #endif
+#if IS_ENABLED(CONFIG_IGB_TSN)
+	.ndo_tsn_capable	= igb_tsn_capable,
+	.ndo_tsn_link_configure = igb_tsn_link_configure,
+	.ndo_select_queue	= igb_tsn_select_queue,
+#endif	/* CONFIG_IGB_TSN */
 	.ndo_fix_features	= igb_fix_features,
 	.ndo_set_features	= igb_set_features,
 	.ndo_fdb_add		= igb_ndo_fdb_add,
@@ -2682,6 +2696,9 @@ static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	/* do hw tstamp init after resetting */
 	igb_ptp_init(adapter);
 
+	if (use_tsn)
+		igb_tsn_init(adapter);
+
 	dev_info(&pdev->dev, "Intel(R) Gigabit Ethernet Network Connection\n");
 	/* print bus type/speed/width info, not applicable to i354 */
 	if (hw->mac.type != e1000_i354) {
@@ -3009,6 +3026,21 @@ static void igb_init_queue_configuration(struct igb_adapter *adapter)
 
 	adapter->rss_queues = min_t(u32, max_rss_queues, num_online_cpus());
 
+#if IS_ENABLED(CONFIG_IGB_TSN)
+	/* For I210: If we are using TSN, we don't want to use num_online_cpus(),
+	 * as we need 4 Tx-queues.
+	 *
+	 * Rx is another matter, but in time we probably want to assign
+	 * Rx-0 to class A, Rx-1 to B, -2 to PTP/MSRP control etc and -3
+	 * to all other traffic.
+	 */
+	if (use_tsn && hw->mac.type == e1000_i210) {
+		adapter->rss_queues = max_rss_queues;
+		pr_info("igb_init_queue_configuration: rss_queues=%u\n",
+			adapter->rss_queues);
+	}
+#endif	/* IGB_TSN */
+
 	igb_set_flag_queue_pairs(adapter, max_rss_queues);
 }
 
@@ -3397,7 +3429,8 @@ void igb_configure_tx_ring(struct igb_adapter *adapter,
 	txdctl |= IGB_TX_PTHRESH;
 	txdctl |= IGB_TX_HTHRESH << 8;
 	txdctl |= IGB_TX_WTHRESH << 16;
-
+	if (ring->flags & IGB_FLAG_QAV_PRIO)
+		txdctl |= E1000_TXDCTL_PRIORITY;
 	txdctl |= E1000_TXDCTL_QUEUE_ENABLE;
 	wr32(E1000_TXDCTL(reg_idx), txdctl);
 }
@@ -5345,8 +5378,10 @@ static netdev_tx_t igb_xmit_frame(struct sk_buff *skb,
 	/* The minimum packet size with TCTL.PSP set is 17 so pad the skb
 	 * in order to meet this minimum size requirement.
 	 */
-	if (skb_put_padto(skb, 17))
+	if (skb_put_padto(skb, 17)) {
+		pr_err("%s: skb_put_padto FAILED. skb->len < 17\n", __func__);
 		return NETDEV_TX_OK;
+	}
 
 	return igb_xmit_frame_ring(skb, igb_tx_queue_mapping(adapter, skb));
 }
diff --git a/drivers/net/ethernet/intel/igb/igb_tsn.c b/drivers/net/ethernet/intel/igb/igb_tsn.c
new file mode 100644
index 0000000..dd5d9ed
--- /dev/null
+++ b/drivers/net/ethernet/intel/igb/igb_tsn.c
@@ -0,0 +1,468 @@
+/*
+ * Copyright(c) 2015-2016 Henrik Austad <haustad@cisco.com>
+ *                        Cisco Systems, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+/* FIXME: This should probably be handled by some Makefile-magic */
+
+#if IS_ENABLED(CONFIG_IGB_TSN)
+#include "igb.h"
+#include <linux/module.h>
+
+/* NOTE: keep the defines not present in e1000_regs.h to avoid
+ * cluttering too many files. Once we are pretty stable, these will move
+ * into it's proper home. Until then, make merge a bit easier by
+ * avoiding it
+ */
+
+/* Qav regs */
+#define E1000_IRPBS	 0x02404 /* Rx Packet Buffer Size - RW */
+#define E1000_ITPBS	 0x03404 /* Tx buffer size assignment */
+#define E1000_TQAVCTRL   0x03570 /* Tx Qav Control */
+#define E1000_DTXMXPKTSZ 0x0355C /* DMA TX Maximum Packet Size */
+
+/* Qav defines. */
+#define E1000_TQAVCH_UNLIM_CREDIT      0xFFFFFFFF
+#define E1000_TQAVCH_ZERO_CREDIT       0x80000000
+#define E1000_LINK_RATE		       0x7735
+
+/* 0:15  idleSlope
+ * 16:29 reserved
+ * 30    reserved
+ * 31    queue mode, 0=strict, 1=SR mode
+ */
+#define E1000_TQAVCC_QUEUEMODE         0x80000000
+#define E1000_TQAVCC_IDLE_SLOPE_MASK   0x0000ffff
+
+/* Transmit mode, 0=legacy, 1=QAV */
+#define E1000_TQAVCTRL_TXMODE          0x00000001
+/* report DMA time of tx packets */
+#define E1000_TQAVCTRL_1588_STAT_EN    0x00000004
+/* data fetch arbitration */
+#define E1000_TQAVCTRL_DATA_FETCH_ARB  0x00000010
+/* data tx arbitration */
+#define E1000_TQAVCTRL_DATA_TRAN_ARB   0x00000100
+/* data launch time valid */
+#define E1000_TQAVCTRL_DATA_TRAN_TIM   0x00000200
+/* stall SP to guarantee SR */
+#define E1000_TQAVCTRL_SP_WAIT_SR      0x00000400
+
+/* ... and associated shift value */
+#define E1000_TQAVCTRL_FETCH_TM_SHIFT  (16)
+
+/* QAV Tx mode control registers where _n can be 0 or 1. */
+#define E1000_TQAVCC(_idx)			(0x03004 + 0x40 * (_idx))
+
+/* Tx Qav High Credit - See 7.2.7.6 for calculations
+ * intel 8.12.18
+ */
+#define E1000_TQAVHC(_idx)			(0x0300C + 0x40 * (_idx))
+
+/* Queues priority masks where _n and _p can be 0-3. */
+
+#define MAX_FRAME_SIZE 1522
+#define MIN_FRAME_SIZE   64
+
+static int debug_tsn = -1;
+module_param(debug_tsn, int, 0);
+MODULE_PARM_DESC(debug_tsn, "debug_tsn (0=off, 1=enabled)");
+
+/* For a full list of the registers dumped here, see sec 8.1.3 in the
+ * i210 controller datasheet.
+ */
+static inline void _tsn_dump_regs(struct igb_adapter *adapter)
+{
+	u32 val = 0;
+	struct device *dev;
+	struct e1000_hw *hw = &adapter->hw;
+
+	/* do not dump regs if we're not debugging driver */
+	if (debug_tsn != 1)
+		return;
+
+	dev = &adapter->pdev->dev;
+	dev_info(dev, "num_tx_queues=%d (netdev=%d, real=%d), num_rx_queues=%d (netdev=%d, real=%d)\n",
+		adapter->num_tx_queues, adapter->netdev->num_tx_queues, adapter->netdev->real_num_tx_queues,
+		adapter->num_rx_queues, adapter->netdev->num_rx_queues, adapter->netdev->real_num_rx_queues);
+
+	/* 0x0008 - E1000_STATUS Device status register */
+	val = rd32(E1000_STATUS);
+	dev_info(&adapter->pdev->dev, "\n");
+	dev_info(dev, "Status: FullDuplex=%s, LinkUp=%s, speed=0x%x\n",
+		 val & 0x1 ? "FD" : "HD",
+		 val & 0x2 ? "LU" : "LD",
+		 val & 0xc0 >> 6);
+
+	/* E1000_VET vlan ether type */
+	val = rd32(E1000_VET);
+	dev_info(dev, "VLAN ether type: VET.VET=0x%04x, VET.VET_EXT=0x%04x\n",
+		 val & 0xffff, (val >> 16) & 0xffff);
+
+	/* E1000_RXPBS (RXPBSIZE) Rx Packet Buffer Size */
+	val = rd32(E1000_RXPBS);
+	dev_info(dev, "Rx Packet buffer: RXPBSIZE=%dkB, Bmc2ospbsize=%dkB, cfg_ts_en=%s\n",
+		 val & 0x1f,
+		 (val >> 6) & 0x1f,
+		 (val & (1 << 31)) ? "cfg_ts_en" : "cfg_ts_dis");
+
+	/* Transmit stuff */
+	/* E1000_TXPBS (TXPBSIZE) Tx Packet Buffer Size - RW */
+	val = rd32(E1000_TXPBS);
+	dev_info(dev, "Tx Packet buffer: Txpb0size=%dkB, Txpb1size=%dkB, Txpb2size=%dkB, Txpb3size=%dkB, os2Bmcpbsize=%dkB\n",
+		 val & 0x3f, (val >> 6) & 0x3f, (val >> 12) & 0x3f,
+		 (val >> 18) & 0x3f, (val >> 24) & 0x3f);
+
+	/* E1000_TCTL (TCTL) Tx control - RW*/
+	val = rd32(E1000_TCTL);
+	dev_info(dev, "Tx control reg: TxEnable=%s, CT=0x%X\n",
+		 val & 2 ? "EN" : "DIS", (val >> 3) & 0x3F);
+
+	/* TQAVHC     : Transmit Qav High credits 0x300C + 0x40*n - RW */
+	val = rd32(E1000_TQAVHC(0));
+	dev_info(dev, "E1000_TQAVHC0: %0x08x\n", val);
+	val = rd32(E1000_TQAVHC(1));
+	dev_info(dev, "E1000_TQAVHC1: %0x08x\n", val);
+
+	/* TQAVCC[0-1]: Transmit Qav 0x3004 + 0x40*n  - RW */
+	val = rd32(E1000_TQAVCC(0));
+	dev_info(dev, "E1000_TQAVCC0: idleSlope=0x%02x, QueueMode=%s\n",
+		 val % 0xff,
+		 val > 31 ? "Stream reservation" : "Strict priority");
+	val = rd32(E1000_TQAVCC(1));
+	dev_info(dev, "E1000_TQAVCC1: idleSlope=0x%02x, QueueMode=%s\n",
+		 val % 0xff,
+		 val > 31 ? "Stream reservation" : "Strict priority");
+
+	/* TQAVCTRL   : Transmit Qav control - RW */
+	val = rd32(E1000_TQAVCTRL);
+	dev_info(dev, "E1000_TQAVCTRL: TransmitMode=%s,1588_STAT_EN=%s,DataFetchARB=%s,DataTranARB=%s,DataTranTIM=%s,SP_WAIT_SR=%s,FetchTimDelta=%dns (0x%04x)\n",
+		 (val & 0x0001) ? "Qav" : "Legacy",
+		 (val & 0x0004) ? "En" : "Dis",
+		 (val & 0x0010) ? "Most Empty" : "Round Robin",
+		 (val & 0x0100) ? "Credit Shaper" : "Strict priority",
+		 (val & 0x0200) ? "Valid" : "N/A",
+		 (val & 0x0400) ? "Wait" : "nowait",
+		 (val >> 16) * 32, (val >> 16));
+}
+
+/* Place the NIC in Qav-mode.
+ *
+ * This will result in a _single_ queue for normal BE traffic, the rest
+ * will be grabbed by the Qav-machinery and kept for strict priority
+ * transmission.
+ *
+ * I210 Datasheet Sec 7.2.7.7 gives a lot of information.
+ */
+void igb_tsn_init(struct igb_adapter *adapter)
+{
+	struct e1000_hw *hw = &adapter->hw;
+	u32 txpbsize;
+	u32 tqavctrl;
+
+	if (debug_tsn < 0 || debug_tsn > 1)
+		debug_tsn = 0;
+
+	if (!adapter->pdev) {
+		adapter->tsn_ready = 0;
+		return;
+	}
+
+	switch (adapter->pdev->device) {
+	case 0x1533:   /* E1000_DEV_ID_I210_COPPER */
+	case 0x1536:   /* E1000_DEV_ID_I210_FIBER */
+	case 0x1537:   /* E1000_DEV_ID_I210_SERDES: */
+	case 0x1538:   /* E1000_DEV_ID_I210_SGMII: */
+	case 0x157b:   /* E1000_DEV_ID_I210_COPPER_FLASHLESS: */
+	case 0x157c:   /* E1000_DEV_ID_I210_SERDES_FLASHLESS: */
+		break;
+	default:
+		/* not a known IGB-TSN capable device */
+		adapter->tsn_ready = 0;
+		return;
+	}
+	_tsn_dump_regs(adapter);
+
+	if (adapter->num_tx_queues != 4) {
+		pr_err("IGB_TSN: ERROR, not enough TX-queues available, need 4, got %d\n",
+			adapter->num_tx_queues);
+		return;
+	}
+
+	/* setup the Transmit Descriptor Control (see 8.12.15)
+	 *
+	 * Set a flag for priority-queue and call igb_configure_tx_ring() in igb_main.c
+	 */
+	adapter->tx_ring[0]->flags |= IGB_FLAG_QAV_PRIO;
+	adapter->tx_ring[1]->flags |= IGB_FLAG_QAV_PRIO;
+	igb_configure_tx_ring(adapter, adapter->tx_ring[0]);
+	igb_configure_tx_ring(adapter, adapter->tx_ring[1]);
+
+	/* Set Tx packet buffer size assignment, see 7.2.7.7 in i210
+	 * PB0: 8kB (default 20 kB)
+	 * PB1: 8kB (default  0 pkB)
+	 * PB2: 4kB (default  0 kB)
+	 * PB3: 4kB (default  0 kB)
+	 *
+	 * os2bmcsize: 2kB (default 4 kB)
+
+	 * UPDATE: set os2bmcpbsize to 0, then we can drop setting RX
+	 * packet buffer
+	 *
+	 * sumTx: 24kB (26kB)
+	 *
+	 * Rxpbsize: 0x20 (32kB default 34kB
+	 * bmc2ossize: 0x02 (default 0x02)
+	 * sumRx: 34kB
+	 *
+	 * Total 60kB (see 4.5.9)
+	 *
+	 * See 8.3.1 && 8.3.2 for fields
+	 */
+	txpbsize = (0 << 30 | 0x00 << 24 | 0x04 << 18 | 0x04 << 12 | 0x08 << 6 | 0x08);
+	wr32(E1000_ITPBS, txpbsize);
+
+	/* Since we dropped os2bmcsize, we do not have to change the
+	 * default here after all */
+	/* wr32(E1000_IRPBS, 0x02 << 6 | 0x20); */
+
+	/* DMA Tx maximum packet size, the largest frame DMA should transport
+	 * do not allow frames larger than 1522 + preample. Reg expects
+	 * size in 64B increments. 802.1BA 6.3
+	 * Round up to 1536 to handle 64B increments
+	 *
+	 * Initial value: 0x98 (152 => 9728 bytes)
+	 */
+	wr32(E1000_DTXMXPKTSZ, 1536 >> 6);
+
+	/* For now, only set CreditBased shaper for A and B, do not set
+	 * idleSlope as we have not yet gotten any streams. Set HiCredit
+	 * to be unlimitied (this violates the 75% 'default' boundary
+	 *
+	 * 8.12.19
+	 */
+	wr32(E1000_TQAVCC(0), E1000_TQAVCC_QUEUEMODE);
+	wr32(E1000_TQAVCC(1), E1000_TQAVCC_QUEUEMODE);
+	wr32(E1000_TQAVHC(0), E1000_TQAVCH_UNLIM_CREDIT);
+	wr32(E1000_TQAVHC(1), E1000_TQAVCH_UNLIM_CREDIT);
+
+	/* Place card in Qav-mode, use tx-queue 0,1 for Qav
+	 * (Credit-based shaper), 2,3 for standard priority (and
+	 * best-effort) traffic.
+	 *
+	 * i210 8.12.19 and 8.12.21
+	 *
+	 * - Fetch: most empty and time based (not round-robin)
+	 * - Transmit: Credit based shaper for SR queues
+	 * - Data launch time valid (in Qav mode) is off (we do not want
+	 *			     time-triggered launch)
+	 * - Wait for SR queues to ensure that launch time is always valid.
+	 * - Set ~10us wait-time-delta, 32ns granularity
+	 */
+	tqavctrl = E1000_TQAVCTRL_TXMODE      | \
+		E1000_TQAVCTRL_DATA_FETCH_ARB | \
+		E1000_TQAVCTRL_DATA_TRAN_ARB  | \
+		E1000_TQAVCTRL_SP_WAIT_SR     | \
+		320 << E1000_TQAVCTRL_FETCH_TM_SHIFT;
+	wr32(E1000_TQAVCTRL, tqavctrl);
+
+	/* reset Tx Descriptor tail and head for the queues */
+	wr32(E1000_TDT(0), 0);
+	wr32(E1000_TDT(1), 0);
+	wr32(E1000_TDH(0), 0);
+	wr32(E1000_TDH(1), 0);
+
+	_tsn_dump_regs(adapter);
+	dev_info(&adapter->pdev->dev, "\n");
+
+	adapter->sra_idleslope_res = 0;
+	adapter->srb_idleslope_res = 0;
+	adapter->tsn_ready = 1;
+	adapter->tsn_vlan_added = 0;
+
+	dev_info(&adapter->pdev->dev, "%s: setup done\n", __func__);
+}
+
+int igb_tsn_capable(struct net_device *netdev)
+{
+	struct igb_adapter *adapter;
+
+	if (!netdev)
+		return -EINVAL;
+	adapter = netdev_priv(netdev);
+	return adapter->tsn_ready == 1;
+}
+
+/* igb_tsn_link_configure - configure NIC to handle a new stream
+ *
+ * @netdev: pointer to NIC device
+ * @class: the class for the stream used to find the correct queue.
+ * @framesize: size of each frame, *including* headers (not preamble)
+ * @vid: VLAN ID
+ *
+ * NOTE: the sr_class only instructs the driver which queue to use, not
+ * what priority the network expects for a given class. This is
+ * something userspace must find out and then let the tsn-driver set in
+ * the frame before xmit.
+ *
+ * FIXME: remove bw-req from a stream that goes away.
+ */
+int igb_tsn_link_configure(struct net_device *netdev, enum sr_class class,
+			   u16 framesize, u16 vid, u8 add_link, u8 pcp_hi, u8 pcp_lo)
+{
+	s32 idle_slope = 0;
+	s32 frames_pr_ms;
+	s32 new_is = 0;
+	u32 tqavcc;
+	int err;
+
+	struct igb_adapter *adapter;
+	struct e1000_hw *hw;
+
+	if (!netdev)
+		return -EINVAL;
+	adapter  = netdev_priv(netdev);
+	hw = &adapter->hw;
+
+	if (!igb_tsn_capable(netdev)) {
+		pr_err("%s:  NIC not capable\n", __func__);
+		return -EINVAL;
+	}
+
+	if (framesize > MAX_FRAME_SIZE || framesize < MIN_FRAME_SIZE) {
+		pr_err("%s: framesize (%u) must be [%d,%d]\n", __func__,
+		       framesize, MIN_FRAME_SIZE, MAX_FRAME_SIZE);
+		return -EINVAL;
+	}
+
+	/*
+	 * FIXME: This should disable EEE and (possibly) enable it when
+	 * removing the last link.
+	 *
+	 * FIXME: This is only done the first time and have the
+	 * potentional of never being reset as we do not count the
+	 * number of configured links.
+	 *
+	 * This means that if all links goes away and the network
+	 * reconfigures the domain to use different PCPs, we will drop
+	 * that.
+	 *
+	 * FIXME: this update is racy
+	 */
+	if (add_link && !adapter->tsn_vlan_added) {
+		rtnl_lock();
+		pr_info("%s: adding VLAN %u to HW filter on device %s\n",
+			__func__, vid, netdev->name);
+		err = vlan_vid_add(netdev, htons(ETH_P_8021Q), vid);
+		if (err != 0)
+			pr_err("%s: error adding vlan %u, res=%d\n",
+				__func__, vid, err);
+		rtnl_unlock();
+		adapter->pcp_hi = pcp_hi & 0x7;
+		adapter->pcp_lo = pcp_lo & 0x7;
+		adapter->tsn_vlan_added = 1;
+	}
+
+	/* we currently drop hicred (we have set this to unlim to ease calculation) */
+	/* Grab current values of idle_slope
+	 */
+	switch(class) {
+	case SR_CLASS_A:
+		tqavcc = rd32(E1000_TQAVCC(0));
+		frames_pr_ms = 8;
+		break;
+	case SR_CLASS_B:
+		tqavcc = rd32(E1000_TQAVCC(1));
+		frames_pr_ms = 4;
+		break;
+	default:
+		pr_err("igb_tsn: unknown traffic-class, aborting configuration\n");
+		return -EINVAL;
+	}
+	idle_slope = tqavcc & E1000_TQAVCC_IDLE_SLOPE_MASK;
+
+	/* Calculate new idle slope and add to appropriate idle_slope
+	 * idle_slope = BW * linkrate * 2 (0r 0.2 for 100Mbit)
+	 * BW: % of total bandwidth
+	 *
+	 * E1000_LINK_RATE: 0x7735
+	 * LINE_SPEED: 1e9
+	 */
+	new_is = (s32)framesize * frames_pr_ms * E1000_LINK_RATE * 2 * 8;
+	if (new_is % 1000000)
+		new_is += 1000000;
+	new_is /= 1000000;
+
+	pr_info("Framesize=%u,E1000_LINK_RATE=%u,class=%s,new_is=%s%d,current_is=%u\n",
+		framesize,E1000_LINK_RATE,
+		(class == SR_CLASS_A ? "A" : "B"),
+		(add_link ? "+" : "-"),
+		new_is, idle_slope);
+	if (!add_link)
+		new_is *= -1;
+	idle_slope += new_is;
+
+	tqavcc &= ~E1000_TQAVCC_IDLE_SLOPE_MASK;
+	tqavcc |= idle_slope & E1000_TQAVCC_IDLE_SLOPE_MASK;
+	if (class == SR_CLASS_A) {
+		wr32(E1000_TQAVCC(0), tqavcc);
+		adapter->sra_idleslope_res += (s16)new_is;
+	} else {
+		wr32(E1000_TQAVCC(1), tqavcc);
+		adapter->srb_idleslope_res += (s16)new_is;
+	}
+
+	if (adapter->sra_idleslope_res == 0 && adapter->srb_idleslope_res == 0) {
+		/* removing last stream going through NIC, drop vlan and
+		 * make it possible to change PCP */
+		rtnl_lock();
+		vlan_vid_del(netdev, htons(ETH_P_8021Q), vid);
+		adapter->tsn_vlan_added = 0;
+		rtnl_unlock();
+	}
+	_tsn_dump_regs(netdev_priv(netdev));
+	pr_info("igb_tsn_link_configure: done setting up TSN, idle_slope: %u,for frame: %u\n",
+		idle_slope, new_is);
+
+	return 0;
+}
+
+u16 igb_tsn_select_queue(struct net_device *netdev, struct sk_buff *skb,
+			void *accel_priv, select_queue_fallback_t fallback)
+{
+	struct igb_adapter *adapter = netdev_priv(netdev);
+	if (!adapter)
+		return fallback(netdev, skb);
+
+	/* we only return the special queue(s) for tsn-traffic, and gPTP
+	 * otherwise we pick the last queue
+	 */
+	if (igb_tsn_capable(netdev)) {
+		switch (vlan_get_protocol(skb)) {
+		case htons(ETH_P_TSN):
+			if (skb->priority == adapter->pcp_hi)
+				return 0;
+			if (skb->priority == adapter->pcp_lo)
+				return 1;
+			pr_err("igb_tsn select queu: unknown PCP:%u, expected either hi=%u or lo=%u\n",
+				skb->priority, adapter->pcp_hi, adapter->pcp_lo);
+			break;
+		case htons(ETH_P_1588):
+			/* PTP should be sent via tx-queue 2 */
+			return 2;
+		default:
+			/* rest via 3 */
+			return  3;
+		}
+	}
+	return fallback(netdev, skb);
+}
+#endif	/* #if IS_ENABLED(CONFIG_IGB_TSN) */
-- 
2.7.4

^ permalink raw reply related

* [TSN RFC v2 3/9] TSN: Add the standard formerly known as AVB to the kernel
From: henrik @ 2016-12-16 17:59 UTC (permalink / raw)
  To: linux-kernel
  Cc: Richard Cochran, henrik, linux-media, alsa-devel, netdev,
	David S. Miller, Henrik Austad
In-Reply-To: <1481911153-549-1-git-send-email-henrik@austad.us>

From: Henrik Austad <henrik@austad.us>

TSN provides a mechanism to create reliable, jitter-free, low latency
guaranteed bandwidth links over a local network. It does this by
reserving a path through the network. Support for TSN must be found in
both the NIC as well as in the network itself.

This adds required hooks into netdev_ops so that the core TSN driver can
use this when configuring a new NIC or setting up a new link. It also
provides hook for removing a link and reducing the idle_slope parameter on
the NIC.

(We need to set the PCP values when we first configure the link. This
 value should not change as long as we have valid streams running, and in
 most cases, the PCP for the domain will not change.)

Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Henrik Austad <haustad@cisco.com>
---
 include/linux/netdevice.h | 44 ++++++++++++++++++++++++++++++++++++++++++++
 net/Kconfig               |  1 +
 net/tsn/Kconfig           | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 77 insertions(+)
 create mode 100644 net/tsn/Kconfig

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e16a2a9..0d758aa 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -112,6 +112,15 @@ enum netdev_tx {
 };
 typedef enum netdev_tx netdev_tx_t;
 
+#if IS_ENABLED(CONFIG_TSN)
+enum sr_class {
+	SR_CLASS_A = 1,
+	SR_CLASS_B = 2,
+	SR_CLASS_LAST,
+	SR_CLASS_ERR,
+};
+#endif
+
 /*
  * Current order: NETDEV_TX_MASK > NET_XMIT_MASK >= 0 is significant;
  * hard_start_xmit() return < NET_XMIT_MASK means skb was consumed.
@@ -944,6 +953,31 @@ struct netdev_xdp {
  *
  * void (*ndo_poll_controller)(struct net_device *dev);
  *
+ *	TSN functions (if CONFIG_TSN)
+ *
+ * int (*ndo_tsn_capable)(struct net_device *dev);
+ *	If a particular device is capable of sustaining TSN traffic
+ *	provided current configuration
+ *
+ * int (*ndo_tsn_link_configure)(struct net_device *dev,
+ *				 enum sr_class class,
+ *				 u16 framesize,
+ *				 u16 vid,
+ *				 u8 add_link,
+ *				 u8 pcp_hi,
+ *				 u8 pcp_lo)
+);
+ *     Configure a NIC to handle TSN-streams
+ *     - Update the bandwidth for the particular stream-class.
+ *     - The framesize is the size of the _entire_ frame (not just the payload)
+ *       since the full size is required to allocate bandwidth through
+ *       the credit based shaper in the NIC
+ *     - the vlan_id is the configured vlan for TSN in this session.
+ *     - add_link: if the link should be added or subtracted from the current
+ *       budget.
+ *    - u8 pcp_hi: 802.1Q priority value for high-class traffic (class A)
+ *    - u8 pcp_lo: 802.1Q priority value for low-class traffic (class B)
+ *
  *	SR-IOV management functions.
  * int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac);
  * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan,
@@ -1185,6 +1219,16 @@ struct net_device_ops {
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	int			(*ndo_busy_poll)(struct napi_struct *dev);
 #endif
+
+#if IS_ENABLED(CONFIG_TSN)
+	int			(*ndo_tsn_capable)(struct net_device *dev);
+	int			(*ndo_tsn_link_configure)(struct net_device *dev,
+							  enum sr_class class,
+							  u16 framesize,
+							  u16 vid, u8 add_link,
+							  u8 pcp_hi, u8 pcp_lo);
+#endif	/* CONFIG_TSN */
+
 	int			(*ndo_set_vf_mac)(struct net_device *dev,
 						  int queue, u8 *mac);
 	int			(*ndo_set_vf_vlan)(struct net_device *dev,
diff --git a/net/Kconfig b/net/Kconfig
index 7b6cd34..19b8f9a 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -215,6 +215,7 @@ source "net/802/Kconfig"
 source "net/bridge/Kconfig"
 source "net/dsa/Kconfig"
 source "net/8021q/Kconfig"
+source "net/tsn/Kconfig"
 source "net/decnet/Kconfig"
 source "net/llc/Kconfig"
 source "net/ipx/Kconfig"
diff --git a/net/tsn/Kconfig b/net/tsn/Kconfig
new file mode 100644
index 0000000..1fc3c1d
--- /dev/null
+++ b/net/tsn/Kconfig
@@ -0,0 +1,32 @@
+#
+# Configuration for 802.1 Time Sensitive Networking (TSN)
+#
+
+config TSN
+	tristate "802.1 TSN Support"
+	depends on VLAN_8021Q && PTP_1588_CLOCK && CONFIGFS_FS
+	---help---
+	  Select this if you want to enable TSN on capable interfaces.
+
+	  TSN allows you to set up deterministic links on your LAN (only
+	  L2 is currently supported). Once loaded, the driver will probe
+	  all available interfaces if they are capable of supporting TSN
+	  links.
+
+	  Once loaded, a directory in configfs called tsn/ will expose
+	  the capable NICs and allow userspace to create
+	  links. Userspace must provide us with a StreamID as well as
+	  reserving bandwidth through the network and once this is done,
+	  a new link can be created by issuing a mkdir() in configfs and
+	  updating the attributes for the new link.
+
+	  TSN itself does not produce nor consume data, it is dependent
+	  upon 'shims' doing this, which can be virtually anything. ALSA
+	  is a good candidate.
+
+	  For more information, refer to the TSN-documentation in the
+	  kernel documentation repository.
+
+	  The resulting module will be called 'tsn'
+
+	  If unsure, say N.
-- 
2.7.4

^ permalink raw reply related

* [TSN RFC v2 1/9] igb: add missing fields to TXDCTL-register
From: henrik @ 2016-12-16 17:59 UTC (permalink / raw)
  To: linux-kernel
  Cc: Richard Cochran, henrik, linux-media, alsa-devel, netdev,
	Jeff Kirsher, intel-wired-lan, Henrik Austad
In-Reply-To: <1481911153-549-1-git-send-email-henrik@austad.us>

From: Henrik Austad <henrik@austad.us>

The current list of E1000_TXDCTL-registers is incomplete. This adds the
missing parts for the Transmit Descriptor Control (TXDCTL) register.

The rest of these values (threshold for descriptor read/write) for
TXDCTL seems to be defined in igb/igb.h, not sure why this is split
though.

It seems that this was left out in the commit that added support for
82575 Gigabit Ethernet driver 9d5c8243 (igb: PCI-Express 82575 Gigabit
Ethernet driver).

Cc: linux-kernel@vger.kernel.org
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: intel-wired-lan@lists.osuosl.org
Signed-off-by: Henrik Austad <haustad@cisco.com>
---
 drivers/net/ethernet/intel/igb/e1000_82575.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/intel/igb/e1000_82575.h b/drivers/net/ethernet/intel/igb/e1000_82575.h
index acf0605..7faa482 100644
--- a/drivers/net/ethernet/intel/igb/e1000_82575.h
+++ b/drivers/net/ethernet/intel/igb/e1000_82575.h
@@ -158,7 +158,11 @@ struct e1000_adv_tx_context_desc {
 
 /* Additional Transmit Descriptor Control definitions */
 #define E1000_TXDCTL_QUEUE_ENABLE  0x02000000 /* Enable specific Tx Queue */
+
+/* Transmit Software Flush, sw-triggered desc writeback */
+#define E1000_TXDCTL_SWFLSH        0x04000000
 /* Tx Queue Arbitration Priority 0=low, 1=high */
+#define E1000_TXDCTL_PRIORITY      0x08000000
 
 /* Additional Receive Descriptor Control definitions */
 #define E1000_RXDCTL_QUEUE_ENABLE  0x02000000 /* Enable specific Rx Queue */
-- 
2.7.4

^ permalink raw reply related

* [TSN RFC v2 0/9] TSN driver for the kernel
From: henrik @ 2016-12-16 17:59 UTC (permalink / raw)
  To: linux-kernel
  Cc: Richard Cochran, henrik, Henrik Austad, linux-media, alsa-devel,
	netdev

From: Henrik Austad <haustad@cisco.com>


The driver is directed via ConfigFS as we need userspace to handle
stream-reservation (MSRP), discovery and enumeration (IEEE 1722.1) and
whatever other management is needed. This also includes running an
appropriate PTP daemon (TSN favors gPTP).

Once we have all the required attributes, we can create link using
mkdir, and use write() to set the attributes. Once ready, specify the
'shim' (basically a thin wrapper between TSN and another subsystem) and
we start pushing out frames.

The network part: it ties directly into the Rx-handler for receive and
writes skb's using dev_queue_xmit(). This could probably be improved.

2 new fields in netdev_ops have been introduced, and the Intel
igb-driver has been updated (as this an AVB-capable NIC which is
available as a PCI-e card).

What remains (tl;dr: a lot) a.k.a "Known problems" or "working on it!"
- tie to (g)PTP properly, currently using ktime_get() for presentation
  time
- get time from shim into TSN
- let shim create/manage buffer
- redo parts of the link-stuff using RCUs, the current setup is a bit
  clumsy.
- The igb-driver does not work properly when compiled with IGB_TSN, some
  details in setting the register values needs to be figured out. I am
  working on this, but as it stands, the best bet is to load tsn using
  in_debug=1 to bypass the capability-check. I have had e1000 and sky2
  running for several days without crashing, igb crashes and burns
  violently.
- The ALSA driver does not handle multiple devices very well and is a
  work in progress.

* v2: changes since v1

Changes since v1
- updated to latest upstream kernel (v4.8)
- set dedicated enabled-attribute and let shim be stored in own (support
  future plan for enabling per-shim attributes)
- fixed endianess issue in bitfields used in tsn-structs
- Updated some of the trace-events to use trace_class
- Fix various silly typos
- Handle disabling of link from hrtimer a bit more gracefully (that
  actually works-ish).
- use old skb and size of skb when that is set (Reporte by Nikita)
- Move PCP-codes to NIC and not in the link itself
- Allow TSN-capable card to be loaded even when in debug-mode (and do
  not enforce TSN behaviour)
- Start hooking into ALSA's get_time_info hooks (very much incomplete)
- use threads for sending frames, wake from hrtimer-callback.
  This also queues up awaiting timers if we fail to complete the
  transmit before another timer arrives, it will immediately execute
  another iteration, so no events should be lost. That being said,
  should this happen, it is a clear bug as we really should complete
  well before the next interval.
- Cleanup link-locking and differentiate between Talker and Listener (as
  Listener grab link-lock from IRQ context)
- Change list-lock to spinlock as we may need to take a link-lock whilst
  holding the master list-lock.
- Do a more careful dance holding the spinlocks to regions only doing
  actual update.

Network driver (I210 only)
- bring up all Tx-/Rx-queues when igb is in TSN-mode regardless of how
  many CPUs the system has for I210
- Correctly calculate the idle_slope in I210's configure hook
- Update igb-driver with queue-select and return correct queue when
  sending TSN-frames
- add IGB_FLAG_QAV_PRIO flag to igb_adapter (to handle proper config of
  tx-ring when adapter is brought up.
- add TXDCTL logic (part of preparatory work for TSN) to igb-driver
- Improve SR(A|B) accountingo

ALSA Shim
- Allow userspace to grab much smaller chunks of data (down to a single
  Class A frame for S16_LE 2ch 48kHz).
- Create the card with index/id pattern to avoid collision with other
  cards.
* v1

Before reading on - this is not even beta, but I'd really appreciate if
people would comment on the overall architecture and perhaps provide
some pointers to where I should improve/fix/update
- thanks!

This is a very early RFC for a TSN-driver in the kernel. It has been
floating around in my repo for a while and I would appreciate some
feedback on the overall design to avoid doing some major blunders.

There are at least one AVB-driver (the AV-part of TSN) in the kernel
already. This driver aims to solve a wider scope as TSN can do much more
than just audio. A very basic ALSA-driver is added to the end that
allows you to play music between 2 machines using aplay in one end and
arecord | aplay on the other (some fiddling required) We have plans for
doing the same for v4l2 eventually (but there are other fishes to fry
first). The same goes for a TSN_SOCK type approach as well.

Henrik Austad (9):
  igb: add missing fields to TXDCTL-register
  TSN: add documentation
  TSN: Add the standard formerly known as AVB to the kernel
  Adding TSN-driver to Intel I210 controller
  Add TSN header for the driver
  Add TSN machinery to drive the traffic from a shim over the network
  Add TSN event-tracing
  AVB ALSA - Add ALSA shim for TSN
  MAINTAINERS: add TSN/AVB-entries

 Documentation/TSN/tsn.txt                    |  345 ++++++++
 MAINTAINERS                                  |   14 +
 drivers/media/Kconfig                        |   15 +
 drivers/media/Makefile                       |    2 +-
 drivers/media/avb/Makefile                   |    5 +
 drivers/media/avb/avb_alsa.c                 |  793 +++++++++++++++++
 drivers/media/avb/tsn_iec61883.h             |  152 ++++
 drivers/net/ethernet/intel/Kconfig           |   18 +
 drivers/net/ethernet/intel/igb/Makefile      |    2 +-
 drivers/net/ethernet/intel/igb/e1000_82575.h |    4 +
 drivers/net/ethernet/intel/igb/igb.h         |   26 +
 drivers/net/ethernet/intel/igb/igb_main.c    |   39 +-
 drivers/net/ethernet/intel/igb/igb_tsn.c     |  468 ++++++++++
 include/linux/netdevice.h                    |   44 +
 include/linux/tsn.h                          |  952 +++++++++++++++++++++
 include/trace/events/tsn.h                   |  333 ++++++++
 net/Kconfig                                  |    1 +
 net/Makefile                                 |    1 +
 net/tsn/Kconfig                              |   32 +
 net/tsn/Makefile                             |    6 +
 net/tsn/tsn_configfs.c                       |  673 +++++++++++++++
 net/tsn/tsn_core.c                           | 1189 ++++++++++++++++++++++++++
 net/tsn/tsn_header.c                         |  162 ++++
 net/tsn/tsn_internal.h                       |  397 +++++++++
 net/tsn/tsn_net.c                            |  392 +++++++++
 25 files changed, 6061 insertions(+), 4 deletions(-)
 create mode 100644 Documentation/TSN/tsn.txt
 create mode 100644 drivers/media/avb/Makefile
 create mode 100644 drivers/media/avb/avb_alsa.c
 create mode 100644 drivers/media/avb/tsn_iec61883.h
 create mode 100644 drivers/net/ethernet/intel/igb/igb_tsn.c
 create mode 100644 include/linux/tsn.h
 create mode 100644 include/trace/events/tsn.h
 create mode 100644 net/tsn/Kconfig
 create mode 100644 net/tsn/Makefile
 create mode 100644 net/tsn/tsn_configfs.c
 create mode 100644 net/tsn/tsn_core.c
 create mode 100644 net/tsn/tsn_header.c
 create mode 100644 net/tsn/tsn_internal.h
 create mode 100644 net/tsn/tsn_net.c

--
2.7.4

^ permalink raw reply

* Re: [PATCH v5 1/4] siphash: add cryptographically secure PRF
From: George Spelvin @ 2016-12-16 17:36 UTC (permalink / raw)
  To: Jason, jeanphilippe.aumasson
  Cc: ak, davem, David.Laight, djb, ebiggers3, hannes, kernel-hardening,
	linux-crypto, linux-kernel, linux, luto, netdev, tom, torvalds,
	tytso, vegard.nossum
In-Reply-To: <CAHmME9rxCYfwyF6EADWqpAEt+yqCPgCLUVH0FPdAy7r-oPnrRg@mail.gmail.com>

> It appears that hsiphash can produce either 32-bit output or 64-bit
> output, with the output length parameter as part of the hash algorithm
> in there. When I code this for my kernel patchset, I very likely will
> only implement one output length size. Right now I'm leaning toward
> 32-bit.

A 128-bit output option was added to SipHash after the initial publication;
this is just the equivalent in 32-bit.

> - Is this a reasonable choice?

Yes.

> - Are there reasons why hsiphash with 64-bit output would be
>   reasonable? Or will we be fine sticking with 32-bit output only?

Personally, I'd put in a comment saying that "there's a 64-bit output
variant that's not implemented" and punt until someone find a need.

> With both hsiphash and siphash, the division of usage will probably become:
> - Use 64-bit output 128-bit key siphash for keyed RNG-like things,
>   such as syncookies and sequence numbers
> - Use 64-bit output 128-bit key siphash for hashtables that must
>   absolutely be secure to an extremely high bandwidth attacker, such as
>   userspace directly DoSing a kernel hashtable
> - Use 32-bit output 64-bit key hsiphash for quick hashtable functions
>   that still must be secure but do not require as large of a security
>   margin.

On a 64-bit machine, 64-bit SipHash is *always* faster than 32-bit, and
should be used always.  Don't even compile the 32-bit code, to prevent
anyone accidentally using it, and make hsiphash an alias for siphash.

On a 32-bit machine, it's a much trickier case.  I'd be tempted to
use the 32-bit code always, but it needs examination.

Fortunately, the cost of brute-forcing hash functions can be fairly
exactly quantified, thanks to bitcoin miners.  It currently takes 2^70
hashes to create one bitcoin block, worth 25 bitcoins ($19,500).  Thus,
2^63 hashes cost $152.

Now, there are two factors that must be considered:
- That's a very very "wholesale" rate.  That's assuming you're doing
  large numbers of these and can put in the up-front effort designing
  silicon ASICs to do the attack.
- That's for a more difficult hash (double sha-256) than SipHash.
  That's a constant fator, but a pretty significant one.  If the wholesale
  assumption holds, that might bring the cost down another 6 or 7 bits,
  to $1-2 per break.

If you're not the NSA and limited to general-purpose silicon, let's
assume a state of the art GPU (Radeon HD 7970; AMD GPUs seem do to better
than nVidia).  The bitcoin mining rate for those is about 700M/second,
29.4 bits.  So 63 bits is 152502 GPU-days, divided by some factor
to account for SipHash's high speed compared to two rounds of SHA-2.
Call it 1000 GPU-days.

It's very doable, but also very non-trivial.  The question is, wouldn't
it be cheaper and easier just to do a brute-force flooding DDoS?

(This is why I wish the key size could be tweaked up to 80 bits.
That would take all these numbers out of the reasonable range.)

Let me consider your second example above, "secure against local users".
I should dig through your patchset and find the details, but what exactly
are the consequences of such an attack?  Hasn't a local user already
got much better ways to DoS the system?

The thing to remember is that we're worried only about the combination
of a *new* Linux kernel (new build or under active maintenance) and a
32-bit host.  You'd be hard-pressed to find a *single* machine fitting
that description which is hosting multiple users or VMs and is not 64-bit.

These days, 32-bit CPUs are for embedded applications: network appliances,
TVs, etc.  That means basically single-user.  Even phones are 64 bit.
Is this really a threat that needs to be defended against?

For your first case, network applications, the additional security
is definitely attractive.  Syncookies are only a DoS, but sequence
numbers are a real security issue; they can let you inject data into a
TCP connection.

Hash tables are much harder to attack.  The information you get back from
timing probes is statistical, and thus testing a key is more expensive.
With sequence numbers, large amounts (32 bits) the hash output is
directly observable.

I wish we could get away with 64-bit security, but given that the
modern internet involves attacks from NSA/Spetssvyaz/3PLA, I agree
it's just not enough.

^ permalink raw reply

* Re: [PULL] virtio, vhost: new device, fixes, speedups
From: Linus Torvalds @ 2016-12-16 17:32 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: arend.vanspriel, KVM list, Network Development,
	Linux Kernel Mailing List, virtualization
In-Reply-To: <20161216184123-mutt-send-email-mst@kernel.org>

On Fri, Dec 16, 2016 at 9:09 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
>
> Oh, that's because I set orderfile globally rather than
> just for the qemu project which wants it.
> Fixed, sorry about that.

That explains it. I should have remembered, I think this came up once
before with somebody else.

Yeah, for the kernel it makes things much easier (at least for me) to
have everything just the default alphabetical ordering, particularly
because we use directory structure for maintenance areas.

So ordering the diffs by type ends up breaking my mental model for "is
this pull request touching the right files", which is why I reacted.

             Linus

^ permalink raw reply

* "rfkill: Add rfkill-any LED trigger" causes deadlock
From: Mike Krinkin @ 2016-12-16 17:09 UTC (permalink / raw)
  To: kernel-ePNcKBjznIDVItvQsEIGlw,
	johannes.berg-ral2JQCrhuEAvxtiuMwx3w,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q, netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <CACpa5=_chGuU3zVcVR3nkNySwz1cGAQTs39v+BjwcDNSvOqhTw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On Fri, Dec 16, 2016 at 07:46:06PM +0300, Михаил Кринкин wrote:
> Hi,
> 
> with recent i can't load my thinkpad with recent linux-next, bisect points
> at the commit 73f4f76a196d7adb ("rfkill: Add rfkill-any LED trigger").
> 
> Problem occurs because thinkapd_acpi rfkill set_block handler
> tpacpi_rfk_hook_set_block calls rfkill_set_sw_state, which in turn calls
> new rfkill_any_led_trigger_event. So when rfkill_set_block called from
> rfkill_register we have deadlock.
> 
> I added WARN to __rfkill_any_led_trigger_event to see how deadlock occurs,
> here is backtrace:
> 
> [    6.090079] WARNING: CPU: 2 PID: 307 at net/rfkill/core.c:184
> rfkill_any_led_trigger_event+0x2d/0x40
> [    6.090080] reached __rfkill_any_led_trigger_event
> [    6.090081] Modules linked in:
> [    6.090082]  snd_pcm sg thinkpad_acpi(+) mei_me nvram snd_seq mei
> idma64 virt_dma intel_pch_thermal ucsi intel_lpss_pci snd_seq_device
> hci_uart snd_timer snd btbcm soundcore btqca led_class btintel
> bluetooth i8042 rtc_cmos serio evdev intel_lpss_acpi intel_lpss
> acpi_pad tpm_infineon mfd_core kvm_intel kvm irqbypass ipv6 autofs4
> ext4 jbd2 mbcache sd_mod i915 i2c_algo_bit drm_kms_helper syscopyarea
> sysfillrect sysimgblt fb_sys_fops drm ahci libahci video
> pinctrl_sunrisepoint pinctrl_intel
> [    6.090112] CPU: 2 PID: 307 Comm: systemd-udevd Tainted: G        W
>       4.9.0-01741-g73f4f76-dirty #106
> [    6.090113] Hardware name: LENOVO 20GJ004ERT/20GJ004ERT, BIOS
> R0CET21W (1.09 ) 03/07/2016
> [    6.090114]  ffffb199c023f998 ffffffffb4334b6b ffffb199c023f9e8
> 0000000000000000
> [    6.090117]  ffffb199c023f9d8 ffffffffb40658ab 000000b84dd75d28
> 0000000080000006
> [    6.090119]  ffff99ea4dd75d28 0000000000000001 0000000000000001
> 0000000000000000
> [    6.090122] Call Trace:
> [    6.090126]  [<ffffffffb4334b6b>] dump_stack+0x4d/0x72
> [    6.090128]  [<ffffffffb40658ab>] __warn+0xcb/0xf0
> [    6.090130]  [<ffffffffb406592f>] warn_slowpath_fmt+0x5f/0x80
> [    6.090132]  [<ffffffffb4631bfd>] rfkill_any_led_trigger_event+0x2d/0x40
> [    6.090135]  [<ffffffffb46322d9>] rfkill_set_sw_state+0x89/0xd0
> [    6.090142]  [<ffffffffc06c9921>]
> tpacpi_rfk_update_swstate+0x31/0x50 [thinkpad_acpi]
> [    6.090148]  [<ffffffffc06c9972>]
> tpacpi_rfk_hook_set_block+0x32/0x70 [thinkpad_acpi]
> [    6.090150]  [<ffffffffb46327f0>] rfkill_set_block+0x90/0x160
> [    6.090152]  [<ffffffffb4632930>] __rfkill_switch_all+0x70/0xa0
> [    6.090154]  [<ffffffffb4633028>] rfkill_register+0x278/0x2b0
> [    6.090159]  [<ffffffffc06e1f0e>] tpacpi_new_rfkill+0xef/0x15d
> [thinkpad_acpi]
> [    6.090164]  [<ffffffffc06e2407>] bluetooth_init+0x15e/0x1a9 [thinkpad_acpi]
> [    6.090169]  [<ffffffffc06e2fb5>]
> thinkpad_acpi_module_init.part.47+0x5e7/0x996 [thinkpad_acpi]
> [    6.090174]  [<ffffffffc06e3364>] ?
> thinkpad_acpi_module_init.part.47+0x996/0x996 [thinkpad_acpi]
> [    6.090179]  [<ffffffffc06e36af>]
> thinkpad_acpi_module_init+0x34b/0xc9c [thinkpad_acpi]
> [    6.090181]  [<ffffffffb4000450>] do_one_initcall+0x50/0x180
> [    6.090184]  [<ffffffffb4177e12>] ? do_init_module+0x27/0x1ee
> [    6.090186]  [<ffffffffb41cc4b6>] ? kmem_cache_alloc_trace+0x156/0x1a0
> [    6.090188]  [<ffffffffb4177e4a>] do_init_module+0x5f/0x1ee
> [    6.090191]  [<ffffffffb40ed562>] load_module+0x22c2/0x28d0
> [    6.090192]  [<ffffffffb40ea0f0>] ? __symbol_put+0x60/0x60
> [    6.090195]  [<ffffffffb42ee15d>] ? ima_post_read_file+0x7d/0xa0
> [    6.090198]  [<ffffffffb42a809b>] ? security_kernel_post_read_file+0x6b/0x80
> [    6.090200]  [<ffffffffb40eddcf>] SYSC_finit_module+0xdf/0x110
> [    6.090202]  [<ffffffffb40ede1e>] SyS_finit_module+0xe/0x10
> [    6.090204]  [<ffffffffb463d524>] entry_SYSCALL_64_fastpath+0x17/0x98
> [    6.090206] ---[ end trace ea6da61c1ec208d1 ]---
> [    6.090207] rfkill_any_led_trigger unlocked
> [    6.091664] ------------[ cut here ]------------

^ permalink raw reply

* Re: [PATCH v5 1/4] siphash: add cryptographically secure PRF
From: Jason A. Donenfeld @ 2016-12-16 17:09 UTC (permalink / raw)
  To: David Laight
  Cc: George Spelvin, ak@linux.intel.com, davem@davemloft.net,
	ebiggers3@gmail.com, hannes@stressinduktion.org,
	jeanphilippe.aumasson@gmail.com,
	kernel-hardening@lists.openwall.com, linux-crypto@vger.kernel.org,
	linux-kernel@vger.kernel.org, luto@amacapital.net,
	netdev@vger.kernel.org, tom@herbertland.com,
	torvalds@linux-foundation.org, tytso@mit.edu,
	vegard.nossum@gmail.com, djb@cr.yp.to
In-Reply-To: <063D6719AE5E284EB5DD2968C1650D6DB0241238@AcuExch.aculab.com>

Hi David,

On Fri, Dec 16, 2016 at 6:06 PM, David Laight <David.Laight@aculab.com> wrote:
> A 32bit hash would also remove all the issues about the alignment
> of IP addresses (etc) on 64bit systems.

The current replacements of md5_transform with siphash in the v6 patch
series will continue to use the original siphash, since the 128-bit
key is rather important for these kinds of secrets. Additionally,
64-bit siphash is already faster than the md5_transform that it
replaces. So the alignment concerns (now, non-issues; problems have
been solved, I believe) still remain.

Jason

^ permalink raw reply

* Re: [PULL] virtio, vhost: new device, fixes, speedups
From: Michael S. Tsirkin @ 2016-12-16 17:09 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: arend.vanspriel, KVM list, Network Development,
	Linux Kernel Mailing List, virtualization
In-Reply-To: <CA+55aFxwG=jj6=2fKjNkHCOnbKr-mNvNucbA=CSdPvb4kzW3FA@mail.gmail.com>

On Thu, Dec 15, 2016 at 06:20:40PM -0800, Linus Torvalds wrote:
> On Thu, Dec 15, 2016 at 3:05 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> >   git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git tags/for_linus
> 
> Pulled, but I wonder...
> 
> >  Documentation/translations/zh_CN/sparse.txt        |   7 +-
> >  arch/arm/plat-samsung/include/plat/gpio-cfg.h      |   2 +-
> >  drivers/crypto/virtio/virtio_crypto_common.h       | 128 +++++
> [...]
> 
> what are you generating these diffstats with? Because they are pretty bogus..
> 
> The end result is correct:
> 
> >  86 files changed, 2106 insertions(+), 280 deletions(-)
> 
> but the file order in the diffstat is completely random, which makes
> it very hard to compare with what I get. It also makes it hard to see
> what you changed, because it's not alphabetical like it should be
> (strictly speaking the git pathname ordering isnt' really
> alphabetical, since the '/' sorts as the NUL character, but close
> enough).
> 
> I can't see the logic to the re-ordering of the lines, so I'm
> intrigued how you even generated it.
> 
>             Linus


Oh, that's because I set orderfile globally rather than
just for the qemu project which wants it.
Fixed, sorry about that.

-- 
MST

^ permalink raw reply

* RE: [PATCH v5 1/4] siphash: add cryptographically secure PRF
From: David Laight @ 2016-12-16 17:06 UTC (permalink / raw)
  To: 'George Spelvin', ak@linux.intel.com, davem@davemloft.net,
	ebiggers3@gmail.com, hannes@stressinduktion.org, Jason@zx2c4.com,
	jeanphilippe.aumasson@gmail.com,
	kernel-hardening@lists.openwall.com, linux-crypto@vger.kernel.org,
	linux-kernel@vger.kernel.org, luto@amacapital.net,
	netdev@vger.kernel.org, tom@herbertland.com,
	torvalds@linux-foundation.org, "tytso@mit.edu" <tytso
  Cc: djb@cr.yp.to
In-Reply-To: <20161215232840.22459.qmail@ns.sciencehorizons.net>

From: George Spelvin
> Sent: 15 December 2016 23:29
> > If a halved version of SipHash can bring significant performance boost
> > (with 32b words instead of 64b words) with an acceptable security level
> > (64-bit enough?) then we may design such a version.
> 
> I was thinking if the key could be pushed to 80 bits, that would be nice,
> but honestly 64 bits is fine.  This is DoS protection, and while it's
> possible to brute-force a 64 bit secret, there are more effective (DDoS)
> attacks possible for the same cost.

A 32bit hash would also remove all the issues about the alignment
of IP addresses (etc) on 64bit systems.

> (I'd suggest a name of "HalfSipHash" to convey the reduced security
> effectively.)
> 
> > Regarding output size, are 64 bits sufficient?
> 
> As a replacement for jhash, 32 bits are sufficient.  It's for
> indexing an in-memory hash table on a 32-bit machine.

It is also worth remembering that if the intent is to generate
a hash table index (not a unique fingerprint) you will always
get collisions on the final value.
Randomness could still give overlong hash chains - which might
still need rehashing with a different key.

	David

^ permalink raw reply

* RE: [upstream-release] [PATCH net 2/4] fsl/fman: arm: call of_platform_populate() for arm64 platfrom
From: Madalin-Cristian Bucur @ 2016-12-16 17:01 UTC (permalink / raw)
  To: Scott Wood, netdev@vger.kernel.org, mpe@ellerman.id.au
  Cc: linuxppc-dev@lists.ozlabs.org, linux-kernel@vger.kernel.org,
	davem@davemloft.net
In-Reply-To: <DB5PR0401MB1928F163614C64900D1458ED919D0@DB5PR0401MB1928.eurprd04.prod.outlook.com>

> From: Scott Wood
> Sent: Thursday, December 15, 2016 8:42 PM
> 
> On 12/15/2016 07:11 AM, Madalin Bucur wrote:
> > From: Igal Liberman <igal.liberman@freescale.com>
> >
> > Signed-off-by: Igal Liberman <igal.liberman@freescale.com>
> > ---
> >  drivers/net/ethernet/freescale/fman/fman.c | 10 ++++++++++
> >  1 file changed, 10 insertions(+)
> >
> > diff --git a/drivers/net/ethernet/freescale/fman/fman.c
> b/drivers/net/ethernet/freescale/fman/fman.c
> > index dafd9e1..f36b4eb 100644
> > --- a/drivers/net/ethernet/freescale/fman/fman.c
> > +++ b/drivers/net/ethernet/freescale/fman/fman.c
> > @@ -2868,6 +2868,16 @@ static struct fman *read_dts_node(struct
> platform_device *of_dev)
> >
> >  	fman->dev = &of_dev->dev;
> >
> > +#ifdef CONFIG_ARM64
> > +	/* call of_platform_populate in order to probe sub-nodes on arm64 */
> > +	err = of_platform_populate(fm_node, NULL, NULL, &of_dev->dev);
> > +	if (err) {
> > +		dev_err(&of_dev->dev, "%s: of_platform_populate() failed\n",
> > +			__func__);
> > +		goto fman_free;
> > +	}
> > +#endif
> 
> Should we remove fsl,fman from the PPC of_device_ids[], so this doesn't
> need an ifdef?
> 
> Why is it #ifdef CONFIG_ARM64 rather than #ifndef CONFIG_PPC?
> 
> -Scott

Igal was working on adding ARM64 support when this patch was created, thus the
choice of #ifdef CONFIG_ARM64. Unifying this for PPC and ARM64 by always calling
of_platform_populate() sounds like the best approach. I would need to synchronize
the introduction of this code with the removal of the fsl,fman entry from the
of_device_ids[] array.

Dave, Michael, Scott, is it ok to add to v2 of this patch set the patch that removes
the compatible "fsl,fman" from arch/powerpc/platforms/85xx/corenet_generic.c?

Thanks,
Madalin

^ permalink raw reply

* "rfkill: Add rfkill-any LED trigger" causes deadlock
From: Михаил Кринкин @ 2016-12-16 16:46 UTC (permalink / raw)
  To: kernel, johannes.berg, linux-wireless, davem, netdev
In-Reply-To: <20161216163707.GA2629@gmail.com>

Hi,

with recent i can't load my thinkpad with recent linux-next, bisect points
at the commit 73f4f76a196d7adb ("rfkill: Add rfkill-any LED trigger").

Problem occurs because thinkapd_acpi rfkill set_block handler
tpacpi_rfk_hook_set_block calls rfkill_set_sw_state, which in turn calls
new rfkill_any_led_trigger_event. So when rfkill_set_block called from
rfkill_register we have deadlock.

I added WARN to __rfkill_any_led_trigger_event to see how deadlock occurs,
here is backtrace:

[    6.090079] WARNING: CPU: 2 PID: 307 at net/rfkill/core.c:184
rfkill_any_led_trigger_event+0x2d/0x40
[    6.090080] reached __rfkill_any_led_trigger_event
[    6.090081] Modules linked in:
[    6.090082]  snd_pcm sg thinkpad_acpi(+) mei_me nvram snd_seq mei
idma64 virt_dma intel_pch_thermal ucsi intel_lpss_pci snd_seq_device
hci_uart snd_timer snd btbcm soundcore btqca led_class btintel
bluetooth i8042 rtc_cmos serio evdev intel_lpss_acpi intel_lpss
acpi_pad tpm_infineon mfd_core kvm_intel kvm irqbypass ipv6 autofs4
ext4 jbd2 mbcache sd_mod i915 i2c_algo_bit drm_kms_helper syscopyarea
sysfillrect sysimgblt fb_sys_fops drm ahci libahci video
pinctrl_sunrisepoint pinctrl_intel
[    6.090112] CPU: 2 PID: 307 Comm: systemd-udevd Tainted: G        W
      4.9.0-01741-g73f4f76-dirty #106
[    6.090113] Hardware name: LENOVO 20GJ004ERT/20GJ004ERT, BIOS
R0CET21W (1.09 ) 03/07/2016
[    6.090114]  ffffb199c023f998 ffffffffb4334b6b ffffb199c023f9e8
0000000000000000
[    6.090117]  ffffb199c023f9d8 ffffffffb40658ab 000000b84dd75d28
0000000080000006
[    6.090119]  ffff99ea4dd75d28 0000000000000001 0000000000000001
0000000000000000
[    6.090122] Call Trace:
[    6.090126]  [<ffffffffb4334b6b>] dump_stack+0x4d/0x72
[    6.090128]  [<ffffffffb40658ab>] __warn+0xcb/0xf0
[    6.090130]  [<ffffffffb406592f>] warn_slowpath_fmt+0x5f/0x80
[    6.090132]  [<ffffffffb4631bfd>] rfkill_any_led_trigger_event+0x2d/0x40
[    6.090135]  [<ffffffffb46322d9>] rfkill_set_sw_state+0x89/0xd0
[    6.090142]  [<ffffffffc06c9921>]
tpacpi_rfk_update_swstate+0x31/0x50 [thinkpad_acpi]
[    6.090148]  [<ffffffffc06c9972>]
tpacpi_rfk_hook_set_block+0x32/0x70 [thinkpad_acpi]
[    6.090150]  [<ffffffffb46327f0>] rfkill_set_block+0x90/0x160
[    6.090152]  [<ffffffffb4632930>] __rfkill_switch_all+0x70/0xa0
[    6.090154]  [<ffffffffb4633028>] rfkill_register+0x278/0x2b0
[    6.090159]  [<ffffffffc06e1f0e>] tpacpi_new_rfkill+0xef/0x15d
[thinkpad_acpi]
[    6.090164]  [<ffffffffc06e2407>] bluetooth_init+0x15e/0x1a9 [thinkpad_acpi]
[    6.090169]  [<ffffffffc06e2fb5>]
thinkpad_acpi_module_init.part.47+0x5e7/0x996 [thinkpad_acpi]
[    6.090174]  [<ffffffffc06e3364>] ?
thinkpad_acpi_module_init.part.47+0x996/0x996 [thinkpad_acpi]
[    6.090179]  [<ffffffffc06e36af>]
thinkpad_acpi_module_init+0x34b/0xc9c [thinkpad_acpi]
[    6.090181]  [<ffffffffb4000450>] do_one_initcall+0x50/0x180
[    6.090184]  [<ffffffffb4177e12>] ? do_init_module+0x27/0x1ee
[    6.090186]  [<ffffffffb41cc4b6>] ? kmem_cache_alloc_trace+0x156/0x1a0
[    6.090188]  [<ffffffffb4177e4a>] do_init_module+0x5f/0x1ee
[    6.090191]  [<ffffffffb40ed562>] load_module+0x22c2/0x28d0
[    6.090192]  [<ffffffffb40ea0f0>] ? __symbol_put+0x60/0x60
[    6.090195]  [<ffffffffb42ee15d>] ? ima_post_read_file+0x7d/0xa0
[    6.090198]  [<ffffffffb42a809b>] ? security_kernel_post_read_file+0x6b/0x80
[    6.090200]  [<ffffffffb40eddcf>] SYSC_finit_module+0xdf/0x110
[    6.090202]  [<ffffffffb40ede1e>] SyS_finit_module+0xe/0x10
[    6.090204]  [<ffffffffb463d524>] entry_SYSCALL_64_fastpath+0x17/0x98
[    6.090206] ---[ end trace ea6da61c1ec208d1 ]---
[    6.090207] rfkill_any_led_trigger unlocked
[    6.091664] ------------[ cut here ]------------

^ permalink raw reply

* Re: [PATCH] ip: vfinfo: remove code duplication for IFLA_VF_RSS_QUERY_EN
From: Phil Sutter @ 2016-12-16 16:43 UTC (permalink / raw)
  To: Julien Fortin; +Cc: stephen, netdev
In-Reply-To: <20161216163605.19728-1-julien@cumulusnetworks.com>

On Fri, Dec 16, 2016 at 05:36:05PM +0100, Julien Fortin wrote:
> From: Julien Fortin <julien@cumulusnetworks.com>
> 
> Fixes: 4fb4a10e120b1 ("ipaddress: Print IFLA_VF_QUERY_RSS_EN setting”)
> 
> Signed-off-by: Julien Fortin <julien@cumulusnetworks.com>

Oh! Looks like Git messed up during a rebase and the obviously wrong
result was overlooked.

Acked-by: Phil Sutter <phil@nwl.cc>

^ permalink raw reply

* [PATCH] ip: vfinfo: remove code duplication for IFLA_VF_RSS_QUERY_EN
From: Julien Fortin @ 2016-12-16 16:36 UTC (permalink / raw)
  To: stephen, netdev; +Cc: phil, Julien Fortin

From: Julien Fortin <julien@cumulusnetworks.com>

Fixes: 4fb4a10e120b1 ("ipaddress: Print IFLA_VF_QUERY_RSS_EN setting”)

Signed-off-by: Julien Fortin <julien@cumulusnetworks.com>
---
 ip/ipaddress.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index de64877..400ebb4 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -414,14 +414,6 @@ static void print_vfinfo(FILE *fp, struct rtattr *vfinfo)
 			fprintf(fp, ", query_rss %s",
 			        rss_query->setting ? "on" : "off");
 	}
-	if (vf[IFLA_VF_RSS_QUERY_EN]) {
-		struct ifla_vf_rss_query_en *rss_query =
-			RTA_DATA(vf[IFLA_VF_RSS_QUERY_EN]);
-
-		if (rss_query->setting != -1)
-			fprintf(fp, ", query_rss %s",
-			        rss_query->setting ? "on" : "off");
-	}
 	if (vf[IFLA_VF_STATS] && show_stats)
 		print_vf_stats64(fp, vf[IFLA_VF_STATS]);
 }
-- 
2.11.0

^ permalink raw reply related

* [PATCH] cgroup: Fix CGROUP_BPF config
From: Andy Lutomirski @ 2016-12-16 16:33 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Mack; +Cc: Network Development, Andy Lutomirski

CGROUP_BPF depended on SOCK_CGROUP_DATA which can't be manually
enabled, making it rather challenging to turn CGROUP_BPF on.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 init/Kconfig | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/init/Kconfig b/init/Kconfig
index aafafeb0c117..223b734abccd 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1156,7 +1156,8 @@ config CGROUP_PERF
 
 config CGROUP_BPF
 	bool "Support for eBPF programs attached to cgroups"
-	depends on BPF_SYSCALL && SOCK_CGROUP_DATA
+	depends on BPF_SYSCALL
+	select SOCK_CGROUP_DATA
 	help
 	  Allow attaching eBPF programs to a cgroup using the bpf(2)
 	  syscall command BPF_PROG_ATTACH.
-- 
2.9.3

^ permalink raw reply related

* Re: Soft lockup in inet_put_port on 4.6
From: Josef Bacik @ 2016-12-16 15:21 UTC (permalink / raw)
  To: Hannes Frederic Sowa
  Cc: Tom Herbert, Craig Gallek, Eric Dumazet,
	Linux Kernel Network Developers
In-Reply-To: <1481900088.24490.6@smtp.office365.com>

On Fri, Dec 16, 2016 at 9:54 AM, Josef Bacik <jbacik@fb.com> wrote:
> On Thu, Dec 15, 2016 at 7:07 PM, Hannes Frederic Sowa 
> <hannes@stressinduktion.org> wrote:
>> Hi Josef,
>> 
>> On 15.12.2016 19:53, Josef Bacik wrote:
>>>  On Tue, Dec 13, 2016 at 6:32 PM, Tom Herbert <tom@herbertland.com> 
>>> wrote:
>>>>  On Tue, Dec 13, 2016 at 3:03 PM, Craig Gallek 
>>>> <kraigatgoog@gmail.com>
>>>>  wrote:
>>>>>   On Tue, Dec 13, 2016 at 3:51 PM, Tom Herbert 
>>>>> <tom@herbertland.com>
>>>>>  wrote:
>>>>>>   I think there may be some suspicious code in 
>>>>>> inet_csk_get_port. At
>>>>>>   tb_found there is:
>>>>>> 
>>>>>>                   if (((tb->fastreuse > 0 && reuse) ||
>>>>>>                        (tb->fastreuseport > 0 &&
>>>>>>                         !rcu_access_pointer(sk->sk_reuseport_cb) 
>>>>>> &&
>>>>>>                         sk->sk_reuseport && uid_eq(tb->fastuid,
>>>>>>  uid))) &&
>>>>>>                       smallest_size == -1)
>>>>>>                           goto success;
>>>>>>                   if 
>>>>>> (inet_csk(sk)->icsk_af_ops->bind_conflict(sk,
>>>>>>  tb, true)) {
>>>>>>                           if ((reuse ||
>>>>>>                                (tb->fastreuseport > 0 &&
>>>>>>                                 sk->sk_reuseport &&
>>>>>> 
>>>>>>  !rcu_access_pointer(sk->sk_reuseport_cb) &&
>>>>>>                                 uid_eq(tb->fastuid, uid))) &&
>>>>>>                               smallest_size != -1 && --attempts 
>>>>>> >= 0) {
>>>>>>                                   spin_unlock_bh(&head->lock);
>>>>>>                                   goto again;
>>>>>>                           }
>>>>>>                           goto fail_unlock;
>>>>>>                   }
>>>>>> 
>>>>>>   AFAICT there is redundancy in these two conditionals.  The 
>>>>>> same clause
>>>>>>   is being checked in both: (tb->fastreuseport > 0 &&
>>>>>>   !rcu_access_pointer(sk->sk_reuseport_cb) && sk->sk_reuseport &&
>>>>>>   uid_eq(tb->fastuid, uid))) && smallest_size == -1. If this is 
>>>>>> true the
>>>>>>   first conditional should be hit, goto done,  and the second 
>>>>>> will never
>>>>>>   evaluate that part to true-- unless the sk is changed (do we 
>>>>>> need
>>>>>>   READ_ONCE for sk->sk_reuseport_cb?).
>>>>>   That's an interesting point... It looks like this function also
>>>>>   changed in 4.6 from using a single local_bh_disable() at the 
>>>>> beginning
>>>>>   with several spin_lock(&head->lock) to exclusively
>>>>>   spin_lock_bh(&head->lock) at each locking point.  Perhaps the 
>>>>> full bh
>>>>>   disable variant was preventing the timers in your stack trace 
>>>>> from
>>>>>   running interleaved with this function before?
>>>> 
>>>>  Could be, although dropping the lock shouldn't be able to affect 
>>>> the
>>>>  search state. TBH, I'm a little lost in reading function, the
>>>>  SO_REUSEPORT handling is pretty complicated. For instance,
>>>>  rcu_access_pointer(sk->sk_reuseport_cb) is checked three times in 
>>>> that
>>>>  function and also in every call to inet_csk_bind_conflict. I 
>>>> wonder if
>>>>  we can simply this under the assumption that SO_REUSEPORT is only
>>>>  allowed if the port number (snum) is explicitly specified.
>>> 
>>>  Ok first I have data for you Hannes, here's the time distributions
>>>  before during and after the lockup (with all the debugging in 
>>> place the
>>>  box eventually recovers).  I've attached it as a text file since 
>>> it is
>>>  long.
>> 
>> Thanks a lot!
>> 
>>>  Second is I was thinking about why we would spend so much time 
>>> doing the
>>>  ->owners list, and obviously it's because of the massive amount of
>>>  timewait sockets on the owners list.  I wrote the following dumb 
>>> patch
>>>  and tested it and the problem has disappeared completely.  Now I 
>>> don't
>>>  know if this is right at all, but I thought it was weird we weren't
>>>  copying the soreuseport option from the original socket onto the 
>>> twsk.
>>>  Is there are reason we aren't doing this currently?  Does this help
>>>  explain what is happening?  Thanks,
>> 
>> The patch is interesting and a good clue, but I am immediately a bit
>> concerned that we don't copy/tag the socket with the uid also to keep
>> the security properties for SO_REUSEPORT. I have to think a bit more
>> about this.
>> 
>> We have seen hangs during connect. I am afraid this patch wouldn't 
>> help
>> there while also guaranteeing uniqueness.
> 
> 
> Yeah so I looked at the code some more and actually my patch is 
> really bad.  If sk2->sk_reuseport is set we'll look at 
> sk2->sk_reuseport_cb, which is outside of the timewait sock, so 
> that's definitely bad.
> 
> But we should at least be setting it to 0 so that we don't do this 
> normally.  Unfortunately simply setting it to 0 doesn't fix the 
> problem.  So for some reason having ->sk_reuseport set to 1 on a 
> timewait socket makes this problem non-existent, which is strange.
> 
> So back to the drawing board I guess.  I wonder if doing what craig 
> suggested and batching the timewait timer expires so it hurts less 
> would accomplish the same results.  Thanks,

Wait no I lied, we access the sk->sk_reuseport_cb, not sk2's.  This is 
the code

                        if ((!reuse || !sk2->sk_reuse ||
                            sk2->sk_state == TCP_LISTEN) &&
                            (!reuseport || !sk2->sk_reuseport ||
                             rcu_access_pointer(sk->sk_reuseport_cb) ||
                             (sk2->sk_state != TCP_TIME_WAIT &&
                             !uid_eq(uid, sock_i_uid(sk2))))) {

                                if (!sk2->sk_rcv_saddr || 
!sk->sk_rcv_saddr ||
                                    sk2->sk_rcv_saddr == 
sk->sk_rcv_saddr)
                                        break;
                        }

so in my patches case we now have reuseport == 1, sk2->sk_reuseport == 
1.  But now we are using reuseport, so sk->sk_reuseport_cb should be 
non-NULL right?  So really setting the timewait sock's sk_reuseport 
should have no bearing on how this loop plays out right?  Thanks,

Josef

^ permalink raw reply

* Re: [PATCH v5 3/4] secure_seq: use SipHash in place of MD5
From: Jason A. Donenfeld @ 2016-12-16 15:57 UTC (permalink / raw)
  To: David Laight
  Cc: Netdev, kernel-hardening@lists.openwall.com, LKML,
	linux-crypto@vger.kernel.org, Ted Tso, Hannes Frederic Sowa,
	Linus Torvalds, Eric Biggers, Tom Herbert, George Spelvin,
	Vegard Nossum, ak@linux.intel.com, davem@davemloft.net,
	luto@amacapital.net
In-Reply-To: <063D6719AE5E284EB5DD2968C1650D6DB0240E66@AcuExch.aculab.com>

Hi David,

On Fri, Dec 16, 2016 at 10:59 AM, David Laight <David.Laight@aculab.com> wrote:
> You are still putting over-aligned data on stack.
> You only need to align it to the alignment of u64 (not the size of u64).
> If an on-stack item has a stronger alignment requirement than the stack
> the gcc has to generate two stack frames for the function.

Yesterday, folks were saying that sometimes 32-bit platforms need
8-byte alignment for certain 64-bit operations, so I shouldn't fall
back to 4-byte alignment there. But actually, looking at this more
closely, I can just make SIPHASH_ALIGNMENT == __alignof__(u64), which
will take care of all possible concerns, since gcc knows best which
platforms need what alignment. Thanks for making this clear to me with
"the alignment of u64 (not the size of u64)".

> Oh - and wait a bit longer between revisions.

Okay. We can be turtles.

Jason

^ permalink raw reply

* Re: [PATCH v5 1/4] siphash: add cryptographically secure PRF
From: Jason A. Donenfeld @ 2016-12-16 15:51 UTC (permalink / raw)
  To: Jean-Philippe Aumasson
  Cc: George Spelvin, Andi Kleen, David Miller, David Laight,
	Eric Biggers, Hannes Frederic Sowa, kernel-hardening,
	Linux Crypto Mailing List, LKML, Andy Lutomirski, Netdev,
	Tom Herbert, Linus Torvalds, Theodore Ts'o, vegard.nossum,
	Daniel J . Bernstein
In-Reply-To: <CAGiyFddB_HT3H2yhYQ5rprYZ487rJ4iCaH9uPJQD57hiPbn9ng@mail.gmail.com>

Hi JP & George,

My function names:
- SipHash -> siphash
- HalfSipHash -> hsiphash

It appears that hsiphash can produce either 32-bit output or 64-bit
output, with the output length parameter as part of the hash algorithm
in there. When I code this for my kernel patchset, I very likely will
only implement one output length size. Right now I'm leaning toward
32-bit. Questions:

- Is this a reasonable choice?
- When hsiphash is desired due to its faster speed, are there any
circumstances in which producing a 64-bit output would actually be
useful? Namely, are there any hashtables that could benefit from a
64-bit functions?
- Are there reasons why hsiphash with 64-bit output would be
reasonable? Or will we be fine sticking with 32-bit output only?

With both hsiphash and siphash, the division of usage will probably become:
- Use 64-bit output 128-bit key siphash for keyed RNG-like things,
such as syncookies and sequence numbers
- Use 64-bit output 128-bit key siphash for hashtables that must
absolutely be secure to an extremely high bandwidth attacker, such as
userspace directly DoSing a kernel hashtable
- Use 32-bit output 64-bit key hsiphash for quick hashtable functions
that still must be secure but do not require as large of a security
margin

Sound good?

Jason

^ permalink raw reply

* Re: [PATCH net] sfc: skip ptp allocations on secondary interfaces
From: Jarod Wilson @ 2016-12-16 15:48 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-net-drivers, Edward Cree, Bert Kenward, netdev
In-Reply-To: <20161215203104.12714-1-jarod@redhat.com>

On 2016-12-15 3:31 PM, Jarod Wilson wrote:
> Rather than allocating efx_ptp_data, a buffer, a workqueue, etc., and then
> ultimately deciding not to call ptp_clock_register() for non-primary
> interfaces, just exit out of efx_ptp_prober() earlier. Saves a little
> memory and speeds up init and teardown slightly.
>
> CC: linux-net-drivers@solarflare.com
> CC: Edward Cree <ecree@solarflare.com>
> CC: Bert Kenward <bkenward@solarflare.com>
> CC: netdev@vger.kernel.org
> Signed-off-by: Jarod Wilson <jarod@redhat.com>

Self-Nak on this one. Talking to Bert off-list, he pointed out that the 
driver uses the PTP structure allocated here for the secondary interface 
to do internal time-stamping and the like. While it might be ideal not 
to intermingle PTP structs with non-PTP work, disabling this right now 
is obviously a non-starter.

-- 
Jarod Wilson
jarod@redhat.com

^ permalink raw reply

* RE: [PATCH v5 2/4] siphash: add Nu{32,64} helpers
From: George Spelvin @ 2016-12-16 15:44 UTC (permalink / raw)
  To: ak, davem, David.Laight, ebiggers3, hannes, Jason,
	kernel-hardening, linux-crypto, linux-kernel, linux, luto, netdev,
	tom, torvalds, tytso, vegard.nossum
In-Reply-To: <063D6719AE5E284EB5DD2968C1650D6DB0240EFA@AcuExch.aculab.com>

Jason A. Donenfeld wrote:
> Isn't that equivalent to:
>	v0 = key[0];
>	v1 = key[1];
>	v2 = key[0] ^ (0x736f6d6570736575ULL ^ 0x646f72616e646f6dULL);
>	v3 = key[1] ^ (0x646f72616e646f6dULL ^ 0x7465646279746573ULL);

(Pre-XORing key[] with the first two constants which, if the constants
are random in the first place, can be a no-op.)  Other than the typo
in the v2 line, yes.  If they key is non-public, then you can xor an
arbitrary constant in to both halves to slightly speed up the startup.

(Nits: There's a typo in the v2 line, you don't need to parenthesize
associative operators like xor, and the "ull" suffix is redundant here.)

> Those constants also look like ASCII strings.

They are.  The ASCII is "somepseudorandomlygeneratedbytes".

> What cryptographic analysis has been done on the values?

They're "nothing up my sleeve numbers".

They're arbitrary numbers, and almost any other values would do exactly
as well.  The main properties are:

1) They're different (particulatly v0 != v2 and v1 != v3), and
2) Neither they, nor their xor, is rotationally symmetric like 0x55555555.
   (Because SipHash is mostly rotationally symmetric, broken only by the
   interruption of the carry chain at the msbit, it helps slightly
   to break this up at the beginning.)

Those exact values only matter for portability.  If you don't need anyone
else to be able to compute matching outputs, then you could use any other
convenient constants (like the MD5 round constants).

^ permalink raw reply

* Re: [PATCH perf/core REBASE 2/5] samples/bpf: Switch over to libbpf
From: Arnaldo Carvalho de Melo @ 2016-12-16 15:42 UTC (permalink / raw)
  To: Joe Stringer
  Cc: LKML, netdev, Wang Nan, ast, Daniel Borkmann,
	Arnaldo Carvalho de Melo
In-Reply-To: <CAPWQB7EAN7Fg8+vO3Tn7WYWcZW_JpKQFNCJzY_K100ckab6JRg@mail.gmail.com>

Em Thu, Dec 15, 2016 at 05:48:31PM -0800, Joe Stringer escreveu:
> On 15 December 2016 at 14:00, Joe Stringer <joe@ovn.org> wrote:
> > On 15 December 2016 at 10:34, Arnaldo Carvalho de Melo <acme@kernel.org> wrote:
> >> Em Thu, Dec 15, 2016 at 03:29:18PM -0300, Arnaldo Carvalho de Melo escreveu:
> >>> Em Thu, Dec 15, 2016 at 12:50:22PM -0300, Arnaldo Carvalho de Melo escreveu:
> >>> > Em Wed, Dec 14, 2016 at 02:43:39PM -0800, Joe Stringer escreveu:
> >>> > > Now that libbpf under tools/lib/bpf/* is synced with the version from
> >>> > > samples/bpf, we can get rid most of the libbpf library here.
> >>> > >
> >>> > > Signed-off-by: Joe Stringer <joe@ovn.org>
> >>> > > Cc: Alexei Starovoitov <ast@fb.com>
> >>> > > Cc: Daniel Borkmann <daniel@iogearbox.net>
> >>> > > Cc: Wang Nan <wangnan0@huawei.com>
> >>> > > Link: http://lkml.kernel.org/r/20161209024620.31660-6-joe@ovn.org
> >>> > > [ Use -I$(srctree)/tools/lib/ to support out of source code tree builds, as noticed by Wang Nan ]
> >>>
> >>> So, the above comment no longer applied to this adjusted patch from you,
> >>> as you removed one hunk too much, that, after applied, gets samples/bpf/
> >>> to build successfully:
> >>>
> >>> diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
> >>> index add514e2984a..81b0ef2f7994 100644
> >>> --- a/samples/bpf/Makefile
> >>> +++ b/samples/bpf/Makefile
> >>> @@ -107,6 +107,7 @@ always += lwt_len_hist_kern.o
> >>>  always += xdp_tx_iptunnel_kern.o
> >>>
> >>>  HOSTCFLAGS += -I$(objtree)/usr/include
> >>> +HOSTCFLAGS += -I$(srctree)/tools/lib/
> >>>  HOSTCFLAGS += -I$(srctree)/tools/testing/selftests/bpf/
> >>>
> >>>  HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
> >>>
> >>> ---------------------
> >>>
> >>> I added it, continuing...
> >>
> >> But then, when I tried to run offwaketime with it, it fails:
> >>
> >> [root@jouet bpf]# ./offwaketime  ls
> >> bpf_load_program() err=22
> >> BPF_LDX uses reserved fields
> >> bpf_load_program() err=22
> >> BPF_LDX uses reserved fields
> >> [root@jouet bpf]#
> >>
> >> If I remove this patch and try again, it works:
> >>
> >> [root@jouet bpf]# ./offwaketime | head -4
> >> swapper/1;start_secondary;cpu_startup_entry;schedule_preempt_disabled;schedule;__schedule;-;---;; 46
> >> chrome;return_from_SYSCALL_64;do_syscall_64;exit_to_usermode_loop;schedule;__schedule;-;try_to_wake_up;do_futex;sys_futex;do_syscall_64;return_from_SYSCALL_64;;Chrome_ChildIOT 1
> >> firefox;entry_SYSCALL_64_fastpath;sys_poll;do_sys_poll;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule;-;try_to_wake_up;pollwake;__wake_up_common;__wake_up_sync_key;pipe_write;__vfs_write;vfs_write;sys_write;entry_SYSCALL_64_fastpath;;Timer 3
> >> dockerd-current;entry_SYSCALL_64_fastpath;sys_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule;-;try_to_wake_up;futex_wake;do_futex;sys_futex;entry_SYSCALL_64_fastpath;;dockerd-current 2
> >> [root@jouet bpf]#
> >>
> >>
> >> So, I'm stopping here so that I can push what I have to Ingo, then I'll get
> >> back to this, hopefully by then you beat me and I have just to retest 8-)
> >
> > OK, thanks for the report. Looks like there was another difference
> > between the two libbpfs - one used total program size for its
> > load_program API; the actual kernel API uses instruction count. This
> > incremental should do the trick:
> >
> > https://github.com/joestringer/linux/commit/6ff7726f20077bed66fb725f5189c13690154b6a
> 
> The full branch with this change (fast-forward from your tmp branch)
> is available here:
> https://github.com/joestringer/linux/tree/submit/libbpf_samples_v5

Can you please repost the patches you changed, and just those, sometimes
I'm with limited net connectivity, so not having to go use the github
interface, figure out how to download the patches, etc, is a win.

- Arnaldo
 
> I tried running every selftest and BPF sample I could get my hands on;
> there's one or two that I couldn't run, but seemed more to do with my
> versions of TC/iproute and kernel config rather than libbpf changes.
> Let me know if you see any further trouble.

^ permalink raw reply

* Re: [PATCH iproute2 v2 1/3] ifstat: Add extended statistics to ifstat
From: Rami Rosen @ 2016-12-16 15:21 UTC (permalink / raw)
  To: Nogah Frankel
  Cc: Stephen Hemminger, netdev@vger.kernel.org,
	roopa@cumulusnetworks.com, Jiri Pirko, Elad Raz, Yotam Gigi,
	Ido Schimmel, Or Gerlitz
In-Reply-To: <DB5PR05MB1895991888E270CE790F1C16AC9C0@DB5PR05MB1895.eurprd05.prod.outlook.com>

Hi,

>Thanks, I'll fix it.
Another minor nit, on this occasion:

bool is_extanded should be: bool is_extended

Regards,
Rami Rosen

^ permalink raw reply

* Re: Soft lockup in inet_put_port on 4.6
From: Josef Bacik @ 2016-12-16 14:54 UTC (permalink / raw)
  To: Hannes Frederic Sowa
  Cc: Tom Herbert, Craig Gallek, Eric Dumazet,
	Linux Kernel Network Developers
In-Reply-To: <aca378b5-aae4-49cf-0542-d1167a5768b8@stressinduktion.org>

On Thu, Dec 15, 2016 at 7:07 PM, Hannes Frederic Sowa 
<hannes@stressinduktion.org> wrote:
> Hi Josef,
> 
> On 15.12.2016 19:53, Josef Bacik wrote:
>>  On Tue, Dec 13, 2016 at 6:32 PM, Tom Herbert <tom@herbertland.com> 
>> wrote:
>>>  On Tue, Dec 13, 2016 at 3:03 PM, Craig Gallek 
>>> <kraigatgoog@gmail.com>
>>>  wrote:
>>>>   On Tue, Dec 13, 2016 at 3:51 PM, Tom Herbert 
>>>> <tom@herbertland.com>
>>>>  wrote:
>>>>>   I think there may be some suspicious code in inet_csk_get_port. 
>>>>> At
>>>>>   tb_found there is:
>>>>> 
>>>>>                   if (((tb->fastreuse > 0 && reuse) ||
>>>>>                        (tb->fastreuseport > 0 &&
>>>>>                         !rcu_access_pointer(sk->sk_reuseport_cb) 
>>>>> &&
>>>>>                         sk->sk_reuseport && uid_eq(tb->fastuid,
>>>>>  uid))) &&
>>>>>                       smallest_size == -1)
>>>>>                           goto success;
>>>>>                   if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk,
>>>>>  tb, true)) {
>>>>>                           if ((reuse ||
>>>>>                                (tb->fastreuseport > 0 &&
>>>>>                                 sk->sk_reuseport &&
>>>>> 
>>>>>  !rcu_access_pointer(sk->sk_reuseport_cb) &&
>>>>>                                 uid_eq(tb->fastuid, uid))) &&
>>>>>                               smallest_size != -1 && --attempts 
>>>>> >= 0) {
>>>>>                                   spin_unlock_bh(&head->lock);
>>>>>                                   goto again;
>>>>>                           }
>>>>>                           goto fail_unlock;
>>>>>                   }
>>>>> 
>>>>>   AFAICT there is redundancy in these two conditionals.  The same 
>>>>> clause
>>>>>   is being checked in both: (tb->fastreuseport > 0 &&
>>>>>   !rcu_access_pointer(sk->sk_reuseport_cb) && sk->sk_reuseport &&
>>>>>   uid_eq(tb->fastuid, uid))) && smallest_size == -1. If this is 
>>>>> true the
>>>>>   first conditional should be hit, goto done,  and the second 
>>>>> will never
>>>>>   evaluate that part to true-- unless the sk is changed (do we 
>>>>> need
>>>>>   READ_ONCE for sk->sk_reuseport_cb?).
>>>>   That's an interesting point... It looks like this function also
>>>>   changed in 4.6 from using a single local_bh_disable() at the 
>>>> beginning
>>>>   with several spin_lock(&head->lock) to exclusively
>>>>   spin_lock_bh(&head->lock) at each locking point.  Perhaps the 
>>>> full bh
>>>>   disable variant was preventing the timers in your stack trace 
>>>> from
>>>>   running interleaved with this function before?
>>> 
>>>  Could be, although dropping the lock shouldn't be able to affect 
>>> the
>>>  search state. TBH, I'm a little lost in reading function, the
>>>  SO_REUSEPORT handling is pretty complicated. For instance,
>>>  rcu_access_pointer(sk->sk_reuseport_cb) is checked three times in 
>>> that
>>>  function and also in every call to inet_csk_bind_conflict. I 
>>> wonder if
>>>  we can simply this under the assumption that SO_REUSEPORT is only
>>>  allowed if the port number (snum) is explicitly specified.
>> 
>>  Ok first I have data for you Hannes, here's the time distributions
>>  before during and after the lockup (with all the debugging in place 
>> the
>>  box eventually recovers).  I've attached it as a text file since it 
>> is
>>  long.
> 
> Thanks a lot!
> 
>>  Second is I was thinking about why we would spend so much time 
>> doing the
>>  ->owners list, and obviously it's because of the massive amount of
>>  timewait sockets on the owners list.  I wrote the following dumb 
>> patch
>>  and tested it and the problem has disappeared completely.  Now I 
>> don't
>>  know if this is right at all, but I thought it was weird we weren't
>>  copying the soreuseport option from the original socket onto the 
>> twsk.
>>  Is there are reason we aren't doing this currently?  Does this help
>>  explain what is happening?  Thanks,
> 
> The patch is interesting and a good clue, but I am immediately a bit
> concerned that we don't copy/tag the socket with the uid also to keep
> the security properties for SO_REUSEPORT. I have to think a bit more
> about this.
> 
> We have seen hangs during connect. I am afraid this patch wouldn't 
> help
> there while also guaranteeing uniqueness.


Yeah so I looked at the code some more and actually my patch is really 
bad.  If sk2->sk_reuseport is set we'll look at sk2->sk_reuseport_cb, 
which is outside of the timewait sock, so that's definitely bad.

But we should at least be setting it to 0 so that we don't do this 
normally.  Unfortunately simply setting it to 0 doesn't fix the 
problem.  So for some reason having ->sk_reuseport set to 1 on a 
timewait socket makes this problem non-existent, which is strange.

So back to the drawing board I guess.  I wonder if doing what craig 
suggested and batching the timewait timer expires so it hurts less 
would accomplish the same results.  Thanks,

Josef

^ permalink raw reply

* RE: [PATCH iproute2 v2 1/3] ifstat: Add extended statistics to ifstat
From: Nogah Frankel @ 2016-12-16 12:11 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: netdev@vger.kernel.org, roopa@cumulusnetworks.com, Jiri Pirko,
	Elad Raz, Yotam Gigi, Ido Schimmel, Or Gerlitz
In-Reply-To: <20161215093327.2daf60c6@xeon-e3>


> -----Original Message-----
> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Thursday, December 15, 2016 7:33 PM
> To: Nogah Frankel <nogahf@mellanox.com>
> Cc: netdev@vger.kernel.org; roopa@cumulusnetworks.com; Jiri Pirko
> <jiri@mellanox.com>; Elad Raz <eladr@mellanox.com>; Yotam Gigi
> <yotamg@mellanox.com>; Ido Schimmel <idosch@mellanox.com>; Or Gerlitz
> <ogerlitz@mellanox.com>
> Subject: Re: [PATCH iproute2 v2 1/3] ifstat: Add extended statistics to ifstat
> 
> On Thu, 15 Dec 2016 15:00:43 +0200
> Nogah Frankel <nogahf@mellanox.com> wrote:
> 
> > Extended stats are part of the RTM_GETSTATS method. This patch adds them
> > to ifstat.
> > While extended stats can come in many forms, we support only the
> > rtnl_link_stats64 struct for them (which is the 64 bits version of struct
> > rtnl_link_stats).
> > We support stats in the main nesting level, or one lower.
> > The extension can be called by its name or any shorten of it. If there is
> > more than one matched, the first one will be picked.
> >
> > To get the extended stats the flag -x <stats type> is used.
> >
> > Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
> > Reviewed-by: Jiri Pirko <jiri@mellanox.com>
> > ---
> >  misc/ifstat.c | 161
> ++++++++++++++++++++++++++++++++++++++++++++++++++++------
> >  1 file changed, 146 insertions(+), 15 deletions(-)
> >
> > diff --git a/misc/ifstat.c b/misc/ifstat.c
> > index 92d67b0..d17ae21 100644
> > --- a/misc/ifstat.c
> > +++ b/misc/ifstat.c
> > @@ -35,6 +35,7 @@
> >
> >  #include <SNAPSHOT.h>
> >
> > +#include "utils.h"
> >  int dump_zeros;
> >  int reset_history;
> >  int ignore_history;
> 
> Minor nit, please cleanup include order here (original code was wrong).
> 
> Standard practice is:
>  #include system headers (like stdio.h etc)
>  #include "xxx.h" local headers.
> 
> Should be:
> #include <getopt.h>
> 
> #include <linux/if.h>
> #include <linux/if_link.h>
> 
> #include "json_writer.h"
> #include "libnetlink.h"
> #include "utils.h"
> #include "SNAPSHOT.h"

Thanks, I'll fix it.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox