* [net-next 7/8] tipc: handle collisions of 32-bit node address hash values
From: Jon Maloy @ 2018-03-22 19:42 UTC (permalink / raw)
To: davem, netdev; +Cc: tipc-discussion, mohan.krishna.ghanta.krishnamurthy
In-Reply-To: <1521747772-7727-1-git-send-email-jon.maloy@ericsson.com>
When a 32-bit node address is generated from a 128-bit identifier,
there is a risk of collisions which must be discovered and handled.
We do this as follows:
- We don't apply the generated address immediately to the node, but do
instead initiate a 1 sec trial period to allow other cluster members
to discover and handle such collisions.
- During the trial period the node periodically sends out a new type
of message, DSC_TRIAL_MSG, using broadcast or emulated broadcast,
to all the other nodes in the cluster.
- When a node receives such a message, it must check that the
presented 32-bit identifier either is unused, or was used by the very
same peer in a previous session. In both cases it accepts the request
by not responding to it.
- If it finds that the same node has been up before using a different
address, it responds with a DSC_TRIAL_FAIL_MSG containing that
address.
- If it finds that the address has already been taken by some other
node, it generates a new, unused address and returns it to the
requester.
- During the trial period the requesting node must always be prepared
to accept a failure message, i.e., a message where a peer suggests a
different (or equal) address to the one tried. In those cases it
must apply the suggested value as trial address and restart the trial
period.
This algorithm ensures that in the vast majority of cases a node will
have the same address before and after a reboot. If a legacy user
configures the address explicitly, there will be neither a trial period
nor trial messages, so this protocol addition is completely backwards compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
---
net/tipc/addr.c | 3 +-
net/tipc/bearer.c | 3 +-
net/tipc/core.c | 2 +
net/tipc/core.h | 2 +
net/tipc/discover.c | 126 ++++++++++++++++++++++++++++++++++++++++++++--------
net/tipc/link.c | 26 +++++++----
net/tipc/link.h | 4 +-
net/tipc/msg.h | 23 +++++++++-
net/tipc/net.c | 4 +-
net/tipc/node.c | 85 ++++++++++++++++++++++++++++++++---
net/tipc/node.h | 3 +-
11 files changed, 236 insertions(+), 45 deletions(-)
diff --git a/net/tipc/addr.c b/net/tipc/addr.c
index 4841e98..b88d48d 100644
--- a/net/tipc/addr.c
+++ b/net/tipc/addr.c
@@ -59,7 +59,7 @@ void tipc_set_node_id(struct net *net, u8 *id)
memcpy(tn->node_id, id, NODE_ID_LEN);
tipc_nodeid2string(tn->node_id_string, id);
- tn->node_addr = tmp[0] ^ tmp[1] ^ tmp[2] ^ tmp[3];
+ tn->trial_addr = tmp[0] ^ tmp[1] ^ tmp[2] ^ tmp[3];
pr_info("Own node identity %s, cluster identity %u\n",
tipc_own_id_string(net), tn->net_id);
}
@@ -74,6 +74,7 @@ void tipc_set_node_addr(struct net *net, u32 addr)
sprintf(node_id, "%x", addr);
tipc_set_node_id(net, node_id);
}
+ tn->trial_addr = addr;
pr_info("32-bit node address hash set to %x\n", addr);
}
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index a71f318..ae5b44c 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -235,7 +235,6 @@ static int tipc_enable_bearer(struct net *net, const char *name,
{
struct tipc_net *tn = tipc_net(net);
struct tipc_bearer_names b_names;
- u32 self = tipc_own_addr(net);
int with_this_prio = 1;
struct tipc_bearer *b;
struct tipc_media *m;
@@ -244,7 +243,7 @@ static int tipc_enable_bearer(struct net *net, const char *name,
int res = -EINVAL;
char *errstr = "";
- if (!self) {
+ if (!tipc_own_id(net)) {
errstr = "not supported in standalone mode";
res = -ENOPROTOOPT;
goto rejected;
diff --git a/net/tipc/core.c b/net/tipc/core.c
index e92fed4..52dfc51 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -57,6 +57,8 @@ static int __net_init tipc_init_net(struct net *net)
tn->net_id = 4711;
tn->node_addr = 0;
+ tn->trial_addr = 0;
+ tn->addr_trial_end = 0;
memset(tn->node_id, 0, sizeof(tn->node_id));
memset(tn->node_id_string, 0, sizeof(tn->node_id_string));
tn->mon_threshold = TIPC_DEF_MON_THRESHOLD;
diff --git a/net/tipc/core.h b/net/tipc/core.h
index eabad41..d0f64ca 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -82,6 +82,8 @@ extern int sysctl_tipc_named_timeout __read_mostly;
struct tipc_net {
u8 node_id[NODE_ID_LEN];
u32 node_addr;
+ u32 trial_addr;
+ unsigned long addr_trial_end;
char node_id_string[NODE_ID_STR_LEN];
int net_id;
int random;
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index b4c4cd1..e765573 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -1,7 +1,7 @@
/*
* net/tipc/discover.c
*
- * Copyright (c) 2003-2006, 2014-2015, Ericsson AB
+ * Copyright (c) 2003-2006, 2014-2018, Ericsson AB
* Copyright (c) 2005-2006, 2010-2011, Wind River Systems
* All rights reserved.
*
@@ -78,34 +78,40 @@ struct tipc_discoverer {
* @b: ptr to bearer issuing message
*/
static void tipc_disc_init_msg(struct net *net, struct sk_buff *skb,
- u32 mtyp, struct tipc_bearer *b)
+ u32 mtyp, struct tipc_bearer *b)
{
struct tipc_net *tn = tipc_net(net);
- u32 self = tipc_own_addr(net);
u32 dest_domain = b->domain;
struct tipc_msg *hdr;
hdr = buf_msg(skb);
- tipc_msg_init(self, hdr, LINK_CONFIG, mtyp,
+ tipc_msg_init(tn->trial_addr, hdr, LINK_CONFIG, mtyp,
MAX_H_SIZE, dest_domain);
+ msg_set_size(hdr, MAX_H_SIZE + NODE_ID_LEN);
msg_set_non_seq(hdr, 1);
msg_set_node_sig(hdr, tn->random);
msg_set_node_capabilities(hdr, TIPC_NODE_CAPABILITIES);
msg_set_dest_domain(hdr, dest_domain);
msg_set_bc_netid(hdr, tn->net_id);
b->media->addr2msg(msg_media_addr(hdr), &b->addr);
+ msg_set_node_id(hdr, tipc_own_id(net));
}
-static void tipc_disc_msg_xmit(struct net *net, u32 mtyp, u32 dst, u32 src,
+static void tipc_disc_msg_xmit(struct net *net, u32 mtyp, u32 dst,
+ u32 src, u32 sugg_addr,
struct tipc_media_addr *maddr,
struct tipc_bearer *b)
{
+ struct tipc_msg *hdr;
struct sk_buff *skb;
- skb = tipc_buf_acquire(MAX_H_SIZE, GFP_ATOMIC);
+ skb = tipc_buf_acquire(MAX_H_SIZE + NODE_ID_LEN, GFP_ATOMIC);
if (!skb)
return;
+ hdr = buf_msg(skb);
tipc_disc_init_msg(net, skb, mtyp, b);
+ msg_set_sugg_node_addr(hdr, sugg_addr);
+ msg_set_dest_domain(hdr, dst);
tipc_bearer_xmit_skb(net, b->identity, skb, maddr);
}
@@ -126,6 +132,52 @@ static void disc_dupl_alert(struct tipc_bearer *b, u32 node_addr,
media_addr_str, b->name);
}
+/* tipc_disc_addr_trial_msg(): - handle an address uniqueness trial from peer
+ */
+bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d,
+ struct tipc_media_addr *maddr,
+ struct tipc_bearer *b,
+ u32 dst, u32 src,
+ u32 sugg_addr,
+ u8 *peer_id,
+ int mtyp)
+{
+ struct net *net = d->net;
+ struct tipc_net *tn = tipc_net(net);
+ bool trial = time_before(jiffies, tn->addr_trial_end);
+ u32 self = tipc_own_addr(net);
+
+ if (mtyp == DSC_TRIAL_FAIL_MSG) {
+ if (!trial)
+ return true;
+
+ /* Ignore if somebody else already gave new suggestion */
+ if (dst != tn->trial_addr)
+ return true;
+
+ /* Otherwise update trial address and restart trial period */
+ tn->trial_addr = sugg_addr;
+ msg_set_prevnode(buf_msg(d->skb), sugg_addr);
+ tn->addr_trial_end = jiffies + msecs_to_jiffies(1000);
+ return true;
+ }
+
+ /* Apply trial address if we just left trial period */
+ if (!trial && !self) {
+ tipc_net_finalize(net, tn->trial_addr);
+ msg_set_type(buf_msg(d->skb), DSC_REQ_MSG);
+ }
+
+ if (mtyp != DSC_TRIAL_MSG)
+ return false;
+
+ sugg_addr = tipc_node_try_addr(net, peer_id, src);
+ if (sugg_addr)
+ tipc_disc_msg_xmit(net, DSC_TRIAL_FAIL_MSG, src,
+ self, sugg_addr, maddr, b);
+ return true;
+}
+
/**
* tipc_disc_rcv - handle incoming discovery message (request or response)
* @net: applicable net namespace
@@ -139,17 +191,27 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb,
struct tipc_msg *hdr = buf_msg(skb);
u16 caps = msg_node_capabilities(hdr);
bool legacy = tn->legacy_addr_format;
+ u32 sugg = msg_sugg_node_addr(hdr);
u32 signature = msg_node_sig(hdr);
+ u8 peer_id[NODE_ID_LEN] = {0,};
u32 dst = msg_dest_domain(hdr);
u32 net_id = msg_bc_netid(hdr);
- u32 self = tipc_own_addr(net);
struct tipc_media_addr maddr;
u32 src = msg_prevnode(hdr);
u32 mtyp = msg_type(hdr);
bool dupl_addr = false;
bool respond = false;
+ u32 self;
int err;
+ skb_linearize(skb);
+ hdr = buf_msg(skb);
+
+ if (caps & TIPC_NODE_ID128)
+ memcpy(peer_id, msg_node_id(hdr), NODE_ID_LEN);
+ else
+ sprintf(peer_id, "%x", src);
+
err = b->media->msg2addr(b, &maddr, msg_media_addr(hdr));
kfree_skb(skb);
if (err || maddr.broadcast) {
@@ -161,6 +223,12 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb,
return;
if (net_id != tn->net_id)
return;
+ if (tipc_disc_addr_trial_msg(b->disc, &maddr, b, dst,
+ src, sugg, peer_id, mtyp))
+ return;
+ self = tipc_own_addr(net);
+
+ /* Message from somebody using this node's address */
if (in_own_node(net, src)) {
disc_dupl_alert(b, self, &maddr);
return;
@@ -169,8 +237,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb,
return;
if (!tipc_in_scope(legacy, b->domain, src))
return;
-
- tipc_node_check_dest(net, src, b, caps, signature,
+ tipc_node_check_dest(net, src, peer_id, b, caps, signature,
&maddr, &respond, &dupl_addr);
if (dupl_addr)
disc_dupl_alert(b, src, &maddr);
@@ -178,7 +245,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb,
return;
if (mtyp != DSC_REQ_MSG)
return;
- tipc_disc_msg_xmit(net, DSC_RESP_MSG, src, self, &maddr, b);
+ tipc_disc_msg_xmit(net, DSC_RESP_MSG, src, self, 0, &maddr, b);
}
/* tipc_disc_add_dest - increment set of discovered nodes
@@ -216,9 +283,11 @@ void tipc_disc_remove_dest(struct tipc_discoverer *d)
static void tipc_disc_timeout(struct timer_list *t)
{
struct tipc_discoverer *d = from_timer(d, t, timer);
+ struct tipc_net *tn = tipc_net(d->net);
+ u32 self = tipc_own_addr(d->net);
struct tipc_media_addr maddr;
struct sk_buff *skb = NULL;
- struct net *net;
+ struct net *net = d->net;
u32 bearer_id;
spin_lock_bh(&d->lock);
@@ -228,16 +297,29 @@ static void tipc_disc_timeout(struct timer_list *t)
d->timer_intv = TIPC_DISC_INACTIVE;
goto exit;
}
+
+ /* Did we just leave the address trial period ? */
+ if (!self && !time_before(jiffies, tn->addr_trial_end)) {
+ self = tn->trial_addr;
+ tipc_net_finalize(net, self);
+ msg_set_prevnode(buf_msg(d->skb), self);
+ msg_set_type(buf_msg(d->skb), DSC_REQ_MSG);
+ }
+
/* Adjust timeout interval according to discovery phase */
- d->timer_intv *= 2;
- if (d->num_nodes && d->timer_intv > TIPC_DISC_SLOW)
- d->timer_intv = TIPC_DISC_SLOW;
- else if (!d->num_nodes && d->timer_intv > TIPC_DISC_FAST)
- d->timer_intv = TIPC_DISC_FAST;
+ if (time_before(jiffies, tn->addr_trial_end)) {
+ d->timer_intv = TIPC_DISC_INIT;
+ } else {
+ d->timer_intv *= 2;
+ if (d->num_nodes && d->timer_intv > TIPC_DISC_SLOW)
+ d->timer_intv = TIPC_DISC_SLOW;
+ else if (!d->num_nodes && d->timer_intv > TIPC_DISC_FAST)
+ d->timer_intv = TIPC_DISC_FAST;
+ }
+
mod_timer(&d->timer, jiffies + d->timer_intv);
memcpy(&maddr, &d->dest, sizeof(maddr));
skb = skb_clone(d->skb, GFP_ATOMIC);
- net = d->net;
bearer_id = d->bearer_id;
exit:
spin_unlock_bh(&d->lock);
@@ -257,18 +339,24 @@ static void tipc_disc_timeout(struct timer_list *t)
int tipc_disc_create(struct net *net, struct tipc_bearer *b,
struct tipc_media_addr *dest, struct sk_buff **skb)
{
+ struct tipc_net *tn = tipc_net(net);
struct tipc_discoverer *d;
d = kmalloc(sizeof(*d), GFP_ATOMIC);
if (!d)
return -ENOMEM;
- d->skb = tipc_buf_acquire(MAX_H_SIZE, GFP_ATOMIC);
+ d->skb = tipc_buf_acquire(MAX_H_SIZE + NODE_ID_LEN, GFP_ATOMIC);
if (!d->skb) {
kfree(d);
return -ENOMEM;
}
-
tipc_disc_init_msg(net, d->skb, DSC_REQ_MSG, b);
+
+ /* Do we need an address trial period first ? */
+ if (!tipc_own_addr(net)) {
+ tn->addr_trial_end = jiffies + msecs_to_jiffies(1000);
+ msg_set_type(buf_msg(d->skb), DSC_TRIAL_MSG);
+ }
memcpy(&d->dest, dest, sizeof(*dest));
d->net = net;
d->bearer_id = b->identity;
diff --git a/net/tipc/link.c b/net/tipc/link.c
index bcd76b1..1289b4b 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -434,15 +434,16 @@ char *tipc_link_name(struct tipc_link *l)
*/
bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
int tolerance, char net_plane, u32 mtu, int priority,
- int window, u32 session, u32 self, u32 peer,
- u16 peer_caps,
+ int window, u32 session, u32 self,
+ u32 peer, u8 *peer_id, u16 peer_caps,
struct tipc_link *bc_sndlink,
struct tipc_link *bc_rcvlink,
struct sk_buff_head *inputq,
struct sk_buff_head *namedq,
struct tipc_link **link)
{
- char *self_str = tipc_own_id_string(net);
+ char peer_str[NODE_ID_STR_LEN] = {0,};
+ char self_str[NODE_ID_STR_LEN] = {0,};
struct tipc_link *l;
l = kzalloc(sizeof(*l), GFP_ATOMIC);
@@ -451,11 +452,18 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
*link = l;
l->session = session;
- /* Note: peer i/f name is completed by reset/activate message */
- if (strlen(self_str) > 16)
- sprintf(l->name, "%x:%s-%x:unknown", self, if_name, peer);
- else
- sprintf(l->name, "%s:%s-%x:unknown", self_str, if_name, peer);
+ /* Set link name for unicast links only */
+ if (peer_id) {
+ tipc_nodeid2string(self_str, tipc_own_id(net));
+ if (strlen(self_str) > 16)
+ sprintf(self_str, "%x", self);
+ tipc_nodeid2string(peer_str, peer_id);
+ if (strlen(peer_str) > 16)
+ sprintf(peer_str, "%x", peer);
+ }
+ /* Peer i/f name will be completed by reset/activate message */
+ sprintf(l->name, "%s:%s-%s:unknown", self_str, if_name, peer_str);
+
strcpy(l->if_name, if_name);
l->addr = peer;
l->peer_caps = peer_caps;
@@ -503,7 +511,7 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer,
struct tipc_link *l;
if (!tipc_link_create(net, "", MAX_BEARERS, 0, 'Z', mtu, 0, window,
- 0, ownnode, peer, peer_caps, bc_sndlink,
+ 0, ownnode, peer, NULL, peer_caps, bc_sndlink,
NULL, inputq, namedq, link))
return false;
diff --git a/net/tipc/link.h b/net/tipc/link.h
index d1bd178..ec59348 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -73,8 +73,8 @@ enum {
bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
int tolerance, char net_plane, u32 mtu, int priority,
- int window, u32 session, u32 ownnode, u32 peer,
- u16 peer_caps,
+ int window, u32 session, u32 ownnode,
+ u32 peer, u8 *peer_id, u16 peer_caps,
struct tipc_link *bc_sndlink,
struct tipc_link *bc_rcvlink,
struct sk_buff_head *inputq,
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index b4ba1b4..a4e944d 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -550,6 +550,8 @@ static inline void msg_set_nameupper(struct tipc_msg *m, u32 n)
*/
#define DSC_REQ_MSG 0
#define DSC_RESP_MSG 1
+#define DSC_TRIAL_MSG 2
+#define DSC_TRIAL_FAIL_MSG 3
/*
* Group protocol message types
@@ -627,7 +629,6 @@ static inline void msg_set_bcgap_to(struct tipc_msg *m, u32 n)
msg_set_bits(m, 2, 0, 0xffff, n);
}
-
/*
* Word 4
*/
@@ -925,6 +926,26 @@ static inline bool msg_is_reset(struct tipc_msg *hdr)
return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG);
}
+static inline u32 msg_sugg_node_addr(struct tipc_msg *m)
+{
+ return msg_word(m, 14);
+}
+
+static inline void msg_set_sugg_node_addr(struct tipc_msg *m, u32 n)
+{
+ msg_set_word(m, 14, n);
+}
+
+static inline void msg_set_node_id(struct tipc_msg *hdr, u8 *id)
+{
+ memcpy(msg_data(hdr), id, 16);
+}
+
+static inline u8 *msg_node_id(struct tipc_msg *hdr)
+{
+ return (u8 *)msg_data(hdr);
+}
+
struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp);
bool tipc_msg_validate(struct sk_buff **_skb);
bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err);
diff --git a/net/tipc/net.c b/net/tipc/net.c
index e786748..29538dc 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -112,10 +112,8 @@ int tipc_net_init(struct net *net, u8 *node_id, u32 addr)
}
pr_info("Started in network mode\n");
- if (node_id) {
+ if (node_id)
tipc_set_node_id(net, node_id);
- tipc_net_finalize(net, tipc_own_addr(net));
- }
if (addr)
tipc_net_finalize(net, addr);
return 0;
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 7b0c993..4a95c8c 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -115,6 +115,7 @@ struct tipc_node {
u16 capabilities;
u32 signature;
u32 link_id;
+ u8 peer_id[16];
struct list_head publ_list;
struct list_head conn_sks;
unsigned long keepalive_intv;
@@ -156,6 +157,7 @@ static void tipc_node_delete(struct tipc_node *node);
static void tipc_node_timeout(struct timer_list *t);
static void tipc_node_fsm_evt(struct tipc_node *n, int evt);
static struct tipc_node *tipc_node_find(struct net *net, u32 addr);
+static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id);
static void tipc_node_put(struct tipc_node *node);
static bool node_is_up(struct tipc_node *n);
@@ -245,6 +247,30 @@ static struct tipc_node *tipc_node_find(struct net *net, u32 addr)
return node;
}
+/* tipc_node_find_by_id - locate specified node object by its 128-bit id
+ * Note: this function is called only when a discovery request failed
+ * to find the node by its 32-bit id, and is not time critical
+ */
+static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id)
+{
+ struct tipc_net *tn = tipc_net(net);
+ struct tipc_node *n;
+ bool found = false;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(n, &tn->node_list, list) {
+ read_lock_bh(&n->lock);
+ if (!memcmp(id, n->peer_id, 16) &&
+ kref_get_unless_zero(&n->kref))
+ found = true;
+ read_unlock_bh(&n->lock);
+ if (found)
+ break;
+ }
+ rcu_read_unlock();
+ return found ? n : NULL;
+}
+
static void tipc_node_read_lock(struct tipc_node *n)
{
read_lock_bh(&n->lock);
@@ -307,7 +333,8 @@ static void tipc_node_write_unlock(struct tipc_node *n)
}
}
-struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities)
+struct tipc_node *tipc_node_create(struct net *net, u32 addr,
+ u8 *peer_id, u16 capabilities)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
struct tipc_node *n, *temp_node;
@@ -326,6 +353,7 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities)
goto exit;
}
n->addr = addr;
+ memcpy(&n->peer_id, peer_id, 16);
n->net = net;
n->capabilities = capabilities;
kref_init(&n->kref);
@@ -344,8 +372,8 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities)
n->signature = INVALID_NODE_SIG;
n->active_links[0] = INVALID_BEARER_ID;
n->active_links[1] = INVALID_BEARER_ID;
- if (!tipc_link_bc_create(net, tipc_own_addr(net), n->addr,
- U16_MAX,
+ if (!tipc_link_bc_create(net, tipc_own_addr(net),
+ addr, U16_MAX,
tipc_link_window(tipc_bc_sndlink(net)),
n->capabilities,
&n->bc_entry.inputq1,
@@ -735,8 +763,51 @@ bool tipc_node_is_up(struct net *net, u32 addr)
return retval;
}
-void tipc_node_check_dest(struct net *net, u32 onode,
- struct tipc_bearer *b,
+static u32 tipc_node_suggest_addr(struct net *net, u32 addr)
+{
+ struct tipc_node *n;
+
+ addr ^= tipc_net(net)->random;
+ while ((n = tipc_node_find(net, addr))) {
+ tipc_node_put(n);
+ addr++;
+ }
+ return addr;
+}
+
+/* tipc_node_try_addr(): Check if addr can be used by peer, suggest other if not
+ */
+u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr)
+{
+ struct tipc_net *tn = tipc_net(net);
+ struct tipc_node *n;
+
+ /* Suggest new address if some other peer is using this one */
+ n = tipc_node_find(net, addr);
+ if (n) {
+ if (!memcmp(n->peer_id, id, NODE_ID_LEN))
+ addr = 0;
+ tipc_node_put(n);
+ if (!addr)
+ return 0;
+ return tipc_node_suggest_addr(net, addr);
+ }
+
+ /* Suggest previously used address if peer is known */
+ n = tipc_node_find_by_id(net, id);
+ if (n) {
+ addr = n->addr;
+ tipc_node_put(n);
+ }
+ /* Even this node may be in trial phase */
+ if (tn->trial_addr == addr)
+ return tipc_node_suggest_addr(net, addr);
+
+ return addr;
+}
+
+void tipc_node_check_dest(struct net *net, u32 addr,
+ u8 *peer_id, struct tipc_bearer *b,
u16 capabilities, u32 signature,
struct tipc_media_addr *maddr,
bool *respond, bool *dupl_addr)
@@ -755,7 +826,7 @@ void tipc_node_check_dest(struct net *net, u32 onode,
*dupl_addr = false;
*respond = false;
- n = tipc_node_create(net, onode, capabilities);
+ n = tipc_node_create(net, addr, peer_id, capabilities);
if (!n)
return;
@@ -840,7 +911,7 @@ void tipc_node_check_dest(struct net *net, u32 onode,
if (!tipc_link_create(net, if_name, b->identity, b->tolerance,
b->net_plane, b->mtu, b->priority,
b->window, mod(tipc_net(net)->random),
- tipc_own_addr(net), onode,
+ tipc_own_addr(net), addr, peer_id,
n->capabilities,
tipc_bc_sndlink(n->net), n->bc_entry.link,
&le->inputq,
diff --git a/net/tipc/node.h b/net/tipc/node.h
index e06faf4..f24b835 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -60,7 +60,8 @@ enum {
#define INVALID_BEARER_ID -1
void tipc_node_stop(struct net *net);
-void tipc_node_check_dest(struct net *net, u32 onode,
+u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr);
+void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128,
struct tipc_bearer *bearer,
u16 capabilities, u32 signature,
struct tipc_media_addr *maddr,
--
2.1.4
------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
^ permalink raw reply related
* [net-next 8/8] tipc: obtain node identity from interface by default
From: Jon Maloy @ 2018-03-22 19:42 UTC (permalink / raw)
To: davem, netdev; +Cc: tipc-discussion, mohan.krishna.ghanta.krishnamurthy
In-Reply-To: <1521747772-7727-1-git-send-email-jon.maloy@ericsson.com>
Selecting and explicitly configuring a TIPC node identity may be
unwanted in some cases.
In this commit we introduce a default setting if the identity has not
been set at the moment the first bearer is enabled. We do this by
using a raw copy of a unique identifier from the used interface: MAC
address in the case of an L2 bearer, IPv4/IPv6 address in the case
of a UDP bearer.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
---
net/tipc/bearer.c | 24 +++++++++++++++---------
net/tipc/net.h | 1 +
net/tipc/udp_media.c | 13 +++++++++++++
3 files changed, 29 insertions(+), 9 deletions(-)
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index ae5b44c..f7d47c8 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -243,12 +243,6 @@ static int tipc_enable_bearer(struct net *net, const char *name,
int res = -EINVAL;
char *errstr = "";
- if (!tipc_own_id(net)) {
- errstr = "not supported in standalone mode";
- res = -ENOPROTOOPT;
- goto rejected;
- }
-
if (!bearer_name_validate(name, &b_names)) {
errstr = "illegal name";
goto rejected;
@@ -381,11 +375,13 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b)
int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
struct nlattr *attr[])
{
+ char *dev_name = strchr((const char *)b->name, ':') + 1;
+ int hwaddr_len = b->media->hwaddr_len;
+ u8 node_id[NODE_ID_LEN] = {0,};
struct net_device *dev;
- char *driver_name = strchr((const char *)b->name, ':') + 1;
/* Find device with specified name */
- dev = dev_get_by_name(net, driver_name);
+ dev = dev_get_by_name(net, dev_name);
if (!dev)
return -ENODEV;
if (tipc_mtu_bad(dev, 0)) {
@@ -393,6 +389,16 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
return -EINVAL;
}
+ /* Autoconfigure own node identity if needed */
+ if (!tipc_own_id(net) && hwaddr_len <= NODE_ID_LEN) {
+ memcpy(node_id, dev->dev_addr, hwaddr_len);
+ tipc_net_init(net, node_id, 0);
+ }
+ if (!tipc_own_id(net)) {
+ pr_warn("Failed to obtain node identity\n");
+ return -EINVAL;
+ }
+
/* Associate TIPC bearer with L2 bearer */
rcu_assign_pointer(b->media_ptr, dev);
b->pt.dev = dev;
@@ -400,7 +406,7 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
b->pt.func = tipc_l2_rcv_msg;
dev_add_pack(&b->pt);
memset(&b->bcast_addr, 0, sizeof(b->bcast_addr));
- memcpy(b->bcast_addr.value, dev->broadcast, b->media->hwaddr_len);
+ memcpy(b->bcast_addr.value, dev->broadcast, hwaddr_len);
b->bcast_addr.media_id = b->media->type_id;
b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT;
b->mtu = dev->mtu;
diff --git a/net/tipc/net.h b/net/tipc/net.h
index 08efa60..09ad02b 100644
--- a/net/tipc/net.h
+++ b/net/tipc/net.h
@@ -41,6 +41,7 @@
extern const struct nla_policy tipc_nl_net_policy[];
+int tipc_net_init(struct net *net, u8 *node_id, u32 addr);
void tipc_net_finalize(struct net *net, u32 addr);
void tipc_net_stop(struct net *net);
int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb);
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index 3deabca..2c13b18 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -47,6 +47,8 @@
#include <net/addrconf.h>
#include <linux/tipc_netlink.h>
#include "core.h"
+#include "addr.h"
+#include "net.h"
#include "bearer.h"
#include "netlink.h"
#include "msg.h"
@@ -647,6 +649,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
struct udp_port_cfg udp_conf = {0};
struct udp_tunnel_sock_cfg tuncfg = {NULL};
struct nlattr *opts[TIPC_NLA_UDP_MAX + 1];
+ u8 node_id[NODE_ID_LEN] = {0,};
ub = kzalloc(sizeof(*ub), GFP_ATOMIC);
if (!ub)
@@ -677,6 +680,16 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
if (err)
goto err;
+ /* Autoconfigure own node identity if needed */
+ if (!tipc_own_id(net)) {
+ memcpy(node_id, local.ipv6.in6_u.u6_addr8, 16);
+ tipc_net_init(net, node_id, 0);
+ }
+ if (!tipc_own_id(net)) {
+ pr_warn("Failed to set node id, please configure manually\n");
+ return -EINVAL;
+ }
+
b->bcast_addr.media_id = TIPC_MEDIA_TYPE_UDP;
b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT;
rcu_assign_pointer(b->media_ptr, ub);
--
2.1.4
------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
^ permalink raw reply related
* Re: [PATCH net-next v2] net: mvpp2: Don't use dynamic allocs for local variables
From: David Miller @ 2018-03-22 19:43 UTC (permalink / raw)
To: maxime.chevallier
Cc: netdev, linux-kernel, antoine.tenart, thomas.petazzoni,
gregory.clement, miquel.raynal, nadavh, stefanc, ymarkman, mw
In-Reply-To: <20180322201453.706b0ab8@bootlin.com>
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Thu, 22 Mar 2018 20:14:53 +0100
> Hello David,
>
> On Thu, 22 Mar 2018 14:47:09 -0400 (EDT),
> David Miller <davem@davemloft.net> wrote :
>
>> From: Maxime Chevallier <maxime.chevallier@bootlin.com>
>> Date: Wed, 21 Mar 2018 16:14:00 +0100
>>
>> In order to be an equivalent change you must bzero out this 'pe'
>> object on the stack. You are only initializing the index member
>> before passing it into other functions.
>
> I agree that this is unclear, but the functions I pass these objects to
> only need the index field to be set, and will fill the rest of the
> object according to the underlying HW representation (these objects
> mirror the HW configuration).
>
> I can see that this is confusing, we might want to make the
> mvpp2_prs_hw_read function more explicit about this.
>
> Would comments explaining this be enough, or should I try another way to
> make this cleaner ?
Please bzero the object as I have asked you to.
Today the function doesn't care about any input members other than
the index member, but in the future it might, and this is a bug
waiting to happen.
It is never good to pass partially initialized variables into
another piece of code.
^ permalink raw reply
* Re: [PATCH net-next v2] net: mvpp2: Don't use dynamic allocs for local variables
From: Maxime Chevallier @ 2018-03-22 19:53 UTC (permalink / raw)
To: David Miller, miquel.raynal
Cc: netdev, linux-kernel, antoine.tenart, thomas.petazzoni,
gregory.clement, nadavh, stefanc, ymarkman, mw
In-Reply-To: <20180322.154308.1553892464667808498.davem@davemloft.net>
On Thu, 22 Mar 2018 15:43:08 -0400 (EDT),
David Miller <davem@davemloft.net> wrote :
> From: Maxime Chevallier <maxime.chevallier@bootlin.com>
> Date: Thu, 22 Mar 2018 20:14:53 +0100
>
> > Hello David,
> >
> > On Thu, 22 Mar 2018 14:47:09 -0400 (EDT),
> > David Miller <davem@davemloft.net> wrote :
> >
> >> From: Maxime Chevallier <maxime.chevallier@bootlin.com>
> >> Date: Wed, 21 Mar 2018 16:14:00 +0100
> >>
> >> In order to be an equivalent change you must bzero out this 'pe'
> >> object on the stack. You are only initializing the index member
> >> before passing it into other functions.
> >
> > I agree that this is unclear, but the functions I pass these
> > objects to only need the index field to be set, and will fill the
> > rest of the object according to the underlying HW representation
> > (these objects mirror the HW configuration).
> >
> > I can see that this is confusing, we might want to make the
> > mvpp2_prs_hw_read function more explicit about this.
> >
> > Would comments explaining this be enough, or should I try another
> > way to make this cleaner ?
>
> Please bzero the object as I have asked you to.
>
> Today the function doesn't care about any input members other than
> the index member, but in the future it might, and this is a bug
> waiting to happen.
Got it.
> It is never good to pass partially initialized variables into
> another piece of code.
Ok, I'll send another version with this.
Thanks for the review,
Maxime
^ permalink raw reply
* [PATCH] dpaa_eth: use true and false for boolean values
From: Gustavo A. R. Silva @ 2018-03-22 19:59 UTC (permalink / raw)
To: Madalin Bucur; +Cc: netdev, linux-kernel, Gustavo A. R. Silva
Assign true or false to boolean variables instead of an integer value.
This issue was detected with the help of Coccinelle.
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
index 85306d1..c7ea193 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
@@ -344,7 +344,7 @@ static void dpaa_get_ethtool_stats(struct net_device *net_dev,
/* gather congestion related counters */
cg_num = 0;
- cg_status = 0;
+ cg_status = false;
cg_time = jiffies_to_msecs(priv->cgr_data.congested_jiffies);
if (qman_query_cgr_congested(&priv->cgr_data.cgr, &cg_status) == 0) {
cg_num = priv->cgr_data.cgr_congested_count;
--
2.7.4
^ permalink raw reply related
* Re: [PATCH v2 net-next] virtio-net: Fix operstate for virtio when no VIRTIO_NET_F_STATUS
From: Michael S. Tsirkin @ 2018-03-22 20:07 UTC (permalink / raw)
To: Jay Vosburgh; +Cc: netdev, Jason Wang, David Miller, Ben Hutchings
In-Reply-To: <23981.1521729761@nyx>
On Thu, Mar 22, 2018 at 02:42:41PM +0000, Jay Vosburgh wrote:
> The operstate update logic will leave an interface in the
> default UNKNOWN operstate if the interface carrier state never changes
> from the default carrier up state set at creation. This includes the
> case of an explicit call to netif_carrier_on, as the carrier on to on
> transition has no effect on operstate.
>
> This affects virtio-net for the case that the virtio peer does
> not support VIRTIO_NET_F_STATUS (the feature that provides carrier state
> updates). Without this feature, the virtio specification states that
> "the link should be assumed active," so, logically, the operstate should
> be UP instead of UNKNOWN. This has impact on user space applications
> that use the operstate to make availability decisions for the interface.
>
> Resolve this by changing the virtio probe logic slightly to call
> netif_carrier_off for both the "with" and "without" VIRTIO_NET_F_STATUS
> cases, and then the existing call to netif_carrier_on for the "without"
> case will cause an operstate transition.
>
> Cc: "Michael S. Tsirkin" <mst@redhat.com>
> Cc: Jason Wang <jasowang@redhat.com>
> Cc: Ben Hutchings <ben@decadent.org.uk>
> Signed-off-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Arguably userspace apps should learn to treat UNKNOWN as UP.
But on the balance this seems more likely to fix apps than to
break any, so
Acked-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>
> I considered resolving this by changing linkwatch_init_dev to
> unconditionally call rfc2863_policy, as that would always set operstate
> for all interfaces.
>
> This would not have any impact on most cases (as most drivers
> call netif_carrier_off during probe), except for the loopback device,
> which currently has an operstate of UNKNOWN (because it never does any
> carrier state transitions). This change would add a round trip on the
> dev_base_lock for every loopback device creation, which could have a
> negative impact when creating many loopback devices, e.g., when
> concurrently creating large numbers of containers.
>
>
> drivers/net/virtio_net.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 23374603e4d9..7b187ec7411e 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -2857,8 +2857,8 @@ static int virtnet_probe(struct virtio_device *vdev)
>
> /* Assume link up if device can't report link status,
> otherwise get link status from config. */
> + netif_carrier_off(dev);
> if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
> - netif_carrier_off(dev);
> schedule_work(&vi->config_work);
> } else {
> vi->status = VIRTIO_NET_S_LINK_UP;
> --
> 2.14.1
^ permalink raw reply
* [PATCH] qed: Use true and false for boolean values
From: Gustavo A. R. Silva @ 2018-03-22 20:08 UTC (permalink / raw)
To: Ariel Elior, everest-linux-l2; +Cc: netdev, linux-kernel, Gustavo A. R. Silva
Assign true or false to boolean variables instead of an integer value.
This issue was detected with the help of Coccinelle.
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
drivers/net/ethernet/qlogic/qed/qed_dev.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 553a6d1..cdb3eec 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -298,8 +298,8 @@ static void qed_init_qm_params(struct qed_hwfn *p_hwfn)
qm_info->start_vport = (u8) RESC_START(p_hwfn, QED_VPORT);
/* rate limiting and weighted fair queueing are always enabled */
- qm_info->vport_rl_en = 1;
- qm_info->vport_wfq_en = 1;
+ qm_info->vport_rl_en = true;
+ qm_info->vport_wfq_en = true;
/* TC config is different for AH 4 port */
four_port = p_hwfn->cdev->num_ports_in_engine == MAX_NUM_PORTS_K2;
@@ -1276,9 +1276,9 @@ static int qed_hw_init_common(struct qed_hwfn *p_hwfn,
if (p_hwfn->mcp_info) {
if (p_hwfn->mcp_info->func_info.bandwidth_max)
- qm_info->pf_rl_en = 1;
+ qm_info->pf_rl_en = true;
if (p_hwfn->mcp_info->func_info.bandwidth_min)
- qm_info->pf_wfq_en = 1;
+ qm_info->pf_wfq_en = true;
}
memset(&params, 0, sizeof(params));
@@ -1630,7 +1630,7 @@ static int qed_vf_start(struct qed_hwfn *p_hwfn,
qed_vf_pf_tunnel_param_update(p_hwfn, p_params->p_tunn);
}
- p_hwfn->b_int_enabled = 1;
+ p_hwfn->b_int_enabled = true;
return 0;
}
--
2.7.4
^ permalink raw reply related
* Re: [PATCH v3 bpf-next 01/10] treewide: remove struct-pass-by-value from tracepoints arguments
From: Linus Torvalds @ 2018-03-22 20:19 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Steven Rostedt, David Miller, Daniel Borkmann, Peter Zijlstra,
Network Development, kernel-team, Linux API
In-Reply-To: <6b1952c4-1612-7abb-49c6-9bbaf6dc6997@fb.com>
On Thu, Mar 22, 2018 at 12:31 PM, Alexei Starovoitov <ast@fb.com> wrote:
>
> yeah. C doesn't allow casting of 'struct s { u64 var };' into u64
> without massive hacks and aliasing warnings by compiler.
Without massive hacks? Yeah, no. But without warnings? You can do it.
#define UINTTYPE(size) \
__typeof__(__builtin_choose_expr(size==1, (u8)1, \
__builtin_choose_expr(size==2, (u16)2, \
__builtin_choose_expr(size==4, (u32)3, \
__builtin_choose_expr(size==8, (u64)4, \
(void)5)))))
#define CAST_TO_U64(x) ({ \
typeof(x) __src = (x); \
UINTTYPE(sizeof(x)) __dst; \
memcpy(&__dst, &__src, sizeof(__dst)); \
(u64)__dst; })
Yeah, I'm not proud of the above, but gcc actually seems to do the
right thing for it. Doing
struct d {
unsigned char a,b;
};
it generates
movzwl %di, %eax
for the CAST_TO_U64() (the above *looks* like it only casts to 32-bit,
but it is actually a full cast to 64 bits because movzwl will also
clear the top bits of the register).
No warnings.
But is it "massively hacky"? You be the judge. Probably.
Going the other way is trivial, you just use that same UINTTYPE()
again and just the memcpy in reverse.
NOTE! The above obviously only works for things that are actually
proper nice easy sizes.
If you want to encode a 6-byte thing in a u64, the sanest thing is
likely to use a union, and just accept crap in some bytes of the u64
(and just expect it to be undone by the reversal operation). Honestly,
I'd suggest against it, because of just horrible code generation
and/or data leakage
Linus
^ permalink raw reply
* Re: [PATCH v2 iproute2-next 0/6] cm_id, cq, mr, and pd resource tracking
From: Steve Wise @ 2018-03-22 20:20 UTC (permalink / raw)
To: David Ahern, Doug Ledford, Leon Romanovsky, stephen; +Cc: netdev, linux-rdma
In-Reply-To: <fa254740-5cec-8a12-8f9e-2bec1242d140@gmail.com>
On 3/21/2018 11:59 AM, David Ahern wrote:
> On 3/20/18 11:21 AM, Doug Ledford wrote:
>> On 3/16/2018 12:18 PM, David Ahern wrote:
>>> On 3/13/18 1:58 PM, Doug Ledford wrote:
>>>> On Tue, 2018-03-13 at 13:45 -0700, David Ahern wrote:
>>>>> On 3/13/18 1:32 AM, Leon Romanovsky wrote:
>>>>>> On Mon, Mar 12, 2018 at 10:53:03AM -0700, David Ahern wrote:
>>>>>>> On 3/12/18 8:16 AM, Steve Wise wrote:
>>>>>>>> Hey all,
>>>>>>>>
>>>>>>>> The kernel side of this series has been merged for rdma-next [1]. Let me
>>>>>>>> know if this iproute2 series can be merged, or if it needs more changes.
>>>>>>>>
>>>>>>> The problem is that iproute2 headers are synced to kernel headers from
>>>>>>> DaveM's tree (net-next mainly). I take it this series will not appear in
>>>>>>> Dave's tree until after a merge through Linus' tree. Correct?
>>>>>> David,
>>>>>>
>>>>>> Technically, you are right, and we would like to ask you for an extra tweak
>>>>>> to the flow for the RDMAtool, because current scheme causes delays at least
>>>>>> cycle.
>>>>>>
>>>>>> Every RDMAtool's patchset which requires changes to headers is always
>>>>>> includes header patch, can you please accept those series and once you
>>>>>> are bringing new net-next headers from Linus, simply overwrite all our
>>>>>> headers?
>>>>> I did not follow the discussion back when this decision was made, so how
>>>>> did rdma tool end up in iproute2?
>>>> It is modeled after the ip command, and for better or worse, the
>>>> iproute2 package has become the standard drop box for low level kernel
>>>> network configuring tools. The RDMA subsystem may not be IP networking,
>>>> but it is still networking, so it seemed an appropriate fit.
>>> why doesn't the rdma tree go through Dave then?
>>>
>> Because it doesn't use the core network stack hardly at all. It creates
>> netdevs when it needs to bridge the two stacks, but otherwise the RDMA
>> subsystem core is apart and unique from the network stack Dave manages.
>> When I said it was networking, I meant it literally. The RDMA fabrics
>> are networks. It wasn't meant to imply that they shared anything
>> substantial in common with the typical Ethernet/IP networking that is
>> the core of what Dave manages.
>>
> I think the simplest approach is to move the uapi header under the rdma
> directory and you folks take ownership of that header.
Fine with me. Leon?
Steve.
^ permalink raw reply
* Re: [PATCH net-next] bridge: Allow max MTU when multiple VLANs present
From: Nikolay Aleksandrov @ 2018-03-22 20:21 UTC (permalink / raw)
To: Chas Williams, davem; +Cc: netdev, stephen
In-Reply-To: <20180322153406.17760-1-3chas3@gmail.com>
On 22/03/18 17:34, Chas Williams wrote:
> If the bridge is allowing multiple VLANs, some VLANs may have
> different MTUs. Instead of choosing the minimum MTU for the
> bridge interface, choose the maximum MTU of the bridge members.
> With this the user only needs to set a larger MTU on the member
> ports that are participating in the large MTU VLANS.
>
> Signed-off-by: Chas Williams <3chas3@gmail.com>
> ---
> net/bridge/br.c | 2 +-
> net/bridge/br_device.c | 2 +-
> net/bridge/br_if.c | 26 ++++++++++++++++++++++----
> net/bridge/br_private.h | 2 +-
> 4 files changed, 25 insertions(+), 7 deletions(-)
>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
^ permalink raw reply
* Re: [PATCH v2 iproute2-next 0/6] cm_id, cq, mr, and pd resource tracking
From: David Ahern @ 2018-03-22 20:26 UTC (permalink / raw)
To: Steve Wise, Doug Ledford, Leon Romanovsky, stephen; +Cc: netdev, linux-rdma
In-Reply-To: <882c6399-9b73-dc0f-8be3-abca624a3f78@opengridcomputing.com>
On 3/22/18 2:20 PM, Steve Wise wrote:
>> I think the simplest approach is to move the uapi header under the rdma
>> directory and you folks take ownership of that header.
> Fine with me. Leon?
Please make a patch moving the existing file and then make changes to it
in follow on patches.
^ permalink raw reply
* Re: [RFC v3 net-next 13/18] net/sched: Introduce the TBS Qdisc
From: Jesus Sanchez-Palencia @ 2018-03-22 20:25 UTC (permalink / raw)
To: Thomas Gleixner
Cc: netdev, jhs, xiyou.wangcong, jiri, vinicius.gomes, richardcochran,
anna-maria, henrik, John Stultz, levi.pearson, edumazet, willemb,
mlichvar
In-Reply-To: <alpine.DEB.2.21.1803211758140.3754@nanos.tec.linutronix.de>
Hi Thomas,
On 03/21/2018 03:29 PM, Thomas Gleixner wrote:
> On Wed, 21 Mar 2018, Thomas Gleixner wrote:
>> If you look at the use cases of TDM in various fields then FIFO mode is
>> pretty much useless. In industrial/automotive fieldbus applications the
>> various time slices are filled by different threads or even processes.
>
> That brings me to a related question. The TDM cases I'm familiar with which
> aim to use this utilize multiple periodic time slices, aka 802.1Qbv
> time-aware scheduling.
>
> Simple example:
>
> [1a][1b][1c][1d] [1a][1b][1c][1d] [.....
> [2a][2b] [2c][2d]
> [3a] [3b]
> [4a] [4b]
> ----------------------------------------------------------------------> t
>
> where 1-4 is the slice level and a-d are network nodes.
>
> In most cases the slice levels on a node are handled by different
> applications or threads. Some of the protocols utilize dedicated time slice
> levels - lets assume '4' in the above example - to run general network
> traffic which might even be allowed to have collisions, i.e. [4a-d] would
> become [4] and any node can send; the involved components like switches are
> supposed to handle that.
>
> I'm not seeing how TBS is going to assist with any of that. It requires
> everything to be handled at the application level. Not really useful
> especially not for general traffic which does not know about the scheduling
> bands at all.
>
> If you look at an industrial control node. It basically does:
>
> queue_first_packet(tx, slice1);
> while (!stop) {
> if (wait_for_packet(rx) == ERROR)
> goto errorhandling;
> tx = do_computation(rx);
> queue_next_tx(tx, slice1);
> }
>
> that's a pretty common pattern for these kind of applications. For audio
> sources queue_next() might be triggered by the input sampler which needs to
> be synchronized to the network slices anyway in order to work properly.
>
> TBS per current implementation is nice as a proof of concept, but it solves
> just a small portion of the complete problem space. I have the suspicion
> that this was 'designed' to replace the user space hack in the AVNU stack
> with something close to it. Not really a good plan to be honest.
>
> I think what we really want is a strict periodic scheduler which supports
> multiple slices as shown above because that's what all relevant TDM use
> cases need: A/V, industrial fieldbusses .....
>
> |---------------------------------------------------------|
> | |
> | TAS |<- Config
> | 1 2 3 4 |
> |---------------------------------------------------------|
> | | | |
> | | | |
> | | | |
> | | | |
> [DirectSocket] [Qdisc FIFO] [Qdisc Prio] [Qdisc FIFO]
> | | |
> | | |
> [Socket] [Socket] [General traffic]
>
>
> The interesting thing here is that it does not require any time stamp
> information brought in from the application. That's especially good for
> general network traffic which is routed through a dedicated time slot. If
> we don't have that then we need a user space scheduler which does exactly
> the same thing and we have to route the general traffic out to user space
> and back into the kernel, which is obviously a pointless exercise.
>
> There are all kind of TDM schemes out there which are not directly driven
> by applications, but rather route categorized traffic like VLANs through
> dedicated time slices. That works pretty well with the above scheme because
> in that case the applications might be completely oblivious about the tx
> time schedule.
>
> Surely there are protocols which do not utilize every time slice they could
> use, so we need a way to tell the number of empty slices between two
> consecutive packets. There are also different policies vs. the unused time
> slices, like sending dummy frames or just nothing which wants to be
> addressed, but I don't think that changes the general approach.
>
> There might be some special cases for setup or node hotplug, but the
> protocols I'm familiar with handle these in dedicated time slices or
> through general traffic so it should just fit in.
>
> I'm surely missing some details, but from my knowledge about the protocols
> which want to utilize this, the general direction should be fine.
>
> Feel free to tell me that I'm missing the point completely though :)
>
> Thoughts?
We agree with most of the above. :)
Actually, last year Vinicius shared our ideas for a "time-aware priority" root
qdisc as part of the cbs RFC cover letter, dubbed 'taprio':
https://patchwork.ozlabs.org/cover/808504/
Our plan was to work directly with the Qbv-like scheduling (per-port) just after
the cbs qdisc (Qav), but the feedback here and offline was that there were use
cases for a more simplistic launchtime approach (per-queue) as well. We've
decided to invest on it first (and postpone the 'taprio' qdisc until there was
NIC available with HW support for it, basically).
You are right, and we agree, that using tbs for a per-port schedule of any sort
will require a SW scheduler to be developed on top of it, but we've never said
the contrary either. Our vision has always been that these are separate
mechanisms with different use-cases, so we do see the value for the kernel to
provide both.
In other words, tbs is not the final solution for Qbv, and we agree that a 'TAS'
qdisc is still necessary. And due to the wide range of applications and hw being
used for those out there, we need both especially given that one does not block
the other.
What do you think?
Thanks,
Jesus
^ permalink raw reply
* Re: [RFC v3 net-next 13/18] net/sched: Introduce the TBS Qdisc
From: Jesus Sanchez-Palencia @ 2018-03-22 20:29 UTC (permalink / raw)
To: Thomas Gleixner
Cc: netdev, jhs, xiyou.wangcong, jiri, vinicius.gomes, richardcochran,
anna-maria, henrik, john.stultz, levi.pearson, edumazet, willemb,
mlichvar
In-Reply-To: <alpine.DEB.2.21.1803211407520.3754@nanos.tec.linutronix.de>
Hi Thomas,
On 03/21/2018 06:46 AM, Thomas Gleixner wrote:
> On Tue, 6 Mar 2018, Jesus Sanchez-Palencia wrote:
>> +struct tbs_sched_data {
>> + bool sorting;
>> + int clockid;
>> + int queue;
>> + s32 delta; /* in ns */
>> + ktime_t last; /* The txtime of the last skb sent to the netdevice. */
>> + struct rb_root head;
>
> Hmm. You are reimplementing timerqueue open coded. Have you checked whether
> you could reuse the timerqueue implementation?
>
> That requires to add a timerqueue node to struct skbuff
>
> @@ -671,7 +671,8 @@ struct sk_buff {
> unsigned long dev_scratch;
> };
> };
> - struct rb_node rbnode; /* used in netem & tcp stack */
> + struct rb_node rbnode; /* used in netem & tcp stack */
> + struct timerqueue_node tqnode;
> };
> struct sock *sk;
>
> Then you can use timerqueue_head in your scheduler data and all the open
> coded rbtree handling goes away.
Yes, you are right. We actually looked into that for the first prototype of this
qdisc but we weren't so sure about adding the timerqueue node to the sk_buff's
union and whether it would impact the other usages here, but looking again now
and it looks fine.
We'll fix for the next version, thanks.
>
>> +static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb)
>> +{
>> + struct tbs_sched_data *q = qdisc_priv(sch);
>> + ktime_t txtime = nskb->tstamp;
>> + struct sock *sk = nskb->sk;
>> + ktime_t now;
>> +
>> + if (sk && !sock_flag(sk, SOCK_TXTIME))
>> + return false;
>> +
>> + /* We don't perform crosstimestamping.
>> + * Drop if packet's clockid differs from qdisc's.
>> + */
>> + if (nskb->txtime_clockid != q->clockid)
>> + return false;
>> +
>> + now = get_time_by_clockid(q->clockid);
>
> If you store the time getter function pointer in tbs_sched_data then you
> avoid the lookup and just can do
>
> now = q->get_time();
>
> That applies to lots of other places.
Good idea, thanks. Will fix.
>> +
>> +static struct sk_buff *tbs_peek_timesortedlist(struct Qdisc *sch)
>> +{
>> + struct tbs_sched_data *q = qdisc_priv(sch);
>> + struct rb_node *p;
>> +
>> + p = rb_first(&q->head);
>
> timerqueue gives you direct access to the first expiring entry w/o walking
> the rbtree. So that would become:
>
> p = timerqueue_getnext(&q->tqhead);
> return p ? rb_to_skb(p) : NULL;
OK.
(...)
>> +static struct sk_buff *tbs_dequeue_scheduledfifo(struct Qdisc *sch)
>> +{
>> + struct tbs_sched_data *q = qdisc_priv(sch);
>> + struct sk_buff *skb = tbs_peek(sch);
>> + ktime_t now, next;
>> +
>> + if (!skb)
>> + return NULL;
>> +
>> + now = get_time_by_clockid(q->clockid);
>> +
>> + /* Drop if packet has expired while in queue and the drop_if_late
>> + * flag is set.
>> + */
>> + if (skb->tc_drop_if_late && ktime_before(skb->tstamp, now)) {
>> + struct sk_buff *to_free = NULL;
>> +
>> + qdisc_queue_drop_head(sch, &to_free);
>> + kfree_skb_list(to_free);
>> + qdisc_qstats_overlimit(sch);
>> +
>> + skb = NULL;
>> + goto out;
>
> Instead of going out immediately you should check the next skb whether its
> due for sending already.
We wanted to have a baseline before starting with the optimizations, so we left
this for a later patchset. It was one of the opens we had listed on the v2 cover
letter IIRC, but we'll look into it.
(...)
>> + }
>> +
>> + next = ktime_sub_ns(skb->tstamp, q->delta);
>> +
>> + /* Dequeue only if now is within the [txtime - delta, txtime] range. */
>> + if (ktime_after(now, next))
>> + timesortedlist_erase(sch, skb, false);
>> + else
>> + skb = NULL;
>> +
>> +out:
>> + /* Now we may need to re-arm the qdisc watchdog for the next packet. */
>> + reset_watchdog(sch);
>> +
>> + return skb;
>> +}
>> +
>> +static inline void setup_queueing_mode(struct tbs_sched_data *q)
>> +{
>> + if (q->sorting) {
>> + q->enqueue = tbs_enqueue_timesortedlist;
>> + q->dequeue = tbs_dequeue_timesortedlist;
>> + q->peek = tbs_peek_timesortedlist;
>> + } else {
>> + q->enqueue = tbs_enqueue_scheduledfifo;
>> + q->dequeue = tbs_dequeue_scheduledfifo;
>> + q->peek = qdisc_peek_head;
>
> I don't see the point of these two modes and all the duplicated code it
> involves.
>
> FIFO mode limits usage to a single thread which has to guarantee that the
> packets are queued in time order.
>
> If you look at the use cases of TDM in various fields then FIFO mode is
> pretty much useless. In industrial/automotive fieldbus applications the
> various time slices are filled by different threads or even processes.
>
> Sure, the rbtree queue/dequeue has overhead compared to a simple linked
> list, but you pay for that with more indirections and lots of mostly
> duplicated code. And in the worst case one of these code paths is going to
> be rarely used and prone to bitrot.
Our initial version (on RFC v2) was performing the sorting for all modes. After
all the feedback we got we decided to make it optional and provide FIFO modes as
well. For the SW fallback we need the scheduled FIFO, and for "pure" hw offload
we need the "raw" FIFO.
This was a way to accommodate all the use cases without imposing too much of a
burden onto anyone, regardless of their application's segment (i.e. industrial,
pro a/v, automotive, etc).
Having the sorting always enabled requires that a valid static clockid is passed
to the qdisc. For the hw offload mode, that means that the PHC and one of the
system clocks must be synchronized since hrtimers do not support dynamic clocks.
Not all systems do that or want to, and given that we do not want to perform
crosstimestamping between the packets' clock reference and the qdisc's one, the
only solution for these systems would be using the raw hw offload mode.
Thanks,
Jesus
^ permalink raw reply
* [GIT] Networking
From: David Miller @ 2018-03-22 20:32 UTC (permalink / raw)
To: torvalds; +Cc: akpm, netdev, linux-kernel
1) Always validate XFRM esn replay attribute, from Florian Westphal.
2) Fix RCU read lock imbalance in xfrm_get_tos(), from Xin Long.
3) Don't try to get firmware dump if not loaded in iwlwifi, from
Shaul Triebitz.
4) Fix BPF helpers to deal with SCTP GSO SKBs properly, from Daniel
Axtens.
5) Fix some interrupt handling issues in e1000e driver, from Benjamin
Poitier.
6) Use strlcpy() in several ethtool get_strings methods, from Florian
Fainelli.
7) Fix rhlist dup insertion, from Paul Blakey.
8) Fix SKB leak in netem packet scheduler, from Alexey Kodanev.
9) Fix driver unload crash when link is up in smsc911x, from Jeremy
Linton.
10) Purge out invalid socket types in l2tp_tunnel_create(), from
Eric Dumazet.
11) Need to purge the write queue when TCP connections are aborted,
otherwise userspace using MSG_ZEROCOPY can't close the fd.
From Soheil Hassas Yeganeh.
12) Fix double free in error path of team driver, from Arkadi
Sharshevsky.
13) Filter fixes for hv_netvsc driver, from Stephen Hemminger.
14) Fix non-linear packet access in ipv6 ndisc code, from Lorenzo
Bianconi.
15) Properly filter out unsupported feature flags in macvlan driver,
from Shannon Nelson.
16) Don't request loading the diag module for a protocol if the
protocol itself is not even registered. From Xin Long.
17) If datagram connect fails in ipv6, make sure the socket state
is consistent afterwards. From Paolo Abeni.
18) Use after free in qed driver, from Dan Carpenter.
19) If received ipv4 PMTU is less than the min pmtu, lock the mtu
in the entry. From Sabrina Dubroca.
20) Fix sleep in atomic in tg3 driver, from Jonathan Toppins.
21) Fix vlan in vlan untagging in some situations, from Toshiaki
Makita.
22) Fix double SKB free in genlmsg_mcast(). From Nicolas Dichtel.
23) Fix NULL derefs in error paths of tcf_*_init(), from Davide
Caratti.
24) Unbalanced PM runtime calls in FEC driver, from Florian Fainelli.
25) Memory leak in gemini driver, from Igor Pylypiv.
26) IDR leaks in error paths of tcf_*_init() functions, from Davide
Caratti.
27) Need to use GFP_ATOMIC in seg6_build_state(), from David Lebrun.
28) Missing dev_put() in error path of macsec_newlink(), from
Dan Carpenter.
Please pull, thanks a lot!
The following changes since commit ce380619fab99036f5e745c7a865b21c59f005f6:
Merge tag 'please-pull-ia64_misc' of git://git.kernel.org/pub/scm/linux/kernel/git/aegl/linux (2018-03-05 20:31:14 -0800)
are available in the Git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git
for you to fetch changes up to 5dcd8400884cc4a043a6d4617e042489e5d566a9:
macsec: missing dev_put() on error in macsec_newlink() (2018-03-22 14:30:36 -0400)
----------------------------------------------------------------
Alexander Potapenko (1):
vhost_net: initialize rx_ring in vhost_net_open()
Alexey Kodanev (2):
sch_netem: fix skb leak in netem_enqueue()
dccp: check sk for closed state in dccp_sendmsg()
Andrei Otcheretianski (1):
iwlwifi: mvm: Fix channel switch for count 0 and 1
Andrew Lunn (1):
net: dsa: mv88e6xxx: Fix binding documentation for MDIO busses
Andrew Zaborowski (1):
mac80211_hwsim: Set wmediumd for new radios
Andri Yngvason (3):
can: cc770: Fix stalls on rt-linux, remove redundant IRQ ack
can: cc770: Fix queue stall & dropped RTR reply
can: cc770: Fix use after free in cc770_tx_interrupt()
Arend Van Spriel (2):
brcmfmac: add possibility to obtain firmware error
brcmfmac: fix P2P_DEVICE ethernet address generation
Arkadi Sharshevsky (2):
team: Fix double free in error path
devlink: Remove redundant free on error path
Arvind Yadav (1):
net/iucv: Free memory obtained by kzalloc
Ben Caradoc-Davies (1):
mac80211: add ieee80211_hw flag for QoS NDP support
Benjamin Poirier (7):
e1000e: Remove Other from EIAC
Partial revert "e1000e: Avoid receiver overrun interrupt bursts"
e1000e: Fix queue interrupt re-raising in Other interrupt
e1000e: Avoid missed interrupts following ICR read
e1000e: Fix check_for_link return value with autoneg off
Revert "e1000e: Separate signaling for link check/link up"
e1000e: Fix link check race condition
Bich HEMON (1):
can: m_can: select pinctrl state in each suspend/resume function
Brad Mouring (1):
net: phy: Tell caller result of phy_change()
Camelia Groza (3):
dpaa_eth: remove duplicate initialization
dpaa_eth: increment the RX dropped counter when needed
dpaa_eth: remove duplicate increment of the tx_errors counter
Cathy Zhou (1):
sunvnet: does not support GSO for sctp
Chenbo Feng (1):
bpf: skip unnecessary capability check
Christophe JAILLET (1):
net: ethernet: arc: Fix a potential memory leak if an optional regulator is deferred
Colin Ian King (2):
bnx2x: fix spelling mistake: "registeration" -> "registration"
qede: fix spelling mistake: "registeration" -> "registration"
Dan Carpenter (2):
qed: Use after free in qed_rdma_free()
macsec: missing dev_put() on error in macsec_newlink()
Daniel Axtens (3):
bpf: fix bpf_skb_adjust_net/bpf_skb_proto_xlat to deal with gso sctp skbs
docs: segmentation-offloads.txt: Correct TCP gso_types
net: use skb_is_gso_sctp() instead of open-coding
Daniel Borkmann (2):
bpf, x64: increase number of passes
kbuild: disable clang's default use of -fmerge-all-constants
David Ahern (2):
net: Only honor ifindex in IP_PKTINFO if non-0
net/ipv6: Handle onlink flag with multipath routes
David Lebrun (2):
ipv6: sr: fix scheduling in RCU when creating seg6 lwtunnel state
ipv6: sr: fix NULL pointer dereference when setting encap source address
David S. Miller (32):
Merge branch 'net-Use-strlcpy-for-ethtool-get_strings'
Merge branch 'rhltable-dups'
Merge branch 'for-upstream' of git://git.kernel.org/.../bluetooth/bluetooth
Merge branch '1GbE' of git://git.kernel.org/.../jkirsher/net-queue
Merge git://git.kernel.org/.../bpf/bpf
Merge tag 'wireless-drivers-for-davem-2018-03-08' of git://git.kernel.org/.../kvalo/wireless-drivers
Merge branch 'hv_netvsc-fix-multicast-flags-and-sync'
Merge branch 'vhost_net-ptr_ring-fixes'
Merge branch 'mlxsw-ACL-and-mirroring-fixes'
Merge branch 'erspan-fixes'
Merge branch 'bnxt_en-Bug-fixes'
Merge tag 'linux-can-fixes-for-4.16-20180312' of ssh://gitolite.kernel.org/.../mkl/linux-can
Merge git://git.kernel.org/.../pablo/nf
Merge branch 'l2tp-fix-races-with-ipv4-mapped-ipv6-addresses'
Merge branch '1GbE' of git://git.kernel.org/.../jkirsher/net-queue
Merge branch 'master' of git://git.kernel.org/.../klassert/ipsec
Merge branch 'DPAA-Ethernet-fixes'
Merge tag 'linux-can-fixes-for-4.16-20180314' of ssh://gitolite.kernel.org/.../mkl/linux-can
Merge branch 'vlan-untag-and-insert-fixes'
Merge branch 'qed-iWARP-related-fixes'
Merge branch 'tcf_foo_init-NULL-deref'
Merge branch 'phy-relax-error-checking'
Merge branch 'for-upstream' of git://git.kernel.org/.../bluetooth/bluetooth
Merge tag 'linux-can-fixes-for-4.16-20180319' of ssh://gitolite.kernel.org/.../mkl/linux-can
Merge git://git.kernel.org/.../bpf/bpf
Merge branch 'net-sched-action-idr-leak'
Merge tag 'batadv-net-for-davem-20180319' of git://git.open-mesh.org/linux-merge
Merge branch 'net-phy-Add-general-dummy-stubs-for-MMD-register-access'
Merge branch 's390-qeth-fixes'
Merge branch 'aquantia-fixes'
Merge branch 'hv_netvsc-fix-races-during-shutdown-and-changes'
Merge tag 'mac80211-for-davem-2018-03-21' of git://git.kernel.org/.../jberg/mac80211
Davide Caratti (12):
net/sched: fix NULL dereference in the error path of tcf_vlan_init()
net/sched: fix NULL dereference in the error path of tcf_csum_init()
net/sched: fix NULL dereference in the error path of tunnel_key_init()
net/sched: fix NULL dereference in the error path of tcf_sample_init()
net/sched: fix NULL dereference on the error path of tcf_skbmod_init()
net/sched: fix idr leak on the error path of tcf_bpf_init()
net/sched: fix idr leak in the error path of tcf_simp_init()
net/sched: fix idr leak in the error path of tcf_act_police_init()
net/sched: fix idr leak in the error path of tcp_pedit_init()
net/sched: fix idr leak in the error path of __tcf_ipt_init()
net/sched: fix idr leak in the error path of tcf_vlan_init()
net/sched: fix idr leak in the error path of tcf_skbmod_init()
Denis Kirjanov (1):
fsl/fman: avoid sleeping in atomic context while adding an address
Eddie Wai (1):
bnxt_en: Fix vnic accounting in the bnxt_check_rings() path.
Emmanuel Grumbach (1):
iwlwifi: pcie: don't warn if we use all the transmit pointers
Eric Dumazet (5):
net: usbnet: fix potential deadlock on 32bit hosts
l2tp: do not accept arbitrary sockets
ieee802154: 6lowpan: fix possible NULL deref in lowpan_device_event()
net: use skb_to_full_sk() in skb_update_prio()
net: sched: fix uses after free
Florian Fainelli (8):
net: dsa: b53: Use strlcpy() for ethtool::get_strings
net: phy: marvell: Use strlcpy() for ethtool::get_strings
net: phy: micrel: Use strlcpy() for ethtool::get_strings
net: phy: broadcom: Use strlcpy() for ethtool::get_strings
net: dsa: Fix dsa_is_user_port() test inversion
net: systemport: Rewrite __bcm_sysport_tx_reclaim()
net: fec: Fix unbalanced PM runtime calls
net: dsa: Fix functional dsa-loop dependency on FIXED_PHY
Florian Westphal (4):
xfrm_user: uncoditionally validate esn replay attribute struct
netfilter: ebtables: fix erroneous reject of last rule
netfilter: x_tables: add and use xt_check_proc_name
netfilter: bridge: ebt_among: add more missing match size checks
Ganesh Goudar (2):
cxgb4: copy adap index to PF0-3 adapter instances
cxgb4: do not set needs_free_netdev for mgmt dev's
Geert Uytterhoeven (1):
dt-bindings: net: renesas-ravb: Make stream buffer optional
Greg Hackmann (1):
net: xfrm: use preempt-safe this_cpu_read() in ipcomp_alloc_tfms()
Grygorii Strashko (2):
sysfs: symlink: export sysfs_create_link_nowarn()
net: phy: relax error checking when creating sysfs link netdev->phydev
Guillaume Nault (1):
ppp: avoid loop in xmit recursion detection code
Haim Dreyfuss (1):
iwlwifi: Cancel and set MARKER_CMD timer during suspend-resume
Hans de Goede (3):
Bluetooth: btusb: Remove Yoga 920 from the btusb_needs_reset_resume_table
Revert "Bluetooth: hci_bcm: Streamline runtime PM code"
Bluetooth: hci_bcm: Set pulsed_host_wake flag in sleep parameters
Hemanth Puranik (1):
net: qcom/emac: Use proper free methods during TX
Ido Schimmel (1):
mlxsw: spectrum_buffers: Set a minimum quota for CPU port traffic
Igor Pylypiv (2):
vmxnet3: remove unused flag "rxcsum" from struct vmxnet3_adapter
net: gemini: fix memory leak
Igor Russkikh (7):
net: aquantia: Fix hardware reset when SPI may rarely hangup
net: aquantia: Fix a regression with reset on old firmware
net: aquantia: Change inefficient wait loop on fw data reads
net: aquantia: Add tx clean budget and valid budget handling logic
net: aquantia: Allow live mac address changes
net: aquantia: Implement pci shutdown callback
net: aquantia: driver version bump
Ilan Peer (2):
iwlwifi: mvm: Direct multicast frames to the correct station
iwlwifi: mvm: Correctly set the tid for mcast queue
Jason Wang (2):
vhost_net: keep private_data and rx_ring synced
vhost_net: examine pointer types during un-producing
Jeremy Linton (1):
net: smsc911x: Fix unload crash when link is up
Jiri Benc (1):
tools: bpftool: fix compilation with older headers
Jiri Pirko (1):
mlxsw: spectrum: Fix gact_ok offloading
Johannes Berg (1):
ath9k_htc: use non-QoS NDP for AP probing
Jonathan Toppins (1):
tg3: prevent scheduling while atomic splat
Julian Wiedmann (4):
s390/qeth: free netdevice when removing a card
s390/qeth: when thread completes, wake up all waiters
s390/qeth: lock read device while queueing next buffer
s390/qeth: on channel error, reject further cmd requests
Kai-Heng Feng (1):
Bluetooth: btusb: Add Dell OptiPlex 3060 to btusb_needs_reset_resume_table
Kalle Valo (2):
Merge tag 'iwlwifi-for-kalle-2018-02-16-2' of git://git.kernel.org/.../iwlwifi/iwlwifi-fixes
Merge tag 'iwlwifi-for-kalle-2018-03-02' of git://git.kernel.org/.../iwlwifi/iwlwifi-fixes
Kevin Hao (3):
net: phy: Add general dummy stubs for MMD register access
net: phy: realtek: Use the dummy stubs for MMD register access for rtl8211b
net: phy: micrel: Use the general dummy stubs for MMD register access
Kirill Tkhai (1):
net: Fix hlist corruptions in inet_evict_bucket()
Kunihiko Hayashi (1):
net: ethernet: ave: enable Rx drop interrupt
Larry Finger (1):
rtlwifi: rtl8723be: Fix loss of signal
Linus Lüssing (1):
batman-adv: Fix multicast packet loss with a single WANT_ALL_IPV4/6 flag
Lorenzo Bianconi (1):
ipv6: fix access to non-linear packet in ndisc_fill_redirect_hdr_option()
Madalin Bucur (2):
soc/fsl/qbman: fix issue in qman_delete_cgr_safe()
dpaa_eth: fix error in dpaa_remove()
Marek Vasut (2):
can: ifi: Check core revision upon probe
can: ifi: Repair the error handling
Masami Hiramatsu (1):
error-injection: Fix to prohibit jump optimization
Matthias Brugger (1):
net: hns: Fix ethtool private flags
Matthias Schiffer (2):
batman-adv: update data pointers after skb_cow()
batman-adv: fix header size check in batadv_dbg_arp()
Michael Chan (4):
bnxt_en: Refactor the functions to reserve hardware rings.
bnxt_en: Pass complete VLAN TCI to the stack.
bnxt_en: Fix regressions when setting up MQPRIO TX rings.
bnxt_en: Check valid VNIC ID in bnxt_hwrm_vnic_set_tpa().
Michal Kalderon (4):
qed: Free RoCE ILT Memory on rmmod qedr
qed: Fix MPA unalign flow in case header is split across two packets.
qed: Fix non TCP packets should be dropped on iWARP ll2 connection
qede: Fix qedr link update
Naftali Goldstein (1):
iwlwifi: mvm: always init rs with 20mhz bandwidth rates
Nicolas Dichtel (1):
netlink: avoid a double skb free in genlmsg_mcast()
Pablo Neira Ayuso (2):
netfilter: nft_set_hash: skip fixed hash if timeout is specified
netfilter: nf_tables: release flowtable hooks
Paolo Abeni (2):
net: ipv6: keep sk status consistent after datagram connect failure
l2tp: fix races with ipv4-mapped ipv6 addresses
Paul Blakey (2):
rhashtable: Fix rhlist duplicates insertion
test_rhashtable: add test case for rhltable with duplicate objects
Paul Moore (1):
net: don't unnecessarily load kernel modules in dev_ioctl()
Petr Machata (1):
mlxsw: spectrum: Prevent duplicate mirrors
Pierre-Yves Kerbrat (1):
e1000e: allocate ring descriptors with dma_zalloc_coherent
Randy Dunlap (1):
ethernet: natsemi: correct spelling
Roman Mashak (1):
net sched actions: return explicit error when tunnel_key mode is not specified
Ronak Doshi (2):
vmxnet3: avoid xmit reset due to a race in vmxnet3
vmxnet3: use correct flag to indicate LRO feature
SZ Lin (林上智) (1):
net: ethernet: ti: cpsw: add check for in-band mode setting with RGMII PHY interface
Sabrina Dubroca (1):
ipv4: lock mtu in fnhe when received PMTU < net.ipv4.route.min_pmtu
Sara Sharon (5):
iwlwifi: mvm: fix security bug in PN checking
iwlwifi: mvm: fix IBSS for devices that support station type API
iwlwifi: mvm: fix TX of CCMP 256
iwlwifi: mvm: fix assert 0x2B00 on older FWs
iwlwifi: mvm: fix "failed to remove key" message
Shannon Nelson (1):
macvlan: filter out unsupported feature flags
Shaul Triebitz (2):
iwlwifi: align timestamp cancel with timestamp start
iwlwifi: avoid collecting firmware dump if not loaded
Soheil Hassas Yeganeh (1):
tcp: purge write queue upon aborting the connection
Sriharsha Basavapatna (1):
bnxt_en: Remove unwanted ovs-offload messages in some conditions
Stefano Brivio (2):
ipv6: Reflect MTU changes on PMTU of exceptions for MTU-less routes
ipv6: old_dport should be a __be16 in __ip6_datagram_connect()
Steffen Klassert (4):
xfrm: Refuse to insert 32 bit userspace socket policies on 64 bit systems
xfrm: Fix policy hold queue after flowcache removal.
xfrm: Fix infinite loop in xfrm_get_dst_nexthop with transport mode.
xfrm: Fix ESN sequence number handling for IPsec GSO packets.
Stephane Grosjean (2):
can: peak/pcie_fd: fix echo_skb is occupied! bug
can: peak/pcie_fd: remove useless code when interface starts
Stephen Hemminger (8):
hv_netvsc: fix filter flags
hv_netvsc: avoid repeated updates of packet filter
hv_netvsc: fix locking for rx_mode
hv_netvsc: fix locking during VF setup
hv_netvsc: disable NAPI before channel close
hv_netvsc: use RCU to fix concurrent rx and queue changes
hv_netvsc: change GPAD teardown order on older versions
hv_netvsc: common detach logic
Sven Eckelmann (2):
batman-adv: Add missing include for EPOLL* constants
batman-adv: Fix skbuff rcsum on packet reroute
Szymon Janc (1):
Bluetooth: Fix missing encryption refresh on Security Request
Takashi Iwai (1):
Bluetooth: btusb: Fix quirk for Atheros 1525/QCA6174
Thadeu Lima de Souza Cascardo (1):
test_bpf: Fix testing with CONFIG_BPF_JIT_ALWAYS_ON=y on other arches
Tom Herbert (1):
kcm: lock lower socket in kcm_attach
Toshiaki Makita (2):
net: Fix vlan untag for bridge and vlan_dev with reorder_hdr off
vlan: Fix out of order vlan headers with reorder header off
Ulf Magnusson (1):
iwlwifi: fix malformed CONFIG_IWLWIFI_PCIE_RTPM default
Ursula Braun (1):
net/smc: simplify wait when closing listen socket
Venkat Duvvuru (2):
bnxt_en: Return standard Linux error codes for hwrm flow cmds.
bnxt_en: close & open NIC, only when the interface is in running state.
Vinicius Costa Gomes (1):
skbuff: Fix not waking applications when errors are enqueued
William Tu (3):
ip6gre: add erspan v2 to tunnel lookup
ip6erspan: improve error handling for erspan version number.
ip6erspan: make sure enough headroom at xmit.
Wolfram Sang (1):
can: m_can: change comparison to bitshift when dealing with a mask
Xin Long (3):
xfrm: reuse uncached_list to track xdsts
xfrm: do not call rcu_read_unlock when afinfo is NULL in xfrm_get_tos
sock_diag: request _diag module only when the family or proto has been registered
Yonghong Song (1):
trace/bpf: remove helper bpf_perf_prog_read_value from tracepoint type programs
Yossi Kuperman (1):
xfrm: Verify MAC header exists before overwriting eth_hdr(skb)->h_proto
zhangliping (1):
openvswitch: meter: fix the incorrect calculation of max delta_t
Documentation/devicetree/bindings/net/dsa/marvell.txt | 48 ++++++-------
Documentation/devicetree/bindings/net/renesas,ravb.txt | 6 +-
Documentation/networking/segmentation-offloads.txt | 18 +++--
Makefile | 9 +++
arch/x86/net/bpf_jit_comp.c | 3 +-
drivers/bluetooth/btusb.c | 8 +--
drivers/bluetooth/hci_bcm.c | 13 ++--
drivers/net/can/cc770/cc770.c | 100 ++++++++++++++++-----------
drivers/net/can/cc770/cc770.h | 2 +
drivers/net/can/ifi_canfd/ifi_canfd.c | 75 +++++++++++++--------
drivers/net/can/m_can/m_can.c | 7 +-
drivers/net/can/peak_canfd/peak_canfd.c | 25 +++----
drivers/net/can/peak_canfd/peak_pciefd_main.c | 8 ++-
drivers/net/dsa/Makefile | 5 +-
drivers/net/dsa/b53/b53_common.c | 4 +-
drivers/net/ethernet/8390/Kconfig | 2 +-
drivers/net/ethernet/aquantia/atlantic/aq_cfg.h | 2 +
drivers/net/ethernet/aquantia/atlantic/aq_nic.c | 22 ++++++
drivers/net/ethernet/aquantia/atlantic/aq_nic.h | 1 +
drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c | 15 +++++
drivers/net/ethernet/aquantia/atlantic/aq_ring.c | 7 +-
drivers/net/ethernet/aquantia/atlantic/aq_ring.h | 2 +-
drivers/net/ethernet/aquantia/atlantic/aq_vec.c | 11 ++-
drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c | 66 ++++++++++++------
drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h | 1 +
drivers/net/ethernet/aquantia/atlantic/ver.h | 2 +-
drivers/net/ethernet/arc/emac_rockchip.c | 6 +-
drivers/net/ethernet/broadcom/bcmsysport.c | 33 +++++----
drivers/net/ethernet/broadcom/bcmsysport.h | 2 +-
drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 2 +-
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 180 ++++++++++++++++++++++++++-----------------------
drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 +
drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c | 33 ++++++---
drivers/net/ethernet/broadcom/tg3.c | 2 +-
drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 3 +-
drivers/net/ethernet/cortina/gemini.c | 1 +
drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 8 +--
drivers/net/ethernet/freescale/fec_main.c | 2 +
drivers/net/ethernet/freescale/fman/fman_dtsec.c | 2 +-
drivers/net/ethernet/hisilicon/hns/hns_dsaf_gmac.c | 2 +-
drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.c | 2 +-
drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c | 2 +-
drivers/net/ethernet/hisilicon/hns/hns_ethtool.c | 4 +-
drivers/net/ethernet/intel/e1000e/defines.h | 21 +++++-
drivers/net/ethernet/intel/e1000e/ich8lan.c | 42 ++++++------
drivers/net/ethernet/intel/e1000e/mac.c | 25 +++----
drivers/net/ethernet/intel/e1000e/netdev.c | 37 ++++------
drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c | 11 +++
drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h | 1 +
drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 28 ++++++--
drivers/net/ethernet/mellanox/mlxsw/spectrum.h | 4 ++
drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c | 5 ++
drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c | 12 ++--
drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c | 2 +-
drivers/net/ethernet/natsemi/Kconfig | 6 +-
drivers/net/ethernet/natsemi/Makefile | 2 +-
drivers/net/ethernet/qlogic/qed/qed_cxt.c | 5 +-
drivers/net/ethernet/qlogic/qed/qed_iwarp.c | 17 ++++-
drivers/net/ethernet/qlogic/qed/qed_rdma.c | 1 +
drivers/net/ethernet/qlogic/qede/qede_main.c | 8 +--
drivers/net/ethernet/qlogic/qede/qede_ptp.c | 2 +-
drivers/net/ethernet/qualcomm/emac/emac-mac.c | 23 ++++---
drivers/net/ethernet/smsc/smsc911x.c | 4 +-
drivers/net/ethernet/socionext/sni_ave.c | 2 +-
drivers/net/ethernet/sun/sunvnet.c | 2 +-
drivers/net/ethernet/ti/cpsw.c | 3 +-
drivers/net/hyperv/hyperv_net.h | 2 +-
drivers/net/hyperv/netvsc.c | 52 +++++++--------
drivers/net/hyperv/netvsc_drv.c | 293 ++++++++++++++++++++++++++++++++++++++++++++++----------------------------------
drivers/net/hyperv/rndis_filter.c | 68 +++++++++----------
drivers/net/macsec.c | 5 +-
drivers/net/macvlan.c | 2 +-
drivers/net/phy/bcm-phy-lib.c | 4 +-
drivers/net/phy/marvell.c | 4 +-
drivers/net/phy/micrel.c | 27 ++------
drivers/net/phy/phy.c | 145 ++++++++++++++++++++--------------------
drivers/net/phy/phy_device.c | 32 +++++++--
drivers/net/phy/realtek.c | 2 +
drivers/net/ppp/ppp_generic.c | 26 ++++----
drivers/net/team/team.c | 4 +-
drivers/net/tun.c | 3 +-
drivers/net/usb/usbnet.c | 10 +--
drivers/net/vmxnet3/vmxnet3_drv.c | 16 +++--
drivers/net/vmxnet3/vmxnet3_int.h | 7 +-
drivers/net/wireless/ath/ath9k/htc_drv_init.c | 1 +
drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.h | 2 +
drivers/net/wireless/broadcom/brcm80211/brcmfmac/feature.c | 10 +++
drivers/net/wireless/broadcom/brcm80211/brcmfmac/fwil.c | 3 +
drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c | 24 +++----
drivers/net/wireless/intel/iwlwifi/Kconfig | 1 -
drivers/net/wireless/intel/iwlwifi/fw/api/time-event.h | 4 +-
drivers/net/wireless/intel/iwlwifi/fw/dbg.c | 13 +++-
drivers/net/wireless/intel/iwlwifi/fw/dbg.h | 3 +
drivers/net/wireless/intel/iwlwifi/fw/debugfs.h | 18 +++++
drivers/net/wireless/intel/iwlwifi/fw/init.c | 12 +++-
drivers/net/wireless/intel/iwlwifi/fw/runtime.h | 7 ++
drivers/net/wireless/intel/iwlwifi/mvm/d3.c | 8 +++
drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c | 5 +-
drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c | 3 +-
drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 49 +++++++++++---
drivers/net/wireless/intel/iwlwifi/mvm/mvm.h | 2 +
drivers/net/wireless/intel/iwlwifi/mvm/ops.c | 10 ++-
drivers/net/wireless/intel/iwlwifi/mvm/rs.c | 28 +++++---
drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c | 39 +++++------
drivers/net/wireless/intel/iwlwifi/mvm/sta.c | 33 ++++-----
drivers/net/wireless/intel/iwlwifi/mvm/time-event.c | 6 +-
drivers/net/wireless/intel/iwlwifi/mvm/tx.c | 13 +++-
drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c | 2 +-
drivers/net/wireless/intel/iwlwifi/pcie/tx.c | 2 +-
drivers/net/wireless/mac80211_hwsim.c | 1 +
drivers/net/wireless/realtek/rtlwifi/rtl8723be/hw.c | 3 +-
drivers/s390/net/qeth_core_main.c | 21 ++++--
drivers/s390/net/qeth_l2_main.c | 2 +-
drivers/s390/net/qeth_l3_main.c | 2 +-
drivers/soc/fsl/qbman/qman.c | 28 ++------
drivers/vhost/net.c | 8 ++-
fs/sysfs/symlink.c | 1 +
include/linux/cgroup-defs.h | 4 +-
include/linux/if_tun.h | 4 ++
include/linux/if_vlan.h | 66 +++++++++++++++---
include/linux/net.h | 1 +
include/linux/netfilter/x_tables.h | 2 +
include/linux/phy.h | 5 +-
include/linux/rhashtable.h | 4 +-
include/linux/skbuff.h | 22 ++++++
include/linux/u64_stats_sync.h | 22 ++++++
include/net/ip.h | 11 ++-
include/net/ip6_route.h | 3 +
include/net/ip_fib.h | 1 +
include/net/mac80211.h | 4 ++
include/net/route.h | 6 +-
include/net/sch_generic.h | 19 ++++++
include/net/sock.h | 1 +
include/uapi/linux/if_ether.h | 1 +
kernel/bpf/syscall.c | 2 +-
kernel/fail_function.c | 10 +++
kernel/trace/bpf_trace.c | 68 +++++++++++--------
lib/rhashtable.c | 4 +-
lib/test_bpf.c | 2 +-
lib/test_rhashtable.c | 134 +++++++++++++++++++++++++++++++++++++
net/8021q/vlan_core.c | 4 +-
net/batman-adv/distributed-arp-table.c | 2 +-
net/batman-adv/icmp_socket.c | 1 +
net/batman-adv/log.c | 1 +
net/batman-adv/multicast.c | 4 +-
net/batman-adv/routing.c | 25 ++++---
net/bluetooth/smp.c | 8 ++-
net/bridge/netfilter/ebt_among.c | 34 ++++++++++
net/bridge/netfilter/ebtables.c | 6 +-
net/core/dev.c | 22 ++++--
net/core/dev_ioctl.c | 7 +-
net/core/devlink.c | 16 ++---
net/core/filter.c | 60 ++++++++++++-----
net/core/skbuff.c | 11 +--
net/core/sock.c | 21 ++++++
net/core/sock_diag.c | 12 ++--
net/dccp/proto.c | 5 ++
net/dsa/legacy.c | 2 +-
net/ieee802154/6lowpan/core.c | 12 ++--
net/ipv4/inet_diag.c | 3 +-
net/ipv4/inet_fragment.c | 3 +
net/ipv4/ip_sockglue.c | 6 +-
net/ipv4/route.c | 47 ++++++++-----
net/ipv4/tcp.c | 1 +
net/ipv4/tcp_timer.c | 1 +
net/ipv4/xfrm4_mode_tunnel.c | 3 +-
net/ipv4/xfrm4_policy.c | 5 +-
net/ipv6/datagram.c | 21 ++++--
net/ipv6/ip6_gre.c | 8 ++-
net/ipv6/ndisc.c | 3 +-
net/ipv6/route.c | 76 ++++++++++++---------
net/ipv6/seg6_iptunnel.c | 7 +-
net/ipv6/xfrm6_mode_tunnel.c | 3 +-
net/ipv6/xfrm6_policy.c | 5 ++
net/iucv/af_iucv.c | 4 +-
net/kcm/kcmsock.c | 33 ++++++---
net/l2tp/l2tp_core.c | 46 +++++++------
net/l2tp/l2tp_core.h | 3 -
net/mac80211/debugfs.c | 1 +
net/mac80211/mlme.c | 3 +-
net/netfilter/nf_tables_api.c | 1 +
net/netfilter/nft_set_hash.c | 2 +-
net/netfilter/x_tables.c | 30 +++++++++
net/netfilter/xt_hashlimit.c | 16 +++--
net/netfilter/xt_recent.c | 6 +-
net/netlink/genetlink.c | 2 +-
net/openvswitch/meter.c | 12 +++-
net/sched/act_bpf.c | 2 +-
net/sched/act_csum.c | 5 +-
net/sched/act_ipt.c | 9 ++-
net/sched/act_pedit.c | 2 +-
net/sched/act_police.c | 2 +-
net/sched/act_sample.c | 3 +-
net/sched/act_simple.c | 2 +-
net/sched/act_skbmod.c | 5 +-
net/sched/act_tunnel_key.c | 10 +--
net/sched/act_vlan.c | 5 +-
net/sched/sch_generic.c | 22 +++---
net/sched/sch_netem.c | 2 +-
net/sctp/input.c | 8 +--
net/sctp/inqueue.c | 2 +-
net/sctp/offload.c | 2 +-
net/smc/af_smc.c | 4 --
net/smc/smc_close.c | 25 +------
net/socket.c | 5 ++
net/xfrm/xfrm_ipcomp.c | 2 +-
net/xfrm/xfrm_policy.c | 13 ++--
net/xfrm/xfrm_replay.c | 2 +-
net/xfrm/xfrm_state.c | 5 ++
net/xfrm/xfrm_user.c | 21 +++---
tools/bpf/bpftool/common.c | 4 ++
211 files changed, 2086 insertions(+), 1152 deletions(-)
^ permalink raw reply
* Re: [v2] vhost: add vsock compat ioctl
From: Sonny Rao @ 2018-03-22 20:38 UTC (permalink / raw)
To: Stefan Hajnoczi; +Cc: David Miller, netdev
In-Reply-To: <CAJSP0QUFjhq-X=vRH+qb0XPRm5daqWqouwHJf1Lo_sHhjD7vQg@mail.gmail.com>
On Thu, Mar 22, 2018 at 2:25 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Fri, Mar 16, 2018 at 7:30 PM, David Miller <davem@davemloft.net> wrote:
>> Although the top level ioctls are probably size and layout compatible,
>> I do not think that the deeper ioctls can be called by compat binaries
>> without some translations in order for them to work.
>
> I audited the vhost ioctl code when reviewing this patch and was
> unable to find anything that would break for a 32-bit userspace
> process.
>
> drivers/vhost/net.c does the same thing already, which doesn't prove
> it's correct but makes me more confident I didn't miss something while
> auditing the vhost ioctl code.
>
> Did you have a specific ioctl in mind?
I think he means that we need to use the compat_ptr macro on any other
pointers we get from userspace in those other ioctls. For most
architectures this macro doesn't seem to do much but it does on some
-- I think s390 modifies the pointer.
>
> Stefan
^ permalink raw reply
* Re: [PATCH v3 bpf-next 01/10] treewide: remove struct-pass-by-value from tracepoints arguments
From: Steven Rostedt @ 2018-03-22 20:48 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: davem, daniel, torvalds, peterz, netdev, kernel-team, linux-api
In-Reply-To: <6b1952c4-1612-7abb-49c6-9bbaf6dc6997@fb.com>
On Thu, 22 Mar 2018 12:31:12 -0700
Alexei Starovoitov <ast@fb.com> wrote:
> On 3/22/18 11:11 AM, Steven Rostedt wrote:
> > On Thu, 22 Mar 2018 11:01:48 -0700
> > Alexei Starovoitov <ast@fb.com> wrote:
> >
> >> From: Alexei Starovoitov <ast@kernel.org>
> >>
> >> Fix all tracepoint arguments to pass structures (large and small) by reference
> >> instead of by value.
> >> Avoiding passing large structs by value is a good coding style.
> >> Passing small structs sometimes is beneficial, but in all cases
> >> it makes no difference vs readability of the code.
> >> The subsequent patch enforces that all tracepoints args are either integers
> >> or pointers and fit into 64-bit.
> >
> > But some of these structures are used to force type checking, and are
> > just the same size as a number. That's why they don't have "struct" in
> > front of them. Like pmd_t. Will the subsequent patches really break if
> > the structure itself has one element that is of size long? Just seems
> > to add extra code to pass in an address to something that fits into a
> > single register.
>
> yeah. C doesn't allow casting of 'struct s { u64 var };' into u64
> without massive hacks and aliasing warnings by compiler.
> CAST_TO_U64 macro in patch 7 will prevent tracepoint arguments to be
> things like pmd_t. It's not perfect, but doing & of pmd_t variable
> is imo clean enough as you can see in this patch.
> The macro can be tweaked to do the cast like
> *(sizeof(typeof(arg))*)&arg,
> but there is no way to get rid of compiler warning.
OK, but instead of changing it to pass by reference, why not just pass
the value? That is:
static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
- trace_xen_mmu_set_pte_atomic(ptep, pte);
+ trace_xen_mmu_set_pte_atomic(ptep, native_pte_val(pte));
set_64bit((u64 *)ptep, native_pte_val(pte));
}
It shouldn't add any extra code, as those helper functions are
basically just special casts.
-- Steve
^ permalink raw reply
* Re: [PATCH v3 bpf-next 01/10] treewide: remove struct-pass-by-value from tracepoints arguments
From: Linus Torvalds @ 2018-03-22 20:52 UTC (permalink / raw)
To: Steven Rostedt
Cc: Alexei Starovoitov, David Miller, Daniel Borkmann, Peter Zijlstra,
Network Development, kernel-team, Linux API
In-Reply-To: <20180322164831.01519df6@gandalf.local.home>
On Thu, Mar 22, 2018 at 1:48 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
>
> OK, but instead of changing it to pass by reference, why not just pass
> the value. That is:
>
> static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
> {
> - trace_xen_mmu_set_pte_atomic(ptep, pte);
> + trace_xen_mmu_set_pte_atomic(ptep, native_pte_val(pte));
> set_64bit((u64 *)ptep, native_pte_val(pte));
> }
That looks simple and clean, and makes sense since the function itself
then uses that value anyway.
Certainly simpler than my monster define.
Linus
^ permalink raw reply
* Re: [PATCH net-next] modules: allow modprobe load regular elf binaries
From: Luis R. Rodriguez @ 2018-03-22 20:54 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Jessica Yu, Linus Torvalds, Mimi Zohar, Djalal Harouni,
David Miller, Andy Lutomirski, Kees Cook, Alexei Starovoitov,
Al Viro, Daniel Borkmann, Greg Kroah-Hartman, Network Development,
Linux Kernel Mailing List, kernel-team, Linux API,
Luis R. Rodriguez, Michal Hocko, Hannes Reinecke, werner
In-Reply-To: <20180310151652.GV4449@wotan.suse.de>
On Sat, Mar 10, 2018 at 03:16:52PM +0000, Luis R. Rodriguez wrote:
> On Sat, Mar 10, 2018 at 02:08:43PM +0000, Luis R. Rodriguez wrote:
> > The alternative to this would be a simple equivalent of try_then_request_module()
> > for UMH modules: try_umhm_then_request_umh_module() or whatever. So just as I
> > argued earlier over UMH limitations, this is not the end of the world for umh
> > modules, and it doesn't mean you can't *properly* add umh modules upstream,
> > it would *just mean* we'd be perpetuating today's (IMHO) horrible and loose
> > semantics.
>
> I was about to suggest that perhaps a try_umhm_then_request_umh_module() or
> whatever should not be a macro -- but instead an actual routine, and we don't
> export say the simple form to avoid non-deterministic uses of it from the
> start... but the thing is *it'd have to be a macro* given that the *check* for
> the module *has to be loose*, just as try_then_request_module()...
>
> *Ugh* gross.
>
> Another reason for me to want an actual deterministic clean proper solution
> from the start.
I just thought of another consideration which should be made here for the long
term.
Some init systems have a timeout for kmod workers, that is the userspace
process which issues the modprobe call.
That was very well intentioned, however it ended up being nonsense, so at least
on SLE systemd we disable the timeout for kmod workers. What others do... is
unclear to me. Upstream wise the timeout was increased considerably, however,
*if* such timeout is in effect for users it has some implicit implications on
the number of devices a driver could support:
number_devices = systemd_timeout / (max known probe time for driver)
I've documented the logic to these conclusions [0].
It sounds like we *do* want a full sync wait mechanism, and as I noted I think
we should fix the determinism aspect of it. Since no aliases will be supported
for usermode modules this will be much easier to support, and I can volunteer
to help with that.
However given the above... if we're going to use request_module() API (or a
really fixed deterministic version of it later) for usermode kernel modules,
the limitation above still applies.
Are these usermode modules doing all the handy work on init? Or can it be
deferred once loaded? How much loading on init should a usermode module need?
If we can ensure that these usermode modules don't take *any time at all* on
their init *from the start*, it would be wonderful and we'd end up avoiding
some really odd corner case issues later.
[0] http://www.do-not-panic.com/2015/12/linux-asynchronous-probe.html
Luis
^ permalink raw reply
* [PATCH] Net: ethernet: ti: netcp: Fix inbound ping crash if MTU size is greater than 1500
From: Chang, Rex @ 2018-03-22 21:01 UTC (permalink / raw)
To: netdev@vger.kernel.org; +Cc: Karicheri, Muralidharan, Kwok, WingMan
I am requesting to merge this patch to the stable releases:
commit 5a717843177c96ca3fe4565187de395afdb28092
The kernel crashes if the MTU is greater than 1500. This code was working in kernel v3.14, but got broken when migrating from kernel v3.14 to v4.1.
Stable versions to apply: v4.1 and v4.4.
Thanks!
Rex
-----Original Message-----
From: David Miller [mailto:davem@davemloft.net]
Sent: Wednesday, January 17, 2018 4:20 PM
To: Chang, Rex
Cc: Kwok, WingMan; Karicheri, Muralidharan; netdev@vger.kernel.org; linux-kernel@vger.kernel.org
Subject: [EXTERNAL] Re: [PATCH] Net: ethernet: ti: netcp: Fix inbound ping crash if MTU size is greater than 1500
From: Rex Chang <rchang@ti.com>
Date: Tue, 16 Jan 2018 15:16:01 -0500
> In the receive queue for 4096 bytes fragments, the page address set in
> the SW data0 field of the descriptor is not the one we got when doing
> the reassembly in receive. The page structure was retrieved from the
> wrong descriptor into SW data0 which is then causing a page fault when
> UDP checksum is accessing data above 1500.
>
> Signed-off-by: Rex Chang <rchang@ti.com>
Applied, thank you.
^ permalink raw reply
* [net-next:master 1115/1116] drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c:82:5: sparse: symbol 'hclge_inform_reset_assert_to_vf' was not declared. Should it be static?
From: kbuild test robot @ 2018-03-22 21:31 UTC (permalink / raw)
To: Salil Mehta
Cc: kbuild-all, netdev, Yisen Zhuang, Peng Li, Fuyun Liang, Jian Shen,
Yunsheng Lin, linux-kernel
tree: https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git master
head: 6686c459e1449a3ee5f3fd313b0a559ace7a700e
commit: 2bfbd35d8ecd97a4a7f1db1754908b54542fa7aa [1115/1116] net: hns3: Changes required in PF mailbox to support VF reset
reproduce:
# apt-get install sparse
git checkout 2bfbd35d8ecd97a4a7f1db1754908b54542fa7aa
make ARCH=x86_64 allmodconfig
make C=1 CF=-D__CHECK_ENDIAN__
sparse warnings: (new ones prefixed by >>)
>> drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c:82:5: sparse: symbol 'hclge_inform_reset_assert_to_vf' was not declared. Should it be static?
Please review and possibly fold the followup patch.
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
^ permalink raw reply
* [RFC PATCH net-next] net: hns3: hclge_inform_reset_assert_to_vf() can be static
From: kbuild test robot @ 2018-03-22 21:31 UTC (permalink / raw)
To: Salil Mehta
Cc: kbuild-all, netdev, Yisen Zhuang, Peng Li, Fuyun Liang, Jian Shen,
Yunsheng Lin, linux-kernel
In-Reply-To: <201803230525.jlwCIx4C%fengguang.wu@intel.com>
Fixes: 2bfbd35d8ecd ("net: hns3: Changes required in PF mailbox to support VF reset")
Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
---
hclge_mbx.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
index 3901333..a6f7ffa 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
@@ -79,7 +79,7 @@ static int hclge_send_mbx_msg(struct hclge_vport *vport, u8 *msg, u16 msg_len,
return status;
}
-int hclge_inform_reset_assert_to_vf(struct hclge_vport *vport)
+static int hclge_inform_reset_assert_to_vf(struct hclge_vport *vport)
{
u8 msg_data[2];
u8 dest_vfid;
^ permalink raw reply related
* [PATCH net-next 1/1] net sched actions: merge event notification routines
From: Roman Mashak @ 2018-03-22 22:00 UTC (permalink / raw)
To: davem; +Cc: netdev, kernel, jhs, xiyou.wangcong, jiri, Roman Mashak
Collapse tca_get_notify(), tca_add_notify() and tca_del_notify() in a
single function since they repeat the same code pattern.
Signed-off-by: Roman Mashak <mrv@mojatatu.com>
---
net/sched/act_api.c | 111 ++++++++++++++++------------------------------------
1 file changed, 33 insertions(+), 78 deletions(-)
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 57cf37145282..5b04184fb525 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -895,24 +895,41 @@ static int tca_get_fill(struct sk_buff *skb, struct list_head *actions,
return -1;
}
-static int
-tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n,
- struct list_head *actions, int event,
- struct netlink_ext_ack *extack)
+static int tca_notify(struct net *net, struct nlmsghdr *n,
+ struct list_head *actions, u32 portid, int event,
+ size_t attr_size, struct netlink_ext_ack *extack)
{
struct sk_buff *skb;
+ int err;
- skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ skb = alloc_skb(attr_size <= NLMSG_GOODSIZE ? NLMSG_GOODSIZE : attr_size,
+ GFP_KERNEL);
if (!skb)
return -ENOBUFS;
- if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event,
- 0, 0) <= 0) {
- NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action");
+
+ if (tca_get_fill(skb, actions, portid, n->nlmsg_seq,
+ event == RTM_NEWACTION ? n->nlmsg_flags : 0,
+ event, 0,
+ event == RTM_DELACTION ? 1 : 0) <= 0) {
+ NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes in event message");
kfree_skb(skb);
return -EINVAL;
}
- return rtnl_unicast(skb, net, portid);
+ if (event == RTM_GETACTION) {
+ return rtnl_unicast(skb, net, portid);
+ } else if (event == RTM_DELACTION) {
+ /* now do the delete */
+ err = tcf_action_destroy(actions, 0);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Failed to delete TC action");
+ kfree_skb(skb);
+ return err;
+ }
+ }
+ err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+ return err > 0 ? 0 : err;
}
static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla,
@@ -1034,40 +1051,6 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
}
static int
-tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
- u32 portid, size_t attr_size, struct netlink_ext_ack *extack)
-{
- int ret;
- struct sk_buff *skb;
-
- skb = alloc_skb(attr_size <= NLMSG_GOODSIZE ? NLMSG_GOODSIZE : attr_size,
- GFP_KERNEL);
- if (!skb)
- return -ENOBUFS;
-
- if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION,
- 0, 1) <= 0) {
- NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes");
- kfree_skb(skb);
- return -EINVAL;
- }
-
- /* now do the delete */
- ret = tcf_action_destroy(actions, 0);
- if (ret < 0) {
- NL_SET_ERR_MSG(extack, "Failed to delete TC action");
- kfree_skb(skb);
- return ret;
- }
-
- ret = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
- n->nlmsg_flags & NLM_F_ECHO);
- if (ret > 0)
- return 0;
- return ret;
-}
-
-static int
tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
u32 portid, int event, struct netlink_ext_ack *extack)
{
@@ -1102,46 +1085,17 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
attr_size = tcf_action_full_attrs_size(attr_size);
- if (event == RTM_GETACTION)
- ret = tcf_get_notify(net, portid, n, &actions, event, extack);
- else { /* delete */
- ret = tcf_del_notify(net, n, &actions, portid, attr_size, extack);
- if (ret)
- goto err;
- return ret;
- }
+ ret = tca_notify(net, n, &actions, portid, event, attr_size, extack);
+ if (ret)
+ goto err;
+ return ret;
+
err:
if (event != RTM_GETACTION)
tcf_action_destroy(&actions, 0);
return ret;
}
-static int
-tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
- u32 portid, size_t attr_size, struct netlink_ext_ack *extack)
-{
- struct sk_buff *skb;
- int err = 0;
-
- skb = alloc_skb(attr_size <= NLMSG_GOODSIZE ? NLMSG_GOODSIZE : attr_size,
- GFP_KERNEL);
- if (!skb)
- return -ENOBUFS;
-
- if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags,
- RTM_NEWACTION, 0, 0) <= 0) {
- NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action");
- kfree_skb(skb);
- return -EINVAL;
- }
-
- err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
- n->nlmsg_flags & NLM_F_ECHO);
- if (err > 0)
- err = 0;
- return err;
-}
-
static int tcf_action_add(struct net *net, struct nlattr *nla,
struct nlmsghdr *n, u32 portid, int ovr,
struct netlink_ext_ack *extack)
@@ -1155,7 +1109,8 @@ static int tcf_action_add(struct net *net, struct nlattr *nla,
if (ret)
return ret;
- return tcf_add_notify(net, n, &actions, portid, attr_size, extack);
+ return tca_notify(net, n, &actions, portid, RTM_NEWACTION, attr_size,
+ extack);
}
static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON;
--
2.7.4
^ permalink raw reply related
* Re: [RFC v3 net-next 14/18] net/sched: Add HW offloading capability to TBS
From: Jesus Sanchez-Palencia @ 2018-03-22 22:01 UTC (permalink / raw)
To: Thomas Gleixner, Richard Cochran
Cc: netdev, jhs, xiyou.wangcong, jiri, vinicius.gomes, anna-maria,
henrik, John Stultz, levi.pearson, edumazet, willemb, mlichvar
In-Reply-To: <alpine.DEB.2.21.1803211645160.3754@nanos.tec.linutronix.de>
Hi,
On 03/21/2018 09:18 AM, Thomas Gleixner wrote:
> On Wed, 21 Mar 2018, Richard Cochran wrote:
>
>> On Wed, Mar 21, 2018 at 03:22:11PM +0100, Thomas Gleixner wrote:
>>> Which clockid will be handed in from the application? The network adapter
>>> time has no fixed clockid. The only way you can get to it is via a fd based
>>> posix clock and that does not work at all because the qdisc setup might
>>> have a different FD than the application which queues packets.
>>
>> Duh. That explains it. Please ignore my "why not?" Q in the other thread...
>
> :)
>
> So in that case you are either bound to rely on the application to use the
> proper dynamic clock or if we need a sanity check, then you need a cookie
> of some form which can be retrieved from the posix clock file descriptor
> and handed in as 'clockid' together with clock_adapter = true.
>
> That's doable, but that needs a bit more trickery. A simple unique ID per
> dynamic posix-clock would be trivial to add, but that would not give you
> any form of verification whether this ID actually belongs to the network
> adapter or not.
>
> So either you ignore the clockid and rely on the application not being
> stupid when it says "clock_adapter = true" or you need some extra
> complexity to build an association of a "clockid" to a network adapter.
>
> There is a connection already, via
>
> adapter->ptp_clock->devid
>
> which is MKDEV(major, index) which is accessible at least at the network
> driver level, but probably not from networking core. So you'd need to drill
> a few more holes by adding yet another callback to net_device_ops.
>
> I'm not sure if it's worth the trouble. If the application hands in bogus
> timestamps, packets go out at the wrong time or are dropped. That's true
> whether it uses the proper clock or not. So nothing the kernel should
> really worry about.
+1 and that is the approach we've taken so far with the qdisc setting
"CLOCKID_INVALID" to its internal clockid for the "raw" (non-assisted) hw
offload case.
thanks,
Jesus
>
> For clock_system - REAL/MONO/TAI(sigh) - you surely need a sanity check,
> but that is independent of the underlying network adapter even in the
> qdisc assisted HW offload case.
>
> Thanks,
>
> tglx
>
>
>
>
>
>
^ permalink raw reply
* Re: [PATCH v2] net/mlx5: Fix use-after-free
From: Saeed Mahameed @ 2018-03-22 22:08 UTC (permalink / raw)
To: yuval.shaia@oracle.com, Matan Barak, Ilan Tayari,
gustavo@embeddedor.com, Boris Pismenny, leon@kernel.org
Cc: netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
linux-rdma@vger.kernel.org
In-Reply-To: <20180322184456.GA22259@embeddedgus>
On Thu, 2018-03-22 at 13:44 -0500, Gustavo A. R. Silva wrote:
> _rule_ is being freed and then dereferenced by accessing rule->ctx
>
> Fix this by copying the value returned by PTR_ERR(rule->ctx) into a
> local
> variable for its safe use after freeing _rule_
>
> Addresses-Coverity-ID: 1466041 ("Read from pointer after free")
> Fixes: 05564d0ae075 ("net/mlx5: Add flow-steering commands for FPGA
> IPSec implementation")
> Reviewed-by: Yuval Shaia <yuval.shaia@oracle.com>
> Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
> ---
> Changes in v2:
> - Use a short subject prefix as suggested by Yuval Shaia.
> - Add Yuval's Reviewed-by.
>
> drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
> b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
> index 4f15685..0f5da49 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
> @@ -1061,8 +1061,9 @@ static int fpga_ipsec_fs_create_fte(struct
> mlx5_core_dev *dev,
>
> rule->ctx = mlx5_fpga_ipsec_fs_create_sa_ctx(dev, fte,
> is_egress);
> if (IS_ERR(rule->ctx)) {
> + int err = PTR_ERR(rule->ctx);
> kfree(rule);
> - return PTR_ERR(rule->ctx);
> + return err;
> }
>
> rule->fte = fte;
^ permalink raw reply
* Re: [RFC v3 net-next 13/18] net/sched: Introduce the TBS Qdisc
From: Thomas Gleixner @ 2018-03-22 22:11 UTC (permalink / raw)
To: Jesus Sanchez-Palencia
Cc: netdev, jhs, xiyou.wangcong, jiri, vinicius.gomes, richardcochran,
anna-maria, henrik, john.stultz, levi.pearson, edumazet, willemb,
mlichvar
In-Reply-To: <7c3f5a9f-cc16-8483-cb77-b5548d46cd5b@intel.com>
On Thu, 22 Mar 2018, Jesus Sanchez-Palencia wrote:
> On 03/21/2018 06:46 AM, Thomas Gleixner wrote:
> > If you look at the use cases of TDM in various fields then FIFO mode is
> > pretty much useless. In industrial/automotive fieldbus applications the
> > various time slices are filled by different threads or even processes.
> >
> > Sure, the rbtree queue/dequeue has overhead compared to a simple linked
> > list, but you pay for that with more indirections and lots of mostly
> > duplicated code. And in the worst case one of these code paths is going to
> > be rarely used and prone to bitrot.
>
>
> Our initial version (on RFC v2) was performing the sorting for all modes. After
> all the feedback we got we decided to make it optional and provide FIFO modes as
> well. For the SW fallback we need the scheduled FIFO, and for "pure" hw offload
> we need the "raw" FIFO.
I don't see how FIFO ever works without the issue that a newly queued
packet which has an earlier time stamp than the head of the FIFO list will
lose. Why would you even want to have that mode? Just because some weird
existing application misdesign thinks it's required? That doesn't make it a
good idea.
With pure hardware offload the packets are immediately handed off to the
network card and that one is responsible for sending it on time. So there
is no FIFO at all. It's actually a bypass mode.
> This was a way to accommodate all the use cases without imposing too much of a
> burden onto anyone, regardless of their application's segment (i.e. industrial,
> pro a/v, automotive, etc).
I'm not buying that argument at all. That's all handwaving.
The whole approach is a burden on every application segment because it
pushes the whole schedule and time slice management out to user space,
which also requires that you route general traffic down to that user space
scheduling entity and then queue it back into the proper time slice. And
FIFO makes that even worse.
> Having the sorting always enabled requires that a valid static clockid is passed
> to the qdisc. For the hw offload mode, that means that the PHC and one of the
> system clocks must be synchronized since hrtimers do not support dynamic clocks.
> Not all systems do that or want to, and given that we do not want to perform
> crosstimestamping between the packets' clock reference and the qdisc's one, the
> only solution for these systems would be using the raw hw offload mode.
There are two variants of hardware offload:
1) Full hardware offload
That bypasses the queue completely. You just stick the thing into the
scatter gather buffers. Except when there is no room anymore, then you
have to queue, but it does not make any difference if you queue in FIFO
or in time order. The packets go out in time order anyway.
2) Single packet hardware offload
What you do here is to schedule a hrtimer a bit earlier than the first
packet tx time and when it fires stick the packet into the hardware and
rearm the timer for the next one.
The whole point of TSN with hardware support is that you have:
- Global network time
and
- Frequency adjustment of the system time base
PTP is TAI based and the kernel exposes clock TAI directly through
hrtimers. You don't need dynamic clocks for that.
You can even use clock MONOTONIC as it basically is just
TAI - offset
If the network card uses anything else than TAI or a time stamp with a
strict correlation to TAI for actual TX scheduling then the whole thing is
broken to begin with.
Thanks,
tglx
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox