* [net-next PATCH 2/3] net: Enable Tx queue selection based on Rx queues
From: Amritha Nambiar @ 2018-04-20 1:04 UTC (permalink / raw)
To: netdev, davem
Cc: alexander.h.duyck, amritha.nambiar, sridhar.samudrala, edumazet,
hannes, tom
In-Reply-To: <152418597668.5832.5150463027149101930.stgit@anamdev.jf.intel.com>
This patch adds support to pick Tx queue based on the Rx queue map
configuration set by the admin through the sysfs attribute
for each Tx queue. If the user configuration for receive
queue map does not apply, then the Tx queue selection falls back
to CPU map based selection and finally to hashing.
Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
---
include/net/sock.h | 18 ++++++++++++++++++
net/core/dev.c | 36 +++++++++++++++++++++++++++++-------
net/core/sock.c | 5 +++++
net/ipv4/tcp_input.c | 7 +++++++
net/ipv4/tcp_ipv4.c | 1 +
net/ipv4/tcp_minisocks.c | 1 +
6 files changed, 61 insertions(+), 7 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h
index 74d725f..f10b2a2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -139,6 +139,8 @@ typedef __u64 __bitwise __addrpair;
* @skc_node: main hash linkage for various protocol lookup tables
* @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
* @skc_tx_queue_mapping: tx queue number for this connection
+ * @skc_rx_queue_mapping: rx queue number for this connection
+ * @skc_rx_ifindex: rx ifindex for this connection
* @skc_flags: place holder for sk_flags
* %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
* %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
@@ -215,6 +217,10 @@ struct sock_common {
struct hlist_nulls_node skc_nulls_node;
};
int skc_tx_queue_mapping;
+#ifdef CONFIG_XPS
+ int skc_rx_queue_mapping;
+ int skc_rx_ifindex;
+#endif
union {
int skc_incoming_cpu;
u32 skc_rcv_wnd;
@@ -326,6 +332,10 @@ struct sock {
#define sk_nulls_node __sk_common.skc_nulls_node
#define sk_refcnt __sk_common.skc_refcnt
#define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping
+#ifdef CONFIG_XPS
+#define sk_rx_queue_mapping __sk_common.skc_rx_queue_mapping
+#define sk_rx_ifindex __sk_common.skc_rx_ifindex
+#endif
#define sk_dontcopy_begin __sk_common.skc_dontcopy_begin
#define sk_dontcopy_end __sk_common.skc_dontcopy_end
@@ -1691,6 +1701,14 @@ static inline int sk_tx_queue_get(const struct sock *sk)
return sk ? sk->sk_tx_queue_mapping : -1;
}
+static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff *skb)
+{
+#ifdef CONFIG_XPS
+ sk->sk_rx_ifindex = skb->skb_iif;
+ sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
+#endif
+}
+
static inline void sk_set_socket(struct sock *sk, struct socket *sock)
{
sk_tx_queue_clear(sk);
diff --git a/net/core/dev.c b/net/core/dev.c
index 17c4883..cf24d47 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3456,18 +3456,14 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
}
#endif /* CONFIG_NET_EGRESS */
-static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
-{
#ifdef CONFIG_XPS
- struct xps_dev_maps *dev_maps;
+static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
+ struct xps_dev_maps *dev_maps, unsigned int tci)
+{
struct xps_map *map;
int queue_index = -1;
- rcu_read_lock();
- dev_maps = rcu_dereference(dev->xps_maps[XPS_MAP_CPUS]);
if (dev_maps) {
- unsigned int tci = skb->sender_cpu - 1;
-
if (dev->num_tc) {
tci *= dev->num_tc;
tci += netdev_get_prio_tc_map(dev, skb->priority);
@@ -3484,6 +3480,32 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
queue_index = -1;
}
}
+ return queue_index;
+}
+#endif
+
+static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+{
+#ifdef CONFIG_XPS
+ enum xps_map_type i = XPS_MAP_RXQS;
+ struct xps_dev_maps *dev_maps;
+ struct sock *sk = skb->sk;
+ int queue_index = -1;
+ unsigned int tci = 0;
+
+ if (sk && sk->sk_rx_queue_mapping <= dev->real_num_rx_queues &&
+ dev->ifindex == sk->sk_rx_ifindex)
+ tci = sk->sk_rx_queue_mapping;
+
+ rcu_read_lock();
+ while (queue_index < 0 && i < __XPS_MAP_MAX) {
+ if (i == XPS_MAP_CPUS)
+ tci = skb->sender_cpu - 1;
+ dev_maps = rcu_dereference(dev->xps_maps[i]);
+ queue_index = __get_xps_queue_idx(dev, skb, dev_maps, tci);
+ i++;
+ }
+
rcu_read_unlock();
return queue_index;
diff --git a/net/core/sock.c b/net/core/sock.c
index b2c3db1..f7a4b46 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2820,6 +2820,11 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_pacing_rate = ~0U;
sk->sk_pacing_shift = 10;
sk->sk_incoming_cpu = -1;
+
+#ifdef CONFIG_XPS
+ sk->sk_rx_ifindex = -1;
+ sk->sk_rx_queue_mapping = -1;
+#endif
/*
* Before updating sk_refcnt, we must commit prior changes to memory
* (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0396fb9..157f401 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -78,6 +78,7 @@
#include <linux/errqueue.h>
#include <trace/events/tcp.h>
#include <linux/static_key.h>
+#include <net/busy_poll.h>
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
@@ -5535,6 +5536,11 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
__tcp_fast_path_on(tp, tp->snd_wnd);
else
tp->pred_flags = 0;
+
+ if (skb) {
+ sk_mark_napi_id(sk, skb);
+ sk_mark_rx_queue(sk, skb);
+ }
}
static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
@@ -6347,6 +6353,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_rsk(req)->snt_isn = isn;
tcp_rsk(req)->txhash = net_tx_rndhash();
tcp_openreq_init_rwin(req, sk, dst);
+ sk_mark_rx_queue(req_to_sk(req), skb);
if (!want_cookie) {
tcp_reqsk_record_syn(sk, req, skb);
fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f70586b..132d9af 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1467,6 +1467,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
sock_rps_save_rxhash(sk, skb);
sk_mark_napi_id(sk, skb);
+ sk_mark_rx_queue(sk, skb);
if (dst) {
if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
!dst->ops->check(dst, 0)) {
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 57b5468..c18d6f2 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -835,6 +835,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
/* record NAPI ID of child */
sk_mark_napi_id(child, skb);
+ sk_mark_rx_queue(child, skb);
tcp_segs_in(tcp_sk(child), skb);
if (!sock_owned_by_user(child)) {
^ permalink raw reply related
* [net-next PATCH 1/3] net: Refactor XPS for CPUs and Rx queues
From: Amritha Nambiar @ 2018-04-20 1:04 UTC (permalink / raw)
To: netdev, davem
Cc: alexander.h.duyck, amritha.nambiar, sridhar.samudrala, edumazet,
hannes, tom
In-Reply-To: <152418597668.5832.5150463027149101930.stgit@anamdev.jf.intel.com>
Refactor XPS code to support Tx queue selection based on
CPU map or Rx queue map.
Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
---
include/linux/netdevice.h | 82 +++++++++++++++++-
net/core/dev.c | 206 +++++++++++++++++++++++++++++----------------
net/core/net-sysfs.c | 4 -
3 files changed, 216 insertions(+), 76 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 14e0777..40a9171 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -730,10 +730,21 @@ struct xps_map {
*/
struct xps_dev_maps {
struct rcu_head rcu;
- struct xps_map __rcu *cpu_map[0];
+ struct xps_map __rcu *attr_map[0];
};
-#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \
+
+#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \
(nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))
+
+#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\
+ (_rxqs * (_tcs) * sizeof(struct xps_map *)))
+
+enum xps_map_type {
+ XPS_MAP_RXQS,
+ XPS_MAP_CPUS,
+ __XPS_MAP_MAX
+};
+
#endif /* CONFIG_XPS */
#define TC_MAX_QUEUE 16
@@ -1867,7 +1878,7 @@ struct net_device {
int watchdog_timeo;
#ifdef CONFIG_XPS
- struct xps_dev_maps __rcu *xps_maps;
+ struct xps_dev_maps __rcu *xps_maps[__XPS_MAP_MAX];
#endif
#ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc __rcu *miniq_egress;
@@ -3204,6 +3215,71 @@ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
#ifdef CONFIG_XPS
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
u16 index);
+int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
+ u16 index, enum xps_map_type type);
+
+static inline bool attr_test_mask(unsigned long j, const unsigned long *mask,
+ unsigned int nr_bits)
+{
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+ WARN_ON_ONCE(j >= nr_bits);
+#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+ return test_bit(j, mask);
+}
+
+static inline bool attr_test_online(unsigned long j,
+ const unsigned long *online_mask,
+ unsigned int nr_bits)
+{
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+ WARN_ON_ONCE(j >= nr_bits);
+#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+
+ if (online_mask)
+ return test_bit(j, online_mask);
+
+ if (j >= 0 && j < nr_bits)
+ return true;
+
+ return false;
+}
+
+static inline unsigned int attrmask_next(int n, const unsigned long *srcp,
+ unsigned int nr_bits)
+{
+ /* -1 is a legal arg here. */
+ if (n != -1) {
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+ WARN_ON_ONCE(n >= nr_bits);
+#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+ }
+
+ if (srcp)
+ return find_next_bit(srcp, nr_bits, n + 1);
+
+ return n + 1;
+}
+
+static inline int attrmask_next_and(int n, const unsigned long *src1p,
+ const unsigned long *src2p,
+ unsigned int nr_bits)
+{
+ /* -1 is a legal arg here. */
+ if (n != -1) {
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+ WARN_ON_ONCE(n >= nr_bits);
+#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
+ }
+
+ if (src1p && src2p)
+ return find_next_and_bit(src1p, src2p, nr_bits, n + 1);
+ else if (src1p)
+ return find_next_bit(src1p, nr_bits, n + 1);
+ else if (src2p)
+ return find_next_bit(src2p, nr_bits, n + 1);
+
+ return n + 1;
+}
#else
static inline int netif_set_xps_queue(struct net_device *dev,
const struct cpumask *mask,
diff --git a/net/core/dev.c b/net/core/dev.c
index a490ef6..17c4883 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2092,7 +2092,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
int pos;
if (dev_maps)
- map = xmap_dereference(dev_maps->cpu_map[tci]);
+ map = xmap_dereference(dev_maps->attr_map[tci]);
if (!map)
return false;
@@ -2105,7 +2105,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
break;
}
- RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
+ RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
kfree_rcu(map, rcu);
return false;
}
@@ -2138,30 +2138,47 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
u16 count)
{
+ const unsigned long *possible_mask = NULL;
+ enum xps_map_type type = XPS_MAP_RXQS;
struct xps_dev_maps *dev_maps;
- int cpu, i;
bool active = false;
+ unsigned int nr_ids;
+ int i, j;
mutex_lock(&xps_map_mutex);
- dev_maps = xmap_dereference(dev->xps_maps);
- if (!dev_maps)
- goto out_no_maps;
+ while (type < __XPS_MAP_MAX) {
+ dev_maps = xmap_dereference(dev->xps_maps[type]);
+ if (!dev_maps)
+ goto out_no_maps;
+
+ if (type == XPS_MAP_CPUS) {
+ if (num_possible_cpus() > 1)
+ possible_mask = cpumask_bits(cpu_possible_mask);
+ nr_ids = nr_cpu_ids;
+ } else if (type == XPS_MAP_RXQS) {
+ nr_ids = dev->num_rx_queues;
+ }
- for_each_possible_cpu(cpu)
- active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
- offset, count);
+ for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
+ j < nr_ids;)
+ active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
+ count);
+ if (!active) {
+ RCU_INIT_POINTER(dev->xps_maps[type], NULL);
+ kfree_rcu(dev_maps, rcu);
+ }
- if (!active) {
- RCU_INIT_POINTER(dev->xps_maps, NULL);
- kfree_rcu(dev_maps, rcu);
+ if (type == XPS_MAP_CPUS) {
+ for (i = offset + (count - 1); count--; i--)
+ netdev_queue_numa_node_write(
+ netdev_get_tx_queue(dev, i),
+ NUMA_NO_NODE);
+ }
+out_no_maps:
+ type++;
}
- for (i = offset + (count - 1); count--; i--)
- netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
- NUMA_NO_NODE);
-
-out_no_maps:
mutex_unlock(&xps_map_mutex);
}
@@ -2170,11 +2187,11 @@ static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
}
-static struct xps_map *expand_xps_map(struct xps_map *map,
- int cpu, u16 index)
+static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
+ u16 index, enum xps_map_type type)
{
- struct xps_map *new_map;
int alloc_len = XPS_MIN_MAP_ALLOC;
+ struct xps_map *new_map = NULL;
int i, pos;
for (pos = 0; map && pos < map->len; pos++) {
@@ -2183,7 +2200,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
return map;
}
- /* Need to add queue to this CPU's existing map */
+ /* Need to add tx-queue to this CPU's/rx-queue's existing map */
if (map) {
if (pos < map->alloc_len)
return map;
@@ -2191,9 +2208,14 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
alloc_len = map->alloc_len * 2;
}
- /* Need to allocate new map to store queue on this CPU's map */
- new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
- cpu_to_node(cpu));
+ /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
+ * map
+ */
+ if (type == XPS_MAP_RXQS)
+ new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
+ else if (type == XPS_MAP_CPUS)
+ new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
+ cpu_to_node(attr_index));
if (!new_map)
return NULL;
@@ -2205,14 +2227,16 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
return new_map;
}
-int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
- u16 index)
+int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
+ u16 index, enum xps_map_type type)
{
+ const unsigned long *online_mask = NULL, *possible_mask = NULL;
struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
- int i, cpu, tci, numa_node_id = -2;
+ int i, j, tci, numa_node_id = -2;
int maps_sz, num_tc = 1, tc = 0;
struct xps_map *map, *new_map;
bool active = false;
+ unsigned int nr_ids;
if (dev->num_tc) {
num_tc = dev->num_tc;
@@ -2221,16 +2245,33 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
return -EINVAL;
}
- maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
+ switch (type) {
+ case XPS_MAP_RXQS:
+ maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
+ dev_maps = xmap_dereference(dev->xps_maps[XPS_MAP_RXQS]);
+ nr_ids = dev->num_rx_queues;
+ break;
+ case XPS_MAP_CPUS:
+ maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
+ if (num_possible_cpus() > 1) {
+ online_mask = cpumask_bits(cpu_online_mask);
+ possible_mask = cpumask_bits(cpu_possible_mask);
+ }
+ dev_maps = xmap_dereference(dev->xps_maps[XPS_MAP_CPUS]);
+ nr_ids = nr_cpu_ids;
+ break;
+ default:
+ return -EINVAL;
+ }
+
if (maps_sz < L1_CACHE_BYTES)
maps_sz = L1_CACHE_BYTES;
mutex_lock(&xps_map_mutex);
- dev_maps = xmap_dereference(dev->xps_maps);
-
/* allocate memory for queue storage */
- for_each_cpu_and(cpu, cpu_online_mask, mask) {
+ for (j = -1; j = attrmask_next_and(j, online_mask, mask, nr_ids),
+ j < nr_ids;) {
if (!new_dev_maps)
new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
if (!new_dev_maps) {
@@ -2238,73 +2279,81 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
return -ENOMEM;
}
- tci = cpu * num_tc + tc;
- map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
+ tci = j * num_tc + tc;
+ map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
NULL;
- map = expand_xps_map(map, cpu, index);
+ map = expand_xps_map(map, j, index, type);
if (!map)
goto error;
- RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
}
if (!new_dev_maps)
goto out_no_new_maps;
- for_each_possible_cpu(cpu) {
+ for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
+ j < nr_ids;) {
/* copy maps belonging to foreign traffic classes */
- for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
+ for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
/* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->cpu_map[tci]);
- RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+ map = xmap_dereference(dev_maps->attr_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
}
/* We need to explicitly update tci as prevous loop
* could break out early if dev_maps is NULL.
*/
- tci = cpu * num_tc + tc;
+ tci = j * num_tc + tc;
- if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
- /* add queue to CPU maps */
+ if (attr_test_mask(j, mask, nr_ids) &&
+ attr_test_online(j, online_mask, nr_ids)) {
+ /* add tx-queue to CPU/rx-queue maps */
int pos = 0;
- map = xmap_dereference(new_dev_maps->cpu_map[tci]);
+ map = xmap_dereference(new_dev_maps->attr_map[tci]);
while ((pos < map->len) && (map->queues[pos] != index))
pos++;
if (pos == map->len)
map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
- if (numa_node_id == -2)
- numa_node_id = cpu_to_node(cpu);
- else if (numa_node_id != cpu_to_node(cpu))
- numa_node_id = -1;
+ if (type == XPS_MAP_CPUS) {
+ if (numa_node_id == -2)
+ numa_node_id = cpu_to_node(j);
+ else if (numa_node_id != cpu_to_node(j))
+ numa_node_id = -1;
+ }
#endif
} else if (dev_maps) {
/* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->cpu_map[tci]);
- RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+ map = xmap_dereference(dev_maps->attr_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
}
/* copy maps belonging to foreign traffic classes */
for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
/* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->cpu_map[tci]);
- RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+ map = xmap_dereference(dev_maps->attr_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
}
}
- rcu_assign_pointer(dev->xps_maps, new_dev_maps);
+ if (type == XPS_MAP_RXQS)
+ rcu_assign_pointer(dev->xps_maps[XPS_MAP_RXQS], new_dev_maps);
+ else if (type == XPS_MAP_CPUS)
+ rcu_assign_pointer(dev->xps_maps[XPS_MAP_CPUS], new_dev_maps);
/* Cleanup old maps */
if (!dev_maps)
goto out_no_old_maps;
- for_each_possible_cpu(cpu) {
- for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
- new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
- map = xmap_dereference(dev_maps->cpu_map[tci]);
+ for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
+ j < nr_ids;) {
+ for (i = num_tc, tci = j * num_tc; i--; tci++) {
+ new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
+ map = xmap_dereference(dev_maps->attr_map[tci]);
if (map && map != new_map)
kfree_rcu(map, rcu);
}
@@ -2317,19 +2366,23 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
active = true;
out_no_new_maps:
- /* update Tx queue numa node */
- netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
- (numa_node_id >= 0) ? numa_node_id :
- NUMA_NO_NODE);
+ if (type == XPS_MAP_CPUS) {
+ /* update Tx queue numa node */
+ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
+ (numa_node_id >= 0) ?
+ numa_node_id : NUMA_NO_NODE);
+ }
if (!dev_maps)
goto out_no_maps;
- /* removes queue from unused CPUs */
- for_each_possible_cpu(cpu) {
- for (i = tc, tci = cpu * num_tc; i--; tci++)
+ /* removes tx-queue from unused CPUs/rx-queues */
+ for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
+ j < nr_ids;) {
+ for (i = tc, tci = j * num_tc; i--; tci++)
active |= remove_xps_queue(dev_maps, tci, index);
- if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
+ if (!attr_test_mask(j, mask, nr_ids) ||
+ !attr_test_online(j, online_mask, nr_ids))
active |= remove_xps_queue(dev_maps, tci, index);
for (i = num_tc - tc, tci++; --i; tci++)
active |= remove_xps_queue(dev_maps, tci, index);
@@ -2337,7 +2390,10 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
/* free map if not active */
if (!active) {
- RCU_INIT_POINTER(dev->xps_maps, NULL);
+ if (type == XPS_MAP_RXQS)
+ RCU_INIT_POINTER(dev->xps_maps[XPS_MAP_RXQS], NULL);
+ else if (type == XPS_MAP_CPUS)
+ RCU_INIT_POINTER(dev->xps_maps[XPS_MAP_CPUS], NULL);
kfree_rcu(dev_maps, rcu);
}
@@ -2347,11 +2403,12 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
return 0;
error:
/* remove any maps that we added */
- for_each_possible_cpu(cpu) {
- for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
- new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
+ for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
+ j < nr_ids;) {
+ for (i = num_tc, tci = j * num_tc; i--; tci++) {
+ new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
map = dev_maps ?
- xmap_dereference(dev_maps->cpu_map[tci]) :
+ xmap_dereference(dev_maps->attr_map[tci]) :
NULL;
if (new_map && new_map != map)
kfree(new_map);
@@ -2363,6 +2420,13 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
kfree(new_dev_maps);
return -ENOMEM;
}
+
+int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
+ u16 index)
+{
+ return __netif_set_xps_queue(dev, cpumask_bits(mask), index,
+ XPS_MAP_CPUS);
+}
EXPORT_SYMBOL(netif_set_xps_queue);
#endif
@@ -3400,7 +3464,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
int queue_index = -1;
rcu_read_lock();
- dev_maps = rcu_dereference(dev->xps_maps);
+ dev_maps = rcu_dereference(dev->xps_maps[XPS_MAP_CPUS]);
if (dev_maps) {
unsigned int tci = skb->sender_cpu - 1;
@@ -3409,7 +3473,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
tci += netdev_get_prio_tc_map(dev, skb->priority);
}
- map = rcu_dereference(dev_maps->cpu_map[tci]);
+ map = rcu_dereference(dev_maps->attr_map[tci]);
if (map) {
if (map->len == 1)
queue_index = map->queues[0];
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index c476f07..d7abd33 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1227,13 +1227,13 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
}
rcu_read_lock();
- dev_maps = rcu_dereference(dev->xps_maps);
+ dev_maps = rcu_dereference(dev->xps_maps[XPS_MAP_CPUS]);
if (dev_maps) {
for_each_possible_cpu(cpu) {
int i, tci = cpu * num_tc + tc;
struct xps_map *map;
- map = rcu_dereference(dev_maps->cpu_map[tci]);
+ map = rcu_dereference(dev_maps->attr_map[tci]);
if (!map)
continue;
^ permalink raw reply related
* [net-next PATCH 0/3] Symmetric queue selection using XPS for Rx queues
From: Amritha Nambiar @ 2018-04-20 1:04 UTC (permalink / raw)
To: netdev, davem
Cc: alexander.h.duyck, amritha.nambiar, sridhar.samudrala, edumazet,
hannes, tom
This patch series implements support for Tx queue selection based on
Rx queue map. This is done by configuring Rx queue map per Tx-queue
using sysfs attribute. If the user configuration for Rx queues does
not apply, then the Tx queue selection falls back to XPS using CPUs and
finally to hashing.
XPS is refactored to support Tx queue selection based on either the
CPU map or the Rx-queue map. The config option CONFIG_XPS needs to be
enabled. By default no receive queues are configured for the Tx queue.
- /sys/class/net/eth0/queues/tx-*/xps_rxqs
This is to enable sending packets on the same Tx-Rx queue pair as this
is useful for busy polling multi-threaded workloads where it is not
possible to pin the threads to a CPU. This is a rework of Sridhar's
patch for symmetric queueing via socket option:
https://www.spinics.net/lists/netdev/msg453106.html
---
Amritha Nambiar (3):
net: Refactor XPS for CPUs and Rx queues
net: Enable Tx queue selection based on Rx queues
net-sysfs: Add interface for Rx queue map per Tx queue
include/linux/netdevice.h | 82 +++++++++++++++
include/net/sock.h | 18 +++
net/core/dev.c | 240 +++++++++++++++++++++++++++++++--------------
net/core/net-sysfs.c | 85 ++++++++++++++++
net/core/sock.c | 5 +
net/ipv4/tcp_input.c | 7 +
net/ipv4/tcp_ipv4.c | 1
net/ipv4/tcp_minisocks.c | 1
8 files changed, 357 insertions(+), 82 deletions(-)
^ permalink raw reply
* Re: [RFC PATCH ghak32 V2 05/13] audit: add containerid support for ptrace and signals
From: Richard Guy Briggs @ 2018-04-20 1:03 UTC (permalink / raw)
To: Paul Moore
Cc: cgroups, containers, linux-api, Linux-Audit Mailing List,
linux-fsdevel, LKML, netdev, ebiederm, luto, jlayton, carlos,
dhowells, viro, simo, Eric Paris, serge
In-Reply-To: <CAHC9VhTy4fX1hYfD5tppbP-fRaVRMXOfeJ=Et96J_rc7Jw12Bw@mail.gmail.com>
On 2018-04-18 20:32, Paul Moore wrote:
> On Fri, Mar 16, 2018 at 5:00 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
> > Add container ID support to ptrace and signals. In particular, the "op"
> > field provides a way to label the auxiliary record to which it is
> > associated.
> >
> > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > ---
> > include/linux/audit.h | 16 +++++++++++-----
> > kernel/audit.c | 12 ++++++++----
> > kernel/audit.h | 2 ++
> > kernel/auditsc.c | 19 +++++++++++++++----
> > 4 files changed, 36 insertions(+), 13 deletions(-)
>
> ...
>
> > diff --git a/kernel/audit.c b/kernel/audit.c
> > index a12f21f..b238be5 100644
> > --- a/kernel/audit.c
> > +++ b/kernel/audit.c
> > @@ -142,6 +142,7 @@ struct audit_net {
> > kuid_t audit_sig_uid = INVALID_UID;
> > pid_t audit_sig_pid = -1;
> > u32 audit_sig_sid = 0;
> > +u64 audit_sig_cid = INVALID_CID;
> >
> > /* Records can be lost in several ways:
> > 0) [suppressed in audit_alloc]
> > @@ -1438,6 +1439,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
> > memcpy(sig_data->ctx, ctx, len);
> > security_release_secctx(ctx, len);
> > }
> > + sig_data->cid = audit_sig_cid;
> > audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
> > sig_data, sizeof(*sig_data) + len);
> > kfree(sig_data);
> > @@ -2051,20 +2053,22 @@ void audit_log_session_info(struct audit_buffer *ab)
> >
> > /*
> > * audit_log_container_info - report container info
> > - * @tsk: task to be recorded
> > * @context: task or local context for record
> > + * @op: containerid string description
> > + * @containerid: container ID to report
> > */
> > -int audit_log_container_info(struct task_struct *tsk, struct audit_context *context)
> > +int audit_log_container_info(struct audit_context *context,
> > + char *op, u64 containerid)
> > {
> > struct audit_buffer *ab;
> >
> > - if (!audit_containerid_set(tsk))
> > + if (!cid_valid(containerid))
> > return 0;
> > /* Generate AUDIT_CONTAINER_INFO with container ID */
> > ab = audit_log_start(context, GFP_KERNEL, AUDIT_CONTAINER_INFO);
> > if (!ab)
> > return -ENOMEM;
> > - audit_log_format(ab, "contid=%llu", audit_get_containerid(tsk));
> > + audit_log_format(ab, "op=%s contid=%llu", op, containerid);
> > audit_log_end(ab);
> > return 0;
> > }
>
> Let's get these changes into the first patch where
> audit_log_container_info() is defined. Why? This inserts a new field
> into the record which is a no-no. Yes, it is one single patchset, but
> they are still separate patches and who knows which patches a given
> distribution and/or tree may decide to backport.
Fair enough. That first thought went through my mind... Would it be
sufficient to move that field addition to the first patch and leave the
rest here to support trace and signals?
> > diff --git a/kernel/auditsc.c b/kernel/auditsc.c
> > index 2bba324..2932ef1 100644
> > --- a/kernel/auditsc.c
> > +++ b/kernel/auditsc.c
> > @@ -113,6 +113,7 @@ struct audit_aux_data_pids {
> > kuid_t target_uid[AUDIT_AUX_PIDS];
> > unsigned int target_sessionid[AUDIT_AUX_PIDS];
> > u32 target_sid[AUDIT_AUX_PIDS];
> > + u64 target_cid[AUDIT_AUX_PIDS];
> > char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN];
> > int pid_count;
> > };
> > @@ -1422,21 +1423,27 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
> > for (aux = context->aux_pids; aux; aux = aux->next) {
> > struct audit_aux_data_pids *axs = (void *)aux;
> >
> > - for (i = 0; i < axs->pid_count; i++)
> > + for (i = 0; i < axs->pid_count; i++) {
> > + char axsn[sizeof("aux0xN ")];
> > +
> > + sprintf(axsn, "aux0x%x", i);
> > if (audit_log_pid_context(context, axs->target_pid[i],
> > axs->target_auid[i],
> > axs->target_uid[i],
> > axs->target_sessionid[i],
> > axs->target_sid[i],
> > - axs->target_comm[i]))
> > + axs->target_comm[i])
> > + && audit_log_container_info(context, axsn, axs->target_cid[i]))
>
> Shouldn't this be an OR instead of an AND?
Yes. Bash-brain...
> > call_panic = 1;
> > + }
> > }
> >
> > if (context->target_pid &&
> > audit_log_pid_context(context, context->target_pid,
> > context->target_auid, context->target_uid,
> > context->target_sessionid,
> > - context->target_sid, context->target_comm))
> > + context->target_sid, context->target_comm)
> > + && audit_log_container_info(context, "target", context->target_cid))
>
> Same question.
Yes.
> > call_panic = 1;
> >
> > if (context->pwd.dentry && context->pwd.mnt) {
>
> --
> paul moore
> www.paul-moore.com
- RGB
--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
^ permalink raw reply
* Re: [PATCH net-next 4/5] tcp: implement mmap() for zero copy receive
From: Eric Dumazet @ 2018-04-20 1:01 UTC (permalink / raw)
To: Eric Dumazet, David S . Miller
Cc: netdev, Neal Cardwell, Yuchung Cheng, Soheil Hassas Yeganeh
In-Reply-To: <7a961ead-e77d-7334-3c29-399e071670fb@gmail.com>
On 04/19/2018 04:15 PM, Eric Dumazet wrote:
> I am not sure we can keep mmap() API, since we probably need to first lock the socket,
> then grab vm semaphore.
>
We can keep mmap() nice interface, granted we can add one hook like in following patch.
David, do you think such patch would be acceptable by lkml and mm/fs maintainers ?
Alternative would be implementing an ioctl() or getsockopt() operation,
but it seems less natural...
Thanks !
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 92efaf1f89775f7b017477617dd983c10e0dc4d2..016c711ac33e226b4285ee5bd688e14661dc0879 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1714,6 +1714,7 @@ struct file_operations {
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
+ void (*mmap_hook) (struct file *, bool);
unsigned long mmap_supported_flags;
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *, fl_owner_t id);
diff --git a/mm/util.c b/mm/util.c
index 1fc4fa7576f762bbbf341f056ca6d0be803a423f..b546c59a6169c4dfa9011c61e86da4d03496aa4d 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -350,11 +350,20 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
ret = security_mmap_file(file, prot, flag);
if (!ret) {
- if (down_write_killable(&mm->mmap_sem))
+ void (*mmap_hook)(struct file *, bool) = file ? file->f_op->mmap_hook : NULL;
+
+ if (mmap_hook)
+ mmap_hook(file, true);
+ if (down_write_killable(&mm->mmap_sem)) {
+ if (mmap_hook)
+ mmap_hook(file, false);
return -EINTR;
+ }
ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
&populate, &uf);
up_write(&mm->mmap_sem);
+ if (mmap_hook)
+ mmap_hook(file, false);
userfaultfd_unmap_complete(mm, &uf);
if (populate)
mm_populate(ret, populate);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4022073b0aeea9d07af0fa825b640a00512908a3..79b05d6d41643e8c309dfb8bd9597dc8b00fb0e1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1756,8 +1756,6 @@ int tcp_mmap(struct file *file, struct socket *sock,
/* TODO: Maybe the following is not needed if pages are COW */
vma->vm_flags &= ~VM_MAYWRITE;
- lock_sock(sk);
-
ret = -ENOTCONN;
if (sk->sk_state == TCP_LISTEN)
goto out;
@@ -1833,7 +1831,6 @@ int tcp_mmap(struct file *file, struct socket *sock,
ret = 0;
out:
- release_sock(sk);
kvfree(pages_array);
return ret;
}
diff --git a/net/socket.c b/net/socket.c
index f10f1d947c78c193b49379b0ec641d81367fb4cf..bcabae3c37d765e5c0548a14fc93c19258972b48 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -131,6 +131,16 @@ static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags);
+static void sock_mmap_hook(struct file *file, bool enter)
+{
+ struct socket *sock = file->private_data;
+ struct sock *sk = sock->sk;
+
+ if (enter)
+ lock_sock(sk);
+ else
+ release_sock(sk);
+}
/*
* Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
* in the operation structures but are done directly via the socketcall() multiplexor.
@@ -147,6 +157,7 @@ static const struct file_operations socket_file_ops = {
.compat_ioctl = compat_sock_ioctl,
#endif
.mmap = sock_mmap,
+ .mmap_hook = sock_mmap_hook,
.release = sock_close,
.fasync = sock_fasync,
.sendpage = sock_sendpage,
^ permalink raw reply related
* Re: [pci PATCH v7 0/5] Add support for unmanaged SR-IOV
From: Michael S. Tsirkin @ 2018-04-20 0:46 UTC (permalink / raw)
To: Alexander Duyck
Cc: Bjorn Helgaas, Duyck, Alexander H, linux-pci, virtio-dev, kvm,
Netdev, Daly, Dan, LKML, linux-nvme, Keith Busch, netanel,
Don Dutile, Maximilian Heyne, Wang, Liang-min, Rustad, Mark D,
David Woodhouse, Christoph Hellwig, dwmw
In-Reply-To: <CAKgT0UdTANRo3xnr89aWdfSPmMg81-W2H-ZcJUdC=5nUkf9RAw@mail.gmail.com>
On Thu, Apr 19, 2018 at 03:54:49PM -0700, Alexander Duyck wrote:
> On Thu, Mar 15, 2018 at 11:40 AM, Alexander Duyck
> <alexander.duyck@gmail.com> wrote:
> > This series is meant to add support for SR-IOV on devices when the VFs are
> > not managed by the kernel. Examples of recent patches attempting to do this
> > include:
> > virto - https://patchwork.kernel.org/patch/10241225/
> > pci-stub - https://patchwork.kernel.org/patch/10109935/
> > vfio - https://patchwork.kernel.org/patch/10103353/
> > uio - https://patchwork.kernel.org/patch/9974031/
> >
> > Since this is quickly blowing up into a multi-driver problem it is probably
> > best to implement this solution as generically as possible.
> >
> > This series is an attempt to do that. What we do with this patch set is
> > provide a generic framework to enable SR-IOV in the case that the PF driver
> > doesn't support managing the VFs itself.
> >
> > I based my patch set originally on the patch by Mark Rustad but there isn't
> > much left after going through and cleaning out the bits that were no longer
> > needed, and after incorporating the feedback from David Miller. At this point
> > the only items to be fully reused was his patch description which is now
> > present in patch 3 of the set.
> >
> > This solution is limited in scope to just adding support for devices that
> > provide no functionality for SR-IOV other than allocating the VFs by
> > calling pci_enable_sriov. Previous sets had included patches for VFIO, but
> > for now I am dropping that as the scope of that work is larger then I
> > think I can take on at this time.
> >
> > v2: Reduced scope back to just virtio_pci and vfio-pci
> > Broke into 3 patch set from single patch
> > Changed autoprobe behavior to always set when num_vfs is set non-zero
> > v3: Updated Documentation to clarify when sriov_unmanaged_autoprobe is used
> > Wrapped vfio_pci_sriov_configure to fix build errors w/o SR-IOV in kernel
> > v4: Dropped vfio-pci patch
> > Added ena and nvme to drivers now using pci_sriov_configure_unmanaged
> > Dropped pci_disable_sriov call in virtio_pci to be consistent with ena
> > v5: Dropped sriov_unmanaged_autoprobe and pci_sriov_conifgure_unmanaged
> > Added new patch that enables pci_sriov_configure_simple
> > Updated drivers to use pci_sriov_configure_simple
> > v6: Defined pci_sriov_configure_simple as NULL when SR-IOV is not enabled
> > Updated drivers to drop "#ifdef" checks for IOV
> > Added pci-pf-stub as place for PF-only drivers to add support
> > v7: Dropped pci_id table explanation from pci-pf-stub driver
> > Updated pci_sriov_configure_simple to drop need for err value
> > Fixed comment explaining why pci_sriov_configure_simple is NULL
> >
>
> Just following up since this has been sitting in patchwork for just
> over a month now
> (https://patchwork.ozlabs.org/project/linux-pci/list/?series=34034).
> I'm just wondering what the expectation is on getting these pulled
> into the pci tree? I'm assuming that is the best place for these
> patches. Are there any concerns I still need to address or are these
> going to be pulled in at some point, and if so is there any ETA on
> when that will be?
>
> Thanks.
>
> - Alex
Sorry I didn't notice you had more questions. I have responded
hopefully explaining my concerns. Summary:
- For virtio we should add this with a feature bit.
- I am worried about security of this for the stub, but I am
not the maintainer there.
--
MST
^ permalink raw reply
* Re: [RFC PATCH ghak32 V2 10/13] audit: add containerid support for seccomp and anom_abend records
From: Richard Guy Briggs @ 2018-04-20 0:42 UTC (permalink / raw)
To: Paul Moore
Cc: simo, jlayton, carlos, linux-api, containers, LKML, Eric Paris,
dhowells, Linux-Audit Mailing List, ebiederm, luto, netdev,
linux-fsdevel, cgroups, serge, viro
In-Reply-To: <CAHC9VhS6MKoLkzpfcmYBSNnvrtbL2FOF5PX9uOfivSVEWykkQg@mail.gmail.com>
On 2018-04-18 21:31, Paul Moore wrote:
> On Fri, Mar 16, 2018 at 5:00 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
> > Add container ID auxiliary records to secure computing and abnormal end
> > standalone records.
> >
> > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > ---
> > kernel/auditsc.c | 10 ++++++++--
> > 1 file changed, 8 insertions(+), 2 deletions(-)
> >
> > diff --git a/kernel/auditsc.c b/kernel/auditsc.c
> > index 7103d23..2f02ed9 100644
> > --- a/kernel/auditsc.c
> > +++ b/kernel/auditsc.c
> > @@ -2571,6 +2571,7 @@ static void audit_log_task(struct audit_buffer *ab)
> > void audit_core_dumps(long signr)
> > {
> > struct audit_buffer *ab;
> > + struct audit_context *context = audit_alloc_local();
>
> Looking quickly at do_coredump() I *believe* we can use current here.
>
> > if (!audit_enabled)
> > return;
> > @@ -2578,19 +2579,22 @@ void audit_core_dumps(long signr)
> > if (signr == SIGQUIT) /* don't care for those */
> > return;
> >
> > - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
> > + ab = audit_log_start(context, GFP_KERNEL, AUDIT_ANOM_ABEND);
> > if (unlikely(!ab))
> > return;
> > audit_log_task(ab);
> > audit_log_format(ab, " sig=%ld res=1", signr);
> > audit_log_end(ab);
> > + audit_log_container_info(context, "abend", audit_get_containerid(current));
> > + audit_free_context(context);
> > }
> >
> > void __audit_seccomp(unsigned long syscall, long signr, int code)
> > {
> > struct audit_buffer *ab;
> > + struct audit_context *context = audit_alloc_local();
>
> We can definitely use current here.
Ok, so both syscall aux records. That elimintes this patch from the
set, can go in independently.
> > - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_SECCOMP);
> > + ab = audit_log_start(context, GFP_KERNEL, AUDIT_SECCOMP);
> > if (unlikely(!ab))
> > return;
> > audit_log_task(ab);
> > @@ -2598,6 +2602,8 @@ void __audit_seccomp(unsigned long syscall, long signr, int code)
> > signr, syscall_get_arch(), syscall,
> > in_compat_syscall(), KSTK_EIP(current), code);
> > audit_log_end(ab);
> > + audit_log_container_info(context, "seccomp", audit_get_containerid(current));
> > + audit_free_context(context);
> > }
> >
> > struct list_head *audit_killed_trees(void)
>
> --
> paul moore
> www.paul-moore.com
>
> --
> Linux-audit mailing list
> Linux-audit@redhat.com
> https://www.redhat.com/mailman/listinfo/linux-audit
- RGB
--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
^ permalink raw reply
* Re: [pci PATCH v7 2/5] virtio_pci: Add support for unmanaged SR-IOV on virtio_pci devices
From: Michael S. Tsirkin @ 2018-04-20 0:40 UTC (permalink / raw)
To: Alexander Duyck
Cc: Daly, Dan, Bjorn Helgaas, Duyck, Alexander H, linux-pci,
virtio-dev, kvm, Netdev, LKML, linux-nvme, Keith Busch, netanel,
Don Dutile, Maximilian Heyne, Wang, Liang-min, Rustad, Mark D,
David Woodhouse, Christoph Hellwig, dwmw
In-Reply-To: <CAKgT0Ude79FYrK4qA0OKRJ1NackyqPi-hZ8Zh3WSdLDFxoOosQ@mail.gmail.com>
On Tue, Apr 03, 2018 at 12:06:03PM -0700, Alexander Duyck wrote:
> On Tue, Apr 3, 2018 at 11:27 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Tue, Apr 03, 2018 at 10:32:00AM -0700, Alexander Duyck wrote:
> >> On Tue, Apr 3, 2018 at 6:12 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> >> > On Fri, Mar 16, 2018 at 09:40:34AM -0700, Alexander Duyck wrote:
> >> >> On Fri, Mar 16, 2018 at 9:34 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> >> >> > On Thu, Mar 15, 2018 at 11:42:41AM -0700, Alexander Duyck wrote:
> >> >> >> From: Alexander Duyck <alexander.h.duyck@intel.com>
> >> >> >>
> >> >> >> Hardware-realized virtio_pci devices can implement SR-IOV, so this
> >> >> >> patch enables its use. The device in question is an upcoming Intel
> >> >> >> NIC that implements both a virtio_net PF and virtio_net VFs. These
> >> >> >> are hardware realizations of what has been up to now been a software
> >> >> >> interface.
> >> >> >>
> >> >> >> The device in question has the following 4-part PCI IDs:
> >> >> >>
> >> >> >> PF: vendor: 1af4 device: 1041 subvendor: 8086 subdevice: 15fe
> >> >> >> VF: vendor: 1af4 device: 1041 subvendor: 8086 subdevice: 05fe
> >> >> >>
> >> >> >> The patch currently needs no check for device ID, because the callback
> >> >> >> will never be made for devices that do not assert the capability or
> >> >> >> when run on a platform incapable of SR-IOV.
> >> >> >>
> >> >> >> One reason for this patch is because the hardware requires the
> >> >> >> vendor ID of a VF to be the same as the vendor ID of the PF that
> >> >> >> created it. So it seemed logical to simply have a fully-functioning
> >> >> >> virtio_net PF create the VFs. This patch makes that possible.
> >> >> >>
> >> >> >> Reviewed-by: Christoph Hellwig <hch@lst.de>
> >> >> >> Signed-off-by: Mark Rustad <mark.d.rustad@intel.com>
> >> >> >> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> >> >> >
> >> >> > So if and when virtio PFs can manage the VFs, then we can
> >> >> > add a feature bit for that?
> >> >> > Seems reasonable.
> >> >>
> >> >> Yes. If nothing else you may not even need a feature bit depending on
> >> >> how things go.
> >> >
> >> > OTOH if the interface is changed in an incompatible way,
> >> > and old Linux will attempt to drive the new device
> >> > since there is no check.
> >> >
> >> > I think we should add a feature bit right away.
> >>
> >> I'm not sure why you would need a feature bit. The capability is
> >> controlled via PCI configuration space. If it is present the device
> >> has the capability. If it is not then it does not.
> >>
> >> Basically if the PCI configuration space is not present then the sysfs
> >> entries will not be spawned and nothing will attempt to use this
> >> function.
> >>
> >> - ALex
> >
> > It's about compability with older guests which ignore the
> > capability.
> >
> > The feature is thus helpful so host knows whether guest supports VFs.
>
> The thing is if the capability is ignored then the feature isn't used.
> So for SR-IOV it isn't an uncommon thing for there to be drivers for
> the PF floating around that do not support SR-IOV. In such cases
> SR-IOV just isn't used while the hardware could support it.
Right but how come there are VF drivers but PF driver does not
know about these?
And are there PF drivers that intentially do not enable SRIOV
because it's known to be broken in some way?
Case in point I do think virtio want to limit this
depending on a feature bit on general principles
(the principle being that all extensions have feature bits).
There are security implications here - we previously relied on
whitelisting after all.
Wouldn't it be safer to be a bit more careful and update the
actual PF drivers? It's just one line per driver, but it
can be done with an ack by driver maintainer.
If/once we find out all drivers do have it, we can then
change the default.
> I would think in the case of virtio it would be the same kind of
> thing. Basically if SR-IOV is supported by the host then the
> capability would be present. If SR-IOV is supported by the guest then
> it would make use of the capability to spawn VFs. If either the
> capability isn't present, or the driver doesn't use it then you won't
> be able to spawn VFs in the guest.
> Maybe I am missing something. Do you support dynamically changing the
> PCI configuration space for Virtio devices based on the presence of
> feature bits provided by the guest?
No. The point is that IMHO at least virtio - in absence of feature bit -
to ignore VFs rather than assume they are safe to drive
in an unmanaged way.
> Also are you saying this patch set should wait on the feature bit to
> be added, or are you talking about doing this as some sort of
> follow-up?
>
> - Alex
I think for virtio it should include the feature bit, yes.
Adding feature bit is very easy - post a patch to the virtio TC mailing
list, wait about a week to give people time to respond (two weeks if it
is around holidays and such).
--
MST
^ permalink raw reply
* Re: [bpf-next PATCH 3/3] bpf: add sample program to trace map events
From: Alexei Starovoitov @ 2018-04-20 0:27 UTC (permalink / raw)
To: Sebastiano Miano
Cc: netdev, ast, daniel, mingo, rostedt, brouer, fulvio.risso,
David S. Miller
In-Reply-To: <152406545918.3465.14253635905960610284.stgit@localhost.localdomain>
On Wed, Apr 18, 2018 at 05:30:59PM +0200, Sebastiano Miano wrote:
> This patch adds a sample program, called trace_map_events,
> that shows how to capture map events and filter them based on
> the map id.
...
> +struct bpf_map_keyval_ctx {
> + u64 pad; // First 8 bytes are not accessible by bpf code
> + u32 type; // offset:8; size:4; signed:0;
> + u32 key_len; // offset:12; size:4; signed:0;
> + u32 key; // offset:16; size:4; signed:0;
> + bool key_trunc; // offset:20; size:1; signed:0;
> + u32 val_len; // offset:24; size:4; signed:0;
> + u32 val; // offset:28; size:4; signed:0;
> + bool val_trunc; // offset:32; size:1; signed:0;
> + int ufd; // offset:36; size:4; signed:1;
> + u32 id; // offset:40; size:4; signed:0;
> +};
> +
> +SEC("tracepoint/bpf/bpf_map_lookup_elem")
> +int trace_bpf_map_lookup_elem(struct bpf_map_keyval_ctx *ctx)
> +{
> + struct map_event_data data;
> + int cpu = bpf_get_smp_processor_id();
> + bool *filter;
> + u32 key = 0, map_id = ctx->id;
> +
> + filter = bpf_map_lookup_elem(&filter_events, &key);
> + if (!filter)
> + return 1;
> +
> + if (!*filter)
> + goto send_event;
> +
> + /*
> + * If the map_id is not in the list of filtered
> + * ids we immediately return
> + */
> + if (!bpf_map_lookup_elem(&filtered_ids, &map_id))
> + return 0;
> +
> +send_event:
> + data.map_id = map_id;
> + data.evnt_type = MAP_LOOKUP;
> + data.map_type = ctx->type;
> +
> + bpf_perf_event_output(ctx, &map_event_trace, cpu, &data, sizeof(data));
> + return 0;
> +}
looks like the purpose of the series is to create map notify mechanism
so some external user space daemon can snoop all bpf map operations
that all other processes and bpf programs are doing.
I think it would be way better to create a proper mechanism for that
with permissions.
tracepoints in the bpf core were introduced as introspection mechanism
for debugging. Right now we have better ways to do introspection
with ids, queries, etc that bpftool is using, so original purpose of
those tracepoints is gone and they actually rot.
Let's not repurpose them into this map notify logic.
I don't want tracepoints in the bpf core to become a stable interface
it will stiffen the development.
^ permalink raw reply
* Re: [PATCH bpf-next v5 00/10] BTF: BPF Type Format
From: Daniel Borkmann @ 2018-04-19 23:57 UTC (permalink / raw)
To: Martin KaFai Lau, netdev
Cc: Alexei Starovoitov, kernel-team, Arnaldo Carvalho de Melo
In-Reply-To: <20180418225606.2771620-1-kafai@fb.com>
On 04/19/2018 12:55 AM, Martin KaFai Lau wrote:
> This patch introduces BPF Type Format (BTF).
>
> BTF (BPF Type Format) is the meta data format which describes
> the data types of BPF program/map. Hence, it basically focus
> on the C programming language which the modern BPF is primary
> using. The first use case is to provide a generic pretty print
> capability for a BPF map.
>
> A modified pahole that can convert dwarf to BTF is here:
> https://github.com/iamkafai/pahole/tree/btf
> (Arnaldo, there is some BTF_KIND numbering changes on
> Apr 18th, d61426c1571)
>
> Please see individual patch for details.
>
> v5:
> - Remove BTF_KIND_FLOAT and BTF_KIND_FUNC which are not
> currently used. They can be added in the future.
> Some bpf_df_xxx() are removed together.
> - Add comment in patch 7 to clarify that the new bpffs_map_fops
> should not be extended further.
>
> v4:
> - Fix warning (remove unneeded semicolon)
> - Remove a redundant variable (nr_bytes) from btf_int_check_meta() in
> patch 1. Caught by W=1.
>
> v3:
> - Rebase to bpf-next
> - Fix sparse warning (by adding static)
> - Add BTF header logging: btf_verifier_log_hdr()
> - Fix the alignment test on btf->type_off
> - Add tests for the BTF header
> - Lower the max BTF size to 16MB. It should be enough
> for some time. We could raise it later if it would
> be needed.
>
> v2:
> - Use kvfree where needed in patch 1 and 2
> - Also consider BTF_INT_OFFSET() in the btf_int_check_meta()
> in patch 1
> - Fix an incorrect goto target in map_create() during
> the btf-error-path in patch 7
> - re-org some local vars to keep the rev xmas tree in btf.c
Series applied to bpf-next, thanks Martin. As discussed please follow up
with the bpftool patches.
Thanks,
Daniel
^ permalink raw reply
* Re: [PATCH bpf-next v2 9/9] tools/bpf: add a test_progs test case for bpf_get_stack helper
From: Yonghong Song @ 2018-04-19 23:42 UTC (permalink / raw)
To: Alexei Starovoitov; +Cc: ast, daniel, netdev, kernel-team
In-Reply-To: <20180419043953.fcv33e2glomg33gp@ast-mbp>
On 4/18/18 9:39 PM, Alexei Starovoitov wrote:
> On Wed, Apr 18, 2018 at 09:54:44AM -0700, Yonghong Song wrote:
>> The test_stacktrace_map is enhanced to call bpf_get_stack
>> in the helper to get the stack trace as well.
>> The stack traces from bpf_get_stack and bpf_get_stackid
>> are compared to ensure that for the same stack as
>> represented as the same hash, their ip addresses
>> must be the same.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>
> could you please add a test for bpf_get_stack() with buildid as well?
> I think patch 2 implementes it correctly, but would be good to have a test for it.
Right. Will improve the test to cover buildid as well.
^ permalink raw reply
* Re: [PATCH bpf-next v2 7/9] samples/bpf: add a test for bpf_get_stack helper
From: Yonghong Song @ 2018-04-19 23:42 UTC (permalink / raw)
To: Alexei Starovoitov; +Cc: ast, daniel, netdev, kernel-team
In-Reply-To: <20180419043745.a23qak7peaurmiqg@ast-mbp>
On 4/18/18 9:37 PM, Alexei Starovoitov wrote:
> On Wed, Apr 18, 2018 at 09:54:42AM -0700, Yonghong Song wrote:
>> The test attached a kprobe program to kernel function sys_write.
>> It tested to get stack for user space, kernel space and user
>> space with build_id request. It also tested to get user
>> and kernel stack into the same buffer with back-to-back
>> bpf_get_stack helper calls.
>>
>> Whenever the kernel stack is available, the user space
>> application will check to ensure that sys_write/SyS_write
>> is part of the stack.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>> samples/bpf/Makefile | 4 +
>> samples/bpf/trace_get_stack_kern.c | 86 +++++++++++++++++++++
>> samples/bpf/trace_get_stack_user.c | 150 +++++++++++++++++++++++++++++++++++++
>> 3 files changed, 240 insertions(+)
>
> since perf_read is being refactored out of trace_output_user.c in the previous patch
> please move it to selftests (instead of bpf_load.c) and move
> this whole test to selftests as well.
I put it here since I am attaching to a kprobe so that I can compare
address. I guess I can still do it by attaching to a kernel tracepoint.
Will move the tests to selftests as suggested.
^ permalink raw reply
* Re: [PATCH bpf-next v2 4/9] bpf/verifier: improve register value range tracking with ARSH
From: Yonghong Song @ 2018-04-19 23:39 UTC (permalink / raw)
To: Alexei Starovoitov; +Cc: ast, daniel, netdev, kernel-team
In-Reply-To: <20180419043511.n65ryn5twzcfyp2f@ast-mbp>
On 4/18/18 9:35 PM, Alexei Starovoitov wrote:
> On Wed, Apr 18, 2018 at 09:54:39AM -0700, Yonghong Song wrote:
>> When helpers like bpf_get_stack returns an int value
>> and later on used for arithmetic computation, the LSH and ARSH
>> operations are often required to get proper sign extension into
>> 64-bit. For example, without this patch:
>> 54: R0=inv(id=0,umax_value=800)
>> 54: (bf) r8 = r0
>> 55: R0=inv(id=0,umax_value=800) R8_w=inv(id=0,umax_value=800)
>> 55: (67) r8 <<= 32
>> 56: R8_w=inv(id=0,umax_value=3435973836800,var_off=(0x0; 0x3ff00000000))
>> 56: (c7) r8 s>>= 32
>> 57: R8=inv(id=0)
>> With this patch:
>> 54: R0=inv(id=0,umax_value=800)
>> 54: (bf) r8 = r0
>> 55: R0=inv(id=0,umax_value=800) R8_w=inv(id=0,umax_value=800)
>> 55: (67) r8 <<= 32
>> 56: R8_w=inv(id=0,umax_value=3435973836800,var_off=(0x0; 0x3ff00000000))
>> 56: (c7) r8 s>>= 32
>> 57: R8=inv(id=0, umax_value=800,var_off=(0x0; 0x3ff))
>> With better range of "R8", later on when "R8" is added to other register,
>> e.g., a map pointer or scalar-value register, the better register
>> range can be derived and verifier failure may be avoided.
>>
>> In our later example,
>> ......
>> usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
>> if (usize < 0)
>> return 0;
>> ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0);
>> ......
>> Without improving ARSH value range tracking, the register representing
>> "max_len - usize" will have smin_value equal to S64_MIN and will be
>> rejected by verifier.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>> kernel/bpf/verifier.c | 1 +
>> 1 file changed, 1 insertion(+)
>>
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index a8302c3..6148d31 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -2944,6 +2944,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
>> __update_reg_bounds(dst_reg);
>> break;
>> case BPF_RSH:
>> + case BPF_ARSH:
>
> I don't think that's correct.
> The code further down is very RSH specific.
Okay, I may need to introduce tnum_arshift then.
>
>> if (umax_val >= insn_bitness) {
>> /* Shifts greater than 31 or 63 are undefined.
>> * This includes shifts by a negative number.
>> --
>> 2.9.5
>>
^ permalink raw reply
* Re: [PATCH bpf-next v2 3/9] bpf/verifier: refine retval R0 state for bpf_get_stack helper
From: Yonghong Song @ 2018-04-19 23:37 UTC (permalink / raw)
To: Alexei Starovoitov; +Cc: ast, daniel, netdev, kernel-team
In-Reply-To: <20180419043322.zmwxapw3vcimlgg6@ast-mbp>
On 4/18/18 9:33 PM, Alexei Starovoitov wrote:
> On Wed, Apr 18, 2018 at 09:54:38AM -0700, Yonghong Song wrote:
>> The special property of return values for helpers bpf_get_stack
>> and bpf_probe_read_str are captured in verifier.
>> Both helpers return a negative error code or
>> a length, which is equal to or smaller than the buffer
>> size argument. This additional information in the
>> verifier can avoid the condition such as "retval > bufsize"
>> in the bpf program. For example, for the code blow,
>> usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
>> if (usize < 0 || usize > max_len)
>> return 0;
>> The verifier may have the following errors:
>> 52: (85) call bpf_get_stack#65
>> R0=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R1_w=ctx(id=0,off=0,imm=0)
>> R2_w=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R3_w=inv800 R4_w=inv256
>> R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
>> R9_w=inv800 R10=fp0,call_-1
>> 53: (bf) r8 = r0
>> 54: (bf) r1 = r8
>> 55: (67) r1 <<= 32
>> 56: (bf) r2 = r1
>> 57: (77) r2 >>= 32
>> 58: (25) if r2 > 0x31f goto pc+33
>> R0=inv(id=0) R1=inv(id=0,smax_value=9223372032559808512,
>> umax_value=18446744069414584320,
>> var_off=(0x0; 0xffffffff00000000))
>> R2=inv(id=0,umax_value=799,var_off=(0x0; 0x3ff))
>> R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
>> R8=inv(id=0) R9=inv800 R10=fp0,call_-1
>> 59: (1f) r9 -= r8
>> 60: (c7) r1 s>>= 32
>> 61: (bf) r2 = r7
>> 62: (0f) r2 += r1
>> math between map_value pointer and register with unbounded
>> min value is not allowed
>> The failure is due to llvm compiler optimization where register "r2",
>> which is a copy of "r1", is tested for condition while later on "r1"
>> is used for map_ptr operation. The verifier is not able to track such
>> inst sequence effectively.
>>
>> Without the "usize > max_len" condition, there is no llvm optimization
>> and the below generated code passed verifier:
>> 52: (85) call bpf_get_stack#65
>> R0=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R1_w=ctx(id=0,off=0,imm=0)
>> R2_w=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R3_w=inv800 R4_w=inv256
>> R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
>> R9_w=inv800 R10=fp0,call_-1
>> 53: (b7) r1 = 0
>> 54: (bf) r8 = r0
>> 55: (67) r8 <<= 32
>> 56: (c7) r8 s>>= 32
>> 57: (6d) if r1 s> r8 goto pc+24
>> R0=inv(id=0,umax_value=800) R1=inv0 R6=ctx(id=0,off=0,imm=0)
>> R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
>> R8=inv(id=0,umax_value=800,var_off=(0x0; 0x3ff)) R9=inv800
>> R10=fp0,call_-1
>> 58: (bf) r2 = r7
>> 59: (0f) r2 += r8
>> 60: (1f) r9 -= r8
>> 61: (bf) r1 = r6
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>> kernel/bpf/verifier.c | 31 ++++++++++++++++++++++++++++++-
>> 1 file changed, 30 insertions(+), 1 deletion(-)
>>
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index aba9425..a8302c3 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -2333,10 +2333,32 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
>> return 0;
>> }
>>
>> +static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
>> + int func_id,
>> + struct bpf_reg_state *retval_state,
>> + bool is_check)
>> +{
>> + struct bpf_reg_state *src_reg, *dst_reg;
>> +
>> + if (ret_type != RET_INTEGER ||
>> + (func_id != BPF_FUNC_get_stack &&
>> + func_id != BPF_FUNC_probe_read_str))
>> + return;
>> +
>> + dst_reg = is_check ? retval_state : ®s[BPF_REG_0];
>> + if (func_id == BPF_FUNC_get_stack)
>> + src_reg = is_check ? ®s[BPF_REG_3] : retval_state;
>> + else
>> + src_reg = is_check ? ®s[BPF_REG_2] : retval_state;
>> +
>> + dst_reg->smax_value = src_reg->smax_value;
>> + dst_reg->umax_value = src_reg->umax_value;
>> +}
>
> I think this part can be made more generic, by using 'meta' logic.
> check_func_arg(.. &meta);
> can remember smax/umax into meta for arg_type_is_mem_size()
> and later refine_retval_range() can be applied to r0.
> This will help avoid mistakes with specifying reg by position (r2 or r3)
> like above snippet is doing.
Good suggestion. Let me try this.
>
>> +
>> static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
>> {
>> const struct bpf_func_proto *fn = NULL;
>> - struct bpf_reg_state *regs;
>> + struct bpf_reg_state *regs, retval_state;
>> struct bpf_call_arg_meta meta;
>> bool changes_data;
>> int i, err;
>> @@ -2415,6 +2437,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
>> }
>>
>> regs = cur_regs(env);
>> +
>> + /* before reset caller saved regs, check special ret value */
>> + do_refine_retval_range(regs, fn->ret_type, func_id, &retval_state, 1);
>> +
>> /* reset caller saved regs */
>> for (i = 0; i < CALLER_SAVED_REGS; i++) {
>> mark_reg_not_init(env, regs, caller_saved[i]);
>> @@ -2456,6 +2482,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
>> return -EINVAL;
>> }
>>
>> + /* apply additional constraints to ret value */
>> + do_refine_retval_range(regs, fn->ret_type, func_id, &retval_state, 0);
>> +
>> err = check_map_func_compatibility(env, meta.map_ptr, func_id);
>> if (err)
>> return err;
>> --
>> 2.9.5
>>
^ permalink raw reply
* Re: [PATCH] kvmalloc: always use vmalloc if CONFIG_DEBUG_VM
From: Andrew Morton @ 2018-04-19 23:22 UTC (permalink / raw)
To: Mikulas Patocka
Cc: David Miller, linux-mm, eric.dumazet, edumazet, bhutchings,
netdev, linux-kernel, mst, jasowang, virtualization, dm-devel,
Vlastimil Babka
In-Reply-To: <alpine.LRH.2.02.1804191716100.10099@file01.intranet.prod.int.rdu2.redhat.com>
On Thu, 19 Apr 2018 17:19:20 -0400 (EDT) Mikulas Patocka <mpatocka@redhat.com> wrote:
> > > In order to detect these bugs reliably I submit this patch that changes
> > > kvmalloc to always use vmalloc if CONFIG_DEBUG_VM is turned on.
> > >
> > > ...
> > >
> > > --- linux-2.6.orig/mm/util.c 2018-04-18 15:46:23.000000000 +0200
> > > +++ linux-2.6/mm/util.c 2018-04-18 16:00:43.000000000 +0200
> > > @@ -395,6 +395,7 @@ EXPORT_SYMBOL(vm_mmap);
> > > */
> > > void *kvmalloc_node(size_t size, gfp_t flags, int node)
> > > {
> > > +#ifndef CONFIG_DEBUG_VM
> > > gfp_t kmalloc_flags = flags;
> > > void *ret;
> > >
> > > @@ -426,6 +427,7 @@ void *kvmalloc_node(size_t size, gfp_t f
> > > */
> > > if (ret || size <= PAGE_SIZE)
> > > return ret;
> > > +#endif
> > >
> > > return __vmalloc_node_flags_caller(size, node, flags,
> > > __builtin_return_address(0));
> >
> > Well, it doesn't have to be done at compile-time, does it? We could
> > add a knob (in debugfs, presumably) which enables this at runtime.
> > That's far more user-friendly.
>
> But who will turn it on in debugfs?
But who will turn it on in Kconfig? Just a handful of developers. We
could add SONFIG_DEBUG_SG to the list in
Documentation/process/submit-checklist.rst, but nobody reads that.
Also, a whole bunch of defconfigs set CONFIG_DEBUG_SG=y and some
googling indicates that they aren't the only ones...
> It should be default for debugging
> kernels, so that users using them would report the error.
Well. This isn't the first time we've wanted to enable expensive (or
noisy) debugging things in debug kernels, by any means.
So how could we define a debug kernel in which it's OK to enable such
things?
- Could be "it's an -rc kernel". But then we'd be enabling a bunch of
untested code when Linus cuts a release.
- Could be "it's an -rc kernel with SUBLEVEL <= 5". But then we risk
unexpected things happening when Linux cuts -rc6, which still isn't
good.
- How about "it's an -rc kernel with odd-numbered SUBLEVEL and
SUBLEVEL <= 5". That way everybody who runs -rc1, -rc3 and -rc5 will
have kvmalloc debugging enabled. That's potentially nasty because
vmalloc is much slower than kmalloc. But kvmalloc() is only used for
large and probably infrequent allocations, so it's probably OK.
I wonder how we get at SUBLEVEL from within .c.
^ permalink raw reply
* Re: [PATCH net-next 4/5] tcp: implement mmap() for zero copy receive
From: Eric Dumazet @ 2018-04-19 23:15 UTC (permalink / raw)
To: Eric Dumazet, David S . Miller
Cc: netdev, Neal Cardwell, Yuchung Cheng, Soheil Hassas Yeganeh
In-Reply-To: <20180416173339.6310-5-edumazet@google.com>
On 04/16/2018 10:33 AM, Eric Dumazet wrote:
> Some networks can make sure TCP payload can exactly fit 4KB pages,
> with well chosen MSS/MTU and architectures.
>
> Implement mmap() system call so that applications can avoid
> copying data without complex splice() games.
>
> Note that a successful mmap( X bytes) on TCP socket is consuming
> bytes, as if recvmsg() has been done. (tp->copied += X)
>
Oh well, I should have run this code with LOCKDEP enabled :/
[ 974.320412] ======================================================
[ 974.326631] WARNING: possible circular locking dependency detected
[ 974.332816] 4.16.0-dbx-DEV #40 Not tainted
[ 974.336927] ------------------------------------------------------
[ 974.343107] b78299096/15790 is trying to acquire lock:
[ 974.348246] 000000006074c9cf (sk_lock-AF_INET6){+.+.}, at: tcp_mmap+0x7c/0x550
[ 974.355505]
but task is already holding lock:
[ 974.361366] 000000008dbe063b (&mm->mmap_sem){++++}, at: vm_mmap_pgoff+0x99/0x100
[ 974.368801]
which lock already depends on the new lock.
[ 974.377010]
the existing dependency chain (in reverse order) is:
[ 974.384501]
-> #1 (&mm->mmap_sem){++++}:
[ 974.389911] __might_fault+0x68/0x90
[ 974.394025] _copy_from_user+0x23/0xa0
[ 974.398311] sock_setsockopt+0x4a2/0xac0
[ 974.402761] __sys_setsockopt+0xd9/0xf0
[ 974.407118] SyS_setsockopt+0xe/0x20
[ 974.411242] do_syscall_64+0x6e/0x1a0
[ 974.415431] entry_SYSCALL_64_after_hwframe+0x42/0xb7
[ 974.421011]
-> #0 (sk_lock-AF_INET6){+.+.}:
[ 974.426690] lock_acquire+0x95/0x1e0
[ 974.430813] lock_sock_nested+0x71/0xa0
[ 974.435196] tcp_mmap+0x7c/0x550
[ 974.438940] sock_mmap+0x23/0x30
[ 974.442695] mmap_region+0x3a4/0x5d0
[ 974.446808] do_mmap+0x313/0x530
[ 974.450571] vm_mmap_pgoff+0xc7/0x100
[ 974.454769] ksys_mmap_pgoff+0x1d5/0x260
[ 974.459247] SyS_mmap+0x1b/0x30
[ 974.462936] do_syscall_64+0x6e/0x1a0
[ 974.467114] entry_SYSCALL_64_after_hwframe+0x42/0xb7
[ 974.472678]
other info that might help us debug this:
[ 974.480677] Possible unsafe locking scenario:
[ 974.486600] CPU0 CPU1
[ 974.491152] ---- ----
[ 974.495684] lock(&mm->mmap_sem);
[ 974.499089] lock(sk_lock-AF_INET6);
[ 974.505285] lock(&mm->mmap_sem);
[ 974.511211] lock(sk_lock-AF_INET6);
[ 974.514885]
*** DEADLOCK ***
[ 974.520825] 1 lock held by b78299096/15790:
[ 974.525018] #0: 000000008dbe063b (&mm->mmap_sem){++++}, at: vm_mmap_pgoff+0x99/0x100
[ 974.532852]
stack backtrace:
[ 974.537224] CPU: 25 PID: 15790 Comm: b78299096 Not tainted 4.16.0-dbx-DEV #40
[ 974.544371] Hardware name: Intel RML,PCH/Iota_QC_19, BIOS 2.40.0 06/22/2016
[ 974.551333] Call Trace:
[ 974.553792] dump_stack+0x70/0xa5
[ 974.557111] print_circular_bug.isra.39+0x1d8/0x1e6
[ 974.561982] __lock_acquire+0x1284/0x1340
[ 974.565992] ? tcp_mmap+0x7c/0x550
[ 974.569419] lock_acquire+0x95/0x1e0
[ 974.573011] ? lock_acquire+0x95/0x1e0
[ 974.576767] ? tcp_mmap+0x7c/0x550
[ 974.580167] lock_sock_nested+0x71/0xa0
[ 974.584023] ? tcp_mmap+0x7c/0x550
[ 974.587437] tcp_mmap+0x7c/0x550
[ 974.590677] sock_mmap+0x23/0x30
[ 974.593909] mmap_region+0x3a4/0x5d0
[ 974.597506] do_mmap+0x313/0x530
[ 974.600749] vm_mmap_pgoff+0xc7/0x100
[ 974.604414] ksys_mmap_pgoff+0x1d5/0x260
[ 974.608341] ? fd_install+0x25/0x30
[ 974.611849] ? trace_hardirqs_on_caller+0xef/0x180
[ 974.616641] SyS_mmap+0x1b/0x30
[ 974.619804] do_syscall_64+0x6e/0x1a0
[ 974.623462] entry_SYSCALL_64_after_hwframe+0x42/0xb7
[ 974.628549] RIP: 0033:0x433749
[ 974.631600] RSP: 002b:00007ffd29fdb438 EFLAGS: 00000216 ORIG_RAX: 0000000000000009
[ 974.639197] RAX: ffffffffffffffda RBX: 00000000004002e0 RCX: 0000000000433749
[ 974.646323] RDX: 0000000000000008 RSI: 0000000000004000 RDI: 0000000020ab7000
[ 974.653463] RBP: 00007ffd29fdb460 R08: 0000000000000003 R09: 0000000000000000
[ 974.660603] R10: 0000000000000012 R11: 0000000000000216 R12: 0000000000401670
[ 974.667737] R13: 0000000000401700 R14: 0000000000000000 R15: 0000000000000000
I am not sure we can keep mmap() API, since we probably need to first lock the socket,
then grab vm semaphore.
^ permalink raw reply
* Re: [pci PATCH v7 0/5] Add support for unmanaged SR-IOV
From: Alexander Duyck @ 2018-04-19 22:54 UTC (permalink / raw)
To: Bjorn Helgaas, Duyck, Alexander H, linux-pci
Cc: virtio-dev, kvm, Netdev, Daly, Dan, LKML, linux-nvme, Keith Busch,
netanel, Don Dutile, Maximilian Heyne, Wang, Liang-min,
Rustad, Mark D, David Woodhouse, Christoph Hellwig, dwmw,
Michael S. Tsirkin
In-Reply-To: <20180315183449.3102.64791.stgit@localhost.localdomain>
On Thu, Mar 15, 2018 at 11:40 AM, Alexander Duyck
<alexander.duyck@gmail.com> wrote:
> This series is meant to add support for SR-IOV on devices when the VFs are
> not managed by the kernel. Examples of recent patches attempting to do this
> include:
> virto - https://patchwork.kernel.org/patch/10241225/
> pci-stub - https://patchwork.kernel.org/patch/10109935/
> vfio - https://patchwork.kernel.org/patch/10103353/
> uio - https://patchwork.kernel.org/patch/9974031/
>
> Since this is quickly blowing up into a multi-driver problem it is probably
> best to implement this solution as generically as possible.
>
> This series is an attempt to do that. What we do with this patch set is
> provide a generic framework to enable SR-IOV in the case that the PF driver
> doesn't support managing the VFs itself.
>
> I based my patch set originally on the patch by Mark Rustad but there isn't
> much left after going through and cleaning out the bits that were no longer
> needed, and after incorporating the feedback from David Miller. At this point
> the only items to be fully reused was his patch description which is now
> present in patch 3 of the set.
>
> This solution is limited in scope to just adding support for devices that
> provide no functionality for SR-IOV other than allocating the VFs by
> calling pci_enable_sriov. Previous sets had included patches for VFIO, but
> for now I am dropping that as the scope of that work is larger then I
> think I can take on at this time.
>
> v2: Reduced scope back to just virtio_pci and vfio-pci
> Broke into 3 patch set from single patch
> Changed autoprobe behavior to always set when num_vfs is set non-zero
> v3: Updated Documentation to clarify when sriov_unmanaged_autoprobe is used
> Wrapped vfio_pci_sriov_configure to fix build errors w/o SR-IOV in kernel
> v4: Dropped vfio-pci patch
> Added ena and nvme to drivers now using pci_sriov_configure_unmanaged
> Dropped pci_disable_sriov call in virtio_pci to be consistent with ena
> v5: Dropped sriov_unmanaged_autoprobe and pci_sriov_conifgure_unmanaged
> Added new patch that enables pci_sriov_configure_simple
> Updated drivers to use pci_sriov_configure_simple
> v6: Defined pci_sriov_configure_simple as NULL when SR-IOV is not enabled
> Updated drivers to drop "#ifdef" checks for IOV
> Added pci-pf-stub as place for PF-only drivers to add support
> v7: Dropped pci_id table explanation from pci-pf-stub driver
> Updated pci_sriov_configure_simple to drop need for err value
> Fixed comment explaining why pci_sriov_configure_simple is NULL
>
Just following up since this has been sitting in patchwork for just
over a month now
(https://patchwork.ozlabs.org/project/linux-pci/list/?series=34034).
I'm just wondering what the expectation is on getting these pulled
into the pci tree? I'm assuming that is the best place for these
patches. Are there any concerns I still need to address or are these
going to be pulled in at some point, and if so is there any ETA on
when that will be?
Thanks.
- Alex
^ permalink raw reply
* Re: WARNING in refcount_inc (3)
From: Eric Biggers @ 2018-04-19 22:45 UTC (permalink / raw)
To: syzbot; +Cc: davem, kuznet, linux-kernel, netdev, syzkaller-bugs, yoshfuji
In-Reply-To: <001a113ec036acc2460568bd5427@google.com>
On Sat, Mar 31, 2018 at 04:01:02PM -0700, syzbot wrote:
> Hello,
>
> syzbot hit the following crash on bpf-next commit
> 1379ef828a18d8f81c526b25e4d5685caa2cfd65 (Thu Mar 29 22:09:44 2018 +0000)
> Merge branch 'bpf-sockmap-ingress'
> syzbot dashboard link:
> https://syzkaller.appspot.com/bug?extid=6eaf536fd743f5e119c5
>
> So far this crash happened 6 times on bpf-next.
> C reproducer: https://syzkaller.appspot.com/x/repro.c?id=6614614900998144
> syzkaller reproducer:
> https://syzkaller.appspot.com/x/repro.syz?id=5035340528091136
> Raw console output:
> https://syzkaller.appspot.com/x/log.txt?id=5063394046509056
> Kernel config:
> https://syzkaller.appspot.com/x/.config?id=-1280663959502969741
> compiler: gcc (GCC) 7.1.1 20170620
>
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+6eaf536fd743f5e119c5@syzkaller.appspotmail.com
> It will help syzbot understand when the bug is fixed. See footer for
> details.
> If you forward the report, please keep this part and the footer.
>
> R13: 0000000000000005 R14: 0000000000001380 R15: 00007ffd314c8768
> ------------[ cut here ]------------
> ------------[ cut here ]------------
> refcount_t: increment on 0; use-after-free.
> refcount_t: underflow; use-after-free.
> WARNING: CPU: 1 PID: 4434 at lib/refcount.c:153 refcount_inc+0x47/0x50
> lib/refcount.c:153
> WARNING: CPU: 0 PID: 4437 at lib/refcount.c:187
> refcount_sub_and_test+0x167/0x1b0 lib/refcount.c:187
> Kernel panic - not syncing: panic_on_warn set ...
>
> Modules linked in:
> CPU: 1 PID: 4434 Comm: syzkaller349430 Not tainted 4.16.0-rc6+ #41
> CPU: 0 PID: 4437 Comm: syzkaller349430 Not tainted 4.16.0-rc6+ #41
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> Google 01/01/2011
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> Google 01/01/2011
> RIP: 0010:refcount_sub_and_test+0x167/0x1b0 lib/refcount.c:187
> Call Trace:
> RSP: 0018:ffff8801b061f728 EFLAGS: 00010286
> __dump_stack lib/dump_stack.c:17 [inline]
> dump_stack+0x194/0x24d lib/dump_stack.c:53
> RAX: dffffc0000000008 RBX: 0000000000000000 RCX: ffffffff815ba4be
> RDX: 0000000000000000 RSI: 1ffff100360c3e95 RDI: 1ffff100360c3e6a
> RBP: ffff8801b061f7b8 R08: 0000000000000000 R09: 0000000000000000
> R10: ffff8801b061f850 R11: 0000000000000000 R12: 1ffff100360c3ee6
> panic+0x1e4/0x41c kernel/panic.c:183
> R13: 00000000ffffffff R14: 0000000000000001 R15: ffff8801b1be4184
> FS: 0000000001817880(0000) GS:ffff8801db200000(0000) knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00007ffd314c9000 CR3: 00000001b04a1006 CR4: 00000000001606f0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> Call Trace:
> __warn+0x1dc/0x200 kernel/panic.c:547
> report_bug+0x1f4/0x2b0 lib/bug.c:186
> fixup_bug.part.11+0x37/0x80 arch/x86/kernel/traps.c:178
> fixup_bug arch/x86/kernel/traps.c:247 [inline]
> do_error_trap+0x2d7/0x3e0 arch/x86/kernel/traps.c:296
> refcount_dec_and_test+0x1a/0x20 lib/refcount.c:212
> put_net include/net/net_namespace.h:222 [inline]
> __sk_destruct+0x560/0x920 net/core/sock.c:1592
> do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315
> invalid_op+0x1b/0x40 arch/x86/entry/entry_64.S:986
> RIP: 0010:refcount_inc+0x47/0x50 lib/refcount.c:153
> RSP: 0018:ffff8801b058f860 EFLAGS: 00010286
> RAX: dffffc0000000008 RBX: ffff8801ab55a1c4 RCX: ffffffff815ba4be
> RDX: 0000000000000000 RSI: 1ffff100360b1ebc RDI: 1ffff100360b1e91
> RBP: ffff8801b058f868 R08: 0000000000000000 R09: 0000000000000000
> sk_destruct+0x47/0x80 net/core/sock.c:1601
> R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801b058faf8
> __sk_free+0xf1/0x2b0 net/core/sock.c:1612
> R13: ffff8801af87b513 R14: ffff8801ab55a1c0 R15: ffff8801af87b501
> sk_free+0x2a/0x40 net/core/sock.c:1623
> sock_put include/net/sock.h:1661 [inline]
> tcp_close+0x967/0x1190 net/ipv4/tcp.c:2329
> get_net include/net/net_namespace.h:204 [inline]
> sk_alloc+0x3f9/0x1440 net/core/sock.c:1540
> inet_release+0xed/0x1c0 net/ipv4/af_inet.c:427
> sock_release+0x8d/0x1e0 net/socket.c:594
> sock_close+0x16/0x20 net/socket.c:1149
> __fput+0x327/0x7e0 fs/file_table.c:209
> ____fput+0x15/0x20 fs/file_table.c:243
> task_work_run+0x199/0x270 kernel/task_work.c:113
> inet_create+0x47c/0xf50 net/ipv4/af_inet.c:320
> tracehook_notify_resume include/linux/tracehook.h:191 [inline]
> exit_to_usermode_loop+0x275/0x2f0 arch/x86/entry/common.c:166
> __sock_create+0x4d4/0x850 net/socket.c:1285
> prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline]
> syscall_return_slowpath arch/x86/entry/common.c:265 [inline]
> do_syscall_64+0x6ec/0x940 arch/x86/entry/common.c:292
> sock_create net/socket.c:1325 [inline]
> SYSC_socket net/socket.c:1355 [inline]
> SyS_socket+0xeb/0x1d0 net/socket.c:1335
> do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287
> entry_SYSCALL_64_after_hwframe+0x42/0xb7
> RIP: 0033:0x402950
> RSP: 002b:00007ffd314c8628 EFLAGS: 00000246
> ORIG_RAX: 0000000000000003
> RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000402950
> RDX: 00000000000000e0 RSI: 00007ffd314c8f00 RDI: 0000000000000003
> RBP: 00007ffd314c8740 R08: 00007ffd314c864c R09: 0000000000000001
> R10: 00007ffd314c8740 R11: 0000000000000246 R12: 00000000006cf4c0
> R13: 00000000006cee40 R14: 0000000000001380 R15: 00007ffd314c8768
> Code:
> entry_SYSCALL_64_after_hwframe+0x42/0xb7
> 5e
> RIP: 0033:0x4456a7
> 41
> RSP: 002b:00007ffd314c8628 EFLAGS: 00000202 ORIG_RAX: 0000000000000029
> 5f
> RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00000000004456a7
> 5d
> RDX: 0000000000000006 RSI: 0000000000000001 RDI: 0000000000000002
> RBP: 00007ffd314c8740 R08: 0000000000000000 R09: 0000000000000001
> c3
> R10: 0000000000000006 R11: 0000000000000202 R12: 0000000000000003
> e8
> R13: 0000000000000003 R14: 0000000000006cc2 R15: 00007ffd314c8768
> 0a 0b be fe 80 3d 20 c9 84 05 00 75 1a e8 fc 0a be fe 48 c7 c7 e0 78 e5 86
> c6 05 0b c9 84 05 01 e8 a9 16 8e fe <0f> 0b 31 db eb a3 e8 de 0a be fe 83 fb
> ff 0f 85 63 ff ff ff 31
> ---[ end trace dd327356f543ce46 ]---
> Dumping ftrace buffer:
> (ftrace buffer empty)
> Kernel Offset: disabled
> Rebooting in 86400 seconds..
>
>
> ---
> This bug is generated by a dumb bot. It may contain errors.
> See https://goo.gl/tpsmEJ for details.
> Direct all questions to syzkaller@googlegroups.com.
>
> syzbot will keep track of this bug report.
> If you forgot to add the Reported-by tag, once the fix for this bug is
> merged
> into any tree, please reply to this email with:
> #syz fix: exact-commit-title
Broken error handling when mounting rpc_pipefs is messing things up.
Fixed by patch in vfs/for-linus:
#syz fix: rpc_pipefs: deal with early sget() failures
- Eric
^ permalink raw reply
* Re: [PATCHv3 net 3/3] net: sched: ife: check on metadata length
From: Eric Dumazet @ 2018-04-19 22:24 UTC (permalink / raw)
To: Alexander Aring, yotam.gi
Cc: jhs, davem, xiyou.wangcong, jiri, yuvalm, netdev, kernel
In-Reply-To: <20180419221445.26205-4-aring@mojatatu.com>
On 04/19/2018 03:14 PM, Alexander Aring wrote:
> This patch checks if sk buffer is available to dererence ife header. If
> not then NULL will returned to signal an malformed ife packet. This
> avoids to crashing the kernel from outside.
>
> Signed-off-by: Alexander Aring <aring@mojatatu.com>
> Reviewed-by: Yotam Gigi <yotam.gi@gmail.com>
> Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
> ---
> net/ife/ife.c | 3 +++
> 1 file changed, 3 insertions(+)
>
> diff --git a/net/ife/ife.c b/net/ife/ife.c
> index 7fbe70a0af4b..570a18d4ca32 100644
> --- a/net/ife/ife.c
> +++ b/net/ife/ife.c
> @@ -70,6 +70,9 @@ void *ife_decode(struct sk_buff *skb, u16 *metalen)
> u16 ifehdrln;
>
> ifehdr = (struct ifeheadr *) (skb->data + skb->dev->hard_header_len);
> + if (!pskb_may_pull(skb, skb->dev->hard_header_len + IFE_METAHDRLEN))
> + return NULL;
> +
No, you need to move here :
ifehdr = (struct ifeheadr *) (skb->data + skb->dev->hard_header_len);
> ifehdrln = ntohs(ifehdr->metalen);
> total_pull = skb->dev->hard_header_len + ifehdrln;
>
>
Please do not rush, wait one day before sending V4, no need to flood netdev@
^ permalink raw reply
* Re: [PATCH net-next] ipv6: send netlink notifications for manually configured addresses
From: Lorenzo Bianconi @ 2018-04-19 22:23 UTC (permalink / raw)
To: David Miller; +Cc: netdev, Thomas Haller
In-Reply-To: <20180417.140536.2292177928357979103.davem@davemloft.net>
> From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
> Date: Tue, 17 Apr 2018 11:54:39 +0200
>
>> Send a netlink notification when userspace adds a manually configured
>> address if DAD is enabled and optimistic flag isn't set.
>> Moreover send RTM_DELADDR notifications for tentative addresses.
>>
>> Some userspace applications (e.g. NetworkManager) are interested in
>> addr netlink events albeit the address is still in tentative state,
>> however events are not sent if DAD process is not completed.
>> If the address is added and immediately removed userspace listeners
>> are not notified. This behaviour can be easily reproduced by using
>> veth interfaces:
>>
>> $ ip -b - <<EOF
>>> link add dev vm1 type veth peer name vm2
>>> link set dev vm1 up
>>> link set dev vm2 up
>>> addr add 2001:db8:a:b:1:2:3:4/64 dev vm1
>>> addr del 2001:db8:a:b:1:2:3:4/64 dev vm1
>> EOF
>>
>> This patch reverts the behaviour introduced by the commit f784ad3d79e5
>> ("ipv6: do not send RTM_DELADDR for tentative addresses")
>>
>> Suggested-by: Thomas Haller <thaller@redhat.com>
>> Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
>
> Ok, applied to net-next.
>
> It would be really nice if we clearly documented somewhere the exact
> situations where we desire ipv6 address netlink notifications to be
> sent out.
>
> Maybe even a common function that guards the event emission, where we
> encode the rules. Or a comment somewhere prominent. Something like
> that.
>
> Right now this isn't spelled out clearly anywhere, and that's probably
> why things keep being adjusted back and forth like this.
Sounds good, I will post a patch. Thanks
Regards,
Lorenzo
>
> Thank you.
^ permalink raw reply
* [PATCHv3 net 3/3] net: sched: ife: check on metadata length
From: Alexander Aring @ 2018-04-19 22:14 UTC (permalink / raw)
To: yotam.gi
Cc: jhs, davem, xiyou.wangcong, jiri, yuvalm, netdev, kernel,
Alexander Aring
In-Reply-To: <20180419221445.26205-1-aring@mojatatu.com>
This patch checks if sk buffer is available to dererence ife header. If
not then NULL will returned to signal an malformed ife packet. This
avoids to crashing the kernel from outside.
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Reviewed-by: Yotam Gigi <yotam.gi@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
---
net/ife/ife.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/net/ife/ife.c b/net/ife/ife.c
index 7fbe70a0af4b..570a18d4ca32 100644
--- a/net/ife/ife.c
+++ b/net/ife/ife.c
@@ -70,6 +70,9 @@ void *ife_decode(struct sk_buff *skb, u16 *metalen)
u16 ifehdrln;
ifehdr = (struct ifeheadr *) (skb->data + skb->dev->hard_header_len);
+ if (!pskb_may_pull(skb, skb->dev->hard_header_len + IFE_METAHDRLEN))
+ return NULL;
+
ifehdrln = ntohs(ifehdr->metalen);
total_pull = skb->dev->hard_header_len + ifehdrln;
--
2.11.0
^ permalink raw reply related
* [PATCHv3 net 2/3] net: sched: ife: handle malformed tlv length
From: Alexander Aring @ 2018-04-19 22:14 UTC (permalink / raw)
To: yotam.gi
Cc: jhs, davem, xiyou.wangcong, jiri, yuvalm, netdev, kernel,
Alexander Aring
In-Reply-To: <20180419221445.26205-1-aring@mojatatu.com>
There is currently no handling to check on a invalid tlv length. This
patch adds such handling to avoid killing the kernel with a malformed
ife packet.
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Reviewed-by: Yotam Gigi <yotam.gi@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
---
include/net/ife.h | 3 ++-
net/ife/ife.c | 35 +++++++++++++++++++++++++++++++++--
net/sched/act_ife.c | 7 ++++++-
3 files changed, 41 insertions(+), 4 deletions(-)
diff --git a/include/net/ife.h b/include/net/ife.h
index 44b9c00f7223..e117617e3c34 100644
--- a/include/net/ife.h
+++ b/include/net/ife.h
@@ -12,7 +12,8 @@
void *ife_encode(struct sk_buff *skb, u16 metalen);
void *ife_decode(struct sk_buff *skb, u16 *metalen);
-void *ife_tlv_meta_decode(void *skbdata, u16 *attrtype, u16 *dlen, u16 *totlen);
+void *ife_tlv_meta_decode(void *skbdata, const void *ifehdr_end, u16 *attrtype,
+ u16 *dlen, u16 *totlen);
int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen,
const void *dval);
diff --git a/net/ife/ife.c b/net/ife/ife.c
index 7d1ec76e7f43..7fbe70a0af4b 100644
--- a/net/ife/ife.c
+++ b/net/ife/ife.c
@@ -92,12 +92,43 @@ struct meta_tlvhdr {
__be16 len;
};
+static bool __ife_tlv_meta_valid(const unsigned char *skbdata,
+ const unsigned char *ifehdr_end)
+{
+ const struct meta_tlvhdr *tlv;
+ u16 tlvlen;
+
+ if (unlikely(skbdata + sizeof(*tlv) > ifehdr_end))
+ return false;
+
+ tlv = (const struct meta_tlvhdr *)skbdata;
+ tlvlen = ntohs(tlv->len);
+
+ /* tlv length field is inc header, check on minimum */
+ if (tlvlen < NLA_HDRLEN)
+ return false;
+
+ /* overflow by NLA_ALIGN check */
+ if (NLA_ALIGN(tlvlen) < tlvlen)
+ return false;
+
+ if (unlikely(skbdata + NLA_ALIGN(tlvlen) > ifehdr_end))
+ return false;
+
+ return true;
+}
+
/* Caller takes care of presenting data in network order
*/
-void *ife_tlv_meta_decode(void *skbdata, u16 *attrtype, u16 *dlen, u16 *totlen)
+void *ife_tlv_meta_decode(void *skbdata, const void *ifehdr_end, u16 *attrtype,
+ u16 *dlen, u16 *totlen)
{
- struct meta_tlvhdr *tlv = (struct meta_tlvhdr *) skbdata;
+ struct meta_tlvhdr *tlv;
+
+ if (!__ife_tlv_meta_valid(skbdata, ifehdr_end))
+ return NULL;
+ tlv = (struct meta_tlvhdr *)skbdata;
*dlen = ntohs(tlv->len) - NLA_HDRLEN;
*attrtype = ntohs(tlv->type);
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 49b8ab551fbe..8527cfdc446d 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -682,7 +682,12 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
u16 mtype;
u16 dlen;
- curr_data = ife_tlv_meta_decode(tlv_data, &mtype, &dlen, NULL);
+ curr_data = ife_tlv_meta_decode(tlv_data, ifehdr_end, &mtype,
+ &dlen, NULL);
+ if (!curr_data) {
+ qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats));
+ return TC_ACT_SHOT;
+ }
if (find_decode_metaid(skb, ife, mtype, dlen, curr_data)) {
/* abuse overlimits to count when we receive metadata
--
2.11.0
^ permalink raw reply related
* [PATCHv3 net 1/3] net: sched: ife: signal not finding metaid
From: Alexander Aring @ 2018-04-19 22:14 UTC (permalink / raw)
To: yotam.gi
Cc: jhs, davem, xiyou.wangcong, jiri, yuvalm, netdev, kernel,
Alexander Aring
In-Reply-To: <20180419221445.26205-1-aring@mojatatu.com>
We need to record stats for received metadata that we dont know how
to process. Have find_decode_metaid() return -ENOENT to capture this.
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Reviewed-by: Yotam Gigi <yotam.gi@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
---
net/sched/act_ife.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index a5994cf0512b..49b8ab551fbe 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -652,7 +652,7 @@ static int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife,
}
}
- return 0;
+ return -ENOENT;
}
static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
--
2.11.0
^ permalink raw reply related
* [PATCHv3 net 0/3] net: sched: ife: malformed ife packet fixes
From: Alexander Aring @ 2018-04-19 22:14 UTC (permalink / raw)
To: yotam.gi
Cc: jhs, davem, xiyou.wangcong, jiri, yuvalm, netdev, kernel,
Alexander Aring
As promised at netdev 2.2 tc workshop I am working on adding scapy support for
tdc testing. It is still work in progress. I will submit the patches to tdc
later (they are not in good shape yet). The good news is I have been able to
find bugs which normal packet testing would not be able to find.
With fuzzy testing I was able to craft certain malformed packets that IFE
action was not able to deal with. This patch set fixes those bugs.
changes since v3:
- use pskb_may_pull
changes since v2:
- remove inline from __ife_tlv_meta_valid
- add const to cast to meta_tlvhdr
- add acked and reviewed tags
Alexander Aring (3):
net: sched: ife: signal not finding metaid
net: sched: ife: handle malformed tlv length
net: sched: ife: check on metadata length
include/net/ife.h | 3 ++-
net/ife/ife.c | 38 ++++++++++++++++++++++++++++++++++++--
net/sched/act_ife.c | 9 +++++++--
3 files changed, 45 insertions(+), 5 deletions(-)
--
2.11.0
^ permalink raw reply
* Re: [PATCHv2 net 3/3] net: sched: ife: check on metadata length
From: Eric Dumazet @ 2018-04-19 21:50 UTC (permalink / raw)
To: Alexander Aring, yotam.gi
Cc: jhs, davem, xiyou.wangcong, jiri, yuvalm, netdev, kernel
In-Reply-To: <20180419214438.6801-4-aring@mojatatu.com>
On 04/19/2018 02:44 PM, Alexander Aring wrote:
> This patch checks if sk buffer is available to dererence ife header. If
> not then NULL will returned to signal an malformed ife packet. This
> avoids to crashing the kernel from outside.
>
> Signed-off-by: Alexander Aring <aring@mojatatu.com>
> Reviewed-by: Yotam Gigi <yotam.gi@gmail.com>
> Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
> ---
> net/ife/ife.c | 3 +++
> 1 file changed, 3 insertions(+)
>
> diff --git a/net/ife/ife.c b/net/ife/ife.c
> index 7fbe70a0af4b..93e8c36ce6ec 100644
> --- a/net/ife/ife.c
> +++ b/net/ife/ife.c
> @@ -70,6 +70,9 @@ void *ife_decode(struct sk_buff *skb, u16 *metalen)
> u16 ifehdrln;
>
> ifehdr = (struct ifeheadr *) (skb->data + skb->dev->hard_header_len);
> + if (skb->len < skb->dev->hard_header_len + IFE_METAHDRLEN)
> + return NULL;
> +
> ifehdrln = ntohs(ifehdr->metalen);
> total_pull = skb->dev->hard_header_len + ifehdrln;
>
>
Nope, please use pskb_may_pull()
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox