* Re: Donation
From: M.M fridman @ 2018-06-24 3:07 UTC (permalink / raw)
I Mikhail Fridman. has selected you specially as one of my beneficiaries
for my Charitable Donation, Just as I have declared on May 23, 2016 to give
my fortune as charity.
Check the link below for confirmation:
http://www.ibtimes.co.uk/russias-second-wealthiest-man-mikhail-fridman-plans-leaving-14-2bn-fortune-charity-1561604
Reply as soon as possible with further directives.
Best Regards,
Mikhail Fridman.
^ permalink raw reply
* [PATCH RFC 0/2] Convert GRO receive over to hash table.
From: David Miller @ 2018-06-24 5:13 UTC (permalink / raw)
To: netdev; +Cc: edumazet
When many parallel flows are present and being received on the same
RX queue, GRO processing can become expensive because each incoming
frame must traverse the per-NAPI GRO list at each protocol layer
of GRO receive (eth --> ipv{4,6} --> tcp).
Use the already computed hash to chain these SKBs in a hash table
instead of a simple list.
The first patch makes the GRO list a true list_head.
The second patch implements the hash table.
This series passes basic testing and I added some diagnostics
to make sure we really were aggregating GRO frames :-)
Signed-off-by: David S. Miller <davem@davemloft.net>
^ permalink raw reply
* [PATCH RFC 1/2] net: Convert GRO SKB handling to list_head.
From: David Miller @ 2018-06-24 5:13 UTC (permalink / raw)
To: netdev; +Cc: edumazet
Manage pending per-NAPI GRO packets via list_head.
Return an SKB pointer from the GRO receive handlers. When GRO receive
handlers return non-NULL, it means that this SKB needs to be completed
at this time and removed from the NAPI queue.
Several operations are greatly simplified by this transformation,
especially timing out the oldest SKB in the list when gro_count
exceeds MAX_GRO_SKBS, and napi_gro_flush() which walks the queue
in reverse order.
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/geneve.c | 11 +++---
drivers/net/vxlan.c | 11 +++---
include/linux/etherdevice.h | 3 +-
include/linux/netdevice.h | 32 ++++++++---------
include/linux/skbuff.h | 3 +-
include/linux/udp.h | 4 +--
include/net/inet_common.h | 2 +-
include/net/tcp.h | 2 +-
include/net/udp.h | 4 +--
include/net/udp_tunnel.h | 6 ++--
net/8021q/vlan.c | 13 +++----
net/core/dev.c | 68 +++++++++++++++----------------------
net/core/skbuff.c | 4 +--
net/ethernet/eth.c | 12 +++----
net/ipv4/af_inet.c | 12 +++----
net/ipv4/esp4_offload.c | 4 +--
net/ipv4/fou.c | 20 +++++------
net/ipv4/gre_offload.c | 8 ++---
net/ipv4/tcp_offload.c | 14 ++++----
net/ipv4/udp_offload.c | 13 +++----
net/ipv6/esp6_offload.c | 4 +--
net/ipv6/ip6_offload.c | 16 ++++-----
net/ipv6/tcpv6_offload.c | 4 +--
net/ipv6/udp_offload.c | 4 +--
24 files changed, 133 insertions(+), 141 deletions(-)
diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 750eaa53bf0c..3e94375b9b01 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -418,11 +418,12 @@ static int geneve_hlen(struct genevehdr *gh)
return sizeof(*gh) + gh->opt_len * 4;
}
-static struct sk_buff **geneve_gro_receive(struct sock *sk,
- struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *geneve_gro_receive(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb)
{
- struct sk_buff *p, **pp = NULL;
+ struct sk_buff *pp = NULL;
+ struct sk_buff *p;
struct genevehdr *gh, *gh2;
unsigned int hlen, gh_len, off_gnv;
const struct packet_offload *ptype;
@@ -449,7 +450,7 @@ static struct sk_buff **geneve_gro_receive(struct sock *sk,
goto out;
}
- for (p = *head; p; p = p->next) {
+ list_for_each_entry(p, head, list) {
if (!NAPI_GRO_CB(p)->same_flow)
continue;
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index aee0e60471f1..cc14e0cd5647 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -568,11 +568,12 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
return vh;
}
-static struct sk_buff **vxlan_gro_receive(struct sock *sk,
- struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *vxlan_gro_receive(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb)
{
- struct sk_buff *p, **pp = NULL;
+ struct sk_buff *pp = NULL;
+ struct sk_buff *p;
struct vxlanhdr *vh, *vh2;
unsigned int hlen, off_vx;
int flush = 1;
@@ -607,7 +608,7 @@ static struct sk_buff **vxlan_gro_receive(struct sock *sk,
skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */
- for (p = *head; p; p = p->next) {
+ list_for_each_entry(p, head, list) {
if (!NAPI_GRO_CB(p)->same_flow)
continue;
diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 79563840c295..572e11bb8696 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -59,8 +59,7 @@ struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv,
unsigned int rxqs);
#define devm_alloc_etherdev(dev, sizeof_priv) devm_alloc_etherdev_mqs(dev, sizeof_priv, 1, 1)
-struct sk_buff **eth_gro_receive(struct sk_buff **head,
- struct sk_buff *skb);
+struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb);
int eth_gro_complete(struct sk_buff *skb, int nhoff);
/* Reserved Ethernet Addresses per IEEE 802.1Q */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3ec9850c7936..f176d9873910 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -322,7 +322,7 @@ struct napi_struct {
int poll_owner;
#endif
struct net_device *dev;
- struct sk_buff *gro_list;
+ struct list_head gro_list;
struct sk_buff *skb;
struct hrtimer timer;
struct list_head dev_list;
@@ -2255,10 +2255,10 @@ static inline int gro_recursion_inc_test(struct sk_buff *skb)
return ++NAPI_GRO_CB(skb)->recursion_counter == GRO_RECURSION_LIMIT;
}
-typedef struct sk_buff **(*gro_receive_t)(struct sk_buff **, struct sk_buff *);
-static inline struct sk_buff **call_gro_receive(gro_receive_t cb,
- struct sk_buff **head,
- struct sk_buff *skb)
+typedef struct sk_buff *(*gro_receive_t)(struct list_head *, struct sk_buff *);
+static inline struct sk_buff *call_gro_receive(gro_receive_t cb,
+ struct list_head *head,
+ struct sk_buff *skb)
{
if (unlikely(gro_recursion_inc_test(skb))) {
NAPI_GRO_CB(skb)->flush |= 1;
@@ -2268,12 +2268,12 @@ static inline struct sk_buff **call_gro_receive(gro_receive_t cb,
return cb(head, skb);
}
-typedef struct sk_buff **(*gro_receive_sk_t)(struct sock *, struct sk_buff **,
- struct sk_buff *);
-static inline struct sk_buff **call_gro_receive_sk(gro_receive_sk_t cb,
- struct sock *sk,
- struct sk_buff **head,
- struct sk_buff *skb)
+typedef struct sk_buff *(*gro_receive_sk_t)(struct sock *, struct list_head *,
+ struct sk_buff *);
+static inline struct sk_buff *call_gro_receive_sk(gro_receive_sk_t cb,
+ struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb)
{
if (unlikely(gro_recursion_inc_test(skb))) {
NAPI_GRO_CB(skb)->flush |= 1;
@@ -2299,8 +2299,8 @@ struct packet_type {
struct offload_callbacks {
struct sk_buff *(*gso_segment)(struct sk_buff *skb,
netdev_features_t features);
- struct sk_buff **(*gro_receive)(struct sk_buff **head,
- struct sk_buff *skb);
+ struct sk_buff *(*gro_receive)(struct list_head *head,
+ struct sk_buff *skb);
int (*gro_complete)(struct sk_buff *skb, int nhoff);
};
@@ -2568,7 +2568,7 @@ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
struct net_device *dev_get_by_napi_id(unsigned int napi_id);
int netdev_get_name(struct net *net, char *name, int ifindex);
int dev_restart(struct net_device *dev);
-int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb);
+int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb);
static inline unsigned int skb_gro_offset(const struct sk_buff *skb)
{
@@ -2784,13 +2784,13 @@ static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb,
}
#ifdef CONFIG_XFRM_OFFLOAD
-static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp, int flush)
+static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush)
{
if (PTR_ERR(pp) != -EINPROGRESS)
NAPI_GRO_CB(skb)->flush |= flush;
}
#else
-static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp, int flush)
+static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush)
{
NAPI_GRO_CB(skb)->flush |= flush;
}
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c86885954994..7ccc601b55d9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -677,7 +677,8 @@ struct sk_buff {
int ip_defrag_offset;
};
};
- struct rb_node rbnode; /* used in netem & tcp stack */
+ struct rb_node rbnode; /* used in netem & tcp stack */
+ struct list_head list;
};
struct sock *sk;
diff --git a/include/linux/udp.h b/include/linux/udp.h
index ca840345571b..320d49d85484 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -74,8 +74,8 @@ struct udp_sock {
void (*encap_destroy)(struct sock *sk);
/* GRO functions for UDP socket */
- struct sk_buff ** (*gro_receive)(struct sock *sk,
- struct sk_buff **head,
+ struct sk_buff * (*gro_receive)(struct sock *sk,
+ struct list_head *head,
struct sk_buff *skb);
int (*gro_complete)(struct sock *sk,
struct sk_buff *skb,
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index 384b90c62c0b..3ca969cbd161 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -43,7 +43,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
int inet_recv_error(struct sock *sk, struct msghdr *msg, int len,
int *addr_len);
-struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb);
+struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb);
int inet_gro_complete(struct sk_buff *skb, int nhoff);
struct sk_buff *inet_gso_segment(struct sk_buff *skb,
netdev_features_t features);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 822ee49ed0f9..402a88b0e8a8 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1788,7 +1788,7 @@ void tcp_v4_destroy_sock(struct sock *sk);
struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
netdev_features_t features);
-struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb);
+struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb);
int tcp_gro_complete(struct sk_buff *skb);
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr);
diff --git a/include/net/udp.h b/include/net/udp.h
index b1ea8b0f5e6a..5723c6128ae4 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -170,8 +170,8 @@ static inline void udp_csum_pull_header(struct sk_buff *skb)
typedef struct sock *(*udp_lookup_t)(struct sk_buff *skb, __be16 sport,
__be16 dport);
-struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
- struct udphdr *uh, udp_lookup_t lookup);
+struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
+ struct udphdr *uh, udp_lookup_t lookup);
int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup);
struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index b95a6927c718..fe680ab6b15a 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -65,9 +65,9 @@ static inline int udp_sock_create(struct net *net,
typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb);
typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk);
-typedef struct sk_buff **(*udp_tunnel_gro_receive_t)(struct sock *sk,
- struct sk_buff **head,
- struct sk_buff *skb);
+typedef struct sk_buff *(*udp_tunnel_gro_receive_t)(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb);
typedef int (*udp_tunnel_gro_complete_t)(struct sock *sk, struct sk_buff *skb,
int nhoff);
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 73a65789271b..99141986efa0 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -647,13 +647,14 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg)
return err;
}
-static struct sk_buff **vlan_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *vlan_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
- struct sk_buff *p, **pp = NULL;
- struct vlan_hdr *vhdr;
- unsigned int hlen, off_vlan;
const struct packet_offload *ptype;
+ unsigned int hlen, off_vlan;
+ struct sk_buff *pp = NULL;
+ struct vlan_hdr *vhdr;
+ struct sk_buff *p;
__be16 type;
int flush = 1;
@@ -675,7 +676,7 @@ static struct sk_buff **vlan_gro_receive(struct sk_buff **head,
flush = 0;
- for (p = *head; p; p = p->next) {
+ list_for_each_entry(p, head, list) {
struct vlan_hdr *vhdr2;
if (!NAPI_GRO_CB(p)->same_flow)
diff --git a/net/core/dev.c b/net/core/dev.c
index a5aa1c7444e6..aa61b9344b46 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4881,36 +4881,25 @@ static int napi_gro_complete(struct sk_buff *skb)
*/
void napi_gro_flush(struct napi_struct *napi, bool flush_old)
{
- struct sk_buff *skb, *prev = NULL;
-
- /* scan list and build reverse chain */
- for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
- skb->prev = prev;
- prev = skb;
- }
-
- for (skb = prev; skb; skb = prev) {
- skb->next = NULL;
+ struct sk_buff *skb, *p;
+ list_for_each_entry_safe_reverse(skb, p, &napi->gro_list, list) {
if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
return;
-
- prev = skb->prev;
+ list_del_init(&skb->list);
napi_gro_complete(skb);
napi->gro_count--;
}
-
- napi->gro_list = NULL;
}
EXPORT_SYMBOL(napi_gro_flush);
static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
{
- struct sk_buff *p;
unsigned int maclen = skb->dev->hard_header_len;
u32 hash = skb_get_hash_raw(skb);
+ struct sk_buff *p;
- for (p = napi->gro_list; p; p = p->next) {
+ list_for_each_entry(p, &napi->gro_list, list) {
unsigned long diffs;
NAPI_GRO_CB(p)->flush = 0;
@@ -4977,12 +4966,12 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
- struct sk_buff **pp = NULL;
+ struct list_head *head = &offload_base;
struct packet_offload *ptype;
__be16 type = skb->protocol;
- struct list_head *head = &offload_base;
- int same_flow;
+ struct sk_buff *pp = NULL;
enum gro_result ret;
+ int same_flow;
int grow;
if (netif_elide_gro(skb->dev))
@@ -5039,11 +5028,8 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
if (pp) {
- struct sk_buff *nskb = *pp;
-
- *pp = nskb->next;
- nskb->next = NULL;
- napi_gro_complete(nskb);
+ list_del_init(&pp->list);
+ napi_gro_complete(pp);
napi->gro_count--;
}
@@ -5054,15 +5040,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
goto normal;
if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
- struct sk_buff *nskb = napi->gro_list;
+ struct sk_buff *nskb;
- /* locate the end of the list to select the 'oldest' flow */
- while (nskb->next) {
- pp = &nskb->next;
- nskb = *pp;
- }
- *pp = NULL;
- nskb->next = NULL;
+ nskb = list_last_entry(&napi->gro_list, struct sk_buff, list);
+ list_del(&nskb->list);
napi_gro_complete(nskb);
} else {
napi->gro_count++;
@@ -5071,8 +5052,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
NAPI_GRO_CB(skb)->age = jiffies;
NAPI_GRO_CB(skb)->last = skb;
skb_shinfo(skb)->gso_size = skb_gro_len(skb);
- skb->next = napi->gro_list;
- napi->gro_list = skb;
+ list_add(&skb->list, &napi->gro_list);
ret = GRO_HELD;
pull:
@@ -5478,7 +5458,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
NAPIF_STATE_IN_BUSY_POLL)))
return false;
- if (n->gro_list) {
+ if (!list_empty(&n->gro_list)) {
unsigned long timeout = 0;
if (work_done)
@@ -5687,7 +5667,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
/* Note : we use a relaxed variant of napi_schedule_prep() not setting
* NAPI_STATE_MISSED, since we do not react to a device IRQ.
*/
- if (napi->gro_list && !napi_disable_pending(napi) &&
+ if (!list_empty(&napi->gro_list) && !napi_disable_pending(napi) &&
!test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
__napi_schedule_irqoff(napi);
@@ -5701,7 +5681,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
napi->timer.function = napi_watchdog;
napi->gro_count = 0;
- napi->gro_list = NULL;
+ INIT_LIST_HEAD(&napi->gro_list);
napi->skb = NULL;
napi->poll = poll;
if (weight > NAPI_POLL_WEIGHT)
@@ -5734,6 +5714,14 @@ void napi_disable(struct napi_struct *n)
}
EXPORT_SYMBOL(napi_disable);
+static void gro_list_free(struct list_head *head)
+{
+ struct sk_buff *skb, *p;
+
+ list_for_each_entry_safe(skb, p, head, list)
+ kfree_skb(skb);
+}
+
/* Must be called in process context */
void netif_napi_del(struct napi_struct *napi)
{
@@ -5743,8 +5731,8 @@ void netif_napi_del(struct napi_struct *napi)
list_del_init(&napi->dev_list);
napi_free_frags(napi);
- kfree_skb_list(napi->gro_list);
- napi->gro_list = NULL;
+ gro_list_free(&napi->gro_list);
+ INIT_LIST_HEAD(&napi->gro_list);
napi->gro_count = 0;
}
EXPORT_SYMBOL(netif_napi_del);
@@ -5787,7 +5775,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
goto out_unlock;
}
- if (n->gro_list) {
+ if (!list_empty(&n->gro_list)) {
/* flush too old packets
* If HZ < 1000, flush all packets.
*/
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c642304f178c..b1f274f22d85 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3815,14 +3815,14 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
}
EXPORT_SYMBOL_GPL(skb_segment);
-int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
{
struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
unsigned int offset = skb_gro_offset(skb);
unsigned int headlen = skb_headlen(skb);
unsigned int len = skb_gro_len(skb);
- struct sk_buff *lp, *p = *head;
unsigned int delta_truesize;
+ struct sk_buff *lp;
if (unlikely(p->len + len >= 65536))
return -E2BIG;
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index ee28440f57c5..fd8faa0dfa61 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -427,13 +427,13 @@ ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len)
}
EXPORT_SYMBOL(sysfs_format_mac);
-struct sk_buff **eth_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb)
{
- struct sk_buff *p, **pp = NULL;
- struct ethhdr *eh, *eh2;
- unsigned int hlen, off_eth;
const struct packet_offload *ptype;
+ unsigned int hlen, off_eth;
+ struct sk_buff *pp = NULL;
+ struct ethhdr *eh, *eh2;
+ struct sk_buff *p;
__be16 type;
int flush = 1;
@@ -448,7 +448,7 @@ struct sk_buff **eth_gro_receive(struct sk_buff **head,
flush = 0;
- for (p = *head; p; p = p->next) {
+ list_for_each_entry(p, head, list) {
if (!NAPI_GRO_CB(p)->same_flow)
continue;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 15e125558c76..06b218a2870f 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1384,12 +1384,12 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
}
EXPORT_SYMBOL(inet_gso_segment);
-struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
{
const struct net_offload *ops;
- struct sk_buff **pp = NULL;
- struct sk_buff *p;
+ struct sk_buff *pp = NULL;
const struct iphdr *iph;
+ struct sk_buff *p;
unsigned int hlen;
unsigned int off;
unsigned int id;
@@ -1425,7 +1425,7 @@ struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb)
flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
id >>= 16;
- for (p = *head; p; p = p->next) {
+ list_for_each_entry(p, head, list) {
struct iphdr *iph2;
u16 flush_id;
@@ -1505,8 +1505,8 @@ struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb)
}
EXPORT_SYMBOL(inet_gro_receive);
-static struct sk_buff **ipip_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *ipip_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
if (NAPI_GRO_CB(skb)->encap_mark) {
NAPI_GRO_CB(skb)->flush = 1;
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index 7cf755ef9efb..bbeecd13e534 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -28,8 +28,8 @@
#include <linux/spinlock.h>
#include <net/udp.h>
-static struct sk_buff **esp4_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *esp4_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
int offset = skb_gro_offset(skb);
struct xfrm_offload *xo;
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 1540db65241a..efdc9e1f741e 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -224,14 +224,14 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
return 0;
}
-static struct sk_buff **fou_gro_receive(struct sock *sk,
- struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *fou_gro_receive(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb)
{
- const struct net_offload *ops;
- struct sk_buff **pp = NULL;
u8 proto = fou_from_sock(sk)->protocol;
const struct net_offload **offloads;
+ const struct net_offload *ops;
+ struct sk_buff *pp = NULL;
/* We can clear the encap_mark for FOU as we are essentially doing
* one of two possible things. We are either adding an L4 tunnel
@@ -305,13 +305,13 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
return guehdr;
}
-static struct sk_buff **gue_gro_receive(struct sock *sk,
- struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *gue_gro_receive(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb)
{
const struct net_offload **offloads;
const struct net_offload *ops;
- struct sk_buff **pp = NULL;
+ struct sk_buff *pp = NULL;
struct sk_buff *p;
struct guehdr *guehdr;
size_t len, optlen, hdrlen, off;
@@ -397,7 +397,7 @@ static struct sk_buff **gue_gro_receive(struct sock *sk,
skb_gro_pull(skb, hdrlen);
- for (p = *head; p; p = p->next) {
+ list_for_each_entry(p, head, list) {
const struct guehdr *guehdr2;
if (!NAPI_GRO_CB(p)->same_flow)
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 1859c473b21a..b9673c21be45 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -108,10 +108,10 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
return segs;
}
-static struct sk_buff **gre_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *gre_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
- struct sk_buff **pp = NULL;
+ struct sk_buff *pp = NULL;
struct sk_buff *p;
const struct gre_base_hdr *greh;
unsigned int hlen, grehlen;
@@ -182,7 +182,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
null_compute_pseudo);
}
- for (p = *head; p; p = p->next) {
+ list_for_each_entry(p, head, list) {
const struct gre_base_hdr *greh2;
if (!NAPI_GRO_CB(p)->same_flow)
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 8cc7c3487330..f5aee641f825 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -180,9 +180,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
return segs;
}
-struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
{
- struct sk_buff **pp = NULL;
+ struct sk_buff *pp = NULL;
struct sk_buff *p;
struct tcphdr *th;
struct tcphdr *th2;
@@ -220,7 +220,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
len = skb_gro_len(skb);
flags = tcp_flag_word(th);
- for (; (p = *head); head = &p->next) {
+ list_for_each_entry(p, head, list) {
if (!NAPI_GRO_CB(p)->same_flow)
continue;
@@ -233,7 +233,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
goto found;
}
-
+ p = NULL;
goto out_check_final;
found:
@@ -263,7 +263,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
flush |= (len - 1) >= mss;
flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
- if (flush || skb_gro_receive(head, skb)) {
+ if (flush || skb_gro_receive(p, skb)) {
mss = 1;
goto out_check_final;
}
@@ -277,7 +277,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
TCP_FLAG_FIN));
if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
- pp = head;
+ pp = p;
out:
NAPI_GRO_CB(skb)->flush |= (flush != 0);
@@ -302,7 +302,7 @@ int tcp_gro_complete(struct sk_buff *skb)
}
EXPORT_SYMBOL(tcp_gro_complete);
-static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+static struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
{
/* Don't bother verifying checksum if we're going to flush anyway. */
if (!NAPI_GRO_CB(skb)->flush &&
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 92dc9e5a7ff3..ac46c1c55c99 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -343,10 +343,11 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
return segs;
}
-struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
- struct udphdr *uh, udp_lookup_t lookup)
+struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
+ struct udphdr *uh, udp_lookup_t lookup)
{
- struct sk_buff *p, **pp = NULL;
+ struct sk_buff *pp = NULL;
+ struct sk_buff *p;
struct udphdr *uh2;
unsigned int off = skb_gro_offset(skb);
int flush = 1;
@@ -371,7 +372,7 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
unflush:
flush = 0;
- for (p = *head; p; p = p->next) {
+ list_for_each_entry(p, head, list) {
if (!NAPI_GRO_CB(p)->same_flow)
continue;
@@ -399,8 +400,8 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
}
EXPORT_SYMBOL(udp_gro_receive);
-static struct sk_buff **udp4_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *udp4_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
struct udphdr *uh = udp_gro_udphdr(skb);
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index 27f59b61f70f..ddfa533a84e5 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -49,8 +49,8 @@ static __u16 esp6_nexthdr_esp_offset(struct ipv6hdr *ipv6_hdr, int nhlen)
return 0;
}
-static struct sk_buff **esp6_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *esp6_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
int offset = skb_gro_offset(skb);
struct xfrm_offload *xo;
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 5b3f2f89ef41..37ff4805b20c 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -163,11 +163,11 @@ static int ipv6_exthdrs_len(struct ipv6hdr *iph,
return len;
}
-static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *ipv6_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
const struct net_offload *ops;
- struct sk_buff **pp = NULL;
+ struct sk_buff *pp = NULL;
struct sk_buff *p;
struct ipv6hdr *iph;
unsigned int nlen;
@@ -214,7 +214,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
flush--;
nlen = skb_network_header_len(skb);
- for (p = *head; p; p = p->next) {
+ list_for_each_entry(p, head, list) {
const struct ipv6hdr *iph2;
__be32 first_word; /* <Version:4><Traffic_Class:8><Flow_Label:20> */
@@ -263,8 +263,8 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
return pp;
}
-static struct sk_buff **sit_ip6ip6_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *sit_ip6ip6_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
/* Common GRO receive for SIT and IP6IP6 */
@@ -278,8 +278,8 @@ static struct sk_buff **sit_ip6ip6_gro_receive(struct sk_buff **head,
return ipv6_gro_receive(head, skb);
}
-static struct sk_buff **ip4ip6_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *ip4ip6_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
/* Common GRO receive for SIT and IP6IP6 */
diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c
index 278e49cd67d4..e72947c99454 100644
--- a/net/ipv6/tcpv6_offload.c
+++ b/net/ipv6/tcpv6_offload.c
@@ -15,8 +15,8 @@
#include <net/ip6_checksum.h>
#include "ip6_offload.h"
-static struct sk_buff **tcp6_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *tcp6_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
/* Don't bother verifying checksum if we're going to flush anyway. */
if (!NAPI_GRO_CB(skb)->flush &&
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 03a2ff3fe1e6..95dee9ca8d22 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -114,8 +114,8 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
return segs;
}
-static struct sk_buff **udp6_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *udp6_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
struct udphdr *uh = udp_gro_udphdr(skb);
--
2.17.1
^ permalink raw reply related
* [PATCH RFC 2/2] net: Convert NAPI gro list into a small hash table.
From: David Miller @ 2018-06-24 5:14 UTC (permalink / raw)
To: netdev; +Cc: edumazet
Improve the performance of GRO receive by splitting flows into
multiple hash chains.
Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
include/linux/netdevice.h | 3 +-
net/core/dev.c | 105 ++++++++++++++++++++++++++++----------
2 files changed, 81 insertions(+), 27 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f176d9873910..c6b377a15869 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -305,6 +305,7 @@ int __init netdev_boot_setup(char *str);
/*
* Structure for NAPI scheduling similar to tasklet but with weighting
*/
+#define GRO_HASH_BUCKETS 8
struct napi_struct {
/* The poll_list must only be managed by the entity which
* changes the state of the NAPI_STATE_SCHED bit. This means
@@ -322,7 +323,7 @@ struct napi_struct {
int poll_owner;
#endif
struct net_device *dev;
- struct list_head gro_list;
+ struct list_head gro_hash[GRO_HASH_BUCKETS];
struct sk_buff *skb;
struct hrtimer timer;
struct list_head dev_list;
diff --git a/net/core/dev.c b/net/core/dev.c
index aa61b9344b46..dffed642e686 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4875,15 +4875,12 @@ static int napi_gro_complete(struct sk_buff *skb)
return netif_receive_skb_internal(skb);
}
-/* napi->gro_list contains packets ordered by age.
- * youngest packets at the head of it.
- * Complete skbs in reverse order to reduce latencies.
- */
-void napi_gro_flush(struct napi_struct *napi, bool flush_old)
+static void __napi_gro_flush_chain(struct napi_struct *napi, struct list_head *head,
+ bool flush_old)
{
struct sk_buff *skb, *p;
- list_for_each_entry_safe_reverse(skb, p, &napi->gro_list, list) {
+ list_for_each_entry_safe_reverse(skb, p, head, list) {
if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
return;
list_del_init(&skb->list);
@@ -4891,15 +4888,33 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old)
napi->gro_count--;
}
}
+
+/* napi->gro_hash contains packets ordered by age.
+ * youngest packets at the head of it.
+ * Complete skbs in reverse order to reduce latencies.
+ */
+void napi_gro_flush(struct napi_struct *napi, bool flush_old)
+{
+ int i;
+
+ for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+ struct list_head *head = &napi->gro_hash[i];
+
+ __napi_gro_flush_chain(napi, head, flush_old);
+ }
+}
EXPORT_SYMBOL(napi_gro_flush);
-static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
+static struct list_head *gro_list_prepare(struct napi_struct *napi,
+ struct sk_buff *skb)
{
unsigned int maclen = skb->dev->hard_header_len;
u32 hash = skb_get_hash_raw(skb);
+ struct list_head *head;
struct sk_buff *p;
- list_for_each_entry(p, &napi->gro_list, list) {
+ head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)];
+ list_for_each_entry(p, head, list) {
unsigned long diffs;
NAPI_GRO_CB(p)->flush = 0;
@@ -4922,6 +4937,8 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
maclen);
NAPI_GRO_CB(p)->same_flow = !diffs;
}
+
+ return head;
}
static void skb_gro_reset_offset(struct sk_buff *skb)
@@ -4964,11 +4981,45 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
}
}
+static void gro_flush_oldest(struct napi_struct *napi)
+{
+ struct sk_buff *oldest = NULL;
+ unsigned long age = jiffies;
+ int i;
+
+ for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+ struct list_head *head = &napi->gro_hash[i];
+ struct sk_buff *skb;
+
+ if (list_empty(head))
+ continue;
+
+ skb = list_last_entry(head, struct sk_buff, list);
+ if (!oldest || time_before(NAPI_GRO_CB(skb)->age, age)) {
+ oldest = skb;
+ age = NAPI_GRO_CB(skb)->age;
+ }
+ }
+
+ /* We are called with napi->gro_count >= MAX_GRO_SKBS, so this is
+ * impossible.
+ */
+ if (WARN_ON_ONCE(!oldest))
+ return;
+
+ /* Do not adjust napi->gro_count, caller is adding a new SKB to
+ * the chain.
+ */
+ list_del(&oldest->list);
+ napi_gro_complete(oldest);
+}
+
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
struct list_head *head = &offload_base;
struct packet_offload *ptype;
__be16 type = skb->protocol;
+ struct list_head *gro_head;
struct sk_buff *pp = NULL;
enum gro_result ret;
int same_flow;
@@ -4977,7 +5028,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (netif_elide_gro(skb->dev))
goto normal;
- gro_list_prepare(napi, skb);
+ gro_head = gro_list_prepare(napi, skb);
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
@@ -5011,7 +5062,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
NAPI_GRO_CB(skb)->csum_valid = 0;
}
- pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
+ pp = ptype->callbacks.gro_receive(gro_head, skb);
break;
}
rcu_read_unlock();
@@ -5040,11 +5091,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
goto normal;
if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
- struct sk_buff *nskb;
-
- nskb = list_last_entry(&napi->gro_list, struct sk_buff, list);
- list_del(&nskb->list);
- napi_gro_complete(nskb);
+ gro_flush_oldest(napi);
} else {
napi->gro_count++;
}
@@ -5052,7 +5099,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
NAPI_GRO_CB(skb)->age = jiffies;
NAPI_GRO_CB(skb)->last = skb;
skb_shinfo(skb)->gso_size = skb_gro_len(skb);
- list_add(&skb->list, &napi->gro_list);
+ list_add(&skb->list, gro_head);
ret = GRO_HELD;
pull:
@@ -5458,7 +5505,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
NAPIF_STATE_IN_BUSY_POLL)))
return false;
- if (!list_empty(&n->gro_list)) {
+ if (n->gro_count) {
unsigned long timeout = 0;
if (work_done)
@@ -5667,7 +5714,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
/* Note : we use a relaxed variant of napi_schedule_prep() not setting
* NAPI_STATE_MISSED, since we do not react to a device IRQ.
*/
- if (!list_empty(&napi->gro_list) && !napi_disable_pending(napi) &&
+ if (napi->gro_count && !napi_disable_pending(napi) &&
!test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
__napi_schedule_irqoff(napi);
@@ -5677,11 +5724,14 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
+ int i;
+
INIT_LIST_HEAD(&napi->poll_list);
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
napi->timer.function = napi_watchdog;
napi->gro_count = 0;
- INIT_LIST_HEAD(&napi->gro_list);
+ for (i = 0; i < GRO_HASH_BUCKETS; i++)
+ INIT_LIST_HEAD(&napi->gro_hash[i]);
napi->skb = NULL;
napi->poll = poll;
if (weight > NAPI_POLL_WEIGHT)
@@ -5714,12 +5764,16 @@ void napi_disable(struct napi_struct *n)
}
EXPORT_SYMBOL(napi_disable);
-static void gro_list_free(struct list_head *head)
+static void flush_gro_hash(struct napi_struct *napi)
{
- struct sk_buff *skb, *p;
+ int i;
- list_for_each_entry_safe(skb, p, head, list)
- kfree_skb(skb);
+ for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+ struct sk_buff *skb, *n;
+
+ list_for_each_entry_safe(skb, n, &napi->gro_hash[i], list)
+ kfree_skb(skb);
+ }
}
/* Must be called in process context */
@@ -5731,8 +5785,7 @@ void netif_napi_del(struct napi_struct *napi)
list_del_init(&napi->dev_list);
napi_free_frags(napi);
- gro_list_free(&napi->gro_list);
- INIT_LIST_HEAD(&napi->gro_list);
+ flush_gro_hash(napi);
napi->gro_count = 0;
}
EXPORT_SYMBOL(netif_napi_del);
@@ -5775,7 +5828,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
goto out_unlock;
}
- if (!list_empty(&n->gro_list)) {
+ if (n->gro_count) {
/* flush too old packets
* If HZ < 1000, flush all packets.
*/
--
2.17.1
^ permalink raw reply related
* Re: BUG: unable to handle kernel paging request in bpf_int_jit_compile
From: Thomas Gleixner @ 2018-06-24 7:09 UTC (permalink / raw)
To: syzbot
Cc: ast, daniel, David Miller, H. Peter Anvin, kuznet, LKML, mingo,
netdev, syzkaller-bugs, x86, yoshfuji, Peter Zijlstra
In-Reply-To: <000000000000d48c8e056f5b6c67@google.com>
On Sat, 23 Jun 2018, syzbot wrote:
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+a4eb8c7766952a1ca872@syzkaller.appspotmail.com
>
> RAX: ffffffffffffffda RBX: 0000000001429914 RCX: 0000000000455a99
> RDX: 0000000000000048 RSI: 0000000020000240 RDI: 0000000000000005
> RBP: 000000000072bea0 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000005
> R13: 00000000004bb7d5 R14: 00000000004c8508 R15: 0000000000000023
> BUG: unable to handle kernel paging request at ffffffffa0008002
> PGD 8e6d067 P4D 8e6d067 PUD 8e6e063 PMD 1b4528067 PTE 1d433d161
> Oops: 0003 [#1] SMP KASAN
> CPU: 1 PID: 4811 Comm: syz-executor0 Not tainted 4.18.0-rc1+ #114
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google
> 01/01/2011
> RIP: 0010:bpf_jit_binary_lock_ro include/linux/filter.h:703 [inline]
> RIP: 0010:bpf_int_jit_compile+0xc36/0xf30 arch/x86/net/bpf_jit_comp.c:1168
static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
{
WARN_ON_ONCE(set_memory_ro((unsigned long)hdr, hdr->pages));
}
Qualitee. set_memory_ro() has legitimate reasons to fail, but sure it does
not most of the time.
So instead of implementing proper error handling, this adds complete bogus
wrappers. Hell, set_memory_*() have stub functions which return 0 for the
CONFIG_ARCH_HAS_SET_MEMORY=n case.
The unlock function is even more hilarious:
static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)
{
if (fp->locked) {
WARN_ON_ONCE(set_memory_rw((unsigned long)fp, fp->pages));
/* In case set_memory_rw() fails, we want to be the first
* to crash here instead of some random place later on.
*/
fp->locked = 0;
}
}
Great approach for a facility, which deals with untrusted user space
stuff. Yeah. I know. The BPF mantra is: "Performance first"
I'm really tempted to make the BPF config switch depend on BROKEN.
Thanks,
tglx
^ permalink raw reply
* Re: BUG: unable to handle kernel paging request in bpf_int_jit_compile
From: David Miller @ 2018-06-24 7:14 UTC (permalink / raw)
To: tglx
Cc: syzbot+a4eb8c7766952a1ca872, ast, daniel, hpa, kuznet,
linux-kernel, mingo, netdev, syzkaller-bugs, x86, yoshfuji,
peterz
In-Reply-To: <alpine.DEB.2.21.1806240849150.8650@nanos.tec.linutronix.de>
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 24 Jun 2018 09:09:09 +0200 (CEST)
> I'm really tempted to make the BPF config switch depend on BROKEN.
This really isn't necessary Thomas.
Whoever wrote the code didn't understand that set ro can legitimately
fail.
So let's correct that instead of flaming a feature.
Thank you.
^ permalink raw reply
* Re: [PATCH net-next] strparser: Corrected typo in documentation.
From: David Miller @ 2018-06-24 7:18 UTC (permalink / raw)
To: vakul.garg; +Cc: netdev, linux-kernel, linux-doc, corbet
In-Reply-To: <20180624123721.24287-1-vakul.garg@nxp.com>
From: Vakul Garg <vakul.garg@nxp.com>
Date: Sun, 24 Jun 2018 18:07:21 +0530
> Replaced strp_pause() with strp_unpause() to correct a seemingly copy
> paste documentation mistake.
>
> Signed-off-by: Vakul Garg <vakul.garg@nxp.com>
As a bug fix this should target 'net'.
This is even more true since this fixes documentation.
^ permalink raw reply
* BUG: unable to handle kernel paging request in bpf_prog_select_runtime
From: syzbot @ 2018-06-24 7:32 UTC (permalink / raw)
To: ast, daniel, linux-kernel, netdev, syzkaller-bugs
Hello,
syzbot found the following crash on:
HEAD commit: 5e2204832b20 Merge tag 'powerpc-4.18-2' of git://git.kerne..
git tree: upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=11e79814400000
kernel config: https://syzkaller.appspot.com/x/.config?x=befbcd7305e41bb0
dashboard link: https://syzkaller.appspot.com/bug?extid=d866d1925855328eac3b
compiler: gcc (GCC) 8.0.1 20180413 (experimental)
Unfortunately, I don't have any reproducer for this crash yet.
IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+d866d1925855328eac3b@syzkaller.appspotmail.com
BUG: unable to handle kernel paging request at ffffc90001952002
PGD 1da947067 P4D 1da947067 PUD 1da948067 PMD 1d3a61067 PTE 800000016ba62161
Oops: 0003 [#1] SMP KASAN
CPU: 0 PID: 17593 Comm: syz-executor3 Not tainted 4.18.0-rc1+ #114
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
RIP: 0010:bpf_prog_lock_ro include/linux/filter.h:681 [inline]
RIP: 0010:bpf_prog_select_runtime+0xf5/0xa60 kernel/bpf/core.c:1519
Code: 48 b8 00 00 00 00 00 fc ff df 48 89 ca 48 c1 ea 03 0f b6 14 02 48 89
c8 83 e0 07 83 c0 01 38 d0 7c 08 84 d2 0f 85 de 08 00 00 <41> 80 66 02 fb
e8 f1 67 f3 ff 49 8d 7e 20 48 b8 00 00 00 00 00 fc
RSP: 0018:ffff8801b4d6fac8 EFLAGS: 00010246
RAX: 0000000000000003 RBX: 00000000fffffff4 RCX: ffffc90001952002
RDX: 0000000000000000 RSI: ffffffff8188a717 RDI: 0000000000000005
RBP: ffff8801b4d6fb30 R08: ffff8801d778e400 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
R13: ffffc90001952030 R14: ffffc90001952000 R15: ffff8801b4d6fb98
FS: 00007f59edc0c700(0000) GS:ffff8801dae00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffc90001952002 CR3: 00000001d20b4000 CR4: 00000000001406f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
bpf_prog_load+0x1194/0x1c60 kernel/bpf/syscall.c:1356
__do_sys_bpf kernel/bpf/syscall.c:2360 [inline]
__se_sys_bpf kernel/bpf/syscall.c:2322 [inline]
__x64_sys_bpf+0x36c/0x510 kernel/bpf/syscall.c:2322
do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x455a99
Code: 1d ba fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7
48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff
ff 0f 83 eb b9 fb ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007f59edc0bc68 EFLAGS: 00000246 ORIG_RAX: 0000000000000141
RAX: ffffffffffffffda RBX: 00007f59edc0c6d4 RCX: 0000000000455a99
RDX: 0000000000000048 RSI: 0000000020000240 RDI: 0000000000000005
RBP: 000000000072bea0 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000015
R13: 00000000004bb7d5 R14: 00000000004c8508 R15: 0000000000000023
Modules linked in:
Dumping ftrace buffer:
(ftrace buffer empty)
CR2: ffffc90001952002
---[ end trace f4bd75b0437a2cce ]---
RIP: 0010:bpf_prog_lock_ro include/linux/filter.h:681 [inline]
RIP: 0010:bpf_prog_select_runtime+0xf5/0xa60 kernel/bpf/core.c:1519
Code: 48 b8 00 00 00 00 00 fc ff df 48 89 ca 48 c1 ea 03 0f b6 14 02 48 89
c8 83 e0 07 83 c0 01 38 d0 7c 08 84 d2 0f 85 de 08 00 00 <41> 80 66 02 fb
e8 f1 67 f3 ff 49 8d 7e 20 48 b8 00 00 00 00 00 fc
RSP: 0018:ffff8801b4d6fac8 EFLAGS: 00010246
RAX: 0000000000000003 RBX: 00000000fffffff4 RCX: ffffc90001952002
RDX: 0000000000000000 RSI: ffffffff8188a717 RDI: 0000000000000005
RBP: ffff8801b4d6fb30 R08: ffff8801d778e400 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
R13: ffffc90001952030 R14: ffffc90001952000 R15: ffff8801b4d6fb98
FS: 00007f59edc0c700(0000) GS:ffff8801dae00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffc90001952002 CR3: 00000001d20b4000 CR4: 00000000001406f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.
syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#bug-status-tracking for how to communicate with
syzbot.
^ permalink raw reply
* Re: [PATCH net] strparser: Corrected typo in documentation.
From: David Miller @ 2018-06-24 7:40 UTC (permalink / raw)
To: vakul.garg; +Cc: netdev, linux-kernel, linux-doc, corbet
In-Reply-To: <20180624124401.24584-1-vakul.garg@nxp.com>
From: Vakul Garg <vakul.garg@nxp.com>
Date: Sun, 24 Jun 2018 18:14:01 +0530
> Replaced strp_pause() with strp_unpause() to correct a seemingly copy
> paste documentation mistake.
>
> Signed-off-by: Vakul Garg <vakul.garg@nxp.com>
> ---
> Resending for 'net' as advised.
Applied, thank you.
^ permalink raw reply
* Re: [PATCH net-next] net: phy: fixed-phy: Make the error path simpler
From: David Miller @ 2018-06-24 7:42 UTC (permalink / raw)
To: festevam; +Cc: andrew, f.fainelli, netdev, fabio.estevam
In-Reply-To: <1529800102-18287-1-git-send-email-festevam@gmail.com>
From: Fabio Estevam <festevam@gmail.com>
Date: Sat, 23 Jun 2018 21:28:22 -0300
> From: Fabio Estevam <fabio.estevam@nxp.com>
>
> When platform_device_register_simple() fails we can return
> the error immediately instead of jumping to the 'err_pdev'
> label.
>
> This makes the error path a bit simpler.
>
> Signed-off-by: Fabio Estevam <fabio.estevam@nxp.com>
Applied, thank you.
^ permalink raw reply
* Re: [Patch net-next] net_sched: remove unused htb drop_list
From: David Miller @ 2018-06-24 7:43 UTC (permalink / raw)
To: xiyou.wangcong; +Cc: netdev, fw
In-Reply-To: <20180623204639.29933-1-xiyou.wangcong@gmail.com>
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Sat, 23 Jun 2018 13:46:39 -0700
> After commit a09ceb0e0814 ("sched: remove qdisc->drop"),
> it is no longer used.
>
> Cc: Florian Westphal <fw@strlen.de>
> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Applied, thanks Cong.
^ permalink raw reply
* Re: [PATCH] ipv6: avoid copy_from_user() via ipv6_renew_options_kern()
From: David Miller @ 2018-06-24 7:48 UTC (permalink / raw)
To: viro; +Cc: pmoore, netdev, selinux, linux-security-module
In-Reply-To: <20180623222106.GE30522@ZenIV.linux.org.uk>
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Sat, 23 Jun 2018 23:21:07 +0100
> BTW, I wonder if the life would be simpler with do_ipv6_setsockopt() doing
> the copy-in and verifying ipv6_optlen(*hdr) <= newoptlen; that would've
> simplified ipv6_renew_option{,s}() quite a bit and completely eliminated
> ipv6_renew_options_kern()...
I agree that this makes things a lot simpler.
One thing that drives me crazy though is this inherit stuff:
> + ipv6_renew_option(newtype == IPV6_HOPOPTS ? newopt :
> + opt ? opt->hopopt : NULL,
Why don't we pass the type into ipv6_renew_option() and have it
do this pointer dance instead?
That's going to definitely be easier to read.
I don't know enough about this code to give feedback about the
option length handling wrt. copies, sorry.
^ permalink raw reply
* [PATCH rdma-next 08/12] overflow.h: Add arithmetic shift helper
From: Leon Romanovsky @ 2018-06-24 8:23 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe, Kees Cook, Rasmus Villemoes
Cc: Leon Romanovsky, RDMA mailing list, Hadar Hen Zion, Matan Barak,
Michael J Ruhl, Noa Osherovich, Raed Salem, Yishai Hadas,
Saeed Mahameed, linux-netdev, linux-kernel
In-Reply-To: <20180624082353.16138-1-leon@kernel.org>
From: Leon Romanovsky <leonro@mellanox.com>
Add a shift_overflow() helper to help driver authors ensure that the
shift operand doesn't cause an overflow, which is a very common pattern
in RDMA drivers.
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
include/linux/overflow.h | 23 +++++++++++++++++++++++
1 file changed, 23 insertions(+)
diff --git a/include/linux/overflow.h b/include/linux/overflow.h
index 8712ff70995f..2a3395248e94 100644
--- a/include/linux/overflow.h
+++ b/include/linux/overflow.h
@@ -202,6 +202,29 @@
#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */
+/**
+ * shift_overflow() - Perform shift operation with overflow check
+ * @a: value to be shifted
+ * @b: shift operand
+ *
+ * Checks if a << b will overflow
+ *
+ * Returns: result of shift for no overflow or SIZE_MAX for overflow
+ */
+static inline __must_check size_t shift_overflow(size_t a, size_t b)
+{
+ size_t c, res;
+
+ if (b >= sizeof(size_t) * BITS_PER_BYTE)
+ return SIZE_MAX;
+
+ c = (size_t)1 << b;
+ if (check_mul_overflow(a, c, &res))
+ return SIZE_MAX;
+
+ return res;
+}
+
/**
* array_size() - Calculate size of 2-dimensional array.
*
--
2.14.4
^ permalink raw reply related
* [PATCH rdma-next 00/12] RDMA fixes 2018-06-24
From: Leon Romanovsky @ 2018-06-24 8:23 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Hadar Hen Zion, Matan Barak,
Michael J Ruhl, Noa Osherovich, Raed Salem, Yishai Hadas,
Saeed Mahameed, linux-netdev
From: Leon Romanovsky <leonro@mellanox.com>
Hi,
This is a bunch of patches triggered by running syzkaller internally.
I'm sending them based on rdma-next mainly for two reasons:
1. Most of the patches fix old issues, and it doesn't matter when
they hit Linus's tree: now, or in a couple of weeks
during the merge window.
2. They interleave with code cleanup, mlx5-next patches and Michael's
feedback on flow counters series.
Thanks
Leon Romanovsky (12):
RDMA/uverbs: Protect from attempts to create flows on unsupported QP
RDMA/uverbs: Check existence of create_flow callback
RDMA/verbs: Drop kernel variant of create_flow
RDMA/verbs: Drop kernel variant of destroy_flow
net/mlx5: Rate limit errors in command interface
RDMA/uverbs: Don't overwrite NULL pointer with ZERO_SIZE_PTR
RDMA/umem: Don't check for negative return value of dma_map_sg_attrs()
overflow.h: Add arithmetic shift helper
RDMA/mlx5: Fix shift overflow in mlx5_ib_create_wq
RDMA/mlx5: Reuse existed shift_overlow helper
RDMA/uverbs: Remove redundant check
RDMA/uverbs: Fix slab-out-of-bounds in ib_uverbs_ex_create_flow
drivers/infiniband/core/umem.c | 2 +-
drivers/infiniband/core/uverbs_cmd.c | 49 ++++++++++++++--------
drivers/infiniband/core/uverbs_std_types.c | 9 ++--
drivers/infiniband/core/verbs.c | 29 -------------
drivers/infiniband/hw/mlx5/qp.c | 16 +++++--
drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 11 ++---
.../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 6 +++
include/linux/overflow.h | 23 ++++++++++
include/rdma/ib_verbs.h | 4 --
9 files changed, 83 insertions(+), 66 deletions(-)
^ permalink raw reply
* [PATCH rdma-next 01/12] RDMA/uverbs: Protect from attempts to create flows on unsupported QP
From: Leon Romanovsky @ 2018-06-24 8:23 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Hadar Hen Zion, Matan Barak,
Michael J Ruhl, Noa Osherovich, Raed Salem, Yishai Hadas,
Saeed Mahameed, linux-netdev
In-Reply-To: <20180624082353.16138-1-leon@kernel.org>
From: Leon Romanovsky <leonro@mellanox.com>
Flows can be created on UD and RAW_PACKET QP types. Attempts to provide
other QP types as input cause various unpredictable failures.
The reason is that, in order to support all the various types (e.g. XRC),
we are supposed to use the real_qp handle and not the qp handle, and let the
driver/FW fail such (XRC) flows. While that is a valid solution, the simpler
and safer variant is to ban all QP types except UD and RAW_PACKET,
instead of relying on the driver/FW.
Cc: <stable@vger.kernel.org> # 3.11
Fixes: 436f2ad05a0b ("IB/core: Export ib_create/destroy_flow through uverbs")
Cc: syzkaller <syzkaller@googlegroups.com>
Reported-by: Noa Osherovich <noaos@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/infiniband/core/uverbs_cmd.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 779892b63729..c842a9423fbf 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -3553,14 +3553,20 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
goto err_free_attr;
}
- qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file->ucontext);
+ qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle,
+ file->ucontext);
if (!qp) {
err = -EINVAL;
goto err_uobj;
}
+ if (qp->qp_type != IB_QPT_UD && qp->qp_type != IB_QPT_RAW_PACKET) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
flow_attr = kzalloc(struct_size(flow_attr, flows,
- cmd.flow_attr.num_of_specs), GFP_KERNEL);
+ cmd.flow_attr.num_of_specs), GFP_KERNEL);
if (!flow_attr) {
err = -ENOMEM;
goto err_put;
--
2.14.4
^ permalink raw reply related
* [PATCH rdma-next 02/12] RDMA/uverbs: Check existence of create_flow callback
From: Leon Romanovsky @ 2018-06-24 8:23 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Hadar Hen Zion, Matan Barak,
Michael J Ruhl, Noa Osherovich, Raed Salem, Yishai Hadas,
Saeed Mahameed, linux-netdev
In-Reply-To: <20180624082353.16138-1-leon@kernel.org>
From: Leon Romanovsky <leonro@mellanox.com>
In the accepted series "Refactor ib_uverbs_write path", we presented the
roadmap to get rid of uverbs_cmd_mask and uverbs_ex_cmd_mask fields in
favor of simple check of function pointer. So let's put NULL check of
create_flow function callback despite the fact that uverbs_ex_cmd_mask
still exists.
Link: https://www.spinics.net/lists/linux-rdma/msg60753.html
Suggested-by: Michael J Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/infiniband/core/uverbs_cmd.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index c842a9423fbf..6251d80db732 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -3565,6 +3565,11 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
goto err_put;
}
+ if (!qp->device->create_flow) {
+ err = -EOPNOTSUPP;
+ goto err_put;
+ }
+
flow_attr = kzalloc(struct_size(flow_attr, flows,
cmd.flow_attr.num_of_specs), GFP_KERNEL);
if (!flow_attr) {
--
2.14.4
^ permalink raw reply related
* [PATCH rdma-next 03/12] RDMA/verbs: Drop kernel variant of create_flow
From: Leon Romanovsky @ 2018-06-24 8:23 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Hadar Hen Zion, Matan Barak,
Michael J Ruhl, Noa Osherovich, Raed Salem, Yishai Hadas,
Saeed Mahameed, linux-netdev
In-Reply-To: <20180624082353.16138-1-leon@kernel.org>
From: Leon Romanovsky <leonro@mellanox.com>
There are no kernel users of this interface so let's drop it.
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/infiniband/core/verbs.c | 17 -----------------
include/rdma/ib_verbs.h | 2 --
2 files changed, 19 deletions(-)
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 9a72b88fea80..5ada09f708f5 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -2275,23 +2275,6 @@ int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *rwq_ind_table)
}
EXPORT_SYMBOL(ib_destroy_rwq_ind_table);
-struct ib_flow *ib_create_flow(struct ib_qp *qp,
- struct ib_flow_attr *flow_attr,
- int domain)
-{
- struct ib_flow *flow_id;
- if (!qp->device->create_flow)
- return ERR_PTR(-EOPNOTSUPP);
-
- flow_id = qp->device->create_flow(qp, flow_attr, domain, NULL);
- if (!IS_ERR(flow_id)) {
- atomic_inc(&qp->usecnt);
- flow_id->qp = qp;
- }
- return flow_id;
-}
-EXPORT_SYMBOL(ib_create_flow);
-
int ib_destroy_flow(struct ib_flow *flow_id)
{
int err;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index d1e2f2d91766..a55e1aa808a7 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -3832,8 +3832,6 @@ struct ib_xrcd *__ib_alloc_xrcd(struct ib_device *device, const char *caller);
*/
int ib_dealloc_xrcd(struct ib_xrcd *xrcd);
-struct ib_flow *ib_create_flow(struct ib_qp *qp,
- struct ib_flow_attr *flow_attr, int domain);
int ib_destroy_flow(struct ib_flow *flow_id);
static inline int ib_check_mr_access(int flags)
--
2.14.4
^ permalink raw reply related
* [PATCH mlx5-next 05/12] net/mlx5: Rate limit errors in command interface
From: Leon Romanovsky @ 2018-06-24 8:23 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Hadar Hen Zion, Matan Barak,
Michael J Ruhl, Noa Osherovich, Raed Salem, Yishai Hadas,
Saeed Mahameed, linux-netdev
In-Reply-To: <20180624082353.16138-1-leon@kernel.org>
From: Leon Romanovsky <leonro@mellanox.com>
Any error status returned by FW will trigger an error message in dmesg
similar to the following:
[ 55.884355] mlx5_core 0000:00:04.0: mlx5_cmd_check:712:(pid 555):
ALLOC_UAR(0x802) op_mod(0x0) failed, status limits exceeded(0x8),
syndrome (0x0)
Those prints are extremely valuable for diagnosing issues on a running
system, and it is important to keep them. However, a not-so-careful user
can trigger an endless number of such prints by depleting HW resources,
which will spam dmesg.
Rate limiting of such messages solves this issue.
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 11 ++++-------
drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h | 6 ++++++
2 files changed, 10 insertions(+), 7 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 9d03a202abb1..7dd878b00196 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -717,13 +717,10 @@ static int mlx5_cmd_check(struct mlx5_core_dev *dev, void *in, void *out)
uid = MLX5_GET(mbox_in, in, uid);
if (!uid && opcode != MLX5_CMD_OP_DESTROY_MKEY)
- mlx5_core_err(dev,
- "%s(0x%x) op_mod(0x%x) failed, status %s(0x%x), syndrome (0x%x)\n",
- mlx5_command_str(opcode),
- opcode, op_mod,
- cmd_status_str(status),
- status,
- syndrome);
+ mlx5_core_err_rl(dev,
+ "%s(0x%x) op_mod(0x%x) failed, status %s(0x%x), syndrome (0x%x)\n",
+ mlx5_command_str(opcode), opcode, op_mod,
+ cmd_status_str(status), status, syndrome);
else
mlx5_core_dbg(dev,
"%s(0x%x) op_mod(0x%x) failed, status %s(0x%x), syndrome (0x%x)\n",
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 023882d9a22e..49955117ae36 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -66,6 +66,12 @@ do { \
__func__, __LINE__, current->pid, \
##__VA_ARGS__)
+#define mlx5_core_err_rl(__dev, format, ...) \
+ dev_err_ratelimited(&(__dev)->pdev->dev, \
+ "%s:%d:(pid %d): " format, \
+ __func__, __LINE__, current->pid, \
+ ##__VA_ARGS__)
+
#define mlx5_core_warn(__dev, format, ...) \
dev_warn(&(__dev)->pdev->dev, "%s:%d:(pid %d): " format, \
__func__, __LINE__, current->pid, \
--
2.14.4
^ permalink raw reply related
* [PATCH rdma-next 06/12] RDMA/uverbs: Don't overwrite NULL pointer with ZERO_SIZE_PTR
From: Leon Romanovsky @ 2018-06-24 8:23 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Hadar Hen Zion, Matan Barak,
Michael J Ruhl, Noa Osherovich, Raed Salem, Yishai Hadas,
Saeed Mahameed, linux-netdev
In-Reply-To: <20180624082353.16138-1-leon@kernel.org>
From: Leon Romanovsky <leonro@mellanox.com>
The number of specs is provided by the user and, in the valid case, can be
equal to zero. Such an argument causes a call to kcalloc() with a zero-length
request, and ZERO_SIZE_PTR is assigned in return. This pointer is different
from NULL and makes various if (..) checks succeed.
Fixes: b6ba4a9aa59f ("IB/uverbs: Add support for flow counters")
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/infiniband/core/uverbs_cmd.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 3aba63aa1779..8ed4b674416f 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -2768,6 +2768,9 @@ static struct ib_uflow_resources *flow_resources_alloc(size_t num_specs)
if (!resources)
return NULL;
+ if (!num_specs)
+ goto out;
+
resources->counters =
kcalloc(num_specs, sizeof(*resources->counters), GFP_KERNEL);
resources->collection =
@@ -2776,8 +2779,8 @@ static struct ib_uflow_resources *flow_resources_alloc(size_t num_specs)
if (!resources->counters || !resources->collection)
goto err;
+out:
resources->max = num_specs;
-
return resources;
err:
--
2.14.4
^ permalink raw reply related
* [PATCH rdma-next 07/12] RDMA/umem: Don't check for negative return value of dma_map_sg_attrs()
From: Leon Romanovsky @ 2018-06-24 8:23 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Hadar Hen Zion, Matan Barak,
Michael J Ruhl, Noa Osherovich, Raed Salem, Yishai Hadas,
Saeed Mahameed, linux-netdev
In-Reply-To: <20180624082353.16138-1-leon@kernel.org>
From: Leon Romanovsky <leonro@mellanox.com>
dma_map_sg_attrs() returns 0 on error and can't return a negative number
(ensured by BUG_ON), so don't check for a negative value.
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/infiniband/core/umem.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 54ab6335c48d..498f59bb4989 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -206,7 +206,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
DMA_BIDIRECTIONAL,
dma_attrs);
- if (umem->nmap <= 0) {
+ if (!umem->nmap) {
ret = -ENOMEM;
goto out;
}
--
2.14.4
^ permalink raw reply related
* [PATCH rdma-next 09/12] RDMA/mlx5: Fix shift overflow in mlx5_ib_create_wq
From: Leon Romanovsky @ 2018-06-24 8:23 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Hadar Hen Zion, Matan Barak,
Michael J Ruhl, Noa Osherovich, Raed Salem, Yishai Hadas,
Saeed Mahameed, linux-netdev
In-Reply-To: <20180624082353.16138-1-leon@kernel.org>
From: Leon Romanovsky <leonro@mellanox.com>
[ 61.182439] UBSAN: Undefined behaviour in drivers/infiniband/hw/mlx5/qp.c:5366:34
[ 61.183673] shift exponent 4294967288 is too large for 32-bit type 'unsigned int'
[ 61.185530] CPU: 0 PID: 639 Comm: qp Not tainted 4.18.0-rc1-00037-g4aa1d69a9c60-dirty #96
[ 61.186981] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-2.fc27 04/01/2014
[ 61.188315] Call Trace:
[ 61.188661] dump_stack+0xc7/0x13b
[ 61.190427] ubsan_epilogue+0x9/0x49
[ 61.190899] __ubsan_handle_shift_out_of_bounds+0x1ea/0x22f
[ 61.197040] mlx5_ib_create_wq+0x1c99/0x1d50
[ 61.206632] ib_uverbs_ex_create_wq+0x499/0x820
[ 61.213892] ib_uverbs_write+0x77e/0xae0
[ 61.248018] vfs_write+0x121/0x3b0
[ 61.249831] ksys_write+0xa1/0x120
[ 61.254024] do_syscall_64+0x7c/0x2a0
[ 61.256178] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 61.259211] RIP: 0033:0x7f54bab70e99
[ 61.262125] Code: 00 f3 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89
[ 61.268678] RSP: 002b:00007ffe1541c318 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[ 61.271076] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f54bab70e99
[ 61.273795] RDX: 0000000000000070 RSI: 0000000020000240 RDI: 0000000000000003
[ 61.276982] RBP: 00007ffe1541c330 R08: 00000000200078e0 R09: 0000000000000002
[ 61.280035] R10: 0000000000000000 R11: 0000000000000246 R12: 00000000004005c0
[ 61.283279] R13: 00007ffe1541c420 R14: 0000000000000000 R15: 0000000000000000
Cc: <stable@vger.kernel.org> # 4.7
Fixes: 79b20a6c3014 ("IB/mlx5: Add receive Work Queue verbs")
Cc: syzkaller <syzkaller@googlegroups.com>
Reported-by: Noa Osherovich <noaos@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/infiniband/hw/mlx5/qp.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 6034a670859f..8e40263fd40e 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -5377,7 +5377,11 @@ static int set_user_rq_size(struct mlx5_ib_dev *dev,
rwq->wqe_count = ucmd->rq_wqe_count;
rwq->wqe_shift = ucmd->rq_wqe_shift;
- rwq->buf_size = (rwq->wqe_count << rwq->wqe_shift);
+ rwq->buf_size =
+ shift_overflow((size_t)rwq->wqe_count, (size_t)rwq->wqe_shift);
+ if (rwq->buf_size == SIZE_MAX)
+ return -EINVAL;
+
rwq->log_rq_stride = rwq->wqe_shift;
rwq->log_rq_size = ilog2(rwq->wqe_count);
return 0;
--
2.14.4
^ permalink raw reply related
* [PATCH rdma-next 10/12] RDMA/mlx5: Reuse existing shift_overflow helper
From: Leon Romanovsky @ 2018-06-24 8:23 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Hadar Hen Zion, Matan Barak,
Michael J Ruhl, Noa Osherovich, Raed Salem, Yishai Hadas,
Saeed Mahameed, linux-netdev
In-Reply-To: <20180624082353.16138-1-leon@kernel.org>
From: Leon Romanovsky <leonro@mellanox.com>
Rewrite commit 002bf2282b2d ("RDMA/mlx5: Protect from shift operand
overflow") to reuse the newly introduced shift_overflow() helper.
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/infiniband/hw/mlx5/qp.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 8e40263fd40e..5471b57b873d 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -259,13 +259,17 @@ static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap,
cap->max_recv_sge = 0;
} else {
if (ucmd) {
+ size_t s;
+
qp->rq.wqe_cnt = ucmd->rq_wqe_count;
- if (ucmd->rq_wqe_shift > BITS_PER_BYTE * sizeof(ucmd->rq_wqe_shift))
+ s = shift_overflow(1, ucmd->rq_wqe_shift);
+ if (s == SIZE_MAX)
return -EINVAL;
qp->rq.wqe_shift = ucmd->rq_wqe_shift;
- if ((1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) < qp->wq_sig)
+ if (s / sizeof(struct mlx5_wqe_data_seg) < qp->wq_sig)
return -EINVAL;
- qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig;
+ qp->rq.max_gs = s / sizeof(struct mlx5_wqe_data_seg) -
+ qp->wq_sig;
qp->rq.max_post = qp->rq.wqe_cnt;
} else {
wqe_size = qp->wq_sig ? sizeof(struct mlx5_wqe_signature_seg) : 0;
--
2.14.4
^ permalink raw reply related
* [PATCH rdma-next 11/12] RDMA/uverbs: Remove redundant check
From: Leon Romanovsky @ 2018-06-24 8:23 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Hadar Hen Zion, Matan Barak,
Michael J Ruhl, Noa Osherovich, Raed Salem, Yishai Hadas,
Saeed Mahameed, linux-netdev
In-Reply-To: <20180624082353.16138-1-leon@kernel.org>
From: Leon Romanovsky <leonro@mellanox.com>
kern_spec->reserved is checked prior to calling
kern_spec_to_ib_spec_filter(), which makes this second check redundant.
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/infiniband/core/uverbs_cmd.c | 3 ---
1 file changed, 3 deletions(-)
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 8ed4b674416f..3a0bc4c1b17b 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -3045,9 +3045,6 @@ static int kern_spec_to_ib_spec_filter(struct ib_uverbs_flow_spec *kern_spec,
void *kern_spec_mask;
void *kern_spec_val;
- if (kern_spec->reserved)
- return -EINVAL;
-
kern_filter_sz = kern_spec_filter_sz(&kern_spec->hdr);
kern_spec_val = (void *)kern_spec +
--
2.14.4
^ permalink raw reply related
* [PATCH rdma-next 04/12] RDMA/verbs: Drop kernel variant of destroy_flow
From: Leon Romanovsky @ 2018-06-24 8:23 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Hadar Hen Zion, Matan Barak,
Michael J Ruhl, Noa Osherovich, Raed Salem, Yishai Hadas,
Saeed Mahameed, linux-netdev
In-Reply-To: <20180624082353.16138-1-leon@kernel.org>
From: Leon Romanovsky <leonro@mellanox.com>
Following the removal of ib_create_flow(), adjust the code to get rid of
ib_destroy_flow() too.
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/infiniband/core/uverbs_cmd.c | 3 ++-
drivers/infiniband/core/uverbs_std_types.c | 9 ++++++---
drivers/infiniband/core/verbs.c | 12 ------------
include/rdma/ib_verbs.h | 2 --
4 files changed, 8 insertions(+), 18 deletions(-)
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 6251d80db732..3aba63aa1779 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -3642,7 +3642,8 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
kfree(kern_flow_attr);
return 0;
err_copy:
- ib_destroy_flow(flow_id);
+ if (!qp->device->destroy_flow(flow_id))
+ atomic_dec(&qp->usecnt);
err_free:
ib_uverbs_flow_resources_free(uflow_res);
err_free_flow_attr:
diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c
index 6497263d13c8..9b4e1e53cd9c 100644
--- a/drivers/infiniband/core/uverbs_std_types.c
+++ b/drivers/infiniband/core/uverbs_std_types.c
@@ -48,14 +48,17 @@ static int uverbs_free_ah(struct ib_uobject *uobject,
static int uverbs_free_flow(struct ib_uobject *uobject,
enum rdma_remove_reason why)
{
- int ret;
struct ib_flow *flow = (struct ib_flow *)uobject->object;
struct ib_uflow_object *uflow =
container_of(uobject, struct ib_uflow_object, uobject);
+ struct ib_qp *qp = flow->qp;
+ int ret;
- ret = ib_destroy_flow(flow);
- if (!ret)
+ ret = qp->device->destroy_flow(flow);
+ if (!ret) {
+ atomic_dec(&qp->usecnt);
ib_uverbs_flow_resources_free(uflow->resources);
+ }
return ret;
}
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 5ada09f708f5..128d94988dd8 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -2275,18 +2275,6 @@ int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *rwq_ind_table)
}
EXPORT_SYMBOL(ib_destroy_rwq_ind_table);
-int ib_destroy_flow(struct ib_flow *flow_id)
-{
- int err;
- struct ib_qp *qp = flow_id->qp;
-
- err = qp->device->destroy_flow(flow_id);
- if (!err)
- atomic_dec(&qp->usecnt);
- return err;
-}
-EXPORT_SYMBOL(ib_destroy_flow);
-
int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
struct ib_mr_status *mr_status)
{
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index a55e1aa808a7..6c51190ae7a1 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -3832,8 +3832,6 @@ struct ib_xrcd *__ib_alloc_xrcd(struct ib_device *device, const char *caller);
*/
int ib_dealloc_xrcd(struct ib_xrcd *xrcd);
-int ib_destroy_flow(struct ib_flow *flow_id);
-
static inline int ib_check_mr_access(int flags)
{
/*
--
2.14.4
^ permalink raw reply related
* [PATCH rdma-next 12/12] RDMA/uverbs: Fix slab-out-of-bounds in ib_uverbs_ex_create_flow
From: Leon Romanovsky @ 2018-06-24 8:23 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Hadar Hen Zion, Matan Barak,
Michael J Ruhl, Noa Osherovich, Raed Salem, Yishai Hadas,
Saeed Mahameed, linux-netdev
In-Reply-To: <20180624082353.16138-1-leon@kernel.org>
From: Leon Romanovsky <leonro@mellanox.com>
The check of cmd.flow_attr.size should take into account the size of the
reserved field (2 bytes); otherwise a user can provide a size which will
cause the slab-out-of-bounds warning below.
==================================================================
BUG: KASAN: slab-out-of-bounds in ib_uverbs_ex_create_flow+0x1740/0x1d00
Read of size 2 at addr ffff880068dff1a6 by task syz-executor775/269
CPU: 0 PID: 269 Comm: syz-executor775 Not tainted 4.18.0-rc1+ #245
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
rel-1.11.0-0-g63451fca13-prebuilt.qemu-project.org 04/01/2014
Call Trace:
dump_stack+0xef/0x17e
print_address_description+0x83/0x3b0
kasan_report+0x18d/0x4d0
ib_uverbs_ex_create_flow+0x1740/0x1d00
ib_uverbs_write+0x923/0x1010
__vfs_write+0x10d/0x720
vfs_write+0x1b0/0x550
ksys_write+0xc6/0x1a0
do_syscall_64+0xa7/0x590
entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x433899
Code: fd ff 48 81 c4 80 00 00 00 e9 f1 fe ff ff 0f 1f 00 48 89 f8 48 89
f7 48 89 d6 48 89 ca 4d 89 c2 4d
89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 3b 91 fd ff c3 66
2e 0f 1f 84 00 00 00 00
RSP: 002b:00007ffc2724db58 EFLAGS: 00000217 ORIG_RAX: 0000000000000001
RAX: ffffffffffffffda RBX: 0000000020006880 RCX: 0000000000433899
RDX: 00000000000000e0 RSI: 0000000020002480 RDI: 0000000000000003
RBP: 00000000006d7018 R08: 00000000004002f8 R09: 00000000004002f8
R10: 00000000004002f8 R11: 0000000000000217 R12: 0000000000000000
R13: 000000000040cd20 R14: 000000000040cdb0 R15: 0000000000000006
Allocated by task 269:
kasan_kmalloc+0xa0/0xd0
__kmalloc+0x1a9/0x510
ib_uverbs_ex_create_flow+0x26c/0x1d00
ib_uverbs_write+0x923/0x1010
__vfs_write+0x10d/0x720
vfs_write+0x1b0/0x550
ksys_write+0xc6/0x1a0
do_syscall_64+0xa7/0x590
entry_SYSCALL_64_after_hwframe+0x49/0xbe
Freed by task 0:
__kasan_slab_free+0x12e/0x180
kfree+0x159/0x630
detach_buf+0x559/0x7a0
virtqueue_get_buf_ctx+0x3cc/0xab0
virtblk_done+0x1eb/0x3d0
vring_interrupt+0x16d/0x2b0
__handle_irq_event_percpu+0x10a/0x980
handle_irq_event_percpu+0x77/0x190
handle_irq_event+0xc6/0x1a0
handle_edge_irq+0x211/0xd80
handle_irq+0x3d/0x60
do_IRQ+0x9b/0x220
The buggy address belongs to the object at ffff880068dff180
which belongs to the cache kmalloc-64 of size 64
The buggy address is located 38 bytes inside of
64-byte region [ffff880068dff180, ffff880068dff1c0)
The buggy address belongs to the page:
page:ffffea0001a37fc0 count:1 mapcount:0 mapping:ffff88006c401780
index:0x0
flags: 0x4000000000000100(slab)
raw: 4000000000000100 ffffea0001a31100 0000001100000011 ffff88006c401780
raw: 0000000000000000 00000000802a002a 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected
Memory state around the buggy address:
ffff880068dff080: fb fb fb fb fc fc fc fc fb fb fb fb fb fb fb fb
ffff880068dff100: fc fc fc fc fb fb fb fb fb fb fb fb fc fc fc fc
>ffff880068dff180: 00 00 00 00 07 fc fc fc fc fc fc fc fb fb fb fb
^
ffff880068dff200: fb fb fb fb fc fc fc fc 00 00 00 00 00 00 fc fc
ffff880068dff280: fc fc fc fc 00 00 00 00 00 00 00 00 fc fc fc fc
==================================================================
Cc: <stable@vger.kernel.org> # 3.12
Fixes: f88482743872 ("IB/core: clarify overflow/underflow checks on ib_create/destroy_flow")
Cc: syzkaller <syzkaller@googlegroups.com>
Reported-by: Noa Osherovich <noaos@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
drivers/infiniband/core/uverbs_cmd.c | 23 ++++++++++++-----------
1 file changed, 12 insertions(+), 11 deletions(-)
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 3a0bc4c1b17b..b6bca79fd48b 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -3488,8 +3488,8 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
struct ib_flow_attr *flow_attr;
struct ib_qp *qp;
struct ib_uflow_resources *uflow_res;
+ struct ib_uverbs_flow_spec_hdr *kern_spec;
int err = 0;
- void *kern_spec;
void *ib_spec;
int i;
@@ -3538,8 +3538,8 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
if (!kern_flow_attr)
return -ENOMEM;
- memcpy(kern_flow_attr, &cmd.flow_attr, sizeof(*kern_flow_attr));
- err = ib_copy_from_udata(kern_flow_attr + 1, ucore,
+ *kern_flow_attr = cmd.flow_attr;
+ err = ib_copy_from_udata(&kern_flow_attr->flow_specs, ucore,
cmd.flow_attr.size);
if (err)
goto err_free_attr;
@@ -3589,21 +3589,22 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
flow_attr->flags = kern_flow_attr->flags;
flow_attr->size = sizeof(*flow_attr);
- kern_spec = kern_flow_attr + 1;
+ kern_spec = kern_flow_attr->flow_specs;
ib_spec = flow_attr + 1;
for (i = 0; i < flow_attr->num_of_specs &&
- cmd.flow_attr.size > offsetof(struct ib_uverbs_flow_spec, reserved) &&
- cmd.flow_attr.size >=
- ((struct ib_uverbs_flow_spec *)kern_spec)->size; i++) {
- err = kern_spec_to_ib_spec(file->ucontext, kern_spec, ib_spec,
- uflow_res);
+ cmd.flow_attr.size > sizeof(*kern_spec) &&
+ cmd.flow_attr.size >= kern_spec->size;
+ i++) {
+ err = kern_spec_to_ib_spec(
+ file->ucontext, (struct ib_uverbs_flow_spec *)kern_spec,
+ ib_spec, uflow_res);
if (err)
goto err_free;
flow_attr->size +=
((union ib_flow_spec *) ib_spec)->size;
- cmd.flow_attr.size -= ((struct ib_uverbs_flow_spec *)kern_spec)->size;
- kern_spec += ((struct ib_uverbs_flow_spec *) kern_spec)->size;
+ cmd.flow_attr.size -= kern_spec->size;
+ kern_spec = ((void *)kern_spec) + kern_spec->size;
ib_spec += ((union ib_flow_spec *) ib_spec)->size;
}
if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) {
--
2.14.4
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox