* [PATCH 1/2] r8169: Align ASPM/CLKREQ setting function with vendor driver
From: Kai-Heng Feng @ 2018-09-12 6:58 UTC (permalink / raw)
To: nic_swsd; +Cc: davem, hkallweit1, netdev, linux-kernel, Kai-Heng Feng
There's a small delay after setting ASPM in vendor drivers, r8101 and
r8168.
In addition, those drivers enable ASPM before ClkReq, also change that
to align with vendor driver.
I haven't seen anything bad becasue of this, but I think it's better to
keep in sync with vendor driver.
Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
---
drivers/net/ethernet/realtek/r8169.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index b08d51bf7a20..5091e9fd601f 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -4775,12 +4775,14 @@ static void rtl_pcie_state_l2l3_enable(struct rtl8169_private *tp, bool enable)
static void rtl_hw_aspm_clkreq_enable(struct rtl8169_private *tp, bool enable)
{
if (enable) {
- RTL_W8(tp, Config2, RTL_R8(tp, Config2) | ClkReqEn);
RTL_W8(tp, Config5, RTL_R8(tp, Config5) | ASPM_en);
+ RTL_W8(tp, Config2, RTL_R8(tp, Config2) | ClkReqEn);
} else {
RTL_W8(tp, Config2, RTL_R8(tp, Config2) & ~ClkReqEn);
RTL_W8(tp, Config5, RTL_R8(tp, Config5) & ~ASPM_en);
}
+
+ udelay(10);
}
static void rtl_hw_start_8168bb(struct rtl8169_private *tp)
--
2.17.1
^ permalink raw reply related
* [PATCH 2/2] r8169: enable ASPM on RTL8106E
From: Kai-Heng Feng @ 2018-09-12 6:58 UTC (permalink / raw)
To: nic_swsd; +Cc: davem, hkallweit1, netdev, linux-kernel, Kai-Heng Feng
In-Reply-To: <20180912065821.7198-1-kai.heng.feng@canonical.com>
The Intel SoC was prevented from entering lower idle state because
of RTL8106E's ASPM was not enabled.
So enable ASPM on RTL8106E (chip version 39).
Now the Intel SoC can enter lower idle state, power consumption and
temperature are much lower.
Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
---
drivers/net/ethernet/realtek/r8169.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 5091e9fd601f..3752b1bbe6db 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -5627,6 +5627,8 @@ static void rtl_hw_start_8402(struct rtl8169_private *tp)
static void rtl_hw_start_8106(struct rtl8169_private *tp)
{
+ rtl_hw_aspm_clkreq_enable(tp, false);
+
/* Force LAN exit from ASPM if Rx/Tx are not idle */
RTL_W32(tp, FuncEvent, RTL_R32(tp, FuncEvent) | 0x002800);
@@ -5635,6 +5637,7 @@ static void rtl_hw_start_8106(struct rtl8169_private *tp)
RTL_W8(tp, DLLPR, RTL_R8(tp, DLLPR) & ~PFM_EN);
rtl_pcie_state_l2l3_enable(tp, false);
+ rtl_hw_aspm_clkreq_enable(tp, true);
}
static void rtl_hw_start_8101(struct rtl8169_private *tp)
--
2.17.1
^ permalink raw reply related
* Re: [PATCH v2] r8169: Clear RTL_FLAG_TASK_*_PENDING when clearing RTL_FLAG_TASK_ENABLED
From: David Miller @ 2018-09-12 7:06 UTC (permalink / raw)
To: kai.heng.feng; +Cc: nic_swsd, netdev, linux-kernel, hkallweit1
In-Reply-To: <20180910175143.23878-1-kai.heng.feng@canonical.com>
From: Kai-Heng Feng <kai.heng.feng@canonical.com>
Date: Tue, 11 Sep 2018 01:51:43 +0800
> After system suspend, sometimes the r8169 doesn't work when ethernet
> cable gets pluggued.
>
> This issue happens because rtl_reset_work() doesn't get called from
> rtl8169_runtime_resume(), after system suspend.
>
> In rtl_task(), RTL_FLAG_TASK_* only gets cleared if this condition is
> met:
> if (!netif_running(dev) ||
> !test_bit(RTL_FLAG_TASK_ENABLED, tp->wk.flags))
> ...
>
> If RTL_FLAG_TASK_ENABLED was cleared during system suspend while
> RTL_FLAG_TASK_RESET_PENDING was set, the next rtl_schedule_task() won't
> schedule task as the flag is still there.
>
> So in addition to clearing RTL_FLAG_TASK_ENABLED, also clears other
> flags.
>
> Cc: Heiner Kallweit <hkallweit1@gmail.com>
> Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
> ---
> v2:
> - Use bitmap_zero(), suggested by Florian Fainelli.
> - Assign zero to first enum member in rtl_flag, just in case.
Applied, thank you.
^ permalink raw reply
* [PATCH net] geneve: add ttl inherit support
From: Hangbin Liu @ 2018-09-12 2:04 UTC (permalink / raw)
To: netdev
Cc: Pravin Shelar, David S. Miller, Jiri Benc, Stefano Brivio,
Hangbin Liu
Similar with commit 72f6d71e491e6 ("vxlan: add ttl inherit support"),
currently ttl == 0 means "use whatever default value" on geneve instead
of inherit inner ttl. To respect compatibility with old behavior, let's
add a new IFLA_GENEVE_TTL_INHERIT for geneve ttl inherit support.
Reported-by: Jianlin Shi <jishi@redhat.com>
Suggested-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
---
drivers/net/geneve.c | 41 ++++++++++++++++++++++++++++++--------
include/uapi/linux/if_link.h | 1 +
tools/include/uapi/linux/if_link.h | 1 +
3 files changed, 35 insertions(+), 8 deletions(-)
diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 6acb6b5..6625fab 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -69,6 +69,7 @@ struct geneve_dev {
struct gro_cells gro_cells;
bool collect_md;
bool use_udp6_rx_checksums;
+ bool ttl_inherit;
};
struct geneve_sock {
@@ -843,7 +844,11 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
ttl = key->ttl;
} else {
tos = ip_tunnel_ecn_encap(fl4.flowi4_tos, ip_hdr(skb), skb);
- ttl = key->ttl ? : ip4_dst_hoplimit(&rt->dst);
+ if (geneve->ttl_inherit)
+ ttl = ip_tunnel_get_ttl(ip_hdr(skb), skb);
+ else
+ ttl = key->ttl;
+ ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
}
df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
@@ -889,7 +894,11 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
} else {
prio = ip_tunnel_ecn_encap(ip6_tclass(fl6.flowlabel),
ip_hdr(skb), skb);
- ttl = key->ttl ? : ip6_dst_hoplimit(dst);
+ if (geneve->ttl_inherit)
+ ttl = ip_tunnel_get_ttl(ip_hdr(skb), skb);
+ else
+ ttl = key->ttl;
+ ttl = ttl ? : ip6_dst_hoplimit(dst);
}
err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr));
if (unlikely(err))
@@ -1091,6 +1100,7 @@ static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = {
[IFLA_GENEVE_UDP_CSUM] = { .type = NLA_U8 },
[IFLA_GENEVE_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 },
[IFLA_GENEVE_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 },
+ [IFLA_GENEVE_TTL_INHERIT] = { .type = NLA_U8 },
};
static int geneve_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -1170,7 +1180,8 @@ static bool geneve_dst_addr_equal(struct ip_tunnel_info *a,
static int geneve_configure(struct net *net, struct net_device *dev,
struct netlink_ext_ack *extack,
const struct ip_tunnel_info *info,
- bool metadata, bool ipv6_rx_csum)
+ bool metadata, bool ipv6_rx_csum,
+ bool ttl_inherit)
{
struct geneve_net *gn = net_generic(net, geneve_net_id);
struct geneve_dev *t, *geneve = netdev_priv(dev);
@@ -1219,6 +1230,7 @@ static int geneve_configure(struct net *net, struct net_device *dev,
geneve->info = *info;
geneve->collect_md = metadata;
geneve->use_udp6_rx_checksums = ipv6_rx_csum;
+ geneve->ttl_inherit = ttl_inherit;
err = register_netdevice(dev);
if (err)
@@ -1237,7 +1249,8 @@ static void init_tnl_info(struct ip_tunnel_info *info, __u16 dst_port)
static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
struct netlink_ext_ack *extack,
struct ip_tunnel_info *info, bool *metadata,
- bool *use_udp6_rx_checksums, bool changelink)
+ bool *use_udp6_rx_checksums, bool *ttl_inherit,
+ bool changelink)
{
int attrtype;
@@ -1315,6 +1328,9 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
if (data[IFLA_GENEVE_TTL])
info->key.ttl = nla_get_u8(data[IFLA_GENEVE_TTL]);
+ if (data[IFLA_GENEVE_TTL_INHERIT])
+ *ttl_inherit = true;
+
if (data[IFLA_GENEVE_TOS])
info->key.tos = nla_get_u8(data[IFLA_GENEVE_TOS]);
@@ -1438,17 +1454,18 @@ static int geneve_newlink(struct net *net, struct net_device *dev,
{
bool use_udp6_rx_checksums = false;
struct ip_tunnel_info info;
+ bool ttl_inherit = false;
bool metadata = false;
int err;
init_tnl_info(&info, GENEVE_UDP_PORT);
err = geneve_nl2info(tb, data, extack, &info, &metadata,
- &use_udp6_rx_checksums, false);
+ &use_udp6_rx_checksums, &ttl_inherit, false);
if (err)
return err;
err = geneve_configure(net, dev, extack, &info, metadata,
- use_udp6_rx_checksums);
+ use_udp6_rx_checksums, ttl_inherit);
if (err)
return err;
@@ -1511,6 +1528,7 @@ static int geneve_changelink(struct net_device *dev, struct nlattr *tb[],
struct ip_tunnel_info info;
bool metadata;
bool use_udp6_rx_checksums;
+ bool ttl_inherit;
int err;
/* If the geneve device is configured for metadata (or externally
@@ -1523,8 +1541,9 @@ static int geneve_changelink(struct net_device *dev, struct nlattr *tb[],
memcpy(&info, &geneve->info, sizeof(info));
metadata = geneve->collect_md;
use_udp6_rx_checksums = geneve->use_udp6_rx_checksums;
+ ttl_inherit = geneve->ttl_inherit;
err = geneve_nl2info(tb, data, extack, &info, &metadata,
- &use_udp6_rx_checksums, true);
+ &use_udp6_rx_checksums, &ttl_inherit, true);
if (err)
return err;
@@ -1537,6 +1556,7 @@ static int geneve_changelink(struct net_device *dev, struct nlattr *tb[],
geneve->info = info;
geneve->collect_md = metadata;
geneve->use_udp6_rx_checksums = use_udp6_rx_checksums;
+ geneve->ttl_inherit = ttl_inherit;
geneve_unquiesce(geneve, gs4, gs6);
return 0;
@@ -1562,6 +1582,7 @@ static size_t geneve_get_size(const struct net_device *dev)
nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_CSUM */
nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_TX */
nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_RX */
+ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL_INHERIT */
0;
}
@@ -1569,6 +1590,7 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
struct geneve_dev *geneve = netdev_priv(dev);
struct ip_tunnel_info *info = &geneve->info;
+ bool ttl_inherit = geneve->ttl_inherit;
bool metadata = geneve->collect_md;
__u8 tmp_vni[3];
__u32 vni;
@@ -1614,6 +1636,9 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
goto nla_put_failure;
#endif
+ if (nla_put_u8(skb, IFLA_GENEVE_TTL_INHERIT, ttl_inherit))
+ goto nla_put_failure;
+
return 0;
nla_put_failure:
@@ -1650,7 +1675,7 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
return dev;
init_tnl_info(&info, dst_port);
- err = geneve_configure(net, dev, NULL, &info, true, true);
+ err = geneve_configure(net, dev, NULL, &info, true, true, false);
if (err) {
free_netdev(dev);
return ERR_PTR(err);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 43391e2..5118af5 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -554,6 +554,7 @@ enum {
IFLA_GENEVE_UDP_ZERO_CSUM6_TX,
IFLA_GENEVE_UDP_ZERO_CSUM6_RX,
IFLA_GENEVE_LABEL,
+ IFLA_GENEVE_TTL_INHERIT,
__IFLA_GENEVE_MAX
};
#define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1)
diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
index cf01b68..1678d08 100644
--- a/tools/include/uapi/linux/if_link.h
+++ b/tools/include/uapi/linux/if_link.h
@@ -541,6 +541,7 @@ enum {
IFLA_GENEVE_UDP_ZERO_CSUM6_TX,
IFLA_GENEVE_UDP_ZERO_CSUM6_RX,
IFLA_GENEVE_LABEL,
+ IFLA_GENEVE_TTL_INHERIT,
__IFLA_GENEVE_MAX
};
#define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1)
--
2.5.5
^ permalink raw reply related
* Re: [PATCH net-next] net: dsa: b53: Only call b53_port_event() for SGMII ports
From: David Miller @ 2018-09-12 7:07 UTC (permalink / raw)
To: f.fainelli; +Cc: netdev, andrew, vivien.didelot, linux-kernel
In-Reply-To: <20180910234707.566-1-f.fainelli@gmail.com>
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Mon, 10 Sep 2018 16:47:07 -0700
> Built-in PHY ports are still being polled, avoid generating spurious
> and duplicate events which the PHY library resolves through polling
> anyways.
>
> Fixes: 0e01491de646 ("net: dsa: b53: Add SerDes support")
> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Applied, thanks Florian.
^ permalink raw reply
* [RFC] netlink: add NLA_REJECT policy type
From: Johannes Berg @ 2018-09-12 7:32 UTC (permalink / raw)
To: linux-wireless-u79uwXL29TY76Z2rM5mHXA,
netdev-u79uwXL29TY76Z2rM5mHXA
Cc: Johannes Berg
From: Johannes Berg <johannes.berg-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
In some situations some netlink attributes may be used for output
only (kernel->userspace) or may be reserved for future use. It's
then helpful to be able to prevent userspace from using them in
messages sent to the kernel, since they'd otherwise be ignored and
any future will become impossible if this happens.
Add NLA_REJECT to the policy which does nothing but reject (with
EINVAL) validation of any messages containing this attribute.
The specific case I have in mind now is a shared nested attribute
containing request/response data, and it would be pointless and
potentially confusing to have userspace include response data in
the messages that actually contain a request.
Signed-off-by: Johannes Berg <johannes.berg-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
---
include/net/netlink.h | 1 +
lib/nlattr.c | 3 +++
2 files changed, 4 insertions(+)
diff --git a/include/net/netlink.h b/include/net/netlink.h
index 0c154f98e987..ea7e87564f59 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -180,6 +180,7 @@ enum {
NLA_S32,
NLA_S64,
NLA_BITFIELD32,
+ NLA_REJECT,
__NLA_TYPE_MAX,
};
diff --git a/lib/nlattr.c b/lib/nlattr.c
index e335bcafa9e4..4644ae74bf63 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -87,6 +87,9 @@ static int validate_nla(const struct nlattr *nla, int maxtype,
}
switch (pt->type) {
+ case NLA_REJECT:
+ return -EINVAL;
+
case NLA_FLAG:
if (attrlen > 0)
return -ERANGE;
--
2.14.4
^ permalink raw reply related
* Re: [PATCH net-next v2 2/5] virtio_ring: support creating packed ring
From: Tiwei Bie @ 2018-09-12 7:51 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: jasowang, virtualization, linux-kernel, netdev, virtio-dev, wexu,
jfreimann
In-Reply-To: <20180910022837.GA11822@debian>
On Mon, Sep 10, 2018 at 10:28:37AM +0800, Tiwei Bie wrote:
> On Fri, Sep 07, 2018 at 10:03:24AM -0400, Michael S. Tsirkin wrote:
> > On Wed, Jul 11, 2018 at 10:27:08AM +0800, Tiwei Bie wrote:
> > > This commit introduces the support for creating packed ring.
> > > All split ring specific functions are added _split suffix.
> > > Some necessary stubs for packed ring are also added.
> > >
> > > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> >
[...]
> > > +
> > > +/*
> > > + * The layout for the packed ring is a continuous chunk of memory
> > > + * which looks like this.
> > > + *
> > > + * struct vring_packed {
> > > + * // The actual descriptors (16 bytes each)
> > > + * struct vring_packed_desc desc[num];
> > > + *
> > > + * // Padding to the next align boundary.
> > > + * char pad[];
> > > + *
> > > + * // Driver Event Suppression
> > > + * struct vring_packed_desc_event driver;
> > > + *
> > > + * // Device Event Suppression
> > > + * struct vring_packed_desc_event device;
> > > + * };
> > > + */
> >
> > Why not just allocate event structures separately?
> > Is it a win to have them share a cache line for some reason?
Users may call vring_new_virtqueue() with preallocated
memory to setup the ring, e.g.:
https://github.com/torvalds/linux/blob/11da3a7f84f1/drivers/s390/virtio/virtio_ccw.c#L513-L522
https://github.com/torvalds/linux/blob/11da3a7f84f1/drivers/misc/mic/vop/vop_main.c#L306-L307
Below is the corresponding definition in split ring:
https://github.com/oasis-tcs/virtio-spec/blob/89dd55f5e606/split-ring.tex#L64-L78
If my understanding is correct, this is just for legacy
interfaces, and we won't define layout in packed ring
and don't need to support vring_new_virtqueue() in packed
ring. Is it correct? Thanks!
>
> Will do that.
>
> >
> > > +static inline void vring_init_packed(struct vring_packed *vr, unsigned int num,
> > > + void *p, unsigned long align)
> > > +{
> > > + vr->num = num;
> > > + vr->desc = p;
> > > + vr->driver = (void *)ALIGN(((uintptr_t)p +
> > > + sizeof(struct vring_packed_desc) * num), align);
> > > + vr->device = vr->driver + 1;
> > > +}
> >
> > What's all this about alignment? Where does it come from?
>
> It comes from the `vring_align` parameter of vring_create_virtqueue()
> and vring_new_virtqueue():
>
> https://github.com/torvalds/linux/blob/a49a9dcce802/drivers/virtio/virtio_ring.c#L1061
> https://github.com/torvalds/linux/blob/a49a9dcce802/drivers/virtio/virtio_ring.c#L1123
>
> Should I just ignore it in packed ring?
>
> CCW defined this:
>
> #define KVM_VIRTIO_CCW_RING_ALIGN 4096
>
> I'm not familiar with CCW. Currently, in this patch set, packed ring
> isn't enabled on CCW dues to some legacy accessors are not implemented
> in packed ring yet.
>
> >
> > > +
> > > +static inline unsigned vring_size_packed(unsigned int num, unsigned long align)
> > > +{
> > > + return ((sizeof(struct vring_packed_desc) * num + align - 1)
> > > + & ~(align - 1)) + sizeof(struct vring_packed_desc_event) * 2;
> > > +}
> [...]
> > > @@ -1129,10 +1388,17 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
> > > void (*callback)(struct virtqueue *vq),
> > > const char *name)
> > > {
> > > - struct vring vring;
> > > - vring_init(&vring, num, pages, vring_align);
> > > - return __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
> > > - notify, callback, name);
> > > + union vring_union vring;
> > > + bool packed;
> > > +
> > > + packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
> > > + if (packed)
> > > + vring_init_packed(&vring.vring_packed, num, pages, vring_align);
> > > + else
> > > + vring_init(&vring.vring_split, num, pages, vring_align);
> >
> >
> > vring_init in the UAPI header is more or less a bug.
> > I'd just stop using it, keep it around for legacy userspace.
>
> Got it. I'd like to do that. Thanks.
>
> >
> > > +
> > > + return __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
> > > + context, notify, callback, name);
> > > }
> > > EXPORT_SYMBOL_GPL(vring_new_virtqueue);
> > >
> > > @@ -1142,7 +1408,9 @@ void vring_del_virtqueue(struct virtqueue *_vq)
> > >
> > > if (vq->we_own_ring) {
> > > vring_free_queue(vq->vq.vdev, vq->queue_size_in_bytes,
> > > - vq->vring.desc, vq->queue_dma_addr);
> > > + vq->packed ? (void *)vq->vring_packed.desc :
> > > + (void *)vq->vring.desc,
> > > + vq->queue_dma_addr);
> > > }
> > > list_del(&_vq->list);
> > > kfree(vq);
> > > @@ -1184,7 +1452,7 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
> > >
> > > struct vring_virtqueue *vq = to_vvq(_vq);
> > >
> > > - return vq->vring.num;
> > > + return vq->packed ? vq->vring_packed.num : vq->vring.num;
> > > }
> > > EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);
> > >
> > > @@ -1227,6 +1495,10 @@ dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
> > >
> > > BUG_ON(!vq->we_own_ring);
> > >
> > > + if (vq->packed)
> > > + return vq->queue_dma_addr + ((char *)vq->vring_packed.driver -
> > > + (char *)vq->vring_packed.desc);
> > > +
> > > return vq->queue_dma_addr +
> > > ((char *)vq->vring.avail - (char *)vq->vring.desc);
> > > }
> > > @@ -1238,11 +1510,16 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
> > >
> > > BUG_ON(!vq->we_own_ring);
> > >
> > > + if (vq->packed)
> > > + return vq->queue_dma_addr + ((char *)vq->vring_packed.device -
> > > + (char *)vq->vring_packed.desc);
> > > +
> > > return vq->queue_dma_addr +
> > > ((char *)vq->vring.used - (char *)vq->vring.desc);
> > > }
> > > EXPORT_SYMBOL_GPL(virtqueue_get_used_addr);
> > >
> > > +/* Only available for split ring */
> > > const struct vring *virtqueue_get_vring(struct virtqueue *vq)
> > > {
> > > return &to_vvq(vq)->vring;
> > > diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
> > > index fab02133a919..992142b35f55 100644
> > > --- a/include/linux/virtio_ring.h
> > > +++ b/include/linux/virtio_ring.h
> > > @@ -60,6 +60,11 @@ static inline void virtio_store_mb(bool weak_barriers,
> > > struct virtio_device;
> > > struct virtqueue;
> > >
> > > +union vring_union {
> > > + struct vring vring_split;
> > > + struct vring_packed vring_packed;
> > > +};
> > > +
> > > /*
> > > * Creates a virtqueue and allocates the descriptor ring. If
> > > * may_reduce_num is set, then this may allocate a smaller ring than
> > > @@ -79,7 +84,8 @@ struct virtqueue *vring_create_virtqueue(unsigned int index,
> > >
> > > /* Creates a virtqueue with a custom layout. */
> > > struct virtqueue *__vring_new_virtqueue(unsigned int index,
> > > - struct vring vring,
> > > + union vring_union vring,
> > > + bool packed,
> > > struct virtio_device *vdev,
> > > bool weak_barriers,
> > > bool ctx,
> > > --
> > > 2.18.0
^ permalink raw reply
* [PATCH] docs: net: Remove TCP congestion document
From: Tobin C. Harding @ 2018-09-12 8:14 UTC (permalink / raw)
To: David S. Miller
Cc: Tobin C. Harding, Eric Dumazet, netdev, linux-doc, linux-kernel
Document is stale, let's remove it.
Remove TCP congestion document.
Signed-off-by: Tobin C. Harding <me@tobin.cc>
---
Previous patch converted this file to rst.
[PATCH RESEND net-next] docs: net: Convert tcp.txt to RST format
Response to that patch on LKML by Eric:
I dunno, this 'doc' is probably useless and should be deleted.
Removing as suggested.
thanks,
Tobin.
Documentation/networking/00-INDEX | 2 -
Documentation/networking/tcp.txt | 101 ------------------------------
2 files changed, 103 deletions(-)
delete mode 100644 Documentation/networking/tcp.txt
diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX
index 02a323c43261..dcbccae4043e 100644
--- a/Documentation/networking/00-INDEX
+++ b/Documentation/networking/00-INDEX
@@ -198,8 +198,6 @@ tc-actions-env-rules.txt
- rules for traffic control (tc) actions.
timestamping.txt
- overview of network packet timestamping variants.
-tcp.txt
- - short blurb on how TCP output takes place.
tcp-thin.txt
- kernel tuning options for low rate 'thin' TCP streams.
team.txt
diff --git a/Documentation/networking/tcp.txt b/Documentation/networking/tcp.txt
deleted file mode 100644
index 9c7139d57e57..000000000000
--- a/Documentation/networking/tcp.txt
+++ /dev/null
@@ -1,101 +0,0 @@
-TCP protocol
-============
-
-Last updated: 3 June 2017
-
-Contents
-========
-
-- Congestion control
-- How the new TCP output machine [nyi] works
-
-Congestion control
-==================
-
-The following variables are used in the tcp_sock for congestion control:
-snd_cwnd The size of the congestion window
-snd_ssthresh Slow start threshold. We are in slow start if
- snd_cwnd is less than this.
-snd_cwnd_cnt A counter used to slow down the rate of increase
- once we exceed slow start threshold.
-snd_cwnd_clamp This is the maximum size that snd_cwnd can grow to.
-snd_cwnd_stamp Timestamp for when congestion window last validated.
-snd_cwnd_used Used as a highwater mark for how much of the
- congestion window is in use. It is used to adjust
- snd_cwnd down when the link is limited by the
- application rather than the network.
-
-As of 2.6.13, Linux supports pluggable congestion control algorithms.
-A congestion control mechanism can be registered through functions in
-tcp_cong.c. The functions used by the congestion control mechanism are
-registered via passing a tcp_congestion_ops struct to
-tcp_register_congestion_control. As a minimum, the congestion control
-mechanism must provide a valid name and must implement either ssthresh,
-cong_avoid and undo_cwnd hooks or the "omnipotent" cong_control hook.
-
-Private data for a congestion control mechanism is stored in tp->ca_priv.
-tcp_ca(tp) returns a pointer to this space. This is preallocated space - it
-is important to check the size of your private data will fit this space, or
-alternatively, space could be allocated elsewhere and a pointer to it could
-be stored here.
-
-There are three kinds of congestion control algorithms currently: The
-simplest ones are derived from TCP reno (highspeed, scalable) and just
-provide an alternative congestion window calculation. More complex
-ones like BIC try to look at other events to provide better
-heuristics. There are also round trip time based algorithms like
-Vegas and Westwood+.
-
-Good TCP congestion control is a complex problem because the algorithm
-needs to maintain fairness and performance. Please review current
-research and RFC's before developing new modules.
-
-The default congestion control mechanism is chosen based on the
-DEFAULT_TCP_CONG Kconfig parameter. If you really want a particular default
-value then you can set it using sysctl net.ipv4.tcp_congestion_control. The
-module will be autoloaded if needed and you will get the expected protocol. If
-you ask for an unknown congestion method, then the sysctl attempt will fail.
-
-If you remove a TCP congestion control module, then you will get the next
-available one. Since reno cannot be built as a module, and cannot be
-removed, it will always be available.
-
-How the new TCP output machine [nyi] works.
-===========================================
-
-Data is kept on a single queue. The skb->users flag tells us if the frame is
-one that has been queued already. To add a frame we throw it on the end. Ack
-walks down the list from the start.
-
-We keep a set of control flags
-
-
- sk->tcp_pend_event
-
- TCP_PEND_ACK Ack needed
- TCP_ACK_NOW Needed now
- TCP_WINDOW Window update check
- TCP_WINZERO Zero probing
-
-
- sk->transmit_queue The transmission frame begin
- sk->transmit_new First new frame pointer
- sk->transmit_end Where to add frames
-
- sk->tcp_last_tx_ack Last ack seen
- sk->tcp_dup_ack Dup ack count for fast retransmit
-
-
-Frames are queued for output by tcp_write. We do our best to send the frames
-off immediately if possible, but otherwise queue and compute the body
-checksum in the copy.
-
-When a write is done we try to clear any pending events and piggy back them.
-If the window is full we queue full sized frames. On the first timeout in
-zero window we split this.
-
-On a timer we walk the retransmit list to send any retransmits, update the
-backoff timers etc. A change of route table stamp causes a change of header
-and recompute. We add any new tcp level headers and refinish the checksum
-before sending.
-
--
2.17.1
^ permalink raw reply related
* Re: Regression caused by commit 7bb05b85bc2d ("r8169: don't use MSI-X on RTL8106e")
From: Kai-Heng Feng @ 2018-09-12 8:19 UTC (permalink / raw)
To: Thomas Gleixner; +Cc: jian-hong, Linux Netdev List, Linux Kernel Mailing List
In-Reply-To: <alpine.DEB.2.21.1809120826310.1427@nanos.tec.linutronix.de>
at 14:32, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Wed, 12 Sep 2018, Kai-Heng Feng wrote:
>
>> There's a Dell machine with RTL8106e stops to work after S3 since the
>> commit introduced. So I am wondering if it's possible to revert the
>> commit and use DMI/subsystem id based quirk table?
>
> Probably.
Hopefully Jian-Hong can cook up a quirk table for the issue.
>
>> It's because of commit bc976233a872 ("genirq/msi, x86/vector: Prevent
>> reservation mode for non maskable MSI") cleared the reservation mode,
>> and I
>> can see this after S3:
>>
>> [ 94.872838] do_IRQ: 3.33 No irq handler for vector
>
> It's not because of that commit, really. There is a interrupt sent after
> resume to the wrong vector for whatever reason. The MSI vector cannot be
> masked it seems in the device, but the driver should quiescen the device to
> a point where it does not send interrupts.
Understood.
>
>> If the device uses MSI-X instead of MSI, the issue doesn't happen
>> because of
>> reservation mode.
>
> Reservation mode has absolutely nothing to do with that. What prevents the
> issue is the fact that MSI-X can be masked by the IRQ core.
So in this case I think keep the device using MSI-X is a better route, it's
MSI-X capable anyway.
>
>> Is it something should be handled by x86 BIOS? Because I don't see this
>> issue
>> when I use Suspend-to-Idle, which doesn't use BIOS to do suspend.
>
> Suspend to idle works completely different and I don't see the BIOS at
> fault here. it's more an issue of MSI not being maskable on that device,
> which can't be fixed in BIOS or it's some half quiescened state which is
> used when suspending and that's a pure driver issue.
Understood.
Thanks for all the info!
Kai-Heng
>
> Thanks,
>
> tglx
^ permalink raw reply
* [PATCH net-next V2 04/11] tuntap: simplify error handling in tun_build_skb()
From: Jason Wang @ 2018-09-12 3:17 UTC (permalink / raw)
To: netdev, linux-kernel; +Cc: kvm, virtualization, mst, jasowang
In-Reply-To: <20180912031709.14112-1-jasowang@redhat.com>
There's no need to duplicate page get logic in each action. So this
patch tries to get page and calculate the offset before processing XDP
actions (except for XDP_DROP), and undo them when meet errors (we
don't care the performance on errors). This will be used for factoring
out XDP logic.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/tun.c | 10 +++-------
1 file changed, 3 insertions(+), 7 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 372caf7d67d9..257cf7342d54 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1701,17 +1701,13 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
xdp_do_flush_map();
if (err)
goto err_redirect;
- rcu_read_unlock();
- local_bh_enable();
- return NULL;
+ goto out;
case XDP_TX:
get_page(alloc_frag->page);
alloc_frag->offset += buflen;
if (tun_xdp_tx(tun->dev, &xdp) < 0)
goto err_redirect;
- rcu_read_unlock();
- local_bh_enable();
- return NULL;
+ goto out;
case XDP_PASS:
delta = orig_data - xdp.data;
len = xdp.data_end - xdp.data;
@@ -1742,7 +1738,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
err_redirect:
put_page(alloc_frag->page);
-err_xdp:
+out:
rcu_read_unlock();
local_bh_enable();
this_cpu_inc(tun->pcpu_stats->rx_dropped);
--
2.17.1
^ permalink raw reply related
* [RFC v2 1/2] netlink: add NLA_REJECT policy type
From: Johannes Berg @ 2018-09-12 8:36 UTC (permalink / raw)
To: linux-wireless-u79uwXL29TY76Z2rM5mHXA,
netdev-u79uwXL29TY76Z2rM5mHXA
Cc: Michal Kubecek, Johannes Berg
From: Johannes Berg <johannes.berg-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
In some situations some netlink attributes may be used for output
only (kernel->userspace) or may be reserved for future use. It's
then helpful to be able to prevent userspace from using them in
messages sent to the kernel, since they'd otherwise be ignored and
any future will become impossible if this happens.
Add NLA_REJECT to the policy which does nothing but reject (with
EINVAL) validation of any messages containing this attribute.
Allow for returning a specific extended ACK error message in the
validation_data pointer.
While at it fix the indentation of NLA_BITFIELD32 and describe the
validation_data pointer for it.
The specific case I have in mind now is a shared nested attribute
containing request/response data, and it would be pointless and
potentially confusing to have userspace include response data in
the messages that actually contain a request.
Signed-off-by: Johannes Berg <johannes.berg-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
---
include/net/netlink.h | 6 +++++-
lib/nlattr.c | 12 +++++++++---
2 files changed, 14 insertions(+), 4 deletions(-)
diff --git a/include/net/netlink.h b/include/net/netlink.h
index 0c154f98e987..04e40fcc70d6 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -180,6 +180,7 @@ enum {
NLA_S32,
NLA_S64,
NLA_BITFIELD32,
+ NLA_REJECT,
__NLA_TYPE_MAX,
};
@@ -208,7 +209,10 @@ enum {
* NLA_MSECS Leaving the length field zero will verify the
* given type fits, using it verifies minimum length
* just like "All other"
- * NLA_BITFIELD32 A 32-bit bitmap/bitselector attribute
+ * NLA_BITFIELD32 A 32-bit bitmap/bitselector attribute, validation
+ * data must point to a u32 value of valid flags
+ * NLA_REJECT Reject this attribute, validation data may point
+ * to a string to report as the error in extended ACK.
* All other Minimum length of attribute payload
*
* Example:
diff --git a/lib/nlattr.c b/lib/nlattr.c
index e335bcafa9e4..9ec0cc151148 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -69,7 +69,8 @@ static int validate_nla_bitfield32(const struct nlattr *nla,
}
static int validate_nla(const struct nlattr *nla, int maxtype,
- const struct nla_policy *policy)
+ const struct nla_policy *policy,
+ struct netlink_ext_ack *extack)
{
const struct nla_policy *pt;
int minlen = 0, attrlen = nla_len(nla), type = nla_type(nla);
@@ -87,6 +88,11 @@ static int validate_nla(const struct nlattr *nla, int maxtype,
}
switch (pt->type) {
+ case NLA_REJECT:
+ if (pt->validation_data && extack)
+ extack->_msg = pt->validation_data;
+ return -EINVAL;
+
case NLA_FLAG:
if (attrlen > 0)
return -ERANGE;
@@ -180,7 +186,7 @@ int nla_validate(const struct nlattr *head, int len, int maxtype,
int rem;
nla_for_each_attr(nla, head, len, rem) {
- int err = validate_nla(nla, maxtype, policy);
+ int err = validate_nla(nla, maxtype, policy, extack);
if (err < 0) {
if (extack)
@@ -251,7 +257,7 @@ int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head,
if (type > 0 && type <= maxtype) {
if (policy) {
- err = validate_nla(nla, maxtype, policy);
+ err = validate_nla(nla, maxtype, policy, extack);
if (err < 0) {
NL_SET_ERR_MSG_ATTR(extack, nla,
"Attribute failed policy validation");
--
2.14.4
^ permalink raw reply related
* [RFC v2 2/2] netlink: add ethernet address policy types
From: Johannes Berg @ 2018-09-12 8:36 UTC (permalink / raw)
To: linux-wireless-u79uwXL29TY76Z2rM5mHXA,
netdev-u79uwXL29TY76Z2rM5mHXA
Cc: Michal Kubecek, Johannes Berg
In-Reply-To: <20180912083610.20857-1-johannes-cdvu00un1VgdHxzADdlk8Q@public.gmane.org>
From: Johannes Berg <johannes.berg-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Commonly, ethernet addresses are just using a policy of
{ .len = ETH_ALEN }
which leaves userspace free to send more data than it should,
which may hide bugs.
Introduce NLA_ETH_ADDR which checks for exact size, and rejects
the attribute if the length isn't ETH_ALEN.
Also add NLA_ETH_ADDR_COMPAT which can be used in place of the
policy above, but will, in addition, warn on an address that's
too long.
Signed-off-by: Johannes Berg <johannes.berg-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
---
include/net/netlink.h | 4 ++++
lib/nlattr.c | 8 ++++++++
2 files changed, 12 insertions(+)
diff --git a/include/net/netlink.h b/include/net/netlink.h
index 04e40fcc70d6..1139163c0db0 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -181,6 +181,8 @@ enum {
NLA_S64,
NLA_BITFIELD32,
NLA_REJECT,
+ NLA_ETH_ADDR,
+ NLA_ETH_ADDR_COMPAT,
__NLA_TYPE_MAX,
};
@@ -213,6 +215,8 @@ enum {
* data must point to a u32 value of valid flags
* NLA_REJECT Reject this attribute, validation data may point
* to a string to report as the error in extended ACK.
+ * NLA_ETH_ADDR Ethernet address, rejected if not exactly 6 octets.
+ * NLA_ETH_ADDR_COMPAT Ethernet address, only warns if not exactly 6 octets.
* All other Minimum length of attribute payload
*
* Example:
diff --git a/lib/nlattr.c b/lib/nlattr.c
index 9ec0cc151148..3165b6d0baaa 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -29,6 +29,8 @@ static const u8 nla_attr_len[NLA_TYPE_MAX+1] = {
[NLA_S16] = sizeof(s16),
[NLA_S32] = sizeof(s32),
[NLA_S64] = sizeof(s64),
+ [NLA_ETH_ADDR] = ETH_ALEN,
+ [NLA_ETH_ADDR_COMPAT] = ETH_ALEN,
};
static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = {
@@ -42,6 +44,7 @@ static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = {
[NLA_S16] = sizeof(s16),
[NLA_S32] = sizeof(s32),
[NLA_S64] = sizeof(s64),
+ [NLA_ETH_ADDR_COMPAT] = ETH_ALEN,
};
static int validate_nla_bitfield32(const struct nlattr *nla,
@@ -93,6 +96,11 @@ static int validate_nla(const struct nlattr *nla, int maxtype,
extack->_msg = pt->validation_data;
return -EINVAL;
+ case NLA_ETH_ADDR:
+ if (attrlen != ETH_ALEN)
+ return -ERANGE;
+ break;
+
case NLA_FLAG:
if (attrlen > 0)
return -ERANGE;
--
2.14.4
^ permalink raw reply related
* Re: [PATCH net-next] virtio_net: ethtool tx napi configuration
From: Jason Wang @ 2018-09-12 3:35 UTC (permalink / raw)
To: Willem de Bruijn
Cc: Network Development, David Miller, caleb.raitto,
Michael S. Tsirkin, Jon Olson (Google Drive), Willem de Bruijn
In-Reply-To: <CAF=yD-L=N8Ak9wxzvzgL5zsRQnngfzxS++o11bauS=6dXHaewQ@mail.gmail.com>
On 2018年09月11日 09:14, Willem de Bruijn wrote:
>>>> I cook a fixup, and it looks works in my setup:
>>>>
>>>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>>>> index b320b6b14749..9181c3f2f832 100644
>>>> --- a/drivers/net/virtio_net.c
>>>> +++ b/drivers/net/virtio_net.c
>>>> @@ -2204,10 +2204,17 @@ static int virtnet_set_coalesce(struct
>>>> net_device *dev,
>>>> return -EINVAL;
>>>>
>>>> if (napi_weight ^ vi->sq[0].napi.weight) {
>>>> - if (dev->flags & IFF_UP)
>>>> - return -EBUSY;
>>>> - for (i = 0; i < vi->max_queue_pairs; i++)
>>>> + for (i = 0; i < vi->max_queue_pairs; i++) {
>>>> + struct netdev_queue *txq =
>>>> + netdev_get_tx_queue(vi->dev, i);
>>>> +
>>>> + virtnet_napi_tx_disable(&vi->sq[i].napi);
>>>> + __netif_tx_lock_bh(txq);
>>>> vi->sq[i].napi.weight = napi_weight;
>>>> + __netif_tx_unlock_bh(txq);
>>>> + virtnet_napi_tx_enable(vi, vi->sq[i].vq,
>>>> + &vi->sq[i].napi);
>>>> + }
>>>> }
>>>>
>>>> return 0;
>>> Thanks! It passes my simple stress test, too. Which consists of two
>>> concurrent loops, one toggling the ethtool option, another running
>>> TCP_RR.
>>>
>>>> The only left case is the speculative tx polling in RX NAPI. I think we
>>>> don't need to care in this case since it was not a must for correctness.
>>> As long as the txq lock is held that will be a noop, anyway. The other
>>> concurrent action is skb_xmit_done. It looks correct to me, but need
>>> to think about it a bit. The tricky transition is coming out of napi without
>>> having >= 2 + MAX_SKB_FRAGS clean descriptors. If the queue is
>>> stopped it may deadlock transmission in no-napi mode.
>> Yes, maybe we can enable tx queue when napi weight is zero in
>> virtnet_poll_tx().
> Yes, that precaution should resolve that edge case.
>
I've done a stress test and it passes. The test contains:
- vm with 2 queues
- a bash script to enable and disable tx napi
- two netperf UDP_STREAM sessions to send small packets
Thanks
^ permalink raw reply
* Regression caused by commit 7bb05b85bc2d ("r8169: don't use MSI-X on RTL8106e")
From: Kai-Heng Feng @ 2018-09-12 3:42 UTC (permalink / raw)
To: jian-hong, Thomas Gleixner; +Cc: Linux Netdev List, Linux Kernel Mailing List
Hi Jian-Hong,
There's a Dell machine with RTL8106e stops to work after S3 since the
commit introduced.
So I am wondering if it's possible to revert the commit and use
DMI/subsystem id based quirk table?
It's because of commit bc976233a872 ("genirq/msi, x86/vector: Prevent
reservation mode for non maskable MSI") cleared the reservation mode, and I
can see this after S3:
[ 94.872838] do_IRQ: 3.33 No irq handler for vector
If the device uses MSI-X instead of MSI, the issue doesn't happen because
of reservation mode.
Hi Thomas,
Is it something should be handled by x86 BIOS? Because I don't see this
issue when I use Suspend-to-Idle, which doesn't use BIOS to do suspend.
Kai-Heng
^ permalink raw reply
* Inquiry
From: Sinara Group @ 2018-09-11 22:20 UTC (permalink / raw)
To: netdev
Hello,
This is Daniel Murray and i am from Sinara Group Co.Ltd Group Co.,LTD in Russia.
We are glad to know about your company from the web and we are interested in your products.
Could you kindly send us your Latest catalog and price list for our trial order.
Best Regards,
Daniel Murray
Purchasing Manager
^ permalink raw reply
* Re: [bpf-next, v2 1/3] flow_dissector: implements flow dissector BPF hook
From: Alexei Starovoitov @ 2018-09-12 3:46 UTC (permalink / raw)
To: Petar Penkov
Cc: netdev, davem, ast, daniel, simon.horman, ecree, songliubraving,
tom, Petar Penkov, Willem de Bruijn
In-Reply-To: <20180908001110.200828-2-peterpenkov96@gmail.com>
On Fri, Sep 07, 2018 at 05:11:08PM -0700, Petar Penkov wrote:
> From: Petar Penkov <ppenkov@google.com>
>
> Adds a hook for programs of type BPF_PROG_TYPE_FLOW_DISSECTOR and
> attach type BPF_FLOW_DISSECTOR that is executed in the flow dissector
> path. The BPF program is per-network namespace.
>
> Signed-off-by: Petar Penkov <ppenkov@google.com>
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> ---
> include/linux/bpf.h | 1 +
> include/linux/bpf_types.h | 1 +
> include/linux/skbuff.h | 7 ++
> include/net/net_namespace.h | 3 +
> include/net/sch_generic.h | 12 ++-
> include/uapi/linux/bpf.h | 25 ++++++
> kernel/bpf/syscall.c | 8 ++
> kernel/bpf/verifier.c | 32 ++++++++
> net/core/filter.c | 67 ++++++++++++++++
> net/core/flow_dissector.c | 136 +++++++++++++++++++++++++++++++++
> tools/bpf/bpftool/prog.c | 1 +
> tools/include/uapi/linux/bpf.h | 25 ++++++
> tools/lib/bpf/libbpf.c | 2 +
please split up update to tools/include/uapi/linux/bpf.h as a separate patch 2.
We often have conflicts in there, so best to have a separate.
Also please split tools/lib and tools/bpf chnages into patch 3.
> 13 files changed, 317 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 523481a3471b..988a00797bcd 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -212,6 +212,7 @@ enum bpf_reg_type {
> PTR_TO_PACKET_META, /* skb->data - meta_len */
> PTR_TO_PACKET, /* reg points to skb->data */
> PTR_TO_PACKET_END, /* skb->data + headlen */
> + PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */
> };
>
> /* The information passed from prog-specific *_is_valid_access
> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> index cd26c090e7c0..22083712dd18 100644
> --- a/include/linux/bpf_types.h
> +++ b/include/linux/bpf_types.h
> @@ -32,6 +32,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
> #ifdef CONFIG_INET
> BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport)
> #endif
> +BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector)
>
> BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
> BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 17a13e4785fc..ce0e863f02a2 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -243,6 +243,8 @@ struct scatterlist;
> struct pipe_inode_info;
> struct iov_iter;
> struct napi_struct;
> +struct bpf_prog;
> +union bpf_attr;
>
> #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
> struct nf_conntrack {
> @@ -1192,6 +1194,11 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
> const struct flow_dissector_key *key,
> unsigned int key_count);
>
> +int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
> + struct bpf_prog *prog);
> +
> +int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr);
> +
> bool __skb_flow_dissect(const struct sk_buff *skb,
> struct flow_dissector *flow_dissector,
> void *target_container,
> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
> index 9b5fdc50519a..99d4148e0f90 100644
> --- a/include/net/net_namespace.h
> +++ b/include/net/net_namespace.h
> @@ -43,6 +43,7 @@ struct ctl_table_header;
> struct net_generic;
> struct uevent_sock;
> struct netns_ipvs;
> +struct bpf_prog;
>
>
> #define NETDEV_HASHBITS 8
> @@ -145,6 +146,8 @@ struct net {
> #endif
> struct net_generic __rcu *gen;
>
> + struct bpf_prog __rcu *flow_dissector_prog;
> +
> /* Note : following structs are cache line aligned */
> #ifdef CONFIG_XFRM
> struct netns_xfrm xfrm;
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index a6d00093f35e..1b81ba85fd2d 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -19,6 +19,7 @@ struct Qdisc_ops;
> struct qdisc_walker;
> struct tcf_walker;
> struct module;
> +struct bpf_flow_keys;
>
> typedef int tc_setup_cb_t(enum tc_setup_type type,
> void *type_data, void *cb_priv);
> @@ -307,9 +308,14 @@ struct tcf_proto {
> };
>
> struct qdisc_skb_cb {
> - unsigned int pkt_len;
> - u16 slave_dev_queue_mapping;
> - u16 tc_classid;
> + union {
> + struct {
> + unsigned int pkt_len;
> + u16 slave_dev_queue_mapping;
> + u16 tc_classid;
> + };
> + struct bpf_flow_keys *flow_keys;
> + };
is this magic really necessary? flow_dissector runs very early in recv path.
There is no qdisc or conflicts with tcp/ip use of cb.
I think the whole cb block can be used.
> #define QDISC_CB_PRIV_LEN 20
> unsigned char data[QDISC_CB_PRIV_LEN];
> };
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 66917a4eba27..3064706fcaaa 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -152,6 +152,7 @@ enum bpf_prog_type {
> BPF_PROG_TYPE_LWT_SEG6LOCAL,
> BPF_PROG_TYPE_LIRC_MODE2,
> BPF_PROG_TYPE_SK_REUSEPORT,
> + BPF_PROG_TYPE_FLOW_DISSECTOR,
> };
>
> enum bpf_attach_type {
> @@ -172,6 +173,7 @@ enum bpf_attach_type {
> BPF_CGROUP_UDP4_SENDMSG,
> BPF_CGROUP_UDP6_SENDMSG,
> BPF_LIRC_MODE2,
> + BPF_FLOW_DISSECTOR,
> __MAX_BPF_ATTACH_TYPE
> };
>
> @@ -2333,6 +2335,7 @@ struct __sk_buff {
> /* ... here. */
>
> __u32 data_meta;
> + __u32 flow_keys;
please use
struct bpf_flow_keys *flow_keys;
instead.
See what we did in 'struct sk_msg_md' and in 'struct sk_reuseport_md'.
There is no need to hide pointers in u32.
> };
>
> struct bpf_tunnel_key {
> @@ -2778,4 +2781,26 @@ enum bpf_task_fd_type {
> BPF_FD_TYPE_URETPROBE, /* filename + offset */
> };
>
> +struct bpf_flow_keys {
> + __u16 thoff;
> + __u16 addr_proto; /* ETH_P_* of valid addrs */
> + __u8 is_frag;
> + __u8 is_first_frag;
> + __u8 is_encap;
> + __be16 n_proto;
> + __u8 ip_proto;
> + union {
> + struct {
> + __be32 ipv4_src;
> + __be32 ipv4_dst;
> + };
> + struct {
> + __u32 ipv6_src[4]; /* in6_addr; network order */
> + __u32 ipv6_dst[4]; /* in6_addr; network order */
> + };
> + };
> + __be16 sport;
> + __be16 dport;
> +};
> +
> #endif /* _UAPI__LINUX_BPF_H__ */
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 3c9636f03bb2..b3c2d09bcf7a 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -1615,6 +1615,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
> case BPF_LIRC_MODE2:
> ptype = BPF_PROG_TYPE_LIRC_MODE2;
> break;
> + case BPF_FLOW_DISSECTOR:
> + ptype = BPF_PROG_TYPE_FLOW_DISSECTOR;
> + break;
> default:
> return -EINVAL;
> }
> @@ -1636,6 +1639,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
> case BPF_PROG_TYPE_LIRC_MODE2:
> ret = lirc_prog_attach(attr, prog);
> break;
> + case BPF_PROG_TYPE_FLOW_DISSECTOR:
> + ret = skb_flow_dissector_bpf_prog_attach(attr, prog);
> + break;
> default:
> ret = cgroup_bpf_prog_attach(attr, ptype, prog);
> }
> @@ -1688,6 +1694,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
> return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL);
> case BPF_LIRC_MODE2:
> return lirc_prog_detach(attr);
> + case BPF_FLOW_DISSECTOR:
> + return skb_flow_dissector_bpf_prog_detach(attr);
> default:
> return -EINVAL;
> }
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 6ff1bac1795d..8ccbff4fff93 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -261,6 +261,7 @@ static const char * const reg_type_str[] = {
> [PTR_TO_PACKET] = "pkt",
> [PTR_TO_PACKET_META] = "pkt_meta",
> [PTR_TO_PACKET_END] = "pkt_end",
> + [PTR_TO_FLOW_KEYS] = "flow_keys",
> };
>
> static char slot_type_char[] = {
> @@ -965,6 +966,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
> case PTR_TO_PACKET:
> case PTR_TO_PACKET_META:
> case PTR_TO_PACKET_END:
> + case PTR_TO_FLOW_KEYS:
> case CONST_PTR_TO_MAP:
> return true;
> default:
> @@ -1238,6 +1240,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
> case BPF_PROG_TYPE_LWT_XMIT:
> case BPF_PROG_TYPE_SK_SKB:
> case BPF_PROG_TYPE_SK_MSG:
> + case BPF_PROG_TYPE_FLOW_DISSECTOR:
> if (meta)
> return meta->pkt_access;
>
> @@ -1321,6 +1324,18 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
> return -EACCES;
> }
>
> +static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
> + int size)
> +{
> + if (size < 0 || off < 0 ||
> + (u64)off + size > sizeof(struct bpf_flow_keys)) {
> + verbose(env, "invalid access to flow keys off=%d size=%d\n",
> + off, size);
> + return -EACCES;
> + }
> + return 0;
> +}
> +
> static bool __is_pointer_value(bool allow_ptr_leaks,
> const struct bpf_reg_state *reg)
> {
> @@ -1422,6 +1437,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
> * right in front, treat it the very same way.
> */
> return check_pkt_ptr_alignment(env, reg, off, size, strict);
> + case PTR_TO_FLOW_KEYS:
> + pointer_desc = "flow keys ";
> + break;
> case PTR_TO_MAP_VALUE:
> pointer_desc = "value ";
> break;
> @@ -1692,6 +1710,17 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
> err = check_packet_access(env, regno, off, size, false);
> if (!err && t == BPF_READ && value_regno >= 0)
> mark_reg_unknown(env, regs, value_regno);
> + } else if (reg->type == PTR_TO_FLOW_KEYS) {
> + if (t == BPF_WRITE && value_regno >= 0 &&
> + is_pointer_value(env, value_regno)) {
> + verbose(env, "R%d leaks addr into flow keys\n",
> + value_regno);
> + return -EACCES;
> + }
> +
> + err = check_flow_keys_access(env, off, size);
> + if (!err && t == BPF_READ && value_regno >= 0)
> + mark_reg_unknown(env, regs, value_regno);
> } else {
> verbose(env, "R%d invalid mem access '%s'\n", regno,
> reg_type_str[reg->type]);
> @@ -1839,6 +1868,8 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
> case PTR_TO_PACKET_META:
> return check_packet_access(env, regno, reg->off, access_size,
> zero_size_allowed);
> + case PTR_TO_FLOW_KEYS:
> + return check_flow_keys_access(env, reg->off, access_size);
> case PTR_TO_MAP_VALUE:
> return check_map_access(env, regno, reg->off, access_size,
> zero_size_allowed);
> @@ -4366,6 +4397,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
> case PTR_TO_CTX:
> case CONST_PTR_TO_MAP:
> case PTR_TO_PACKET_END:
> + case PTR_TO_FLOW_KEYS:
> /* Only valid matches are exact, which memcmp() above
> * would have accepted
> */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 8cb242b4400f..bc3725c26794 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -5122,6 +5122,17 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> }
> }
>
> +static const struct bpf_func_proto *
> +flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> +{
> + switch (func_id) {
> + case BPF_FUNC_skb_load_bytes:
> + return &bpf_skb_load_bytes_proto;
> + default:
> + return bpf_base_func_proto(func_id);
> + }
> +}
> +
> static const struct bpf_func_proto *
> lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> {
> @@ -5237,6 +5248,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
> case bpf_ctx_range(struct __sk_buff, data):
> case bpf_ctx_range(struct __sk_buff, data_meta):
> case bpf_ctx_range(struct __sk_buff, data_end):
> + case bpf_ctx_range(struct __sk_buff, flow_keys):
> if (size != size_default)
> return false;
> break;
> @@ -5265,6 +5277,7 @@ static bool sk_filter_is_valid_access(int off, int size,
> case bpf_ctx_range(struct __sk_buff, data):
> case bpf_ctx_range(struct __sk_buff, data_meta):
> case bpf_ctx_range(struct __sk_buff, data_end):
> + case bpf_ctx_range(struct __sk_buff, flow_keys):
> case bpf_ctx_range_till(struct __sk_buff, family, local_port):
> return false;
> }
> @@ -5290,6 +5303,7 @@ static bool lwt_is_valid_access(int off, int size,
> case bpf_ctx_range(struct __sk_buff, tc_classid):
> case bpf_ctx_range_till(struct __sk_buff, family, local_port):
> case bpf_ctx_range(struct __sk_buff, data_meta):
> + case bpf_ctx_range(struct __sk_buff, flow_keys):
> return false;
> }
>
> @@ -5500,6 +5514,7 @@ static bool tc_cls_act_is_valid_access(int off, int size,
> case bpf_ctx_range(struct __sk_buff, data_end):
> info->reg_type = PTR_TO_PACKET_END;
> break;
> + case bpf_ctx_range(struct __sk_buff, flow_keys):
> case bpf_ctx_range_till(struct __sk_buff, family, local_port):
> return false;
> }
> @@ -5701,6 +5716,7 @@ static bool sk_skb_is_valid_access(int off, int size,
> switch (off) {
> case bpf_ctx_range(struct __sk_buff, tc_classid):
> case bpf_ctx_range(struct __sk_buff, data_meta):
> + case bpf_ctx_range(struct __sk_buff, flow_keys):
> return false;
> }
>
> @@ -5760,6 +5776,39 @@ static bool sk_msg_is_valid_access(int off, int size,
> return true;
> }
>
> +static bool flow_dissector_is_valid_access(int off, int size,
> + enum bpf_access_type type,
> + const struct bpf_prog *prog,
> + struct bpf_insn_access_aux *info)
> +{
> + if (type == BPF_WRITE) {
> + switch (off) {
> + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
> + break;
> + default:
> + return false;
> + }
> + }
> +
> + switch (off) {
> + case bpf_ctx_range(struct __sk_buff, data):
> + info->reg_type = PTR_TO_PACKET;
> + break;
> + case bpf_ctx_range(struct __sk_buff, data_end):
> + info->reg_type = PTR_TO_PACKET_END;
> + break;
> + case bpf_ctx_range(struct __sk_buff, flow_keys):
> + info->reg_type = PTR_TO_FLOW_KEYS;
> + break;
> + case bpf_ctx_range(struct __sk_buff, tc_classid):
> + case bpf_ctx_range(struct __sk_buff, data_meta):
> + case bpf_ctx_range_till(struct __sk_buff, family, local_port):
> + return false;
> + }
> +
> + return bpf_skb_is_valid_access(off, size, type, prog, info);
> +}
> +
> static u32 bpf_convert_ctx_access(enum bpf_access_type type,
> const struct bpf_insn *si,
> struct bpf_insn *insn_buf,
> @@ -6054,6 +6103,15 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
> bpf_target_off(struct sock_common,
> skc_num, 2, target_size));
> break;
> +
> + case offsetof(struct __sk_buff, flow_keys):
> + off = si->off;
> + off -= offsetof(struct __sk_buff, flow_keys);
> + off += offsetof(struct sk_buff, cb);
> + off += offsetof(struct qdisc_skb_cb, flow_keys);
> + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
> + si->src_reg, off);
> + break;
> }
>
> return insn - insn_buf;
> @@ -7017,6 +7075,15 @@ const struct bpf_verifier_ops sk_msg_verifier_ops = {
> const struct bpf_prog_ops sk_msg_prog_ops = {
> };
>
> +const struct bpf_verifier_ops flow_dissector_verifier_ops = {
> + .get_func_proto = flow_dissector_func_proto,
> + .is_valid_access = flow_dissector_is_valid_access,
> + .convert_ctx_access = bpf_convert_ctx_access,
> +};
> +
> +const struct bpf_prog_ops flow_dissector_prog_ops = {
> +};
> +
> int sk_detach_filter(struct sock *sk)
> {
> int ret = -ENOENT;
> diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
> index ce9eeeb7c024..7eed48c46a94 100644
> --- a/net/core/flow_dissector.c
> +++ b/net/core/flow_dissector.c
> @@ -25,6 +25,9 @@
> #include <net/flow_dissector.h>
> #include <scsi/fc/fc_fcoe.h>
> #include <uapi/linux/batadv_packet.h>
> +#include <linux/bpf.h>
> +
> +static DEFINE_MUTEX(flow_dissector_mutex);
>
> static void dissector_set_key(struct flow_dissector *flow_dissector,
> enum flow_dissector_key_id key_id)
> @@ -62,6 +65,44 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
> }
> EXPORT_SYMBOL(skb_flow_dissector_init);
>
> +int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
> + struct bpf_prog *prog)
> +{
> + struct bpf_prog *attached;
> + struct net *net;
> +
> + net = current->nsproxy->net_ns;
> + mutex_lock(&flow_dissector_mutex);
> + attached = rcu_dereference_protected(net->flow_dissector_prog,
> + lockdep_is_held(&flow_dissector_mutex));
> + if (attached) {
> + /* Only one BPF program can be attached at a time */
> + mutex_unlock(&flow_dissector_mutex);
> + return -EEXIST;
> + }
> + rcu_assign_pointer(net->flow_dissector_prog, prog);
> + mutex_unlock(&flow_dissector_mutex);
> + return 0;
> +}
> +
> +int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
> +{
> + struct bpf_prog *attached;
> + struct net *net;
> +
> + net = current->nsproxy->net_ns;
> + mutex_lock(&flow_dissector_mutex);
> + attached = rcu_dereference_protected(net->flow_dissector_prog,
> + lockdep_is_held(&flow_dissector_mutex));
> + if (!attached) {
> + mutex_unlock(&flow_dissector_mutex);
> + return -ENOENT;
> + }
> + bpf_prog_put(attached);
> + RCU_INIT_POINTER(net->flow_dissector_prog, NULL);
> + mutex_unlock(&flow_dissector_mutex);
> + return 0;
> +}
> /**
> * skb_flow_get_be16 - extract be16 entity
> * @skb: sk_buff to extract from
> @@ -588,6 +629,60 @@ static bool skb_flow_dissect_allowed(int *num_hdrs)
> return (*num_hdrs <= MAX_FLOW_DISSECT_HDRS);
> }
>
> +static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
> + struct flow_dissector *flow_dissector,
> + void *target_container)
> +{
> + struct flow_dissector_key_control *key_control;
> + struct flow_dissector_key_basic *key_basic;
> + struct flow_dissector_key_addrs *key_addrs;
> + struct flow_dissector_key_ports *key_ports;
> +
> + key_control = skb_flow_dissector_target(flow_dissector,
> + FLOW_DISSECTOR_KEY_CONTROL,
> + target_container);
> + key_control->thoff = flow_keys->thoff;
> + if (flow_keys->is_frag)
> + key_control->flags |= FLOW_DIS_IS_FRAGMENT;
> + if (flow_keys->is_first_frag)
> + key_control->flags |= FLOW_DIS_FIRST_FRAG;
> + if (flow_keys->is_encap)
> + key_control->flags |= FLOW_DIS_ENCAPSULATION;
> +
> + key_basic = skb_flow_dissector_target(flow_dissector,
> + FLOW_DISSECTOR_KEY_BASIC,
> + target_container);
> + key_basic->n_proto = flow_keys->n_proto;
> + key_basic->ip_proto = flow_keys->ip_proto;
> +
> + if (flow_keys->addr_proto == ETH_P_IP &&
> + dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
> + key_addrs = skb_flow_dissector_target(flow_dissector,
> + FLOW_DISSECTOR_KEY_IPV4_ADDRS,
> + target_container);
> + key_addrs->v4addrs.src = flow_keys->ipv4_src;
> + key_addrs->v4addrs.dst = flow_keys->ipv4_dst;
> + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
> + } else if (flow_keys->addr_proto == ETH_P_IPV6 &&
> + dissector_uses_key(flow_dissector,
> + FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
> + key_addrs = skb_flow_dissector_target(flow_dissector,
> + FLOW_DISSECTOR_KEY_IPV6_ADDRS,
> + target_container);
> + memcpy(&key_addrs->v6addrs, &flow_keys->ipv6_src,
> + sizeof(key_addrs->v6addrs));
> + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
> + }
> +
> + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) {
> + key_ports = skb_flow_dissector_target(flow_dissector,
> + FLOW_DISSECTOR_KEY_PORTS,
> + target_container);
> + key_ports->src = flow_keys->sport;
> + key_ports->dst = flow_keys->dport;
> + }
> +}
> +
> /**
> * __skb_flow_dissect - extract the flow_keys struct and return it
> * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
> @@ -619,6 +714,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
> struct flow_dissector_key_vlan *key_vlan;
> enum flow_dissect_ret fdret;
> enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX;
> + struct bpf_prog *attached;
> int num_hdrs = 0;
> u8 ip_proto = 0;
> bool ret;
> @@ -658,6 +754,46 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
> FLOW_DISSECTOR_KEY_BASIC,
> target_container);
>
> + rcu_read_lock();
> + attached = skb ? rcu_dereference(dev_net(skb->dev)->flow_dissector_prog)
> + : NULL;
> + if (attached) {
> + /* Note that even though the const qualifier is discarded
> + * throughout the execution of the BPF program, all changes(the
> + * control block) are reverted after the BPF program returns.
> + * Therefore, __skb_flow_dissect does not alter the skb.
> + */
> + struct bpf_flow_keys flow_keys = {};
> + struct qdisc_skb_cb cb_saved;
> + struct qdisc_skb_cb *cb;
> + u16 *pseudo_cb;
> + u32 result;
> +
> + cb = qdisc_skb_cb(skb);
> + pseudo_cb = (u16 *)bpf_skb_cb((struct sk_buff *)skb);
> +
> + /* Save Control Block */
> + memcpy(&cb_saved, cb, sizeof(cb_saved));
> + memset(cb, 0, sizeof(cb_saved));
> +
> + /* Pass parameters to the BPF program */
> + cb->flow_keys = &flow_keys;
> + *pseudo_cb = nhoff;
I don't understand this bit.
What is this pseudo_cb and why nhoff goes in there?
Some odd way to pass it into the prog?
> +
> + bpf_compute_data_pointers((struct sk_buff *)skb);
> + result = BPF_PROG_RUN(attached, skb);
> +
> + /* Restore state */
> + memcpy(cb, &cb_saved, sizeof(cb_saved));
> +
> + __skb_flow_bpf_to_target(&flow_keys, flow_dissector,
> + target_container);
> + key_control->thoff = min_t(u16, key_control->thoff, skb->len);
> + rcu_read_unlock();
> + return result == BPF_OK;
> + }
> + rcu_read_unlock();
> +
> if (dissector_uses_key(flow_dissector,
> FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
> struct ethhdr *eth = eth_hdr(skb);
^ permalink raw reply
* Re: [PATCH net-next 2/5] eBPF: Add new eBPF prog type BPF_PROG_TYPE_SOCKET_SG_FILTER
From: Alexei Starovoitov @ 2018-09-12 3:57 UTC (permalink / raw)
To: Tushar Dave
Cc: ast, daniel, davem, santosh.shilimkar, jakub.kicinski,
quentin.monnet, jiong.wang, sandipan, john.fastabend, kafai, rdna,
yhs, netdev, rds-devel, sowmini.varadhan
In-Reply-To: <1536694684-3200-3-git-send-email-tushar.n.dave@oracle.com>
On Tue, Sep 11, 2018 at 09:38:01PM +0200, Tushar Dave wrote:
> Add new eBPF prog type BPF_PROG_TYPE_SOCKET_SG_FILTER which uses the
> existing socket filter infrastructure for bpf program attach and load.
> SOCKET_SG_FILTER eBPF program receives struct scatterlist as bpf context
> contrast to SOCKET_FILTER which deals with struct skb. This is useful
> for kernel entities that don't have skb to represent packet data but
> want to run eBPF socket filter on packet data that is in form of struct
> scatterlist e.g. IB/RDMA
>
> Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
> Acked-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
> ---
> include/linux/bpf_types.h | 1 +
> include/uapi/linux/bpf.h | 1 +
> kernel/bpf/syscall.c | 1 +
> kernel/bpf/verifier.c | 1 +
> net/core/filter.c | 55 ++++++++++++++++++++++++++++++++++++++++--
> samples/bpf/bpf_load.c | 11 ++++++---
> tools/bpf/bpftool/prog.c | 1 +
> tools/include/uapi/linux/bpf.h | 1 +
> tools/lib/bpf/libbpf.c | 3 +++
> tools/lib/bpf/libbpf.h | 2 ++
please do not mix core kernel and user space into single patch.
split tools/include/uapi/linux/bpf.h sync into separate patch
and changes to tools/lib/bpf as yet another patch.
> 10 files changed, 72 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> index cd26c09..7dc1503 100644
> --- a/include/linux/bpf_types.h
> +++ b/include/linux/bpf_types.h
> @@ -16,6 +16,7 @@
> BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops)
> BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb)
> BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg)
> +BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_SG_FILTER, socksg_filter)
> #endif
> #ifdef CONFIG_BPF_EVENTS
> BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe)
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 66917a4..6ec1e32 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -152,6 +152,7 @@ enum bpf_prog_type {
> BPF_PROG_TYPE_LWT_SEG6LOCAL,
> BPF_PROG_TYPE_LIRC_MODE2,
> BPF_PROG_TYPE_SK_REUSEPORT,
> + BPF_PROG_TYPE_SOCKET_SG_FILTER,
> };
>
> enum bpf_attach_type {
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 3c9636f..5f302b7 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -1361,6 +1361,7 @@ static int bpf_prog_load(union bpf_attr *attr)
>
> if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
> type != BPF_PROG_TYPE_CGROUP_SKB &&
> + type != BPF_PROG_TYPE_SOCKET_SG_FILTER &&
I'm not comfortable to let unpriv use this right away.
Can you live with root-only ?
> !capable(CAP_SYS_ADMIN))
> return -EPERM;
>
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index f4ff0c5..17fc4d2 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -1234,6 +1234,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
> case BPF_PROG_TYPE_LWT_XMIT:
> case BPF_PROG_TYPE_SK_SKB:
> case BPF_PROG_TYPE_SK_MSG:
> + case BPF_PROG_TYPE_SOCKET_SG_FILTER:
> if (meta)
> return meta->pkt_access;
>
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 0b40f95..469c488 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -1140,7 +1140,8 @@ static void bpf_release_orig_filter(struct bpf_prog *fp)
>
> static void __bpf_prog_release(struct bpf_prog *prog)
> {
> - if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) {
> + if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER ||
> + prog->type == BPF_PROG_TYPE_SOCKET_SG_FILTER) {
> bpf_prog_put(prog);
this doesn't look right.
Why this is needed?
Are you using old-style setsockopt to attach?
I think new style of attaching that all bpf prog types that came
after socket_filter are using is preferred.
Pls take a look at BPF_PROG_ATTACH cmd.
Also it looks the first patch doesn't really add the useful logic, but adds
few lines of code here and there. Then more code comes in patches 3 and 4.
Please rearrange them that they're reviewable as logical pieces.
^ permalink raw reply
* Re: [PATCH net-next 3/5] ebpf: Add sg_filter_run()
From: Alexei Starovoitov @ 2018-09-12 3:58 UTC (permalink / raw)
To: Tushar Dave
Cc: ast, daniel, davem, santosh.shilimkar, jakub.kicinski,
quentin.monnet, jiong.wang, sandipan, john.fastabend, kafai, rdna,
yhs, netdev, rds-devel, sowmini.varadhan
In-Reply-To: <1536694684-3200-4-git-send-email-tushar.n.dave@oracle.com>
On Tue, Sep 11, 2018 at 09:38:02PM +0200, Tushar Dave wrote:
> When sg_filter_run() is invoked it runs the attached eBPF
> prog of type BPF_PROG_TYPE_SOCKET_SG_FILTER which deals with
> struct scatterlist.
>
> Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
> Acked-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
> ---
> include/linux/filter.h | 8 ++++++++
> include/uapi/linux/bpf.h | 6 ++++++
> net/core/filter.c | 35 +++++++++++++++++++++++++++++++++++
> tools/include/uapi/linux/bpf.h | 6 ++++++
> 4 files changed, 55 insertions(+)
>
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 6791a0a..ae664a9 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -1113,4 +1113,12 @@ struct bpf_sock_ops_kern {
> */
> };
>
> +enum __socksg_action {
> + __SOCKSG_PASS = 0,
> + __SOCKSG_DROP,
> + __SOCKSG_REDIRECT,
what is this? I see no code that handles it either in this patch
or in the later patches?!
^ permalink raw reply
* Re: [PATCH net-next 5/5] ebpf: Add sample ebpf program for SOCKET_SG_FILTER
From: Alexei Starovoitov @ 2018-09-12 4:00 UTC (permalink / raw)
To: Tushar Dave
Cc: ast, daniel, davem, santosh.shilimkar, jakub.kicinski,
quentin.monnet, jiong.wang, sandipan, john.fastabend, kafai, rdna,
yhs, netdev, rds-devel, sowmini.varadhan
In-Reply-To: <1536694684-3200-6-git-send-email-tushar.n.dave@oracle.com>
On Tue, Sep 11, 2018 at 09:38:04PM +0200, Tushar Dave wrote:
> Add a sample program that shows how socksg program is used and attached
> to socket filter. The kernel sample program deals with struct
> scatterlist that is passed as bpf context.
>
> When run in server mode, the sample RDS program opens PF_RDS socket,
> attaches eBPF program to RDS socket which then uses bpf_msg_pull_data
> helper to inspect packet data contained in struct scatterlist and
> returns appropriate action code back to kernel.
>
> To ease testing, RDS client functionality is also added so that users
> can generate RDS packet.
>
> Server:
> [root@lab71 bpf]# ./rds_filter -s 192.168.3.71 -t tcp
> running server in a loop
> transport tcp
> server bound to address: 192.168.3.71 port 4000
> server listening on 192.168.3.71
>
> Client:
> [root@lab70 bpf]# ./rds_filter -s 192.168.3.71 -c 192.168.3.70 -t tcp
> transport tcp
> client bound to address: 192.168.3.70 port 25278
> client sending 8192 byte message from 192.168.3.70 to 192.168.3.71 on
> port 25278
> payload contains:30 31 32 33 34 35 36 37 38 39 ...
>
> Server output:
> 192.168.3.71 received a packet from 192.168.3.71 of len 8192 cmsg len 0,
> on port 25278
> payload contains:30 31 32 33 34 35 36 37 38 39 ...
> server listening on 192.168.3.71
>
> [root@lab71 tushar]# cat /sys/kernel/debug/tracing/trace_pipe
> <idle>-0 [038] ..s. 146.947362: 0: 30 31 32
> <idle>-0 [038] ..s. 146.947364: 0: 33 34 35
>
> Similarly specifying '-t ib' will run this on IB link.
>
> Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
> Acked-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
> ---
> samples/bpf/Makefile | 3 +
> samples/bpf/rds_filter_kern.c | 42 ++++++
> samples/bpf/rds_filter_user.c | 339 ++++++++++++++++++++++++++++++++++++++++++
please no samples.
Add this as proper test to tools/testing/selftests/bpf
that reports PASS/FAIL and can be run automatically.
samples/bpf is effectively dead code.
^ permalink raw reply
* Re: [PATCH v2] PCI: Reprogram bridge prefetch registers on resume
From: Rafael J. Wysocki @ 2018-09-12 9:05 UTC (permalink / raw)
To: Daniel Drake
Cc: andy.shevchenko-VuQAYsv1563Yd54FQh9/CA, Mika Westerberg, Linux PM,
Linux PCI, Rafael Wysocki, nic_swsd-Rasf1IRRPZFBDgjK7y7TUQ,
Keith Busch, netdev, nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
Bjorn Helgaas, rchang-eYqpPyKDWXRBDgjK7y7TUQ,
Linux Upstreaming Team, David Miller,
jonathan.derrick-ral2JQCrhuEAvxtiuMwx3w, Heiner Kallweit
In-Reply-To: <20180912064523.9599-1-drake-6IF/jdPJHihWk0Htik3J/w@public.gmane.org>
On Wed, Sep 12, 2018 at 8:45 AM Daniel Drake <drake@endlessm.com> wrote:
>
> On 38+ Intel-based Asus products, the nvidia GPU becomes unusable
> after S3 suspend/resume. The affected products include multiple
> generations of nvidia GPUs and Intel SoCs. After resume, nouveau logs
> many errors such as:
>
> fifo: fault 00 [READ] at 0000005555555000 engine 00 [GR] client 04 [HUB/FE] reason 4a [] on channel -1 [007fa91000 unknown]
> DRM: failed to idle channel 0 [DRM]
>
> Similarly, the nvidia proprietary driver also fails after resume
> (black screen, 100% CPU usage in Xorg process). We shipped a sample
> to Nvidia for diagnosis, and their response indicated that it's a
> problem with the parent PCI bridge (on the Intel SoC), not the GPU.
>
> Runtime suspend/resume works fine, only S3 suspend is affected.
>
> We found a workaround: on resume, rewrite the Intel PCI bridge
> 'Prefetchable Base Upper 32 Bits' register (PCI_PREF_BASE_UPPER32). In
> the cases that I checked, this register has value 0 and we just have to
> rewrite that value.
>
> Linux already saves and restores PCI config space during suspend/resume,
> but this register was being skipped because upon resume, it already
> has value 0 (the correct, pre-suspend value).
>
> Intel appear to have previously acknowledged this behaviour and the
> requirement to rewrite this register.
> https://bugzilla.kernel.org/show_bug.cgi?id=116851#c23
>
> Based on that, rewrite the prefetch register values even when that
> appears unnecessary.
>
> We have confirmed this solution on all the affected models we have
> in-hands (X542UQ, UX533FD, X530UN, V272UN).
>
> Additionally, this solves an issue where r8169 MSI-X interrupts were
> broken after S3 suspend/resume on Asus X441UAR. This issue was recently
> worked around in commit 7bb05b85bc2d ("r8169: don't use MSI-X on
> RTL8106e"). It also fixes the same issue on RTL6186evl/8111evl on an
> Aimfor-tech laptop that we had not yet patched. I suspect it will also
> fix the issue that was worked around in commit 7c53a722459c ("r8169:
> don't use MSI-X on RTL8168g").
>
> Thomas Martitz reports that this change also solves an issue where
> the AMD Radeon Polaris 10 GPU on the HP Zbook 14u G5 is unresponsive
> after S3 suspend/resume.
>
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=201069
> Signed-off-by: Daniel Drake <drake@endlessm.com>
> ---
>
> Notes:
> Replaces patch:
> PCI: add prefetch quirk to work around Asus/Nvidia suspend issues
>
> Some of the more verbose info was moved to bugzilla:
> https://bugzilla.kernel.org/show_bug.cgi?id=201069
>
> This patch is aimed at v4.19 (and maybe v4.18-stable); we may follow
> up with more intrusive improvements for v4.20+.
>
> v2: reimplement the register restore within the existing
> pci_restore_config_space() code.
>
> drivers/pci/pci.c | 31 +++++++++++++++++++++----------
> 1 file changed, 21 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index 29ff9619b5fa..e1704100e72d 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -1289,13 +1289,15 @@ int pci_save_state(struct pci_dev *dev)
> EXPORT_SYMBOL(pci_save_state);
>
> static void pci_restore_config_dword(struct pci_dev *pdev, int offset,
> - u32 saved_val, int retry)
> + u32 saved_val, int retry, bool force)
> {
> u32 val;
>
> - pci_read_config_dword(pdev, offset, &val);
> - if (val == saved_val)
> - return;
> + if (!force) {
> + pci_read_config_dword(pdev, offset, &val);
> + if (val == saved_val)
> + return;
> + }
>
> for (;;) {
> pci_dbg(pdev, "restoring config space at offset %#x (was %#x, writing %#x)\n",
> @@ -1313,25 +1315,34 @@ static void pci_restore_config_dword(struct pci_dev *pdev, int offset,
> }
>
> static void pci_restore_config_space_range(struct pci_dev *pdev,
> - int start, int end, int retry)
> + int start, int end, int retry,
> + bool force)
> {
> int index;
>
> for (index = end; index >= start; index--)
> pci_restore_config_dword(pdev, 4 * index,
> pdev->saved_config_space[index],
> - retry);
> + retry, force);
> }
>
> static void pci_restore_config_space(struct pci_dev *pdev)
> {
> if (pdev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
> - pci_restore_config_space_range(pdev, 10, 15, 0);
> + pci_restore_config_space_range(pdev, 10, 15, 0, false);
> /* Restore BARs before the command register. */
> - pci_restore_config_space_range(pdev, 4, 9, 10);
> - pci_restore_config_space_range(pdev, 0, 3, 0);
> + pci_restore_config_space_range(pdev, 4, 9, 10, false);
> + pci_restore_config_space_range(pdev, 0, 3, 0, false);
> + } else if (pdev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
> + pci_restore_config_space_range(pdev, 12, 15, 0, false);
> + /* Force rewriting of prefetch registers to avoid
> + * S3 resume issues on Intel PCI bridges that occur when
> + * these registers are not explicitly written.
> + */
> + pci_restore_config_space_range(pdev, 9, 11, 0, true);
> + pci_restore_config_space_range(pdev, 0, 8, 0, false);
> } else {
> - pci_restore_config_space_range(pdev, 0, 15, 0);
> + pci_restore_config_space_range(pdev, 0, 15, 0, false);
> }
> }
>
> --
Passing the extra bool around is somewhat clumsy, but I haven't found
a cleaner way to do it, so
Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
_______________________________________________
Nouveau mailing list
Nouveau@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/nouveau
^ permalink raw reply
* Re: [PATCH][net-dsa-next] net: dsa: b53: ensure variable pause is initialized
From: Dan Carpenter @ 2018-09-12 9:18 UTC (permalink / raw)
To: Colin King
Cc: Florian Fainelli, Andrew Lunn, Vivien Didelot, David S . Miller,
netdev, kernel-janitors, linux-kernel
In-Reply-To: <20180911175311.17929-1-colin.king@canonical.com>
I already sent this one.
regards,
dan carpenter
^ permalink raw reply
* [PATCH v2,net-next 1/2] ip_gre: fix parsing gre header in ipgre_err
From: Haishuang Yan @ 2018-09-12 9:21 UTC (permalink / raw)
To: David S. Miller, Alexey Kuznetsov
Cc: Jiri Benc, netdev, linux-kernel, Haishuang Yan
gre_parse_header stops parsing when csum_err is encountered, which means
tpi->key is undefined and ip_tunnel_lookup will return NULL improperly.
This patch introduce a NULL pointer as csum_err parameter. Even when
csum_err is encountered, it won't return error and continue parsing gre
header as expected.
Fixes: 9f57c67c379d ("gre: Remove support for sharing GRE protocol hook.")
Reported-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
---
net/ipv4/gre_demux.c | 2 +-
net/ipv4/ip_gre.c | 9 +++------
2 files changed, 4 insertions(+), 7 deletions(-)
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index b798862..679a527 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -86,7 +86,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
options = (__be32 *)(greh + 1);
if (greh->flags & GRE_CSUM) {
- if (skb_checksum_simple_validate(skb)) {
+ if (csum_err && skb_checksum_simple_validate(skb)) {
*csum_err = true;
return -EINVAL;
}
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 8cce0e9..c3385a8 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -232,13 +232,10 @@ static void gre_err(struct sk_buff *skb, u32 info)
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
struct tnl_ptk_info tpi;
- bool csum_err = false;
- if (gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP),
- iph->ihl * 4) < 0) {
- if (!csum_err) /* ignore csum errors. */
- return;
- }
+ if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IP),
+ iph->ihl * 4) < 0)
+ return;
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
ipv4_update_pmtu(skb, dev_net(skb->dev), info,
--
1.8.3.1
^ permalink raw reply related
* [PATCH v2,net-next 2/2] ip6_gre: simplify gre header parsing in ip6gre_err
From: Haishuang Yan @ 2018-09-12 9:21 UTC (permalink / raw)
To: David S. Miller, Alexey Kuznetsov
Cc: Jiri Benc, netdev, linux-kernel, Haishuang Yan
In-Reply-To: <1536744082-3568-1-git-send-email-yanhaishuang@cmss.chinamobile.com>
Same as ip_gre, use gre_parse_header to parse gre header in gre error
handler code.
Signed-off-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
---
net/ipv6/ip6_gre.c | 26 ++++----------------------
1 file changed, 4 insertions(+), 22 deletions(-)
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index e493b04..515adbd 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -427,35 +427,17 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
struct net *net = dev_net(skb->dev);
- const struct gre_base_hdr *greh;
const struct ipv6hdr *ipv6h;
- int grehlen = sizeof(*greh);
+ struct tnl_ptk_info tpi;
struct ip6_tnl *t;
- int key_off = 0;
- __be16 flags;
- __be32 key;
- if (!pskb_may_pull(skb, offset + grehlen))
- return;
- greh = (const struct gre_base_hdr *)(skb->data + offset);
- flags = greh->flags;
- if (flags & (GRE_VERSION | GRE_ROUTING))
+ if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IPV6),
+ offset) < 0)
return;
- if (flags & GRE_CSUM)
- grehlen += 4;
- if (flags & GRE_KEY) {
- key_off = grehlen + offset;
- grehlen += 4;
- }
- if (!pskb_may_pull(skb, offset + grehlen))
- return;
ipv6h = (const struct ipv6hdr *)skb->data;
- greh = (const struct gre_base_hdr *)(skb->data + offset);
- key = key_off ? *(__be32 *)(skb->data + key_off) : 0;
-
t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr,
- key, greh->protocol);
+ tpi.key, tpi.proto);
if (!t)
return;
--
1.8.3.1
^ permalink raw reply related
* Re: [PATCH 00/12] Netfilter fixes for net
From: David Miller @ 2018-09-12 4:21 UTC (permalink / raw)
To: pablo; +Cc: netfilter-devel, netdev
In-Reply-To: <20180911002044.9100-1-pablo@netfilter.org>
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 11 Sep 2018 02:20:32 +0200
> The following patchset contains Netfilter fixes for you net tree:
...
> You can pull these changes from:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git
Looks good, pulled, thanks.
^ permalink raw reply
* [PATCH v2 2/3] r8169: Get and enable optional ether_clk clock
From: Hans de Goede @ 2018-09-12 9:34 UTC (permalink / raw)
To: David S . Miller, Heiner Kallweit, Michael Turquette,
Stephen Boyd, Andy Shevchenko, Pierre-Louis Bossart
Cc: Hans de Goede, linux-wireless-u79uwXL29TY76Z2rM5mHXA,
netdev-u79uwXL29TY76Z2rM5mHXA, Johannes Stezenbach, Carlo Caione,
linux-clk-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20180912093456.23400-1-hdegoede-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
On some boards a platform clock is used as clock for the r8169 chip,
this commit adds support for getting and enabling this clock (assuming
it has an "ether_clk" alias set on it).
This is related to commit d31fd43c0f9a ("clk: x86: Do not gate clocks
enabled by the firmware") which is a previous attempt to fix this for some
x86 boards, but this causes all Cherry Trail SoC using boards to not reach
there lowest power states when suspending.
This commit (together with an atom-pmc-clk driver commit adding the alias)
fixes things properly by making the r8169 get the clock and enable it when
it needs it.
Buglink: https://bugzilla.kernel.org/show_bug.cgi?id=193891#c102
Buglink: https://bugzilla.kernel.org/show_bug.cgi?id=196861
Cc: Johannes Stezenbach <js-FF7aIK3TAVNeoWH0uzbU5w@public.gmane.org>
Cc: Carlo Caione <carlo-6IF/jdPJHihWk0Htik3J/w@public.gmane.org>
Reported-by: Johannes Stezenbach <js-FF7aIK3TAVNeoWH0uzbU5w@public.gmane.org>
Acked-by: Stephen Boyd <sboyd-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>
Signed-off-by: Hans de Goede <hdegoede-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
---
Changes in v2:
-Tweaked the commit msg a bit
-Added: Stephen's Acked-by, Andy's Reviewed-by
---
drivers/net/ethernet/realtek/r8169.c | 33 ++++++++++++++++++++++++++++
1 file changed, 33 insertions(+)
diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index b08d51bf7a20..474229548731 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -13,6 +13,7 @@
#include <linux/pci.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
+#include <linux/clk.h>
#include <linux/delay.h>
#include <linux/ethtool.h>
#include <linux/phy.h>
@@ -665,6 +666,7 @@ struct rtl8169_private {
u16 event_slow;
const struct rtl_coalesce_info *coalesce_info;
+ struct clk *clk;
struct mdio_ops {
void (*write)(struct rtl8169_private *, int, int);
@@ -7254,6 +7256,11 @@ static int rtl_jumbo_max(struct rtl8169_private *tp)
}
}
+static void rtl_disable_clk(void *data)
+{
+ clk_disable_unprepare(data);
+}
+
static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
{
const struct rtl_cfg_info *cfg = rtl_cfg_infos + ent->driver_data;
@@ -7274,6 +7281,32 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
tp->msg_enable = netif_msg_init(debug.msg_enable, R8169_MSG_DEFAULT);
tp->supports_gmii = cfg->has_gmii;
+ /* Get the *optional* external "ether_clk" used on some boards */
+ tp->clk = devm_clk_get(&pdev->dev, "ether_clk");
+ if (IS_ERR(tp->clk)) {
+ rc = PTR_ERR(tp->clk);
+ if (rc == -ENOENT) {
+ /* clk-core allows NULL (for suspend / resume) */
+ tp->clk = NULL;
+ } else if (rc == -EPROBE_DEFER) {
+ return rc;
+ } else {
+ dev_err(&pdev->dev, "failed to get clk: %d\n", rc);
+ return rc;
+ }
+ } else {
+ rc = clk_prepare_enable(tp->clk);
+ if (rc) {
+ dev_err(&pdev->dev, "failed to enable clk: %d\n", rc);
+ return rc;
+ }
+
+ rc = devm_add_action_or_reset(&pdev->dev, rtl_disable_clk,
+ tp->clk);
+ if (rc)
+ return rc;
+ }
+
/* enable device (incl. PCI PM wakeup and hotplug setup) */
rc = pcim_enable_device(pdev);
if (rc < 0) {
--
2.19.0.rc0
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox