Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH iproute2 1/1] tc: pass correct conversion specifier to print 'unsigned int' action index.
From: Roman Mashak @ 2016-12-13 20:31 UTC (permalink / raw)
  To: stephen; +Cc: netdev, jhs, daniel, xiyou.wangcong, Roman Mashak

Signed-off-by: Roman Mashak <mrv@mojatatu.com>
Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
---
 tc/m_bpf.c      | 2 +-
 tc/m_connmark.c | 2 +-
 tc/m_csum.c     | 3 ++-
 tc/m_gact.c     | 3 ++-
 tc/m_ife.c      | 2 +-
 tc/m_ipt.c      | 2 +-
 tc/m_mirred.c   | 3 ++-
 tc/m_pedit.c    | 2 +-
 tc/m_simple.c   | 2 +-
 tc/m_skbedit.c  | 2 +-
 tc/m_skbmod.c   | 2 +-
 tc/m_vlan.c     | 2 +-
 tc/m_xt.c       | 2 +-
 tc/m_xt_old.c   | 2 +-
 14 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/tc/m_bpf.c b/tc/m_bpf.c
index 9bf2a85..6400724 100644
--- a/tc/m_bpf.c
+++ b/tc/m_bpf.c
@@ -161,7 +161,7 @@ static int bpf_print_opt(struct action_util *au, FILE *f, struct rtattr *arg)
 	}
 
 	fprintf(f, "default-action %s\n", action_n2a(parm->action));
-	fprintf(f, "\tindex %d ref %d bind %d", parm->index, parm->refcnt,
+	fprintf(f, "\tindex %u ref %d bind %d", parm->index, parm->refcnt,
 		parm->bindcnt);
 
 	if (show_stats) {
diff --git a/tc/m_connmark.c b/tc/m_connmark.c
index 20f98e4..295f90d 100644
--- a/tc/m_connmark.c
+++ b/tc/m_connmark.c
@@ -123,7 +123,7 @@ static int print_connmark(struct action_util *au, FILE *f, struct rtattr *arg)
 	ci = RTA_DATA(tb[TCA_CONNMARK_PARMS]);
 
 	fprintf(f, " connmark zone %d\n", ci->zone);
-	fprintf(f, "\t index %d ref %d bind %d", ci->index,
+	fprintf(f, "\t index %u ref %d bind %d", ci->index,
 		ci->refcnt, ci->bindcnt);
 
 	if (show_stats) {
diff --git a/tc/m_csum.c b/tc/m_csum.c
index a6e4c1e..d5b1af6 100644
--- a/tc/m_csum.c
+++ b/tc/m_csum.c
@@ -199,7 +199,8 @@ print_csum(struct action_util *au, FILE *f, struct rtattr *arg)
 		uflag_1, uflag_2, uflag_3,
 		uflag_4, uflag_5, uflag_6,
 		action_n2a(sel->action));
-	fprintf(f, "\tindex %d ref %d bind %d", sel->index, sel->refcnt, sel->bindcnt);
+	fprintf(f, "\tindex %u ref %d bind %d", sel->index, sel->refcnt,
+		sel->bindcnt);
 
 	if (show_stats) {
 		if (tb[TCA_CSUM_TM]) {
diff --git a/tc/m_gact.c b/tc/m_gact.c
index dc04b9f..755a3be 100644
--- a/tc/m_gact.c
+++ b/tc/m_gact.c
@@ -224,7 +224,8 @@ print_gact(struct action_util *au, FILE * f, struct rtattr *arg)
 	fprintf(f, "\n\t random type %s %s val %d",
 		prob_n2a(pp->ptype), action_n2a(pp->paction), pp->pval);
 #endif
-	fprintf(f, "\n\t index %d ref %d bind %d", p->index, p->refcnt, p->bindcnt);
+	fprintf(f, "\n\t index %u ref %d bind %d", p->index, p->refcnt,
+		p->bindcnt);
 	if (show_stats) {
 		if (tb[TCA_GACT_TM]) {
 			struct tcf_t *tm = RTA_DATA(tb[TCA_GACT_TM]);
diff --git a/tc/m_ife.c b/tc/m_ife.c
index e6f6153..f6131b1 100644
--- a/tc/m_ife.c
+++ b/tc/m_ife.c
@@ -312,7 +312,7 @@ static int print_ife(struct action_util *au, FILE *f, struct rtattr *arg)
 				    sizeof(b2)));
 	}
 
-	fprintf(f, "\n\t index %d ref %d bind %d", p->index, p->refcnt,
+	fprintf(f, "\n\t index %u ref %d bind %d", p->index, p->refcnt,
 		p->bindcnt);
 	if (show_stats) {
 		if (tb[TCA_IFE_TM]) {
diff --git a/tc/m_ipt.c b/tc/m_ipt.c
index d6f62bd..1b935ec 100644
--- a/tc/m_ipt.c
+++ b/tc/m_ipt.c
@@ -489,7 +489,7 @@ print_ipt(struct action_util *au, FILE * f, struct rtattr *arg)
 			__u32 index;
 
 			index = rta_getattr_u32(tb[TCA_IPT_INDEX]);
-			fprintf(f, "\n\tindex %d", index);
+			fprintf(f, "\n\tindex %u", index);
 		}
 
 		if (tb[TCA_IPT_CNT]) {
diff --git a/tc/m_mirred.c b/tc/m_mirred.c
index 11f4c9b..35ae21f 100644
--- a/tc/m_mirred.c
+++ b/tc/m_mirred.c
@@ -260,7 +260,8 @@ print_mirred(struct action_util *au, FILE * f, struct rtattr *arg)
 		mirred_n2a(p->eaction), dev, action_n2a(p->action));
 
 	fprintf(f, "\n ");
-	fprintf(f, "\tindex %d ref %d bind %d", p->index, p->refcnt, p->bindcnt);
+	fprintf(f, "\tindex %u ref %d bind %d", p->index, p->refcnt,
+		p->bindcnt);
 
 	if (show_stats) {
 		if (tb[TCA_MIRRED_TM]) {
diff --git a/tc/m_pedit.c b/tc/m_pedit.c
index 891c2ec..8e9bf07 100644
--- a/tc/m_pedit.c
+++ b/tc/m_pedit.c
@@ -527,7 +527,7 @@ int print_pedit(struct action_util *au, FILE *f, struct rtattr *arg)
 
 	fprintf(f, " pedit action %s keys %d\n ",
 		action_n2a(sel->action), sel->nkeys);
-	fprintf(f, "\t index %d ref %d bind %d", sel->index, sel->refcnt,
+	fprintf(f, "\t index %u ref %d bind %d", sel->index, sel->refcnt,
 		sel->bindcnt);
 
 	if (show_stats) {
diff --git a/tc/m_simple.c b/tc/m_simple.c
index 732eaf1..3a8bd91 100644
--- a/tc/m_simple.c
+++ b/tc/m_simple.c
@@ -187,7 +187,7 @@ static int print_simple(struct action_util *au, FILE *f, struct rtattr *arg)
 	simpdata = RTA_DATA(tb[TCA_DEF_DATA]);
 
 	fprintf(f, "Simple <%s>\n", simpdata);
-	fprintf(f, "\t index %d ref %d bind %d", sel->index,
+	fprintf(f, "\t index %u ref %d bind %d", sel->index,
 		sel->refcnt, sel->bindcnt);
 
 	if (show_stats) {
diff --git a/tc/m_skbedit.c b/tc/m_skbedit.c
index 368debc..8660d60 100644
--- a/tc/m_skbedit.c
+++ b/tc/m_skbedit.c
@@ -214,7 +214,7 @@ static int print_skbedit(struct action_util *au, FILE *f, struct rtattr *arg)
 			fprintf(f, " ptype %d", *ptype);
 	}
 
-	fprintf(f, "\n\t index %d ref %d bind %d",
+	fprintf(f, "\n\t index %u ref %d bind %d",
 		p->index, p->refcnt, p->bindcnt);
 
 	if (show_stats) {
diff --git a/tc/m_skbmod.c b/tc/m_skbmod.c
index 0c293fc..acb7771 100644
--- a/tc/m_skbmod.c
+++ b/tc/m_skbmod.c
@@ -237,7 +237,7 @@ static int print_skbmod(struct action_util *au, FILE *f, struct rtattr *arg)
 	if (p->flags & SKBMOD_F_SWAPMAC)
 		fprintf(f, "swap mac ");
 
-	fprintf(f, "\n\t index %d ref %d bind %d", p->index, p->refcnt,
+	fprintf(f, "\n\t index %u ref %d bind %d", p->index, p->refcnt,
 		p->bindcnt);
 	if (show_stats) {
 		if (tb[TCA_SKBMOD_TM]) {
diff --git a/tc/m_vlan.c b/tc/m_vlan.c
index b32f746..44b9375 100644
--- a/tc/m_vlan.c
+++ b/tc/m_vlan.c
@@ -226,7 +226,7 @@ static int print_vlan(struct action_util *au, FILE *f, struct rtattr *arg)
 	}
 	fprintf(f, " %s", action_n2a(parm->action));
 
-	fprintf(f, "\n\t index %d ref %d bind %d", parm->index, parm->refcnt,
+	fprintf(f, "\n\t index %u ref %d bind %d", parm->index, parm->refcnt,
 		parm->bindcnt);
 
 	if (show_stats) {
diff --git a/tc/m_xt.c b/tc/m_xt.c
index 028bad6..dbb5498 100644
--- a/tc/m_xt.c
+++ b/tc/m_xt.c
@@ -372,7 +372,7 @@ print_ipt(struct action_util *au, FILE *f, struct rtattr *arg)
 		__u32 index;
 
 		index = rta_getattr_u32(tb[TCA_IPT_INDEX]);
-		fprintf(f, "\n\tindex %d", index);
+		fprintf(f, "\n\tindex %u", index);
 	}
 
 	if (tb[TCA_IPT_CNT]) {
diff --git a/tc/m_xt_old.c b/tc/m_xt_old.c
index 20a6342..e9cc624 100644
--- a/tc/m_xt_old.c
+++ b/tc/m_xt_old.c
@@ -412,7 +412,7 @@ print_ipt(struct action_util *au, FILE * f, struct rtattr *arg)
 			__u32 index;
 
 			index = rta_getattr_u32(tb[TCA_IPT_INDEX]);
-			fprintf(f, "\n\tindex %d", index);
+			fprintf(f, "\n\tindex %u", index);
 		}
 
 		if (tb[TCA_IPT_CNT]) {
-- 
1.9.1

^ permalink raw reply related

* [PATCH] net: qcom/emac: don't try to claim clocks on ACPI systems
From: Timur Tabi @ 2016-12-13 19:55 UTC (permalink / raw)
  To: David Miller, netdev, Christopher Covington, alokc

On ACPI systems, clocks are not available to drivers directly.  They are
handled exclusively by ACPI and/or firmware, so there is no clock driver.
Calls to clk_get() always fail, so we should not even attempt to claim
any clocks on ACPI systems.

Signed-off-by: Timur Tabi <timur@codeaurora.org>
---
 drivers/net/ethernet/qualcomm/emac/emac.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/emac/emac.c b/drivers/net/ethernet/qualcomm/emac/emac.c
index ae32f85..b1c1cdc 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac.c
@@ -627,11 +627,12 @@ static int emac_probe(struct platform_device *pdev)
 	if (ret)
 		goto err_undo_netdev;
 
-	/* initialize clocks */
-	ret = emac_clks_phase1_init(pdev, adpt);
-	if (ret) {
-		dev_err(&pdev->dev, "could not initialize clocks\n");
-		goto err_undo_netdev;
+	if (!has_acpi_companion(&pdev->dev)) {
+		ret = emac_clks_phase1_init(pdev, adpt);
+		if (ret) {
+			dev_err(&pdev->dev, "could not initialize clocks\n");
+			goto err_undo_netdev;
+		}
 	}
 
 	netdev->watchdog_timeo = EMAC_WATCHDOG_TIME;
@@ -655,11 +656,12 @@ static int emac_probe(struct platform_device *pdev)
 	if (ret)
 		goto err_undo_mdiobus;
 
-	/* enable clocks */
-	ret = emac_clks_phase2_init(pdev, adpt);
-	if (ret) {
-		dev_err(&pdev->dev, "could not initialize clocks\n");
-		goto err_undo_mdiobus;
+	if (!has_acpi_companion(&pdev->dev)) {
+		ret = emac_clks_phase2_init(pdev, adpt);
+		if (ret) {
+			dev_err(&pdev->dev, "could not initialize clocks\n");
+			goto err_undo_mdiobus;
+		}
 	}
 
 	emac_mac_reset(adpt);
-- 
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm
Technologies, Inc.  Qualcomm Technologies, Inc. is a member of the
Code Aurora Forum, a Linux Foundation Collaborative Project.

^ permalink raw reply related

* Re: Designing a safe RX-zero-copy Memory Model for Networking
From: John Fastabend @ 2016-12-13 20:08 UTC (permalink / raw)
  To: David Miller
  Cc: brouer, cl, rppt, netdev, linux-mm, willemdebruijn.kernel,
	bjorn.topel, magnus.karlsson, alexander.duyck, mgorman, tom,
	bblanco, tariqt, saeedm, jesse.brandeburg, METH, vyasevich
In-Reply-To: <20161213.145333.514056260418695987.davem@davemloft.net>

On 16-12-13 11:53 AM, David Miller wrote:
> From: John Fastabend <john.fastabend@gmail.com>
> Date: Tue, 13 Dec 2016 09:43:59 -0800
> 
>> What does "zero-copy send packet-pages to the application/socket that
>> requested this" mean? At the moment on x86 page-flipping appears to be
>> more expensive than memcpy (I can post some data shortly) and shared
>> memory was proposed and rejected for security reasons when we were
>> working on bifurcated driver.
> 
> The whole idea is that we map all the active RX ring pages into
> userspace from the start.
> 
> And just as Jesper's page pool work will avoid DMA map/unmap,
> it will also avoid changing the userspace mapping of the pages
> as well.
> 
> Thus avoiding the TLB/VM overhead altogether.
> 

I get this but it requires applications to be isolated. The pages from
a queue can not be shared between multiple applications in different
trust domains. And the application has to be cooperative meaning it
can't "look" at data that has not been marked by the stack as OK. In
these schemes we tend to end up with something like virtio/vhost or
af_packet.

Any ACLs/filtering/switching/headers need to be done in hardware or
the application trust boundaries are broken.

If the above can not be met then a copy is needed. What I am trying
to tease out is the above comment along with other statements like
this "can be done without HW filter features".

.John

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH net-next] netlink: revert broken, broken "2-clause nla_ok()"
From: David Miller @ 2016-12-13 19:55 UTC (permalink / raw)
  To: adobriyan; +Cc: netdev, johannes
In-Reply-To: <20161213193015.GA10610@avx2>

From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 13 Dec 2016 22:30:15 +0300

> Commit 4f7df337fe79bba1e4c2d525525d63b5ba186bbd
> "netlink: 2-clause nla_ok()" is BROKEN.
> 
> First clause tests if "->nla_len" could even be accessed at all,
> it can not possibly be omitted.
> 
> Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>

Applied, thanks.

^ permalink raw reply

* Re: Designing a safe RX-zero-copy Memory Model for Networking
From: David Miller @ 2016-12-13 19:53 UTC (permalink / raw)
  To: john.fastabend
  Cc: brouer, cl, rppt, netdev, linux-mm, willemdebruijn.kernel,
	bjorn.topel, magnus.karlsson, alexander.duyck, mgorman, tom,
	bblanco, tariqt, saeedm, jesse.brandeburg, METH, vyasevich
In-Reply-To: <5850335F.6090000@gmail.com>

From: John Fastabend <john.fastabend@gmail.com>
Date: Tue, 13 Dec 2016 09:43:59 -0800

> What does "zero-copy send packet-pages to the application/socket that
> requested this" mean? At the moment on x86 page-flipping appears to be
> more expensive than memcpy (I can post some data shortly) and shared
> memory was proposed and rejected for security reasons when we were
> working on bifurcated driver.

The whole idea is that we map all the active RX ring pages into
userspace from the start.

And just as Jesper's page pool work will avoid DMA map/unmap,
it will also avoid changing the userspace mapping of the pages
as well.

Thus avoiding the TLB/VM overhead altogether.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: bpf debug info
From: Alexei Starovoitov @ 2016-12-13 19:38 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: netdev@vger.kernel.org, Brenden Blanco, Thomas Graf, Wangnan,
	He Kuang, Kernel Team

On Tue, Nov 29, 2016 at 9:01 AM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>> >If I try to run samples/bpf/test_cls_bpf.sh the verifier will complain:
>> >R0=imm0,min_value=0,max_value=0 R1=pkt(id=0,off=0,r=42) R2=pkt_end
>> >112: (0f) r4 += r3
>> >113: (0f) r1 += r4
>> >114: (b7) r0 = 2
>> >115: (69) r2 = *(u16 *)(r1 +2)
>> >invalid access to packet, off=2 size=2, R1(id=3,off=0,r=0)
>> >
>> >Now multiply 115 * 8 and convert to hex. This is address 0x398 in llvm-objdump:
>> >; struct udphdr *udp = data + tp_off;
>> >      388:       r1 += r4
>> >      390:       r0 = 2
>> >; if (udp->dest == htons(DEFAULT_PKTGEN_UDP_PORT) ||
>> >      398:       r2 = *(u16 *)(r1 + 2)
>> >      3a0:       if r2 == 2304 goto 16
>> >
>> >Now it's clear which line of C code is causing the verifier to reject.
>> [...]
>>
>> Could llvm-objdump switch line numbering for bpf same way as verifier
>> output, so mapping step is not really needed?
>
> you mean that llvm-objdump to print 113,114,115 ?
> I guess it's doable. Will give it a try.

Hi Daniel,

your feature request turned out to be pretty straightforward
to implement. Please pull the latest llvm and rebuild llvm-objdump.
It will be printing instruction numbers instead of absolute addresses.
No "multiply 115 * 8 and convert to hex" steps necessary anymore.

Thanks

^ permalink raw reply

* Re: "virtio-net: enable multiqueue by default" in linux-next breaks networking on GCE
From: Theodore Ts'o @ 2016-12-13 19:44 UTC (permalink / raw)
  To: Wei Xu; +Cc: jasowang, netdev, mst, nhorman, davem
In-Reply-To: <bb997932-20d2-42f4-0f42-bd28ae151076@redhat.com>

Jason's patch fixed the issue, so I think we have the proper fix, but
to answer your questions:

On Wed, Dec 14, 2016 at 01:46:44AM +0800, Wei Xu wrote:
> 
> Q1:
> Which distribution are you using for the GCE instance?

The test appliance is based on Debian Jessie.

> Q2:
> Are you running xfs test as an embedded VM case, which means XFS test
> appliance is also a VM inside the GCE instance? Or the kernel is built
> for the instance itself?

No, GCE currently doesn't support running nested VM's (e.g., running
VM's inside GCE).  So the kernel is built for the instance itself.
The way the test appliance works is that it initially boots using the
Debian Jessie default kernel and then we kexec into the kernel under
test.

> Q3:
> Can this bug be reproduced for kvm-xfstests case? I'm trying to set up
> a local test bed if it makes sense.

You definitely can't do it out of the box -- you need to build the
image using "gen-image --networking", and then run "kvm-xfstests -N
shell" as root.  But the bug doesn't reproduce on kvm-xfstests, using
a 4.9 host kernel and linux-next guest kernel.

Cheers,

					- Ted

^ permalink raw reply

* Re: [iproute2 v3 net-next 0/8] Add support for vrf helper
From: Stephen Hemminger @ 2016-12-13 18:44 UTC (permalink / raw)
  To: David Ahern; +Cc: netdev
In-Reply-To: <1481503995-24825-1-git-send-email-dsa@cumulusnetworks.com>

On Sun, 11 Dec 2016 16:53:07 -0800
David Ahern <dsa@cumulusnetworks.com> wrote:

> This series adds support to iproute2 to run a command against a specific
> VRF. The user semantics are similar to 'ip netns'.
> 
> The 'ip vrf' subcommand supports 3 usages:
> 
> 1. Run a command against a given vrf:
>        ip vrf exec NAME CMD
> 
>    Uses the recently committed cgroup/sock BPF option. vrf directory
>    is added to cgroup2 mount. Individual vrfs are created under it. BPF
>    filter is attached to vrf/NAME cgroup2 to set sk_bound_dev_if to the
>    device index of the VRF. From there the current process (ip's pid) is
>    added to the cgroup.procs file and the given command is executed. In
>    doing so all AF_INET/AF_INET6 (ipv4/ipv6) sockets are automatically
>    bound to the VRF domain.
> 
>    The association is inherited parent to child allowing the command to
>    be a shell from which other commands are run relative to the VRF.
> 
> 2. Show the VRF a process is bound to:
>        ip vrf id [PID]
>    This command essentially looks at /proc/pid/cgroup for a "::/vrf/"
>    entry. If pid arg is not given current process id is used.
> 
> 3. Show process ids bound to a VRF
>        ip vrf pids NAME
>    This command dumps the file MNT/vrf/NAME/cgroup.procs since that file
>    shows the process ids in the particular vrf cgroup.
> 
> v3
> - bpf_prog_{at,de}tach changes as requested by Daniel
> - BPF macros added to bpf_util.h versus adding a new file as requested by Daniel
> 
> v2
> - updated subject of patch 3 to avoid spam filters on vger
> 
> David Ahern (8):
>   lib bpf: Add support for BPF_PROG_ATTACH and BPF_PROG_DETACH
>   bpf: export bpf_prog_load
>   bpf: Add BPF_ macros
>   move cmd_exec to lib utils
>   Add filesystem APIs to lib
>   change name_is_vrf to return index
>   libnetlink: Add variant of rtnl_talk that does not display RTNETLINK
>     answers error
>   Introduce ip vrf command
> 
>  include/bpf_util.h   | 186 +++++++++++++++++++++++++++++++++
>  include/libnetlink.h |   3 +
>  include/utils.h      |   4 +
>  ip/Makefile          |   3 +-
>  ip/ip.c              |   4 +-
>  ip/ip_common.h       |   4 +-
>  ip/iplink_vrf.c      |  29 ++++--
>  ip/ipnetns.c         |  34 ------
>  ip/ipvrf.c           | 289 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  lib/Makefile         |   2 +-
>  lib/bpf.c            |  61 +++++++----
>  lib/exec.c           |  41 ++++++++
>  lib/fs.c             | 143 +++++++++++++++++++++++++
>  lib/libnetlink.c     |  20 +++-
>  man/man8/ip-vrf.8    |  88 ++++++++++++++++
>  15 files changed, 841 insertions(+), 70 deletions(-)
>  create mode 100644 ip/ipvrf.c
>  create mode 100644 lib/exec.c
>  create mode 100644 lib/fs.c
>  create mode 100644 man/man8/ip-vrf.8
> 

Thanks, applied. Then I went and cleaned up the long lines and whitespace issues.

^ permalink raw reply

* Re: Designing a safe RX-zero-copy Memory Model for Networking
From: Hannes Frederic Sowa @ 2016-12-13 18:39 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, Christoph Lameter
  Cc: John Fastabend, Mike Rapoport, netdev@vger.kernel.org, linux-mm,
	Willem de Bruijn, Björn Töpel, Karlsson, Magnus,
	Alexander Duyck, Mel Gorman, Tom Herbert, Brenden Blanco,
	Tariq Toukan, Saeed Mahameed, Jesse Brandeburg, Kalman Meth,
	Vladislav Yasevich
In-Reply-To: <20161213171028.24dbf519@redhat.com>

On 13.12.2016 17:10, Jesper Dangaard Brouer wrote:
>> What is bad about RDMA is that it is a separate kernel subsystem.
>> What I would like to see is a deeper integration with the network
>> stack so that memory regions can be registered with a network socket
>> and work requests then can be submitted and processed that directly
>> read and write in these regions. The network stack should provide the
>> services that the hardware of the NIC does not support as usual.
> 
> Interesting.  So you even imagine sockets registering memory regions
> with the NIC.  If we had a proper NIC HW filter API across the drivers,
> to register the steering rule (like ibv_create_flow), this would be
> doable, but we don't (DPDK actually have an interesting proposal[1])

On a side note, this is what windows does with RIO ("registered I/O").
Maybe you want to look at the API to get some ideas: allocating and
pinning down memory in user space and registering that with sockets to
get zero-copy IO.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH iproute2 V2 1/2] tc: flower: Fix typo and style in flower man page
From: Stephen Hemminger @ 2016-12-13 18:17 UTC (permalink / raw)
  To: Roi Dayan; +Cc: netdev, Amir Vadai, Hadar Hen Zion
In-Reply-To: <1481632742-18020-2-git-send-email-roid@mellanox.com>

On Tue, 13 Dec 2016 14:39:01 +0200
Roi Dayan <roid@mellanox.com> wrote:

> Replace vlan_eth_type with vlan_ethtype.
> 
> Fixes: 745d91726006 ("tc: flower: Introduce vlan support")
> Signed-off-by: Roi Dayan <roid@mellanox.com>
> Reviewed-by: Hadar Hen Zion <hadarh@mellanox.com>

Both applied, thanks.

^ permalink raw reply

* Re: [PATCH iproute2 1/2] tc/cls_flower: Add dest UDP port to tunnel params
From: Stephen Hemminger @ 2016-12-13 18:17 UTC (permalink / raw)
  To: Hadar Hen Zion; +Cc: netdev, Or Gerlitz, Roi Dayan, Amir Vadai
In-Reply-To: <1481616467-769-2-git-send-email-hadarh@mellanox.com>

On Tue, 13 Dec 2016 10:07:46 +0200
Hadar Hen Zion <hadarh@mellanox.com> wrote:

> Enhance IP tunnel parameters by adding destination UDP port.
> 
> Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
> Reviewed-by: Roi Dayan <roid@mellanox.com>

Both applied, thanks.

^ permalink raw reply

* sctp: suspicious rcu_dereference_check() usage in sctp_epaddr_lookup_transport
From: Dmitry Vyukov @ 2016-12-13 18:07 UTC (permalink / raw)
  To: Vladislav Yasevich, Neil Horman, David Miller, linux-sctp, netdev,
	LKML, Eric Dumazet, Marcelo Ricardo Leitner
  Cc: syzkaller

Hello,

I am getting the following reports while running syzkaller fuzzer:

[ INFO: suspicious RCU usage. ]
4.9.0+ #85 Not tainted
-------------------------------
./include/linux/rhashtable.h:572 suspicious rcu_dereference_check() usage!

other info that might help us debug this:

rcu_scheduler_active = 1, debug_locks = 0
1 lock held by syz-executor1/18023:
 #0:  (sk_lock-AF_INET){+.+.+.}, at: [<     inline     >] lock_sock
include/net/sock.h:1454
 #0:  (sk_lock-AF_INET){+.+.+.}, at: [<ffffffff87bb3ccf>]
sctp_getsockopt+0x45f/0x6800 net/sctp/socket.c:6432

stack backtrace:
CPU: 2 PID: 18023 Comm: syz-executor1 Not tainted 4.9.0+ #85
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
Call Trace:
[<     inline     >] __dump_stack lib/dump_stack.c:15
[<        none        >] dump_stack+0x2ee/0x3ef lib/dump_stack.c:51
[<        none        >] lockdep_rcu_suspicious+0x139/0x180
kernel/locking/lockdep.c:4448
[<     inline     >] __rhashtable_lookup ./include/linux/rhashtable.h:572
[<     inline     >] rhltable_lookup ./include/linux/rhashtable.h:660
[<        none        >] sctp_epaddr_lookup_transport+0x641/0x930
net/sctp/input.c:946
[<        none        >] sctp_endpoint_lookup_assoc+0x83/0x120
net/sctp/endpointola.c:335
[<        none        >] sctp_addr_id2transport+0xaf/0x1e0 net/sctp/socket.c:241
[<        none        >] sctp_getsockopt_peer_addr_info+0x216/0x630
net/sctp/socket.c:4625
[<        none        >] sctp_getsockopt+0x2860/0x6800 net/sctp/socket.c:6500
[<        none        >] sock_common_getsockopt+0x9a/0xe0 net/core/sock.c:2685
[<     inline     >] SYSC_getsockopt net/socket.c:1819
[<        none        >] SyS_getsockopt+0x245/0x380 net/socket.c:1801
[<        none        >] entry_SYSCALL_64_fastpath+0x23/0xc6
arch/x86/entry/entry_64.S:203

On commit e7aa8c2eb11ba69b1b69099c3c7bd6be3087b0ba (Dec 12).

^ permalink raw reply

* Re: "virtio-net: enable multiqueue by default" in linux-next breaks networking on GCE
From: Wei Xu @ 2016-12-13 17:46 UTC (permalink / raw)
  To: Theodore Ts'o, jasowang; +Cc: netdev, mst, nhorman, davem
In-Reply-To: <20161212233343.q5xlv55rc5npqaqp@thunk.org>


On 2016年12月13日 07:33, Theodore Ts'o wrote:
> Hi,
>
> I was doing a last minute regression test of the ext4 tree before
> sending a pull request to Linus, which I do using gce-xfstests[1], and
> I found that using networking was broken on GCE on linux-next.  I was
> using next-20161209, and after bisecting things, I narrowed down the
> commit which is causing things to break to commit 449000102901:
> "virtio-net: enable multiqueue by default".  Reverting this commit on
> top of next-20161209 fixed the problem.
>
> [1] http://thunk.org/gce-xfstests
>
> You can reproduce the problem by building the kernel for Google
> Compute Engine --- I use a config such as this [2], and then try to
> boot a kernel on a VM.  The way I do this involves booting a test
> appliance and then kexec'ing into the kernel to be tested[3], using a
> 2cpu configuration.  (GCE machine type: n1-standard-2)
>
> [2] https://git.kernel.org/cgit/fs/ext2/xfstests-bld.git/tree/kernel-configs/ext4-x86_64-config-4.9
> [3] https://github.com/tytso/xfstests-bld/blob/master/Documentation/gce-xfstests.md
>
> You can then take a look at serial console using a command such as
> "gcloud compute instances get-serial-port-output <instance-name>", and
> you will get something like this (see attached).  The important bit is
> that the dhclient command is completely failing to be able to get a
> response from the network, from which I deduce that apparently that
> either networking send or receive or both seem to be badly affected by
> the commit in question.
>
> Please let me know if there's anything I can do to help you debug this
> further.

Hi Ted,
Just had a quick try on GCE, sorry for my stupid questions.

Q1:
Which distribution are you using for the GCE instance?

Q2:
Are you running xfs test as an embedded VM case, which means XFS test
appliance is also a VM inside the GCE instance? Or the kernel is built
for the instance itself?

Q3:
Can this bug be reproduced for kvm-xfstests case? I'm trying to set up
a local test bed if it makes sense.

>
> Cheers,
>
> 						- Ted
>
> Dec 11 23:53:20 xfstests-201612120451 kernel: [    0.000000] Linux version 4.9.0-rc8-ext4-06387-g03e5cbd (tytso@tytso-ssd) (gcc version 4.9.2 (Debian 4.9.2-10) ) #9 SMP Mon Dec 12 04:50:16 UTC 2016
> Dec 11 23:53:20 xfstests-201612120451 kernel: [    0.000000] Command line: root=/dev/sda1 ro console=ttyS0,38400n8 elevator=noop console=ttyS0  fstestcfg=4k fstestset=-g,quick fstestexc= fstestopt=aex fstesttyp=ext4 fstestapi=1.3
> Dec 11 23:53:20 xfstests-201612120451 kernel: [    0.000000] x86/fpu: Supporting XSAVE feature 0x001: 'x87 floating point registers'
> Dec 11 23:53:20 xfstests-201612120451 kernel: [    0.000000] x86/fpu: Supporting XSAVE feature 0x002: 'SSE registers'
> Dec 11 23:53:20 xfstests-201612120451 kernel: [    0.000000] x86/fpu: Supporting XSAVE feature 0x004: 'AVX registers'
> Dec 11 23:53:20 xfstests-201612120451 kernel: [    0.000000] x86/fpu: xstate_offset[2]:  576, xstate_sizes[2]:  256
> Dec 11 23:53:20 xfstests-201612120451 kernel: [    0.000000] x86/fpu: Enabled xstate features 0x7, context size is 832 bytes, using 'standard' format.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Started Load Kernel Modules.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Starting Apply Kernel Variables...
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Mounting Configuration File System...
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Mounting FUSE Control File System...
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Mounted FUSE Control File System.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Mounted Configuration File System.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Started Apply Kernel Variables.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Started Create Static Device Nodes in /dev.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Starting udev Kernel Device Manager...
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Started udev Kernel Device Manager.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Started udev Coldplug all Devices.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Starting udev Wait for Complete Device Initialization...
> Dec 11 23:53:20 xfstests-201612120451 systemd-fsck[1659]: xfstests-root: clean, 56268/655360 files, 357439/2620928 blocks
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Started File System Check on Root Device.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Starting Remount Root and Kernel File Systems...
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Started Remount Root and Kernel File Systems.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Started Various fixups to make systemd work better on Debian.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Starting Load/Save Random Seed...
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Starting Local File Systems (Pre).
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Reached target Local File Systems (Pre).
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Started Load/Save Random Seed.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Started udev Wait for Complete Device Initialization.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Starting Activation of LVM2 logical volumes...
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Starting Copy rules generated while the root was ro...
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Found device /dev/ttyS0.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Found device /dev/ttyS1.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Started Copy rules generated while the root was ro.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Found device /dev/ttyS2.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Found device /dev/ttyS3.
> Dec 11 23:53:20 xfstests-201612120451 systemd-udevd[2568]: could not open moddep file '/lib/modules/4.9.0-rc8-ext4-06387-g03e5cbd/modules.dep.bin'
> Dec 11 23:53:20 xfstests-201612120451 lvm[2579]: No volume groups found
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Started Activation of LVM2 logical volumes.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Starting Encrypted Volumes.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Reached target Encrypted Volumes.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Starting Activation of LVM2 logical volumes...
> Dec 11 23:53:20 xfstests-201612120451 lvm[2625]: No volume groups found
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Started Activation of LVM2 logical volumes.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Starting Monitoring of LVM2 mirrors, snapshots etc. using dmeventd or progress polling...
> Dec 11 23:53:20 xfstests-201612120451 lvm[2627]: No volume groups found
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Started Monitoring of LVM2 mirrors, snapshots etc. using dmeventd or progress polling.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Starting Local File Systems.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Reached target Local File Systems.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Starting Remote File Systems.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Reached target Remote File Systems.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Starting Trigger Flushing of Journal to Persistent Storage...
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Starting Create Volatile Files and Directories...
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Starting LSB: Generate ssh host keys if they do not exist...
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Starting LSB: Raise network interfaces....
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Started Trigger Flushing of Journal to Persistent Storage.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Started Create Volatile Files and Directories.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Started LSB: Generate ssh host keys if they do not exist.
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Starting Update UTMP about System Boot/Shutdown...
> Dec 11 23:53:20 xfstests-201612120451 systemd[1]: Started Update UTMP about System Boot/Shutdown.
> Dec 11 23:53:20 xfstests-201612120451 dhclient: Internet Systems Consortium DHCP Client 4.3.1
> Dec 11 23:53:20 xfstests-201612120451 dhclient: Copyright 2004-2014 Internet Systems Consortium.
> Dec 11 23:53:20 xfstests-201612120451 dhclient: All rights reserved.
> Dec 11 23:53:20 xfstests-201612120451 dhclient: For info, please visit https://www.isc.org/software/dhcp/
> Dec 11 23:53:20 xfstests-201612120451 dhclient:
> Dec 11 23:53:20 xfstests-201612120451 networking[2633]: Configuring network interfaces...Internet Systems Consortium DHCP Client 4.3.1
> Dec 11 23:53:20 xfstests-201612120451 networking[2633]: Copyright 2004-2014 Internet Systems Consortium.
> Dec 11 23:53:20 xfstests-201612120451 networking[2633]: All rights reserved.
> Dec 11 23:53:20 xfstests-201612120451 networking[2633]: For info, please visit https://www.isc.org/software/dhcp/
> Dec 11 23:53:20 xfstests-201612120451 dhclient: Listening on LPF/eth0/42:01:0a:f0:00:03
> Dec 11 23:53:20 xfstests-201612120451 dhclient: Sending on   LPF/eth0/42:01:0a:f0:00:03
> Dec 11 23:53:20 xfstests-201612120451 dhclient: Sending on   Socket/fallback
> Dec 11 23:53:20 xfstests-201612120451 dhclient: DHCPREQUEST on eth0 to 255.255.255.255 port 67
> Dec 11 23:53:20 xfstests-201612120451 networking[2633]: Listening on LPF/eth0/42:01:0a:f0:00:03
> Dec 11 23:53:20 xfstests-201612120451 networking[2633]: Sending on   LPF/eth0/42:01:0a:f0:00:03
> Dec 11 23:53:20 xfstests-201612120451 networking[2633]: Sending on   Socket/fallback
> Dec 11 23:53:20 xfstests-201612120451 networking[2633]: DHCPREQUEST on eth0 to 255.255.255.255 port 67
> Dec 11 23:53:20 xfstests-201612120451 dhclient: DHCPREQUEST on eth0 to 255.255.255.255 port 67
> Dec 11 23:53:20 xfstests-201612120451 networking[2633]: DHCPREQUEST on eth0 to 255.255.255.255 port 67
> Dec 11 23:53:20 xfstests-201612120451 dhclient: DHCPDISCOVER on eth0 to 255.255.255.255 port 67 interval 8
> Dec 11 23:53:20 xfstests-201612120451 networking[2633]: DHCPDISCOVER on eth0 to 255.255.255.255 port 67 interval 8
> Dec 11 23:53:20 xfstests-201612120451 dhclient: DHCPDISCOVER on eth0 to 255.255.255.255 port 67 interval 8
> Dec 11 23:53:20 xfstests-201612120451 networking[2633]: DHCP[^[[32m  OK  ^[[0m] DISCOVER on eth0 to 255.255.255.255 port 67 interval 8
> Dec 11 23:53:20 xfstests-201612120451 dhclient: DHCPDISCOVER on eth0 to 255.255.255.255 port 67 interval 13
> Dec 11 23:53:20 xfstests-201612120451 networking[2633]: DHCPDISCOVER on eth0 to 255.255.255.255 port 67 interval 13
> Dec 11 23:53:20 xfstests-201612120451 dhclient: DHCPDISCOVER on eth0 to 255.255.255.255 port 67 interval 17
> Dec 11 23:53:20 xfstests-201612120451 networking[2633]: DHCPDISCOVER on eth0 to 255.255.255.255 port 67 interval 17
> Dec 11 23:53:20 xfstests-201612120451 dhclient: DHCPDISCOVER on eth0 to 255.255.255.255 port 67 interval 15
> Dec 11 23:53:20 xfstests-201612120451 networking[2633]: DHCPDISCOVER on eth0 to 255.255.255.255 port 67 interval 15
> Dec 11 23:53:20 xfstests-201612120451 dhclient: No DHCPOFFERS received.
> Dec 11 23:53:20 xfstests-201612120451 dhclient: Trying recorded lease 10.240.0.3
> Dec 11 23:53:20 xfstests-201612120451 networking[2633]: No DHCPOFFERS received.
> Dec 11 23:53:20 xfstests-201612120451 networking[2633]: Trying recorded lease 10.240.0.3
> Dec 11 23:53:20 xfstests-201612120451 networking[2633]: connect: Network is unreachable
> Dec 11 23:53:20 xfstests-201612120451 logger: /etc/dhcp/dhclient-exit-hooks returned non-zero exit status 2
> Dec 11 23:53:20 xfstests-201612120451 dhclient: bound: renewal in 38598 seconds.
> Dec 11 23:53:20 xfstests-201612120451 networking[2633]: bound: renewal in 38598 seconds.
> Dec 11 23:53:20 xfstests-201612120451 networking[2633]: done.
>

^ permalink raw reply

* Re: [PATCH iproute2 -net-next] lwt: BPF support for LWT
From: Stephen Hemminger @ 2016-12-12 23:41 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: tgraf, alexei.starovoitov, netdev
In-Reply-To: <43d8d9ddc604f83e9abff9f998b9581210529c30.1481501217.git.daniel@iogearbox.net>

On Mon, 12 Dec 2016 01:14:35 +0100
Daniel Borkmann <daniel@iogearbox.net> wrote:

> +
> +static int lwt_parse_bpf(struct rtattr *rta, size_t len, int *argcp, char ***argvp,
> +			 int attr, const enum bpf_prog_type bpf_type)

Please break long lines like this.


> +
> +	/* argv is currently the first unparsed argument,
> +	 * but the lwt_parse_encap() caller will move to the next,
> +	 * so step back */
> +	*argcp = argc + 1;

iproute2 uses kernel comment style. 

I went ahead and fixed these.

^ permalink raw reply

* RE: [PATCH net-next 07/27] gianfar: remove use of VLAN_TAG_PRESENT
From: Claudiu Manoil @ 2016-12-13 12:09 UTC (permalink / raw)
  To: Michał Mirosław, netdev@vger.kernel.org
In-Reply-To: <244d34e8fb9a120fa79c40f06e9da7e10c1c0536.1481586602.git.mirq-linux@rere.qmqm.pl>

>-----Original Message-----
>From: Michał Mirosław [mailto:mirq-linux@rere.qmqm.pl]
>Sent: Tuesday, December 13, 2016 2:13 AM
>To: netdev@vger.kernel.org
>Cc: Claudiu Manoil <claudiu.manoil@freescale.com>
>Subject: [PATCH net-next 07/27] gianfar: remove use of VLAN_TAG_PRESENT
>
>Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
>---
> drivers/net/ethernet/freescale/gianfar_ethtool.c | 8 +++-----
> 1 file changed, 3 insertions(+), 5 deletions(-)
>
>diff --git a/drivers/net/ethernet/freescale/gianfar_ethtool.c
>b/drivers/net/ethernet/freescale/gianfar_ethtool.c
>index 56588f2..95fa647 100644
>--- a/drivers/net/ethernet/freescale/gianfar_ethtool.c
>+++ b/drivers/net/ethernet/freescale/gianfar_ethtool.c
>@@ -1155,11 +1155,9 @@ static int gfar_convert_to_filer(struct
>ethtool_rx_flow_spec *rule,
> 		prio = vlan_tci_prio(rule);
> 		prio_mask = vlan_tci_priom(rule);
>
>-		if (cfi == VLAN_TAG_PRESENT && cfi_mask ==
>VLAN_TAG_PRESENT) {
>-			vlan |= RQFPR_CFI;
>-			vlan_mask |= RQFPR_CFI;
>-		} else if (cfi != VLAN_TAG_PRESENT &&
>-			   cfi_mask == VLAN_TAG_PRESENT) {
>+		if (cfi_mask) {
>+			if (cfi)
>+				vlan |= RQFPR_CFI;
> 			vlan_mask |= RQFPR_CFI;
> 		}
> 	}

Reviewed-by: Claudiu Manoil <claudiu.manoil@nxp.com>

^ permalink raw reply

* Re: Designing a safe RX-zero-copy Memory Model for Networking
From: John Fastabend @ 2016-12-13 17:43 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, Christoph Lameter
  Cc: Mike Rapoport, netdev@vger.kernel.org, linux-mm, Willem de Bruijn,
	Björn Töpel, Karlsson, Magnus, Alexander Duyck,
	Mel Gorman, Tom Herbert, Brenden Blanco, Tariq Toukan,
	Saeed Mahameed, Jesse Brandeburg, Kalman Meth, Vladislav Yasevich
In-Reply-To: <20161213171028.24dbf519@redhat.com>

On 16-12-13 08:10 AM, Jesper Dangaard Brouer wrote:
> 
> On Mon, 12 Dec 2016 12:06:59 -0600 (CST) Christoph Lameter <cl@linux.com> wrote:
>> On Mon, 12 Dec 2016, Jesper Dangaard Brouer wrote:
>>
>>> Hmmm. If you can rely on hardware setup to give you steering and
>>> dedicated access to the RX rings.  In those cases, I guess, the "push"
>>> model could be a more direct API approach.  
>>
>> If the hardware does not support steering then one should be able to
>> provide those services in software.
> 
> This is the early demux problem.  With the push-mode of registering
> memory, you need hardware steering support, for zero-copy support, as
> the software step happens after the DMA engine has written into the memory.
> 
> My model pre-VMA map all the pages in the RX ring (if zero-copy gets
> enabled, by a single user).  The software step can filter and zero-copy
> send packet-pages to the application/socket that requested this. The

What does "zero-copy send packet-pages to the application/socket that
requested this" mean? At the moment on x86 page-flipping appears to be
more expensive than memcpy (I can post some data shortly) and shared
memory was proposed and rejected for security reasons when we were
working on bifurcated driver.

> disadvantage is all zero-copy application need to share this VMA
> mapping.  This is solved by configuring HW filters into a RX-queue, and
> then only attach your zero-copy application to that queue.
> 
> 
>>> I was shooting for a model that worked without hardware support.
>>> And then transparently benefit from HW support by configuring a HW
>>> filter into a specific RX queue and attaching/using to that queue.  
>>
>> The discussion here is a bit amusing since these issues have been
>> resolved a long time ago with the design of the RDMA subsystem. Zero
>> copy is already in wide use. Memory registration is used to pin down
>> memory areas. Work requests can be filed with the RDMA subsystem that
>> then send and receive packets from the registered memory regions.
>> This is not strictly remote memory access but this is a basic mode of
>> operations supported  by the RDMA subsystem. The mlx5 driver quoted
>> here supports all of that.
> 
> I hear what you are saying.  I will look into a push-model, as it might
> be a better solution.
>  I will read up on RDMA + verbs and learn more about their API model.  I
> even plan to write a small sample program to get a feeling for the API,
> and maybe we can use that as a baseline for the performance target we
> can obtain on the same HW. (Thanks to Björn for already giving me some
> pointer here)
> 
> 
>> What is bad about RDMA is that it is a separate kernel subsystem.
>> What I would like to see is a deeper integration with the network
>> stack so that memory regions can be registered with a network socket
>> and work requests then can be submitted and processed that directly
>> read and write in these regions. The network stack should provide the
>> services that the hardware of the NIC does not support as usual.
> 
> Interesting.  So you even imagine sockets registering memory regions
> with the NIC.  If we had a proper NIC HW filter API across the drivers,
> to register the steering rule (like ibv_create_flow), this would be
> doable, but we don't (DPDK actually have an interesting proposal[1])
> 

Note rte_flow is in the same family of APIs as the proposed Flow API
that was rejected as well.  The features in Flow API that are not
included in the rte_flow proposal have logical extensions to support
them. In kernel we have 'tc' and multiple vendors support cls_flower
and cls_tc which offer a subset of the functionality in the DPDK
implementation.

Are you suggesting 'tc' is not a proper NIC HW filter API?

>  
>> The RX/TX ring in user space should be an additional mode of
>> operation of the socket layer. Once that is in place the "Remote
>> memory access" can be trivially implemented on top of that and the
>> ugly RDMA sidecar subsystem can go away.
>  
> I cannot follow that 100%, but I guess you are saying we also need a
> more efficient mode of handing over pages/packet to userspace (than
> going through the normal socket API calls).
> 
> 
> Appreciate your input, it challenged my thinking.
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH] kcm: fix spelling mistake in Kconfig, "connectons"
From: Colin Ian King @ 2016-12-13 17:32 UTC (permalink / raw)
  To: David S . Miller, Tom Herbert, netdev; +Cc: linux-kernel
In-Reply-To: <20161213173025.24331-1-colin.king@canonical.com>

On 13/12/16 17:30, Colin King wrote:
> From: Colin Ian King <colin.king@canonical.com>
> 
> Trivial fix to spelling mistake "connectons" to "connections" in
> Kconfig text.
> 
> Signed-off-by: Colin Ian King <colin.king@canonical.com>
> ---
>  net/kcm/Kconfig | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig
> index 87fca36..23b01e1 100644
> --- a/net/kcm/Kconfig
> +++ b/net/kcm/Kconfig
> @@ -7,5 +7,5 @@ config AF_KCM
>  	---help---
>  	  KCM (Kernel Connection Multiplexor) sockets provide a method
>  	  for multiplexing messages of a message based application
> -	  protocol over kernel connectons (e.g. TCP connections).
> +	  protocol over kernel connections (e.g. TCP connections).
>  
> 
Oops, ignore that, I was working on the wrong tree.

^ permalink raw reply

* [PATCH] kcm: fix spelling mistake in Kconfig, "connectons"
From: Colin King @ 2016-12-13 17:30 UTC (permalink / raw)
  To: David S . Miller, Tom Herbert, netdev; +Cc: linux-kernel

From: Colin Ian King <colin.king@canonical.com>

Trivial fix to spelling mistake "connectons" to "connections" in
Kconfig text.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
 net/kcm/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig
index 87fca36..23b01e1 100644
--- a/net/kcm/Kconfig
+++ b/net/kcm/Kconfig
@@ -7,5 +7,5 @@ config AF_KCM
 	---help---
 	  KCM (Kernel Connection Multiplexor) sockets provide a method
 	  for multiplexing messages of a message based application
-	  protocol over kernel connectons (e.g. TCP connections).
+	  protocol over kernel connections (e.g. TCP connections).
 
-- 
2.10.2

^ permalink raw reply related

* [PATCH net-next] netlink: revert broken, broken "2-clause nla_ok()"
From: Alexey Dobriyan @ 2016-12-13 19:30 UTC (permalink / raw)
  To: davem; +Cc: netdev, johannes
In-Reply-To: <1480950553.31788.40.camel@sipsolutions.net>

Commit 4f7df337fe79bba1e4c2d525525d63b5ba186bbd
"netlink: 2-clause nla_ok()" is BROKEN.

First clause tests if "->nla_len" could even be accessed at all,
it can not possibly be omitted.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---

 include/net/netlink.h |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -698,7 +698,8 @@ static inline int nla_len(const struct nlattr *nla)
  */
 static inline int nla_ok(const struct nlattr *nla, int remaining)
 {
-	return nla->nla_len >= sizeof(*nla) &&
+	return remaining >= (int) sizeof(*nla) &&
+	       nla->nla_len >= sizeof(*nla) &&
 	       nla->nla_len <= remaining;
 }
 

^ permalink raw reply

* Re: [PATCH] net: mvpp2: fix dma unmapping of TX buffers for fragments
From: Marcin Wojtas @ 2016-12-13 17:03 UTC (permalink / raw)
  To: Thomas Petazzoni
  Cc: David S . Miller, netdev, linux-arm-kernel@lists.infradead.org,
	Jason Cooper, Andrew Lunn, Sebastian Hesselbarth, Gregory Clement,
	Stefan Chulski, Nadav Haklai, Hanna Hawa, Yehuda Yitschak,
	Raphael G, stable@vger.kernel.org
In-Reply-To: <1481647995-7213-1-git-send-email-thomas.petazzoni@free-electrons.com>

Hi Thomas,

Reviewed-by: Marcin Wojtas <mw@semihalf.com>

Best regards,
Marcin

2016-12-13 17:53 GMT+01:00 Thomas Petazzoni
<thomas.petazzoni@free-electrons.com>:
> Since commit 71ce391dfb784 ("net: mvpp2: enable proper per-CPU TX
> buffers unmapping"), we are not correctly DMA unmapping TX buffers for
> fragments.
>
> Indeed, the mvpp2_txq_inc_put() function only stores in the
> txq_cpu->tx_buffs[] array the physical address of the buffer to be
> DMA-unmapped when skb != NULL. In addition, when DMA-unmapping, we use
> skb_headlen(skb) to get the size to be unmapped. Both of these work fine
> for TX descriptors that are associated directly to a SKB, but not the
> ones that are used for fragments, with a NULL pointer as skb:
>
>  - We have a NULL physical address when calling DMA unmap
>  - skb_headlen(skb) crashes because skb is NULL
>
> This causes random crashes when fragments are used.
>
> To solve this problem, this commit:
>
>  - Stores the physical address of the buffer to be unmapped
>    unconditionally, regardless of whether it is tied to a SKB or not.
>
>  - Adds a txq_cpu->tx_data_size[] array to store the size of the DMA
>    buffer to be unmapped upon TX completion.
>
> Fixes: 71ce391dfb784 ("net: mvpp2: enable proper per-CPU TX buffers unmapping")
> Reported-by: Raphael G <raphael.glon@corp.ovh.com>
> Cc: Raphael G <raphael.glon@corp.ovh.com>
> Cc: stable@vger.kernel.org
> Signed-off-by: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
> ---
>  drivers/net/ethernet/marvell/mvpp2.c | 20 ++++++++++++++++----
>  1 file changed, 16 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
> index 1026c45..d168b13 100644
> --- a/drivers/net/ethernet/marvell/mvpp2.c
> +++ b/drivers/net/ethernet/marvell/mvpp2.c
> @@ -791,6 +791,8 @@ struct mvpp2_txq_pcpu {
>         /* Array of transmitted buffers' physical addresses */
>         dma_addr_t *tx_buffs;
>
> +       size_t *tx_data_size;
> +
>         /* Index of last TX DMA descriptor that was inserted */
>         int txq_put_index;
>
> @@ -980,9 +982,10 @@ static void mvpp2_txq_inc_put(struct mvpp2_txq_pcpu *txq_pcpu,
>                               struct mvpp2_tx_desc *tx_desc)
>  {
>         txq_pcpu->tx_skb[txq_pcpu->txq_put_index] = skb;
> -       if (skb)
> -               txq_pcpu->tx_buffs[txq_pcpu->txq_put_index] =
> -                                                        tx_desc->buf_phys_addr;
> +       txq_pcpu->tx_data_size[txq_pcpu->txq_put_index] =
> +               tx_desc->data_size;
> +       txq_pcpu->tx_buffs[txq_pcpu->txq_put_index] =
> +               tx_desc->buf_phys_addr;
>         txq_pcpu->txq_put_index++;
>         if (txq_pcpu->txq_put_index == txq_pcpu->size)
>                 txq_pcpu->txq_put_index = 0;
> @@ -4404,11 +4407,13 @@ static void mvpp2_txq_bufs_free(struct mvpp2_port *port,
>                 dma_addr_t buf_phys_addr =
>                                     txq_pcpu->tx_buffs[txq_pcpu->txq_get_index];
>                 struct sk_buff *skb = txq_pcpu->tx_skb[txq_pcpu->txq_get_index];
> +               size_t data_size =
> +                       txq_pcpu->tx_data_size[txq_pcpu->txq_get_index];
>
>                 mvpp2_txq_inc_get(txq_pcpu);
>
>                 dma_unmap_single(port->dev->dev.parent, buf_phys_addr,
> -                                skb_headlen(skb), DMA_TO_DEVICE);
> +                                data_size, DMA_TO_DEVICE);
>                 if (!skb)
>                         continue;
>                 dev_kfree_skb_any(skb);
> @@ -4662,6 +4667,11 @@ static int mvpp2_txq_init(struct mvpp2_port *port,
>                 if (!txq_pcpu->tx_buffs)
>                         goto error;
>
> +               txq_pcpu->tx_data_size = kmalloc(txq_pcpu->size *
> +                                                sizeof(size_t), GFP_KERNEL);
> +               if (!txq_pcpu->tx_data_size)
> +                       goto error;
> +
>                 txq_pcpu->count = 0;
>                 txq_pcpu->reserved_num = 0;
>                 txq_pcpu->txq_put_index = 0;
> @@ -4675,6 +4685,7 @@ static int mvpp2_txq_init(struct mvpp2_port *port,
>                 txq_pcpu = per_cpu_ptr(txq->pcpu, cpu);
>                 kfree(txq_pcpu->tx_skb);
>                 kfree(txq_pcpu->tx_buffs);
> +               kfree(txq_pcpu->tx_data_size);
>         }
>
>         dma_free_coherent(port->dev->dev.parent,
> @@ -4695,6 +4706,7 @@ static void mvpp2_txq_deinit(struct mvpp2_port *port,
>                 txq_pcpu = per_cpu_ptr(txq->pcpu, cpu);
>                 kfree(txq_pcpu->tx_skb);
>                 kfree(txq_pcpu->tx_buffs);
> +               kfree(txq_pcpu->tx_data_size);
>         }
>
>         if (txq->descs)
> --
> 2.7.4
>

^ permalink raw reply

* Re: [PATCH] ARM: add cmpxchg64 helper for ARMv7-M
From: Russell King - ARM Linux @ 2016-12-13 16:58 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: Arnd Bergmann, netdev, linux-kernel, coreteam, netfilter-devel,
	Paul E. McKenney, David S. Miller, linux-arm-kernel
In-Reply-To: <20161210123234.GA5468@salvia>

On Sat, Dec 10, 2016 at 01:32:34PM +0100, Pablo Neira Ayuso wrote:
> Hi Arnd,
> 
> On Sat, Dec 10, 2016 at 11:36:34AM +0100, Arnd Bergmann wrote:
> > A change to the netfilter code in net-next introduced the first caller of
> > cmpxchg64 that can get built on ARMv7-M, leading to an error from the
> > assembler that points out the lack of 64-bit atomics on this architecture:
> > 
> > /tmp/ccMe7djj.s: Assembler messages:
> > /tmp/ccMe7djj.s:367: Error: selected processor does not support `ldrexd r0,r1,[lr]' in Thumb mode
> > /tmp/ccMe7djj.s:371: Error: selected processor does not support `strexd ip,r2,r3,[lr]' in Thumb mode
> > /tmp/ccMe7djj.s:389: Error: selected processor does not support `ldrexd r8,r9,[r7]' in Thumb mode
> > /tmp/ccMe7djj.s:393: Error: selected processor does not support `strexd lr,r0,r1,[r7]' in Thumb mode
> > scripts/Makefile.build:299: recipe for target 'net/netfilter/nft_counter.o' failed
> > 
> > This makes ARMv7-M use the same emulation from asm-generic/cmpxchg-local.h
> > that we use on architectures earlier than ARMv6K, to fix the build. The
> > 32-bit atomics are available on ARMv7-M and we keep using them there.
> > This ARM specific change is probably something we should do regardless
> > of the netfilter code.
> > 
> > However, looking at the new nft_counter_reset() function in nft_counter.c,
> > this looks incorrect to me not just on ARMv7-M but also on other
> > architectures, with at least the following possible race:
> 
> Right, Eric Dumazet already spotted this problem. I'm preparing a
> patch that doesn't require cmpxchg64(). Will keep you on Cc. Thanks.

Please keep me on the Cc as well so I know what's happening, thanks.

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.

^ permalink raw reply

* Re: [PATCH V2 18/22] bnxt_re: Support for DCB
From: Jason Gunthorpe @ 2016-12-13 16:56 UTC (permalink / raw)
  To: Selvin Xavier
  Cc: Or Gerlitz, Doug Ledford,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	Linux Netdev List, Eddie Wai, Devesh Sharma, Somnath Kotur,
	Sriharsha Basavapatna
In-Reply-To: <CA+sbYW1irBd0cTqJJSGJWRbBi-iFzvX3JpoTfF_daU47EqNtAg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On Tue, Dec 13, 2016 at 11:55:55AM +0530, Selvin Xavier wrote:

> v1 eth_type is not defined. All vendor drivers have their own definition.

Send a cleanup patch?

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: double free issue, mvpp2 driver, armada375 modules
From: Thomas Petazzoni @ 2016-12-13 16:55 UTC (permalink / raw)
  To: Raphael G
  Cc: linux-kernel, linux-arm-kernel, netdev, marcin wojtas, davem,
	Grégory Clement
In-Reply-To: <57272206.6070703@corp.ovh.com>

Hello,

On Mon, 2 May 2016 11:46:46 +0200, Raphael G wrote:

> enclosed the kernel panic we obtain after boot with a slightly patched 
> upstream kernel (4.5.2).
> 
> (as well as the patchset applied to the upstream kernel, so that you can 
> know which code we are talking about. Too bad we cannot use the upstream 
> kernel but we had no choice about this + we're no experts so we rely on 
> provided patches, adapted to our bootloader and hardware for this)
> 
> Reproduce:
> boot on kernel on an armada375 module, connect to it, launch a top in 
> commandline
> 
> As seen with Marcin Wojtas, reverting commit 
> e864b4c7b184bde36fa6a02bb3190983d2f796f9 fixes this issue.
> 
> Reporting upstream so that you can decide what should be done next

Thanks for your report. I have finally submitted a fix for this issue.
You can find it at:

  http://lists.infradead.org/pipermail/linux-arm-kernel/2016-December/473989.html

If you have the time to test it and report back, it would be very
useful.

Thanks a lot!

Thomas
-- 
Thomas Petazzoni, CTO, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

^ permalink raw reply

* [PATCH] net: mvpp2: fix dma unmapping of TX buffers for fragments
From: Thomas Petazzoni @ 2016-12-13 16:53 UTC (permalink / raw)
  To: David S . Miller
  Cc: Thomas Petazzoni, Andrew Lunn, Yehuda Yitschak, Jason Cooper,
	Raphael G, netdev, Hanna Hawa, stable, Nadav Haklai,
	Gregory Clement, Stefan Chulski, Marcin Wojtas, linux-arm-kernel,
	Sebastian Hesselbarth

Since commit 71ce391dfb784 ("net: mvpp2: enable proper per-CPU TX
buffers unmapping"), we are not correctly DMA unmapping TX buffers for
fragments.

Indeed, the mvpp2_txq_inc_put() function only stores in the
txq_cpu->tx_buffs[] array the physical address of the buffer to be
DMA-unmapped when skb != NULL. In addition, when DMA-unmapping, we use
skb_headlen(skb) to get the size to be unmapped. Both of these work fine
for TX descriptors that are associated directly to a SKB, but not the
ones that are used for fragments, with a NULL pointer as skb:

 - We have a NULL physical address when calling DMA unmap
 - skb_headlen(skb) crashes because skb is NULL

This causes random crashes when fragments are used.

To solve this problem, this commit:

 - Stores the physical address of the buffer to be unmapped
   unconditionally, regardless of whether it is tied to a SKB or not.

 - Adds a txq_cpu->tx_data_size[] array to store the size of the DMA
   buffer to be unmapped upon TX completion.

Fixes: 71ce391dfb784 ("net: mvpp2: enable proper per-CPU TX buffers unmapping")
Reported-by: Raphael G <raphael.glon@corp.ovh.com>
Cc: Raphael G <raphael.glon@corp.ovh.com>
Cc: stable@vger.kernel.org
Signed-off-by: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
---
 drivers/net/ethernet/marvell/mvpp2.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
index 1026c45..d168b13 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -791,6 +791,8 @@ struct mvpp2_txq_pcpu {
 	/* Array of transmitted buffers' physical addresses */
 	dma_addr_t *tx_buffs;
 
+	size_t *tx_data_size;
+
 	/* Index of last TX DMA descriptor that was inserted */
 	int txq_put_index;
 
@@ -980,9 +982,10 @@ static void mvpp2_txq_inc_put(struct mvpp2_txq_pcpu *txq_pcpu,
 			      struct mvpp2_tx_desc *tx_desc)
 {
 	txq_pcpu->tx_skb[txq_pcpu->txq_put_index] = skb;
-	if (skb)
-		txq_pcpu->tx_buffs[txq_pcpu->txq_put_index] =
-							 tx_desc->buf_phys_addr;
+	txq_pcpu->tx_data_size[txq_pcpu->txq_put_index] =
+		tx_desc->data_size;
+	txq_pcpu->tx_buffs[txq_pcpu->txq_put_index] =
+		tx_desc->buf_phys_addr;
 	txq_pcpu->txq_put_index++;
 	if (txq_pcpu->txq_put_index == txq_pcpu->size)
 		txq_pcpu->txq_put_index = 0;
@@ -4404,11 +4407,13 @@ static void mvpp2_txq_bufs_free(struct mvpp2_port *port,
 		dma_addr_t buf_phys_addr =
 				    txq_pcpu->tx_buffs[txq_pcpu->txq_get_index];
 		struct sk_buff *skb = txq_pcpu->tx_skb[txq_pcpu->txq_get_index];
+		size_t data_size =
+			txq_pcpu->tx_data_size[txq_pcpu->txq_get_index];
 
 		mvpp2_txq_inc_get(txq_pcpu);
 
 		dma_unmap_single(port->dev->dev.parent, buf_phys_addr,
-				 skb_headlen(skb), DMA_TO_DEVICE);
+				 data_size, DMA_TO_DEVICE);
 		if (!skb)
 			continue;
 		dev_kfree_skb_any(skb);
@@ -4662,6 +4667,11 @@ static int mvpp2_txq_init(struct mvpp2_port *port,
 		if (!txq_pcpu->tx_buffs)
 			goto error;
 
+		txq_pcpu->tx_data_size = kmalloc(txq_pcpu->size *
+						 sizeof(size_t), GFP_KERNEL);
+		if (!txq_pcpu->tx_data_size)
+			goto error;
+
 		txq_pcpu->count = 0;
 		txq_pcpu->reserved_num = 0;
 		txq_pcpu->txq_put_index = 0;
@@ -4675,6 +4685,7 @@ static int mvpp2_txq_init(struct mvpp2_port *port,
 		txq_pcpu = per_cpu_ptr(txq->pcpu, cpu);
 		kfree(txq_pcpu->tx_skb);
 		kfree(txq_pcpu->tx_buffs);
+		kfree(txq_pcpu->tx_data_size);
 	}
 
 	dma_free_coherent(port->dev->dev.parent,
@@ -4695,6 +4706,7 @@ static void mvpp2_txq_deinit(struct mvpp2_port *port,
 		txq_pcpu = per_cpu_ptr(txq->pcpu, cpu);
 		kfree(txq_pcpu->tx_skb);
 		kfree(txq_pcpu->tx_buffs);
+		kfree(txq_pcpu->tx_data_size);
 	}
 
 	if (txq->descs)
-- 
2.7.4

^ permalink raw reply related

* Re: Designing a safe RX-zero-copy Memory Model for Networking
From: Christoph Lameter @ 2016-12-13 16:36 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: John Fastabend, Mike Rapoport, netdev@vger.kernel.org, linux-mm,
	Willem de Bruijn, Björn Töpel, Karlsson, Magnus,
	Alexander Duyck, Mel Gorman, Tom Herbert, Brenden Blanco,
	Tariq Toukan, Saeed Mahameed, Jesse Brandeburg, Kalman Meth,
	Vladislav Yasevich
In-Reply-To: <20161213171028.24dbf519@redhat.com>

[-- Attachment #1: Type: text/plain, Size: 3223 bytes --]

On Tue, 13 Dec 2016, Jesper Dangaard Brouer wrote:

> This is the early demux problem.  With the push-mode of registering
> memory, you need hardware steering support, for zero-copy support, as
> the software step happens after the DMA engine has written into the memory.

Right. But we could fall back to software. Transfer to a kernel buffer and
then move stuff over. Not much of an improvement but it will make things
work.

> > The discussion here is a bit amusing since these issues have been
> > resolved a long time ago with the design of the RDMA subsystem. Zero
> > copy is already in wide use. Memory registration is used to pin down
> > memory areas. Work requests can be filed with the RDMA subsystem that
> > then send and receive packets from the registered memory regions.
> > This is not strictly remote memory access but this is a basic mode of
> > operations supported  by the RDMA subsystem. The mlx5 driver quoted
> > here supports all of that.
>
> I hear what you are saying.  I will look into a push-model, as it might
> be a better solution.
>  I will read up on RDMA + verbs and learn more about their API model.  I
> even plan to write a small sample program to get a feeling for the API,
> and maybe we can use that as a baseline for the performance target we
> can obtain on the same HW. (Thanks to Björn for already giving me some
> pointers here)

Great.

> > What is bad about RDMA is that it is a separate kernel subsystem.
> > What I would like to see is a deeper integration with the network
> > stack so that memory regions can be registered with a network socket
> > and work requests then can be submitted and processed that directly
> > read and write in these regions. The network stack should provide the
> > services that the hardware of the NIC does not support as usual.
>
> Interesting.  So you even imagine sockets registering memory regions
> with the NIC.  If we had a proper NIC HW filter API across the drivers,
> to register the steering rule (like ibv_create_flow), this would be
> doable, but we don't (DPDK actually has an interesting proposal[1])

Well doing this would mean adding some features and that also would at
best allow general support for zero copy direct to user space with a
fallback to software if the hardware is missing some feature.

> > The RX/TX ring in user space should be an additional mode of
> > operation of the socket layer. Once that is in place the "Remote
> > memory access" can be trivially implemented on top of that and the
> > ugly RDMA sidecar subsystem can go away.
>
> I cannot follow that 100%, but I guess you are saying we also need a
> more efficient mode of handing over pages/packet to userspace (than
> going through the normal socket API calls).

A work request contains the user space address of the data to be sent
and/or received. The address must be in a registered memory region. This
is different from copying the packet into kernel data structures.

I think this can easily be generalized. We need support for registering
memory regions, submission of work requests and the processing of
completion requests. QP (queue-pair) processing is probably the basis for
the whole scheme that is used in multiple contexts these days.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox