Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next  v3 1/2] inet: add bound ports statistic
From: Stephen Hemminger @ 2018-05-07 18:43 UTC (permalink / raw)
  To: davem, gerrit, kuznet, yoshfuji
  Cc: netdev, dccp, Stephen Hemminger, Stephen Hemminger
In-Reply-To: <20180507184333.32688-1-sthemmin@microsoft.com>

This adds a number of bound ports which fixes socket summary
command.  The ss -s has been broken since changes to slab info
and this is one way to recover the missing value by adding a
field onto /proc/net/sockstat.

Since this is an informational value only, there is no need
for locking.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 include/net/inet_hashtables.h    |  3 +++
 include/net/inet_timewait_sock.h |  2 ++
 net/dccp/proto.c                 |  1 +
 net/ipv4/inet_hashtables.c       | 22 +++++++++++++++++++---
 net/ipv4/inet_timewait_sock.c    |  8 +++++---
 net/ipv4/proc.c                  |  5 +++--
 net/ipv4/tcp.c                   |  1 +
 7 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 9141e95529e7..b02524e2571e 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -103,6 +103,7 @@ static inline struct net *ib_net(struct inet_bind_bucket *ib)
 
 struct inet_bind_hashbucket {
 	spinlock_t		lock;
+	unsigned int		count;
 	struct hlist_head	chain;
 };
 
@@ -193,7 +194,9 @@ inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
 			struct inet_bind_hashbucket *head,
 			const unsigned short snum);
 void inet_bind_bucket_destroy(struct kmem_cache *cachep,
+			      struct inet_bind_hashbucket *head,
 			      struct inet_bind_bucket *tb);
+unsigned int inet_bind_bucket_count(const struct proto *prot);
 
 static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
 			       const u32 bhash_size)
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index c7be1ca8e562..4cdb8034ad80 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -87,7 +87,9 @@ static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
 void inet_twsk_free(struct inet_timewait_sock *tw);
 void inet_twsk_put(struct inet_timewait_sock *tw);
 
+struct inet_bind_hashbucket;
 void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
+			   struct inet_bind_hashbucket *head,
 			   struct inet_hashinfo *hashinfo);
 
 struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 84cd4e3fd01b..25f03e62cfea 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1208,6 +1208,7 @@ static int __init dccp_init(void)
 	for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
 		spin_lock_init(&dccp_hashinfo.bhash[i].lock);
 		INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
+		dccp_hashinfo.bhash[i].count = 0;
 	}
 
 	rc = dccp_mib_init();
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 31ff46daae97..8ba6b17d95d5 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -58,6 +58,18 @@ static u32 sk_ehashfn(const struct sock *sk)
 			    sk->sk_daddr, sk->sk_dport);
 }
 
+/* Count how many any entries are in the bind hash table */
+unsigned int inet_bind_bucket_count(const struct proto *prot)
+{
+	const struct inet_hashinfo *hinfo = prot->h.hashinfo;
+	unsigned int i, ports = 0;
+
+	for (i = 0; i < hinfo->bhash_size; i++)
+		ports += hinfo->bhash[i].count;
+
+	return ports;
+}
+
 /*
  * Allocate and initialize a new local port bind bucket.
  * The bindhash mutex for snum's hash chain must be held here.
@@ -76,6 +88,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
 		tb->fastreuseport = 0;
 		INIT_HLIST_HEAD(&tb->owners);
 		hlist_add_head(&tb->node, &head->chain);
+		++head->count;
 	}
 	return tb;
 }
@@ -83,10 +96,13 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
 /*
  * Caller must hold hashbucket lock for this tb with local BH disabled
  */
-void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
+void inet_bind_bucket_destroy(struct kmem_cache *cachep,
+			      struct inet_bind_hashbucket *head,
+			      struct inet_bind_bucket *tb)
 {
 	if (hlist_empty(&tb->owners)) {
 		__hlist_del(&tb->node);
+		--head->count;
 		kmem_cache_free(cachep, tb);
 	}
 }
@@ -115,7 +131,7 @@ static void __inet_put_port(struct sock *sk)
 	__sk_del_bind_node(sk);
 	inet_csk(sk)->icsk_bind_hash = NULL;
 	inet_sk(sk)->inet_num = 0;
-	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, head, tb);
 	spin_unlock(&head->lock);
 }
 
@@ -756,7 +772,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 		inet_ehash_nolisten(sk, (struct sock *)tw);
 	}
 	if (tw)
-		inet_twsk_bind_unhash(tw, hinfo);
+		inet_twsk_bind_unhash(tw, head, hinfo);
 	spin_unlock(&head->lock);
 	if (tw)
 		inet_twsk_deschedule_put(tw);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 88c5069b5d20..dd888c52f958 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -26,7 +26,8 @@
  *	Returns 1 if caller should call inet_twsk_put() after lock release.
  */
 void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
-			  struct inet_hashinfo *hashinfo)
+			   struct inet_bind_hashbucket *head,
+			   struct inet_hashinfo *hashinfo)
 {
 	struct inet_bind_bucket *tb = tw->tw_tb;
 
@@ -35,7 +36,8 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
 
 	__hlist_del(&tw->tw_bind_node);
 	tw->tw_tb = NULL;
-	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep,
+				 head, tb);
 	__sock_put((struct sock *)tw);
 }
 
@@ -55,7 +57,7 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
 			hashinfo->bhash_size)];
 
 	spin_lock(&bhead->lock);
-	inet_twsk_bind_unhash(tw, hashinfo);
+	inet_twsk_bind_unhash(tw, bhead, hashinfo);
 	spin_unlock(&bhead->lock);
 
 	atomic_dec(&tw->tw_dr->tw_count);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 261b71d0ccc5..83bc9a0f2785 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -60,10 +60,11 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
 	sockets = proto_sockets_allocated_sum_positive(&tcp_prot);
 
 	socket_seq_show(seq);
-	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
+	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld ports %u\n",
 		   sock_prot_inuse_get(net, &tcp_prot), orphans,
 		   atomic_read(&net->ipv4.tcp_death_row.tw_count), sockets,
-		   proto_memory_allocated(&tcp_prot));
+		   proto_memory_allocated(&tcp_prot),
+		   inet_bind_bucket_count(&tcp_prot));
 	seq_printf(seq, "UDP: inuse %d mem %ld\n",
 		   sock_prot_inuse_get(net, &udp_prot),
 		   proto_memory_allocated(&udp_prot));
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 868ed74a76a8..f62e2fb02fdf 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3836,6 +3836,7 @@ void __init tcp_init(void)
 	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
 		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
 		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
+		tcp_hashinfo.bhash[i].count = 0;
 	}
 
 
-- 
2.17.0

^ permalink raw reply related

* [PATCH net-next v3 0/2] socket statistics for ss
From: Stephen Hemminger @ 2018-05-07 18:43 UTC (permalink / raw)
  To: davem, gerrit, kuznet, yoshfuji; +Cc: netdev, dccp, Stephen Hemminger

The eventual goal is to remove all accesses to slab statistics
by iproute2 ss command. This are preliminary steps to add two
statistics which are not working now, because of slab merging
and changes that were done years ago to allow for common inet_hash
tables.

Stephen Hemminger (2):
  inet: add bound ports statistic
  socket: keep track of the number of sockets allocated

v3
  - fix build :-(
  - add allocated sockets statistic

 include/net/inet_hashtables.h    |  3 +++
 include/net/inet_timewait_sock.h |  2 ++
 net/dccp/proto.c                 |  1 +
 net/ipv4/inet_hashtables.c       | 22 +++++++++++++++++++---
 net/ipv4/inet_timewait_sock.c    |  8 +++++---
 net/ipv4/proc.c                  |  5 +++--
 net/ipv4/tcp.c                   |  1 +
 net/socket.c                     | 21 +++++++++++++++++++--
 8 files changed, 53 insertions(+), 10 deletions(-)

-- 
2.17.0

^ permalink raw reply

* Re: [PATCH v2 net-next 1/4] umh: introduce fork_usermode_blob() helper
From: Luis R. Rodriguez @ 2018-05-07 18:39 UTC (permalink / raw)
  To: Alexei Starovoitov, Eric Paris, Matthew Auld, Josh Triplett,
	Kirill A. Shutemov, Joonas Lahtinen, Chris Wilson,
	Stephen Smalley, Eric W. Biederman, Mimi Zohar
  Cc: Luis R. Rodriguez, Alexei Starovoitov, davem, daniel, torvalds,
	gregkh, luto, netdev, linux-kernel, kernel-team, Al Viro,
	David Howells, Kees Cook, Andrew Morton, Dominik Brodowski,
	Hugh Dickins, Jann Horn, Jani Nikula, Joonas Lahtinen,
	Rodrigo Vivi, David Airlie, Rafael J. Wysocki,
	Linux FS Devel <linux-fsd
In-Reply-To: <20180505013710.2fb2k6e5uotd22kr@ast-mbp>

On Fri, May 04, 2018 at 06:37:11PM -0700, Alexei Starovoitov wrote:
> On Fri, May 04, 2018 at 07:56:43PM +0000, Luis R. Rodriguez wrote:
> > What a mighty short list of reviewers. Adding some more. My review below.
> > I'd appreciate a Cc on future versions of these patches.
> 
> sure.
> 
> > On Wed, May 02, 2018 at 09:36:01PM -0700, Alexei Starovoitov wrote:
> > > Introduce helper:
> > > int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
> > > struct umh_info {
> > >        struct file *pipe_to_umh;
> > >        struct file *pipe_from_umh;
> > >        pid_t pid;
> > > };
> > > 
> > > that GPLed kernel modules (signed or unsigned) can use it to execute part
> > > of its own data as swappable user mode process.
> > > 
> > > The kernel will do:
> > > - mount "tmpfs"
> > 
> > Actually its a *shared* vfsmount tmpfs for all umh blobs.
> 
> yep

OK just note CONFIG_TMPFS can be disabled, and likewise for CONFIG_SHMEM,
in which case tmpfs and shmem are replaced by a simple ramfs code, more
appropriate for systems without swap.

> > > +static struct vfsmount *umh_fs;
> > > +
> > > +static int init_tmpfs(void)
> > 
> > Please use umh_init_tmpfs(). 
> 
> ok
> 
> > Also see init/main.c do_basic_setup() which calls
> > usermodehelper_enable() prior to do_initcalls(). Now, fortunately TMPFS is only
> > bool, saving us from some races and we do call tmpfs's init first shmem_init():
> > 
> > static void __init do_basic_setup(void)
> > {
> > 	cpuset_init_smp();
> > 	shmem_init();
> > 	driver_init();
> > 	init_irq_proc();
> > 	do_ctors();
> > 	usermodehelper_enable();
> > 	do_initcalls();
> > }
> > 
> > But it also means we're enabling your new call call fork_usermode_blob() on
> > early init code even if we're not setup. Since this umh tmpfs vfsmount is
> > shared I'd say just call this init right before usermodehelper_enable()
> > on do_basic_setup().
> 
> Not following.
> Why init_tmpfs() should be called by __init function?

Nope, not at all, I was suggesting:

diff --git a/init/main.c b/init/main.c
index 0697284a28ee..67a48fbd96ca 100644
--- a/init/main.c
+++ b/init/main.c
@@ -973,6 +973,7 @@ static void __init do_basic_setup(void)
 	driver_init();
 	init_irq_proc();
 	do_ctors();
+	umh_init_tmpfs();
 	usermodehelper_enable();
 	do_initcalls();
 }

Mainly to avoid the locking situation Jann Horn noted, and also provide
proper kernel ordering expectations.

> Are you saying make 'static struct vfsmount *shm_mnt;'
> global and use it here? so no init_tmpfs() necessary?
> I think that can work, but feels that having two
> tmpfs mounts (one for shmem and one for umh) is cleaner.

No, but now that you mention it... if a shared vfsmount is not used the
/sys/kernel/mm/transparent_hugepage/shmem_enabled knob for using huge pages
would not be followed for umh modules. For the i915 driver this was *why*
they ended up adding shmem_file_setup_with_mnt(), they wanted huge pages to
support huge-gtt-pages. What is the rationale behind umh.c using it for
umh modules?

Users of shmem_kernel_file_setup() spawned later out of the desire to
*avoid* LSMs since it didn't make sense in their case as their inodes
are never exposed to userspace. Such is the case for ipc/shm.c and
security/keys/big_key.c. Refer to commit c7277090927a5 ("security: shmem:
implement kernel private shmem inodes") and then commit e1832f2923ec9
("ipc: use private shmem or hugetlbfs inodes for shm segments").

In this new umh usermode modules case we are:

  a) actually mapping data already extracted by the kernel somehow from
     a file somehow, presumably from /lib/modules/ path somewhere, but
     again this is not visible to umc.c, as it just gets called with:

	fork_usermode_blob(void *data, size_t len, struct umh_info *info)

  b) Creating the respective tmpfs file with shmem_file_setup_with_mnt()
     with our on our own shared struct vfsmount *umh_fs.

  c) Populating the file created and stuffing it with our data passed

  d) Calling do_execve_file() on it.

Its not clear to me why you used shmem_file_setup_with_mnt() in this case. What
are the gains? It would make sense to use shmem_kernel_file_setup() to avoid an
LSM check on step b) *but* only if we already had a proper LSM check on step
a).

I checked how you use fork_usermode_blob() in a) and in this case the kernel
module bpfilter would be loaded first, and for that we already have an LSM
check / hook for that module. From my review then, the magic done on your
second patch to stuff the userspace application into the module should be
irrelevant to us from an LSM perspective.

So, I can see a reason to use shmem_kernel_file_setup() but not I cannot
see a reason to be using      shmem_file_setup_with_mnt() at the moment.

I Cc'd tmpfs and LSM folks for further feedback.

> > > +{
> > > +	size_t offset = 0;
> > > +	int err;
> > > +
> > > +	do {
> > > +		unsigned int len = min_t(typeof(size), size, PAGE_SIZE);
> > > +		struct page *page;
> > > +		void *pgdata, *vaddr;
> > > +
> > > +		err = pagecache_write_begin(file, file->f_mapping, offset, len,
> > > +					    0, &page, &pgdata);
> > > +		if (err < 0)
> > > +			goto fail;
> > > +
> > > +		vaddr = kmap(page);
> > > +		memcpy(vaddr, data, len);
> > > +		kunmap(page);
> > > +
> > > +		err = pagecache_write_end(file, file->f_mapping, offset, len,
> > > +					  len, page, pgdata);
> > > +		if (err < 0)
> > > +			goto fail;
> > > +
> > > +		size -= len;
> > > +		data += len;
> > > +		offset += len;
> > > +	} while (size);
> > 
> > Character for character, this looks like a wonderful copy and paste from
> > i915_gem_object_create_from_data()'s own loop which does the same exact
> > thing. Perhaps its time for a helper on mm/filemap.c with an export so
> > if a bug is fixed in one place its fixed in both places.
> 
> yes, of course, but not right now.
> Once it all lands that will be the time to create common helper.
> It's not a good idea to mess with i915 in one patch set.

Either way works with me, so long as its done.

> > > +int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
> > 
> > Please use umh_fork_blob()
> 
> sorry, no. fork_usermode_blob() is much more descriptive name.

Prefixing new umh.c symbols with umh_*() makes it very clear this came from
kernel/umh.c functionality. I've been dealing with other places in the kernel
that have conflated their own use of kernel/umh.c functionality what they
expose in both code and documentation, and correcting this has taken a long
time. Best avoid future confusion and be consistent with new exported symbols
for umc.c functionality.

Also, descriptive as fork_usermode_blob() may seem there is a possible future
clash with a more generic call.

> > Also, can you extend lib/test_kmod.c with a test case for this with its own
> > demo and try to blow it up?
> 
> in what sense? bpfilter is the test and the driving component for it.

That's the thing, it shouldn't be.

We are adding *new* functionality here, I don't want to require enabling
bpfitler or its dependencies to test generic umh user module loading
functionality.  For instance, we have lib/test_module.c to help test generic
module loading, regardless of the functionality or requirements for other
modules.

> I'm expecting that folks who want to use this helper to do usb drivers
> in user space may want to extend this helper further, but that's their job.

I don't even want to test USB, I am just interesting in the *very* *very*
basic aspects of it. A simple lib/test_umh_module.c would do with a respective
check that its loaded, given this is a requirement from the API. It also helps
folks understand the core basic knobs without having to look at bfilter code.

If we're going to get this merged I'd be interested in doing ongoing testing
with 0day with  simple UMH module with and without CONFIG_SHMEM for instance
and check that it works in both cases.

Testing this may seem irrelevant to you but keep in mind we're also already
testing just general kernel module loading. As silly as it may seem, adding new
functionality and a respective test case lets us try to avoid regressions, and
provide small unit tests to help reproduce issues and corner case situations
you may not be considering.

  Luis

^ permalink raw reply related

* Re: [PATCH net-next v8 1/7] sched: Add Common Applications Kept Enhanced (cake) qdisc
From: Toke Høiland-Jørgensen @ 2018-05-07 18:37 UTC (permalink / raw)
  To: Cong Wang; +Cc: Linux Kernel Network Developers, Cake List
In-Reply-To: <CAM_iQpV4qThp2+_sKC7ZBgUCUu0BSZvOkT0tvf2ATdMFgu+YPg@mail.gmail.com>

Cong Wang <xiyou.wangcong@gmail.com> writes:

> On Fri, May 4, 2018 at 12:10 PM, Toke Høiland-Jørgensen <toke@toke.dk> wrote:
>> Thank you for the review! A few comments below, I'll fix the rest.
>>
>>> [...]
>>>
>>> So sch_cake doesn't accept normal tc filters? Is this intentional?
>>> If so, why?
>>
>> For two reasons:
>>
>> - The two-level scheduling used in CAKE (tins / diffserv classes, and
>>   flow hashing) does not map in an obvious way to the classification
>>   index of tc filters.
>
> Sounds like you need to extend struct tcf_result?

Well, the obvious way to support filters would be to have skb->priority
override the diffserv mapping if set, and have the filter classification
result select the queue within that tier. That would probably be doable,
but see below.

>> - No one has asked for it. We have done our best to accommodate the
>>   features people want in a home router qdisc directly in CAKE, and the
>>   ability to integrate tc filters has never been requested.
>
> It is not hard to integrate, basically you need to call
> tcf_classify(). Although it is not mandatory, it is odd to merge a
> qdisc doesn't work with existing tc filters (and actions too).

I looked at the fq_codel code to do this. Is it possible to support
filtering without implementing Qdisc_class_ops? If so, I'll give it a
shot; but implementing the class ops is more than I can commit to...

>>>> +static int cake_init(struct Qdisc *sch, struct nlattr *opt,
>>>> +                    struct netlink_ext_ack *extack)
>>>> +{
>>>> +       struct cake_sched_data *q = qdisc_priv(sch);
>>>> +       int i, j;
>>>> +
>>>> +       sch->limit = 10240;
>>>> +       q->tin_mode = CAKE_DIFFSERV_BESTEFFORT;
>>>> +       q->flow_mode  = CAKE_FLOW_TRIPLE;
>>>> +
>>>> +       q->rate_bps = 0; /* unlimited by default */
>>>> +
>>>> +       q->interval = 100000; /* 100ms default */
>>>> +       q->target   =   5000; /* 5ms: codel RFC argues
>>>> +                              * for 5 to 10% of interval
>>>> +                              */
>>>> +
>>>> +       q->cur_tin = 0;
>>>> +       q->cur_flow  = 0;
>>>> +
>>>> +       if (opt) {
>>>> +               int err = cake_change(sch, opt, extack);
>>>> +
>>>> +               if (err)
>>>> +                       return err;
>>>
>>>
>>> Not sure if you really want to reallocate q->tines below for this
>>> case.
>>
>> I'm not sure what you mean here? If there's an error we return it and
>> the qdisc is not created. If there's not, we allocate and on subsequent
>> changes cake_change() will be called directly, or? Can the init function
>> ever be called again during the lifetime of the qdisc?
>>
>
> In non-error case, you call cake_change() first and then allocate
> ->tins with kvzalloc() below. For me it looks like you don't need to
> allocate it again when ->tins!=NULL.

No, we definitely don't. It's just not clear to me how cake_init() could
ever be called with q->tins already allocated?

I can add a check in any case, though, I see that there is one in
fq_codel as well...

-Toke

^ permalink raw reply

* Re: [net-next PATCH v3 4/6] udp: Partially unroll handling of first segment and last segment
From: Eric Dumazet @ 2018-05-07 18:27 UTC (permalink / raw)
  To: Alexander Duyck, netdev, willemb, davem
In-Reply-To: <20180507180840.3486.67728.stgit@localhost.localdomain>



On 05/07/2018 11:08 AM, Alexander Duyck wrote:
> From: Alexander Duyck <alexander.h.duyck@intel.com>
> 
> This patch allows us to take care of unrolling the first segment and the
> last segment of the loop for processing the segmented skb. Part of the
> motivation for this is that it makes it easier to process the fact that the
> first fame and all of the frames in between should be mostly identical
> in terms of header data, and the last frame has differences in the length
> and partial checksum.
> 
> In addition I am dropping the header length calculation since we don't
> really need it for anything but the last frame and it can be easily
> obtained by just pulling the data_len and offset of tail from the transport
> header.
> 
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>

Reviewed-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply

* Re: [net-next PATCH v3 6/6] udp: Do not copy destructor if one is not present
From: Eric Dumazet @ 2018-05-07 18:25 UTC (permalink / raw)
  To: Alexander Duyck, netdev, willemb, davem
In-Reply-To: <20180507180852.3486.27748.stgit@localhost.localdomain>



On 05/07/2018 11:08 AM, Alexander Duyck wrote:
> From: Alexander Duyck <alexander.h.duyck@intel.com>
> 
> This patch makes it so that if a destructor is not present we avoid trying
> to update the skb socket or any reference counting that would be associated
> with the NULL socket and/or descriptor. By doing this we can support
> traffic coming from another namespace without any issues.
> 
> Acked-by: Willem de Bruijn <willemb@google.com>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>

Reviewed-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply

* Re: [PATCH bpf-next] xsk: fix 64-bit division
From: Randy Dunlap @ 2018-05-07 18:25 UTC (permalink / raw)
  To: Björn Töpel, ast, daniel, netdev
  Cc: Björn Töpel, Stephen Rothwell, magnus.karlsson,
	linux-next, linux-kernel
In-Reply-To: <20180507174350.3534-1-bjorn.topel@gmail.com>

On 05/07/2018 10:43 AM, Björn Töpel wrote:
> From: Björn Töpel <bjorn.topel@intel.com>
> 
> i386 builds report:
>   net/xdp/xdp_umem.o: In function `xdp_umem_reg':
>   xdp_umem.c:(.text+0x47e): undefined reference to `__udivdi3'
> 
> This fix uses div_u64 instead of the GCC built-in.
> 
> Fixes: c0c77d8fb787 ("xsk: add user memory registration support sockopt")
> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>

I don't know why the subject says xsk (instead of xdp), but anyway:

Reported-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>

Thanks.

> ---
>  net/xdp/xdp_umem.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
> index 881dfdefe235..2b47a1dd7c6c 100644
> --- a/net/xdp/xdp_umem.c
> +++ b/net/xdp/xdp_umem.c
> @@ -209,7 +209,7 @@ int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
>  	if ((addr + size) < addr)
>  		return -EINVAL;
>  
> -	nframes = size / frame_size;
> +	nframes = (unsigned int)div_u64(size, frame_size);
>  	if (nframes == 0 || nframes > UINT_MAX)
>  		return -EINVAL;
>  
> 


-- 
~Randy

^ permalink raw reply

* Re: [PATCH v2] net: dsa: drop some VLAs in switch.c
From: Florian Fainelli @ 2018-05-07 18:14 UTC (permalink / raw)
  To: Salvatore Mesoraca, Andrew Lunn
  Cc: linux-kernel, kernel-hardening, netdev, David S. Miller,
	Kees Cook, Vivien Didelot, David Laight
In-Reply-To: <1525706596-13601-1-git-send-email-s.mesoraca16@gmail.com>

On 05/07/2018 08:23 AM, Salvatore Mesoraca wrote:
> We avoid 2 VLAs by using a pre-allocated field in dsa_switch.
> We also try to avoid dynamic allocation whenever possible.
> 
> Link: http://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com
> Link: http://lkml.kernel.org/r/20180505185145.GB32630@lunn.ch
> 
> Signed-off-by: Salvatore Mesoraca <s.mesoraca16@gmail.com>
> ---
>  include/net/dsa.h |  3 +++
>  net/dsa/dsa2.c    | 14 ++++++++++++++
>  net/dsa/switch.c  | 22 ++++++++++------------
>  3 files changed, 27 insertions(+), 12 deletions(-)
> 
> diff --git a/include/net/dsa.h b/include/net/dsa.h
> index 60fb4ec..576791d 100644
> --- a/include/net/dsa.h
> +++ b/include/net/dsa.h
> @@ -256,6 +256,9 @@ struct dsa_switch {
>  	/* Number of switch port queues */
>  	unsigned int		num_tx_queues;
>  
> +	unsigned long		*bitmap;
> +	unsigned long		_bitmap;
> +
>  	/* Dynamically allocated ports, keep last */
>  	size_t num_ports;
>  	struct dsa_port ports[];
> diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
> index adf50fb..cebf35f0 100644
> --- a/net/dsa/dsa2.c
> +++ b/net/dsa/dsa2.c
> @@ -748,6 +748,20 @@ struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n)
>  	if (!ds)
>  		return NULL;
>  
> +	/* We avoid allocating memory outside dsa_switch
> +	 * if it is not needed.
> +	 */
> +	if (n <= sizeof(ds->_bitmap) * 8) {
> +		ds->bitmap = &ds->_bitmap;

Should not this be / BITS_PER_BYTE? If the sizeof(unsigned long) is <=
8, then you don't need to allocate it, otherwise, you have to.

I would actually just always dynamically allocate the bitmap, optimizing
for the case where we have fewer than or 8 ports is not worth IMHO.
-- 
Florian

^ permalink raw reply

* [net-next PATCH v3 6/6] udp: Do not copy destructor if one is not present
From: Alexander Duyck @ 2018-05-07 18:08 UTC (permalink / raw)
  To: netdev, willemb, davem
In-Reply-To: <20180507180310.3486.35994.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This patch makes it so that if a destructor is not present we avoid trying
to update the skb socket or any reference counting that would be associated
with the NULL socket and/or descriptor. By doing this we can support
traffic coming from another namespace without any issues.

Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 net/ipv4/udp_offload.c |   22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index d4f2daca0c33..ede2a7305b90 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -195,6 +195,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 	struct sk_buff *segs, *seg;
 	struct udphdr *uh;
 	unsigned int mss;
+	bool copy_dtor;
 	__sum16 check;
 	__be16 newlen;
 
@@ -205,12 +206,14 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 	skb_pull(gso_skb, sizeof(*uh));
 
 	/* clear destructor to avoid skb_segment assigning it to tail */
-	WARN_ON_ONCE(gso_skb->destructor != sock_wfree);
-	gso_skb->destructor = NULL;
+	copy_dtor = gso_skb->destructor == sock_wfree;
+	if (copy_dtor)
+		gso_skb->destructor = NULL;
 
 	segs = skb_segment(gso_skb, features);
 	if (unlikely(IS_ERR_OR_NULL(segs))) {
-		gso_skb->destructor = sock_wfree;
+		if (copy_dtor)
+			gso_skb->destructor = sock_wfree;
 		return segs;
 	}
 
@@ -229,9 +232,11 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 	check = csum16_add(csum16_sub(uh->check, uh->len), newlen);
 
 	for (;;) {
-		seg->destructor = sock_wfree;
-		seg->sk = sk;
-		sum_truesize += seg->truesize;
+		if (copy_dtor) {
+			seg->destructor = sock_wfree;
+			seg->sk = sk;
+			sum_truesize += seg->truesize;
+		}
 
 		if (!seg->next)
 			break;
@@ -263,8 +268,9 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 		uh->check = gso_make_checksum(seg, ~check) ? : CSUM_MANGLED_0;
 
 	/* update refcount for the packet */
-	refcount_add(sum_truesize - gso_skb->truesize, &sk->sk_wmem_alloc);
-
+	if (copy_dtor)
+		refcount_add(sum_truesize - gso_skb->truesize,
+			     &sk->sk_wmem_alloc);
 	return segs;
 }
 EXPORT_SYMBOL_GPL(__udp_gso_segment);

^ permalink raw reply related

* [net-next PATCH v3 5/6] udp: Add support for software checksum and GSO_PARTIAL with GSO offload
From: Alexander Duyck @ 2018-05-07 18:08 UTC (permalink / raw)
  To: netdev, willemb, davem
In-Reply-To: <20180507180310.3486.35994.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This patch adds support for a software provided checksum and GSO_PARTIAL
segmentation support. With this we can offload UDP segmentation on devices
that only have partial support for tunnels.

Since we are no longer needing the hardware checksum we can drop the checks
in the segmentation code that were verifying if it was present.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 net/ipv4/udp_offload.c |   29 +++++++++++++++++++----------
 net/ipv6/udp_offload.c |   11 +----------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index b15c78ac3f23..d4f2daca0c33 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -214,6 +214,13 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 		return segs;
 	}
 
+	/* GSO partial and frag_list segmentation only requires splitting
+	 * the frame into an MSS multiple and possibly a remainder, both
+	 * cases return a GSO skb. So update the mss now.
+	 */
+	if (skb_is_gso(segs))
+		mss *= skb_shinfo(segs)->gso_segs;
+
 	seg = segs;
 	uh = udp_hdr(seg);
 
@@ -232,6 +239,12 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 		uh->len = newlen;
 		uh->check = check;
 
+		if (seg->ip_summed == CHECKSUM_PARTIAL)
+			gso_reset_checksum(seg, ~check);
+		else
+			uh->check = gso_make_checksum(seg, ~check) ? :
+				    CSUM_MANGLED_0;
+
 		seg = seg->next;
 		uh = udp_hdr(seg);
 	}
@@ -244,6 +257,11 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 	uh->len = newlen;
 	uh->check = check;
 
+	if (seg->ip_summed == CHECKSUM_PARTIAL)
+		gso_reset_checksum(seg, ~check);
+	else
+		uh->check = gso_make_checksum(seg, ~check) ? : CSUM_MANGLED_0;
+
 	/* update refcount for the packet */
 	refcount_add(sum_truesize - gso_skb->truesize, &sk->sk_wmem_alloc);
 
@@ -251,15 +269,6 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 }
 EXPORT_SYMBOL_GPL(__udp_gso_segment);
 
-static struct sk_buff *__udp4_gso_segment(struct sk_buff *gso_skb,
-					  netdev_features_t features)
-{
-	if (!can_checksum_protocol(features, htons(ETH_P_IP)))
-		return ERR_PTR(-EIO);
-
-	return __udp_gso_segment(gso_skb, features);
-}
-
 static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
 					 netdev_features_t features)
 {
@@ -283,7 +292,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
 		goto out;
 
 	if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
-		return __udp4_gso_segment(skb, features);
+		return __udp_gso_segment(skb, features);
 
 	mss = skb_shinfo(skb)->gso_size;
 	if (unlikely(skb->len <= mss))
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 61e34f1d2fa2..03a2ff3fe1e6 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -17,15 +17,6 @@
 #include <net/ip6_checksum.h>
 #include "ip6_offload.h"
 
-static struct sk_buff *__udp6_gso_segment(struct sk_buff *gso_skb,
-					  netdev_features_t features)
-{
-	if (!can_checksum_protocol(features, htons(ETH_P_IPV6)))
-		return ERR_PTR(-EIO);
-
-	return __udp_gso_segment(gso_skb, features);
-}
-
 static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 					 netdev_features_t features)
 {
@@ -58,7 +49,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 			goto out;
 
 		if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
-			return __udp6_gso_segment(skb, features);
+			return __udp_gso_segment(skb, features);
 
 		/* Do software UFO. Complete and fill in the UDP checksum as HW cannot
 		 * do checksum of UDP packets sent as multiple IP fragments.

^ permalink raw reply related

* [net-next PATCH v3 4/6] udp: Partially unroll handling of first segment and last segment
From: Alexander Duyck @ 2018-05-07 18:08 UTC (permalink / raw)
  To: netdev, willemb, davem
In-Reply-To: <20180507180310.3486.35994.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This patch allows us to take care of unrolling the first segment and the
last segment of the loop for processing the segmented skb. Part of the
motivation for this is that it makes it easier to process the fact that the
first fame and all of the frames in between should be mostly identical
in terms of header data, and the last frame has differences in the length
and partial checksum.

In addition I am dropping the header length calculation since we don't
really need it for anything but the last frame and it can be easily
obtained by just pulling the data_len and offset of tail from the transport
header.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 net/ipv4/udp_offload.c |   33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 92c182e99ddc..b15c78ac3f23 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -193,7 +193,6 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 	struct sock *sk = gso_skb->sk;
 	unsigned int sum_truesize = 0;
 	struct sk_buff *segs, *seg;
-	unsigned int hdrlen;
 	struct udphdr *uh;
 	unsigned int mss;
 	__sum16 check;
@@ -203,7 +202,6 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 	if (gso_skb->len <= sizeof(*uh) + mss)
 		return ERR_PTR(-EINVAL);
 
-	hdrlen = gso_skb->data - skb_mac_header(gso_skb);
 	skb_pull(gso_skb, sizeof(*uh));
 
 	/* clear destructor to avoid skb_segment assigning it to tail */
@@ -216,30 +214,37 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 		return segs;
 	}
 
-	uh = udp_hdr(segs);
+	seg = segs;
+	uh = udp_hdr(seg);
 
 	/* compute checksum adjustment based on old length versus new */
 	newlen = htons(sizeof(*uh) + mss);
 	check = csum16_add(csum16_sub(uh->check, uh->len), newlen);
 
-	for (seg = segs; seg; seg = seg->next) {
-		uh = udp_hdr(seg);
+	for (;;) {
+		seg->destructor = sock_wfree;
+		seg->sk = sk;
+		sum_truesize += seg->truesize;
 
-		/* last packet can be partial gso_size */
-		if (!seg->next) {
-			newlen = htons(seg->len - hdrlen);
-			check = csum16_add(csum16_sub(uh->check, uh->len),
-					   newlen);
-		}
+		if (!seg->next)
+			break;
 
 		uh->len = newlen;
 		uh->check = check;
 
-		seg->destructor = sock_wfree;
-		seg->sk = sk;
-		sum_truesize += seg->truesize;
+		seg = seg->next;
+		uh = udp_hdr(seg);
 	}
 
+	/* last packet can be partial gso_size, account for that in checksum */
+	newlen = htons(skb_tail_pointer(seg) - skb_transport_header(seg) +
+		       seg->data_len);
+	check = csum16_add(csum16_sub(uh->check, uh->len), newlen);
+
+	uh->len = newlen;
+	uh->check = check;
+
+	/* update refcount for the packet */
 	refcount_add(sum_truesize - gso_skb->truesize, &sk->sk_wmem_alloc);
 
 	return segs;

^ permalink raw reply related

* [net-next PATCH v3 3/6] udp: Do not pass checksum as a parameter to GSO segmentation
From: Alexander Duyck @ 2018-05-07 18:08 UTC (permalink / raw)
  To: netdev, willemb, davem
In-Reply-To: <20180507180310.3486.35994.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

This patch is meant to allow us to avoid having to recompute the checksum
from scratch and have it passed as a parameter.

Instead of taking that approach we can take advantage of the fact that the
length that was used to compute the existing checksum is included in the
UDP header.

Finally to avoid the need to invert the result we can just call csum16_add
and csum16_sub directly. By doing this we can avoid a number of
instructions in the loop that is handling segmentation.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 include/net/udp.h      |    3 +--
 net/ipv4/udp_offload.c |   32 ++++++++++++++++++--------------
 net/ipv6/udp_offload.c |    7 +------
 3 files changed, 20 insertions(+), 22 deletions(-)

diff --git a/include/net/udp.h b/include/net/udp.h
index 8bd83b044ecd..9289b6425032 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -175,8 +175,7 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
 int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup);
 
 struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
-				  netdev_features_t features,
-				  __sum16 check);
+				  netdev_features_t features);
 
 static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
 {
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index c1afcd2f1a76..92c182e99ddc 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -188,8 +188,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
 EXPORT_SYMBOL(skb_udp_tunnel_segment);
 
 struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
-				  netdev_features_t features,
-				  __sum16 check)
+				  netdev_features_t features)
 {
 	struct sock *sk = gso_skb->sk;
 	unsigned int sum_truesize = 0;
@@ -197,6 +196,8 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 	unsigned int hdrlen;
 	struct udphdr *uh;
 	unsigned int mss;
+	__sum16 check;
+	__be16 newlen;
 
 	mss = skb_shinfo(gso_skb)->gso_size;
 	if (gso_skb->len <= sizeof(*uh) + mss)
@@ -215,17 +216,25 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 		return segs;
 	}
 
+	uh = udp_hdr(segs);
+
+	/* compute checksum adjustment based on old length versus new */
+	newlen = htons(sizeof(*uh) + mss);
+	check = csum16_add(csum16_sub(uh->check, uh->len), newlen);
+
 	for (seg = segs; seg; seg = seg->next) {
 		uh = udp_hdr(seg);
-		uh->len = htons(seg->len - hdrlen);
-		uh->check = check;
 
 		/* last packet can be partial gso_size */
-		if (!seg->next)
-			csum_replace2(&uh->check, htons(mss),
-				      htons(seg->len - hdrlen - sizeof(*uh)));
+		if (!seg->next) {
+			newlen = htons(seg->len - hdrlen);
+			check = csum16_add(csum16_sub(uh->check, uh->len),
+					   newlen);
+		}
+
+		uh->len = newlen;
+		uh->check = check;
 
-		uh->check = ~uh->check;
 		seg->destructor = sock_wfree;
 		seg->sk = sk;
 		sum_truesize += seg->truesize;
@@ -240,15 +249,10 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 static struct sk_buff *__udp4_gso_segment(struct sk_buff *gso_skb,
 					  netdev_features_t features)
 {
-	const struct iphdr *iph = ip_hdr(gso_skb);
-	unsigned int mss = skb_shinfo(gso_skb)->gso_size;
-
 	if (!can_checksum_protocol(features, htons(ETH_P_IP)))
 		return ERR_PTR(-EIO);
 
-	return __udp_gso_segment(gso_skb, features,
-				 udp_v4_check(sizeof(struct udphdr) + mss,
-					      iph->saddr, iph->daddr, 0));
+	return __udp_gso_segment(gso_skb, features);
 }
 
 static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index dea03ec09715..61e34f1d2fa2 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -20,15 +20,10 @@
 static struct sk_buff *__udp6_gso_segment(struct sk_buff *gso_skb,
 					  netdev_features_t features)
 {
-	const struct ipv6hdr *ip6h = ipv6_hdr(gso_skb);
-	unsigned int mss = skb_shinfo(gso_skb)->gso_size;
-
 	if (!can_checksum_protocol(features, htons(ETH_P_IPV6)))
 		return ERR_PTR(-EIO);
 
-	return __udp_gso_segment(gso_skb, features,
-				 udp_v6_check(sizeof(struct udphdr) + mss,
-					      &ip6h->saddr, &ip6h->daddr, 0));
+	return __udp_gso_segment(gso_skb, features);
 }
 
 static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,

^ permalink raw reply related

* [net-next PATCH v3 2/6] udp: Do not pass MSS as parameter to GSO segmentation
From: Alexander Duyck @ 2018-05-07 18:08 UTC (permalink / raw)
  To: netdev, willemb, davem
In-Reply-To: <20180507180310.3486.35994.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

There is no point in passing MSS as a parameter for for the GSO
segmentation call as it is already available via the shared info for the
skb itself.

Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 include/net/udp.h      |    2 +-
 net/ipv4/udp_offload.c |    6 ++++--
 net/ipv6/udp_offload.c |    2 +-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/net/udp.h b/include/net/udp.h
index 05990746810e..8bd83b044ecd 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -176,7 +176,7 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
 
 struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 				  netdev_features_t features,
-				  unsigned int mss, __sum16 check);
+				  __sum16 check);
 
 static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
 {
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 006257092f06..c1afcd2f1a76 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -189,14 +189,16 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
 
 struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 				  netdev_features_t features,
-				  unsigned int mss, __sum16 check)
+				  __sum16 check)
 {
 	struct sock *sk = gso_skb->sk;
 	unsigned int sum_truesize = 0;
 	struct sk_buff *segs, *seg;
 	unsigned int hdrlen;
 	struct udphdr *uh;
+	unsigned int mss;
 
+	mss = skb_shinfo(gso_skb)->gso_size;
 	if (gso_skb->len <= sizeof(*uh) + mss)
 		return ERR_PTR(-EINVAL);
 
@@ -244,7 +246,7 @@ static struct sk_buff *__udp4_gso_segment(struct sk_buff *gso_skb,
 	if (!can_checksum_protocol(features, htons(ETH_P_IP)))
 		return ERR_PTR(-EIO);
 
-	return __udp_gso_segment(gso_skb, features, mss,
+	return __udp_gso_segment(gso_skb, features,
 				 udp_v4_check(sizeof(struct udphdr) + mss,
 					      iph->saddr, iph->daddr, 0));
 }
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index f7b85b1e6b3e..dea03ec09715 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -26,7 +26,7 @@ static struct sk_buff *__udp6_gso_segment(struct sk_buff *gso_skb,
 	if (!can_checksum_protocol(features, htons(ETH_P_IPV6)))
 		return ERR_PTR(-EIO);
 
-	return __udp_gso_segment(gso_skb, features, mss,
+	return __udp_gso_segment(gso_skb, features,
 				 udp_v6_check(sizeof(struct udphdr) + mss,
 					      &ip6h->saddr, &ip6h->daddr, 0));
 }

^ permalink raw reply related

* [net-next PATCH v3 1/6] udp: Record gso_segs when supporting UDP segmentation offload
From: Alexander Duyck @ 2018-05-07 18:08 UTC (permalink / raw)
  To: netdev, willemb, davem
In-Reply-To: <20180507180310.3486.35994.stgit@localhost.localdomain>

From: Alexander Duyck <alexander.h.duyck@intel.com>

We need to record the number of segments that will be generated when this
frame is segmented. The expectation is that if gso_size is set then
gso_segs is set as well. Without this some drivers such as ixgbe get
confused if they attempt to offload this as they record 0 segments for the
entire packet instead of the correct value.

Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 net/ipv4/udp.c |    2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index dd3102a37ef9..e07db83b311e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -793,6 +793,8 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
 
 		skb_shinfo(skb)->gso_size = cork->gso_size;
 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(len - sizeof(uh),
+							 cork->gso_size);
 		goto csum_partial;
 	}
 

^ permalink raw reply related

* [net-next PATCH v3 0/6] Series short description
From: Alexander Duyck @ 2018-05-07 18:08 UTC (permalink / raw)
  To: netdev, willemb, davem

This patch set addresses a number of issues I found while sorting out
enabling UDP GSO Segmentation support for ixgbe/ixgbevf. Specifically there
were a number of issues related to the checksum and such that seemed to
cause either minor irregularities or kernel panics in the case of the
offload request being allowed to traverse between name spaces.

With this set applied I am was able to get UDP GSO traffic to pass over
vxlan tunnels in both offloaded modes and non-offloaded modes for ixgbe and
ixgbevf.

I submitted the driver specific patches earlier as an RFC:
https://patchwork.ozlabs.org/project/netdev/list/?series=42477&archive=both&state=*

v2: Updated patches based on feedback from Eric Dumazet
    Split first patch into several patches based on feedback from Eric
v3: Drop patch that was calling pskb_may_pull as it was redundant.
    Added code to use MANGLED_0 in case of UDP checksum
    Drop patch adding NETIF_F_GSO_UDP_L4 to list of GSO software offloads
    Added Acked-by for patches reviewed by Willem and not changed

---

Alexander Duyck (6):
      udp: Record gso_segs when supporting UDP segmentation offload
      udp: Do not pass MSS as parameter to GSO segmentation
      udp: Do not pass checksum as a parameter to GSO segmentation
      udp: Partially unroll handling of first segment and last segment
      udp: Add support for software checksum and GSO_PARTIAL with GSO offload
      udp: Do not copy destructor if one is not present


 include/net/udp.h      |    3 +-
 net/ipv4/udp.c         |    2 +
 net/ipv4/udp_offload.c |   94 +++++++++++++++++++++++++++++++-----------------
 net/ipv6/udp_offload.c |   16 +-------
 4 files changed, 64 insertions(+), 51 deletions(-)

^ permalink raw reply

* Re: [net-next PATCH v2 0/8] UDP GSO Segmentation clean-ups
From: Alexander Duyck @ 2018-05-07 18:02 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: Network Development, Willem de Bruijn, David Miller
In-Reply-To: <CAF=yD-LAF8s+GXbGQxAHx1ptw73Aj3pr3ipMzRhmsnmwSczZ5Q@mail.gmail.com>

On Sat, May 5, 2018 at 3:06 AM, Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
> On Fri, May 4, 2018 at 8:28 PM, Alexander Duyck
> <alexander.duyck@gmail.com> wrote:
>> This patch set addresses a number of issues I found while sorting out
>> enabling UDP GSO Segmentation support for ixgbe/ixgbevf. Specifically there
>> were a number of issues related to the checksum and such that seemed to
>> cause either minor irregularities or kernel panics in the case of the
>> offload request being allowed to traverse between name spaces.
>
> Were you able to traverse GSO packets between network namespace before
> adding to NETIF_F_GSO_SOFTWARE? It does appear that veth includes
> NETIF_F_GSO_ENCAP_ALL, which also allows GSO.

Without that change the tunnel wouldn't pass the requests between
namespaces. However with it I was able to easily test the software
checksum code as otherwise the socket was returning EIO when the
hardware checksum was disabled.

> In either case, it should not be possible for GSO packets to arrive on a veth
> device, as that can result in queuing the GSO packet to a recipient socket.
> In this regard veth is like loopback and must exclude GSO support.
>
> I'll take a look.

I suspect it was probably sending veth UDP segmentation offload
requests. For now I can probably drop he patch that was adding it and
it can be added later to individual drivers if needed.

Thanks.

- Alex

^ permalink raw reply

* [PATCH v3 bpf-next 1/2] bpf: enable stackmap with build_id in nmi context
From: Song Liu @ 2018-05-07 17:50 UTC (permalink / raw)
  To: netdev
  Cc: Song Liu, kernel-team, qinteng, tobin, Alexei Starovoitov,
	Daniel Borkmann, Peter Zijlstra
In-Reply-To: <20180507175049.1541963-1-songliubraving@fb.com>

Currently, we cannot parse build_id in nmi context because of
up_read(&current->mm->mmap_sem), this makes stackmap with build_id
less useful. This patch enables parsing build_id in nmi by putting
the up_read() call in irq_work. To avoid memory allocation in nmi
context, we use per cpu variable for the irq_work. As a result, only
one irq_work per cpu is allowed. If the irq_work is in-use, we
fallback to only report ips.

Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 init/Kconfig          |  1 +
 kernel/bpf/stackmap.c | 59 +++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 54 insertions(+), 6 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index f013afc..480a4f2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1391,6 +1391,7 @@ config BPF_SYSCALL
 	bool "Enable bpf() system call"
 	select ANON_INODES
 	select BPF
+	select IRQ_WORK
 	default n
 	help
 	  Enable the bpf() system call that allows to manipulate eBPF
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 3ba102b..b59ace0 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -11,6 +11,7 @@
 #include <linux/perf_event.h>
 #include <linux/elf.h>
 #include <linux/pagemap.h>
+#include <linux/irq_work.h>
 #include "percpu_freelist.h"
 
 #define STACK_CREATE_FLAG_MASK					\
@@ -32,6 +33,23 @@ struct bpf_stack_map {
 	struct stack_map_bucket *buckets[];
 };
 
+/* irq_work to run up_read() for build_id lookup in nmi context */
+struct stack_map_irq_work {
+	struct irq_work irq_work;
+	struct rw_semaphore *sem;
+};
+
+static void do_up_read(struct irq_work *entry)
+{
+	struct stack_map_irq_work *work;
+
+	work = container_of(entry, struct stack_map_irq_work, irq_work);
+	up_read(work->sem);
+	work->sem = NULL;
+}
+
+static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);
+
 static inline bool stack_map_use_build_id(struct bpf_map *map)
 {
 	return (map->map_flags & BPF_F_STACK_BUILD_ID);
@@ -267,17 +285,27 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 {
 	int i;
 	struct vm_area_struct *vma;
+	bool in_nmi_ctx = in_nmi();
+	bool irq_work_busy = false;
+	struct stack_map_irq_work *work;
+
+	if (in_nmi_ctx) {
+		work = this_cpu_ptr(&up_read_work);
+		if (work->irq_work.flags & IRQ_WORK_BUSY)
+			/* cannot queue more up_read, fallback */
+			irq_work_busy = true;
+	}
 
 	/*
-	 * We cannot do up_read() in nmi context, so build_id lookup is
-	 * only supported for non-nmi events. If at some point, it is
-	 * possible to run find_vma() without taking the semaphore, we
-	 * would like to allow build_id lookup in nmi context.
+	 * We cannot do up_read() in nmi context. To do build_id lookup
+	 * in nmi context, we need to run up_read() in irq_work. We use
+	 * a percpu variable to do the irq_work. If the irq_work is
+	 * already used by another lookup, we fall back to report ips.
 	 *
 	 * Same fallback is used for kernel stack (!user) on a stackmap
 	 * with build_id.
 	 */
-	if (!user || !current || !current->mm || in_nmi() ||
+	if (!user || !current || !current->mm || irq_work_busy ||
 	    down_read_trylock(&current->mm->mmap_sem) == 0) {
 		/* cannot access current->mm, fall back to ips */
 		for (i = 0; i < trace_nr; i++) {
@@ -299,7 +327,13 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 			- vma->vm_start;
 		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
 	}
-	up_read(&current->mm->mmap_sem);
+
+	if (!in_nmi_ctx) {
+		up_read(&current->mm->mmap_sem);
+	} else {
+		work->sem = &current->mm->mmap_sem;
+		irq_work_queue(&work->irq_work);
+	}
 }
 
 BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
@@ -575,3 +609,16 @@ const struct bpf_map_ops stack_map_ops = {
 	.map_update_elem = stack_map_update_elem,
 	.map_delete_elem = stack_map_delete_elem,
 };
+
+static int __init stack_map_init(void)
+{
+	int cpu;
+	struct stack_map_irq_work *work;
+
+	for_each_possible_cpu(cpu) {
+		work = per_cpu_ptr(&up_read_work, cpu);
+		init_irq_work(&work->irq_work, do_up_read);
+	}
+	return 0;
+}
+subsys_initcall(stack_map_init);
-- 
2.9.5

^ permalink raw reply related

* [PATCH v3 bpf-next 2/2] bpf: add selftest for stackmap with build_id in NMI context
From: Song Liu @ 2018-05-07 17:50 UTC (permalink / raw)
  To: netdev; +Cc: Song Liu, kernel-team, qinteng, tobin
In-Reply-To: <20180507175049.1541963-1-songliubraving@fb.com>

This new test captures stackmap with build_id with hardware event
PERF_COUNT_HW_CPU_CYCLES.

Because we only support one ips-to-build_id lookup per cpu in NMI
context, stack_amap will not be able to do the lookup in this test.
Therefore, we didn't do compare_stack_ips(), as it will alwasy fail.

urandom_read.c is extended to run configurable cycles so that it can be
caught by the perf event.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 tools/testing/selftests/bpf/test_progs.c   | 134 +++++++++++++++++++++++++++++
 tools/testing/selftests/bpf/urandom_read.c |  10 ++-
 2 files changed, 142 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index aa336f0..5d2d8ef 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -1272,6 +1272,139 @@ static void test_stacktrace_build_id(void)
 	return;
 }
 
+static void test_stacktrace_build_id_nmi(void)
+{
+	int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd;
+	const char *file = "./test_stacktrace_build_id.o";
+	int err, pmu_fd, prog_fd;
+	struct perf_event_attr attr = {
+		.sample_freq = 5000,
+		.freq = 1,
+		.type = PERF_TYPE_HARDWARE,
+		.config = PERF_COUNT_HW_CPU_CYCLES,
+	};
+	__u32 key, previous_key, val, duration = 0;
+	struct bpf_object *obj;
+	char buf[256];
+	int i, j;
+	struct bpf_stack_build_id id_offs[PERF_MAX_STACK_DEPTH];
+	int build_id_matches = 0;
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_PERF_EVENT, &obj, &prog_fd);
+	if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno))
+		return;
+
+	pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+			 0 /* cpu 0 */, -1 /* group id */,
+			 0 /* flags */);
+	if (CHECK(pmu_fd < 0, "perf_event_open",
+		  "err %d errno %d. Does the test host support PERF_COUNT_HW_CPU_CYCLES?\n",
+		  pmu_fd, errno))
+		goto close_prog;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+	if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n",
+		  err, errno))
+		goto close_pmu;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+	if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n",
+		  err, errno))
+		goto disable_pmu;
+
+	/* find map fds */
+	control_map_fd = bpf_find_map(__func__, obj, "control_map");
+	if (CHECK(control_map_fd < 0, "bpf_find_map control_map",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap");
+	if (CHECK(stackid_hmap_fd < 0, "bpf_find_map stackid_hmap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	stackmap_fd = bpf_find_map(__func__, obj, "stackmap");
+	if (CHECK(stackmap_fd < 0, "bpf_find_map stackmap", "err %d errno %d\n",
+		  err, errno))
+		goto disable_pmu;
+
+	stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap");
+	if (CHECK(stack_amap_fd < 0, "bpf_find_map stack_amap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	assert(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null")
+	       == 0);
+	assert(system("taskset 0x1 ./urandom_read 100000") == 0);
+	/* disable stack trace collection */
+	key = 0;
+	val = 1;
+	bpf_map_update_elem(control_map_fd, &key, &val, 0);
+
+	/* for every element in stackid_hmap, we can find a corresponding one
+	 * in stackmap, and vise versa.
+	 */
+	err = compare_map_keys(stackid_hmap_fd, stackmap_fd);
+	if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	err = compare_map_keys(stackmap_fd, stackid_hmap_fd);
+	if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	err = extract_build_id(buf, 256);
+
+	if (CHECK(err, "get build_id with readelf",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	err = bpf_map_get_next_key(stackmap_fd, NULL, &key);
+	if (CHECK(err, "get_next_key from stackmap",
+		  "err %d, errno %d\n", err, errno))
+		goto disable_pmu;
+
+	do {
+		char build_id[64];
+
+		err = bpf_map_lookup_elem(stackmap_fd, &key, id_offs);
+		if (CHECK(err, "lookup_elem from stackmap",
+			  "err %d, errno %d\n", err, errno))
+			goto disable_pmu;
+		for (i = 0; i < PERF_MAX_STACK_DEPTH; ++i)
+			if (id_offs[i].status == BPF_STACK_BUILD_ID_VALID &&
+			    id_offs[i].offset != 0) {
+				for (j = 0; j < 20; ++j)
+					sprintf(build_id + 2 * j, "%02x",
+						id_offs[i].build_id[j] & 0xff);
+				if (strstr(buf, build_id) != NULL)
+					build_id_matches = 1;
+			}
+		previous_key = key;
+	} while (bpf_map_get_next_key(stackmap_fd, &previous_key, &key) == 0);
+
+	if (CHECK(build_id_matches < 1, "build id match",
+		  "Didn't find expected build ID from the map\n"))
+		goto disable_pmu;
+
+	/*
+	 * We intentionally skip compare_stack_ips(). This is because we
+	 * only support one in_nmi() ips-to-build_id translation per cpu
+	 * at any time, thus stack_amap here will always fallback to
+	 * BPF_STACK_BUILD_ID_IP;
+	 */
+
+disable_pmu:
+	ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE);
+
+close_pmu:
+	close(pmu_fd);
+
+close_prog:
+	bpf_object__close(obj);
+}
+
 #define MAX_CNT_RAWTP	10ull
 #define MAX_STACK_RAWTP	100
 struct get_stack_trace_t {
@@ -1425,6 +1558,7 @@ int main(void)
 	test_tp_attach_query();
 	test_stacktrace_map();
 	test_stacktrace_build_id();
+	test_stacktrace_build_id_nmi();
 	test_stacktrace_map_raw_tp();
 	test_get_stack_raw_tp();
 
diff --git a/tools/testing/selftests/bpf/urandom_read.c b/tools/testing/selftests/bpf/urandom_read.c
index 4acfdeb..9de8b7c 100644
--- a/tools/testing/selftests/bpf/urandom_read.c
+++ b/tools/testing/selftests/bpf/urandom_read.c
@@ -6,15 +6,21 @@
 #include <stdlib.h>
 
 #define BUF_SIZE 256
-int main(void)
+
+int main(int argc, char *argv[])
 {
 	int fd = open("/dev/urandom", O_RDONLY);
 	int i;
 	char buf[BUF_SIZE];
+	int count = 4;
 
 	if (fd < 0)
 		return 1;
-	for (i = 0; i < 4; ++i)
+
+	if (argc == 2)
+		count = atoi(argv[1]);
+
+	for (i = 0; i < count; ++i)
 		read(fd, buf, BUF_SIZE);
 
 	close(fd);
-- 
2.9.5

^ permalink raw reply related

* [PATCH v3 bpf-next 0/2] bpf: enable stackmap with build_id in nmi
From: Song Liu @ 2018-05-07 17:50 UTC (permalink / raw)
  To: netdev; +Cc: Song Liu, kernel-team, qinteng, tobin

Changes v2 -> v3:
  Improve syntax based on suggestion by Tobin C. Harding.

Changes v1 -> v2:
  1. Rename some variables to (hopefully) reduce confusion;
  2. Check irq_work status with IRQ_WORK_BUSY (instead of work->sem);
  3. In Kconfig, let BPF_SYSCALL select IRQ_WORK;
  4. Add static to DEFINE_PER_CPU();
   5. Remove pr_info() in stack_map_init().

Song Liu (2):
  bpf: enable stackmap with build_id in nmi context
  bpf: add selftest for stackmap with build_id in NMI context

 init/Kconfig                               |   1 +
 kernel/bpf/stackmap.c                      |  59 +++++++++++--
 tools/testing/selftests/bpf/test_progs.c   | 134 +++++++++++++++++++++++++++++
 tools/testing/selftests/bpf/urandom_read.c |  10 ++-
 4 files changed, 196 insertions(+), 8 deletions(-)

^ permalink raw reply

* [PATCH net-next v10 3/4] netdev: octeon-ethernet: Add Cavium Octeon III support.
From: Steven J. Hill @ 2018-05-07 17:22 UTC (permalink / raw)
  To: netdev; +Cc: Carlos Munoz, Steven J. Hill, David Daney
In-Reply-To: <1525713731-27092-1-git-send-email-steven.hill@cavium.com>

From: Carlos Munoz <cmunoz@cavium.com>

The Cavium OCTEON cn78xx and cn73xx SoCs have network packet I/O
hardware that is significantly different from previous generations of
the family.

Add a new driver for this hardware.  The Ethernet MAC is called BGX on
these devices.  Common code for the MAC is in octeon3-bgx-port.c.
Four of these BGX MACs are grouped together and managed as a group by
octeon3-bgx-nexus.c.  Ingress packet classification is done by the PKI
unit initialized in octeon3-pki.c.  Queue management is done in the
SSO, initialized by octeon3-sso.c.  Egress is handled by the PKO,
initialized in octeon3-pko.c.

Hardware memory buffer management is done in octeon3-fpa.c

Signed-off-by: Carlos Munoz <cmunoz@cavium.com>
Signed-off-by: Steven J. Hill <Steven.Hill@cavium.com>
Signed-off-by: David Daney <david.daney@cavium.com>
---
 drivers/net/ethernet/cavium/Kconfig                |   59 +-
 drivers/net/ethernet/cavium/octeon/Makefile        |    7 +
 .../net/ethernet/cavium/octeon/octeon3-bgx-nexus.c |  417 ++++
 .../net/ethernet/cavium/octeon/octeon3-bgx-port.c  | 2003 +++++++++++++++++++
 drivers/net/ethernet/cavium/octeon/octeon3-core.c  | 2079 ++++++++++++++++++++
 drivers/net/ethernet/cavium/octeon/octeon3-fpa.c   |  358 ++++
 drivers/net/ethernet/cavium/octeon/octeon3-pki.c   |  823 ++++++++
 drivers/net/ethernet/cavium/octeon/octeon3-pko.c   | 1688 ++++++++++++++++
 drivers/net/ethernet/cavium/octeon/octeon3-sso.c   |  301 +++
 drivers/net/ethernet/cavium/octeon/octeon3.h       |  430 ++++
 10 files changed, 8155 insertions(+), 10 deletions(-)
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-bgx-nexus.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-bgx-port.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-core.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-fpa.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-pki.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-pko.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-sso.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3.h

diff --git a/drivers/net/ethernet/cavium/Kconfig b/drivers/net/ethernet/cavium/Kconfig
index 043e3c1..3ec4384 100644
--- a/drivers/net/ethernet/cavium/Kconfig
+++ b/drivers/net/ethernet/cavium/Kconfig
@@ -4,7 +4,7 @@
 
 config NET_VENDOR_CAVIUM
 	bool "Cavium ethernet drivers"
-	depends on PCI
+	depends on PCI || CAVIUM_OCTEON_SOC
 	default y
 	---help---
 	  Select this option if you want enable Cavium network support.
@@ -13,6 +13,12 @@ config NET_VENDOR_CAVIUM
 
 if NET_VENDOR_CAVIUM
 
+#
+# The Thunder* and LiquidIO drivers require PCI.
+#
+
+if PCI
+
 config THUNDER_NIC_PF
 	tristate "Thunder Physical function driver"
 	depends on 64BIT
@@ -77,6 +83,20 @@ config LIQUIDIO
 	  To compile this driver as a module, choose M here: the module
 	  will be called liquidio.  This is recommended.
 
+config LIQUIDIO_VF
+	tristate "Cavium LiquidIO VF support"
+	depends on 64BIT && PCI_MSI
+	imply PTP_1588_CLOCK
+	---help---
+	  This driver supports Cavium LiquidIO Intelligent Server Adapter
+	  based on CN23XX chips.
+
+	  To compile this driver as a module, choose M here: The module
+	  will be called liquidio_vf. MSI-X interrupt support is required
+	  for this driver to work correctly
+
+endif # PCI
+
 config OCTEON_MGMT_ETHERNET
 	tristate "Octeon Management port ethernet driver (CN5XXX, CN6XXX)"
 	depends on CAVIUM_OCTEON_SOC
@@ -88,16 +108,35 @@ config OCTEON_MGMT_ETHERNET
 	  port on Cavium Networks' Octeon CN57XX, CN56XX, CN55XX,
 	  CN54XX, CN52XX, and CN6XXX chips.
 
-config LIQUIDIO_VF
-	tristate "Cavium LiquidIO VF support"
-	depends on 64BIT && PCI_MSI
-	imply PTP_1588_CLOCK
+config OCTEON_FPA3
+	tristate
+	depends on CAVIUM_OCTEON_SOC
+
+config OCTEON3_BGX_NEXUS
+	tristate
+	depends on CAVIUM_OCTEON_SOC
+
+config OCTEON3_BGX_PORT
+	tristate "Cavium OCTEON-III BGX port support"
+	depends on CAVIUM_OCTEON_SOC
+	select OCTEON3_BGX_NEXUS
 	---help---
-	  This driver supports Cavium LiquidIO Intelligent Server Adapter
-	  based on CN23XX chips.
+	  Enable the driver for Cavium Octeon III BGX ports. BGX ports
+	  support sgmii, rgmii, xaui, rxaui, xlaui, xfi, 10KR and 40KR modes.
 
-	  To compile this driver as a module, choose M here: The module
-	  will be called liquidio_vf. MSI-X interrupt support is required
-	  for this driver to work correctly
+	  Say Y for support of any Octeon III SoC Ethernet port.
+
+config OCTEON3_ETHERNET
+	tristate "Cavium OCTEON-III PKI/PKO Ethernet support"
+	depends on CAVIUM_OCTEON_SOC
+	select OCTEON_BGX_PORT
+	select OCTEON_FPA3
+	select FW_LOADER
+	---help---
+	  Enable the driver for Cavium Octeon III Ethernet via PKI/PKO
+	  units.  No support for cn70xx chips (use OCTEON_ETHERNET for
+	  cn70xx).
+
+	  Say Y for support of any Octeon III SoC Ethernet port.
 
 endif # NET_VENDOR_CAVIUM
diff --git a/drivers/net/ethernet/cavium/octeon/Makefile b/drivers/net/ethernet/cavium/octeon/Makefile
index efa41c1..13efb90 100644
--- a/drivers/net/ethernet/cavium/octeon/Makefile
+++ b/drivers/net/ethernet/cavium/octeon/Makefile
@@ -3,3 +3,10 @@
 #
 
 obj-$(CONFIG_OCTEON_MGMT_ETHERNET)	+= octeon_mgmt.o
+obj-$(CONFIG_OCTEON3_BGX_PORT)		+= octeon3-bgx-port.o
+obj-$(CONFIG_OCTEON3_BGX_NEXUS)		+= octeon3-bgx-nexus.o
+obj-$(CONFIG_OCTEON_FPA3)		+= octeon3-fpa.o
+obj-$(CONFIG_OCTEON3_ETHERNET)		+= octeon3-ethernet.o
+
+octeon3-ethernet-objs += octeon3-core.o octeon3-pki.o octeon3-sso.o \
+			 octeon3-pko.o
diff --git a/drivers/net/ethernet/cavium/octeon/octeon3-bgx-nexus.c b/drivers/net/ethernet/cavium/octeon/octeon3-bgx-nexus.c
new file mode 100644
index 0000000..2d82a68
--- /dev/null
+++ b/drivers/net/ethernet/cavium/octeon/octeon3-bgx-nexus.c
@@ -0,0 +1,417 @@
+// SPDX-License-Identifier: GPL-2.0+
+/* The BGX nexus consists of a group of up to four Ethernet MACs (the
+ * LMACs).  This driver manages the LMACs and creates a child device
+ * for each of the configured LMACs.
+ *
+ * Copyright (c) 2017 Cavium, Inc.
+ */
+#include <linux/platform_device.h>
+#include <linux/of_platform.h>
+#include <linux/of_address.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+
+#include "octeon3.h"
+
+static atomic_t request_mgmt_once;
+static atomic_t load_driver_once;
+static atomic_t pki_id;
+
+#define MAX_MIX_PER_NODE	2
+
+#define MAX_MIX			(MAX_NODES * MAX_MIX_PER_NODE)
+
+/**
+ * struct mix_port_lmac - Describes a lmac that connects to a mix
+ *			  port. The lmac must be on the same node as
+ *			  the mix.
+ * @node: Node of the lmac.
+ * @bgx:  Bgx of the lmac.
+ * @lmac: Lmac index.
+ */
+struct mix_port_lmac {
+	int	node;
+	int	bgx;
+	int	lmac;
+};
+
+/* mix_ports_lmacs contains all the lmacs connected to mix ports */
+static struct mix_port_lmac mix_port_lmacs[MAX_MIX];
+
+/* pki_ports keeps track of the lmacs connected to the pki */
+static bool pki_ports[MAX_NODES][MAX_BGX_PER_NODE][MAX_LMAC_PER_BGX];
+
+/* Created platform devices get added to this list */
+static struct list_head pdev_list;
+static struct mutex pdev_list_lock;
+
+/* Created platform device use this structure to add themselves to the list */
+struct pdev_list_item {
+	struct list_head	list;
+	struct platform_device	*pdev;
+};
+
+/**
+ * is_lmac_to_mix() - Search the list of lmacs connected to mix'es for a match.
+ * @node: Numa node of lmac to search for.
+ * @bgx:  Bgx of lmac to search for.
+ * @lmac: Lmac index to search for.
+ *
+ * Return: true if the lmac is connected to a mix.
+ *         false if the lmac is not connected to a mix.
+ */
+static bool is_lmac_to_mix(int node, int bgx, int lmac)
+{
+	int i;
+
+	for (i = 0; i < MAX_MIX; i++) {
+		if (mix_port_lmacs[i].node == node &&
+		    mix_port_lmacs[i].bgx == bgx &&
+		    mix_port_lmacs[i].lmac == lmac)
+			return true;
+	}
+
+	return false;
+}
+
+/**
+ * is_lmac_to_pki() - Search the list of lmacs connected to the pki for a match.
+ * @node: Numa node of lmac to search for.
+ * @bgx:  Bgx of lmac to search for.
+ * @lmac: Lmac index to search for.
+ *
+ * Return: true if the lmac is connected to the pki.
+ *         false if the lmac is not connected to the pki.
+ */
+static bool is_lmac_to_pki(int node, int bgx, int lmac)
+{
+	return pki_ports[node][bgx][lmac];
+}
+
+/**
+ * is_lmac_to_xcv() - Check if this lmac is connected to the xcv block (rgmii).
+ * @of_node: Device node to check.
+ *
+ * Return: true if the lmac is connected to the xcv port.
+ *         false if the lmac is not connected to the xcv port.
+ */
+static bool is_lmac_to_xcv(struct device_node *of_node)
+{
+	return of_device_is_compatible(of_node, "cavium,octeon-7360-xcv");
+}
+
+static int bgx_probe(struct platform_device *pdev)
+{
+	struct mac_platform_data platform_data;
+	struct platform_device *new_dev;
+	struct platform_device *pki_dev;
+	struct device_node *child;
+	const __be32 *reg;
+	int interface;
+	int numa_node;
+	char id[64];
+	int r = 0;
+	u64 addr;
+	u32 port;
+	u64 data;
+	int i;
+
+	reg = of_get_property(pdev->dev.of_node, "reg", NULL);
+	addr = of_translate_address(pdev->dev.of_node, reg);
+	interface = bgx_addr_to_interface(addr);
+	numa_node = bgx_node_to_numa_node(pdev->dev.of_node);
+
+	/* Assign 8 CAM entries per LMAC */
+	for (i = 0; i < 32; i++) {
+		data = i >> 3;
+		oct_csr_write(data, BGX_CMR_RX_ADRX_CAM(numa_node, interface, i));
+	}
+
+	for_each_available_child_of_node(pdev->dev.of_node, child) {
+		struct pdev_list_item *pdev_item;
+		bool is_mix = false;
+		bool is_pki = false;
+		bool is_xcv = false;
+
+		if (!of_device_is_compatible(child, "cavium,octeon-7890-bgx-port") &&
+		    !of_device_is_compatible(child, "cavium,octeon-7360-xcv"))
+			continue;
+		r = of_property_read_u32(child, "reg", &port);
+		if (r)
+			return -ENODEV;
+
+		is_mix = is_lmac_to_mix(numa_node, interface, port);
+		is_pki = is_lmac_to_pki(numa_node, interface, port);
+		is_xcv = is_lmac_to_xcv(child);
+
+		/* Check if this port should be configured */
+		if (!is_mix && !is_pki)
+			continue;
+
+		/* Connect to PKI/PKO */
+		data = oct_csr_read(BGX_CMR_CONFIG(numa_node, interface, port));
+		if (is_mix)
+			data |= BIT(11);
+		else
+			data &= ~BIT(11);
+		oct_csr_write(data, BGX_CMR_CONFIG(numa_node, interface, port));
+
+		/* Unreset the mix bgx interface or it will interfare with the
+		 * other ports.
+		 */
+		if (is_mix) {
+			data = oct_csr_read(BGX_CMR_GLOBAL_CONFIG(numa_node, interface));
+			if (!port)
+				data &= ~BIT(3);
+			else if (port == 1)
+				data &= ~BIT(4);
+			oct_csr_write(data, BGX_CMR_GLOBAL_CONFIG(numa_node, interface));
+		}
+
+		snprintf(id, sizeof(id), "%llx.%u.ethernet-mac",
+			 (unsigned long long)addr, port);
+		new_dev = of_platform_device_create(child, id, &pdev->dev);
+		if (!new_dev) {
+			dev_err(&pdev->dev, "Error creating %s\n", id);
+			continue;
+		}
+		platform_data.mac_type = BGX_MAC;
+		platform_data.numa_node = numa_node;
+		platform_data.interface = interface;
+		platform_data.port = port;
+		if (is_xcv)
+			platform_data.src_type = XCV;
+		else
+			platform_data.src_type = QLM;
+
+		/* Add device to the list of created devices so we can
+		 * remove it on exit.
+		 */
+		pdev_item = kmalloc(sizeof(*pdev_item), GFP_KERNEL);
+		pdev_item->pdev = new_dev;
+		mutex_lock(&pdev_list_lock);
+		list_add(&pdev_item->list, &pdev_list);
+		mutex_unlock(&pdev_list_lock);
+
+		i = atomic_inc_return(&pki_id);
+		pki_dev = platform_device_register_data(&new_dev->dev,
+							is_mix ? "octeon_mgmt" : "ethernet-mac-pki",
+							i, &platform_data,
+							sizeof(platform_data));
+		dev_info(&pdev->dev, "Created %s %u\n",
+			 is_mix ? "MIX" : "PKI", pki_dev->id);
+
+		/* Add device to the list of created devices so we can
+		 * remove it on exit.
+		 */
+		pdev_item = kmalloc(sizeof(*pdev_item), GFP_KERNEL);
+		pdev_item->pdev = pki_dev;
+		mutex_lock(&pdev_list_lock);
+		list_add(&pdev_item->list, &pdev_list);
+		mutex_unlock(&pdev_list_lock);
+
+#ifdef CONFIG_NUMA
+		new_dev->dev.numa_node = pdev->dev.numa_node;
+		pki_dev->dev.numa_node = pdev->dev.numa_node;
+#endif
+		/* One time request driver module */
+		if (is_mix) {
+			if (atomic_cmpxchg(&request_mgmt_once, 0, 1) == 0)
+				request_module_nowait("octeon_mgmt");
+		}
+		if (is_pki) {
+			if (atomic_cmpxchg(&load_driver_once, 0, 1) == 0)
+				request_module_nowait("octeon3-ethernet");
+		}
+	}
+
+	dev_info(&pdev->dev, "Probed\n");
+	return 0;
+}
+
+/**
+ * bgx_mix_init_from_fdt() - Initialize the list of lmacs that connect to mix
+ *			     ports from information in the device tree.
+ *
+ * Return: 0 if successful.
+ *         < 0 for error codes.
+ */
+static int bgx_mix_init_from_fdt(void)
+{
+	struct device_node *parent = NULL;
+	struct device_node *node;
+	int mix = 0;
+
+	for_each_compatible_node(node, NULL, "cavium,octeon-7890-mix") {
+		struct device_node *lmac_fdt_node;
+		const __be32 *reg;
+		u64 addr;
+		u32 t;
+
+		/* Get the fdt node of the lmac connected to this mix */
+		lmac_fdt_node = of_parse_phandle(node, "cavium,mac-handle", 0);
+		if (!lmac_fdt_node)
+			goto err;
+
+		/* Get the numa node and bgx of the lmac */
+		parent = of_get_parent(lmac_fdt_node);
+		if (!parent)
+			goto err;
+		reg = of_get_property(parent, "reg", NULL);
+		if (!reg)
+			goto err;
+		addr = of_translate_address(parent, reg);
+		of_node_put(parent);
+		parent = NULL;
+
+		mix_port_lmacs[mix].node = bgx_node_to_numa_node(parent);
+		mix_port_lmacs[mix].bgx = bgx_addr_to_interface(addr);
+
+		/* Get the lmac index */
+		if (!of_property_read_u32(lmac_fdt_node, "reg", &t))
+			goto err;
+
+		mix_port_lmacs[mix].lmac = t;
+
+		mix++;
+		if (mix >= MAX_MIX)
+			break;
+	}
+
+	return 0;
+ err:
+	pr_warn("Invalid device tree mix port information\n");
+	for (mix = 0; mix < MAX_MIX; mix++) {
+		mix_port_lmacs[mix].node = -1;
+		mix_port_lmacs[mix].bgx = -1;
+		mix_port_lmacs[mix].lmac = -1;
+	}
+	if (parent)
+		of_node_put(parent);
+
+	return -EINVAL;
+}
+
+/**
+ * bgx_mix_port_lmacs_init() - Initialize the mix_port_lmacs variable with the
+ *			       lmacs that connect to mix ports.
+ *
+ * Return: 0 if successful.
+ *         < 0 for error codes.
+ */
+static int bgx_mix_port_lmacs_init(void)
+{
+	int mix;
+
+	/* Start with no mix ports configured */
+	for (mix = 0; mix < MAX_MIX; mix++) {
+		mix_port_lmacs[mix].node = -1;
+		mix_port_lmacs[mix].bgx = -1;
+		mix_port_lmacs[mix].lmac = -1;
+	}
+
+	/* Configure the mix ports using information from the device tree if no
+	 * parameter was passed. Otherwise, use the information in the module
+	 * parameter.
+	 */
+	bgx_mix_init_from_fdt();
+
+	return 0;
+}
+
+/**
+ * bgx_pki_ports_init() - Initialize the pki_ports variable with the lmacs that
+ *			  connect to the pki.
+ *
+ * Returns 0 if successful.
+ * Returns < 0 for error codes.
+ */
+static int bgx_pki_ports_init(void)
+{
+	int i;
+	int j;
+	int k;
+
+	for (i = 0; i < MAX_NODES; i++) {
+		for (j = 0; j < MAX_BGX_PER_NODE; j++) {
+			for (k = 0; k < MAX_LMAC_PER_BGX; k++)
+				pki_ports[i][j][k] = true;
+		}
+	}
+
+	return 0;
+}
+
+static int bgx_remove(struct platform_device *pdev)
+{
+	return 0;
+}
+
+static void bgx_shutdown(struct platform_device *pdev)
+{
+}
+
+static const struct of_device_id bgx_match[] = {
+	{
+		.compatible = "cavium,octeon-7890-bgx",
+	},
+	{},
+};
+MODULE_DEVICE_TABLE(of, bgx_match);
+
+static struct platform_driver bgx_driver = {
+	.probe		= bgx_probe,
+	.remove		= bgx_remove,
+	.shutdown       = bgx_shutdown,
+	.driver		= {
+		.owner	= THIS_MODULE,
+		.name	= KBUILD_MODNAME,
+		.of_match_table = bgx_match,
+	},
+};
+
+/* Allow bgx_port driver to force this driver to load */
+void bgx_nexus_load(void)
+{
+}
+EXPORT_SYMBOL(bgx_nexus_load);
+
+static int __init bgx_driver_init(void)
+{
+	int r;
+
+	INIT_LIST_HEAD(&pdev_list);
+	mutex_init(&pdev_list_lock);
+
+	bgx_mix_port_lmacs_init();
+	bgx_pki_ports_init();
+
+	r = platform_driver_register(&bgx_driver);
+
+	return r;
+}
+
+static void __exit bgx_driver_exit(void)
+{
+	struct pdev_list_item *pdev_item;
+
+	mutex_lock(&pdev_list_lock);
+	while (!list_empty(&pdev_list)) {
+		pdev_item = list_first_entry(&pdev_list,
+					     struct pdev_list_item, list);
+		list_del(&pdev_item->list);
+		platform_device_unregister(pdev_item->pdev);
+		kfree(pdev_item);
+	}
+	mutex_unlock(&pdev_list_lock);
+
+	platform_driver_unregister(&bgx_driver);
+}
+
+module_init(bgx_driver_init);
+module_exit(bgx_driver_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Cavium, Inc. <support@caviumnetworks.com>");
+MODULE_DESCRIPTION("Cavium, Inc. BGX MAC Nexus driver.");
diff --git a/drivers/net/ethernet/cavium/octeon/octeon3-bgx-port.c b/drivers/net/ethernet/cavium/octeon/octeon3-bgx-port.c
new file mode 100644
index 0000000..260c9e9
--- /dev/null
+++ b/drivers/net/ethernet/cavium/octeon/octeon3-bgx-port.c
@@ -0,0 +1,2003 @@
+// SPDX-License-Identifier: GPL-2.0+
+/* Copyright (c) 2017 Cavium, Inc.
+ *
+ * The BGX port (LMAC) is a single Ethernet MAC that may be assigned
+ * to either the octeon3-ethernet packet processor or the octeon_mgmt
+ * packet processor.
+ */
+#include <linux/platform_device.h>
+#include <linux/etherdevice.h>
+#include <linux/of_platform.h>
+#include <linux/of_address.h>
+#include <linux/netdevice.h>
+#include <linux/of_mdio.h>
+#include <linux/of_net.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+#include <asm/octeon/octeon.h>
+
+#include "octeon3.h"
+
+struct bgx_port_priv {
+	int node;
+	int bgx;
+	int index; /* Port index on BGX block*/
+	enum port_mode mode;
+	int pknd;
+	int qlm;
+	const u8 *mac_addr;
+	struct phy_device *phydev;
+	struct device_node *phy_np;
+	int phy_mode;
+	bool mode_1000basex;
+	bool bgx_as_phy;
+	struct net_device *netdev;
+	struct port_status (*get_link)(struct bgx_port_priv *priv);
+	int (*set_link)(struct bgx_port_priv *priv, struct port_status status);
+	struct port_status last_status;
+	struct delayed_work dwork;
+};
+
+/* lmac_pknd keeps track of the port kinds assigned to the lmacs */
+static int lmac_pknd[MAX_NODES][MAX_BGX_PER_NODE][MAX_LMAC_PER_BGX];
+
+static struct workqueue_struct *check_state_wq;
+static DEFINE_MUTEX(check_state_wq_mutex);
+
+int bgx_port_get_qlm(int node, int bgx, int index)
+{
+	int qlm = -1;
+	u64 data;
+
+	if (OCTEON_IS_MODEL(OCTEON_CN78XX)) {
+		if (bgx < 2) {
+			data = oct_csr_read(BGX_CMR_GLOBAL_CONFIG(node, bgx));
+			if (data & 1)
+				qlm = bgx + 2;
+			else
+				qlm = bgx;
+		} else {
+			qlm = bgx + 2;
+		}
+	} else if (OCTEON_IS_MODEL(OCTEON_CN73XX)) {
+		if (bgx < 2) {
+			qlm = bgx + 2;
+		} else {
+			/* Ports on bgx2 can be connected to qlm5 or qlm6 */
+			if (index < 2)
+				qlm = 5;
+			else
+				qlm = 6;
+		}
+	} else if (OCTEON_IS_MODEL(OCTEON_CNF75XX)) {
+		/* Ports on bgx0 can be connected to qlm4 or qlm5 */
+		if (index < 2)
+			qlm = 4;
+		else
+			qlm = 5;
+	}
+
+	return qlm;
+}
+EXPORT_SYMBOL(bgx_port_get_qlm);
+
+/* Returns the mode of the bgx port */
+enum port_mode bgx_port_get_mode(int node, int bgx, int index)
+{
+	enum port_mode mode;
+	u64 data;
+
+	data = oct_csr_read(BGX_CMR_CONFIG(node, bgx, index));
+
+	switch ((data >> 8) & 7) {
+	case 0:
+		mode = PORT_MODE_SGMII;
+		break;
+	case 1:
+		mode = PORT_MODE_XAUI;
+		break;
+	case 2:
+		mode = PORT_MODE_RXAUI;
+		break;
+	case 3:
+		data = oct_csr_read(BGX_SPU_BR_PMD_CONTROL(node, bgx, index));
+		/* The use of training differentiates 10G_KR from xfi */
+		if (data & BIT(1))
+			mode = PORT_MODE_10G_KR;
+		else
+			mode = PORT_MODE_XFI;
+		break;
+	case 4:
+		data = oct_csr_read(BGX_SPU_BR_PMD_CONTROL(node, bgx, index));
+		/* The use of training differentiates 40G_KR4 from xlaui */
+		if (data & BIT(1))
+			mode = PORT_MODE_40G_KR4;
+		else
+			mode = PORT_MODE_XLAUI;
+		break;
+	case 5:
+		mode = PORT_MODE_RGMII;
+		break;
+	default:
+		mode = PORT_MODE_DISABLED;
+		break;
+	}
+
+	return mode;
+}
+EXPORT_SYMBOL(bgx_port_get_mode);
+
+int bgx_port_allocate_pknd(int node)
+{
+	struct global_resource_tag tag;
+	char buf[16];
+	int pknd;
+
+	strncpy((char *)&tag.lo, "cvm_pknd", 8);
+	snprintf(buf, 16, "_%d......", node);
+	memcpy(&tag.hi, buf, 8);
+
+	res_mgr_create_resource(tag, 64);
+	pknd = res_mgr_alloc(tag, -1, false);
+	if (pknd < 0) {
+		pr_err("bgx-port: Failed to allocate pknd\n");
+		return -ENODEV;
+	}
+
+	return pknd;
+}
+EXPORT_SYMBOL(bgx_port_allocate_pknd);
+
+int bgx_port_get_pknd(int node, int bgx, int index)
+{
+	return lmac_pknd[node][bgx][index];
+}
+EXPORT_SYMBOL(bgx_port_get_pknd);
+
+/* GSER-20075 */
+static void bgx_port_gser_20075(struct bgx_port_priv *priv, int qlm, int lane)
+{
+	u64 addr;
+	u64 data;
+
+	if (OCTEON_IS_MODEL(OCTEON_CN78XX_PASS1_X) &&
+	    (lane == -1 || lane == 3)) {
+		/* Enable software control */
+		addr = GSER_BR_RX_CTL(priv->node, qlm, 3);
+		data = oct_csr_read(addr);
+		data |= BIT(2);
+		oct_csr_write(data, addr);
+
+		/* Clear the completion flag */
+		addr = GSER_BR_RX_EER(priv->node, qlm, 3);
+		data = oct_csr_read(addr);
+		data &= ~BIT(14);
+		oct_csr_write(data, addr);
+
+		/* Initiate a new request on lane 2 */
+		if (lane == 3) {
+			addr = GSER_BR_RX_EER(priv->node, qlm, 2);
+			data = oct_csr_read(addr);
+			data |= BIT(15);
+			oct_csr_write(data, addr);
+		}
+	}
+}
+
+static void bgx_common_init_pknd(struct bgx_port_priv *priv)
+{
+	int num_ports;
+	u64 data;
+
+	/* Setup pkind */
+	priv->pknd = bgx_port_allocate_pknd(priv->node);
+	lmac_pknd[priv->node][priv->bgx][priv->index] = priv->pknd;
+	data = oct_csr_read(BGX_CMR_RX_ID_MAP(priv->node, priv->bgx, priv->index));
+	data &= ~GENMASK_ULL(7, 0);
+	data |= priv->pknd;
+	if (OCTEON_IS_MODEL(OCTEON_CN73XX)) {
+		/* Change the default reassembly id (max allowed is 14) */
+		data &= ~GENMASK_ULL(14, 8);
+		data |= ((4 * priv->bgx) + 2 + priv->index) << 8;
+	}
+	oct_csr_write(data, BGX_CMR_RX_ID_MAP(priv->node, priv->bgx, priv->index));
+
+	/* Set backpressure channel mask AND/OR registers */
+	data = oct_csr_read(BGX_CMR_CHAN_MSK_AND(priv->node, priv->bgx));
+	data |= 0xffff << (16 * priv->index);
+	oct_csr_write(data, BGX_CMR_CHAN_MSK_AND(priv->node, priv->bgx));
+
+	data = oct_csr_read(BGX_CMR_CHAN_MSK_OR(priv->node, priv->bgx));
+	data |= 0xffff << (16 * priv->index);
+	oct_csr_write(data, BGX_CMR_CHAN_MSK_OR(priv->node, priv->bgx));
+
+	/* Rx back pressure watermark:
+	 * Set to 1/4 of the available lmacs buffer (in multiple of 16 bytes)
+	 */
+	data = oct_csr_read(BGX_CMR_TX_LMACS(priv->node, priv->bgx));
+	num_ports = data & 7;
+	data = BGX_RX_FIFO_SIZE / (num_ports * 4 * 16);
+	oct_csr_write(data, BGX_CMR_RX_BP_ON(priv->node, priv->bgx, priv->index));
+}
+
+static int bgx_xgmii_hardware_init(struct bgx_port_priv *priv)
+{
+	u64 clock_mhz;
+	u64 data;
+	u64 ctl;
+
+	/* Set TX Threshold */
+	data = 0x20;
+	oct_csr_write(data, BGX_GMP_GMI_TX_THRESH(priv->node, priv->bgx, priv->index));
+
+	data = oct_csr_read(BGX_GMP_PCS_MISC_CTL(priv->node, priv->bgx, priv->index));
+	data &= ~(BIT(8) | BIT(9));
+	if (priv->mode_1000basex)
+		data |= BIT(8);
+	if (priv->bgx_as_phy)
+		data |= BIT(9);
+	oct_csr_write(data, BGX_GMP_PCS_MISC_CTL(priv->node, priv->bgx, priv->index));
+
+	data = oct_csr_read(BGX_GMP_PCS_LINK_TIMER(priv->node, priv->bgx, priv->index));
+	clock_mhz = octeon_get_io_clock_rate() / 1000000;
+	if (priv->mode_1000basex)
+		data = (10000ull * clock_mhz) >> 10;
+	else
+		data = (1600ull * clock_mhz) >> 10;
+	oct_csr_write(data, BGX_GMP_PCS_LINK_TIMER(priv->node, priv->bgx, priv->index));
+
+	if (priv->mode_1000basex) {
+		data = oct_csr_read(BGX_GMP_PCS_AN_ADV(priv->node, priv->bgx, priv->index));
+		data &= ~(GENMASK_ULL(13, 12) | GENMASK_ULL(8, 7));
+		data |= 3 << 7;
+		data |= BIT(6) | BIT(5);
+		oct_csr_write(data, BGX_GMP_PCS_AN_ADV(priv->node, priv->bgx, priv->index));
+	} else if (priv->bgx_as_phy) {
+		data = oct_csr_read(BGX_GMP_PCS_SGM_AN_ADV(priv->node, priv->bgx, priv->index));
+		data |= BIT(12);
+		data &= ~(GENMASK_ULL(11, 10));
+		data |= 2 << 10;
+		oct_csr_write(data, BGX_GMP_PCS_SGM_AN_ADV(priv->node, priv->bgx, priv->index));
+	}
+
+	data = oct_csr_read(BGX_GMP_GMI_TX_APPEND(priv->node, priv->bgx, priv->index));
+	ctl = oct_csr_read(BGX_GMP_GMI_TX_SGMII_CTL(priv->node, priv->bgx, priv->index));
+	ctl &= ~BIT(0);
+	ctl |= (data & BIT(0)) ? 0 : 1;
+	oct_csr_write(ctl, BGX_GMP_GMI_TX_SGMII_CTL(priv->node, priv->bgx, priv->index));
+
+	if (priv->mode == PORT_MODE_RGMII) {
+		/* Disable XCV interface when initialized */
+		data = oct_csr_read(XCV_RESET(priv->node));
+		data &= ~(BIT(63) | BIT(3) | BIT(1));
+		oct_csr_write(data, XCV_RESET(priv->node));
+	}
+
+	return 0;
+}
+
+int bgx_get_tx_fifo_size(struct bgx_port_priv *priv)
+{
+	u64 data;
+	int num_ports;
+
+	data = oct_csr_read(BGX_CMR_TX_LMACS(priv->node, priv->bgx));
+	num_ports = data & 7;
+
+	switch (num_ports) {
+	case 1:
+		return BGX_TX_FIFO_SIZE;
+	case 2:
+		return BGX_TX_FIFO_SIZE / 2;
+	case 3:
+	case 4:
+		return BGX_TX_FIFO_SIZE / 4;
+	default:
+		return 0;
+	}
+}
+
+static int bgx_xaui_hardware_init(struct bgx_port_priv *priv)
+{
+	u64 tx_fifo_size;
+	u64 clock_mhz;
+	u64 data;
+
+	if (octeon_is_simulation()) {
+		/* Enable the port */
+		data = oct_csr_read(BGX_CMR_CONFIG(priv->node, priv->bgx, priv->index));
+		data |= BIT(15);
+		oct_csr_write(data, BGX_CMR_CONFIG(priv->node, priv->bgx, priv->index));
+	} else {
+		/* Reset the port */
+		data = oct_csr_read(BGX_SPU_CONTROL1(priv->node, priv->bgx, priv->index));
+		data |= BIT(15);
+		oct_csr_write(data, BGX_SPU_CONTROL1(priv->node, priv->bgx, priv->index));
+
+		/* Wait for reset to complete */
+		udelay(1);
+		data = oct_csr_read(BGX_SPU_CONTROL1(priv->node, priv->bgx, priv->index));
+		if (data & BIT(15)) {
+			netdev_err(priv->netdev,
+				   "BGX%d:%d: SPU stuck in reset\n", priv->bgx, priv->node);
+			return -1;
+		}
+
+		/* Reset the SerDes lanes */
+		data = oct_csr_read(BGX_SPU_CONTROL1(priv->node, priv->bgx, priv->index));
+		data |= BIT(11);
+		oct_csr_write(data, BGX_SPU_CONTROL1(priv->node, priv->bgx, priv->index));
+
+		/* Disable packet reception */
+		data = oct_csr_read(BGX_SPU_MISC_CONTROL(priv->node, priv->bgx, priv->index));
+		data |= BIT(12);
+		oct_csr_write(data, BGX_SPU_MISC_CONTROL(priv->node, priv->bgx, priv->index));
+
+		/* Clear/disable interrupts */
+		data = oct_csr_read(BGX_SMU_RX_INT(priv->node, priv->bgx, priv->index));
+		oct_csr_write(data, BGX_SMU_RX_INT(priv->node, priv->bgx, priv->index));
+		data = oct_csr_read(BGX_SMU_TX_INT(priv->node, priv->bgx, priv->index));
+		oct_csr_write(data, BGX_SMU_TX_INT(priv->node, priv->bgx, priv->index));
+		data = oct_csr_read(BGX_SPU_INT(priv->node, priv->bgx, priv->index));
+		oct_csr_write(data, BGX_SPU_INT(priv->node, priv->bgx, priv->index));
+
+		if ((priv->mode == PORT_MODE_10G_KR ||
+		     priv->mode == PORT_MODE_40G_KR4) &&
+		    !OCTEON_IS_MODEL(OCTEON_CN78XX_PASS1_X)) {
+			oct_csr_write(0, BGX_SPU_BR_PMD_LP_CUP(priv->node, priv->bgx, priv->index));
+			oct_csr_write(0, BGX_SPU_BR_PMD_LD_CUP(priv->node, priv->bgx, priv->index));
+			oct_csr_write(0, BGX_SPU_BR_PMD_LD_REP(priv->node, priv->bgx, priv->index));
+			data = oct_csr_read(BGX_SPU_BR_PMD_CONTROL(priv->node, priv->bgx, priv->index));
+			data |= BIT(1);
+			oct_csr_write(data, BGX_SPU_BR_PMD_CONTROL(priv->node, priv->bgx, priv->index));
+		}
+	}
+
+	data = oct_csr_read(BGX_SMU_TX_APPEND(priv->node, priv->bgx, priv->index));
+	data |= BIT(3);
+	oct_csr_write(data, BGX_SMU_TX_APPEND(priv->node, priv->bgx, priv->index));
+
+	if (!octeon_is_simulation()) {
+		/* Disable fec */
+		data = oct_csr_read(BGX_SPU_FEC_CONTROL(priv->node, priv->bgx, priv->index));
+		data &= ~BIT(0);
+		oct_csr_write(data, BGX_SPU_FEC_CONTROL(priv->node, priv->bgx, priv->index));
+
+		/* Disable/configure auto negotiation */
+		data = oct_csr_read(BGX_SPU_AN_CONTROL(priv->node, priv->bgx, priv->index));
+		data &= ~(BIT(13) | BIT(12));
+		oct_csr_write(data, BGX_SPU_AN_CONTROL(priv->node, priv->bgx, priv->index));
+
+		data = oct_csr_read(BGX_SPU_AN_ADV(priv->node, priv->bgx, priv->index));
+		data &= ~(BIT(47) | BIT(26) | BIT(25) | BIT(22) | BIT(21) |
+			  BIT(13) | BIT(12));
+		data |= BIT(46);
+		if (priv->mode == PORT_MODE_40G_KR4)
+			data |= BIT(24);
+		else
+			data &= ~BIT(24);
+		if (priv->mode == PORT_MODE_10G_KR)
+			data |= BIT(23);
+		else
+			data &= ~BIT(23);
+		oct_csr_write(data, BGX_SPU_AN_ADV(priv->node, priv->bgx, priv->index));
+
+		data = oct_csr_read(BGX_SPU_DBG_CONTROL(priv->node, priv->bgx));
+		data |= BIT(29);
+		if (priv->mode == PORT_MODE_10G_KR ||
+		    priv->mode == PORT_MODE_40G_KR4)
+			data |= BIT(18);
+		else
+			data &= ~BIT(18);
+		oct_csr_write(data, BGX_SPU_DBG_CONTROL(priv->node, priv->bgx));
+
+		/* Enable the port */
+		data = oct_csr_read(BGX_CMR_CONFIG(priv->node, priv->bgx, priv->index));
+		data |= BIT(15);
+		oct_csr_write(data, BGX_CMR_CONFIG(priv->node, priv->bgx, priv->index));
+
+		if (OCTEON_IS_MODEL(OCTEON_CN78XX_PASS1_X) && priv->index) {
+			/* BGX-22429 */
+			data = oct_csr_read(BGX_CMR_CONFIG(priv->node, priv->bgx, 0));
+			data |= BIT(15);
+			oct_csr_write(data, BGX_CMR_CONFIG(priv->node, priv->bgx, 0));
+		}
+	}
+
+	data = oct_csr_read(BGX_SPU_CONTROL1(priv->node, priv->bgx, priv->index));
+	data &= ~BIT(11);
+	oct_csr_write(data, BGX_SPU_CONTROL1(priv->node, priv->bgx, priv->index));
+
+	data = oct_csr_read(BGX_SMU_TX_CTL(priv->node, priv->bgx, priv->index));
+	data |= BIT(0);
+	data &= ~BIT(1);
+	oct_csr_write(data, BGX_SMU_TX_CTL(priv->node, priv->bgx, priv->index));
+
+	clock_mhz = octeon_get_io_clock_rate() / 1000000;
+	data = oct_csr_read(BGX_SPU_DBG_CONTROL(priv->node, priv->bgx));
+	data &= ~GENMASK_ULL(43, 32);
+	data |= (clock_mhz - 1) << 32;
+	oct_csr_write(data, BGX_SPU_DBG_CONTROL(priv->node, priv->bgx));
+
+	/* Fifo in 16-byte words */
+	tx_fifo_size = bgx_get_tx_fifo_size(priv);
+	tx_fifo_size >>= 4;
+	oct_csr_write(tx_fifo_size - 10, BGX_SMU_TX_THRESH(priv->node, priv->bgx, priv->index));
+
+	if (priv->mode == PORT_MODE_RXAUI && priv->phy_np) {
+		data = oct_csr_read(BGX_SPU_MISC_CONTROL(priv->node, priv->bgx, priv->index));
+		data |= BIT(10);
+		oct_csr_write(data, BGX_SPU_MISC_CONTROL(priv->node, priv->bgx, priv->index));
+	}
+
+	/* Some PHYs take up to 250ms to stabilize */
+	if (!octeon_is_simulation())
+		usleep_range(250000, 300000);
+
+	return 0;
+}
+
+/* Configure/initialize a bgx port. */
+static int bgx_port_init(struct bgx_port_priv *priv)
+{
+	int rc = 0;
+	u64 data;
+
+	/* GSER-20956 */
+	if (OCTEON_IS_MODEL(OCTEON_CN78XX_PASS1_X) &&
+	    (priv->mode == PORT_MODE_10G_KR ||
+	     priv->mode == PORT_MODE_XFI ||
+	     priv->mode == PORT_MODE_40G_KR4 ||
+	     priv->mode == PORT_MODE_XLAUI)) {
+		/* Disable link training */
+		data = oct_csr_read(BGX_SPU_BR_PMD_CONTROL(priv->node, priv->bgx, priv->index));
+		data &= ~(1 << 1);
+		oct_csr_write(data, BGX_SPU_BR_PMD_CONTROL(priv->node, priv->bgx, priv->index));
+	}
+
+	bgx_common_init_pknd(priv);
+
+	if (priv->mode == PORT_MODE_SGMII ||
+	    priv->mode == PORT_MODE_RGMII)
+		rc = bgx_xgmii_hardware_init(priv);
+	else
+		rc = bgx_xaui_hardware_init(priv);
+
+	return rc;
+}
+
+static int bgx_port_get_qlm_speed(struct bgx_port_priv *priv, int qlm)
+{
+	enum lane_mode lmode;
+	u64 data;
+
+	data = oct_csr_read(GSER_LANE_MODE(priv->node, qlm));
+	lmode = data & 0xf;
+
+	switch (lmode) {
+	case R_25G_REFCLK100:
+		return 2500;
+	case R_5G_REFCLK100:
+		return 5000;
+	case R_8G_REFCLK100:
+		return 8000;
+	case R_125G_REFCLK15625_KX:
+		return 1250;
+	case R_3125G_REFCLK15625_XAUI:
+		return 3125;
+	case R_103125G_REFCLK15625_KR:
+		return 10312;
+	case R_125G_REFCLK15625_SGMII:
+		return 1250;
+	case R_5G_REFCLK15625_QSGMII:
+		return 5000;
+	case R_625G_REFCLK15625_RXAUI:
+		return 6250;
+	case R_25G_REFCLK125:
+		return 2500;
+	case R_5G_REFCLK125:
+		return 5000;
+	case R_8G_REFCLK125:
+		return 8000;
+	default:
+		return 0;
+	}
+}
+
+static struct port_status bgx_port_get_sgmii_link(struct bgx_port_priv *priv)
+{
+	struct port_status status;
+	int speed;
+
+	/* The simulator always uses a 1Gbps full duplex port */
+	if (octeon_is_simulation()) {
+		status.link = 1;
+		status.duplex = DUPLEX_FULL;
+		status.speed = 1000;
+	} else {
+		/* Use the qlm speed */
+		speed = bgx_port_get_qlm_speed(priv, priv->qlm);
+		status.link = 1;
+		status.duplex = DUPLEX_FULL;
+		status.speed = speed * 8 / 10;
+	}
+
+	return status;
+}
+
+static int bgx_port_xgmii_set_link_up(struct bgx_port_priv *priv)
+{
+	u64 data;
+	int timeout;
+
+	if (!octeon_is_simulation()) {
+		/* PCS reset sequence */
+		data = oct_csr_read(BGX_GMP_PCS_MR_CONTROL(priv->node, priv->bgx, priv->index));
+		data |= BIT(15);
+		oct_csr_write(data, BGX_GMP_PCS_MR_CONTROL(priv->node, priv->bgx, priv->index));
+
+		/* Wait for reset to complete */
+		udelay(1);
+		data = oct_csr_read(BGX_GMP_PCS_MR_CONTROL(priv->node, priv->bgx, priv->index));
+		if (data & BIT(15)) {
+			netdev_err(priv->netdev,
+				   "BGX%d:%d: PCS stuck in reset\n", priv->bgx, priv->node);
+			return -1;
+		}
+	}
+
+	/* Autonegotiation */
+	if (priv->phy_np) {
+		data = oct_csr_read(BGX_GMP_PCS_MR_CONTROL(priv->node, priv->bgx, priv->index));
+		data |= BIT(9);
+		if (priv->mode != PORT_MODE_RGMII)
+			data |= BIT(12);
+		else
+			data &= ~BIT(12);
+		data &= ~BIT(11);
+		oct_csr_write(data, BGX_GMP_PCS_MR_CONTROL(priv->node, priv->bgx, priv->index));
+	} else {
+		data = oct_csr_read(BGX_GMP_PCS_MR_CONTROL(priv->node, priv->bgx, priv->index));
+		data |= BIT(6);
+		data &= ~(BIT(13) | BIT(12) | BIT(11));
+		oct_csr_write(data, BGX_GMP_PCS_MR_CONTROL(priv->node, priv->bgx, priv->index));
+	}
+
+	data = oct_csr_read(BGX_GMP_PCS_MISC_CTL(priv->node, priv->bgx, priv->index));
+	data &= ~(BIT(9) | BIT(8));
+	if (priv->mode_1000basex)
+		data |= BIT(8);
+	if (priv->bgx_as_phy)
+		data |= BIT(9);
+	oct_csr_write(data, BGX_GMP_PCS_MISC_CTL(priv->node, priv->bgx, priv->index));
+
+	/* Wait for autonegotiation to complete */
+	if (!octeon_is_simulation() && !priv->bgx_as_phy &&
+	    priv->mode != PORT_MODE_RGMII) {
+		timeout = 10000;
+		do {
+			data = oct_csr_read(BGX_GMP_PCS_MR_STATUS(priv->node, priv->bgx, priv->index));
+			if (data & BIT(5))
+				break;
+			timeout--;
+			udelay(1);
+		} while (timeout);
+		if (!timeout) {
+			netdev_err(priv->netdev, "BGX%d:%d: AN timeout\n", priv->bgx, priv->node);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+static void bgx_port_rgmii_set_link_down(struct bgx_port_priv *priv)
+{
+	int rx_fifo_len;
+	u64 data;
+
+	data = oct_csr_read(XCV_RESET(priv->node));
+	data &= ~BIT(1);
+	oct_csr_write(data, XCV_RESET(priv->node));
+	/* Is this read really needed? TODO */
+	data = oct_csr_read(XCV_RESET(priv->node));
+
+	/* Wait for 2 MTUs */
+	mdelay(10);
+
+	data = oct_csr_read(BGX_CMR_CONFIG(priv->node, priv->bgx, priv->index));
+	data &= ~BIT(14);
+	oct_csr_write(data, BGX_CMR_CONFIG(priv->node, priv->bgx, priv->index));
+
+	/* Wait for the rx and tx fifos to drain */
+	do {
+		data = oct_csr_read(BGX_CMR_RX_FIFO_LEN(priv->node, priv->bgx, priv->index));
+		rx_fifo_len = data & 0x1fff;
+		data = oct_csr_read(BGX_CMR_TX_FIFO_LEN(priv->node, priv->bgx, priv->index));
+	} while (rx_fifo_len > 0 || !(data & BIT(13)));
+
+	data = oct_csr_read(BGX_CMR_CONFIG(priv->node, priv->bgx, priv->index));
+	data &= ~BIT(13);
+	oct_csr_write(data, BGX_CMR_CONFIG(priv->node, priv->bgx, priv->index));
+
+	data = oct_csr_read(XCV_RESET(priv->node));
+	data &= ~BIT(3);
+	oct_csr_write(data, XCV_RESET(priv->node));
+
+	data = oct_csr_read(BGX_GMP_PCS_MR_CONTROL(priv->node, priv->bgx, priv->index));
+	data |= BIT(11);
+	oct_csr_write(data, BGX_GMP_PCS_MR_CONTROL(priv->node, priv->bgx, priv->index));
+}
+
+static void bgx_port_sgmii_set_link_down(struct bgx_port_priv *priv)
+{
+	u64 data;
+
+	data = oct_csr_read(BGX_CMR_CONFIG(priv->node, priv->bgx, priv->index));
+	data &= ~(BIT(14) | BIT(13));
+	oct_csr_write(data, BGX_CMR_CONFIG(priv->node, priv->bgx, priv->index));
+
+	data = oct_csr_read(BGX_GMP_PCS_MR_CONTROL(priv->node, priv->bgx, priv->index));
+	data &= ~BIT(12);
+	oct_csr_write(data, BGX_GMP_PCS_MR_CONTROL(priv->node, priv->bgx, priv->index));
+
+	data = oct_csr_read(BGX_GMP_PCS_MISC_CTL(priv->node, priv->bgx, priv->index));
+	data |= BIT(11);
+	oct_csr_write(data, BGX_GMP_PCS_MISC_CTL(priv->node, priv->bgx, priv->index));
+	data = oct_csr_read(BGX_GMP_PCS_MISC_CTL(priv->node, priv->bgx, priv->index));
+}
+
+static int bgx_port_sgmii_set_link_speed(struct bgx_port_priv *priv,
+					 struct port_status status)
+{
+	int timeout;
+	u64 miscx;
+	u64 data;
+	u64 prtx;
+
+	data = oct_csr_read(BGX_CMR_CONFIG(priv->node, priv->bgx, priv->index));
+	data &= ~(BIT(14) | BIT(13));
+	oct_csr_write(data, BGX_CMR_CONFIG(priv->node, priv->bgx, priv->index));
+
+	timeout = 10000;
+	do {
+		prtx = oct_csr_read(BGX_GMP_GMI_PRT_CFG(priv->node, priv->bgx, priv->index));
+		if (prtx & BIT(13) && prtx & BIT(12))
+			break;
+		timeout--;
+		udelay(1);
+	} while (timeout);
+	if (!timeout) {
+		netdev_err(priv->netdev, "BGX%d:%d: GMP idle timeout\n",
+			   priv->bgx, priv->node);
+		return -1;
+	}
+
+	prtx = oct_csr_read(BGX_GMP_GMI_PRT_CFG(priv->node, priv->bgx, priv->index));
+	miscx = oct_csr_read(BGX_GMP_PCS_MISC_CTL(priv->node, priv->bgx, priv->index));
+	if (status.link) {
+		miscx &= ~BIT(11);
+		if (status.duplex == DUPLEX_FULL)
+			prtx |= BIT(2);
+		else
+			prtx &= ~BIT(2);
+	} else {
+		miscx |= BIT(11);
+	}
+
+	switch (status.speed) {
+	case 10:
+		prtx &= ~(BIT(3) | BIT(1));
+		prtx |= BIT(8);
+		miscx &= ~GENMASK_ULL(6, 0);
+		miscx |= 25;
+		oct_csr_write(64, BGX_GMP_GMI_TX_SLOT(priv->node, priv->bgx, priv->index));
+		oct_csr_write(0, BGX_GMP_GMI_TX_BURST(priv->node, priv->bgx, priv->index));
+		break;
+	case 100:
+		prtx &= ~(BIT(8) | BIT(3) | BIT(1));
+		miscx &= ~GENMASK_ULL(6, 0);
+		miscx |= 5;
+		oct_csr_write(64, BGX_GMP_GMI_TX_SLOT(priv->node, priv->bgx, priv->index));
+		oct_csr_write(0, BGX_GMP_GMI_TX_BURST(priv->node, priv->bgx, priv->index));
+		break;
+	case 1000:
+		prtx |= (BIT(3) | BIT(1));
+		prtx &= ~BIT(8);
+		miscx &= ~GENMASK_ULL(6, 0);
+		miscx |= 1;
+		oct_csr_write(512, BGX_GMP_GMI_TX_SLOT(priv->node, priv->bgx, priv->index));
+		if (status.duplex == DUPLEX_FULL)
+			oct_csr_write(0, BGX_GMP_GMI_TX_BURST(priv->node, priv->bgx, priv->index));
+		else
+			oct_csr_write(8192, BGX_GMP_GMI_TX_BURST(priv->node, priv->bgx, priv->index));
+		break;
+	default:
+		break;
+	}
+
+	oct_csr_write(miscx, BGX_GMP_PCS_MISC_CTL(priv->node, priv->bgx, priv->index));
+	oct_csr_write(prtx, BGX_GMP_GMI_PRT_CFG(priv->node, priv->bgx, priv->index));
+	/* This read verifies the write completed */
+	prtx = oct_csr_read(BGX_GMP_GMI_PRT_CFG(priv->node, priv->bgx, priv->index));
+
+	data = oct_csr_read(BGX_CMR_CONFIG(priv->node, priv->bgx, priv->index));
+	data |= (BIT(14) | BIT(13));
+	oct_csr_write(data, BGX_CMR_CONFIG(priv->node, priv->bgx, priv->index));
+
+	return 0;
+}
+
+static int bgx_port_rgmii_set_link_speed(struct bgx_port_priv *priv,
+					 struct port_status status)
+{
+	bool speed_changed = false;
+	bool int_lpbk = false;
+	bool do_credits;
+	int speed;
+	u64 data;
+
+	switch (status.speed) {
+	case 10:
+		speed = 0;
+		break;
+	case 100:
+		speed = 1;
+		break;
+	case 1000:
+	default:
+		speed = 2;
+		break;
+	}
+
+	/* Do credits if link came up */
+	data = oct_csr_read(XCV_RESET(priv->node));
+	do_credits = status.link && !(data & BIT(63));
+
+	/* Was there a speed change */
+	data = oct_csr_read(XCV_CTL(priv->node));
+	if ((data & GENMASK_ULL(1, 0)) != speed)
+		speed_changed = true;
+
+	/* Clear clkrst when in internal loopback */
+	if (data & BIT(2)) {
+		int_lpbk = true;
+		data = oct_csr_read(XCV_RESET(priv->node));
+		data &= ~BIT(15);
+		oct_csr_write(data, XCV_RESET(priv->node));
+	}
+
+	/* Link came up or there was a speed change */
+	data = oct_csr_read(XCV_RESET(priv->node));
+	if (status.link && (!(data & BIT(63)) || speed_changed)) {
+		data |= BIT(63);
+		oct_csr_write(data, XCV_RESET(priv->node));
+
+		data = oct_csr_read(XCV_CTL(priv->node));
+		data &= ~GENMASK_ULL(1, 0);
+		data |= speed;
+		oct_csr_write(data, XCV_CTL(priv->node));
+
+		data = oct_csr_read(XCV_DLL_CTL(priv->node));
+		data |= BIT(23);
+		data &= ~GENMASK_ULL(22, 16);
+		data &= ~BIT(15);
+		oct_csr_write(data, XCV_DLL_CTL(priv->node));
+
+		data = oct_csr_read(XCV_DLL_CTL(priv->node));
+		data &= ~GENMASK_ULL(1, 0);
+		oct_csr_write(data, XCV_DLL_CTL(priv->node));
+
+		data = oct_csr_read(XCV_RESET(priv->node));
+		data &= ~BIT(11);
+		oct_csr_write(data, XCV_RESET(priv->node));
+
+		usleep_range(10, 100);
+
+		data = oct_csr_read(XCV_COMP_CTL(priv->node));
+		data &= ~BIT(63);
+		oct_csr_write(data, XCV_COMP_CTL(priv->node));
+
+		data = oct_csr_read(XCV_RESET(priv->node));
+		data |= BIT(7);
+		oct_csr_write(data, XCV_RESET(priv->node));
+
+		data = oct_csr_read(XCV_RESET(priv->node));
+		if (int_lpbk)
+			data &= ~BIT(15);
+		else
+			data |= BIT(15);
+		oct_csr_write(data, XCV_RESET(priv->node));
+
+		data = oct_csr_read(XCV_RESET(priv->node));
+		data |= BIT(2) | BIT(0);
+		oct_csr_write(data, XCV_RESET(priv->node));
+	}
+
+	data = oct_csr_read(XCV_RESET(priv->node));
+	if (status.link)
+		data |= BIT(3) | BIT(1);
+	else
+		data &= ~(BIT(3) | BIT(1));
+	oct_csr_write(data, XCV_RESET(priv->node));
+
+	if (!status.link) {
+		mdelay(10);
+		oct_csr_write(0, XCV_RESET(priv->node));
+	}
+
+	/* Grant pko tx credits */
+	if (do_credits) {
+		data = oct_csr_read(XCV_BATCH_CRD_RET(priv->node));
+		data |= BIT(0);
+		oct_csr_write(data, XCV_BATCH_CRD_RET(priv->node));
+	}
+
+	return 0;
+}
+
+static int bgx_port_set_xgmii_link(struct bgx_port_priv *priv,
+				   struct port_status status)
+{
+	int rc = 0;
+	u64 data;
+
+	if (status.link) {
+		/* Link up */
+		data = oct_csr_read(BGX_CMR_CONFIG(priv->node, priv->bgx, priv->index));
+		data |= BIT(15);
+		oct_csr_write(data, BGX_CMR_CONFIG(priv->node, priv->bgx, priv->index));
+
+		/* BGX-22429 */
+		if (OCTEON_IS_MODEL(OCTEON_CN78XX_PASS1_X) && priv->index) {
+			data = oct_csr_read(BGX_CMR_CONFIG(priv->node, priv->bgx, 0));
+			data |= BIT(15);
+			oct_csr_write(data, BGX_CMR_CONFIG(priv->node, priv->bgx, 0));
+		}
+
+		rc = bgx_port_xgmii_set_link_up(priv);
+		if (rc)
+			return rc;
+		rc = bgx_port_sgmii_set_link_speed(priv, status);
+		if (rc)
+			return rc;
+		if (priv->mode == PORT_MODE_RGMII)
+			rc = bgx_port_rgmii_set_link_speed(priv, status);
+	} else {
+		/* Link down */
+		if (priv->mode == PORT_MODE_RGMII) {
+			bgx_port_rgmii_set_link_down(priv);
+			rc = bgx_port_sgmii_set_link_speed(priv, status);
+			if (rc)
+				return rc;
+			rc = bgx_port_rgmii_set_link_speed(priv, status);
+		} else {
+			bgx_port_sgmii_set_link_down(priv);
+		}
+	}
+
+	return rc;
+}
+
+static struct port_status bgx_port_get_xaui_link(struct bgx_port_priv *priv)
+{
+	struct port_status status;
+	int lanes;
+	int speed;
+	u64 data;
+
+	status.link = 0;
+	status.duplex = DUPLEX_HALF;
+	status.speed = 0;
+
+	/* Get the link state */
+	data = oct_csr_read(BGX_SMU_TX_CTL(priv->node, priv->bgx, priv->index));
+	data &= GENMASK_ULL(5, 4);
+	if (!data) {
+		data = oct_csr_read(BGX_SMU_RX_CTL(priv->node, priv->bgx, priv->index));
+		data &= GENMASK_ULL(1, 0);
+		if (!data) {
+			data = oct_csr_read(BGX_SPU_STATUS1(priv->node, priv->bgx, priv->index));
+			if (data & BIT(2))
+				status.link = 1;
+		}
+	}
+
+	if (status.link) {
+		/* Always full duplex */
+		status.duplex = DUPLEX_FULL;
+
+		/* Speed */
+		speed = bgx_port_get_qlm_speed(priv, priv->qlm);
+		data = oct_csr_read(BGX_CMR_CONFIG(priv->node, priv->bgx, priv->index));
+		switch ((data >> 8) & 7) {
+		default:
+		case 1:
+			speed = (speed * 8 + 5) / 10;
+			lanes = 4;
+			break;
+		case 2:
+			speed = (speed * 8 + 5) / 10;
+			lanes = 2;
+			break;
+		case 3:
+			speed = (speed * 64 + 33) / 66;
+			lanes = 1;
+			break;
+		case 4:
+			if (speed == 6250)
+				speed = 6445;
+			speed = (speed * 64 + 33) / 66;
+			lanes = 4;
+			break;
+		}
+
+		speed *= lanes;
+		status.speed = speed;
+	}
+
+	return status;
+}
+
+static int bgx_port_init_xaui_an(struct bgx_port_priv *priv)
+{
+	u64 data;
+
+	if (OCTEON_IS_MODEL(OCTEON_CN78XX_PASS1_X)) {
+		data = oct_csr_read(BGX_SPU_INT(priv->node, priv->bgx, priv->index));
+		/* If autonegotiation is no good */
+		if (!(data & BIT(11))) {
+			data = BIT(12) | BIT(11) | BIT(10);
+			oct_csr_write(data, BGX_SPU_INT(priv->node, priv->bgx, priv->index));
+
+			data = oct_csr_read(BGX_SPU_AN_CONTROL(priv->node, priv->bgx, priv->index));
+			data |= BIT(9);
+			oct_csr_write(data, BGX_SPU_AN_CONTROL(priv->node, priv->bgx, priv->index));
+			return -1;
+		}
+	} else {
+		data = oct_csr_read(BGX_SPU_AN_STATUS(priv->node, priv->bgx, priv->index));
+		/* If autonegotiation hasn't completed */
+		if (!(data & BIT(5))) {
+			data = oct_csr_read(BGX_SPU_AN_CONTROL(priv->node, priv->bgx, priv->index));
+			data |= BIT(9);
+			oct_csr_write(data, BGX_SPU_AN_CONTROL(priv->node, priv->bgx, priv->index));
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+static void bgx_port_xaui_start_training(struct bgx_port_priv *priv)
+{
+	u64 data;
+
+	data = BIT(14) | BIT(13);
+	oct_csr_write(data, BGX_SPU_INT(priv->node, priv->bgx, priv->index));
+
+	/* BGX-20968 */
+	oct_csr_write(0, BGX_SPU_BR_PMD_LP_CUP(priv->node, priv->bgx, priv->index));
+	oct_csr_write(0, BGX_SPU_BR_PMD_LD_CUP(priv->node, priv->bgx, priv->index));
+	oct_csr_write(0, BGX_SPU_BR_PMD_LD_REP(priv->node, priv->bgx, priv->index));
+	data = oct_csr_read(BGX_SPU_AN_CONTROL(priv->node, priv->bgx, priv->index));
+	data &= ~BIT(12);
+	oct_csr_write(data, BGX_SPU_AN_CONTROL(priv->node, priv->bgx, priv->index));
+	udelay(1);
+
+	data = oct_csr_read(BGX_SPU_BR_PMD_CONTROL(priv->node, priv->bgx, priv->index));
+	data |= BIT(1);
+	oct_csr_write(data, BGX_SPU_BR_PMD_CONTROL(priv->node, priv->bgx, priv->index));
+	udelay(1);
+
+	data = oct_csr_read(BGX_SPU_BR_PMD_CONTROL(priv->node, priv->bgx, priv->index));
+	data |= BIT(0);
+	oct_csr_write(data, BGX_SPU_BR_PMD_CONTROL(priv->node, priv->bgx, priv->index));
+}
+
+static int bgx_port_gser_27882(struct bgx_port_priv *priv)
+{
+	int timeout;
+	u64 addr;
+	u64 data;
+
+	timeout = 200;
+	do {
+		data = oct_csr_read(GSER_RX_EIE_DETSTS(priv->node, priv->qlm));
+		if (data & (1 << (priv->index + 8)))
+			break;
+		timeout--;
+		udelay(1);
+	} while (timeout);
+	if (!timeout)
+		return -1;
+
+	addr = GSER_LANE_PCS_CTLIFC_0(priv->node, priv->qlm, priv->index);
+	data = oct_csr_read(addr);
+	data |= BIT(12);
+	oct_csr_write(data, addr);
+
+	addr = GSER_LANE_PCS_CTLIFC_2(priv->node, priv->qlm, priv->index);
+	data = oct_csr_read(addr);
+	data |= BIT(7);
+	oct_csr_write(data, addr);
+
+	data = oct_csr_read(addr);
+	data |= BIT(15);
+	oct_csr_write(data, addr);
+
+	data = oct_csr_read(addr);
+	data &= ~BIT(7);
+	oct_csr_write(data, addr);
+
+	data = oct_csr_read(addr);
+	data |= BIT(15);
+	oct_csr_write(data, addr);
+
+	return 0;
+}
+
+static void bgx_port_xaui_restart_training(struct bgx_port_priv *priv)
+{
+	u64 data;
+
+	data = BIT(14) | BIT(13);
+	oct_csr_write(data, BGX_SPU_INT(priv->node, priv->bgx, priv->index));
+	usleep_range(1700, 2000);
+
+	/* BGX-20968 */
+	oct_csr_write(0, BGX_SPU_BR_PMD_LP_CUP(priv->node, priv->bgx, priv->index));
+	oct_csr_write(0, BGX_SPU_BR_PMD_LD_CUP(priv->node, priv->bgx, priv->index));
+	oct_csr_write(0, BGX_SPU_BR_PMD_LD_REP(priv->node, priv->bgx, priv->index));
+
+	/* Restart training */
+	data = oct_csr_read(BGX_SPU_BR_PMD_CONTROL(priv->node, priv->bgx, priv->index));
+	data |= BIT(0);
+	oct_csr_write(data, BGX_SPU_BR_PMD_CONTROL(priv->node, priv->bgx, priv->index));
+}
+
+static int bgx_port_get_max_qlm_lanes(int qlm)
+{
+	if (OCTEON_IS_MODEL(OCTEON_CN73XX))
+		return (qlm < 4) ? 4 : 2;
+	else if (OCTEON_IS_MODEL(OCTEON_CNF75XX))
+		return 2;
+	return 4;
+}
+
+static int bgx_port_qlm_rx_equalization(struct bgx_port_priv *priv,
+					int qlm, int lane)
+{
+	int max_lanes = bgx_port_get_max_qlm_lanes(qlm);
+	int lane_mask;
+	int timeout;
+	int rc = 0;
+	u64 lmode;
+	u64 addr;
+	u64 data;
+	int i;
+
+	lane_mask = lane == -1 ? ((1 << max_lanes) - 1) : (1 << lane);
+
+	/* Nothing to do for qlms in reset */
+	data = oct_csr_read(GSER_PHY_CTL(priv->node, qlm));
+	if (data & (BIT(0) | BIT(1)))
+		return -1;
+
+	for (i = 0; i < max_lanes; i++) {
+		if (!(i & lane_mask))
+			continue;
+
+		addr = GSER_LANE_LBERT_CFG(priv->node, qlm, i);
+		data = oct_csr_read(addr);
+		/* Rx equalization can't be completed while pattern matcher is
+		 * enabled because it causes errors.
+		 */
+		if (data & BIT(6))
+			return -1;
+	}
+
+	lmode = oct_csr_read(GSER_LANE_MODE(priv->node, qlm));
+	lmode &= 0xf;
+	addr = GSER_LANE_P_MODE_1(priv->node, qlm, lmode);
+	data = oct_csr_read(addr);
+	/* Don't complete rx equalization if in VMA manual mode */
+	if (data & BIT(14))
+		return 0;
+
+	/* Apply rx equalization for speed > 6250 */
+	if (bgx_port_get_qlm_speed(priv, qlm) < 6250)
+		return 0;
+
+	/* Wait until rx data is valid (CDRLOCK) */
+	timeout = 500;
+	addr = GSER_RX_EIE_DETSTS(priv->node, qlm);
+	do {
+		data = oct_csr_read(addr);
+		data >>= 8;
+		data &= lane_mask;
+		if (data == lane_mask)
+			break;
+		timeout--;
+		udelay(1);
+	} while (timeout);
+	if (!timeout) {
+		pr_debug("QLM%d:%d: CDRLOCK timeout\n", qlm, priv->node);
+		return -1;
+	}
+
+	bgx_port_gser_20075(priv, qlm, lane);
+
+	for (i = 0; i < max_lanes; i++) {
+		if (!(i & lane_mask))
+			continue;
+		/* Skip lane 3 on 78p1.x due to gser-20075. Handled above */
+		if (OCTEON_IS_MODEL(OCTEON_CN78XX_PASS1_X) && i == 3)
+			continue;
+
+		/* Enable software control */
+		addr = GSER_BR_RX_CTL(priv->node, qlm, i);
+		data = oct_csr_read(addr);
+		data |= BIT(2);
+		oct_csr_write(data, addr);
+
+		/* Clear the completion flag */
+		addr = GSER_BR_RX_EER(priv->node, qlm, i);
+		data = oct_csr_read(addr);
+		data &= ~BIT(14);
+		data |= BIT(15);
+		oct_csr_write(data, addr);
+	}
+
+	/* Wait for rx equalization to complete */
+	for (i = 0; i < max_lanes; i++) {
+		if (!(i & lane_mask))
+			continue;
+
+		timeout = 250000;
+		addr = GSER_BR_RX_EER(priv->node, qlm, i);
+		do {
+			data = oct_csr_read(addr);
+			if (data & BIT(14))
+				break;
+			timeout--;
+			udelay(1);
+		} while (timeout);
+		if (!timeout) {
+			pr_debug("QLM%d:%d: RXT_ESV timeout\n",
+				 qlm, priv->node);
+			rc = -1;
+		}
+
+		/* Switch back to hardware control */
+		addr = GSER_BR_RX_CTL(priv->node, qlm, i);
+		data = oct_csr_read(addr);
+		data &= ~BIT(2);
+		oct_csr_write(data, addr);
+	}
+
+	return rc;
+}
+
+static int bgx_port_xaui_equalization(struct bgx_port_priv *priv)
+{
+	u64 data;
+	int lane;
+
+	/* Nothing to do for loopback mode */
+	data = oct_csr_read(BGX_SPU_CONTROL1(priv->node, priv->bgx,
+					     priv->index));
+	if (data & BIT(14))
+		return 0;
+
+	if (priv->mode == PORT_MODE_XAUI || priv->mode == PORT_MODE_XLAUI) {
+		if (bgx_port_qlm_rx_equalization(priv, priv->qlm, -1))
+			return -1;
+
+		/* BGX2 of 73xx uses 2 dlms */
+		if (OCTEON_IS_MODEL(OCTEON_CN73XX) && priv->bgx == 2) {
+			if (bgx_port_qlm_rx_equalization(priv, priv->qlm + 1, -1))
+				return -1;
+		}
+	} else if (priv->mode == PORT_MODE_RXAUI) {
+		/* Rxaui always uses 2 lanes */
+		if (bgx_port_qlm_rx_equalization(priv, priv->qlm, -1))
+			return -1;
+	} else if (priv->mode == PORT_MODE_XFI) {
+		lane = priv->index;
+		if ((OCTEON_IS_MODEL(OCTEON_CN73XX) && priv->qlm == 6) ||
+		    (OCTEON_IS_MODEL(OCTEON_CNF75XX) && priv->qlm == 5))
+			lane -= 2;
+
+		if (bgx_port_qlm_rx_equalization(priv, priv->qlm, lane))
+			return -1;
+	}
+
+	return 0;
+}
+
+static int bgx_port_init_xaui_link(struct bgx_port_priv *priv)
+{
+	int use_training = 0;
+	int use_ber = 0;
+	int timeout;
+	int rc = 0;
+	u64 data;
+
+	if (priv->mode == PORT_MODE_10G_KR || priv->mode == PORT_MODE_40G_KR4)
+		use_training = 1;
+
+	if (!octeon_is_simulation() &&
+	    (priv->mode == PORT_MODE_XFI || priv->mode == PORT_MODE_XLAUI ||
+	     priv->mode == PORT_MODE_10G_KR || priv->mode == PORT_MODE_40G_KR4))
+		use_ber = 1;
+
+	data = oct_csr_read(BGX_CMR_CONFIG(priv->node, priv->bgx, priv->index));
+	data &= ~(BIT(14) | BIT(13));
+	oct_csr_write(data, BGX_CMR_CONFIG(priv->node, priv->bgx, priv->index));
+
+	data = oct_csr_read(BGX_SPU_MISC_CONTROL(priv->node, priv->bgx, priv->index));
+	data |= BIT(12);
+	oct_csr_write(data, BGX_SPU_MISC_CONTROL(priv->node, priv->bgx, priv->index));
+
+	if (!octeon_is_simulation()) {
+		data = oct_csr_read(BGX_SPU_AN_CONTROL(priv->node, priv->bgx, priv->index));
+		/* Restart autonegotiation */
+		if (data & BIT(12)) {
+			rc = bgx_port_init_xaui_an(priv);
+			if (rc)
+				return rc;
+		}
+
+		if (use_training) {
+			data = oct_csr_read(BGX_SPU_BR_PMD_CONTROL(priv->node, priv->bgx, priv->index));
+			/* Check if training is enabled */
+			if (OCTEON_IS_MODEL(OCTEON_CN78XX_PASS1_X) &&
+			    !(data & BIT(1))) {
+				bgx_port_xaui_start_training(priv);
+				return -1;
+			}
+
+			if (OCTEON_IS_MODEL(OCTEON_CN73XX) ||
+			    OCTEON_IS_MODEL(OCTEON_CNF75XX) ||
+			    OCTEON_IS_MODEL(OCTEON_CN78XX))
+				bgx_port_gser_27882(priv);
+
+			data = oct_csr_read(BGX_SPU_INT(priv->node, priv->bgx, priv->index));
+
+			/* Restart training if it failed */
+			if ((data & BIT(14)) &&
+			    !OCTEON_IS_MODEL(OCTEON_CN78XX_PASS1_X)) {
+				bgx_port_xaui_restart_training(priv);
+				return -1;
+			}
+
+			if (!(data & BIT(13))) {
+				pr_debug("Waiting for link training\n");
+				return -1;
+			}
+		} else {
+			bgx_port_xaui_equalization(priv);
+		}
+
+		/* Wait until the reset is complete */
+		timeout = 10000;
+		do {
+			data = oct_csr_read(BGX_SPU_CONTROL1(priv->node, priv->bgx, priv->index));
+			if (!(data & BIT(15)))
+				break;
+			timeout--;
+			udelay(1);
+		} while (timeout);
+		if (!timeout) {
+			pr_debug("BGX%d:%d:%d: Reset timeout\n", priv->bgx,
+				 priv->index, priv->node);
+			return -1;
+		}
+
+		if (use_ber) {
+			timeout = 10000;
+			do {
+				data =
+				oct_csr_read(BGX_SPU_BR_STATUS1(priv->node, priv->bgx, priv->index));
+				if (data & BIT(0))
+					break;
+				timeout--;
+				udelay(1);
+			} while (timeout);
+			if (!timeout) {
+				pr_debug("BGX%d:%d:%d: BLK_LOCK timeout\n",
+					 priv->bgx, priv->index, priv->node);
+				return -1;
+			}
+		} else {
+			timeout = 10000;
+			do {
+				data =
+				oct_csr_read(BGX_SPU_BX_STATUS(priv->node, priv->bgx, priv->index));
+				if (data & BIT(12))
+					break;
+				timeout--;
+				udelay(1);
+			} while (timeout);
+			if (!timeout) {
+				pr_debug("BGX%d:%d:%d: Lanes align timeout\n",
+					 priv->bgx, priv->index, priv->node);
+				return -1;
+			}
+		}
+
+		if (use_ber) {
+			data = oct_csr_read(BGX_SPU_BR_STATUS2(priv->node, priv->bgx, priv->index));
+			data |= BIT(15);
+			oct_csr_write(data, BGX_SPU_BR_STATUS2(priv->node, priv->bgx, priv->index));
+		}
+
+		data = oct_csr_read(BGX_SPU_STATUS2(priv->node, priv->bgx, priv->index));
+		data |= BIT(10);
+		oct_csr_write(data, BGX_SPU_STATUS2(priv->node, priv->bgx, priv->index));
+
+		data = oct_csr_read(BGX_SPU_STATUS2(priv->node, priv->bgx, priv->index));
+		if (data & BIT(10)) {
+			if (OCTEON_IS_MODEL(OCTEON_CN78XX_PASS1_X) &&
+			    use_training)
+				bgx_port_xaui_restart_training(priv);
+			return -1;
+		}
+
+		/* Wait for mac rx to be ready */
+		timeout = 10000;
+		do {
+			data = oct_csr_read(BGX_SMU_RX_CTL(priv->node, priv->bgx, priv->index));
+			data &= GENMASK_ULL(1, 0);
+			if (!data)
+				break;
+			timeout--;
+			udelay(1);
+		} while (timeout);
+		if (!timeout) {
+			pr_debug("BGX%d:%d:%d: mac ready timeout\n",
+				 priv->bgx, priv->index, priv->node);
+			return -1;
+		}
+
+		/* Wait for bgx rx to be idle */
+		timeout = 10000;
+		do {
+			data = oct_csr_read(BGX_SMU_CTRL(priv->node, priv->bgx, priv->index));
+			if (data & BIT(0))
+				break;
+			timeout--;
+			udelay(1);
+		} while (timeout);
+		if (!timeout) {
+			pr_debug("BGX%d:%d:%d: rx idle timeout\n",
+				 priv->bgx, priv->index, priv->node);
+			return -1;
+		}
+
+		/* Wait for gmx tx to be idle */
+		timeout = 10000;
+		do {
+			data = oct_csr_read(BGX_SMU_CTRL(priv->node, priv->bgx, priv->index));
+			if (data & BIT(1))
+				break;
+			timeout--;
+			udelay(1);
+		} while (timeout);
+		if (!timeout) {
+			pr_debug("BGX%d:%d:%d: tx idle timeout\n",
+				 priv->bgx, priv->index, priv->node);
+			return -1;
+		}
+
+		/* Check rcvflt is still be 0 */
+		data = oct_csr_read(BGX_SPU_STATUS2(priv->node, priv->bgx, priv->index));
+		if (data & BIT(10)) {
+			pr_debug("BGX%d:%d:%d: receive fault\n",
+				 priv->bgx, priv->index, priv->node);
+			return -1;
+		}
+
+		/* Receive link is latching low. Force it high and verify it */
+		data = oct_csr_read(BGX_SPU_STATUS1(priv->node, priv->bgx, priv->index));
+		data |= BIT(2);
+		oct_csr_write(data, BGX_SPU_STATUS1(priv->node, priv->bgx, priv->index));
+		timeout = 10000;
+		do {
+			data = oct_csr_read(BGX_SPU_STATUS1(priv->node, priv->bgx, priv->index));
+			if (data & BIT(2))
+				break;
+			timeout--;
+			udelay(1);
+		} while (timeout);
+		if (!timeout) {
+			pr_debug("BGX%d:%d:%d: rx link down\n",
+				 priv->bgx, priv->index, priv->node);
+			return -1;
+		}
+	}
+
+	if (use_ber) {
+		/* Read error counters to clear */
+		data = oct_csr_read(BGX_SPU_BR_BIP_ERR_CNT(priv->node, priv->bgx, priv->index));
+		data = oct_csr_read(BGX_SPU_BR_STATUS2(priv->node, priv->bgx, priv->index));
+
+		/* Verify latch lock is set */
+		if (!(data & BIT(15))) {
+			pr_debug("BGX%d:%d:%d: latch lock lost\n",
+				 priv->bgx, priv->index, priv->node);
+			return -1;
+		}
+
+		/* LATCHED_BER is cleared by writing 1 to it */
+		if (data & BIT(14))
+			oct_csr_write(data, BGX_SPU_BR_STATUS2(priv->node, priv->bgx, priv->index));
+
+		usleep_range(1500, 2000);
+		data = oct_csr_read(BGX_SPU_BR_STATUS2(priv->node, priv->bgx, priv->index));
+		if (data & BIT(14)) {
+			pr_debug("BGX%d:%d:%d: BER test failed\n",
+				 priv->bgx, priv->index, priv->node);
+			return -1;
+		}
+	}
+
+	/* Enable packet transmit and receive */
+	data = oct_csr_read(BGX_SPU_MISC_CONTROL(priv->node, priv->bgx, priv->index));
+	data &= ~BIT(12);
+	oct_csr_write(data, BGX_SPU_MISC_CONTROL(priv->node, priv->bgx, priv->index));
+	data = oct_csr_read(BGX_CMR_CONFIG(priv->node, priv->bgx, priv->index));
+	data |= BIT(14) | BIT(13);
+	oct_csr_write(data, BGX_CMR_CONFIG(priv->node, priv->bgx, priv->index));
+
+	return 0;
+}
+
+static int bgx_port_set_xaui_link(struct bgx_port_priv *priv,
+				  struct port_status status)
+{
+	bool spu_link_ok = false;
+	bool smu_rx_ok = false;
+	bool smu_tx_ok = false;
+	int rc = 0;
+	u64 data;
+
+	/* Initialize hardware if link is up but hardware is not happy */
+	if (status.link) {
+		data = oct_csr_read(BGX_SMU_TX_CTL(priv->node, priv->bgx, priv->index));
+		data &= GENMASK_ULL(5, 4);
+		smu_tx_ok = data == 0;
+
+		data = oct_csr_read(BGX_SMU_RX_CTL(priv->node, priv->bgx, priv->index));
+		data &= GENMASK_ULL(1, 0);
+		smu_rx_ok = data == 0;
+
+		data = oct_csr_read(BGX_SPU_STATUS1(priv->node, priv->bgx, priv->index));
+		data &= BIT(2);
+		spu_link_ok = data == BIT(2);
+
+		if (!smu_tx_ok || !smu_rx_ok || !spu_link_ok)
+			rc = bgx_port_init_xaui_link(priv);
+	}
+
+	return rc;
+}
+
+static struct bgx_port_priv *bgx_port_netdev2priv(struct net_device *netdev)
+{
+	struct bgx_port_netdev_priv *nd_priv = netdev_priv(netdev);
+
+	return nd_priv->bgx_priv;
+}
+
+void bgx_port_set_netdev(struct device *dev, struct net_device *netdev)
+{
+	struct bgx_port_priv *priv = dev_get_drvdata(dev);
+
+	if (netdev) {
+		struct bgx_port_netdev_priv *nd_priv = netdev_priv(netdev);
+
+		nd_priv->bgx_priv = priv;
+	}
+
+	priv->netdev = netdev;
+}
+EXPORT_SYMBOL(bgx_port_set_netdev);
+
+int bgx_port_ethtool_get_link_ksettings(struct net_device *netdev,
+					struct ethtool_link_ksettings *cmd)
+{
+	struct bgx_port_priv *priv = bgx_port_netdev2priv(netdev);
+
+	if (priv->phydev) {
+		phy_ethtool_ksettings_get(priv->phydev, cmd);
+		return 0;
+	}
+	return -EINVAL;
+}
+EXPORT_SYMBOL(bgx_port_ethtool_get_link_ksettings);
+
+int bgx_port_ethtool_set_settings(struct net_device *netdev,
+				  struct ethtool_cmd *cmd)
+{
+	struct bgx_port_priv *p = bgx_port_netdev2priv(netdev);
+
+	if (p->phydev)
+		return phy_ethtool_sset(p->phydev, cmd);
+
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(bgx_port_ethtool_set_settings);
+
+int bgx_port_ethtool_nway_reset(struct net_device *netdev)
+{
+	struct bgx_port_priv *p = bgx_port_netdev2priv(netdev);
+
+	if (p->phydev)
+		return phy_start_aneg(p->phydev);
+
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(bgx_port_ethtool_nway_reset);
+
+const u8 *bgx_port_get_mac(struct net_device *netdev)
+{
+	struct bgx_port_priv *priv = bgx_port_netdev2priv(netdev);
+
+	return priv->mac_addr;
+}
+EXPORT_SYMBOL(bgx_port_get_mac);
+
+int bgx_port_do_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
+{
+	struct bgx_port_priv *p = bgx_port_netdev2priv(netdev);
+
+	if (p->phydev)
+		return phy_mii_ioctl(p->phydev, ifr, cmd);
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(bgx_port_do_ioctl);
+
+static void bgx_port_write_cam(struct bgx_port_priv *priv, int cam,
+			       const u8 *mac)
+{
+	u64 m = 0;
+	int i;
+
+	if (mac) {
+		for (i = 0; i < 6; i++)
+			m |= (((u64)mac[i]) << ((5 - i) * 8));
+		m |= BIT(48);
+	}
+
+	m |= (u64)priv->index << 52;
+	oct_csr_write(m, BGX_CMR_RX_ADRX_CAM(priv->node, priv->bgx, priv->index * 8 + cam));
+}
+
+/* Set MAC address for the net_device that is attached. */
+void bgx_port_set_rx_filtering(struct net_device *netdev)
+{
+	struct bgx_port_priv *priv = bgx_port_netdev2priv(netdev);
+	struct netdev_hw_addr *ha;
+	int available_cam_entries;
+	int current_cam_entry;
+	u64 data;
+
+	available_cam_entries = 8;
+	data = 0;
+	data |= BIT(0); /* Accept all Broadcast*/
+
+	if ((netdev->flags & IFF_PROMISC) || netdev->uc.count > 7) {
+		data &= ~BIT(3); /* Reject CAM match */
+		available_cam_entries = 0;
+	} else {
+		/* One CAM entry for the primary address, leaves seven
+		 * for the secondary addresses.
+		 */
+		data |= BIT(3); /* Accept CAM match */
+		available_cam_entries = 7 - netdev->uc.count;
+	}
+
+	if (netdev->flags & IFF_PROMISC) {
+		data |= 1 << 1; /* Accept all Multicast */
+	} else {
+		if (netdev->flags & IFF_MULTICAST) {
+			if ((netdev->flags & IFF_ALLMULTI) ||
+			    netdev_mc_count(netdev) > available_cam_entries)
+				data |= 1 << 1; /* Accept all Multicast */
+			else
+				data |= 2 << 1; /* Accept all Mcast via CAM */
+		}
+	}
+	current_cam_entry = 0;
+	if (data & BIT(3)) {
+		bgx_port_write_cam(priv, current_cam_entry, netdev->dev_addr);
+		current_cam_entry++;
+		netdev_for_each_uc_addr(ha, netdev) {
+			bgx_port_write_cam(priv, current_cam_entry, ha->addr);
+			current_cam_entry++;
+		}
+	}
+	if (((data & GENMASK_ULL(2, 1)) >> 1) == 2) {
+		/* Accept all Multicast via CAM */
+		netdev_for_each_mc_addr(ha, netdev) {
+			bgx_port_write_cam(priv, current_cam_entry, ha->addr);
+			current_cam_entry++;
+		}
+	}
+	while (current_cam_entry < 8) {
+		bgx_port_write_cam(priv, current_cam_entry, NULL);
+		current_cam_entry++;
+	}
+	oct_csr_write(data, BGX_CMR_RX_ADR_CTL(priv->node, priv->bgx,
+					       priv->index));
+}
+EXPORT_SYMBOL(bgx_port_set_rx_filtering);
+
+static void bgx_port_adjust_link(struct net_device *netdev)
+{
+	struct bgx_port_priv *priv = bgx_port_netdev2priv(netdev);
+	bool link_changed = false;
+	unsigned int duplex;
+	unsigned int speed;
+	unsigned int link;
+
+	/* Called under the phydev->lock, no additional locking needed. */
+	if (!priv->phydev->link && priv->last_status.link)
+		link_changed = true;
+
+	if (priv->phydev->link &&
+	    (priv->last_status.link != priv->phydev->link ||
+	     priv->last_status.duplex != priv->phydev->duplex ||
+	     priv->last_status.speed != priv->phydev->speed))
+		link_changed = true;
+
+	link = priv->phydev->link;
+	priv->last_status.link = priv->phydev->link;
+
+	speed = priv->phydev->speed;
+	priv->last_status.speed = priv->phydev->speed;
+
+	duplex = priv->phydev->duplex;
+	priv->last_status.duplex = priv->phydev->duplex;
+
+	if (link_changed) {
+		struct port_status status;
+
+		phy_print_status(priv->phydev);
+
+		status.link = link ? 1 : 0;
+		status.duplex = duplex;
+		status.speed = speed;
+		if (!link)
+			/* Let TX drain. FIXME check that it is drained. */
+			mdelay(50);
+		priv->set_link(priv, status);
+	}
+}
+
+static void bgx_port_check_state(struct work_struct *work)
+{
+	struct bgx_port_priv *priv;
+	struct port_status status;
+
+	priv = container_of(work, struct bgx_port_priv, dwork.work);
+
+	status = priv->get_link(priv);
+
+	if (!status.link &&
+	    priv->mode != PORT_MODE_SGMII && priv->mode != PORT_MODE_RGMII)
+		bgx_port_init_xaui_link(priv);
+
+	/* Only used when there is no phy device, so we cannot use
+	 * phy_print_status() here.
+	 */
+	if (priv->last_status.link != status.link) {
+		priv->last_status.link = status.link;
+		if (status.link)
+			netdev_info(priv->netdev, "Link is up - %d/%s\n",
+				    status.speed,
+				    status.duplex == DUPLEX_FULL ? "Full" : "Half");
+		else
+			netdev_info(priv->netdev, "Link is down\n");
+	}
+
+	queue_delayed_work(check_state_wq, &priv->dwork, HZ);
+}
+
+int bgx_port_enable(struct net_device *netdev)
+{
+	struct bgx_port_priv *priv = bgx_port_netdev2priv(netdev);
+	struct port_status status;
+	bool dont_use_phy;
+	u64 data;
+
+	if (priv->mode == PORT_MODE_SGMII || priv->mode == PORT_MODE_RGMII) {
+		/* 1G */
+		data = oct_csr_read(BGX_GMP_GMI_TX_APPEND(priv->node, priv->bgx, priv->index));
+		data |= BIT(2) | BIT(1);
+		oct_csr_write(data, BGX_GMP_GMI_TX_APPEND(priv->node, priv->bgx, priv->index));
+
+		/* Packets are padded (without FCS) to MIN_SIZE + 1 in SGMII */
+		data = 60 - 1;
+		oct_csr_write(data, BGX_GMP_GMI_TX_MIN_PKT(priv->node, priv->bgx, priv->index));
+	} else {
+		/* 10G or higher */
+		data = oct_csr_read(BGX_SMU_TX_APPEND(priv->node, priv->bgx, priv->index));
+		data |= BIT(2) | BIT(1);
+		oct_csr_write(data, BGX_SMU_TX_APPEND(priv->node, priv->bgx, priv->index));
+
+		/* Packets are padded(with FCS) to MIN_SIZE  in non-SGMII */
+		data = 60 + 4;
+		oct_csr_write(data, BGX_SMU_TX_MIN_PKT(priv->node, priv->bgx, priv->index));
+	}
+
+	switch (priv->mode) {
+	case PORT_MODE_XLAUI:
+	case PORT_MODE_XFI:
+	case PORT_MODE_10G_KR:
+	case PORT_MODE_40G_KR4:
+		dont_use_phy = true;
+		break;
+	default:
+		dont_use_phy = false;
+		break;
+	}
+
+	INIT_DELAYED_WORK(&priv->dwork, bgx_port_check_state);
+
+	if (!priv->phy_np || dont_use_phy) {
+		status = priv->get_link(priv);
+		priv->set_link(priv, status);
+		netif_carrier_on(netdev);
+
+		mutex_lock(&check_state_wq_mutex);
+		if (!check_state_wq) {
+			check_state_wq =
+				alloc_workqueue("check_state_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 1);
+		}
+		mutex_unlock(&check_state_wq_mutex);
+		if (!check_state_wq)
+			return -ENOMEM;
+
+		queue_delayed_work(check_state_wq, &priv->dwork, 0);
+
+		netdev_info(priv->netdev, "Link is not ready\n");
+
+	} else {
+		priv->phydev = of_phy_connect(netdev, priv->phy_np,
+					      bgx_port_adjust_link, 0, priv->phy_mode);
+		if (!priv->phydev)
+			return -ENODEV;
+
+		netif_carrier_off(netdev);
+		phy_start_aneg(priv->phydev);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(bgx_port_enable);
+
+int bgx_port_disable(struct net_device *netdev)
+{
+	struct bgx_port_priv *priv = bgx_port_netdev2priv(netdev);
+	struct port_status status;
+
+	if (priv->phydev) {
+		phy_stop(priv->phydev);
+		phy_disconnect(priv->phydev);
+	}
+	priv->phydev = NULL;
+
+	netif_carrier_off(netdev);
+	memset(&status, 0, sizeof(status));
+	priv->last_status.link = 0;
+	priv->set_link(priv, status);
+	/* May have never been scheduled anywhere, unconditionally
+	 * cancel in case it was. */
+	cancel_delayed_work_sync(&priv->dwork);
+
+	return 0;
+}
+EXPORT_SYMBOL(bgx_port_disable);
+
+int bgx_port_change_mtu(struct net_device *netdev, int new_mtu)
+{
+	struct bgx_port_priv *priv = bgx_port_netdev2priv(netdev);
+	int max_frame;
+
+	netdev->mtu = new_mtu;
+
+	max_frame = round_up(new_mtu + ETH_HLEN + ETH_FCS_LEN, 8);
+
+	if (priv->mode == PORT_MODE_SGMII || priv->mode == PORT_MODE_RGMII) {
+		/* 1G */
+		oct_csr_write(max_frame, BGX_GMP_GMI_RX_JABBER(priv->node, priv->bgx, priv->index));
+	} else {
+		/* 10G or higher */
+		oct_csr_write(max_frame, BGX_SMU_RX_JABBER(priv->node, priv->bgx, priv->index));
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(bgx_port_change_mtu);
+
+void bgx_port_mix_assert_reset(struct net_device *netdev, int mix, bool v)
+{
+	struct bgx_port_priv *priv = bgx_port_netdev2priv(netdev);
+	u64 mask = 1ull << (3 + (mix & 1));
+	u64 data;
+
+	if (OCTEON_IS_MODEL(OCTEON_CN78XX_PASS1_X) && v) {
+		/* Need to disable the mix before resetting the bgx-mix
+		 * interface as not doing so confuses the other already up
+		 * lmacs.
+		 */
+		data = oct_csr_read(BGX_CMR_CONFIG(priv->node, priv->bgx, priv->index));
+		data &= ~BIT(11);
+		oct_csr_write(data, BGX_CMR_CONFIG(priv->node, priv->bgx, priv->index));
+	}
+
+	data = oct_csr_read(BGX_CMR_GLOBAL_CONFIG(priv->node, priv->bgx));
+	if (v)
+		data |= mask;
+	else
+		data &= ~mask;
+	oct_csr_write(data, BGX_CMR_GLOBAL_CONFIG(priv->node, priv->bgx));
+
+	if (OCTEON_IS_MODEL(OCTEON_CN78XX_PASS1_X) && !v) {
+		data = oct_csr_read(BGX_CMR_CONFIG(priv->node, priv->bgx, priv->index));
+		data |= BIT(11);
+		oct_csr_write(data, BGX_CMR_CONFIG(priv->node, priv->bgx, priv->index));
+	}
+}
+EXPORT_SYMBOL(bgx_port_mix_assert_reset);
+
+static int bgx_port_probe(struct platform_device *pdev)
+{
+	struct bgx_port_priv *priv;
+	const __be32 *reg;
+	const u8 *mac;
+	int numa_node;
+	u32 index;
+	u64 addr;
+	int rc;
+
+	reg = of_get_property(pdev->dev.parent->of_node, "reg", NULL);
+	addr = of_translate_address(pdev->dev.parent->of_node, reg);
+	mac = of_get_mac_address(pdev->dev.of_node);
+
+	numa_node = bgx_node_to_numa_node(pdev->dev.parent->of_node);
+
+	rc = of_property_read_u32(pdev->dev.of_node, "reg", &index);
+	if (rc)
+		return -ENODEV;
+	priv = kzalloc_node(sizeof(*priv), GFP_KERNEL, numa_node);
+	if (!priv)
+		return -ENOMEM;
+	priv->phy_np = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0);
+	priv->phy_mode = of_get_phy_mode(pdev->dev.of_node);
+	/* If phy-mode absent, default to SGMII. */
+	if (priv->phy_mode < 0)
+		priv->phy_mode = PHY_INTERFACE_MODE_SGMII;
+
+	if (priv->phy_mode == PHY_INTERFACE_MODE_1000BASEX)
+		priv->mode_1000basex = true;
+
+	if (of_phy_is_fixed_link(pdev->dev.of_node))
+		priv->bgx_as_phy = true;
+
+	priv->node = numa_node;
+	priv->bgx = bgx_addr_to_interface(addr);
+	priv->index = index;
+	if (mac)
+		priv->mac_addr = mac;
+
+	priv->qlm = bgx_port_get_qlm(priv->node, priv->bgx, priv->index);
+	priv->mode = bgx_port_get_mode(priv->node, priv->bgx, priv->index);
+
+	switch (priv->mode) {
+	case PORT_MODE_SGMII:
+		if (priv->phy_np &&
+		    priv->phy_mode != PHY_INTERFACE_MODE_SGMII)
+			dev_warn(&pdev->dev, "SGMII phy mode mismatch.\n");
+		goto set_link_functions;
+	case PORT_MODE_RGMII:
+		if (priv->phy_np && phy_interface_mode_is_rgmii(priv->phy_mode))
+			dev_warn(&pdev->dev, "RGMII phy mode mismatch.\n");
+set_link_functions:
+		priv->get_link = bgx_port_get_sgmii_link;
+		priv->set_link = bgx_port_set_xgmii_link;
+		break;
+	case PORT_MODE_XAUI:
+	case PORT_MODE_RXAUI:
+	case PORT_MODE_XLAUI:
+	case PORT_MODE_XFI:
+	case PORT_MODE_10G_KR:
+	case PORT_MODE_40G_KR4:
+		priv->get_link = bgx_port_get_xaui_link;
+		priv->set_link = bgx_port_set_xaui_link;
+		break;
+	default:
+		goto err;
+	}
+
+	dev_set_drvdata(&pdev->dev, priv);
+
+	bgx_port_init(priv);
+
+	dev_info(&pdev->dev, "Probed\n");
+	return 0;
+ err:
+	kfree(priv);
+	return rc;
+}
+
+static int bgx_port_remove(struct platform_device *pdev)
+{
+	struct bgx_port_priv *priv = dev_get_drvdata(&pdev->dev);
+
+	kfree(priv);
+	return 0;
+}
+
+static void bgx_port_shutdown(struct platform_device *pdev)
+{
+}
+
+static const struct of_device_id bgx_port_match[] = {
+	{
+		.compatible = "cavium,octeon-7890-bgx-port",
+	},
+	{
+		.compatible = "cavium,octeon-7360-xcv",
+	},
+	{},
+};
+MODULE_DEVICE_TABLE(of, bgx_port_match);
+
+static struct platform_driver bgx_port_driver = {
+	.probe		= bgx_port_probe,
+	.remove		= bgx_port_remove,
+	.shutdown       = bgx_port_shutdown,
+	.driver		= {
+		.owner	= THIS_MODULE,
+		.name	= KBUILD_MODNAME,
+		.of_match_table = bgx_port_match,
+	},
+};
+
+static int __init bgx_port_driver_init(void)
+{
+	int r;
+	int i;
+	int j;
+	int k;
+
+	for (i = 0; i < MAX_NODES; i++) {
+		for (j = 0; j < MAX_BGX_PER_NODE; j++) {
+			for (k = 0; k < MAX_LMAC_PER_BGX; k++)
+				lmac_pknd[i][j][k] = -1;
+		}
+	}
+
+	bgx_nexus_load();
+	r =  platform_driver_register(&bgx_port_driver);
+	return r;
+}
+module_init(bgx_port_driver_init);
+
+static void __exit bgx_port_driver_exit(void)
+{
+	platform_driver_unregister(&bgx_port_driver);
+	if (check_state_wq)
+		destroy_workqueue(check_state_wq);
+}
+module_exit(bgx_port_driver_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Cavium, Inc. <support@caviumnetworks.com>");
+MODULE_DESCRIPTION("Cavium, Inc. BGX Ethernet MAC driver.");
diff --git a/drivers/net/ethernet/cavium/octeon/octeon3-core.c b/drivers/net/ethernet/cavium/octeon/octeon3-core.c
new file mode 100644
index 0000000..c0f6909
--- /dev/null
+++ b/drivers/net/ethernet/cavium/octeon/octeon3-core.c
@@ -0,0 +1,2079 @@
+// SPDX-License-Identifier: GPL-2.0+
+/* Copyright (c) 2017 Cavium, Inc.
+ *
+ * Core packet processing and initialization for the octeon3_ethernet
+ * packet processor.  The packets are received, classified and initial
+ * input checks done by the PKI unit.  Input queuing is done by the
+ * SSO.  All output processing and hardware queuing is done in the
+ * PKO.
+ */
+#include <linux/ptp_clock_kernel.h>
+#include <linux/platform_device.h>
+#include <linux/etherdevice.h>
+#include <linux/timecounter.h>
+#include <linux/net_tstamp.h>
+#include <linux/interrupt.h>
+#include <linux/netdevice.h>
+#include <linux/if_vlan.h>
+#include <linux/kthread.h>
+#include <linux/rculist.h>
+#include <linux/rio_drv.h>
+#include <linux/rio_ids.h>
+#include <linux/atomic.h>
+#include <linux/module.h>
+#include <linux/wait.h>
+#include <linux/ipv6.h>
+#include <linux/ip.h>
+
+#include <asm/octeon/octeon.h>
+
+#include "octeon3.h"
+
+/*  First buffer:
+ *
+ *                            +---SKB---------+
+ *                            |               |
+ *                            |               |
+ *                         +--+--*data        |
+ *                         |  |               |
+ *                         |  |               |
+ *                         |  +---------------+
+ *                         |       /|\
+ *                         |        |
+ *                         |        |
+ *                        \|/       |
+ * WQE - 128 -+-----> +-------------+-------+     -+-
+ *            |       |    *skb ----+       |      |
+ *            |       |                     |      |
+ *            |       |                     |      |
+ *  WQE_SKIP = 128    |                     |      |
+ *            |       |                     |      |
+ *            |       |                     |      |
+ *            |       |                     |      |
+ *            |       |                     |      First Skip
+ * WQE   -----+-----> +---------------------+      |
+ *                    |   word 0            |      |
+ *                    |   word 1            |      |
+ *                    |   word 2            |      |
+ *                    |   word 3            |      |
+ *                    |   word 4            |      |
+ *                    +---------------------+     -+-
+ *               +----+- packet link        |
+ *               |    |  packet data        |
+ *               |    |                     |
+ *               |    |                     |
+ *               |    |         .           |
+ *               |    |         .           |
+ *               |    |         .           |
+ *               |    +---------------------+
+ *               |
+ *               |
+ * Later buffers:|
+ *               |
+ *               |
+ *               |
+ *               |
+ *               |
+ *               |            +---SKB---------+
+ *               |            |               |
+ *               |            |               |
+ *               |         +--+--*data        |
+ *               |         |  |               |
+ *               |         |  |               |
+ *               |         |  +---------------+
+ *               |         |       /|\
+ *               |         |        |
+ *               |         |        |
+ *               |        \|/       |
+ * WQE - 128 ----+--> +-------------+-------+     -+-
+ *               |    |    *skb ----+       |      |
+ *               |    |                     |      |
+ *               |    |                     |      |
+ *               |    |                     |      |
+ *               |    |                     |      LATER_SKIP = 128
+ *               |    |                     |      |
+ *               |    |                     |      |
+ *               |    |                     |      |
+ *               |    +---------------------+     -+-
+ *               |    |  packet link        |
+ *               +--> |  packet data        |
+ *                    |                     |
+ *                    |                     |
+ *                    |         .           |
+ *                    |         .           |
+ *                    |         .           |
+ *                    +---------------------+
+ */
+
+#define MAX_TX_QUEUE_DEPTH 512
+#define SSO_INTSN_EXE 0x61
+#define MAX_RX_QUEUES 32
+
+#define SKB_PTR_OFFSET		0
+
+#define MAX_CORES		48
+#define FPA3_NUM_AURAS		1024
+
+#define USE_ASYNC_IOBDMA	1
+#define SCR_SCRATCH		0ull
+#define SSO_NO_WAIT		0ull
+#define DID_TAG_SWTAG		0x60ull
+#define IOBDMA_SENDSINGLE	0xffffffffffffa200ull
+
+/* Values for the value of wqe word2 [ERRLEV] */
+#define PKI_ERRLEV_LA		0x01
+
+/* Values for the value of wqe word2 [OPCODE] */
+#define PKI_OPCODE_NONE		0x00
+#define PKI_OPCODE_JABBER	0x02
+#define PKI_OPCODE_FCS		0x07
+
+/* Values for the layer type in the wqe */
+#define PKI_LTYPE_IP4		0x08
+#define PKI_LTYPE_IP6		0x0a
+#define PKI_LTYPE_TCP		0x10
+#define PKI_LTYPE_UDP		0x11
+#define PKI_LTYPE_SCTP		0x12
+
+/* Registers are accessed via xkphys */
+#define SSO_BASE			0x1670000000000ull
+#define SSO_ADDR(node)			(SET_XKPHYS + NODE_OFFSET(node) +      \
+					 SSO_BASE)
+#define GRP_OFFSET(grp)			((grp) << 16)
+#define GRP_ADDR(n, g)			(SSO_ADDR(n) + GRP_OFFSET(g))
+#define SSO_GRP_AQ_CNT(n, g)		(GRP_ADDR(n, g)		   + 0x20000700)
+
+#define MIO_PTP_BASE			0x1070000000000ull
+#define MIO_PTP_ADDR(node)		(SET_XKPHYS + NODE_OFFSET(node) +      \
+					 MIO_PTP_BASE)
+#define MIO_PTP_CLOCK_CFG(node)		(MIO_PTP_ADDR(node)		+ 0xf00)
+#define MIO_PTP_CLOCK_HI(node)		(MIO_PTP_ADDR(node)		+ 0xf10)
+#define MIO_PTP_CLOCK_COMP(node)	(MIO_PTP_ADDR(node)		+ 0xf18)
+
+struct octeon3_ethernet;
+
+struct octeon3_rx {
+	struct napi_struct	napi;
+	struct octeon3_ethernet *parent;
+	int rx_grp;
+	int rx_irq;
+	cpumask_t rx_affinity_hint;
+} ____cacheline_aligned_in_smp;
+
+struct octeon3_ethernet {
+	struct bgx_port_netdev_priv bgx_priv; /* Must be first element. */
+	struct list_head list;
+	struct net_device *netdev;
+	enum octeon3_mac_type mac_type;
+	struct octeon3_rx rx_cxt[MAX_RX_QUEUES];
+	struct ptp_clock_info ptp_info;
+	struct ptp_clock *ptp_clock;
+	struct cyclecounter cc;
+	struct timecounter tc;
+	spinlock_t ptp_lock;		/* Serialize ptp clock adjustments */
+	int num_rx_cxt;
+	int pki_aura;
+	int pknd;
+	int pko_queue;
+	int node;
+	int interface;
+	int index;
+	int rx_buf_count;
+	int tx_complete_grp;
+	unsigned int rx_timestamp_hw:1;
+	unsigned int tx_timestamp_hw:1;
+	struct delayed_work stat_work;
+	spinlock_t stat_lock;		/* Protects stats counters */
+	u64 last_packets;
+	u64 last_octets;
+	u64 last_dropped;
+	atomic64_t rx_packets;
+	atomic64_t rx_octets;
+	atomic64_t rx_dropped;
+	atomic64_t rx_errors;
+	atomic64_t rx_length_errors;
+	atomic64_t rx_crc_errors;
+	atomic64_t tx_packets;
+	atomic64_t tx_octets;
+	atomic64_t tx_dropped;
+	/* The following two fields need to be on a different cache line as
+	 * they are updated by pko which invalidates the cache every time it
+	 * updates them. The idea is to prevent other fields from being
+	 * invalidated unnecessarily.
+	 */
+	char cacheline_pad1[CVMX_CACHE_LINE_SIZE];
+	atomic64_t buffers_needed;
+	atomic64_t tx_backlog;
+	char cacheline_pad2[CVMX_CACHE_LINE_SIZE];
+};
+
+static DEFINE_MUTEX(octeon3_eth_init_mutex);
+
+struct octeon3_ethernet_node;
+
+struct octeon3_ethernet_worker {
+	wait_queue_head_t queue;
+	struct task_struct *task;
+	struct octeon3_ethernet_node *oen;
+	atomic_t kick;
+	int order;
+};
+
+struct octeon3_ethernet_node {
+	bool init_done;
+	int next_cpu_irq_affinity;
+	int node;
+	int pki_packet_pool;
+	int sso_pool;
+	int pko_pool;
+	void *sso_pool_stack;
+	void *pko_pool_stack;
+	void *pki_packet_pool_stack;
+	int sso_aura;
+	int pko_aura;
+	int tx_complete_grp;
+	int tx_irq;
+	cpumask_t tx_affinity_hint;
+	struct octeon3_ethernet_worker workers[8];
+	struct mutex device_list_lock;	/* Protects the device list */
+	struct list_head device_list;
+	spinlock_t napi_alloc_lock;	/* Protects napi allocations */
+};
+
+static int num_packet_buffers = 768;
+
+int ilk0_lanes = 1;
+int ilk1_lanes = 1;
+
+static int rx_queues = 1;
+static int packet_buffer_size = 2048;
+
+static struct octeon3_ethernet_node octeon3_eth_node[MAX_NODES];
+static struct kmem_cache *octeon3_eth_sso_pko_cache;
+
+/**
+ * scratch_read64() - Read a 64 bit value from the processor local
+ *		      scratchpad memory.
+ *
+ * @offset: Byte offset into scratch pad to read.
+ *
+ * Return: The value read.
+ */
+static inline u64 scratch_read64(u64 offset)
+{
+	/* Barriers never needed for this CPU-local memory. */
+	return *(u64 *)((long)SCRATCH_BASE + offset);
+}
+
+/**
+ * scratch_write64() - Write a 64 bit value to the processor local
+ *		       scratchpad memory.
+ *
+ * @offset: Byte offset into scratch pad to write
+ * @value: Value to write
+ */
+static inline void scratch_write64(u64 offset, u64 value)
+{
+	/* Barriers never needed for this CPU-local memory. */
+	*(u64 *)((long)SCRATCH_BASE + offset) = value;
+}
+
+static int get_pki_chan(int node, int interface, int index)
+{
+	int pki_chan;
+
+	pki_chan = node << 12;
+
+	if (OCTEON_IS_MODEL(OCTEON_CNF75XX) &&
+	    (interface == 1 || interface == 2)) {
+		/* SRIO */
+		pki_chan |= 0x240 + (2 * (interface - 1)) + index;
+	} else {
+		/* BGX */
+		pki_chan |= 0x800 + (0x100 * interface) + (0x10 * index);
+	}
+
+	return pki_chan;
+}
+
+static int octeon3_eth_lgrp_to_ggrp(int node, int grp)
+{
+	return (node << 8) | grp;
+}
+
+static void octeon3_eth_gen_affinity(int node, cpumask_t *mask)
+{
+	int cpu;
+
+	do {
+		cpu = cpumask_next(octeon3_eth_node[node].next_cpu_irq_affinity, cpu_online_mask);
+		octeon3_eth_node[node].next_cpu_irq_affinity++;
+		if (cpu >= nr_cpu_ids) {
+			octeon3_eth_node[node].next_cpu_irq_affinity = -1;
+			continue;
+		}
+	} while (false);
+	cpumask_clear(mask);
+	cpumask_set_cpu(cpu, mask);
+}
+
+struct wr_ret {
+	void *work;
+	u16 grp;
+};
+
+static inline struct wr_ret octeon3_core_get_work_sync(int grp)
+{
+	u64 node = cvmx_get_node_num();
+	struct wr_ret r;
+	u64 response;
+	u64 addr;
+
+	/* See SSO_GET_WORK_LD_S for the address to read */
+	addr = 1ull << 63;
+	addr |= BIT(48);
+	addr |= DID_TAG_SWTAG << 40;
+	addr |= node << 36;
+	addr |= BIT(30);
+	addr |= BIT(29);
+	addr |= octeon3_eth_lgrp_to_ggrp(node, grp) << 4;
+	addr |= SSO_NO_WAIT << 3;
+	response = __raw_readq((void __iomem *)addr);
+
+	/* See SSO_GET_WORK_RTN_S for the format of the response */
+	r.grp = (response & GENMASK_ULL(57, 48)) >> 48;
+	if (response & BIT(63))
+		r.work = NULL;
+	else
+		r.work = phys_to_virt(response & GENMASK_ULL(41, 0));
+
+	return r;
+}
+
+/**
+ * octeon3_core_get_work_async() - Request work via a iobdma command.
+ *				   Doesn't wait for the response.
+ *
+ * @grp: Group to request work for.
+ */
+static inline void octeon3_core_get_work_async(unsigned int grp)
+{
+	u64 node = cvmx_get_node_num();
+	u64 data;
+
+	/* See SSO_GET_WORK_DMA_S for the command structure */
+	data = SCR_SCRATCH << 56;
+	data |= 1ull << 48;
+	data |= DID_TAG_SWTAG << 40;
+	data |= node << 36;
+	data |= 1ull << 30;
+	data |= 1ull << 29;
+	data |= octeon3_eth_lgrp_to_ggrp(node, grp) << 4;
+	data |= SSO_NO_WAIT << 3;
+
+	__raw_writeq(data, (void __iomem *)IOBDMA_SENDSINGLE);
+}
+
+/**
+ * octeon3_core_get_response_async() - Read the request work response. Must be
+ *				       called after calling
+ *				       octeon3_core_get_work_async().
+ *
+ * Return: Work queue entry.
+ */
+static inline struct wr_ret octeon3_core_get_response_async(void)
+{
+	struct wr_ret r;
+	u64 response;
+
+	CVMX_SYNCIOBDMA;
+	response = scratch_read64(SCR_SCRATCH);
+
+	/* See SSO_GET_WORK_RTN_S for the format of the response */
+	r.grp = (response & GENMASK_ULL(57, 48)) >> 48;
+	if (response & BIT(63))
+		r.work = NULL;
+	else
+		r.work = phys_to_virt(response & GENMASK_ULL(41, 0));
+
+	return r;
+}
+
+static void octeon3_eth_replenish_rx(struct octeon3_ethernet *priv, int count)
+{
+	struct sk_buff *skb;
+	int i;
+
+	for (i = 0; i < count; i++) {
+		void **buf;
+
+		skb = __alloc_skb(packet_buffer_size, GFP_ATOMIC, 0, priv->node);
+		if (!skb)
+			break;
+		buf = (void **)PTR_ALIGN(skb->head, 128);
+		buf[SKB_PTR_OFFSET] = skb;
+		octeon_fpa3_free(priv->node, priv->pki_aura, buf);
+	}
+}
+
+static bool octeon3_eth_tx_complete_runnable(struct octeon3_ethernet_worker *worker)
+{
+	return atomic_read(&worker->kick) != 0 || kthread_should_stop();
+}
+
+static int octeon3_eth_replenish_all(struct octeon3_ethernet_node *oen)
+{
+	struct octeon3_ethernet *priv;
+	int batch_size = 32;
+	int pending = 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(priv, &oen->device_list, list) {
+		int amount = atomic64_sub_if_positive(batch_size, &priv->buffers_needed);
+
+		if (amount >= 0) {
+			octeon3_eth_replenish_rx(priv, batch_size);
+			pending += amount;
+		}
+	}
+	rcu_read_unlock();
+	return pending;
+}
+
+static int octeon3_eth_tx_complete_hwtstamp(struct octeon3_ethernet *priv,
+					    struct sk_buff *skb)
+{
+	struct skb_shared_hwtstamps shts;
+	u64 hwts;
+	u64 ns;
+
+	hwts = *((u64 *)(skb->cb) + 1);
+	ns = timecounter_cyc2time(&priv->tc, hwts);
+	memset(&shts, 0, sizeof(shts));
+	shts.hwtstamp = ns_to_ktime(ns);
+	skb_tstamp_tx(skb, &shts);
+
+	return 0;
+}
+
+static int octeon3_eth_tx_complete_worker(void *data)
+{
+	struct octeon3_ethernet_worker *worker = data;
+	struct octeon3_ethernet_node *oen;
+	int tx_complete_stop_thresh;
+	int backlog_stop_thresh;
+	int backlog;
+	u64 aq_cnt;
+	int order;
+	int i;
+
+	oen = worker->oen;
+	order = worker->order;
+	tx_complete_stop_thresh = order * 100;
+	backlog_stop_thresh = order == 0 ? 31 : order * 80;
+
+	while (!kthread_should_stop()) {
+		wait_event_interruptible(worker->queue, octeon3_eth_tx_complete_runnable(worker));
+		atomic_dec_if_positive(&worker->kick); /* clear the flag */
+
+		do {
+			backlog = octeon3_eth_replenish_all(oen);
+			for (i = 0; i < 100; i++) {
+				struct octeon3_ethernet *tx_priv;
+				struct net_device *tx_netdev;
+				struct sk_buff *skb;
+				struct wr_ret r;
+				void **work;
+
+				r = octeon3_core_get_work_sync(oen->tx_complete_grp);
+				work = r.work;
+				if (!work)
+					break;
+				tx_netdev = work[0];
+				tx_priv = netdev_priv(tx_netdev);
+				if (unlikely(netif_queue_stopped(tx_netdev)) &&
+				    atomic64_read(&tx_priv->tx_backlog) < MAX_TX_QUEUE_DEPTH)
+					netif_wake_queue(tx_netdev);
+				skb = container_of((void *)work, struct sk_buff, cb);
+				if (unlikely(tx_priv->tx_timestamp_hw) &&
+				    unlikely(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS))
+					octeon3_eth_tx_complete_hwtstamp(tx_priv, skb);
+				consume_skb(skb);
+			}
+
+			aq_cnt = oct_csr_read(SSO_GRP_AQ_CNT(oen->node, oen->tx_complete_grp));
+			aq_cnt &= GENMASK_ULL(32, 0);
+			if ((backlog > backlog_stop_thresh || aq_cnt > tx_complete_stop_thresh) &&
+			    order < ARRAY_SIZE(oen->workers) - 1) {
+				atomic_set(&oen->workers[order + 1].kick, 1);
+				wake_up(&oen->workers[order + 1].queue);
+			}
+		} while (!need_resched() && (backlog > backlog_stop_thresh ||
+					     aq_cnt > tx_complete_stop_thresh));
+
+		cond_resched();
+
+		if (!octeon3_eth_tx_complete_runnable(worker))
+			octeon3_sso_irq_set(oen->node, oen->tx_complete_grp, true);
+	}
+
+	return 0;
+}
+
+static irqreturn_t octeon3_eth_tx_handler(int irq, void *info)
+{
+	struct octeon3_ethernet_node *oen = info;
+	/* Disarm the irq. */
+	octeon3_sso_irq_set(oen->node, oen->tx_complete_grp, false);
+	atomic_set(&oen->workers[0].kick, 1);
+	wake_up(&oen->workers[0].queue);
+	return IRQ_HANDLED;
+}
+
+static int octeon3_eth_global_init(unsigned int node,
+				   struct platform_device *pdev)
+{
+	struct octeon3_ethernet_node *oen;
+	unsigned int sso_intsn;
+	int rv = 0;
+	int i;
+
+	mutex_lock(&octeon3_eth_init_mutex);
+
+	oen = octeon3_eth_node + node;
+
+	if (oen->init_done)
+		goto done;
+
+	/* CN78XX-P1.0 cannot un-initialize PKO, so get a module
+	 * reference to prevent it from being unloaded.
+	 */
+	if (OCTEON_IS_MODEL(OCTEON_CN78XX_PASS1_0))
+		if (!try_module_get(THIS_MODULE))
+			dev_err(&pdev->dev,
+				"ERROR: Could not obtain module reference for CN78XX-P1.0\n");
+
+	INIT_LIST_HEAD(&oen->device_list);
+	mutex_init(&oen->device_list_lock);
+	spin_lock_init(&oen->napi_alloc_lock);
+
+	oen->node = node;
+
+	octeon_fpa3_init(node);
+	rv = octeon_fpa3_pool_init(node, -1, &oen->sso_pool,
+				   &oen->sso_pool_stack, 40960);
+	if (rv)
+		goto done;
+
+	rv = octeon_fpa3_pool_init(node, -1, &oen->pko_pool,
+				   &oen->pko_pool_stack, 40960);
+	if (rv)
+		goto done;
+
+	rv = octeon_fpa3_pool_init(node, -1, &oen->pki_packet_pool,
+				   &oen->pki_packet_pool_stack, 64 * num_packet_buffers);
+	if (rv)
+		goto done;
+
+	rv = octeon_fpa3_aura_init(node, oen->sso_pool, -1,
+				   &oen->sso_aura, num_packet_buffers, 20480);
+	if (rv)
+		goto done;
+
+	rv = octeon_fpa3_aura_init(node, oen->pko_pool, -1,
+				   &oen->pko_aura, num_packet_buffers, 20480);
+	if (rv)
+		goto done;
+
+	dev_info(&pdev->dev, "SSO:%d:%d, PKO:%d:%d\n", oen->sso_pool,
+		 oen->sso_aura, oen->pko_pool, oen->pko_aura);
+
+	if (!octeon3_eth_sso_pko_cache) {
+		octeon3_eth_sso_pko_cache = kmem_cache_create("sso_pko", 4096, 128, 0, NULL);
+		if (!octeon3_eth_sso_pko_cache) {
+			rv = -ENOMEM;
+			goto done;
+		}
+	}
+
+	rv = octeon_fpa3_mem_fill(node, octeon3_eth_sso_pko_cache,
+				  oen->sso_aura, 1024);
+	if (rv)
+		goto done;
+
+	rv = octeon_fpa3_mem_fill(node, octeon3_eth_sso_pko_cache,
+				  oen->pko_aura, 1024);
+	if (rv)
+		goto done;
+
+	rv = octeon3_sso_init(node, oen->sso_aura);
+	if (rv)
+		goto done;
+
+	oen->tx_complete_grp = octeon3_sso_alloc_grp(node, -1);
+	if (oen->tx_complete_grp < 0)
+		goto done;
+
+	sso_intsn = SSO_INTSN_EXE << 12 | oen->tx_complete_grp;
+	oen->tx_irq = irq_create_mapping(NULL, sso_intsn);
+	if (!oen->tx_irq) {
+		rv = -ENODEV;
+		goto done;
+	}
+
+	rv = octeon3_pko_init_global(node, oen->pko_aura);
+	if (rv) {
+		rv = -ENODEV;
+		goto done;
+	}
+
+	octeon3_pki_vlan_init(node);
+	octeon3_pki_cluster_init(node, pdev);
+	octeon3_pki_ltype_init(node);
+	octeon3_pki_enable(node);
+
+	for (i = 0; i < ARRAY_SIZE(oen->workers); i++) {
+		oen->workers[i].oen = oen;
+		init_waitqueue_head(&oen->workers[i].queue);
+		oen->workers[i].order = i;
+	}
+	for (i = 0; i < ARRAY_SIZE(oen->workers); i++) {
+		oen->workers[i].task = kthread_create_on_node(octeon3_eth_tx_complete_worker,
+							      oen->workers + i, node,
+							      "oct3_eth/%d:%d", node, i);
+		if (IS_ERR(oen->workers[i].task)) {
+			rv = PTR_ERR(oen->workers[i].task);
+			goto done;
+		} else {
+#ifdef CONFIG_NUMA
+			set_cpus_allowed_ptr(oen->workers[i].task, cpumask_of_node(node));
+#endif
+			wake_up_process(oen->workers[i].task);
+		}
+	}
+
+	if (OCTEON_IS_MODEL(OCTEON_CN78XX_PASS1_X))
+		octeon3_sso_pass1_limit(node, oen->tx_complete_grp);
+
+	rv = request_irq(oen->tx_irq, octeon3_eth_tx_handler,
+			 IRQ_TYPE_EDGE_RISING, "oct3_eth_tx_done", oen);
+	if (rv)
+		goto done;
+	octeon3_eth_gen_affinity(node, &oen->tx_affinity_hint);
+	irq_set_affinity_hint(oen->tx_irq, &oen->tx_affinity_hint);
+
+	octeon3_sso_irq_set(node, oen->tx_complete_grp, true);
+
+	oen->init_done = true;
+done:
+	mutex_unlock(&octeon3_eth_init_mutex);
+	return rv;
+}
+
+static struct sk_buff *octeon3_eth_work_to_skb(void *w)
+{
+	struct sk_buff *skb;
+	void **f = w;
+
+	skb = f[-16];
+	return skb;
+}
+
+/* Receive one packet.
+ * returns the number of RX buffers consumed.
+ */
+static int octeon3_eth_rx_one(struct octeon3_rx *rx,
+			      bool is_async, bool req_next)
+{
+	struct octeon3_ethernet *priv = rx->parent;
+	union buf_ptr packet_ptr;
+	unsigned int packet_len;
+	struct sk_buff *skb;
+	int len_remaining;
+	struct wqe *work;
+	struct wr_ret r;
+	int segments;
+	u8 *data;
+	int ret;
+
+	if (is_async)
+		r = octeon3_core_get_response_async();
+	else
+		r = octeon3_core_get_work_sync(rx->rx_grp);
+	work = r.work;
+	if (!work)
+		return 0;
+
+	/* Request the next work so it'll be ready when we need it */
+	if (is_async && req_next)
+		octeon3_core_get_work_async(rx->rx_grp);
+
+	skb = octeon3_eth_work_to_skb(work);
+
+	segments = work->word0.bufs;
+	ret = segments;
+	packet_ptr = work->packet_ptr;
+	if (unlikely(work->word2.err_level <= PKI_ERRLEV_LA &&
+		     work->word2.err_code != PKI_OPCODE_NONE)) {
+		atomic64_inc(&priv->rx_errors);
+		switch (work->word2.err_code) {
+		case PKI_OPCODE_JABBER:
+			atomic64_inc(&priv->rx_length_errors);
+			break;
+		case PKI_OPCODE_FCS:
+			atomic64_inc(&priv->rx_crc_errors);
+			break;
+		}
+		data = phys_to_virt(packet_ptr.addr);
+		for (;;) {
+			dev_kfree_skb_any(skb);
+			segments--;
+			if (segments <= 0)
+				break;
+			packet_ptr.u64 = *(u64 *)(data - 8);
+#ifndef __LITTLE_ENDIAN
+			if (OCTEON_IS_MODEL(OCTEON_CN78XX_PASS1_X)) {
+				/* PKI_BUFLINK_S's are endian-swapped */
+				packet_ptr.u64 = swab64(packet_ptr.u64);
+			}
+#endif
+			data = phys_to_virt(packet_ptr.addr);
+			skb = octeon3_eth_work_to_skb((void *)round_down((unsigned long)data, 128ull));
+		}
+		goto out;
+	}
+
+	packet_len = work->word1.len;
+	data = phys_to_virt(packet_ptr.addr);
+	skb->data = data;
+	skb->len = packet_len;
+	len_remaining = packet_len;
+	if (segments == 1) {
+		/* Strip the ethernet fcs */
+		skb->len -= 4;
+		skb_set_tail_pointer(skb, skb->len);
+	} else {
+		struct sk_buff *current_skb = skb;
+		struct sk_buff *next_skb = NULL;
+		unsigned int segment_size;
+		bool first_frag = true;
+
+		skb_frag_list_init(skb);
+		for (;;) {
+			segment_size = (segments == 1) ? len_remaining : packet_ptr.size;
+			len_remaining -= segment_size;
+			if (!first_frag) {
+				current_skb->len = segment_size;
+				skb->data_len += segment_size;
+				skb->truesize += current_skb->truesize;
+			}
+			skb_set_tail_pointer(current_skb, segment_size);
+			segments--;
+			if (segments == 0)
+				break;
+			packet_ptr.u64 = *(u64 *)(data - 8);
+#ifndef __LITTLE_ENDIAN
+			if (OCTEON_IS_MODEL(OCTEON_CN78XX_PASS1_X)) {
+				/* PKI_BUFLINK_S's are endian-swapped */
+				packet_ptr.u64 = swab64(packet_ptr.u64);
+			}
+#endif
+			data = phys_to_virt(packet_ptr.addr);
+			next_skb = octeon3_eth_work_to_skb((void *)round_down((unsigned long)data, 128ull));
+			if (first_frag) {
+				next_skb->next = skb_shinfo(current_skb)->frag_list;
+				skb_shinfo(current_skb)->frag_list = next_skb;
+			} else {
+				current_skb->next = next_skb;
+				next_skb->next = NULL;
+			}
+			current_skb = next_skb;
+			first_frag = false;
+			current_skb->data = data;
+		}
+
+		/* Strip the ethernet fcs */
+		pskb_trim(skb, skb->len - 4);
+	}
+
+	skb_checksum_none_assert(skb);
+	if (unlikely(priv->rx_timestamp_hw)) {
+		/* The first 8 bytes are the timestamp */
+		u64 hwts = *(u64 *)skb->data;
+		u64 ns;
+		struct skb_shared_hwtstamps *shts;
+
+		ns = timecounter_cyc2time(&priv->tc, hwts);
+		shts = skb_hwtstamps(skb);
+		memset(shts, 0, sizeof(*shts));
+		shts->hwtstamp = ns_to_ktime(ns);
+		__skb_pull(skb, 8);
+	}
+
+	skb->protocol = eth_type_trans(skb, priv->netdev);
+	skb->dev = priv->netdev;
+	if (priv->netdev->features & NETIF_F_RXCSUM) {
+		if ((work->word2.lc_hdr_type == PKI_LTYPE_IP4 ||
+		     work->word2.lc_hdr_type == PKI_LTYPE_IP6) &&
+		    (work->word2.lf_hdr_type == PKI_LTYPE_TCP ||
+		     work->word2.lf_hdr_type == PKI_LTYPE_UDP ||
+		     work->word2.lf_hdr_type == PKI_LTYPE_SCTP))
+			if (work->word2.err_code == 0)
+				skb->ip_summed = CHECKSUM_UNNECESSARY;
+	}
+
+	napi_gro_receive(&rx->napi, skb);
+out:
+	return ret;
+}
+
+static int octeon3_eth_napi(struct napi_struct *napi, int budget)
+{
+	struct octeon3_ethernet *priv;
+	struct octeon3_rx *cxt;
+	int rx_count = 0;
+	u64 old_scratch;
+	int n_bufs = 0;
+	u64 aq_cnt;
+	int n = 0;
+
+	cxt = container_of(napi, struct octeon3_rx, napi);
+	priv = cxt->parent;
+
+	/* Get the amount of work pending */
+	aq_cnt = oct_csr_read(SSO_GRP_AQ_CNT(priv->node, cxt->rx_grp));
+	aq_cnt &= GENMASK_ULL(32, 0);
+
+	if (likely(USE_ASYNC_IOBDMA)) {
+		/* Save scratch in case userspace is using it */
+		CVMX_SYNCIOBDMA;
+		old_scratch = scratch_read64(SCR_SCRATCH);
+
+		octeon3_core_get_work_async(cxt->rx_grp);
+	}
+
+	while (rx_count < budget) {
+		n = 0;
+
+		if (likely(USE_ASYNC_IOBDMA)) {
+			bool req_next = rx_count < (budget - 1) ? true : false;
+
+			n = octeon3_eth_rx_one(cxt, true, req_next);
+		} else {
+			n = octeon3_eth_rx_one(cxt, false, false);
+		}
+
+		if (n == 0)
+			break;
+
+		n_bufs += n;
+		rx_count++;
+	}
+
+	/* Wake up worker threads */
+	n_bufs = atomic64_add_return(n_bufs, &priv->buffers_needed);
+	if (n_bufs >= 32) {
+		struct octeon3_ethernet_node *oen;
+
+		oen = octeon3_eth_node + priv->node;
+		atomic_set(&oen->workers[0].kick, 1);
+		wake_up(&oen->workers[0].queue);
+	}
+
+	/* Stop the thread when no work is pending */
+	if (rx_count < budget) {
+		napi_complete(napi);
+		octeon3_sso_irq_set(cxt->parent->node, cxt->rx_grp, true);
+	}
+
+	if (likely(USE_ASYNC_IOBDMA)) {
+		/* Restore the scratch area */
+		scratch_write64(SCR_SCRATCH, old_scratch);
+	}
+
+	return rx_count;
+}
+
+#undef BROKEN_SIMULATOR_CSUM
+
+static void ethtool_get_drvinfo(struct net_device *netdev,
+				struct ethtool_drvinfo *info)
+{
+	strlcpy(info->driver, "octeon3-ethernet", sizeof(info->driver));
+	strlcpy(info->version, "1.0", sizeof(info->version));
+	strlcpy(info->bus_info,
+		dev_name(netdev->dev.parent), sizeof(info->bus_info));
+}
+
+static int ethtool_get_ts_info(struct net_device *ndev,
+			       struct ethtool_ts_info *info)
+{
+	struct octeon3_ethernet *priv = netdev_priv(ndev);
+
+	info->so_timestamping =
+		SOF_TIMESTAMPING_TX_HARDWARE |
+		SOF_TIMESTAMPING_RX_HARDWARE |
+		SOF_TIMESTAMPING_RAW_HARDWARE;
+
+	if (priv->ptp_clock)
+		info->phc_index = ptp_clock_index(priv->ptp_clock);
+	else
+		info->phc_index = -1;
+
+	info->tx_types = (1 << HWTSTAMP_TX_OFF) | (1 << HWTSTAMP_TX_ON);
+
+	info->rx_filters = (1 << HWTSTAMP_FILTER_NONE) | (1 << HWTSTAMP_FILTER_ALL);
+
+	return 0;
+}
+
+static const struct ethtool_ops octeon3_ethtool_ops = {
+	.get_drvinfo = ethtool_get_drvinfo,
+	.get_link_ksettings = bgx_port_ethtool_get_link_ksettings,
+	.set_settings = bgx_port_ethtool_set_settings,
+	.nway_reset = bgx_port_ethtool_nway_reset,
+	.get_link = ethtool_op_get_link,
+	.get_ts_info = ethtool_get_ts_info,
+};
+
+static int octeon3_eth_common_ndo_init(struct net_device *netdev,
+				       int extra_skip)
+{
+	struct octeon3_ethernet_node *oen;
+	int base_rx_grp[MAX_RX_QUEUES];
+	struct octeon3_ethernet *priv;
+	int pki_chan;
+	int aura;
+	int dq;
+	int i;
+	int r;
+
+	priv = netdev_priv(netdev);
+	oen = octeon3_eth_node + priv->node;
+
+	netif_carrier_off(netdev);
+
+	netdev->features |=
+#ifndef BROKEN_SIMULATOR_CSUM
+		NETIF_F_IP_CSUM |
+		NETIF_F_IPV6_CSUM |
+#endif
+		NETIF_F_SG |
+		NETIF_F_FRAGLIST |
+		NETIF_F_RXCSUM |
+		NETIF_F_LLTX;
+
+	if (!OCTEON_IS_MODEL(OCTEON_CN78XX_PASS1_X))
+		netdev->features |= NETIF_F_SCTP_CRC;
+
+	netdev->features |= NETIF_F_TSO | NETIF_F_TSO6;
+
+	/* Set user changeable settings */
+	netdev->hw_features = netdev->features;
+
+	priv->rx_buf_count = num_packet_buffers;
+
+	pki_chan = get_pki_chan(priv->node, priv->interface, priv->index);
+
+	dq = octeon3_pko_interface_init(priv->node, priv->interface,
+					priv->index, priv->mac_type, pki_chan);
+	if (dq < 0) {
+		dev_err(netdev->dev.parent, "Failed to initialize pko\n");
+		return -ENODEV;
+	}
+
+	r = octeon3_pko_activate_dq(priv->node, dq, 1);
+	if (r < 0) {
+		dev_err(netdev->dev.parent, "Failed to activate dq\n");
+		return -ENODEV;
+	}
+
+	priv->pko_queue = dq;
+	octeon_fpa3_aura_init(priv->node, oen->pki_packet_pool, -1, &aura,
+			      num_packet_buffers, num_packet_buffers * 2);
+	priv->pki_aura = aura;
+
+	r = octeon3_sso_alloc_grp_range(priv->node,
+					-1, rx_queues, false, base_rx_grp);
+	if (r) {
+		dev_err(netdev->dev.parent, "Failed to allocated SSO group\n");
+		return -ENODEV;
+	}
+	for (i = 0; i < rx_queues; i++) {
+		priv->rx_cxt[i].rx_grp = base_rx_grp[i];
+		priv->rx_cxt[i].parent = priv;
+
+		if (OCTEON_IS_MODEL(OCTEON_CN78XX_PASS1_X))
+			octeon3_sso_pass1_limit(priv->node, priv->rx_cxt[i].rx_grp);
+	}
+	priv->num_rx_cxt = rx_queues;
+
+	priv->tx_complete_grp = oen->tx_complete_grp;
+	dev_info(netdev->dev.parent,
+		 "rx sso grp:%d..%d aura:%d pknd:%d pko_queue:%d\n",
+		 *base_rx_grp, *(base_rx_grp + priv->num_rx_cxt - 1),
+		 priv->pki_aura, priv->pknd, priv->pko_queue);
+
+	octeon3_pki_port_init(priv->node, priv->pki_aura, *base_rx_grp,
+			      extra_skip, (packet_buffer_size - 128),
+			      priv->pknd, priv->num_rx_cxt);
+
+	priv->last_packets = 0;
+	priv->last_octets = 0;
+	priv->last_dropped = 0;
+
+	/* Register ethtool methods */
+	netdev->ethtool_ops = &octeon3_ethtool_ops;
+
+	return 0;
+}
+
+static int octeon3_eth_bgx_ndo_init(struct net_device *netdev)
+{
+	struct octeon3_ethernet *priv = netdev_priv(netdev);
+	const u8 *mac;
+	int r;
+
+	priv->pknd = bgx_port_get_pknd(priv->node, priv->interface, priv->index);
+	octeon3_eth_common_ndo_init(netdev, 0);
+
+	/* Padding and FCS are done in BGX */
+	r = octeon3_pko_set_mac_options(priv->node, priv->interface, priv->index,
+					priv->mac_type, false, false, 0);
+	if (r)
+		return r;
+
+	mac = bgx_port_get_mac(netdev);
+	if (mac && is_valid_ether_addr(mac)) {
+		memcpy(netdev->dev_addr, mac, ETH_ALEN);
+		netdev->addr_assign_type &= ~NET_ADDR_RANDOM;
+	} else {
+		eth_hw_addr_random(netdev);
+	}
+
+	bgx_port_set_rx_filtering(netdev);
+	bgx_port_change_mtu(netdev, netdev->mtu);
+
+	return 0;
+}
+
+static void octeon3_eth_ndo_uninit(struct net_device *netdev)
+{
+	struct octeon3_ethernet *priv = netdev_priv(netdev);
+	int grp[MAX_RX_QUEUES];
+	int i;
+
+	/* Shutdwon pki for this interface */
+	octeon3_pki_port_shutdown(priv->node, priv->pknd);
+	octeon_fpa3_release_aura(priv->node, priv->pki_aura);
+
+	/* Shutdown pko for this interface */
+	octeon3_pko_interface_uninit(priv->node, &priv->pko_queue, 1);
+
+	/* Free the receive contexts sso groups */
+	for (i = 0; i < rx_queues; i++)
+		grp[i] = priv->rx_cxt[i].rx_grp;
+	octeon3_sso_free_grp_range(priv->node, grp, rx_queues);
+}
+
+static void octeon3_eth_ndo_get_stats64(struct net_device *netdev,
+					struct rtnl_link_stats64 *s)
+{
+	struct octeon3_ethernet *priv = netdev_priv(netdev);
+	u64 delta_packets;
+	u64 delta_dropped;
+	u64 delta_octets;
+	u64 dropped;
+	u64 packets;
+	u64 octets;
+
+	/* The 48 bits counters may wrap around.  We need to call this
+	 * function periodically, to catch any wrap.  Locking is
+	 * needed to ensure consistency of the RMW operation on the
+	 * last_{packets, octets, dropped} variables if two or more
+	 * threads enter here at the same time.
+	 */
+	spin_lock(&priv->stat_lock);
+
+	octeon3_pki_get_stats(priv->node, priv->pknd, &packets, &octets, &dropped);
+
+	delta_packets = (packets - priv->last_packets) & ((1ull << 48) - 1);
+	delta_octets = (octets - priv->last_octets) & ((1ull << 48) - 1);
+	delta_dropped = (dropped - priv->last_dropped) & ((1ull << 48) - 1);
+
+	priv->last_packets = packets;
+	priv->last_octets = octets;
+	priv->last_dropped = dropped;
+
+	spin_unlock(&priv->stat_lock);
+
+	s->rx_packets = atomic64_add_return_relaxed(delta_packets, &priv->rx_packets);
+	s->rx_bytes = atomic64_add_return_relaxed(delta_octets, &priv->rx_octets);
+	s->rx_dropped = atomic64_add_return_relaxed(delta_dropped, &priv->rx_dropped);
+
+	s->rx_errors = atomic64_read(&priv->rx_errors);
+	s->rx_length_errors = atomic64_read(&priv->rx_length_errors);
+	s->rx_crc_errors = atomic64_read(&priv->rx_crc_errors);
+
+	s->tx_packets = atomic64_read(&priv->tx_packets);
+	s->tx_bytes = atomic64_read(&priv->tx_octets);
+	s->tx_dropped = atomic64_read(&priv->tx_dropped);
+}
+
+static void octeon3_eth_stat_poll(struct work_struct *work)
+{
+	struct octeon3_ethernet *priv;
+	struct rtnl_link_stats64 s;
+
+	priv = container_of(work, struct octeon3_ethernet, stat_work.work);
+	octeon3_eth_ndo_get_stats64(priv->netdev, &s);
+
+	/* Poll every 60s */
+	mod_delayed_work(system_unbound_wq,
+			 &priv->stat_work, msecs_to_jiffies(60000));
+}
+
+static irqreturn_t octeon3_eth_rx_handler(int irq, void *info)
+{
+	struct octeon3_rx *rx = info;
+
+	/* Disarm the irq. */
+	octeon3_sso_irq_set(rx->parent->node, rx->rx_grp, false);
+
+	napi_schedule(&rx->napi);
+	return IRQ_HANDLED;
+}
+
+static int octeon3_eth_common_ndo_open(struct net_device *netdev)
+{
+	struct octeon3_ethernet *priv = netdev_priv(netdev);
+	struct octeon3_rx *rx;
+	int i;
+	int r;
+
+	for (i = 0; i < priv->num_rx_cxt; i++) {
+		unsigned int sso_intsn;
+
+		rx = priv->rx_cxt + i;
+		sso_intsn = SSO_INTSN_EXE << 12 | rx->rx_grp;
+
+		rx->rx_irq = irq_create_mapping(NULL, sso_intsn);
+		if (!rx->rx_irq) {
+			netdev_err(netdev,
+				   "ERROR: Couldn't map hwirq: %x\n", sso_intsn);
+			r = -EINVAL;
+			goto err1;
+		}
+		r = request_irq(rx->rx_irq, octeon3_eth_rx_handler,
+				IRQ_TYPE_EDGE_RISING, netdev_name(netdev), rx);
+		if (r) {
+			netdev_err(netdev, "ERROR: Couldn't request irq: %d\n",
+				   rx->rx_irq);
+			r = -ENOMEM;
+			goto err2;
+		}
+
+		octeon3_eth_gen_affinity(priv->node, &rx->rx_affinity_hint);
+		irq_set_affinity_hint(rx->rx_irq, &rx->rx_affinity_hint);
+
+		netif_napi_add(priv->netdev, &rx->napi,
+			       octeon3_eth_napi, NAPI_POLL_WEIGHT);
+		napi_enable(&rx->napi);
+
+		/* Arm the irq. */
+		octeon3_sso_irq_set(priv->node, rx->rx_grp, true);
+	}
+	octeon3_eth_replenish_rx(priv, priv->rx_buf_count);
+
+	/* Start stat polling */
+	octeon3_eth_stat_poll(&priv->stat_work.work);
+
+	return 0;
+
+err2:
+	irq_dispose_mapping(rx->rx_irq);
+err1:
+	for (i--; i >= 0; i--) {
+		rx = priv->rx_cxt + i;
+		free_irq(rx->rx_irq, rx);
+		irq_dispose_mapping(rx->rx_irq);
+		napi_disable(&rx->napi);
+		netif_napi_del(&rx->napi);
+	}
+
+	return r;
+}
+
+static int octeon3_eth_bgx_ndo_open(struct net_device *netdev)
+{
+	int rc;
+
+	rc = octeon3_eth_common_ndo_open(netdev);
+	if (rc == 0)
+		rc = bgx_port_enable(netdev);
+
+	return rc;
+}
+
+static int octeon3_eth_common_ndo_stop(struct net_device *netdev)
+{
+	struct octeon3_ethernet *priv = netdev_priv(netdev);
+	struct octeon3_rx *rx;
+	struct sk_buff *skb;
+	void **w;
+	int i;
+
+	cancel_delayed_work_sync(&priv->stat_work);
+
+	/* Allow enough time for ingress in transit packets to be drained */
+	msleep(20);
+
+	/* Wait until sso has no more work for this interface */
+	for (i = 0; i < priv->num_rx_cxt; i++) {
+		rx = priv->rx_cxt + i;
+		while (oct_csr_read(SSO_GRP_AQ_CNT(priv->node, rx->rx_grp)))
+			msleep(20);
+	}
+
+	/* Free the irq and napi context for each rx context */
+	for (i = 0; i < priv->num_rx_cxt; i++) {
+		rx = priv->rx_cxt + i;
+		octeon3_sso_irq_set(priv->node, rx->rx_grp, false);
+		irq_set_affinity_hint(rx->rx_irq, NULL);
+		free_irq(rx->rx_irq, rx);
+		irq_dispose_mapping(rx->rx_irq);
+		rx->rx_irq = 0;
+		napi_disable(&rx->napi);
+		netif_napi_del(&rx->napi);
+	}
+
+	/* Free the packet buffers */
+	for (;;) {
+		w = octeon_fpa3_alloc(priv->node, priv->pki_aura);
+		if (!w)
+			break;
+		skb = w[0];
+		dev_kfree_skb(skb);
+	}
+
+	return 0;
+}
+
+static int octeon3_eth_bgx_ndo_stop(struct net_device *netdev)
+{
+	int r;
+
+	r = bgx_port_disable(netdev);
+	if (r)
+		return r;
+
+	return octeon3_eth_common_ndo_stop(netdev);
+}
+
+static inline u64 build_pko_send_hdr_desc(struct sk_buff *skb)
+{
+	u64 checksum_alg;
+	u64 send_hdr = 0;
+	u8 l4_hdr = 0;
+
+	/* See PKO_SEND_HDR_S in the HRM for the send header descriptor
+	 * format.
+	 */
+#ifdef __LITTLE_ENDIAN
+	send_hdr |= BIT(43);
+#endif
+
+	if (!OCTEON_IS_MODEL(OCTEON_CN78XX_PASS1_X)) {
+		/* Don't allocate to L2 */
+		send_hdr |= BIT(42);
+	}
+
+	/* Don't automatically free to FPA */
+	send_hdr |= BIT(40);
+
+	send_hdr |= skb->len;
+
+	if (skb->ip_summed != CHECKSUM_NONE &&
+	    skb->ip_summed != CHECKSUM_UNNECESSARY) {
+#ifndef BROKEN_SIMULATOR_CSUM
+		switch (skb->protocol) {
+		case htons(ETH_P_IP):
+			send_hdr |= ETH_HLEN << 16;
+			send_hdr |= BIT(45);
+			l4_hdr = ip_hdr(skb)->protocol;
+			send_hdr |= (ETH_HLEN + (4 * ip_hdr(skb)->ihl)) << 24;
+			break;
+
+		case htons(ETH_P_IPV6):
+			l4_hdr = ipv6_hdr(skb)->nexthdr;
+			send_hdr |= ETH_HLEN << 16;
+			break;
+
+		default:
+			break;
+		}
+#endif
+
+		checksum_alg = 1; /* UDP == 1 */
+		switch (l4_hdr) {
+		case IPPROTO_SCTP:
+			if (OCTEON_IS_MODEL(OCTEON_CN78XX_PASS1_X))
+				break;
+			checksum_alg++; /* SCTP == 3 */
+			/* Fall through */
+		case IPPROTO_TCP: /* TCP == 2 */
+			checksum_alg++;
+			/* Fall through */
+		case IPPROTO_UDP:
+			if (skb_transport_header_was_set(skb)) {
+				int l4ptr = skb_transport_header(skb) -
+					skb->data;
+				send_hdr &= ~GENMASK_ULL(31, 24);
+				send_hdr |= l4ptr << 24;
+				send_hdr |= checksum_alg << 46;
+			}
+			break;
+
+		default:
+			break;
+		}
+	}
+
+	return send_hdr;
+}
+
+static inline u64 build_pko_send_ext_desc(struct sk_buff *skb)
+{
+	u64 send_ext;
+
+	/* See PKO_SEND_EXT_S in the HRM for the send extended descriptor
+	 * format.
+	 */
+	skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
+	send_ext = (u64)PKO_SENDSUBDC_EXT << 44;
+	send_ext |= 1ull << 40;
+	send_ext |= BIT(39);
+	send_ext |= ETH_HLEN << 16;
+
+	return send_ext;
+}
+
+static inline u64 build_pko_send_tso(struct sk_buff *skb, uint mtu)
+{
+	u64 send_tso;
+
+	/* See PKO_SEND_TSO_S in the HRM for the send tso descriptor format */
+	send_tso = 12ull << 56;
+	send_tso |= (u64)PKO_SENDSUBDC_TSO << 44;
+	send_tso |= (skb_transport_offset(skb) + tcp_hdrlen(skb)) << 24;
+	send_tso |= (mtu + ETH_HLEN) << 8;
+
+	return send_tso;
+}
+
+static inline u64 build_pko_send_mem_sub(u64 addr)
+{
+	u64 send_mem;
+
+	/* See PKO_SEND_MEM_S in the HRM for the send mem descriptor format */
+	send_mem = (u64)PKO_SENDSUBDC_MEM << 44;
+	send_mem |= (u64)MEMDSZ_B64 << 60;
+	send_mem |= (u64)MEMALG_SUB << 56;
+	send_mem |= BIT_ULL(48);
+	send_mem |= addr;
+
+	return send_mem;
+}
+
+static inline u64 build_pko_send_mem_ts(u64 addr)
+{
+	u64 send_mem;
+
+	/* See PKO_SEND_MEM_S in the HRM for the send mem descriptor format */
+	send_mem = 1ull << 62;
+	send_mem |= (u64)PKO_SENDSUBDC_MEM << 44;
+	send_mem |= (u64)MEMDSZ_B64 << 60;
+	send_mem |= (u64)MEMALG_SETTSTMP << 56;
+	send_mem |= addr;
+
+	return send_mem;
+}
+
+static inline u64 build_pko_send_free(u64 addr)
+{
+	u64 send_free;
+
+	/* See PKO_SEND_FREE_S in the HRM for the send free descriptor format */
+	send_free = (u64)PKO_SENDSUBDC_FREE << 44;
+	send_free |= addr;
+
+	return send_free;
+}
+
+static inline u64 build_pko_send_work(int grp, u64 addr)
+{
+	u64 send_work;
+
+	/* See PKO_SEND_WORK_S in the HRM for the send work descriptor format */
+	send_work = (u64)PKO_SENDSUBDC_WORK << 44;
+	send_work |= (u64)grp << 52;
+	send_work |= 2ull << 50;
+	send_work |= addr;
+
+	return send_work;
+}
+
+static int octeon3_eth_ndo_start_xmit(struct sk_buff *skb,
+				      struct net_device *netdev)
+{
+	struct octeon3_ethernet *priv = netdev_priv(netdev);
+	struct octeon3_ethernet_node *oen;
+	u64 scr_off = LMTDMA_SCR_OFFSET;
+	struct sk_buff *skb_tmp;
+	u64 pko_send_desc;
+	u64 *lmtdma_addr;
+	unsigned int mss;
+	u64 lmtdma_data;
+	u64 aq_cnt = 0;
+	int frag_count;
+	long backlog;
+	u64 head_len;
+	void **work;
+	int grp;
+	int i;
+
+	frag_count = 0;
+	if (skb_has_frag_list(skb))
+		skb_walk_frags(skb, skb_tmp)
+			frag_count++;
+
+	/* Stop the queue if pko or sso are not keeping up */
+	oen = octeon3_eth_node + priv->node;
+	aq_cnt = oct_csr_read(SSO_GRP_AQ_CNT(oen->node, oen->tx_complete_grp));
+	aq_cnt &= GENMASK_ULL(32, 0);
+	backlog = atomic64_inc_return(&priv->tx_backlog);
+	if (unlikely(backlog > MAX_TX_QUEUE_DEPTH || aq_cnt > 100000))
+		netif_stop_queue(netdev);
+
+	/* We have space for 11 segment pointers, If there will be
+	 * more than that, we must linearize.  The count is: 1 (base
+	 * SKB) + frag_count + nr_frags.
+	 */
+	if (unlikely(skb_shinfo(skb)->nr_frags + frag_count > 10)) {
+		if (unlikely(__skb_linearize(skb)))
+			goto skip_xmit;
+		frag_count = 0;
+	}
+
+	work = (void **)skb->cb;
+	work[0] = netdev;
+	work[1] = NULL;
+
+	/* Adjust the port statistics. */
+	atomic64_inc(&priv->tx_packets);
+	atomic64_add(skb->len, &priv->tx_octets);
+
+	/* Make sure packet data writes are committed before
+	 * submitting the command below
+	 */
+	wmb();
+
+	/* Build the pko command */
+	pko_send_desc = build_pko_send_hdr_desc(skb);
+	/* We don't save/restore state of CPU local memory for kernel
+	 * space access, so we must disable preemption while we build
+	 * and transmit the PKO command.
+	 */
+	preempt_disable();
+	scratch_write64(scr_off, pko_send_desc);
+	scr_off += sizeof(pko_send_desc);
+
+	/* Request packet to be ptp timestamped */
+	if ((unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) &&
+	    unlikely(priv->tx_timestamp_hw)) {
+		pko_send_desc = build_pko_send_ext_desc(skb);
+		scratch_write64(scr_off, pko_send_desc);
+		scr_off += sizeof(pko_send_desc);
+	}
+
+	/* Add the tso descriptor if needed */
+	mss = skb_shinfo(skb)->gso_size;
+	if (unlikely(mss)) {
+		pko_send_desc = build_pko_send_tso(skb, netdev->mtu);
+		scratch_write64(scr_off, pko_send_desc);
+		scr_off += sizeof(pko_send_desc);
+	}
+
+	/* Add a gather descriptor for each segment. See PKO_SEND_GATHER_S for
+	 * the send gather descriptor format.
+	 */
+	pko_send_desc = (u64)PKO_SENDSUBDC_GATHER << 45;
+	head_len = skb_headlen(skb);
+	if (head_len > 0) {
+		pko_send_desc |= head_len << 48;
+		pko_send_desc |= virt_to_phys(skb->data);
+		scratch_write64(scr_off, pko_send_desc);
+		scr_off += sizeof(pko_send_desc);
+	}
+	for (i = 1; i <= skb_shinfo(skb)->nr_frags; i++) {
+		struct skb_frag_struct *fs = skb_shinfo(skb)->frags + i - 1;
+
+		pko_send_desc &= ~(GENMASK_ULL(63, 48) | GENMASK_ULL(41, 0));
+		pko_send_desc |= (u64)fs->size << 48;
+		pko_send_desc |= virt_to_phys((u8 *)page_address(fs->page.p) + fs->page_offset);
+		scratch_write64(scr_off, pko_send_desc);
+		scr_off += sizeof(pko_send_desc);
+	}
+	skb_walk_frags(skb, skb_tmp) {
+		pko_send_desc &= ~(GENMASK_ULL(63, 48) | GENMASK_ULL(41, 0));
+		pko_send_desc |= (u64)skb_tmp->len << 48;
+		pko_send_desc |= virt_to_phys(skb_tmp->data);
+		scratch_write64(scr_off, pko_send_desc);
+		scr_off += sizeof(pko_send_desc);
+	}
+
+	/* Subtract 1 from the tx_backlog. */
+	pko_send_desc = build_pko_send_mem_sub(virt_to_phys(&priv->tx_backlog));
+	scratch_write64(scr_off, pko_send_desc);
+	scr_off += sizeof(pko_send_desc);
+
+	/* Write the ptp timestamp in the skb itself */
+	if ((unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) &&
+	    unlikely(priv->tx_timestamp_hw)) {
+		pko_send_desc = build_pko_send_mem_ts(virt_to_phys(&work[1]));
+		scratch_write64(scr_off, pko_send_desc);
+		scr_off += sizeof(pko_send_desc);
+	}
+
+	/* Send work when finished with the packet. */
+	grp = octeon3_eth_lgrp_to_ggrp(priv->node, priv->tx_complete_grp);
+	pko_send_desc = build_pko_send_work(grp, virt_to_phys(work));
+	scratch_write64(scr_off, pko_send_desc);
+	scr_off += sizeof(pko_send_desc);
+
+	/* See PKO_SEND_DMA_S in the HRM for the lmtdam data format */
+	lmtdma_data = 0;
+	lmtdma_data |= (u64)(LMTDMA_SCR_OFFSET >> 3) << 56;
+	lmtdma_data |= 0x51ull << 40;
+	lmtdma_data |= (u64)priv->node << 36;
+	lmtdma_data |= priv->pko_queue << 16;
+
+	lmtdma_addr = (u64 *)(LMTDMA_ORDERED_IO_ADDR | ((scr_off & 0x78) - 8));
+	*lmtdma_addr = lmtdma_data;
+
+	preempt_enable();
+
+	return NETDEV_TX_OK;
+skip_xmit:
+	atomic64_inc(&priv->tx_dropped);
+	dev_consume_skb_any(skb);
+	return NETDEV_TX_OK;
+}
+
+static int octeon3_eth_set_mac_address(struct net_device *netdev, void *addr)
+{
+	int r = eth_mac_addr(netdev, addr);
+
+	if (r)
+		return r;
+
+	bgx_port_set_rx_filtering(netdev);
+
+	return 0;
+}
+
+static u64 octeon3_cyclecounter_read(const struct cyclecounter *cc)
+{
+	struct octeon3_ethernet	*priv;
+	u64 count;
+
+	priv = container_of(cc, struct octeon3_ethernet, cc);
+	count = oct_csr_read(MIO_PTP_CLOCK_HI(priv->node));
+	return count;
+}
+
+static int octeon3_bgx_hwtstamp(struct net_device *netdev, int en)
+{
+	struct octeon3_ethernet *priv = netdev_priv(netdev);
+	u64 data;
+
+	switch (bgx_port_get_mode(priv->node, priv->interface, priv->index)) {
+	case PORT_MODE_RGMII:
+	case PORT_MODE_SGMII:
+		data = oct_csr_read(BGX_GMP_GMI_RX_FRM_CTL(priv->node, priv->interface, priv->index));
+		if (en)
+			data |= BIT(12);
+		else
+			data &= ~BIT(12);
+		oct_csr_write(data, BGX_GMP_GMI_RX_FRM_CTL(priv->node, priv->interface, priv->index));
+		break;
+
+	case PORT_MODE_XAUI:
+	case PORT_MODE_RXAUI:
+	case PORT_MODE_10G_KR:
+	case PORT_MODE_XLAUI:
+	case PORT_MODE_40G_KR4:
+	case PORT_MODE_XFI:
+		data = oct_csr_read(BGX_SMU_RX_FRM_CTL(priv->node, priv->interface, priv->index));
+		if (en)
+			data |= BIT(12);
+		else
+			data &= ~BIT(12);
+		oct_csr_write(data, BGX_SMU_RX_FRM_CTL(priv->node, priv->interface, priv->index));
+		break;
+
+	default:
+		/* No timestamp support*/
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static int octeon3_pki_hwtstamp(struct net_device *netdev, int en)
+{
+	struct octeon3_ethernet		*priv = netdev_priv(netdev);
+	int				skip = en ? 8 : 0;
+
+	octeon3_pki_set_ptp_skip(priv->node, priv->pknd, skip);
+
+	return 0;
+}
+
+static int octeon3_ioctl_hwtstamp(struct net_device *netdev,
+				  struct ifreq *rq, int cmd)
+{
+	struct octeon3_ethernet *priv = netdev_priv(netdev);
+	struct hwtstamp_config config;
+	u64 data;
+	int en;
+
+	/* The PTP block should be enabled */
+	data = oct_csr_read(MIO_PTP_CLOCK_CFG(priv->node));
+	if (!(data & BIT(0))) {
+		netdev_err(netdev, "Error: PTP clock not enabled\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (copy_from_user(&config, rq->ifr_data, sizeof(config)))
+		return -EFAULT;
+
+	if (config.flags) /* reserved for future extensions */
+		return -EINVAL;
+
+	switch (config.tx_type) {
+	case HWTSTAMP_TX_OFF:
+		priv->tx_timestamp_hw = 0;
+		break;
+	case HWTSTAMP_TX_ON:
+		priv->tx_timestamp_hw = 1;
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	switch (config.rx_filter) {
+	case HWTSTAMP_FILTER_NONE:
+		priv->rx_timestamp_hw = 0;
+		en = 0;
+		break;
+	case HWTSTAMP_FILTER_ALL:
+	case HWTSTAMP_FILTER_SOME:
+	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+		priv->rx_timestamp_hw = 1;
+		en = 1;
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	octeon3_bgx_hwtstamp(netdev, en);
+	octeon3_pki_hwtstamp(netdev, en);
+
+	priv->cc.read = octeon3_cyclecounter_read;
+	priv->cc.mask = CYCLECOUNTER_MASK(64);
+	/* Ptp counter is always in nsec */
+	priv->cc.mult = 1;
+	priv->cc.shift = 0;
+	timecounter_init(&priv->tc, &priv->cc, ktime_to_ns(ktime_get_real()));
+
+	return 0;
+}
+
+static int octeon3_adjfreq(struct ptp_clock_info *ptp, s32 ppb)
+{
+	struct octeon3_ethernet	*priv;
+	int neg_ppb = 0;
+	u64 comp;
+	u64 diff;
+
+	priv = container_of(ptp, struct octeon3_ethernet, ptp_info);
+
+	if (ppb < 0) {
+		ppb = -ppb;
+		neg_ppb = 1;
+	}
+
+	/* The part per billion (ppb) is a delta from the base frequency */
+	comp = (NSEC_PER_SEC << 32) / octeon_get_io_clock_rate();
+
+	diff = comp;
+	diff *= ppb;
+	diff = div_u64(diff, 1000000000ULL);
+
+	comp = neg_ppb ? comp - diff : comp + diff;
+
+	oct_csr_write(comp, MIO_PTP_CLOCK_COMP(priv->node));
+
+	return 0;
+}
+
+static int octeon3_adjtime(struct ptp_clock_info *ptp, s64 delta)
+{
+	struct octeon3_ethernet *priv;
+	unsigned long flags;
+	s64 now;
+
+	priv = container_of(ptp, struct octeon3_ethernet, ptp_info);
+
+	spin_lock_irqsave(&priv->ptp_lock, flags);
+	now = timecounter_read(&priv->tc);
+	now += delta;
+	timecounter_init(&priv->tc, &priv->cc, now);
+	spin_unlock_irqrestore(&priv->ptp_lock, flags);
+
+	return 0;
+}
+
+static int octeon3_gettime(struct ptp_clock_info *ptp, struct timespec *ts)
+{
+	struct octeon3_ethernet	*priv;
+	unsigned long flags;
+	u32 remainder;
+	u64 ns;
+
+	priv = container_of(ptp, struct octeon3_ethernet, ptp_info);
+
+	spin_lock_irqsave(&priv->ptp_lock, flags);
+	ns = timecounter_read(&priv->tc);
+	spin_unlock_irqrestore(&priv->ptp_lock, flags);
+	ts->tv_sec = div_u64_rem(ns, 1000000000ULL, &remainder);
+	ts->tv_nsec = remainder;
+
+	return 0;
+}
+
+static int octeon3_settime(struct ptp_clock_info *ptp,
+			   const struct timespec *ts)
+{
+	struct octeon3_ethernet	*priv;
+	unsigned long flags;
+	u64 ns;
+
+	priv = container_of(ptp, struct octeon3_ethernet, ptp_info);
+	ns = timespec_to_ns(ts);
+
+	spin_lock_irqsave(&priv->ptp_lock, flags);
+	timecounter_init(&priv->tc, &priv->cc, ns);
+	spin_unlock_irqrestore(&priv->ptp_lock, flags);
+
+	return 0;
+}
+
+static int octeon3_enable(struct ptp_clock_info *ptp,
+			  struct ptp_clock_request *rq, int on)
+{
+	return -EOPNOTSUPP;
+}
+
+static int octeon3_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
+{
+	int rc;
+
+	switch (cmd) {
+	case SIOCSHWTSTAMP:
+		rc = octeon3_ioctl_hwtstamp(netdev, ifr, cmd);
+		break;
+
+	default:
+		rc = bgx_port_do_ioctl(netdev, ifr, cmd);
+		break;
+	}
+
+	return rc;
+}
+
+static const struct net_device_ops octeon3_eth_netdev_ops = {
+	.ndo_init		= octeon3_eth_bgx_ndo_init,
+	.ndo_uninit		= octeon3_eth_ndo_uninit,
+	.ndo_open		= octeon3_eth_bgx_ndo_open,
+	.ndo_stop		= octeon3_eth_bgx_ndo_stop,
+	.ndo_start_xmit		= octeon3_eth_ndo_start_xmit,
+	.ndo_get_stats64	= octeon3_eth_ndo_get_stats64,
+	.ndo_set_rx_mode	= bgx_port_set_rx_filtering,
+	.ndo_set_mac_address	= octeon3_eth_set_mac_address,
+	.ndo_change_mtu		= bgx_port_change_mtu,
+	.ndo_do_ioctl		= octeon3_ioctl,
+};
+
+static int octeon3_eth_probe(struct platform_device *pdev)
+{
+	struct octeon3_ethernet *priv;
+	struct net_device *netdev;
+	int r;
+
+	struct mac_platform_data *pd = dev_get_platdata(&pdev->dev);
+
+	r = octeon3_eth_global_init(pd->numa_node, pdev);
+	if (r)
+		return r;
+
+	dev_info(&pdev->dev, "Probing %d-%d:%d\n",
+		 pd->numa_node, pd->interface, pd->port);
+	netdev = alloc_etherdev(sizeof(struct octeon3_ethernet));
+	if (!netdev) {
+		dev_err(&pdev->dev, "Failed to allocated ethernet device\n");
+		return -ENOMEM;
+	}
+
+	SET_NETDEV_DEV(netdev, &pdev->dev);
+	dev_set_drvdata(&pdev->dev, netdev);
+
+	if (pd->mac_type == BGX_MAC)
+		bgx_port_set_netdev(pdev->dev.parent, netdev);
+	priv = netdev_priv(netdev);
+	priv->netdev = netdev;
+	priv->mac_type = pd->mac_type;
+	INIT_LIST_HEAD(&priv->list);
+	priv->node = pd->numa_node;
+
+	mutex_lock(&octeon3_eth_node[priv->node].device_list_lock);
+	list_add_tail_rcu(&priv->list, &octeon3_eth_node[priv->node].device_list);
+	mutex_unlock(&octeon3_eth_node[priv->node].device_list_lock);
+
+	priv->index = pd->port;
+	priv->interface = pd->interface;
+	spin_lock_init(&priv->stat_lock);
+	INIT_DEFERRABLE_WORK(&priv->stat_work, octeon3_eth_stat_poll);
+
+	if (pd->src_type == XCV)
+		snprintf(netdev->name, IFNAMSIZ, "rgmii%d", pd->port);
+
+	if (priv->mac_type == BGX_MAC)
+		netdev->netdev_ops = &octeon3_eth_netdev_ops;
+
+	netdev->min_mtu = 60;
+	if (OCTEON_IS_MODEL(OCTEON_CN78XX_PASS1_X)) {
+		int fifo_size;
+		int max_mtu = 1500;
+
+		/* On 78XX-Pass1 the mtu must be limited.  The PKO may
+		 * to lock up when calculating the L4 checksum for
+		 * large packets. How large the packets can be depends
+		 * on the amount of pko fifo assigned to the port.
+		 *
+		 *   FIFO size                Max frame size
+		 *	2.5 KB				1920
+		 *	5.0 KB				4480
+		 *     10.0 KB				9600
+		 *
+		 * The maximum mtu is set to the largest frame size minus the
+		 * l2 header.
+		 */
+		fifo_size = octeon3_pko_get_fifo_size(priv->node, priv->interface,
+						      priv->index, priv->mac_type);
+
+		switch (fifo_size) {
+		case 2560:
+			max_mtu = 1920 - ETH_HLEN - ETH_FCS_LEN - (2 * VLAN_HLEN);
+			break;
+
+		case 5120:
+			max_mtu = 4480 - ETH_HLEN - ETH_FCS_LEN - (2 * VLAN_HLEN);
+			break;
+
+		case 10240:
+			max_mtu = 9600 - ETH_HLEN - ETH_FCS_LEN - (2 * VLAN_HLEN);
+			break;
+
+		default:
+			break;
+		}
+		netdev->max_mtu = max_mtu;
+	} else {
+		netdev->max_mtu = 65392;
+	}
+
+	if (register_netdev(netdev) < 0) {
+		dev_err(&pdev->dev, "Failed to register ethernet device\n");
+		list_del(&priv->list);
+		free_netdev(netdev);
+	}
+
+	spin_lock_init(&priv->ptp_lock);
+	priv->ptp_info.owner = THIS_MODULE;
+	snprintf(priv->ptp_info.name, 16, "octeon3 ptp");
+	priv->ptp_info.max_adj = 250000000;
+	priv->ptp_info.n_alarm = 0;
+	priv->ptp_info.n_ext_ts = 0;
+	priv->ptp_info.n_per_out = 0;
+	priv->ptp_info.pps = 0;
+	priv->ptp_info.adjfreq = octeon3_adjfreq;
+	priv->ptp_info.adjtime = octeon3_adjtime;
+	priv->ptp_info.gettime64 = octeon3_gettime;
+	priv->ptp_info.settime64 = octeon3_settime;
+	priv->ptp_info.enable = octeon3_enable;
+	priv->ptp_clock = ptp_clock_register(&priv->ptp_info, &pdev->dev);
+
+	netdev_info(netdev, "%d rx queues\n", rx_queues);
+	return 0;
+}
+
+/**
+ * octeon3_eth_global_exit() - Free all the used resources and restore the
+ *			       hardware to the default state.
+ * @node: Node to free/reset.
+ *
+ * Return: 0 if successful.
+ *	 < 0 for error codes.
+ */
+static int octeon3_eth_global_exit(int node)
+{
+	struct octeon3_ethernet_node *oen = octeon3_eth_node + node;
+	int i;
+
+	/* Free the tx_complete irq */
+	octeon3_sso_irq_set(node, oen->tx_complete_grp, false);
+	irq_set_affinity_hint(oen->tx_irq, NULL);
+	free_irq(oen->tx_irq, oen);
+	irq_dispose_mapping(oen->tx_irq);
+	oen->tx_irq = 0;
+
+	/* Stop the worker threads */
+	for (i = 0; i < ARRAY_SIZE(oen->workers); i++)
+		kthread_stop(oen->workers[i].task);
+
+	/* Shutdown pki */
+	octeon3_pki_shutdown(node);
+	octeon_fpa3_release_pool(node, oen->pki_packet_pool);
+	kfree(oen->pki_packet_pool_stack);
+
+	/* Shutdown pko */
+	octeon3_pko_exit_global(node);
+	for (;;) {
+		void **w;
+
+		w = octeon_fpa3_alloc(node, oen->pko_aura);
+		if (!w)
+			break;
+		kmem_cache_free(octeon3_eth_sso_pko_cache, w);
+	}
+	octeon_fpa3_release_aura(node, oen->pko_aura);
+	octeon_fpa3_release_pool(node, oen->pko_pool);
+	kfree(oen->pko_pool_stack);
+
+	/* Shutdown sso */
+	octeon3_sso_shutdown(node, oen->sso_aura);
+	octeon3_sso_free_grp(node, oen->tx_complete_grp);
+	for (;;) {
+		void **w;
+
+		w = octeon_fpa3_alloc(node, oen->sso_aura);
+		if (!w)
+			break;
+		kmem_cache_free(octeon3_eth_sso_pko_cache, w);
+	}
+	octeon_fpa3_release_aura(node, oen->sso_aura);
+	octeon_fpa3_release_pool(node, oen->sso_pool);
+	kfree(oen->sso_pool_stack);
+
+	return 0;
+}
+
+static int octeon3_eth_remove(struct platform_device *pdev)
+{
+	struct net_device *netdev = dev_get_drvdata(&pdev->dev);
+	struct octeon3_ethernet *priv = netdev_priv(netdev);
+	struct octeon3_ethernet_node *oen;
+	struct mac_platform_data *pd;
+	int node;
+
+	node = priv->node;
+	oen = octeon3_eth_node + node;
+	pd = dev_get_platdata(&pdev->dev);
+
+	ptp_clock_unregister(priv->ptp_clock);
+	unregister_netdev(netdev);
+	if (pd->mac_type == BGX_MAC)
+		bgx_port_set_netdev(pdev->dev.parent, NULL);
+	dev_set_drvdata(&pdev->dev, NULL);
+
+	/* Free all resources when there are no more devices */
+	mutex_lock(&octeon3_eth_init_mutex);
+	mutex_lock(&oen->device_list_lock);
+	list_del_rcu(&priv->list);
+	if (oen->init_done && list_empty(&oen->device_list)) {
+		oen->init_done = false;
+		octeon3_eth_global_exit(node);
+	}
+
+	mutex_unlock(&oen->device_list_lock);
+	mutex_unlock(&octeon3_eth_init_mutex);
+	free_netdev(netdev);
+
+	return 0;
+}
+
+static void octeon3_eth_shutdown(struct platform_device *pdev)
+{
+	octeon3_eth_remove(pdev);
+}
+
+static struct platform_driver octeon3_eth_driver = {
+	.probe		= octeon3_eth_probe,
+	.remove		= octeon3_eth_remove,
+	.shutdown       = octeon3_eth_shutdown,
+	.driver		= {
+		.owner	= THIS_MODULE,
+		.name	= "ethernet-mac-pki",
+	},
+};
+
+static int __init octeon3_eth_init(void)
+{
+	int nr = num_online_cpus();
+
+	if (nr >= 4)
+		rx_queues = 4;
+	else if (nr >= 2)
+		rx_queues = 2;
+	else
+		rx_queues = 1;
+
+	return platform_driver_register(&octeon3_eth_driver);
+}
+module_init(octeon3_eth_init);
+
+static void __exit octeon3_eth_exit(void)
+{
+	platform_driver_unregister(&octeon3_eth_driver);
+
+	/* Destroy the memory cache used by sso and pko */
+	kmem_cache_destroy(octeon3_eth_sso_pko_cache);
+}
+module_exit(octeon3_eth_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Cavium, Inc. <support@caviumnetworks.com>");
+MODULE_DESCRIPTION("Cavium, Inc. PKI/PKO Ethernet driver.");
diff --git a/drivers/net/ethernet/cavium/octeon/octeon3-fpa.c b/drivers/net/ethernet/cavium/octeon/octeon3-fpa.c
new file mode 100644
index 0000000..f302c32
--- /dev/null
+++ b/drivers/net/ethernet/cavium/octeon/octeon3-fpa.c
@@ -0,0 +1,358 @@
+// SPDX-License-Identifier: GPL-2.0+
+/* Driver for the Octeon III Free Pool Unit (fpa).
+ *
+ * Copyright (C) 2015-2017 Cavium, Inc.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <asm/octeon/octeon.h>
+
+#include "octeon3.h"
+
+/* Registers are accessed via xkphys */
+#define FPA_BASE(node)	(SET_XKPHYS + NODE_OFFSET(node) + 0x1280000000000ull)
+
+#define FPA_GEN_CFG(n)			(FPA_BASE(n)	     + 0x00000050)
+
+#define FPA_POOLX_CFG(n, p)		(FPA_BASE(n) + ((p) << 3) + 0x10000000)
+#define FPA_POOLX_START_ADDR(n, p)	(FPA_BASE(n) + ((p) << 3) + 0x10500000)
+#define FPA_POOLX_END_ADDR(n, p)	(FPA_BASE(n) + ((p) << 3) + 0x10600000)
+#define FPA_POOLX_STACK_BASE(n, p)	(FPA_BASE(n) + ((p) << 3) + 0x10700000)
+#define FPA_POOLX_STACK_END(n, p)	(FPA_BASE(n) + ((p) << 3) + 0x10800000)
+#define FPA_POOLX_STACK_ADDR(n, p)	(FPA_BASE(n) + ((p) << 3) + 0x10900000)
+
+#define FPA_AURAX_POOL(n, a)		(FPA_BASE(n) + ((a) << 3) + 0x20000000)
+#define FPA_AURAX_CFG(n, a)		(FPA_BASE(n) + ((a) << 3) + 0x20100000)
+#define FPA_AURAX_CNT(n, a)		(FPA_BASE(n) + ((a) << 3) + 0x20200000)
+#define FPA_AURAX_CNT_LIMIT(n, a)	(FPA_BASE(n) + ((a) << 3) + 0x20400000)
+#define FPA_AURAX_CNT_THRESHOLD(n, a)	(FPA_BASE(n) + ((a) << 3) + 0x20500000)
+#define FPA_AURAX_POOL_LEVELS(n, a)	(FPA_BASE(n) + ((a) << 3) + 0x20700000)
+#define FPA_AURAX_CNT_LEVELS(n, a)	(FPA_BASE(n) + ((a) << 3) + 0x20800000)
+
+static DEFINE_MUTEX(octeon_fpa3_lock);
+
+static int get_num_pools(void)
+{
+	if (OCTEON_IS_MODEL(OCTEON_CN78XX))
+		return 64;
+	if (OCTEON_IS_MODEL(OCTEON_CNF75XX) || OCTEON_IS_MODEL(OCTEON_CN73XX))
+		return 32;
+	return 0;
+}
+
+static int get_num_auras(void)
+{
+	if (OCTEON_IS_MODEL(OCTEON_CN78XX))
+		return 1024;
+	if (OCTEON_IS_MODEL(OCTEON_CNF75XX) || OCTEON_IS_MODEL(OCTEON_CN73XX))
+		return 512;
+	return 0;
+}
+
+/**
+ * octeon_fpa3_init() - Initialize the fpa to default values.
+ * @node: Node of fpa to initialize.
+ *
+ * Return: 0 if successful.
+ *         < 0 for error codes.
+ */
+int octeon_fpa3_init(int node)
+{
+	static bool init_done[2];
+	int aura_cnt;
+	u64 data;
+	int i;
+
+	mutex_lock(&octeon_fpa3_lock);
+
+	if (init_done[node])
+		goto done;
+
+	aura_cnt = get_num_auras();
+	for (i = 0; i < aura_cnt; i++) {
+		oct_csr_write(0x100000000ull, FPA_AURAX_CNT(node, i));
+		oct_csr_write(0xfffffffffull, FPA_AURAX_CNT_LIMIT(node, i));
+		oct_csr_write(0xffffffffeull, FPA_AURAX_CNT_THRESHOLD(node, i));
+	}
+
+	data = oct_csr_read(FPA_GEN_CFG(node));
+	data &= ~GENMASK_ULL(9, 4);
+	data |= 3 << 4;
+	oct_csr_write(data, FPA_GEN_CFG(node));
+
+	init_done[node] = 1;
+ done:
+	mutex_unlock(&octeon_fpa3_lock);
+	return 0;
+}
+EXPORT_SYMBOL(octeon_fpa3_init);
+
+/**
+ * octeon_fpa3_pool_init() - Initialize a pool.
+ * @node: Node to initialize pool on.
+ * @pool_num: Requested pool number (-1 for don't care).
+ * @pool: Updated with the initialized pool number.
+ * @pool_stack: Updated with the base of the memory allocated for the pool
+ *		stack.
+ * @num_ptrs: Number of pointers to allocated on the stack.
+ *
+ * Return: 0 if successful.
+ *         < 0 for error codes.
+ */
+int octeon_fpa3_pool_init(int node, int pool_num, int *pool,
+			  void **pool_stack, int num_ptrs)
+{
+	struct global_resource_tag tag;
+	u64 pool_stack_start;
+	u64 pool_stack_end;
+	int stack_size;
+	char buf[16];
+	int rc = 0;
+	u64 data;
+
+	mutex_lock(&octeon_fpa3_lock);
+
+	strncpy((char *)&tag.lo, "cvm_pool", 8);
+	snprintf(buf, 16, "_%d......", node);
+	memcpy(&tag.hi, buf, 8);
+
+	res_mgr_create_resource(tag, get_num_pools());
+	*pool = res_mgr_alloc(tag, pool_num, true);
+	if (*pool < 0) {
+		rc = -ENODEV;
+		goto error;
+	}
+
+	oct_csr_write(0, FPA_POOLX_CFG(node, *pool));
+	oct_csr_write(128, FPA_POOLX_START_ADDR(node, *pool));
+	oct_csr_write(GENMASK_ULL(41, 7), FPA_POOLX_END_ADDR(node, *pool));
+
+	stack_size = (DIV_ROUND_UP(num_ptrs, 29) + 1) * 128;
+	*pool_stack = kmalloc_node(stack_size, GFP_KERNEL, node);
+	if (!*pool_stack) {
+		pr_err("Failed to allocate pool stack memory pool=%d\n",
+		       pool_num);
+		rc = -ENOMEM;
+		goto error_stack;
+	}
+
+	pool_stack_start = virt_to_phys(*pool_stack);
+	pool_stack_end = round_down(pool_stack_start + stack_size, 128);
+	pool_stack_start = round_up(pool_stack_start, 128);
+	oct_csr_write(pool_stack_start, FPA_POOLX_STACK_BASE(node, *pool));
+	oct_csr_write(pool_stack_start, FPA_POOLX_STACK_ADDR(node, *pool));
+	oct_csr_write(pool_stack_end, FPA_POOLX_STACK_END(node, *pool));
+
+	data = (2 << 3) | BIT(0);
+	oct_csr_write(data, FPA_POOLX_CFG(node, *pool));
+
+	mutex_unlock(&octeon_fpa3_lock);
+	return 0;
+
+error_stack:
+	res_mgr_free(tag, *pool);
+error:
+	mutex_unlock(&octeon_fpa3_lock);
+	return rc;
+}
+EXPORT_SYMBOL(octeon_fpa3_pool_init);
+
+/**
+ * octeon_fpa3_release_pool() - Release a pool.
+ * @node: Node pool is on.
+ * @pool: Pool to release.
+ */
+void octeon_fpa3_release_pool(int node, int pool)
+{
+	struct global_resource_tag tag;
+	char buf[16];
+
+	mutex_lock(&octeon_fpa3_lock);
+
+	strncpy((char *)&tag.lo, "cvm_pool", 8);
+	snprintf(buf, 16, "_%d......", node);
+	memcpy(&tag.hi, buf, 8);
+
+	res_mgr_free(tag, pool);
+
+	mutex_unlock(&octeon_fpa3_lock);
+}
+EXPORT_SYMBOL(octeon_fpa3_release_pool);
+
+/**
+ * octeon_fpa3_aura_init() - Initialize an aura.
+ * @node: Node to initialize aura on.
+ * @pool: Pool the aura belongs to.
+ * @aura_num: Requested aura number (-1 for don't care).
+ * @aura: Updated with the initialized aura number.
+ * @num_bufs: Number of buffers in the aura.
+ * @limit: Limit for the aura.
+ *
+ * Return: 0 if successful.
+ *         < 0 for error codes.
+ */
+int octeon_fpa3_aura_init(int node, int pool, int aura_num,
+			  int *aura, int num_bufs, unsigned int limit)
+{
+	struct global_resource_tag tag;
+	unsigned int drop;
+	unsigned int pass;
+	char buf[16];
+	int rc = 0;
+	u64 shift;
+	u64 data;
+
+	mutex_lock(&octeon_fpa3_lock);
+
+	strncpy((char *)&tag.lo, "cvm_aura", 8);
+	snprintf(buf, 16, "_%d......", node);
+	memcpy(&tag.hi, buf, 8);
+
+	res_mgr_create_resource(tag, get_num_auras());
+	*aura = res_mgr_alloc(tag, aura_num, true);
+	if (*aura < 0) {
+		rc = -ENODEV;
+		goto error;
+	}
+
+	oct_csr_write(0, FPA_AURAX_CFG(node, *aura));
+
+	/* Allow twice the limit before saturation at zero */
+	limit *= 2;
+	data = limit;
+	oct_csr_write(data, FPA_AURAX_CNT_LIMIT(node, *aura));
+	oct_csr_write(data, FPA_AURAX_CNT(node, *aura));
+
+	oct_csr_write(pool, FPA_AURAX_POOL(node, *aura));
+
+	/* No per-pool RED/Drop */
+	oct_csr_write(0, FPA_AURAX_POOL_LEVELS(node, *aura));
+
+	shift = 0;
+	while ((limit >> shift) > 255)
+		shift++;
+
+	drop = (limit - num_bufs / 20) >> shift;	/* 95% */
+	pass = (limit - (num_bufs * 3) / 20) >> shift;	/* 85% */
+
+	/* Enable per aura RED/drop */
+	data = BIT(38) | (shift << 32) | (drop << 16) | (pass << 8);
+	oct_csr_write(data, FPA_AURAX_CNT_LEVELS(node, *aura));
+
+error:
+	mutex_unlock(&octeon_fpa3_lock);
+	return rc;
+}
+EXPORT_SYMBOL(octeon_fpa3_aura_init);
+
+/**
+ * octeon_fpa3_release_aura() - Release an aura.
+ * @node: Node to aura is on.
+ * @aura: Aura to release.
+ */
+void octeon_fpa3_release_aura(int node, int aura)
+{
+	struct global_resource_tag tag;
+	char buf[16];
+
+	mutex_lock(&octeon_fpa3_lock);
+
+	strncpy((char *)&tag.lo, "cvm_aura", 8);
+	snprintf(buf, 16, "_%d......", node);
+	memcpy(&tag.hi, buf, 8);
+
+	res_mgr_free(tag, aura);
+
+	mutex_unlock(&octeon_fpa3_lock);
+}
+EXPORT_SYMBOL(octeon_fpa3_release_aura);
+
+/**
+ * octeon_fpa3_alloc() - Get a buffer from a aura's pool.
+ * @node: Node to free memory to.
+ * @aura: Aura to free memory to.
+ *
+ * Return: Allocated buffer pointer if successful, NULL on error.
+ */
+void *octeon_fpa3_alloc(int node, int aura)
+{
+	void *buf = NULL;
+	u64 buf_phys;
+	u64 addr;
+
+	/* Buffer pointers are obtained using load operations */
+	addr = BIT(63) | BIT(48) | (0x29ull << 40) | ((u64)node << 36) |
+		(aura << 16);
+	buf_phys = READ_ONCE(*(u64 *)addr);
+
+	if (buf_phys)
+		buf = phys_to_virt(buf_phys);
+
+	return buf;
+}
+EXPORT_SYMBOL(octeon_fpa3_alloc);
+
+/**
+ * octeon_fpa3_free() - Add a buffer back to the aura's pool.
+ * @node: Node to free memory to.
+ * @aura: Aura to free memory to.
+ * @buf: Address of buffer to free to the aura's pool.
+ */
+void octeon_fpa3_free(int node, int aura, const void *buf)
+{
+	u64 buf_phys;
+	u64 addr;
+
+	buf_phys = virt_to_phys(buf);
+
+	/* Make sure that any previous writes to memory go out before we free
+	 * this buffer. This also serves as a barrier to prevent GCC from
+	 * reordering operations to after the free.
+	 */
+	wmb();
+
+	/* Buffers are added to fpa pools using store operations */
+	addr = BIT(63) | BIT(48) | (0x29ull << 40) | ((u64)node << 36);
+	addr |= (aura << 16);
+	WRITE_ONCE(*(u64 *)addr, buf_phys);
+}
+EXPORT_SYMBOL(octeon_fpa3_free);
+
+/**
+ * octeon_fpa3_mem_fill() - Add buffers to an aura.
+ * @node: Node to get memory from.
+ * @cache: Memory cache to allocate from.
+ * @aura: Aura to add buffers to.
+ * @num_bufs: Number of buffers to add to the aura.
+ *
+ * Return: 0 if successful.
+ *         < 0 for error codes.
+ */
+int octeon_fpa3_mem_fill(int node, struct kmem_cache *cache,
+			 int aura, int num_bufs)
+{
+	void *mem;
+	int rc = 0;
+	int i;
+
+	mutex_lock(&octeon_fpa3_lock);
+
+	for (i = 0; i < num_bufs; i++) {
+		mem = kmem_cache_alloc_node(cache, GFP_KERNEL, node);
+		if (!mem) {
+			pr_err("Failed to allocate memory for aura=%d\n", aura);
+			rc = -ENOMEM;
+			break;
+		}
+		octeon_fpa3_free(node, aura, mem);
+	}
+
+	mutex_unlock(&octeon_fpa3_lock);
+	return rc;
+}
+EXPORT_SYMBOL(octeon_fpa3_mem_fill);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Cavium, Inc. Octeon III FPA manager.");
diff --git a/drivers/net/ethernet/cavium/octeon/octeon3-pki.c b/drivers/net/ethernet/cavium/octeon/octeon3-pki.c
new file mode 100644
index 0000000..f2fa085
--- /dev/null
+++ b/drivers/net/ethernet/cavium/octeon/octeon3-pki.c
@@ -0,0 +1,823 @@
+// SPDX-License-Identifier: GPL-2.0+
+/* Copyright (c) 2017 Cavium, Inc.
+ *
+ * Configuration and management of the PacKet Input (PKI) block.
+ */
+#include <linux/firmware.h>
+#include <linux/module.h>
+
+#include <asm/octeon/octeon.h>
+
+#include "octeon3.h"
+
+#define PKI_CLUSTER_FIRMWARE		"cavium/pki-cluster.bin"
+#define VERSION_LEN			8
+
+#define MAX_CLUSTERS			4
+#define MAX_BANKS			2
+#define MAX_BANK_ENTRIES		192
+#define PKI_NUM_QPG_ENTRY		2048
+#define PKI_NUM_STYLE			256
+#define PKI_NUM_FINAL_STYLE		64
+#define MAX_PKNDS			64
+
+/* Registers are accessed via xkphys */
+#define PKI_BASE			0x1180044000000ull
+#define PKI_ADDR(node)			(SET_XKPHYS + NODE_OFFSET(node) +      \
+					 PKI_BASE)
+
+#define PKI_SFT_RST(n)			(PKI_ADDR(n)		     + 0x000010)
+#define PKI_BUF_CTL(n)			(PKI_ADDR(n)		     + 0x000100)
+#define PKI_STAT_CTL(n)			(PKI_ADDR(n)		     + 0x000110)
+#define PKI_ICG_CFG(n)			(PKI_ADDR(n)		     + 0x00a000)
+
+#define CLUSTER_OFFSET(c)		((c) << 16)
+#define CL_ADDR(n, c)			(PKI_ADDR(n) + CLUSTER_OFFSET(c))
+#define PKI_CL_ECC_CTL(n, c)		(CL_ADDR(n, c)		     + 0x00c020)
+
+#define PKI_STYLE_BUF(n, s)		(PKI_ADDR(n) + ((s) << 3)    + 0x024000)
+
+#define PKI_LTYPE_MAP(n, l)		(PKI_ADDR(n) + ((l) << 3)    + 0x005000)
+#define PKI_IMEM(n, i)			(PKI_ADDR(n) + ((i) << 3)    + 0x100000)
+
+#define PKI_CL_PKIND_CFG(n, c, p)	(CL_ADDR(n, c) + ((p) << 8)  + 0x300040)
+#define PKI_CL_PKIND_STYLE(n, c, p)	(CL_ADDR(n, c) + ((p) << 8)  + 0x300048)
+#define PKI_CL_PKIND_SKIP(n, c, p)	(CL_ADDR(n, c) + ((p) << 8)  + 0x300050)
+#define PKI_CL_PKIND_L2_CUSTOM(n, c, p)	(CL_ADDR(n, c) + ((p) << 8)  + 0x300058)
+#define PKI_CL_PKIND_LG_CUSTOM(n, c, p)	(CL_ADDR(n, c) + ((p) << 8)  + 0x300060)
+
+#define STYLE_OFFSET(s)			((s) << 3)
+#define STYLE_ADDR(n, c, s)		(PKI_ADDR(n) + CLUSTER_OFFSET(c) +     \
+					 STYLE_OFFSET(s))
+#define PKI_CL_STYLE_CFG(n, c, s)	(STYLE_ADDR(n, c, s)	     + 0x500000)
+#define PKI_CL_STYLE_CFG2(n, c, s)	(STYLE_ADDR(n, c, s)	     + 0x500800)
+#define PKI_CLX_STYLEX_ALG(n, c, s)	(STYLE_ADDR(n, c, s)	     + 0x501000)
+
+#define PCAM_OFFSET(bank)		((bank) << 12)
+#define PCAM_ENTRY_OFFSET(entry)	((entry) << 3)
+#define PCAM_ADDR(n, c, b, e)		(PKI_ADDR(n) + CLUSTER_OFFSET(c) +     \
+					 PCAM_OFFSET(b) + PCAM_ENTRY_OFFSET(e))
+#define PKI_CL_PCAM_TERM(n, c, b, e)	(PCAM_ADDR(n, c, b, e)	     + 0x700000)
+#define PKI_CL_PCAM_MATCH(n, c, b, e)	(PCAM_ADDR(n, c, b, e)	     + 0x704000)
+#define PKI_CL_PCAM_ACTION(n, c, b, e)	(PCAM_ADDR(n, c, b, e)	     + 0x708000)
+
+#define PKI_QPG_TBLX(n, i)		(PKI_ADDR(n) + ((i) << 3)    + 0x800000)
+#define PKI_AURAX_CFG(n, a)		(PKI_ADDR(n) + ((a) << 3)    + 0x900000)
+#define PKI_STATX_STAT0(n, p)		(PKI_ADDR(n) + ((p) << 8)    + 0xe00038)
+#define PKI_STATX_STAT1(n, p)		(PKI_ADDR(n) + ((p) << 8)    + 0xe00040)
+#define PKI_STATX_STAT3(n, p)		(PKI_ADDR(n) + ((p) << 8)    + 0xe00050)
+
+enum pcam_term {
+	NONE		= 0x0,
+	L2_CUSTOM	= 0x2,
+	HIGIGD		= 0x4,
+	HIGIG		= 0x5,
+	SMACH		= 0x8,
+	SMACL		= 0x9,
+	DMACH		= 0xa,
+	DMACL		= 0xb,
+	GLORT		= 0x12,
+	DSA		= 0x13,
+	ETHTYPE0	= 0x18,
+	ETHTYPE1	= 0x19,
+	ETHTYPE2	= 0x1a,
+	ETHTYPE3	= 0x1b,
+	MPLS0		= 0x1e,
+	L3_SIPHH	= 0x1f,
+	L3_SIPMH	= 0x20,
+	L3_SIPML	= 0x21,
+	L3_SIPLL	= 0x22,
+	L3_FLAGS	= 0x23,
+	L3_DIPHH	= 0x24,
+	L3_DIPMH	= 0x25,
+	L3_DIPML	= 0x26,
+	L3_DIPLL	= 0x27,
+	LD_VNI		= 0x28,
+	IL3_FLAGS	= 0x2b,
+	LF_SPI		= 0x2e,
+	L4_SPORT	= 0x2f,
+	L4_PORT		= 0x30,
+	LG_CUSTOM	= 0x39
+};
+
+enum pki_ltype {
+	LTYPE_NONE		= 0x00,
+	LTYPE_ENET		= 0x01,
+	LTYPE_VLAN		= 0x02,
+	LTYPE_SNAP_PAYLD	= 0x05,
+	LTYPE_ARP		= 0x06,
+	LTYPE_RARP		= 0x07,
+	LTYPE_IP4		= 0x08,
+	LTYPE_IP4_OPT		= 0x09,
+	LTYPE_IP6		= 0x0a,
+	LTYPE_IP6_OPT		= 0x0b,
+	LTYPE_IPSEC_ESP		= 0x0c,
+	LTYPE_IPFRAG		= 0x0d,
+	LTYPE_IPCOMP		= 0x0e,
+	LTYPE_TCP		= 0x10,
+	LTYPE_UDP		= 0x11,
+	LTYPE_SCTP		= 0x12,
+	LTYPE_UDP_VXLAN		= 0x13,
+	LTYPE_GRE		= 0x14,
+	LTYPE_NVGRE		= 0x15,
+	LTYPE_GTP		= 0x16,
+	LTYPE_UDP_GENEVE	= 0x17,
+	LTYPE_SW28		= 0x1c,
+	LTYPE_SW29		= 0x1d,
+	LTYPE_SW30		= 0x1e,
+	LTYPE_SW31		= 0x1f
+};
+
+enum pki_beltype {
+	BELTYPE_NONE	= 0x00,
+	BELTYPE_MISC	= 0x01,
+	BELTYPE_IP4	= 0x02,
+	BELTYPE_IP6	= 0x03,
+	BELTYPE_TCP	= 0x04,
+	BELTYPE_UDP	= 0x05,
+	BELTYPE_SCTP	= 0x06,
+	BELTYPE_SNAP	= 0x07
+};
+
+struct ltype_beltype {
+	enum pki_ltype		ltype;
+	enum pki_beltype	beltype;
+};
+
+/**
+ * struct pcam_term_info - Describes a term to configure in the pcam.
+ * @term:	Identifies the term to configure.
+ * @term_mask:	Specifies don't cares in the term.
+ * @style:	Style to compare.
+ * @style_mask:	Specifies don't cares in the style.
+ * @data:	Data to compare.
+ * @data_mask:	Specifies don't cares in the data.
+ */
+struct pcam_term_info {
+	u8	term;
+	u8	term_mask;
+	u8	style;
+	u8	style_mask;
+	u32	data;
+	u32	data_mask;
+};
+
+/**
+ * struct fw_hdr - Describes the firmware.
+ * @version:	Firmware version.
+ * @size:	Size of the data in bytes.
+ * @data:	Actual firmware data.
+ */
+struct fw_hdr {
+	char	version[VERSION_LEN];
+	u64	size;
+	u64	data[];
+};
+
+static struct ltype_beltype dflt_ltype_config[] = {
+	{ LTYPE_NONE,		BELTYPE_NONE },
+	{ LTYPE_ENET,		BELTYPE_MISC },
+	{ LTYPE_VLAN,		BELTYPE_MISC },
+	{ LTYPE_SNAP_PAYLD,	BELTYPE_MISC },
+	{ LTYPE_ARP,		BELTYPE_MISC },
+	{ LTYPE_RARP,		BELTYPE_MISC },
+	{ LTYPE_IP4,		BELTYPE_IP4  },
+	{ LTYPE_IP4_OPT,	BELTYPE_IP4  },
+	{ LTYPE_IP6,		BELTYPE_IP6  },
+	{ LTYPE_IP6_OPT,	BELTYPE_IP6  },
+	{ LTYPE_IPSEC_ESP,	BELTYPE_MISC },
+	{ LTYPE_IPFRAG,		BELTYPE_MISC },
+	{ LTYPE_IPCOMP,		BELTYPE_MISC },
+	{ LTYPE_TCP,		BELTYPE_TCP  },
+	{ LTYPE_UDP,		BELTYPE_UDP  },
+	{ LTYPE_SCTP,		BELTYPE_SCTP },
+	{ LTYPE_UDP_VXLAN,	BELTYPE_UDP  },
+	{ LTYPE_GRE,		BELTYPE_MISC },
+	{ LTYPE_NVGRE,		BELTYPE_MISC },
+	{ LTYPE_GTP,		BELTYPE_MISC },
+	{ LTYPE_UDP_GENEVE,	BELTYPE_UDP  },
+	{ LTYPE_SW28,		BELTYPE_MISC },
+	{ LTYPE_SW29,		BELTYPE_MISC },
+	{ LTYPE_SW30,		BELTYPE_MISC },
+	{ LTYPE_SW31,		BELTYPE_MISC }
+};
+
+static int get_num_clusters(void)
+{
+	if (OCTEON_IS_MODEL(OCTEON_CN73XX) || OCTEON_IS_MODEL(OCTEON_CNF75XX))
+		return 2;
+	return 4;
+}
+
+static int octeon3_pki_pcam_alloc_entry(int node, int entry, int bank)
+{
+	struct global_resource_tag tag;
+	char buf[16];
+	int num_clusters;
+	int rc;
+	int i;
+
+	/* Allocate a pcam entry for cluster0*/
+	strncpy((char *)&tag.lo, "cvm_pcam", 8);
+	snprintf(buf, 16, "_%d%d%d....", node, 0, bank);
+	memcpy(&tag.hi, buf, 8);
+
+	res_mgr_create_resource(tag, MAX_BANK_ENTRIES);
+	rc = res_mgr_alloc(tag, entry, false);
+	if (rc < 0)
+		return rc;
+
+	entry = rc;
+
+	/* Need to allocate entries for all clusters as se code needs it */
+	num_clusters = get_num_clusters();
+	for (i = 1; i < num_clusters; i++) {
+		strncpy((char *)&tag.lo, "cvm_pcam", 8);
+		snprintf(buf, 16, "_%d%d%d....", node, i, bank);
+		memcpy(&tag.hi, buf, 8);
+
+		res_mgr_create_resource(tag, MAX_BANK_ENTRIES);
+		rc = res_mgr_alloc(tag, entry, false);
+		if (rc < 0) {
+			int	j;
+
+			pr_err("octeon3-pki: Failed to allocate pcam entry\n");
+			/* Undo whatever we've did */
+			for (j = 0; i < i; j++) {
+				strncpy((char *)&tag.lo, "cvm_pcam", 8);
+				snprintf(buf, 16, "_%d%d%d....", node, j, bank);
+				memcpy(&tag.hi, buf, 8);
+				res_mgr_free(tag, entry);
+			}
+
+			return -1;
+		}
+	}
+
+	return entry;
+}
+
+static int octeon3_pki_pcam_write_entry(int node,
+					struct pcam_term_info *term_info)
+{
+	int num_clusters;
+	u64 action;
+	int entry;
+	u64 match;
+	int bank;
+	u64 term;
+	int i;
+
+	/* Bit 0 of the pcam term determines the bank to use */
+	bank = term_info->term & 1;
+
+	/* Allocate a pcam entry */
+	entry = octeon3_pki_pcam_alloc_entry(node, -1, bank);
+	if (entry < 0)
+		return entry;
+
+	term = 1ull << 63;
+	term |= (u64)(term_info->term & term_info->term_mask) << 40;
+	term |= (~term_info->term & term_info->term_mask) << 8;
+	term |= (u64)(term_info->style & term_info->style_mask) << 32;
+	term |= ~term_info->style & term_info->style_mask;
+
+	match = (u64)(term_info->data & term_info->data_mask) << 32;
+	match |= ~term_info->data & term_info->data_mask;
+
+	action = 0;
+	if (term_info->term >= ETHTYPE0 && term_info->term <= ETHTYPE3) {
+		action |= 2 << 8;
+		action |= 4;
+	}
+
+	/* Must write the term to all clusters */
+	num_clusters = get_num_clusters();
+	for (i = 0; i < num_clusters; i++) {
+		oct_csr_write(0, PKI_CL_PCAM_TERM(node, i, bank, entry));
+		oct_csr_write(match, PKI_CL_PCAM_MATCH(node, i, bank, entry));
+		oct_csr_write(action, PKI_CL_PCAM_ACTION(node, i, bank, entry));
+		oct_csr_write(term, PKI_CL_PCAM_TERM(node, i, bank, entry));
+	}
+
+	return 0;
+}
+
+static int octeon3_pki_alloc_qpg_entry(int node)
+{
+	struct global_resource_tag tag;
+	char buf[16];
+	int entry;
+
+	/* Allocate a qpg entry */
+	strncpy((char *)&tag.lo, "cvm_qpge", 8);
+	snprintf(buf, 16, "t_%d.....", node);
+	memcpy(&tag.hi, buf, 8);
+
+	res_mgr_create_resource(tag, PKI_NUM_QPG_ENTRY);
+	entry = res_mgr_alloc(tag, -1, false);
+	if (entry < 0)
+		pr_err("octeon3-pki: Failed to allocate qpg entry");
+
+	return entry;
+}
+
+static int octeon3_pki_alloc_style(int node)
+{
+	struct global_resource_tag tag;
+	char buf[16];
+	int entry;
+
+	/* Allocate a style entry */
+	strncpy((char *)&tag.lo, "cvm_styl", 8);
+	snprintf(buf, 16, "e_%d.....", node);
+	memcpy(&tag.hi, buf, 8);
+
+	res_mgr_create_resource(tag, PKI_NUM_STYLE);
+	entry = res_mgr_alloc(tag, -1, false);
+	if (entry < 0)
+		pr_err("octeon3-pki: Failed to allocate style");
+
+	return entry;
+}
+
+int octeon3_pki_set_ptp_skip(int node, int pknd, int skip)
+{
+	int num_clusters;
+	u64 data;
+	u64 i;
+
+	num_clusters = get_num_clusters();
+	for (i = 0; i < num_clusters; i++) {
+		data = oct_csr_read(PKI_CL_PKIND_SKIP(node, i, pknd));
+		data &= ~(GENMASK_ULL(15, 8) | GENMASK_ULL(7, 0));
+		data |= (skip << 8) | skip;
+		oct_csr_write(data, PKI_CL_PKIND_SKIP(node, i, pknd));
+
+		data = oct_csr_read(PKI_CL_PKIND_L2_CUSTOM(node, i, pknd));
+		data &= ~GENMASK_ULL(7, 0);
+		data |= skip;
+		oct_csr_write(data, PKI_CL_PKIND_L2_CUSTOM(node, i, pknd));
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(octeon3_pki_set_ptp_skip);
+
+/**
+ * octeon3_pki_get_stats() - Get the statistics for a given pknd (port).
+ * @node:	Node to get statistics for.
+ * @pknd:	Pknd to get statistis for.
+ * @packets:	Updated with the number of packets received.
+ * @octets:	Updated with the number of octets received.
+ * @dropped:	Updated with the number of dropped packets.
+ *
+ * Return: 0 if successful.
+ *	   < 0 for error codes.
+ */
+int octeon3_pki_get_stats(int node, int pknd,
+			  u64 *packets, u64 *octets, u64 *dropped)
+{
+	/* PKI-20775, must read until not all ones. */
+	do {
+		*packets = oct_csr_read(PKI_STATX_STAT0(node, pknd));
+	} while (*packets == 0xffffffffffffffffull);
+
+	do {
+		*octets = oct_csr_read(PKI_STATX_STAT1(node, pknd));
+	} while (*octets == 0xffffffffffffffffull);
+
+	do {
+		*dropped = oct_csr_read(PKI_STATX_STAT3(node, pknd));
+	} while (*dropped == 0xffffffffffffffffull);
+
+	return 0;
+}
+EXPORT_SYMBOL(octeon3_pki_get_stats);
+
+/**
+ * octeon3_pki_port_init() - Initialize a port.
+ * @node:	Node port is using.
+ * @aura:	Aura to use for packet buffers.
+ * @grp:	SSO group packets will be queued up for.
+ * @skip:	Extra bytes to skip before packet data.
+ * @mb_size:	Size of packet buffers.
+ * @pknd:	Port kind assigned to the port.
+ * @num_rx_cxt:	Number of sso groups used by the port.
+ *
+ * Return: 0 if successful.
+ *	   < 0 for error codes.
+ */
+int octeon3_pki_port_init(int node, int aura, int grp, int skip,
+			  int mb_size, int pknd, int num_rx_cxt)
+{
+	int num_clusters;
+	int qpg_entry;
+	int style;
+	u64 data;
+	u64 i;
+
+	/* Allocate and configure a qpg table entry for the port's group */
+	i = 0;
+	while ((num_rx_cxt & (1 << i)) == 0)
+		i++;
+	qpg_entry = octeon3_pki_alloc_qpg_entry(node);
+	data = i << 45;				/* GRPTAG_OK */
+	data |= ((u64)((node << 8) | grp) << 32); /* GRP_OK */
+	data |= i << 29;			/* GRPTAG_BAD*/
+	data |= ((u64)((node << 8) | grp) << 16); /* GRP_BAD */
+	data |= aura;				/* LAURA */
+	oct_csr_write(data, PKI_QPG_TBLX(node, qpg_entry));
+
+	/* Allocate a style for the port */
+	style = octeon3_pki_alloc_style(node);
+
+	/* Map the qpg table entry to the style */
+	num_clusters = get_num_clusters();
+	for (i = 0; i < num_clusters; i++) {
+		data = BIT(29) | BIT(22) | qpg_entry;
+		oct_csr_write(data, PKI_CL_STYLE_CFG(node, i, style));
+
+		/* Specify the tag generation rules and checksum to use */
+		oct_csr_write(0xfff49f, PKI_CL_STYLE_CFG2(node, i, style));
+
+		data = BIT(31);
+		oct_csr_write(data, PKI_CLX_STYLEX_ALG(node, i, style));
+	}
+
+	/* Set the style's buffer size and skips:
+	 *	Every buffer has 128 bytes reserved for Linux.
+	 *	The first buffer must also skip the wqe (40 bytes).
+	 *	Srio also requires skipping its header (skip)
+	 */
+	data = 1ull << 28;			/* WQE_SKIP */
+#ifdef __LITTLE_ENDIAN
+	data |= BIT(32);			/* PKT_LEND */
+#endif
+	data |= ((128 + 40 + skip) / 8) << 22;	/* FIRST_SKIP */
+	data |= (128 / 8) << 16;		/* LATER_SKIP */
+	data |= (mb_size & ~0xf) / 8;		/* MB_SIZE */
+	oct_csr_write(data, PKI_STYLE_BUF(node, style));
+
+	/* Assign the initial style to the port via the pknd */
+	for (i = 0; i < num_clusters; i++) {
+		data = oct_csr_read(PKI_CL_PKIND_STYLE(node, i, pknd));
+		data &= ~GENMASK_ULL(7, 0);
+		data |= style;
+		oct_csr_write(data, PKI_CL_PKIND_STYLE(node, i, pknd));
+	}
+
+	/* Enable red */
+	data = BIT(18);
+	oct_csr_write(data, PKI_AURAX_CFG(node, aura));
+
+	/* Clear statistic counters */
+	oct_csr_write(0, PKI_STATX_STAT0(node, pknd));
+	oct_csr_write(0, PKI_STATX_STAT1(node, pknd));
+	oct_csr_write(0, PKI_STATX_STAT3(node, pknd));
+
+	return 0;
+}
+EXPORT_SYMBOL(octeon3_pki_port_init);
+
+/**
+ * octeon3_pki_port_shutdown() - Release all the resources used by a port.
+ * @node: Node port is on.
+ * @pknd: Pknd assigned to the port.
+ *
+ * Return: 0 if successful.
+ *	   < 0 for error codes.
+ */
+int octeon3_pki_port_shutdown(int node, int pknd)
+{
+	/* Nothing at the moment */
+	return 0;
+}
+EXPORT_SYMBOL(octeon3_pki_port_shutdown);
+
+/**
+ * octeon3_pki_cluster_init() - Loads the cluster firmware into the
+ *				pki clusters.
+ * @node: Node to configure.
+ * @pdev: Device requesting the firmware.
+ *
+ * Return: 0 if successful.
+ *	   < 0 for error codes.
+ */
+int octeon3_pki_cluster_init(int node, struct platform_device *pdev)
+{
+	const struct firmware *pki_fw;
+	const struct fw_hdr *hdr;
+	const u64 *data;
+	int rc;
+	int i;
+
+	rc = request_firmware(&pki_fw, PKI_CLUSTER_FIRMWARE, &pdev->dev);
+	if (rc) {
+		dev_err(&pdev->dev, "octeon3-pki: Failed to load %s error=%d\n",
+			PKI_CLUSTER_FIRMWARE, rc);
+		return rc;
+	}
+
+	/* Verify the firmware is valid */
+	hdr = (const struct fw_hdr *)pki_fw->data;
+	if ((pki_fw->size - sizeof(const struct fw_hdr) != hdr->size) ||
+	    hdr->size % 8) {
+		dev_err(&pdev->dev, ("octeon3-pki: Corrupted PKI firmware\n"));
+		goto err;
+	}
+
+	dev_info(&pdev->dev, "octeon3-pki: Loading PKI firmware %s\n",
+		 hdr->version);
+	data = hdr->data;
+	for (i = 0; i < hdr->size / 8; i++) {
+		oct_csr_write(cpu_to_be64(*data), PKI_IMEM(node, i));
+		data++;
+	}
+err:
+	release_firmware(pki_fw);
+
+	return 0;
+}
+EXPORT_SYMBOL(octeon3_pki_cluster_init);
+
+/**
+ * octeon3_pki_vlan_init() - Configures the pcam to recognize the vlan ethtypes.
+ *
+ * @node: Node to configure.
+ *
+ * Return: 0 if successful.
+ *	   < 0 for error codes.
+ */
+int octeon3_pki_vlan_init(int node)
+{
+	u64 data;
+	int rc;
+	int i;
+
+	/* PKI-20858 */
+	if (OCTEON_IS_MODEL(OCTEON_CN78XX_PASS1_X)) {
+		for (i = 0; i < 4; i++) {
+			data = oct_csr_read(PKI_CL_ECC_CTL(node, i));
+			data &= ~BIT(63);
+			data |= BIT(4) | BIT(3);
+			oct_csr_write(data, PKI_CL_ECC_CTL(node, i));
+		}
+	}
+
+	/* Configure the pcam ethtype0 and ethtype1 terms */
+	for (i = ETHTYPE0; i <= ETHTYPE1; i++) {
+		struct pcam_term_info term_info;
+
+		/* Term for 0x8100 ethtype */
+		term_info.term = i;
+		term_info.term_mask = 0xfd;
+		term_info.style = 0;
+		term_info.style_mask = 0;
+		term_info.data = 0x81000000;
+		term_info.data_mask = 0xffff0000;
+		rc = octeon3_pki_pcam_write_entry(node, &term_info);
+		if (rc)
+			return rc;
+
+		/* Term for 0x88a8 ethtype */
+		term_info.data = 0x88a80000;
+		rc = octeon3_pki_pcam_write_entry(node, &term_info);
+		if (rc)
+			return rc;
+
+		/* Term for 0x9200 ethtype */
+		term_info.data = 0x92000000;
+		rc = octeon3_pki_pcam_write_entry(node, &term_info);
+		if (rc)
+			return rc;
+
+		/* Term for 0x9100 ethtype */
+		term_info.data = 0x91000000;
+		rc = octeon3_pki_pcam_write_entry(node, &term_info);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(octeon3_pki_vlan_init);
+
+/**
+ * octeon3_pki_ltype_init() - Configures the pki layer types.
+ *
+ * @node: Node to configure.
+ *
+ * Return: 0 if successful.
+ *	   < 0 for error codes.
+ */
+int octeon3_pki_ltype_init(int node)
+{
+	enum pki_ltype ltype;
+	u64 data;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(dflt_ltype_config); i++) {
+		ltype = dflt_ltype_config[i].ltype;
+		data = oct_csr_read(PKI_LTYPE_MAP(node, ltype));
+		data &= ~GENMASK_ULL(2, 0);
+		data |= dflt_ltype_config[i].beltype;
+		oct_csr_write(data, PKI_LTYPE_MAP(node, ltype));
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(octeon3_pki_ltype_init);
+
+int octeon3_pki_srio_init(int node, int pknd)
+{
+	int num_clusters;
+	int style;
+	u64 data;
+	int i;
+
+	num_clusters = get_num_clusters();
+	for (i = 0; i < num_clusters; i++) {
+		data = oct_csr_read(PKI_CL_PKIND_STYLE(node, i, pknd));
+		style = data & GENMASK_ULL(7, 0);
+		data &= ~GENMASK_ULL(14, 8);
+		oct_csr_write(data, PKI_CL_PKIND_STYLE(node, i, pknd));
+
+		/* Disable packet length errors and fcs */
+		data = oct_csr_read(PKI_CL_STYLE_CFG(node, i, style));
+		data &= ~(BIT(29) | BIT(26) | BIT(25) | BIT(23) | BIT(22));
+		oct_csr_write(data, PKI_CL_STYLE_CFG(node, i, style));
+
+		/* Packets have no fcs */
+		data = oct_csr_read(PKI_CL_PKIND_CFG(node, i, pknd));
+		data &= ~BIT(7);
+		oct_csr_write(data, PKI_CL_PKIND_CFG(node, i, pknd));
+
+		/* Skip the srio header and the INST_HDR_S data */
+		data = oct_csr_read(PKI_CL_PKIND_SKIP(node, i, pknd));
+		data &= ~(GENMASK_ULL(15, 8) | GENMASK_ULL(7, 0));
+		data |= (16 << 8) | 16;
+		oct_csr_write(data, PKI_CL_PKIND_SKIP(node, i, pknd));
+
+		/* Exclude port number from qpg */
+		data = oct_csr_read(PKI_CLX_STYLEX_ALG(node, i, style));
+		data &= ~GENMASK_ULL(20, 17);
+		oct_csr_write(data, PKI_CLX_STYLEX_ALG(node, i, style));
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(octeon3_pki_srio_init);
+
+/**
+ * octeon3_pki_enable() - Enable the pki.
+ *
+ * @node: Node to configure.
+ *
+ * Return: 0 if successful.
+ *	   < 0 for error codes.
+ */
+int octeon3_pki_enable(int node)
+{
+	int timeout;
+	u64 data;
+
+	/* Enable backpressure */
+	data = oct_csr_read(PKI_BUF_CTL(node));
+	data |= BIT(2);
+	oct_csr_write(data, PKI_BUF_CTL(node));
+
+	/* Enable cluster parsing */
+	data = oct_csr_read(PKI_ICG_CFG(node));
+	data |= BIT(24);
+	oct_csr_write(data, PKI_ICG_CFG(node));
+
+	/* Wait until the pki is out of reset */
+	timeout = 10000;
+	do {
+		data = oct_csr_read(PKI_SFT_RST(node));
+		if (!(data & BIT(63)))
+			break;
+		timeout--;
+		udelay(1);
+	} while (timeout);
+	if (!timeout) {
+		pr_err("octeon3-pki: timeout waiting for reset\n");
+		return -1;
+	}
+
+	/* Enable the pki */
+	data = oct_csr_read(PKI_BUF_CTL(node));
+	data |= BIT(0);
+	oct_csr_write(data, PKI_BUF_CTL(node));
+
+	/* Statistics are kept per pkind */
+	oct_csr_write(0, PKI_STAT_CTL(node));
+
+	return 0;
+}
+EXPORT_SYMBOL(octeon3_pki_enable);
+
+void octeon3_pki_shutdown(int node)
+{
+	struct global_resource_tag tag;
+	char buf[16];
+	int timeout;
+	u64 data;
+	int i;
+	int j;
+	int k;
+
+	/* Disable the pki */
+	data = oct_csr_read(PKI_BUF_CTL(node));
+	if (data & BIT(0)) {
+		data &= ~BIT(0);
+		oct_csr_write(data, PKI_BUF_CTL(node));
+
+		/* Wait until the pki has finished processing packets */
+		timeout = 10000;
+		do {
+			data = oct_csr_read(PKI_SFT_RST(node));
+			if (data & BIT(32))
+				break;
+			timeout--;
+			udelay(1);
+		} while (timeout);
+		if (!timeout)
+			pr_warn("octeon3_pki: disable timeout\n");
+	}
+
+	/* Free all prefetched fpa buffers back to the fpa */
+	data = oct_csr_read(PKI_BUF_CTL(node));
+	data |= BIT(5) | BIT(9);
+	oct_csr_write(data, PKI_BUF_CTL(node));
+	/* Dummy read to get the register write to take effect */
+	data = oct_csr_read(PKI_BUF_CTL(node));
+
+	/* Now we can reset the pki */
+	data = oct_csr_read(PKI_SFT_RST(node));
+	data |= BIT(0);
+	oct_csr_write(data, PKI_SFT_RST(node));
+	timeout = 10000;
+	do {
+		data = oct_csr_read(PKI_SFT_RST(node));
+		if ((data & BIT(63)) == 0)
+			break;
+		timeout--;
+		udelay(1);
+	} while (timeout);
+	if (!timeout)
+		pr_warn("octeon3_pki: reset timeout\n");
+
+	/* Free all the allocated resources. We should only free the
+	 * resources allocated by us (TODO).
+	 */
+	for (i = 0; i < PKI_NUM_STYLE; i++) {
+		strncpy((char *)&tag.lo, "cvm_styl", 8);
+		snprintf(buf, 16, "e_%d.....", node);
+		memcpy(&tag.hi, buf, 8);
+		res_mgr_free(tag, i);
+	}
+	for (i = 0; i < PKI_NUM_QPG_ENTRY; i++) {
+		strncpy((char *)&tag.lo, "cvm_qpge", 8);
+		snprintf(buf, 16, "t_%d.....", node);
+		memcpy(&tag.hi, buf, 8);
+		res_mgr_free(tag, i);
+	}
+	for (i = 0; i < get_num_clusters(); i++) {
+		for (j = 0; j < MAX_BANKS; j++) {
+			strncpy((char *)&tag.lo, "cvm_pcam", 8);
+			snprintf(buf, 16, "_%d%d%d....", node, i, j);
+			memcpy(&tag.hi, buf, 8);
+			for (k = 0; k < MAX_BANK_ENTRIES; k++)
+				res_mgr_free(tag, k);
+		}
+	}
+
+	/* Restore the registers back to their reset state. We should
+	 * only reset the registers used by us (TODO).
+	 */
+	for (i = 0; i < get_num_clusters(); i++) {
+		for (j = 0; j < MAX_PKNDS; j++) {
+			oct_csr_write(0, PKI_CL_PKIND_CFG(node, i, j));
+			oct_csr_write(0, PKI_CL_PKIND_STYLE(node, i, j));
+			oct_csr_write(0, PKI_CL_PKIND_SKIP(node, i, j));
+			oct_csr_write(0, PKI_CL_PKIND_L2_CUSTOM(node, i, j));
+			oct_csr_write(0, PKI_CL_PKIND_LG_CUSTOM(node, i, j));
+		}
+
+		for (j = 0; j < PKI_NUM_FINAL_STYLE; j++) {
+			oct_csr_write(0, PKI_CL_STYLE_CFG(node, i, j));
+			oct_csr_write(0, PKI_CL_STYLE_CFG2(node, i, j));
+			oct_csr_write(0, PKI_CLX_STYLEX_ALG(node, i, j));
+		}
+	}
+	for (i = 0; i < PKI_NUM_FINAL_STYLE; i++)
+		oct_csr_write((0x5 << 22) | 0x20, PKI_STYLE_BUF(node, i));
+}
+EXPORT_SYMBOL(octeon3_pki_shutdown);
+
+MODULE_LICENSE("GPL");
+MODULE_FIRMWARE(PKI_CLUSTER_FIRMWARE);
+MODULE_AUTHOR("Carlos Munoz <cmunoz@cavium.com>");
+MODULE_DESCRIPTION("Cavium, Inc. PKI management.");
diff --git a/drivers/net/ethernet/cavium/octeon/octeon3-pko.c b/drivers/net/ethernet/cavium/octeon/octeon3-pko.c
new file mode 100644
index 0000000..9207d0b
--- /dev/null
+++ b/drivers/net/ethernet/cavium/octeon/octeon3-pko.c
@@ -0,0 +1,1688 @@
+// SPDX-License-Identifier: GPL-2.0+
+/* Copyright (c) 2017 Cavium, Inc.
+ *
+ * Configuration and management of the PacKet Output (PKO) block.
+ */
+#include <linux/module.h>
+
+#include <asm/octeon/octeon.h>
+
+#include "octeon3.h"
+
+#define MAX_OUTPUT_MAC			28
+#define MAX_FIFO_GRP			8
+
+#define FIFO_SIZE			2560
+
+/* Registers are accessed via xkphys */
+#define PKO_BASE			0x1540000000000ull
+#define PKO_ADDR(node)			(SET_XKPHYS + NODE_OFFSET(node) +      \
+					 PKO_BASE)
+
+#define PKO_L1_SQ_SHAPE(n, q)		(PKO_ADDR(n) + ((q) << 9)    + 0x000010)
+#define PKO_L1_SQ_LINK(n, q)		(PKO_ADDR(n) + ((q) << 9)    + 0x000038)
+#define PKO_DQ_WM_CTL(n, q)		(PKO_ADDR(n) + ((q) << 9)    + 0x000040)
+#define PKO_L1_SQ_TOPOLOGY(n, q)	(PKO_ADDR(n) + ((q) << 9)    + 0x080000)
+#define PKO_L2_SQ_SCHEDULE(n, q)	(PKO_ADDR(n) + ((q) << 9)    + 0x080008)
+#define PKO_L3_L2_SQ_CHANNEL(n, q)	(PKO_ADDR(n) + ((q) << 9)    + 0x080038)
+#define PKO_CHANNEL_LEVEL(n)		(PKO_ADDR(n)		     + 0x0800f0)
+#define PKO_SHAPER_CFG(n)		(PKO_ADDR(n)		     + 0x0800f8)
+#define PKO_L2_SQ_TOPOLOGY(n, q)	(PKO_ADDR(n) + ((q) << 9)    + 0x100000)
+#define PKO_L3_SQ_SCHEDULE(n, q)	(PKO_ADDR(n) + ((q) << 9)    + 0x100008)
+#define PKO_L3_SQ_TOPOLOGY(n, q)	(PKO_ADDR(n) + ((q) << 9)    + 0x180000)
+#define PKO_L4_SQ_SCHEDULE(n, q)	(PKO_ADDR(n) + ((q) << 9)    + 0x180008)
+#define PKO_L4_SQ_TOPOLOGY(n, q)	(PKO_ADDR(n) + ((q) << 9)    + 0x200000)
+#define PKO_L5_SQ_SCHEDULE(n, q)	(PKO_ADDR(n) + ((q) << 9)    + 0x200008)
+#define PKO_L5_SQ_TOPOLOGY(n, q)	(PKO_ADDR(n) + ((q) << 9)    + 0x280000)
+#define PKO_DQ_SCHEDULE(n, q)		(PKO_ADDR(n) + ((q) << 9)    + 0x280008)
+#define PKO_DQ_SW_XOFF(n, q)		(PKO_ADDR(n) + ((q) << 9)    + 0x2800e0)
+#define PKO_DQ_TOPOLOGY(n, q)		(PKO_ADDR(n) + ((q) << 9)    + 0x300000)
+#define PKO_PDM_CFG(n)			(PKO_ADDR(n)		     + 0x800000)
+#define PKO_PDM_DQ_MINPAD(n, q)		(PKO_ADDR(n) + ((q) << 3)    + 0x8f0000)
+#define PKO_MAC_CFG(n, m)		(PKO_ADDR(n) + ((m) << 3)    + 0x900000)
+#define PKO_PTF_STATUS(n, f)		(PKO_ADDR(n) + ((f) << 3)    + 0x900100)
+#define PKO_PTGF_CFG(n, g)		(PKO_ADDR(n) + ((g) << 3)    + 0x900200)
+#define PKO_PTF_IOBP_CFG(n)		(PKO_ADDR(n)		     + 0x900300)
+#define PKO_MCI0_MAX_CRED(n, m)		(PKO_ADDR(n) + ((m) << 3)    + 0xa00000)
+#define PKO_MCI1_MAX_CRED(n, m)		(PKO_ADDR(n) + ((m) << 3)    + 0xa80000)
+#define PKO_LUT(n, c)			(PKO_ADDR(n) + ((c) << 3)    + 0xb00000)
+#define PKO_DPFI_STATUS(n)		(PKO_ADDR(n)		     + 0xc00000)
+#define PKO_DPFI_FLUSH(n)		(PKO_ADDR(n)		     + 0xc00008)
+#define PKO_DPFI_FPA_AURA(n)		(PKO_ADDR(n)		     + 0xc00010)
+#define PKO_DPFI_ENA(n)			(PKO_ADDR(n)		     + 0xc00018)
+#define PKO_STATUS(n)			(PKO_ADDR(n)		     + 0xd00000)
+#define PKO_ENABLE(n)			(PKO_ADDR(n)		     + 0xd00008)
+
+/* These levels mimic the pko internal linked queue structure */
+enum queue_level {
+	PQ	= 1,
+	L2_SQ	= 2,
+	L3_SQ	= 3,
+	L4_SQ	= 4,
+	L5_SQ	= 5,
+	DQ	= 6
+};
+
+enum pko_dqop_e {
+	DQOP_SEND,
+	DQOP_OPEN,
+	DQOP_CLOSE,
+	DQOP_QUERY
+};
+
+enum pko_dqstatus_e {
+	PASS = 0,
+	BADSTATE = 0x8,
+	NOFPABUF = 0x9,
+	NOPKOBUF = 0xa,
+	FAILRTNPTR = 0xb,
+	ALREADY = 0xc,
+	NOTCREATED = 0xd,
+	NOTEMPTY = 0xe,
+	SENDPKTDROP = 0xf
+};
+
+struct mac_info {
+	int	fifo_cnt;
+	int	prio;
+	int	speed;
+	int	fifo;
+	int	num_lmacs;
+};
+
+struct fifo_grp_info {
+	int	speed;
+	int	size;
+};
+
+static const int lut_index_78xx[] = {
+	0x200,
+	0x240,
+	0x280,
+	0x2c0,
+	0x300,
+	0x340
+};
+
+static const int lut_index_73xx[] = {
+	0x000,
+	0x040,
+	0x080
+};
+
+static enum queue_level max_sq_level(void)
+{
+	/* 73xx and 75xx only have 3 scheduler queue levels */
+	if (OCTEON_IS_MODEL(OCTEON_CN73XX) || OCTEON_IS_MODEL(OCTEON_CNF75XX))
+		return L3_SQ;
+
+	return L5_SQ;
+}
+
+static int get_num_fifos(void)
+{
+	if (OCTEON_IS_MODEL(OCTEON_CN73XX) || OCTEON_IS_MODEL(OCTEON_CNF75XX))
+		return 16;
+
+	return 28;
+}
+
+static int get_num_fifo_groups(void)
+{
+	if (OCTEON_IS_MODEL(OCTEON_CN73XX) || OCTEON_IS_MODEL(OCTEON_CNF75XX))
+		return 5;
+
+	return 8;
+}
+
+static int get_num_output_macs(void)
+{
+	if (OCTEON_IS_MODEL(OCTEON_CN78XX))
+		return 28;
+	else if (OCTEON_IS_MODEL(OCTEON_CNF75XX))
+		return 10;
+	else if (OCTEON_IS_MODEL(OCTEON_CN73XX))
+		return 14;
+
+	return 0;
+}
+
+static int get_output_mac(int interface, int index,
+			  enum octeon3_mac_type	mac_type)
+{
+	int mac;
+
+	/* Output macs are hardcoded in the hardware. See PKO Output MACs
+	 * section in the HRM.
+	 */
+	if (OCTEON_IS_MODEL(OCTEON_CN73XX) || OCTEON_IS_MODEL(OCTEON_CNF75XX)) {
+		if (mac_type == SRIO_MAC)
+			mac = 4 + 2 * interface + index;
+		else
+			mac = 2 + 4 * interface + index;
+	} else {
+		mac = 4 + 4 * interface + index;
+	}
+
+	return mac;
+}
+
+static int get_num_port_queues(void)
+{
+	if (OCTEON_IS_MODEL(OCTEON_CN73XX) || OCTEON_IS_MODEL(OCTEON_CNF75XX))
+		return 16;
+
+	return 32;
+}
+
+static int allocate_queues(int node, enum queue_level level,
+			   int num_queues, int *queues)
+{
+	struct global_resource_tag tag;
+	int max_queues = 0;
+	char buf[16];
+	int rc;
+
+	if (level == PQ) {
+		strncpy((char *)&tag.lo, "cvm_pkop", 8);
+		snprintf(buf, 16, "oq_%d....", node);
+		memcpy(&tag.hi, buf, 8);
+
+		if (OCTEON_IS_MODEL(OCTEON_CN78XX))
+			max_queues = 32;
+		else
+			max_queues = 16;
+	} else if (level == L2_SQ) {
+		strncpy((char *)&tag.lo, "cvm_pkol", 8);
+		snprintf(buf, 16, "2q_%d....", node);
+		memcpy(&tag.hi, buf, 8);
+
+		if (OCTEON_IS_MODEL(OCTEON_CN78XX))
+			max_queues = 512;
+		else
+			max_queues = 256;
+	} else if (level == L3_SQ) {
+		strncpy((char *)&tag.lo, "cvm_pkol", 8);
+		snprintf(buf, 16, "3q_%d....", node);
+		memcpy(&tag.hi, buf, 8);
+
+		if (OCTEON_IS_MODEL(OCTEON_CN78XX))
+			max_queues = 512;
+		else
+			max_queues = 256;
+	} else if (level == L4_SQ) {
+		strncpy((char *)&tag.lo, "cvm_pkol", 8);
+		snprintf(buf, 16, "4q_%d....", node);
+		memcpy(&tag.hi, buf, 8);
+
+		if (OCTEON_IS_MODEL(OCTEON_CN78XX))
+			max_queues = 1024;
+		else
+			max_queues = 0;
+	} else if (level == L5_SQ) {
+		strncpy((char *)&tag.lo, "cvm_pkol", 8);
+		snprintf(buf, 16, "5q_%d....", node);
+		memcpy(&tag.hi, buf, 8);
+
+		if (OCTEON_IS_MODEL(OCTEON_CN78XX))
+			max_queues = 1024;
+		else
+			max_queues = 0;
+	} else if (level == DQ) {
+		strncpy((char *)&tag.lo, "cvm_pkod", 8);
+		snprintf(buf, 16, "eq_%d....", node);
+		memcpy(&tag.hi, buf, 8);
+
+		if (OCTEON_IS_MODEL(OCTEON_CN78XX))
+			max_queues = 1024;
+		else
+			max_queues = 256;
+	}
+
+	res_mgr_create_resource(tag, max_queues);
+	rc = res_mgr_alloc_range(tag, -1, num_queues, false, queues);
+	if (rc < 0)
+		return rc;
+
+	return 0;
+}
+
+static void free_queues(int node, enum queue_level level,
+			int num_queues, const int *queues)
+{
+	struct global_resource_tag tag;
+	char buf[16];
+
+	if (level == PQ) {
+		strncpy((char *)&tag.lo, "cvm_pkop", 8);
+		snprintf(buf, 16, "oq_%d....", node);
+		memcpy(&tag.hi, buf, 8);
+	} else if (level == L2_SQ) {
+		strncpy((char *)&tag.lo, "cvm_pkol", 8);
+		snprintf(buf, 16, "2q_%d....", node);
+		memcpy(&tag.hi, buf, 8);
+	} else if (level == L3_SQ) {
+		strncpy((char *)&tag.lo, "cvm_pkol", 8);
+		snprintf(buf, 16, "3q_%d....", node);
+		memcpy(&tag.hi, buf, 8);
+	} else if (level == L4_SQ) {
+		strncpy((char *)&tag.lo, "cvm_pkol", 8);
+		snprintf(buf, 16, "4q_%d....", node);
+		memcpy(&tag.hi, buf, 8);
+	} else if (level == L5_SQ) {
+		strncpy((char *)&tag.lo, "cvm_pkol", 8);
+		snprintf(buf, 16, "5q_%d....", node);
+		memcpy(&tag.hi, buf, 8);
+	} else if (level == DQ) {
+		strncpy((char *)&tag.lo, "cvm_pkod", 8);
+		snprintf(buf, 16, "eq_%d....", node);
+		memcpy(&tag.hi, buf, 8);
+	}
+
+	res_mgr_free_range(tag, queues, num_queues);
+}
+
+static int port_queue_init(int node, int pq, int mac)
+{
+	u64 data;
+
+	data = mac << 16;
+	oct_csr_write(data, PKO_L1_SQ_TOPOLOGY(node, pq));
+
+	data = mac << 13;
+	oct_csr_write(data, PKO_L1_SQ_SHAPE(node, pq));
+
+	data = mac;
+	data <<= 44;
+	oct_csr_write(data, PKO_L1_SQ_LINK(node, pq));
+
+	return 0;
+}
+
+static int scheduler_queue_l2_init(int node, int queue, int parent_q)
+{
+	u64 data;
+
+	data = oct_csr_read(PKO_L1_SQ_TOPOLOGY(node, parent_q));
+	data &= ~(GENMASK_ULL(40, 32) | GENMASK_ULL(4, 1));
+	data |= (u64)queue << 32;
+	data |= 0xf << 1;
+	oct_csr_write(data, PKO_L1_SQ_TOPOLOGY(node, parent_q));
+
+	oct_csr_write(0, PKO_L2_SQ_SCHEDULE(node, queue));
+
+	data = parent_q << 16;
+	oct_csr_write(data, PKO_L2_SQ_TOPOLOGY(node, queue));
+
+	return 0;
+}
+
+static int scheduler_queue_l3_init(int node, int queue, int parent_q)
+{
+	u64 data;
+
+	data = oct_csr_read(PKO_L2_SQ_TOPOLOGY(node, parent_q));
+	data &= ~(GENMASK_ULL(40, 32) | GENMASK_ULL(4, 1));
+	data |= (u64)queue << 32;
+	data |= 0xf << 1;
+	oct_csr_write(data, PKO_L2_SQ_TOPOLOGY(node, parent_q));
+
+	oct_csr_write(0, PKO_L3_SQ_SCHEDULE(node, queue));
+
+	data = parent_q << 16;
+	oct_csr_write(data, PKO_L3_SQ_TOPOLOGY(node, queue));
+
+	return 0;
+}
+
+static int scheduler_queue_l4_init(int node, int queue, int parent_q)
+{
+	u64 data;
+
+	data = oct_csr_read(PKO_L3_SQ_TOPOLOGY(node, parent_q));
+	data &= ~(GENMASK_ULL(41, 32) | GENMASK_ULL(4, 1));
+	data |= (u64)queue << 32;
+	data |= 0xf << 1;
+	oct_csr_write(data, PKO_L3_SQ_TOPOLOGY(node, parent_q));
+
+	oct_csr_write(0, PKO_L4_SQ_SCHEDULE(node, queue));
+
+	data = parent_q << 16;
+	oct_csr_write(data, PKO_L4_SQ_TOPOLOGY(node, queue));
+
+	return 0;
+}
+
+static int scheduler_queue_l5_init(int node, int queue, int parent_q)
+{
+	u64 data;
+
+	data = oct_csr_read(PKO_L4_SQ_TOPOLOGY(node, parent_q));
+	data &= ~(GENMASK_ULL(41, 32) | GENMASK_ULL(4, 1));
+	data |= (u64)queue << 32;
+	data |= 0xf << 1;
+	oct_csr_write(data, PKO_L4_SQ_TOPOLOGY(node, parent_q));
+
+	oct_csr_write(0, PKO_L5_SQ_SCHEDULE(node, queue));
+
+	data = parent_q << 16;
+	oct_csr_write(data, PKO_L5_SQ_TOPOLOGY(node, queue));
+
+	return 0;
+}
+
+static int descriptor_queue_init(int node, const int *queue, int parent_q,
+				 int num_dq)
+{
+	int rr_quantum;
+	int rr_prio;
+	u64 addr;
+	u64 data;
+	int prio;
+	int i;
+
+	/* Limit static priorities to the available prio field bits */
+	if (num_dq > 9) {
+		pr_err("octeon3-pko: Invalid number of dqs\n");
+		return -1;
+	}
+
+	prio = 0;
+
+	if (num_dq == 1) {
+		/* Single dq */
+		rr_prio = 0xf;
+		rr_quantum = 0x10;
+	} else {
+		/* Multiple dqs */
+		rr_prio = num_dq;
+		rr_quantum = 0;
+	}
+
+	if (OCTEON_IS_MODEL(OCTEON_CN78XX))
+		addr = PKO_L5_SQ_TOPOLOGY(node, parent_q);
+	else
+		addr = PKO_L3_SQ_TOPOLOGY(node, parent_q);
+
+	data = oct_csr_read(addr);
+	data &= ~(GENMASK_ULL(41, 32) | GENMASK_ULL(4, 1));
+	data |= (u64)queue[0] << 32;
+	data |= rr_prio << 1;
+	oct_csr_write(data, addr);
+
+	for (i = 0; i < num_dq; i++) {
+		data = (prio << 24) | rr_quantum;
+		oct_csr_write(data, PKO_DQ_SCHEDULE(node, queue[i]));
+
+		data = parent_q << 16;
+		oct_csr_write(data, PKO_DQ_TOPOLOGY(node, queue[i]));
+
+		data = BIT(49);
+		oct_csr_write(data, PKO_DQ_WM_CTL(node, queue[i]));
+
+		if (prio << rr_prio)
+			prio++;
+	}
+
+	return 0;
+}
+
+static int map_channel(int node, int pq, int queue, int ipd_port)
+{
+	int lut_index = 0;
+	int table_index;
+	u64 data;
+
+	data = oct_csr_read(PKO_L3_L2_SQ_CHANNEL(node, queue));
+	data &= ~GENMASK_ULL(43, 32);
+	data |= (u64)ipd_port << 32;
+	oct_csr_write(data, PKO_L3_L2_SQ_CHANNEL(node, queue));
+
+	/* See PKO_LUT register description in the HRM for how to compose the
+	 * lut_index.
+	 */
+	if (OCTEON_IS_MODEL(OCTEON_CN78XX)) {
+		table_index = ((ipd_port & 0xf00) - 0x800) >> 8;
+		lut_index = lut_index_78xx[table_index];
+		lut_index += ipd_port & 0xff;
+	} else if (OCTEON_IS_MODEL(OCTEON_CN73XX)) {
+		table_index = ((ipd_port & 0xf00) - 0x800) >> 8;
+		lut_index = lut_index_73xx[table_index];
+		lut_index += ipd_port & 0xff;
+	} else if (OCTEON_IS_MODEL(OCTEON_CNF75XX)) {
+		if ((ipd_port & 0xf00) != 0x800)
+			return -1;
+		lut_index = ipd_port & 0xff;
+	}
+
+	data = BIT(15);
+	data |= pq << 9;
+	data |= queue;
+	oct_csr_write(data, PKO_LUT(node, lut_index));
+
+	return 0;
+}
+
+static int open_dq(int node, int dq)
+{
+	enum pko_dqstatus_e status;
+	u64 *scratch_addr;
+	u64 *iobdma_addr;
+	u64 data;
+
+	/* Build the dq open query. See PKO_QUERY_DMA_S in the HRM for the
+	 * query format.
+	 */
+	data = (LMTDMA_SCR_OFFSET >> 3) << 56;
+	data |= 1ull << 48;
+	data |= 0x51ull << 40;
+	data |= (u64)node << 36;
+	data |= (u64)DQOP_OPEN << 32;
+	data |= dq << 16;
+
+	CVMX_SYNCWS;
+	preempt_disable();
+
+	/* Clear return location */
+	scratch_addr = (u64 *)(SCRATCH_BASE + LMTDMA_SCR_OFFSET);
+	*scratch_addr = ~0ull;
+
+	/* Issue pko lmtdma command */
+	iobdma_addr = (u64 *)(IOBDMA_ORDERED_IO_ADDR);
+	*iobdma_addr = data;
+
+	/* Wait for lmtdma command to complete and get response*/
+	CVMX_SYNCIOBDMA;
+	data = *scratch_addr;
+
+	preempt_enable();
+
+	/* See PKO_QUERY_RTN_S in the HRM for response format */
+	status = (data & GENMASK_ULL(63, 60)) >> 60;
+	if (status != PASS && status != ALREADY) {
+		pr_err("octeon3-pko: Failed to open dq\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+static s64 query_dq(int node, int dq)
+{
+	enum pko_dqstatus_e status;
+	u64 *scratch_addr;
+	u64 *iobdma_addr;
+	s64 depth;
+	u64 data;
+
+	/* Build the dq open query. See PKO_QUERY_DMA_S in the HRM for the
+	 * query format.
+	 */
+	data = (LMTDMA_SCR_OFFSET >> 3) << 56;
+	data |= 1ull << 48;
+	data |= 0x51ull << 40;
+	data |= (u64)node << 36;
+	data |= (u64)DQOP_QUERY << 32;
+	data |= dq << 16;
+
+	CVMX_SYNCWS;
+	preempt_disable();
+
+	/* Clear return location */
+	scratch_addr = (u64 *)(SCRATCH_BASE + LMTDMA_SCR_OFFSET);
+	*scratch_addr = ~0ull;
+
+	/* Issue pko lmtdma command */
+	iobdma_addr = (u64 *)(IOBDMA_ORDERED_IO_ADDR);
+	*iobdma_addr = data;
+
+	/* Wait for lmtdma command to complete and get response*/
+	CVMX_SYNCIOBDMA;
+	data = *scratch_addr;
+
+	preempt_enable();
+
+	/* See PKO_QUERY_RTN_S in the HRM for response format */
+	status = (data & GENMASK_ULL(63, 60)) >> 60;
+	if (status != PASS) {
+		pr_err("octeon3-pko: Failed to query dq=%d\n", dq);
+		return -1;
+	}
+
+	depth = data & GENMASK_ULL(47, 0);
+
+	return depth;
+}
+
+static u64 close_dq(int node, int dq)
+{
+	enum pko_dqstatus_e status;
+	u64 *scratch_addr;
+	u64 *iobdma_addr;
+	u64 data;
+
+	/* Build the dq open query. See PKO_QUERY_DMA_S in the HRM for the
+	 * query format.
+	 */
+	data = (LMTDMA_SCR_OFFSET >> 3) << 56;
+	data |= 1ull << 48;
+	data |= 0x51ull << 40;
+	data |= (u64)node << 36;
+	data |= (u64)DQOP_CLOSE << 32;
+	data |= dq << 16;
+
+	CVMX_SYNCWS;
+	preempt_disable();
+
+	/* Clear return location */
+	scratch_addr = (u64 *)(SCRATCH_BASE + LMTDMA_SCR_OFFSET);
+	*scratch_addr = ~0ull;
+
+	/* Issue pko lmtdma command */
+	iobdma_addr = (u64 *)(IOBDMA_ORDERED_IO_ADDR);
+	*iobdma_addr = data;
+
+	/* Wait for lmtdma command to complete and get response*/
+	CVMX_SYNCIOBDMA;
+	data = *scratch_addr;
+
+	preempt_enable();
+
+	/* See PKO_QUERY_RTN_S in the HRM for response format */
+	status = (data & GENMASK_ULL(63, 60)) >> 60;
+	if (status != PASS) {
+		pr_err("octeon3-pko: Failed to close dq\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+static int get_78xx_fifos_required(int node, struct mac_info *macs)
+{
+	enum port_mode mode;
+	int fifo_cnt = 0;
+	int num_lmacs;
+	int index;
+	u64 data;
+	int prio;
+	int bgx;
+	int cnt;
+	int qlm;
+	int i;
+
+	/* The loopback mac gets 1 fifo by default */
+	macs[0].fifo_cnt = 1;
+	macs[0].speed = 1;
+	fifo_cnt += 1;
+
+	/* The dpi mac gets 1 fifo by default */
+	macs[1].fifo_cnt = 1;
+	macs[1].speed = 50;
+	fifo_cnt += 1;
+
+	/* The ilk macs get default number of fifos (module param) */
+	macs[2].fifo_cnt = ilk0_lanes <= 4 ? ilk0_lanes : 4;
+	macs[2].speed = 40;
+	fifo_cnt += macs[2].fifo_cnt;
+	macs[3].fifo_cnt = ilk1_lanes <= 4 ? ilk1_lanes : 4;
+	macs[3].speed = 40;
+	fifo_cnt += macs[3].fifo_cnt;
+
+	/* Assign fifos to the active bgx macs */
+	for (i = 4; i < get_num_output_macs(); i += 4) {
+		bgx = (i - 4) / 4;
+		qlm = bgx_port_get_qlm(node, bgx, 0);
+
+		data = oct_csr_read(GSER_CFG(node, qlm));
+		if (data & BIT(2)) {
+			data = oct_csr_read(BGX_CMR_TX_LMACS(node, bgx));
+			num_lmacs = data & 7;
+
+			for (index = 0; index < num_lmacs; index++) {
+				switch (num_lmacs) {
+				case 1:
+					macs[i + index].num_lmacs = 4;
+					break;
+				case 2:
+					macs[i + index].num_lmacs = 2;
+					break;
+				case 4:
+				default:
+					macs[i + index].num_lmacs = 1;
+					break;
+				}
+
+				mode = bgx_port_get_mode(node, bgx, 0);
+				switch (mode) {
+				case PORT_MODE_SGMII:
+				case PORT_MODE_RGMII:
+					macs[i + index].fifo_cnt = 1;
+					macs[i + index].prio = 1;
+					macs[i + index].speed = 1;
+					break;
+
+				case PORT_MODE_XAUI:
+				case PORT_MODE_RXAUI:
+					macs[i + index].fifo_cnt = 4;
+					macs[i + index].prio = 2;
+					macs[i + index].speed = 20;
+					break;
+
+				case PORT_MODE_10G_KR:
+				case PORT_MODE_XFI:
+					macs[i + index].fifo_cnt = 4;
+					macs[i + index].prio = 2;
+					macs[i + index].speed = 10;
+					break;
+
+				case PORT_MODE_40G_KR4:
+				case PORT_MODE_XLAUI:
+					macs[i + index].fifo_cnt = 4;
+					macs[i + index].prio = 3;
+					macs[i + index].speed = 40;
+					break;
+
+				default:
+					macs[i + index].fifo_cnt = 0;
+					macs[i + index].prio = 0;
+					macs[i + index].speed = 0;
+					macs[i + index].num_lmacs = 0;
+					break;
+				}
+
+				fifo_cnt += macs[i + index].fifo_cnt;
+			}
+		}
+	}
+
+	/* If more fifos than available were assigned, reduce the number of
+	 * fifos until within limit. Start with the lowest priority macs with 4
+	 * fifos.
+	 */
+	prio = 1;
+	cnt = 4;
+	while (fifo_cnt > get_num_fifos()) {
+		for (i = 0; i < get_num_output_macs(); i++) {
+			if (macs[i].prio == prio && macs[i].fifo_cnt == cnt) {
+				macs[i].fifo_cnt >>= 1;
+				fifo_cnt -= macs[i].fifo_cnt;
+			}
+
+			if (fifo_cnt <= get_num_fifos())
+				break;
+		}
+
+		if (prio >= 3) {
+			prio = 1;
+			cnt >>= 1;
+		} else {
+			prio++;
+		}
+
+		if (cnt == 0)
+			break;
+	}
+
+	/* Assign left over fifos to dpi */
+	if (get_num_fifos() - fifo_cnt > 0) {
+		if (get_num_fifos() - fifo_cnt >= 3) {
+			macs[1].fifo_cnt += 3;
+			fifo_cnt -= 3;
+		} else {
+			macs[1].fifo_cnt += 1;
+			fifo_cnt -= 1;
+		}
+	}
+
+	return 0;
+}
+
+static int get_75xx_fifos_required(int node, struct mac_info *macs)
+{
+	enum port_mode mode;
+	int fifo_cnt = 0;
+	int index;
+	u64 data;
+	int prio;
+	int bgx;
+	int cnt;
+	int qlm;
+	int i;
+
+	/* The loopback mac gets 1 fifo by default */
+	macs[0].fifo_cnt = 1;
+	macs[0].speed = 1;
+	fifo_cnt += 1;
+
+	/* The dpi mac gets 1 fifo by default */
+	macs[1].fifo_cnt = 1;
+	macs[1].speed = 50;
+	fifo_cnt += 1;
+
+	/* Assign fifos to the active bgx macs */
+	bgx = 0;
+	for (i = 2; i < 6; i++) {
+		index = i - 2;
+		qlm = bgx_port_get_qlm(node, bgx, index);
+		data = oct_csr_read(GSER_CFG(node, qlm));
+		if (data & BIT(2)) {
+			macs[i].num_lmacs = 1;
+
+			mode = bgx_port_get_mode(node, bgx, index);
+			switch (mode) {
+			case PORT_MODE_SGMII:
+			case PORT_MODE_RGMII:
+				macs[i].fifo_cnt = 1;
+				macs[i].prio = 1;
+				macs[i].speed = 1;
+				break;
+
+			case PORT_MODE_10G_KR:
+			case PORT_MODE_XFI:
+				macs[i].fifo_cnt = 4;
+				macs[i].prio = 2;
+				macs[i].speed = 10;
+				break;
+
+			default:
+				macs[i].fifo_cnt = 0;
+				macs[i].prio = 0;
+				macs[i].speed = 0;
+				macs[i].num_lmacs = 0;
+				break;
+			}
+
+			fifo_cnt += macs[i].fifo_cnt;
+		}
+	}
+
+	/* If more fifos than available were assigned, reduce the number of
+	 * fifos until within limit. Start with the lowest priority macs with 4
+	 * fifos.
+	 */
+	prio = 1;
+	cnt = 4;
+	while (fifo_cnt > get_num_fifos()) {
+		for (i = 0; i < get_num_output_macs(); i++) {
+			if (macs[i].prio == prio && macs[i].fifo_cnt == cnt) {
+				macs[i].fifo_cnt >>= 1;
+				fifo_cnt -= macs[i].fifo_cnt;
+			}
+
+			if (fifo_cnt <= get_num_fifos())
+				break;
+		}
+
+		if (prio >= 3) {
+			prio = 1;
+			cnt >>= 1;
+		} else {
+			prio++;
+		}
+
+		if (cnt == 0)
+			break;
+	}
+
+	/* Assign left over fifos to dpi */
+	if (get_num_fifos() - fifo_cnt > 0) {
+		if (get_num_fifos() - fifo_cnt >= 3) {
+			macs[1].fifo_cnt += 3;
+			fifo_cnt -= 3;
+		} else {
+			macs[1].fifo_cnt += 1;
+			fifo_cnt -= 1;
+		}
+	}
+
+	return 0;
+}
+
+static int get_73xx_fifos_required(int node, struct mac_info *macs)
+{
+	enum port_mode mode;
+	int fifo_cnt = 0;
+	int num_lmacs;
+	int index;
+	u64 data;
+	int prio;
+	int bgx;
+	int cnt;
+	int qlm;
+	int i;
+
+	/* The loopback mac gets 1 fifo by default */
+	macs[0].fifo_cnt = 1;
+	macs[0].speed = 1;
+	fifo_cnt += 1;
+
+	/* The dpi mac gets 1 fifo by default */
+	macs[1].fifo_cnt = 1;
+	macs[1].speed = 50;
+	fifo_cnt += 1;
+
+	/* Assign fifos to the active bgx macs */
+	for (i = 2; i < get_num_output_macs(); i += 4) {
+		bgx = (i - 2) / 4;
+		qlm = bgx_port_get_qlm(node, bgx, 0);
+		data = oct_csr_read(GSER_CFG(node, qlm));
+
+		/* Bgx2 can be connected to dlm 5, 6, or both */
+		if (bgx == 2) {
+			if (!(data & BIT(2))) {
+				qlm = bgx_port_get_qlm(node, bgx, 2);
+				data = oct_csr_read(GSER_CFG(node, qlm));
+			}
+		}
+
+		if (data & BIT(2)) {
+			data = oct_csr_read(BGX_CMR_TX_LMACS(node, bgx));
+			num_lmacs = data & 7;
+
+			for (index = 0; index < num_lmacs; index++) {
+				switch (num_lmacs) {
+				case 1:
+					macs[i + index].num_lmacs = 4;
+					break;
+				case 2:
+					macs[i + index].num_lmacs = 2;
+					break;
+				case 4:
+				default:
+					macs[i + index].num_lmacs = 1;
+					break;
+				}
+
+				mode = bgx_port_get_mode(node, bgx, index);
+				switch (mode) {
+				case PORT_MODE_SGMII:
+				case PORT_MODE_RGMII:
+					macs[i + index].fifo_cnt = 1;
+					macs[i + index].prio = 1;
+					macs[i + index].speed = 1;
+					break;
+
+				case PORT_MODE_XAUI:
+				case PORT_MODE_RXAUI:
+					macs[i + index].fifo_cnt = 4;
+					macs[i + index].prio = 2;
+					macs[i + index].speed = 20;
+					break;
+
+				case PORT_MODE_10G_KR:
+				case PORT_MODE_XFI:
+					macs[i + index].fifo_cnt = 4;
+					macs[i + index].prio = 2;
+					macs[i + index].speed = 10;
+					break;
+
+				case PORT_MODE_40G_KR4:
+				case PORT_MODE_XLAUI:
+					macs[i + index].fifo_cnt = 4;
+					macs[i + index].prio = 3;
+					macs[i + index].speed = 40;
+					break;
+
+				default:
+					macs[i + index].fifo_cnt = 0;
+					macs[i + index].prio = 0;
+					macs[i + index].speed = 0;
+					break;
+				}
+
+				fifo_cnt += macs[i + index].fifo_cnt;
+			}
+		}
+	}
+
+	/* If more fifos than available were assigned, reduce the number of
+	 * fifos until within limit. Start with the lowest priority macs with 4
+	 * fifos.
+	 */
+	prio = 1;
+	cnt = 4;
+	while (fifo_cnt > get_num_fifos()) {
+		for (i = 0; i < get_num_output_macs(); i++) {
+			if (macs[i].prio == prio && macs[i].fifo_cnt == cnt) {
+				macs[i].fifo_cnt >>= 1;
+				fifo_cnt -= macs[i].fifo_cnt;
+			}
+
+			if (fifo_cnt <= get_num_fifos())
+				break;
+		}
+
+		if (prio >= 3) {
+			prio = 1;
+			cnt >>= 1;
+		} else {
+			prio++;
+		}
+
+		if (cnt == 0)
+			break;
+	}
+
+	/* Assign left over fifos to dpi */
+	if (get_num_fifos() - fifo_cnt > 0) {
+		if (get_num_fifos() - fifo_cnt >= 3) {
+			macs[1].fifo_cnt += 3;
+			fifo_cnt -= 3;
+		} else {
+			macs[1].fifo_cnt += 1;
+			fifo_cnt -= 1;
+		}
+	}
+
+	return 0;
+}
+
+static int setup_macs(int node)
+{
+	struct fifo_grp_info fifo_grp[MAX_FIFO_GRP];
+	struct mac_info macs[MAX_OUTPUT_MAC];
+	u64 data;
+	int fifo;
+	int size;
+	int cnt;
+	int grp;
+	int i;
+
+	memset(macs, 0, sizeof(macs));
+	memset(fifo_grp, 0, sizeof(fifo_grp));
+
+	/* Get the number of fifos required by each mac */
+	if (OCTEON_IS_MODEL(OCTEON_CN78XX)) {
+		get_78xx_fifos_required(node, macs);
+	} else if (OCTEON_IS_MODEL(OCTEON_CNF75XX)) {
+		get_75xx_fifos_required(node, macs);
+	} else if (OCTEON_IS_MODEL(OCTEON_CN73XX)) {
+		get_73xx_fifos_required(node, macs);
+	} else {
+		pr_err("octeon3-pko: Unsupported board type\n");
+		return -1;
+	}
+
+	/* Assign fifos to each mac. Start with macs requiring 4 fifos */
+	fifo = 0;
+	for (cnt = 4; cnt > 0; cnt >>= 1) {
+		for (i = 0; i < get_num_output_macs(); i++) {
+			if (macs[i].fifo_cnt != cnt)
+				continue;
+
+			macs[i].fifo = fifo;
+			grp = fifo / 4;
+
+			fifo_grp[grp].speed += macs[i].speed;
+
+			if (cnt == 4) {
+				/* 10, 0, 0, 0 */
+				fifo_grp[grp].size = 4;
+			} else if (cnt == 2) {
+				/* 5, 0, 5, 0 */
+				fifo_grp[grp].size = 3;
+			} else if (cnt == 1) {
+				if ((fifo & 0x2) && fifo_grp[grp].size == 3) {
+					/* 5, 0, 2.5, 2.5 */
+					fifo_grp[grp].size = 1;
+				} else {
+					/* 2.5, 2.5, 2.5, 2.5 */
+					fifo_grp[grp].size = 0;
+				}
+			}
+
+			fifo += cnt;
+		}
+	}
+
+	/* Configure the fifo groups */
+	for (i = 0; i < get_num_fifo_groups(); i++) {
+		data = oct_csr_read(PKO_PTGF_CFG(node, i));
+		size = data & GENMASK_ULL(2, 0);
+		if (size != fifo_grp[i].size)
+			data |= BIT(6);
+		data &= ~GENMASK_ULL(2, 0);
+		data |= fifo_grp[i].size;
+
+		data &= ~GENMASK_ULL(5, 3);
+		if (fifo_grp[i].speed >= 40) {
+			if (fifo_grp[i].size >= 3) {
+				/* 50 Gbps */
+				data |= 0x3 << 3;
+			} else {
+				/* 25 Gbps */
+				data |= 0x2 << 3;
+			}
+		} else if (fifo_grp[i].speed >= 20) {
+			/* 25 Gbps */
+			data |= 0x2 << 3;
+		} else if (fifo_grp[i].speed >= 10) {
+			/* 12.5 Gbps */
+			data |= 0x1 << 3;
+		}
+		oct_csr_write(data, PKO_PTGF_CFG(node, i));
+		data &= ~BIT(6);
+		oct_csr_write(data, PKO_PTGF_CFG(node, i));
+	}
+
+	/* Configure the macs with their assigned fifo */
+	for (i = 0; i < get_num_output_macs(); i++) {
+		data = oct_csr_read(PKO_MAC_CFG(node, i));
+		data &= ~GENMASK_ULL(4, 0);
+		if (!macs[i].fifo_cnt)
+			data |= 0x1f;
+		else
+			data |= macs[i].fifo;
+		oct_csr_write(data, PKO_MAC_CFG(node, i));
+	}
+
+	/* Setup mci0/mci1/skid credits */
+	for (i = 0; i < get_num_output_macs(); i++) {
+		int fifo_credit;
+		int skid_credit;
+		int mac_credit;
+
+		if (!macs[i].fifo_cnt)
+			continue;
+
+		if (i == 0) {
+			/* Loopback */
+			mac_credit = 4 * 1024;
+			skid_credit = 0;
+		} else if (i == 1) {
+			/* Dpi */
+			mac_credit = 2 * 1024;
+			skid_credit = 0;
+		} else if (OCTEON_IS_MODEL(OCTEON_CN78XX) && ((i == 2 || i == 3))) {
+			/* ILK */
+			mac_credit = 4 * 1024;
+			skid_credit = 0;
+		} else if (OCTEON_IS_MODEL(OCTEON_CNF75XX) && ((i >= 6 && i <= 9))) {
+			/* Srio */
+			mac_credit = 1024 / 2;
+			skid_credit = 0;
+		} else {
+			/* Bgx */
+			mac_credit = macs[i].num_lmacs * 8 * 1024;
+			skid_credit = macs[i].num_lmacs * 256;
+		}
+
+		if (OCTEON_IS_MODEL(OCTEON_CN78XX_PASS1_X)) {
+			fifo_credit = macs[i].fifo_cnt * FIFO_SIZE;
+			data = (fifo_credit + mac_credit) / 16;
+			oct_csr_write(data, PKO_MCI0_MAX_CRED(node, i));
+		}
+
+		data = mac_credit / 16;
+		oct_csr_write(data, PKO_MCI1_MAX_CRED(node, i));
+
+		data = oct_csr_read(PKO_MAC_CFG(node, i));
+		data &= ~GENMASK_ULL(6, 5);
+		data |= ((skid_credit / 256) >> 1) << 5;
+		oct_csr_write(data, PKO_MAC_CFG(node, i));
+	}
+
+	return 0;
+}
+
+static int hw_init_global(int node, int aura)
+{
+	int timeout;
+	u64 data;
+
+	data = oct_csr_read(PKO_ENABLE(node));
+	if (data & BIT(0)) {
+		pr_info("octeon3-pko: Pko already enabled on node %d\n", node);
+		return 0;
+	}
+
+	/* Enable color awareness */
+	data = oct_csr_read(PKO_SHAPER_CFG(node));
+	data |= BIT(1);
+	oct_csr_write(data, PKO_SHAPER_CFG(node));
+
+	/* Clear flush command */
+	oct_csr_write(0, PKO_DPFI_FLUSH(node));
+
+	/* Set the aura number */
+	data = (node << 10) | aura;
+	oct_csr_write(data, PKO_DPFI_FPA_AURA(node));
+
+	data = BIT(0);
+	oct_csr_write(data, PKO_DPFI_ENA(node));
+
+	/* Wait until all pointers have been returned */
+	timeout = 100000;
+	do {
+		data = oct_csr_read(PKO_STATUS(node));
+		if (data & BIT(63))
+			break;
+		udelay(1);
+		timeout--;
+	} while (timeout);
+	if (!timeout) {
+		pr_err("octeon3-pko: Pko dfpi failed on node %d\n", node);
+		return -1;
+	}
+
+	/* Set max outstanding requests in IOBP for any FIFO.*/
+	data = oct_csr_read(PKO_PTF_IOBP_CFG(node));
+	data &= ~GENMASK_ULL(6, 0);
+	if (OCTEON_IS_MODEL(OCTEON_CN78XX))
+		data |= 0x10;
+	else
+		data |= 3;
+	oct_csr_write(data, PKO_PTF_IOBP_CFG(node));
+
+	/* Set minimum packet size per Ethernet standard */
+	data = 0x3c << 3;
+	oct_csr_write(data, PKO_PDM_CFG(node));
+
+	/* Initialize macs and fifos */
+	setup_macs(node);
+
+	/* Enable pko */
+	data = BIT(0);
+	oct_csr_write(data, PKO_ENABLE(node));
+
+	/* Verify pko is ready */
+	data = oct_csr_read(PKO_STATUS(node));
+	if (!(data & BIT(63))) {
+		pr_err("octeon3_pko: pko is not ready\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+static int hw_exit_global(int node)
+{
+	int timeout;
+	u64 data;
+	int i;
+
+	/* Wait until there are no in-flight packets */
+	for (i = 0; i < get_num_fifos(); i++) {
+		data = oct_csr_read(PKO_PTF_STATUS(node, i));
+		if ((data & GENMASK_ULL(4, 0)) == 0x1f)
+			continue;
+
+		timeout = 10000;
+		do {
+			if (!(data & GENMASK_ULL(11, 5)))
+				break;
+			udelay(1);
+			timeout--;
+			data = oct_csr_read(PKO_PTF_STATUS(node, i));
+		} while (timeout);
+		if (!timeout) {
+			pr_err("octeon3-pko: Timeout in-flight fifo\n");
+			return -1;
+		}
+	}
+
+	/* Disable pko */
+	oct_csr_write(0, PKO_ENABLE(node));
+
+	/* Reset all port queues to the virtual mac */
+	for (i = 0; i < get_num_port_queues(); i++) {
+		data = get_num_output_macs() << 16;
+		oct_csr_write(data, PKO_L1_SQ_TOPOLOGY(node, i));
+
+		data = get_num_output_macs() << 13;
+		oct_csr_write(data, PKO_L1_SQ_SHAPE(node, i));
+
+		data = (u64)get_num_output_macs() << 48;
+		oct_csr_write(data, PKO_L1_SQ_LINK(node, i));
+	}
+
+	/* Reset all output macs */
+	for (i = 0; i < get_num_output_macs(); i++) {
+		data = 0x1f;
+		oct_csr_write(data, PKO_MAC_CFG(node, i));
+	}
+
+	/* Reset all fifo groups */
+	for (i = 0; i < get_num_fifo_groups(); i++) {
+		data = oct_csr_read(PKO_PTGF_CFG(node, i));
+		/* Simulator asserts if an unused group is reset */
+		if (data == 0)
+			continue;
+		data = BIT(6);
+		oct_csr_write(data, PKO_PTGF_CFG(node, i));
+	}
+
+	/* Return cache pointers to fpa */
+	data = BIT(0);
+	oct_csr_write(data, PKO_DPFI_FLUSH(node));
+	timeout = 10000;
+	do {
+		data = oct_csr_read(PKO_DPFI_STATUS(node));
+		if (data & BIT(0))
+			break;
+		udelay(1);
+		timeout--;
+	} while (timeout);
+	if (!timeout) {
+		pr_err("octeon3-pko: Timeout flushing cache\n");
+		return -1;
+	}
+	oct_csr_write(0, PKO_DPFI_ENA(node));
+	oct_csr_write(0, PKO_DPFI_FLUSH(node));
+
+	return 0;
+}
+
+static int virtual_mac_config(int node)
+{
+	enum queue_level level;
+	int parent_q;
+	int num_dq;
+	int dq[8];
+	int queue;
+	int vmac;
+	int pq;
+	int i;
+	int rc;
+
+	/* The virtual mac is after the last output mac. Note: for the 73xx it
+	 * might be 2 after the last output mac (15).
+	 */
+	vmac = get_num_output_macs();
+
+	/* Allocate a port queue */
+	rc = allocate_queues(node, PQ, 1, &pq);
+	if (rc < 0) {
+		pr_err("octeon3-pko: Failed to allocate port queue\n");
+		return rc;
+	}
+
+	/* Connect the port queue to the output mac */
+	port_queue_init(node, pq, vmac);
+
+	parent_q = pq;
+	for (level = L2_SQ; level <= max_sq_level(); level++) {
+		rc = allocate_queues(node, level, 1, &queue);
+		if (rc < 0) {
+			pr_err("octeon3-pko: Failed to allocate queue\n");
+			return rc;
+		}
+
+		switch (level) {
+		case L2_SQ:
+			scheduler_queue_l2_init(node, queue, parent_q);
+			break;
+		case L3_SQ:
+			scheduler_queue_l3_init(node, queue, parent_q);
+			break;
+		case L4_SQ:
+			scheduler_queue_l4_init(node, queue, parent_q);
+			break;
+		case L5_SQ:
+			scheduler_queue_l5_init(node, queue, parent_q);
+			break;
+		default:
+			break;
+		}
+
+		parent_q = queue;
+	}
+
+	if (OCTEON_IS_MODEL(OCTEON_CN78XX_PASS1_0))
+		num_dq = 8;
+	else
+		num_dq = 1;
+
+	rc = allocate_queues(node, DQ, num_dq, dq);
+	if (rc < 0) {
+		pr_err("octeon3-pko: Failed to allocate description queues\n");
+		return rc;
+	}
+
+	/* By convention the dq must be zero */
+	if (dq[0] != 0) {
+		pr_err("octeon3-pko: Failed to reserve description queues\n");
+		return -1;
+	}
+	descriptor_queue_init(node, dq, parent_q, num_dq);
+
+	/* Open the dqs */
+	for (i = 0; i < num_dq; i++)
+		open_dq(node, dq[i]);
+
+	return 0;
+}
+
+static int drain_dq(int node, int dq)
+{
+	int timeout;
+	u64 data;
+	s64 rc;
+
+	data = BIT(2) | BIT(1);
+	oct_csr_write(data, PKO_DQ_SW_XOFF(node, dq));
+
+	usleep_range(1000, 2000);
+
+	data = 0;
+	oct_csr_write(data, PKO_DQ_SW_XOFF(node, dq));
+
+	/* Wait for the dq to drain */
+	timeout = 10000;
+	do {
+		rc = query_dq(node, dq);
+		if (!rc)
+			break;
+		else if (rc < 0)
+			return rc;
+		udelay(1);
+		timeout--;
+	} while (timeout);
+	if (!timeout) {
+		pr_err("octeon3-pko: Timeout waiting for dq to drain\n");
+		return -1;
+	}
+
+	/* Close the queue anf free internal buffers */
+	close_dq(node, dq);
+
+	return 0;
+}
+
+int octeon3_pko_exit_global(int node)
+{
+	int num_dq;
+	int dq[8];
+	int i;
+
+	if (OCTEON_IS_MODEL(OCTEON_CN78XX_PASS1_0))
+		num_dq = 8;
+	else
+		num_dq = 1;
+
+	/* Shutdown the virtual/null interface */
+	for (i = 0; i < ARRAY_SIZE(dq); i++)
+		dq[i] = i;
+	octeon3_pko_interface_uninit(node, dq, num_dq);
+
+	/* Shutdown pko */
+	hw_exit_global(node);
+
+	return 0;
+}
+EXPORT_SYMBOL(octeon3_pko_exit_global);
+
+int octeon3_pko_init_global(int node, int aura)
+{
+	int rc;
+
+	rc = hw_init_global(node, aura);
+	if (rc)
+		return rc;
+
+	/* Channel credit level at level 2 */
+	oct_csr_write(0, PKO_CHANNEL_LEVEL(node));
+
+	/* Configure the null mac */
+	rc = virtual_mac_config(node);
+	if (rc)
+		return rc;
+
+	return 0;
+}
+EXPORT_SYMBOL(octeon3_pko_init_global);
+
+int octeon3_pko_set_mac_options(int node, int interface, int index,
+				enum octeon3_mac_type mac_type,
+				bool fcs_en, bool pad_en, int fcs_sop_off)
+{
+	int fifo_num;
+	u64 data;
+	int mac;
+
+	mac = get_output_mac(interface, index, mac_type);
+
+	data = oct_csr_read(PKO_MAC_CFG(node, mac));
+	fifo_num = data & GENMASK_ULL(4, 0);
+	if (fifo_num == 0x1f) {
+		pr_err("octeon3_pko: mac not configured %d:%d:%d\n",
+		       node, interface, index);
+		return -ENODEV;
+	}
+
+	/* Some silicon requires fifo_num=0x1f to change padding, fcs */
+	data &= ~GENMASK_ULL(4, 0);
+	data |= 0x1f;
+
+	data &= ~(BIT(16) | BIT(15) | GENMASK_ULL(14, 7));
+	if (pad_en)
+		data |= BIT(16);
+	if (fcs_en)
+		data |= BIT(15);
+	if (fcs_sop_off)
+		data |= fcs_sop_off << 7;
+
+	oct_csr_write(data, PKO_MAC_CFG(node, mac));
+
+	data &= ~GENMASK_ULL(4, 0);
+	data |= fifo_num;
+	oct_csr_write(data, PKO_MAC_CFG(node, mac));
+
+	return 0;
+}
+EXPORT_SYMBOL(octeon3_pko_set_mac_options);
+
+int octeon3_pko_get_fifo_size(int node, int interface, int index,
+			      enum octeon3_mac_type mac_type)
+{
+	int fifo_grp;
+	int fifo_off;
+	u64 data;
+	int size;
+	int mac;
+
+	/* Set fifo size to 2.4 KB */
+	size = FIFO_SIZE;
+
+	mac = get_output_mac(interface, index, mac_type);
+
+	data = oct_csr_read(PKO_MAC_CFG(node, mac));
+	if ((data & GENMASK_ULL(4, 0)) == 0x1f) {
+		pr_err("octeon3_pko: mac not configured %d:%d:%d\n",
+		       node, interface, index);
+		return -ENODEV;
+	}
+	fifo_grp = (data & GENMASK_ULL(4, 0)) >> 2;
+	fifo_off = data & GENMASK_ULL(1, 0);
+
+	data = oct_csr_read(PKO_PTGF_CFG(node, fifo_grp));
+	data &= GENMASK_ULL(2, 0);
+	switch (data) {
+	case 0:
+		/* 2.5l, 2.5k, 2.5k, 2.5k */
+		break;
+	case 1:
+		/* 5.0k, 0.0k, 2.5k, 2.5k */
+		if (fifo_off == 0)
+			size *= 2;
+		if (fifo_off == 1)
+			size = 0;
+		break;
+	case 2:
+		/* 2.5k, 2.5k, 5.0k, 0.0k */
+		if (fifo_off == 2)
+			size *= 2;
+		if (fifo_off == 3)
+			size = 0;
+		break;
+	case 3:
+		/* 5k, 0, 5k, 0 */
+		if ((fifo_off & 1) != 0)
+			size = 0;
+		size *= 2;
+		break;
+	case 4:
+		/* 10k, 0, 0, 0 */
+		if (fifo_off != 0)
+			size = 0;
+		size *= 4;
+		break;
+	default:
+		size = -1;
+	}
+
+	return size;
+}
+EXPORT_SYMBOL(octeon3_pko_get_fifo_size);
+
+int octeon3_pko_activate_dq(int node, int dq, int cnt)
+{
+	int rc = 0;
+	u64 data;
+	int i;
+
+	for (i = 0; i < cnt; i++) {
+		rc = open_dq(node, dq + i);
+		if (rc)
+			break;
+
+		data = oct_csr_read(PKO_PDM_DQ_MINPAD(node, dq + i));
+		data &= ~BIT(0);
+		oct_csr_write(data, PKO_PDM_DQ_MINPAD(node, dq + i));
+	}
+
+	return rc;
+}
+EXPORT_SYMBOL(octeon3_pko_activate_dq);
+
+int octeon3_pko_interface_init(int node, int interface, int index,
+			       enum octeon3_mac_type mac_type, int ipd_port)
+{
+	enum queue_level level;
+	int parent_q;
+	int queue;
+	int mac;
+	int pq;
+	int rc;
+
+	mac = get_output_mac(interface, index, mac_type);
+
+	/* Allocate a port queue for this interface */
+	rc = allocate_queues(node, PQ, 1, &pq);
+	if (rc < 0) {
+		pr_err("octeon3-pko: Failed to allocate port queue\n");
+		return rc;
+	}
+
+	/* Connect the port queue to the output mac */
+	port_queue_init(node, pq, mac);
+
+	/* Link scheduler queues to the port queue */
+	parent_q = pq;
+	for (level = L2_SQ; level <= max_sq_level(); level++) {
+		rc = allocate_queues(node, level, 1, &queue);
+		if (rc < 0) {
+			pr_err("octeon3-pko: Failed to allocate queue\n");
+			return rc;
+		}
+
+		switch (level) {
+		case L2_SQ:
+			scheduler_queue_l2_init(node, queue, parent_q);
+			map_channel(node, pq, queue, ipd_port);
+			break;
+		case L3_SQ:
+			scheduler_queue_l3_init(node, queue, parent_q);
+			break;
+		case L4_SQ:
+			scheduler_queue_l4_init(node, queue, parent_q);
+			break;
+		case L5_SQ:
+			scheduler_queue_l5_init(node, queue, parent_q);
+			break;
+		default:
+			break;
+		}
+
+		parent_q = queue;
+	}
+
+	/* Link the descriptor queue */
+	rc = allocate_queues(node, DQ, 1, &queue);
+	if (rc < 0) {
+		pr_err("octeon3-pko: Failed to allocate descriptor queue\n");
+		return rc;
+	}
+	descriptor_queue_init(node, &queue, parent_q, 1);
+
+	return queue;
+}
+EXPORT_SYMBOL(octeon3_pko_interface_init);
+
+int octeon3_pko_interface_uninit(int node, const int *dq, int num_dq)
+{
+	enum queue_level level;
+	int parent_q;
+	int queue;
+	u64 addr;
+	u64 data;
+	int rc;
+	int i;
+
+	/* Drain all dqs */
+	for (i = 0; i < num_dq; i++) {
+		rc = drain_dq(node, dq[i]);
+		if (rc)
+			return rc;
+
+		/* Free the dq */
+		data = oct_csr_read(PKO_DQ_TOPOLOGY(node, dq[i]));
+		parent_q = (data & GENMASK_ULL(25, 16)) >> 16;
+		free_queues(node, DQ, 1, &dq[i]);
+
+		/* Free all the scheduler queues */
+		queue = parent_q;
+		for (level = max_sq_level(); (signed int)level >= PQ; level--) {
+			switch (level) {
+			case L5_SQ:
+				addr = PKO_L5_SQ_TOPOLOGY(node, queue);
+				data = oct_csr_read(addr);
+				parent_q = (data & GENMASK_ULL(25, 16)) >> 16;
+				break;
+
+			case L4_SQ:
+				addr = PKO_L4_SQ_TOPOLOGY(node, queue);
+				data = oct_csr_read(addr);
+				parent_q = (data & GENMASK_ULL(24, 16)) >> 16;
+				break;
+
+			case L3_SQ:
+				addr = PKO_L3_SQ_TOPOLOGY(node, queue);
+				data = oct_csr_read(addr);
+				parent_q = (data & GENMASK_ULL(24, 16)) >> 16;
+				break;
+
+			case L2_SQ:
+				addr = PKO_L2_SQ_TOPOLOGY(node, queue);
+				data = oct_csr_read(addr);
+				parent_q = (data & GENMASK_ULL(20, 16)) >> 16;
+				break;
+
+			case PQ:
+				break;
+
+			default:
+				pr_err("octeon3-pko: Invalid level=%d\n",
+				       level);
+				return -1;
+			}
+
+			free_queues(node, level, 1, &queue);
+			queue = parent_q;
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(octeon3_pko_interface_uninit);
diff --git a/drivers/net/ethernet/cavium/octeon/octeon3-sso.c b/drivers/net/ethernet/cavium/octeon/octeon3-sso.c
new file mode 100644
index 0000000..9335093
--- /dev/null
+++ b/drivers/net/ethernet/cavium/octeon/octeon3-sso.c
@@ -0,0 +1,301 @@
+// SPDX-License-Identifier: GPL-2.0+
+/* Copyright (c) 2017 Cavium, Inc.
+ *
+ * Configuration of Schedule/Synchronize/Order (SSO) block.
+ */
+#include <linux/module.h>
+
+#include <asm/octeon/octeon.h>
+
+#include "octeon3.h"
+
+/* Registers are accessed via xkphys */
+#define SSO_BASE			0x1670000000000ull
+#define SSO_ADDR(node)			(SET_XKPHYS + NODE_OFFSET(node) +      \
+					 SSO_BASE)
+
+#define SSO_AW_STATUS(n)		(SSO_ADDR(n)		   + 0x000010e0)
+#define SSO_AW_CFG(n)			(SSO_ADDR(n)		   + 0x000010f0)
+#define SSO_ERR0(n)			(SSO_ADDR(n)		   + 0x00001240)
+#define SSO_TAQ_ADD(n)			(SSO_ADDR(n)		   + 0x000020e0)
+#define SSO_XAQ_AURA(n)			(SSO_ADDR(n)		   + 0x00002100)
+
+#define AQ_OFFSET(g)			((g) << 3)
+#define AQ_ADDR(n, g)			(SSO_ADDR(n) + AQ_OFFSET(g))
+#define SSO_XAQ_HEAD_PTR(n, g)		(AQ_ADDR(n, g)		   + 0x00080000)
+#define SSO_XAQ_TAIL_PTR(n, g)		(AQ_ADDR(n, g)		   + 0x00090000)
+#define SSO_XAQ_HEAD_NEXT(n, g)		(AQ_ADDR(n, g)		   + 0x000a0000)
+#define SSO_XAQ_TAIL_NEXT(n, g)		(AQ_ADDR(n, g)		   + 0x000b0000)
+
+#define GRP_OFFSET(grp)			((grp) << 16)
+#define GRP_ADDR(n, g)			(SSO_ADDR(n) + GRP_OFFSET(g))
+#define SSO_GRP_TAQ_THR(n, g)		(GRP_ADDR(n, g)		   + 0x20000100)
+#define SSO_GRP_PRI(n, g)		(GRP_ADDR(n, g)		   + 0x20000200)
+#define SSO_GRP_INT(n, g)		(GRP_ADDR(n, g)		   + 0x20000400)
+#define SSO_GRP_INT_THR(n, g)		(GRP_ADDR(n, g)		   + 0x20000500)
+#define SSO_GRP_AQ_CNT(n, g)		(GRP_ADDR(n, g)		   + 0x20000700)
+
+static int get_num_sso_grps(void)
+{
+	if (OCTEON_IS_MODEL(OCTEON_CN78XX))
+		return 256;
+	if (OCTEON_IS_MODEL(OCTEON_CNF75XX) || OCTEON_IS_MODEL(OCTEON_CN73XX))
+		return 64;
+	return 0;
+}
+
+void octeon3_sso_irq_set(int node, int grp, bool en)
+{
+	if (en)
+		oct_csr_write(1, SSO_GRP_INT_THR(node, grp));
+	else
+		oct_csr_write(0, SSO_GRP_INT_THR(node, grp));
+
+	oct_csr_write(BIT(1), SSO_GRP_INT(node, grp));
+}
+EXPORT_SYMBOL(octeon3_sso_irq_set);
+
+/**
+ * octeon3_sso_alloc_grp_range() - Allocate a range of sso groups.
+ * @node: Node where sso resides.
+ * @req_grp: Group number to start allocating sequentially from. -1 for don't
+ *	     care.
+ * @req_cnt: Number of groups to allocate.
+ * @use_last_avail: Set to request the last available groups.
+ * @grp: Updated with allocated groups.
+ *
+ * Return: 0 if successful.
+ *	   < 0 for error codes.
+ */
+int octeon3_sso_alloc_grp_range(int node, int req_grp, int req_cnt,
+				bool use_last_avail, int *grp)
+{
+	struct global_resource_tag tag;
+	char buf[16];
+
+	/* Allocate the request group range */
+	strncpy((char *)&tag.lo, "cvm_sso_", 8);
+	snprintf(buf, 16, "0%d......", node);
+	memcpy(&tag.hi, buf, 8);
+
+	res_mgr_create_resource(tag, get_num_sso_grps());
+	return res_mgr_alloc_range(tag, req_grp, req_cnt, false, grp);
+}
+EXPORT_SYMBOL(octeon3_sso_alloc_grp_range);
+
+/**
+ * octeon3_sso_alloc_grp() - Allocate a sso group.
+ * @node: Node where sso resides.
+ * @req_grp: Group number to allocate, -1 for don't care.
+ *
+ * Return: allocated group.
+ *	   < 0 for error codes.
+ */
+int octeon3_sso_alloc_grp(int node, int req_grp)
+{
+	int grp;
+	int rc;
+
+	rc = octeon3_sso_alloc_grp_range(node, req_grp, 1, false, &grp);
+	if (!rc)
+		rc = grp;
+
+	return rc;
+}
+EXPORT_SYMBOL(octeon3_sso_alloc_grp);
+
+/**
+ * octeon3_sso_free_grp_range() - Free a range of sso groups.
+ * @node: Node where sso resides.
+ * @grp: Array of groups to free.
+ * @req_cnt: Number of groups to free.
+ */
+void octeon3_sso_free_grp_range(int node, int *grp, int req_cnt)
+{
+	struct global_resource_tag tag;
+	char buf[16];
+
+	/* Allocate the request group range */
+	strncpy((char *)&tag.lo, "cvm_sso_", 8);
+	snprintf(buf, 16, "0%d......", node);
+	memcpy(&tag.hi, buf, 8);
+
+	res_mgr_free_range(tag, grp, req_cnt);
+}
+EXPORT_SYMBOL(octeon3_sso_free_grp_range);
+
+/**
+ * octeon3_sso_free_grp() - Free a sso group.
+ * @node: Node where sso resides.
+ * @grp: Group to free.
+ */
+void octeon3_sso_free_grp(int node, int grp)
+{
+	octeon3_sso_free_grp_range(node, &grp, 1);
+}
+EXPORT_SYMBOL(octeon3_sso_free_grp);
+
+/**
+ * octeon3_sso_pass1_limit() - Near full TAQ can cause hang. When the TAQ
+ *			     (Transitory Admission Queue) is near-full, it is
+ *			     possible for SSO to hang.
+ *			     Workaround: Ensure that the sum of
+ *			     SSO_GRP(0..255)_TAQ_THR[MAX_THR] of all used
+ *			     groups is <= 1264. This may reduce single-group
+ *			     performance when many groups are used.
+ *
+ * @node: Node to update.
+ * @grp: SSO group to update.
+ */
+void octeon3_sso_pass1_limit(int node, int grp)
+{
+	u64 rsvd_thr;
+	u64 max_thr;
+	u64 taq_add;
+	u64 taq_thr;
+
+	/* Ideally, we would like to divide the maximum number of TAQ buffers
+	 * (1264) among the sso groups in use. However, since we don't know how
+	 * many sso groups are used by code outside this driver we take the
+	 * worst case approach and assume all 256 sso groups must be supported.
+	 */
+	max_thr = 1264 / get_num_sso_grps();
+	if (max_thr < 4)
+		max_thr = 4;
+	rsvd_thr = max_thr - 1;
+
+	/* Changes to SSO_GRP_TAQ_THR[rsvd_thr] must also update
+	 * SSO_TAQ_ADD[RSVD_FREE].
+	 */
+	taq_thr = oct_csr_read(SSO_GRP_TAQ_THR(node, grp));
+	taq_add = (rsvd_thr - (taq_thr & GENMASK_ULL(10, 0))) << 16;
+
+	taq_thr &= ~(GENMASK_ULL(42, 32) | GENMASK_ULL(10, 0));
+	taq_thr |= max_thr << 32;
+	taq_thr |= rsvd_thr;
+
+	oct_csr_write(taq_thr, SSO_GRP_TAQ_THR(node, grp));
+	oct_csr_write(taq_add, SSO_TAQ_ADD(node));
+}
+EXPORT_SYMBOL(octeon3_sso_pass1_limit);
+
+/**
+ * octeon3_sso_shutdown() - Shutdown the sso. It undoes what octeon3_sso_init()
+ *			    did.
+ * @node: Node where sso to disable is.
+ * @aura: Aura used for the sso buffers.
+ */
+void octeon3_sso_shutdown(int node, int aura)
+{
+	int max_grps;
+	int timeout;
+	u64 data;
+	int i;
+
+	/* Disable sso */
+	data = oct_csr_read(SSO_AW_CFG(node));
+	data |= BIT(6) | BIT(4);
+	data &= ~BIT(0);
+	oct_csr_write(data, SSO_AW_CFG(node));
+
+	/* Extract the fpa buffers */
+	max_grps = get_num_sso_grps();
+	for (i = 0; i < max_grps; i++) {
+		void *ptr;
+		u64 head;
+		u64 tail;
+
+		head = oct_csr_read(SSO_XAQ_HEAD_PTR(node, i));
+		tail = oct_csr_read(SSO_XAQ_TAIL_PTR(node, i));
+		data = oct_csr_read(SSO_GRP_AQ_CNT(node, i));
+
+		/* Verify pointers */
+		head &= GENMASK_ULL(41, 7);
+		tail &= GENMASK_ULL(41, 7);
+		if (head != tail) {
+			pr_err("octeon3_sso: bad ptr\n");
+			continue;
+		}
+
+		/* This sso group should have no pending entries */
+		if (data & GENMASK_ULL(32, 0))
+			pr_err("octeon3_sso: not empty\n");
+
+		ptr = phys_to_virt(head);
+		octeon_fpa3_free(node, aura, ptr);
+
+		/* Clear pointers */
+		oct_csr_write(0, SSO_XAQ_HEAD_PTR(node, i));
+		oct_csr_write(0, SSO_XAQ_HEAD_NEXT(node, i));
+		oct_csr_write(0, SSO_XAQ_TAIL_PTR(node, i));
+		oct_csr_write(0, SSO_XAQ_TAIL_NEXT(node, i));
+	}
+
+	/* Make sure all buffers drained */
+	timeout = 10000;
+	do {
+		data = oct_csr_read(SSO_AW_STATUS(node));
+		if ((data & GENMASK_ULL(5, 0)) == 0)
+			break;
+		timeout--;
+		udelay(1);
+	} while (timeout);
+	if (!timeout)
+		pr_err("octeon3_sso: timeout\n");
+}
+EXPORT_SYMBOL(octeon3_sso_shutdown);
+
+/**
+ * octeon3_sso_init() - Initialize the sso.
+ * @node: Node where sso resides.
+ * @aura: Aura used for the sso buffers.
+ */
+int octeon3_sso_init(int node, int aura)
+{
+	int max_grps;
+	int rc = 0;
+	u64 data;
+	int i;
+
+	data = BIT(3) | BIT(2) | BIT(1);
+	oct_csr_write(data, SSO_AW_CFG(node));
+
+	data = (node << 10) | aura;
+	oct_csr_write(data, SSO_XAQ_AURA(node));
+
+	max_grps = get_num_sso_grps();
+	for (i = 0; i < max_grps; i++) {
+		void *mem;
+		u64 phys;
+
+		mem = octeon_fpa3_alloc(node, aura);
+		if (!mem) {
+			rc = -ENOMEM;
+			goto err;
+		}
+
+		phys = virt_to_phys(mem);
+		oct_csr_write(phys, SSO_XAQ_HEAD_PTR(node, i));
+		oct_csr_write(phys, SSO_XAQ_HEAD_NEXT(node, i));
+		oct_csr_write(phys, SSO_XAQ_TAIL_PTR(node, i));
+		oct_csr_write(phys, SSO_XAQ_TAIL_NEXT(node, i));
+
+		/* SSO-18678 */
+		data = 0x3f << 16;
+		oct_csr_write(data, SSO_GRP_PRI(node, i));
+	}
+
+	data = BIT(0);
+	oct_csr_write(data, SSO_ERR0(node));
+
+	data = BIT(3) | BIT(2) | BIT(1) | BIT(0);
+	oct_csr_write(data, SSO_AW_CFG(node));
+
+ err:
+	return rc;
+}
+EXPORT_SYMBOL(octeon3_sso_init);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Cavium, Inc. <support@cavium.com>");
+MODULE_DESCRIPTION("Cavium, Inc. SSO management.");
diff --git a/drivers/net/ethernet/cavium/octeon/octeon3.h b/drivers/net/ethernet/cavium/octeon/octeon3.h
new file mode 100644
index 0000000..9823a97
--- /dev/null
+++ b/drivers/net/ethernet/cavium/octeon/octeon3.h
@@ -0,0 +1,430 @@
+// SPDX-License-Identifier: GPL-2.0+
+/* Copyright (c) 2017 Cavium, Inc. */
+#ifndef _OCTEON3_H_
+#define _OCTEON3_H_
+
+#include <linux/platform_device.h>
+#include <linux/netdevice.h>
+
+#define MAX_NODES			2
+#define NODE_MASK			(MAX_NODES - 1)
+#define MAX_BGX_PER_NODE		6
+#define MAX_LMAC_PER_BGX		4
+
+#define IOBDMA_ORDERED_IO_ADDR		0xffffffffffffa200ull
+#define LMTDMA_ORDERED_IO_ADDR		0xffffffffffffa400ull
+
+#define SCRATCH_BASE			0xffffffffffff8000ull
+#define PKO_LMTLINE			2ull
+#define LMTDMA_SCR_OFFSET		(PKO_LMTLINE * CVMX_CACHE_LINE_SIZE)
+
+/* Pko sub-command three bit codes (SUBDC3) */
+#define PKO_SENDSUBDC_GATHER		0x1
+
+/* Pko sub-command four bit codes (SUBDC4) */
+#define PKO_SENDSUBDC_TSO		0x8
+#define PKO_SENDSUBDC_FREE		0x9
+#define PKO_SENDSUBDC_WORK		0xa
+#define PKO_SENDSUBDC_MEM		0xc
+#define PKO_SENDSUBDC_EXT		0xd
+
+#define BGX_RX_FIFO_SIZE		(64 * 1024)
+#define BGX_TX_FIFO_SIZE		(32 * 1024)
+
+/* Registers are accessed via xkphys */
+#define SET_XKPHYS			BIT_ULL(63)
+#define NODE_OFFSET(node)		((node) * 0x1000000000ull)
+
+/* Bgx register definitions */
+#define BGX_BASE			0x11800e0000000ull
+#define BGX_OFFSET(bgx)			(BGX_BASE + ((bgx) << 24))
+#define INDEX_OFFSET(index)		((index) << 20)
+#define INDEX_ADDR(n, b, i)		(SET_XKPHYS + NODE_OFFSET(n) +	       \
+					 BGX_OFFSET(b) + INDEX_OFFSET(i))
+#define CAM_OFFSET(mac)			((mac) << 3)
+#define CAM_ADDR(n, b, m)		(INDEX_ADDR(n, b, 0) + CAM_OFFSET(m))
+
+#define BGX_CMR_CONFIG(n, b, i)		(INDEX_ADDR(n, b, i)	      + 0x00000)
+#define BGX_CMR_GLOBAL_CONFIG(n, b)	(INDEX_ADDR(n, b, 0)	      + 0x00008)
+#define BGX_CMR_RX_ID_MAP(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x00028)
+#define BGX_CMR_RX_BP_ON(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x00088)
+#define BGX_CMR_RX_ADR_CTL(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x000a0)
+#define BGX_CMR_RX_FIFO_LEN(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x000c0)
+#define BGX_CMR_RX_ADRX_CAM(n, b, m)	(CAM_ADDR(n, b, m)	      + 0x00100)
+#define BGX_CMR_CHAN_MSK_AND(n, b)	(INDEX_ADDR(n, b, 0)	      + 0x00200)
+#define BGX_CMR_CHAN_MSK_OR(n, b)	(INDEX_ADDR(n, b, 0)	      + 0x00208)
+#define BGX_CMR_TX_FIFO_LEN(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x00418)
+#define BGX_CMR_TX_LMACS(n, b)		(INDEX_ADDR(n, b, 0)	      + 0x01000)
+
+#define BGX_SPU_CONTROL1(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x10000)
+#define BGX_SPU_STATUS1(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x10008)
+#define BGX_SPU_STATUS2(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x10020)
+#define BGX_SPU_BX_STATUS(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x10028)
+#define BGX_SPU_BR_STATUS1(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x10030)
+#define BGX_SPU_BR_STATUS2(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x10038)
+#define BGX_SPU_BR_BIP_ERR_CNT(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x10058)
+#define BGX_SPU_BR_PMD_CONTROL(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x10068)
+#define BGX_SPU_BR_PMD_LP_CUP(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x10078)
+#define BGX_SPU_BR_PMD_LD_CUP(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x10088)
+#define BGX_SPU_BR_PMD_LD_REP(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x10090)
+#define BGX_SPU_FEC_CONTROL(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x100a0)
+#define BGX_SPU_AN_CONTROL(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x100c8)
+#define BGX_SPU_AN_STATUS(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x100d0)
+#define BGX_SPU_AN_ADV(n, b, i)		(INDEX_ADDR(n, b, i)	      + 0x100d8)
+#define BGX_SPU_MISC_CONTROL(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x10218)
+#define BGX_SPU_INT(n, b, i)		(INDEX_ADDR(n, b, i)	      + 0x10220)
+#define BGX_SPU_DBG_CONTROL(n, b)	(INDEX_ADDR(n, b, 0)	      + 0x10300)
+
+#define BGX_SMU_RX_INT(n, b, i)		(INDEX_ADDR(n, b, i)	      + 0x20000)
+#define BGX_SMU_RX_FRM_CTL(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x20008)
+#define BGX_SMU_RX_JABBER(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x20018)
+#define BGX_SMU_RX_CTL(n, b, i)		(INDEX_ADDR(n, b, i)	      + 0x20030)
+#define BGX_SMU_TX_APPEND(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x20100)
+#define BGX_SMU_TX_MIN_PKT(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x20118)
+#define BGX_SMU_TX_INT(n, b, i)		(INDEX_ADDR(n, b, i)	      + 0x20140)
+#define BGX_SMU_TX_CTL(n, b, i)		(INDEX_ADDR(n, b, i)	      + 0x20160)
+#define BGX_SMU_TX_THRESH(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x20168)
+#define BGX_SMU_CTRL(n, b, i)		(INDEX_ADDR(n, b, i)	      + 0x20200)
+
+#define BGX_GMP_PCS_MR_CONTROL(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x30000)
+#define BGX_GMP_PCS_MR_STATUS(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x30008)
+#define BGX_GMP_PCS_AN_ADV(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x30010)
+#define BGX_GMP_PCS_LINK_TIMER(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x30040)
+#define BGX_GMP_PCS_SGM_AN_ADV(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x30068)
+#define BGX_GMP_PCS_MISC_CTL(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x30078)
+#define BGX_GMP_GMI_PRT_CFG(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x38010)
+#define BGX_GMP_GMI_RX_FRM_CTL(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x38018)
+#define BGX_GMP_GMI_RX_JABBER(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x38038)
+#define BGX_GMP_GMI_TX_THRESH(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x38210)
+#define BGX_GMP_GMI_TX_APPEND(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x38218)
+#define BGX_GMP_GMI_TX_SLOT(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x38220)
+#define BGX_GMP_GMI_TX_BURST(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x38228)
+#define BGX_GMP_GMI_TX_MIN_PKT(n, b, i)	(INDEX_ADDR(n, b, i)	      + 0x38240)
+#define BGX_GMP_GMI_TX_SGMII_CTL(n, b, i) (INDEX_ADDR(n, b, i)	      + 0x38300)
+
+/* XCV register definitions */
+#define XCV_BASE			0x11800db000000ull
+#define SET_XCV_BASE(node)		(SET_XKPHYS + NODE_OFFSET(node) +      \
+					 XCV_BASE)
+#define XCV_RESET(node)			(SET_XCV_BASE(node)	       + 0x0000)
+#define XCV_DLL_CTL(node)		(SET_XCV_BASE(node)	       + 0x0010)
+#define XCV_COMP_CTL(node)		(SET_XCV_BASE(node)	       + 0x0020)
+#define XCV_CTL(node)			(SET_XCV_BASE(node)	       + 0x0030)
+#define XCV_INT(node)			(SET_XCV_BASE(node)	       + 0x0040)
+#define XCV_INBND_STATUS(node)		(SET_XCV_BASE(node)	       + 0x0080)
+#define XCV_BATCH_CRD_RET(node)		(SET_XCV_BASE(node)	       + 0x0100)
+
+/* Gser register definitions */
+#define GSER_BASE			0x1180090000000ull
+#define GSER_OFFSET(gser)		(GSER_BASE + ((gser) << 24))
+#define GSER_LANE_OFFSET(lane)		((lane) << 20)
+#define GSER_LANE_ADDR(n, g, l)		(SET_XKPHYS + NODE_OFFSET(n) +	       \
+					 GSER_OFFSET(g) + GSER_LANE_OFFSET(l))
+#define GSER_PHY_CTL(n, g)		(GSER_LANE_ADDR(n, g, 0)     + 0x000000)
+#define GSER_CFG(n, g)			(GSER_LANE_ADDR(n, g, 0)     + 0x000080)
+#define GSER_LANE_MODE(n, g)		(GSER_LANE_ADDR(n, g, 0)     + 0x000118)
+#define GSER_RX_EIE_DETSTS(n, g)	(GSER_LANE_ADDR(n, g, 0)     + 0x000150)
+#define GSER_LANE_LBERT_CFG(n, g, l)	(GSER_LANE_ADDR(n, g, l)     + 0x4c0020)
+#define GSER_LANE_PCS_CTLIFC_0(n, g, l)	(GSER_LANE_ADDR(n, g, l)     + 0x4c0060)
+#define GSER_LANE_PCS_CTLIFC_2(n, g, l)	(GSER_LANE_ADDR(n, g, l)     + 0x4c0070)
+
+/* Odd gser registers */
+#define GSER_LANE_OFFSET_1(lane)	((lane) << 7)
+#define GSER_LANE_ADDR_1(n, g, l)	(SET_XKPHYS + NODE_OFFSET(n) +	       \
+					 GSER_OFFSET(g) + GSER_LANE_OFFSET_1(l))
+
+#define GSER_BR_RX_CTL(n, g, l)		(GSER_LANE_ADDR_1(n, g, l)   + 0x000400)
+#define GSER_BR_RX_EER(n, g, l)		(GSER_LANE_ADDR_1(n, g, l)   + 0x000418)
+
+#define GSER_LANE_OFFSET_2(mode)	((mode) << 5)
+#define GSER_LANE_ADDR_2(n, g, m)	(SET_XKPHYS + NODE_OFFSET(n) +	       \
+					 GSER_OFFSET(g) + GSER_LANE_OFFSET_2(m))
+
+#define GSER_LANE_P_MODE_1(n, g, m)	(GSER_LANE_ADDR_2(n, g, m)   + 0x4e0048)
+
+#define DPI_BASE			0x1df0000000000ull
+#define DPI_ADDR(n)			(SET_XKPHYS + NODE_OFFSET(n) + DPI_BASE)
+#define DPI_CTL(n)			(DPI_ADDR(n)                  + 0x00040)
+
+enum octeon3_mac_type {
+	BGX_MAC,
+	SRIO_MAC
+};
+
+enum octeon3_src_type {
+	QLM,
+	XCV
+};
+
+struct mac_platform_data {
+	enum octeon3_mac_type	mac_type;
+	int			numa_node;
+	int			interface;
+	int			port;
+	enum octeon3_src_type	src_type;
+};
+
+struct bgx_port_netdev_priv {
+	struct bgx_port_priv *bgx_priv;
+};
+
+/* Remove this define to use these enums after the last cvmx code
+ * references are gone.
+ */
+/* PKO_MEMDSZ_E */
+enum pko_memdsz_e {
+	MEMDSZ_B64 = 0,
+	MEMDSZ_B32 = 1,
+	MEMDSZ_B16 = 2,
+	MEMDSZ_B8 = 3
+};
+
+/* PKO_MEMALG_E */
+enum pko_memalg_e {
+	MEMALG_SET = 0,
+	MEMALG_SETTSTMP = 1,
+	MEMALG_SETRSLT = 2,
+	MEMALG_ADD = 8,
+	MEMALG_SUB = 9,
+	MEMALG_ADDLEN = 0xA,
+	MEMALG_SUBLEN = 0xB,
+	MEMALG_ADDMBUF = 0xC,
+	MEMALG_SUBMBUF = 0xD
+};
+
+/* PKO_QUERY_RTN_S[DQSTATUS] */
+enum pko_query_dqstatus {
+	PKO_DQSTATUS_PASS = 0,
+	PKO_DQSTATUS_BADSTATE = 0x8,
+	PKO_DQSTATUS_NOFPABUF = 0x9,
+	PKO_DQSTATUS_NOPKOBUF = 0xA,
+	PKO_DQSTATUS_FAILRTNPTR = 0xB,
+	PKO_DQSTATUS_ALREADY = 0xC,
+	PKO_DQSTATUS_NOTCREATED = 0xD,
+	PKO_DQSTATUS_NOTEMPTY = 0xE,
+	PKO_DQSTATUS_SENDPKTDROP = 0xF
+};
+
+union wqe_word0 {
+	u64 u64;
+	struct {
+		__BITFIELD_FIELD(u64 rsvd_0:4,
+		__BITFIELD_FIELD(u64 aura:12,
+		__BITFIELD_FIELD(u64 rsvd_1:1,
+		__BITFIELD_FIELD(u64 apad:3,
+		__BITFIELD_FIELD(u64 channel:12,
+		__BITFIELD_FIELD(u64 bufs:8,
+		__BITFIELD_FIELD(u64 style:8,
+		__BITFIELD_FIELD(u64 rsvd_2:10,
+		__BITFIELD_FIELD(u64 pknd:6,
+		;)))))))))
+	};
+};
+
+union wqe_word1 {
+	u64 u64;
+	struct {
+		__BITFIELD_FIELD(u64 len:16,
+		__BITFIELD_FIELD(u64 rsvd_0:2,
+		__BITFIELD_FIELD(u64 rsvd_1:2,
+		__BITFIELD_FIELD(u64 grp:10,
+		__BITFIELD_FIELD(u64 tag_type:2,
+		__BITFIELD_FIELD(u64 tag:32,
+		;))))))
+	};
+};
+
+union wqe_word2 {
+	u64 u64;
+	struct {
+		__BITFIELD_FIELD(u64 software:1,
+		__BITFIELD_FIELD(u64 lg_hdr_type:5,
+		__BITFIELD_FIELD(u64 lf_hdr_type:5,
+		__BITFIELD_FIELD(u64 le_hdr_type:5,
+		__BITFIELD_FIELD(u64 ld_hdr_type:5,
+		__BITFIELD_FIELD(u64 lc_hdr_type:5,
+		__BITFIELD_FIELD(u64 lb_hdr_type:5,
+		__BITFIELD_FIELD(u64 is_la_ether:1,
+		__BITFIELD_FIELD(u64 rsvd_0:8,
+		__BITFIELD_FIELD(u64 vlan_valid:1,
+		__BITFIELD_FIELD(u64 vlan_stacked:1,
+		__BITFIELD_FIELD(u64 stat_inc:1,
+		__BITFIELD_FIELD(u64 pcam_flag4:1,
+		__BITFIELD_FIELD(u64 pcam_flag3:1,
+		__BITFIELD_FIELD(u64 pcam_flag2:1,
+		__BITFIELD_FIELD(u64 pcam_flag1:1,
+		__BITFIELD_FIELD(u64 is_frag:1,
+		__BITFIELD_FIELD(u64 is_l3_bcast:1,
+		__BITFIELD_FIELD(u64 is_l3_mcast:1,
+		__BITFIELD_FIELD(u64 is_l2_bcast:1,
+		__BITFIELD_FIELD(u64 is_l2_mcast:1,
+		__BITFIELD_FIELD(u64 is_raw:1,
+		__BITFIELD_FIELD(u64 err_level:3,
+		__BITFIELD_FIELD(u64 err_code:8,
+		;))))))))))))))))))))))))
+	};
+};
+
+union buf_ptr {
+	u64 u64;
+	struct {
+		__BITFIELD_FIELD(u64 size:16,
+		__BITFIELD_FIELD(u64 packet_outside_wqe:1,
+		__BITFIELD_FIELD(u64 rsvd0:5,
+		__BITFIELD_FIELD(u64 addr:42,
+		;))))
+	};
+};
+
+union wqe_word4 {
+	u64 u64;
+	struct {
+		__BITFIELD_FIELD(u64 ptr_vlan:8,
+		__BITFIELD_FIELD(u64 ptr_layer_g:8,
+		__BITFIELD_FIELD(u64 ptr_layer_f:8,
+		__BITFIELD_FIELD(u64 ptr_layer_e:8,
+		__BITFIELD_FIELD(u64 ptr_layer_d:8,
+		__BITFIELD_FIELD(u64 ptr_layer_c:8,
+		__BITFIELD_FIELD(u64 ptr_layer_b:8,
+		__BITFIELD_FIELD(u64 ptr_layer_a:8,
+		;))))))))
+	};
+};
+
+struct wqe {
+	union wqe_word0	word0;
+	union wqe_word1	word1;
+	union wqe_word2	word2;
+	union buf_ptr	packet_ptr;
+	union wqe_word4	word4;
+	u64		wqe_data[11];
+};
+
+enum port_mode {
+	PORT_MODE_DISABLED,
+	PORT_MODE_SGMII,
+	PORT_MODE_RGMII,
+	PORT_MODE_XAUI,
+	PORT_MODE_RXAUI,
+	PORT_MODE_XLAUI,
+	PORT_MODE_XFI,
+	PORT_MODE_10G_KR,
+	PORT_MODE_40G_KR4
+};
+
+enum lane_mode {
+	R_25G_REFCLK100,
+	R_5G_REFCLK100,
+	R_8G_REFCLK100,
+	R_125G_REFCLK15625_KX,
+	R_3125G_REFCLK15625_XAUI,
+	R_103125G_REFCLK15625_KR,
+	R_125G_REFCLK15625_SGMII,
+	R_5G_REFCLK15625_QSGMII,
+	R_625G_REFCLK15625_RXAUI,
+	R_25G_REFCLK125,
+	R_5G_REFCLK125,
+	R_8G_REFCLK125
+};
+
+struct port_status {
+	int	link;
+	int	duplex;
+	int	speed;
+};
+
+static inline u64 oct_csr_read(u64 addr)
+{
+	return __raw_readq((void __iomem *)addr);
+}
+
+static inline void oct_csr_write(u64 data, u64 addr)
+{
+	__raw_writeq(data, (void __iomem *)addr);
+}
+
+static inline int bgx_addr_to_interface(u64 addr)
+{
+	return (addr >> 24) & 0xf;
+}
+
+static inline int bgx_node_to_numa_node(struct device_node *np)
+{
+	int ret = of_node_to_nid(np);
+
+	return ret == NUMA_NO_NODE ? 0 : ret;
+}
+
+extern int ilk0_lanes;
+extern int ilk1_lanes;
+
+void bgx_nexus_load(void);
+
+int bgx_port_allocate_pknd(int node);
+int bgx_port_get_pknd(int node, int bgx, int index);
+enum port_mode bgx_port_get_mode(int node, int bgx, int index);
+int bgx_port_get_qlm(int node, int bgx, int index);
+void bgx_port_set_netdev(struct device *dev, struct net_device *netdev);
+int bgx_port_enable(struct net_device *netdev);
+int bgx_port_disable(struct net_device *netdev);
+const u8 *bgx_port_get_mac(struct net_device *netdev);
+void bgx_port_set_rx_filtering(struct net_device *netdev);
+int bgx_port_change_mtu(struct net_device *netdev, int new_mtu);
+int bgx_port_ethtool_get_link_ksettings(struct net_device *netdev,
+					struct ethtool_link_ksettings *cmd);
+int bgx_port_ethtool_get_settings(struct net_device *netdev,
+				  struct ethtool_cmd *cmd);
+int bgx_port_ethtool_set_settings(struct net_device *netdev,
+				  struct ethtool_cmd *cmd);
+int bgx_port_ethtool_nway_reset(struct net_device *netdev);
+int bgx_port_do_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd);
+
+void bgx_port_mix_assert_reset(struct net_device *netdev, int mix, bool v);
+
+int octeon3_pki_vlan_init(int node);
+int octeon3_pki_cluster_init(int node, struct platform_device *pdev);
+int octeon3_pki_ltype_init(int node);
+int octeon3_pki_enable(int node);
+int octeon3_pki_port_init(int node, int aura, int grp, int skip, int mb_size,
+			  int pknd, int num_rx_cxt);
+int octeon3_pki_get_stats(int node, int pknd, u64 *packets, u64 *octets,
+			  u64 *dropped);
+int octeon3_pki_set_ptp_skip(int node, int pknd, int skip);
+int octeon3_pki_port_shutdown(int node, int pknd);
+void octeon3_pki_shutdown(int node);
+
+void octeon3_sso_pass1_limit(int node, int grp);
+int octeon3_sso_init(int node, int aura);
+void octeon3_sso_shutdown(int node, int aura);
+int octeon3_sso_alloc_grp(int node, int grp);
+int octeon3_sso_alloc_grp_range(int node, int req_grp, int req_cnt,
+				bool use_last_avail, int *grp);
+void octeon3_sso_free_grp(int node, int grp);
+void octeon3_sso_free_grp_range(int node, int *grp, int req_cnt);
+void octeon3_sso_irq_set(int node, int grp, bool en);
+
+int octeon3_pko_interface_init(int node, int interface, int index,
+			       enum octeon3_mac_type mac_type, int ipd_port);
+int octeon3_pko_activate_dq(int node, int dq, int cnt);
+int octeon3_pko_get_fifo_size(int node, int interface, int index,
+			      enum octeon3_mac_type mac_type);
+int octeon3_pko_set_mac_options(int node, int interface, int index,
+				enum octeon3_mac_type mac_type, bool fcs_en,
+				bool pad_en, int fcs_sop_off);
+int octeon3_pko_init_global(int node, int aura);
+int octeon3_pko_interface_uninit(int node, const int *dq, int num_dq);
+int octeon3_pko_exit_global(int node);
+
+int octeon_fpa3_init(int node);
+int octeon_fpa3_pool_init(int node, int pool_num, int *pool, void **pool_stack,
+			  int num_ptrs);
+int octeon_fpa3_aura_init(int node, int pool, int aura_num, int *aura,
+			  int num_bufs, unsigned int limit);
+int octeon_fpa3_mem_fill(int node, struct kmem_cache *cache, int aura,
+			 int num_bufs);
+void octeon_fpa3_free(int node, int aura, const void *buf);
+void *octeon_fpa3_alloc(int node, int aura);
+void octeon_fpa3_release_pool(int node, int pool);
+void octeon_fpa3_release_aura(int node, int aura);
+
+#endif /* _OCTEON3_H_ */
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next v10 4/4] MAINTAINERS: Add entry for drivers/net/ethernet/cavium/octeon/octeon3-*
From: Steven J. Hill @ 2018-05-07 17:22 UTC (permalink / raw)
  To: netdev; +Cc: David Daney
In-Reply-To: <1525713731-27092-1-git-send-email-steven.hill@cavium.com>

From: David Daney <david.daney@cavium.com>

Signed-off-by: David Daney <david.daney@cavium.com>
---
 MAINTAINERS | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index ebe0b9e..f066bda 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3252,6 +3252,12 @@ W:	http://www.cavium.com
 S:	Supported
 F:	drivers/mmc/host/cavium*
 
+CAVIUM OCTEON-III NETWORK DRIVER
+M:	Steven J. Hill <Steven.Hill@cavium.com>
+L:	netdev@vger.kernel.org
+S:	Supported
+F:	drivers/net/ethernet/cavium/octeon/octeon3-*
+
 CAVIUM OCTEON-TX CRYPTO DRIVER
 M:	George Cherian <george.cherian@cavium.com>
 L:	linux-crypto@vger.kernel.org
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next v10 2/4] dt-bindings: Add Cavium Octeon Common Ethernet Interface.
From: Steven J. Hill @ 2018-05-07 17:22 UTC (permalink / raw)
  To: netdev; +Cc: Carlos Munoz, Steven J. Hill, David Daney
In-Reply-To: <1525713731-27092-1-git-send-email-steven.hill@cavium.com>

From: Carlos Munoz <cmunoz@cavium.com>

Add bindings for Common Ethernet Interface (BGX) block.

Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Carlos Munoz <cmunoz@cavium.com>
Signed-off-by: Steven J. Hill <Steven.Hill@cavium.com>
Signed-off-by: David Daney <david.daney@cavium.com>
---
 .../devicetree/bindings/net/cavium-bgx.txt         | 61 ++++++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/cavium-bgx.txt

diff --git a/Documentation/devicetree/bindings/net/cavium-bgx.txt b/Documentation/devicetree/bindings/net/cavium-bgx.txt
new file mode 100644
index 0000000..830c5f0
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/cavium-bgx.txt
@@ -0,0 +1,61 @@
+* Common Ethernet Interface (BGX) block
+
+Properties:
+
+- compatible: "cavium,octeon-7890-bgx": Compatibility with all cn7xxx SOCs.
+
+- reg: The base address of the BGX block.
+
+- #address-cells: Must be <1>.
+
+- #size-cells: Must be <0>.  BGX addresses have no size component.
+
+A BGX block has several children, each representing an Ethernet
+interface.
+
+
+* Ethernet Interface (BGX port) connects to PKI/PKO
+
+Properties:
+
+- compatible: "cavium,octeon-7890-bgx-port": Compatibility with all
+	      cn7xxx SOCs.
+
+	      "cavium,octeon-7360-xcv": Compatibility with cn73xx SOCs
+	      for RGMII.
+
+- reg: The index of the interface within the BGX block.
+
+Optional properties:
+
+- local-mac-address: Mac address for the interface.
+
+- phy-handle: phandle to the phy node connected to the interface.
+
+- phy-mode: described in ethernet.txt.
+
+- fixed-link: described in fixed-link.txt.
+
+Example:
+
+	ethernet-mac-nexus@11800e0000000 {
+		compatible = "cavium,octeon-7890-bgx";
+		reg = <0x00011800 0xe0000000 0x00000000 0x01000000>;
+		#address-cells = <1>;
+		#size-cells = <0>;
+
+		ethernet@0 {
+			compatible = "cavium,octeon-7360-xcv";
+			reg = <0>;
+			local-mac-address = [ 00 01 23 45 67 89 ];
+			phy-handle = <&phy3>;
+			phy-mode = "rgmii-rxid"
+		};
+		ethernet@1 {
+			compatible = "cavium,octeon-7890-bgx-port";
+			reg = <1>;
+			local-mac-address = [ 00 01 23 45 67 8a ];
+			phy-handle = <&phy4>;
+			phy-mode = "sgmii"
+		};
+	};
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next v10 1/4] staging: octeon: Remove USE_ASYNC_IOBDMA macro.
From: Steven J. Hill @ 2018-05-07 17:22 UTC (permalink / raw)
  To: netdev; +Cc: David Daney
In-Reply-To: <1525713731-27092-1-git-send-email-steven.hill@cavium.com>

From: David Daney <david.daney@cavium.com>

Previous patch sets USE_ASYNC_IOBDMA to 1 unconditionally. Remove
USE_ASYNC_IOBDMA from all if statements.  Remove dead code caused
by the change.

Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: David Daney <david.daney@cavium.com>
---
 drivers/staging/octeon/ethernet-defines.h |  6 ---
 drivers/staging/octeon/ethernet-rx.c      | 25 ++++-----
 drivers/staging/octeon/ethernet-tx.c      | 85 ++++++++++---------------------
 3 files changed, 37 insertions(+), 79 deletions(-)

diff --git a/drivers/staging/octeon/ethernet-defines.h b/drivers/staging/octeon/ethernet-defines.h
index 1e11442..15db928 100644
--- a/drivers/staging/octeon/ethernet-defines.h
+++ b/drivers/staging/octeon/ethernet-defines.h
@@ -7,10 +7,6 @@
 
 /*
  * A few defines are used to control the operation of this driver:
- *  USE_ASYNC_IOBDMA
- *      Use asynchronous IO access to hardware. This uses Octeon's asynchronous
- *      IOBDMAs to issue IO accesses without stalling. Set this to zero
- *      to disable this. Note that IOBDMAs require CVMSEG.
  *  REUSE_SKBUFFS_WITHOUT_FREE
  *      Allows the TX path to free an skbuff into the FPA hardware pool. This
  *      can significantly improve performance for forwarding and bridging, but
@@ -29,8 +25,6 @@
 #define REUSE_SKBUFFS_WITHOUT_FREE  1
 #endif
 
-#define USE_ASYNC_IOBDMA            (CONFIG_CAVIUM_OCTEON_CVMSEG_SIZE > 0)
-
 /* Maximum number of SKBs to try to free per xmit packet. */
 #define MAX_OUT_QUEUE_DEPTH 1000
 
diff --git a/drivers/staging/octeon/ethernet-rx.c b/drivers/staging/octeon/ethernet-rx.c
index 5e27124..c1ae60c 100644
--- a/drivers/staging/octeon/ethernet-rx.c
+++ b/drivers/staging/octeon/ethernet-rx.c
@@ -198,11 +198,9 @@ static int cvm_oct_poll(struct oct_rx_group *rx_group, int budget)
 	/* Prefetch cvm_oct_device since we know we need it soon */
 	prefetch(cvm_oct_device);
 
-	if (USE_ASYNC_IOBDMA) {
-		/* Save scratch in case userspace is using it */
-		CVMX_SYNCIOBDMA;
-		old_scratch = cvmx_scratch_read64(CVMX_SCR_SCRATCH);
-	}
+	/* Save scratch in case userspace is using it */
+	CVMX_SYNCIOBDMA;
+	old_scratch = cvmx_scratch_read64(CVMX_SCR_SCRATCH);
 
 	/* Only allow work for our group (and preserve priorities) */
 	if (OCTEON_IS_MODEL(OCTEON_CN68XX)) {
@@ -217,10 +215,8 @@ static int cvm_oct_poll(struct oct_rx_group *rx_group, int budget)
 			       BIT(rx_group->group));
 	}
 
-	if (USE_ASYNC_IOBDMA) {
-		cvmx_pow_work_request_async(CVMX_SCR_SCRATCH, CVMX_POW_NO_WAIT);
-		did_work_request = 1;
-	}
+	cvmx_pow_work_request_async(CVMX_SCR_SCRATCH, CVMX_POW_NO_WAIT);
+	did_work_request = 1;
 
 	while (rx_count < budget) {
 		struct sk_buff *skb = NULL;
@@ -229,7 +225,7 @@ static int cvm_oct_poll(struct oct_rx_group *rx_group, int budget)
 		cvmx_wqe_t *work;
 		int port;
 
-		if (USE_ASYNC_IOBDMA && did_work_request)
+		if (did_work_request)
 			work = cvmx_pow_work_response_async(CVMX_SCR_SCRATCH);
 		else
 			work = cvmx_pow_work_request_sync(CVMX_POW_NO_WAIT);
@@ -257,7 +253,7 @@ static int cvm_oct_poll(struct oct_rx_group *rx_group, int budget)
 			sizeof(void *));
 		prefetch(pskb);
 
-		if (USE_ASYNC_IOBDMA && rx_count < (budget - 1)) {
+		if (rx_count < (budget - 1)) {
 			cvmx_pow_work_request_async_nocheck(CVMX_SCR_SCRATCH,
 							    CVMX_POW_NO_WAIT);
 			did_work_request = 1;
@@ -400,10 +396,9 @@ static int cvm_oct_poll(struct oct_rx_group *rx_group, int budget)
 		cvmx_write_csr(CVMX_POW_PP_GRP_MSKX(coreid), old_group_mask);
 	}
 
-	if (USE_ASYNC_IOBDMA) {
-		/* Restore the scratch area */
-		cvmx_scratch_write64(CVMX_SCR_SCRATCH, old_scratch);
-	}
+	/* Restore the scratch area */
+	cvmx_scratch_write64(CVMX_SCR_SCRATCH, old_scratch);
+
 	cvm_oct_rx_refill_pool(0);
 
 	return rx_count;
diff --git a/drivers/staging/octeon/ethernet-tx.c b/drivers/staging/octeon/ethernet-tx.c
index df3441b..2aa5fcb 100644
--- a/drivers/staging/octeon/ethernet-tx.c
+++ b/drivers/staging/octeon/ethernet-tx.c
@@ -176,23 +176,18 @@ int cvm_oct_xmit(struct sk_buff *skb, struct net_device *dev)
 		qos = 0;
 	}
 
-	if (USE_ASYNC_IOBDMA) {
-		/* Save scratch in case userspace is using it */
-		CVMX_SYNCIOBDMA;
-		old_scratch = cvmx_scratch_read64(CVMX_SCR_SCRATCH);
-		old_scratch2 = cvmx_scratch_read64(CVMX_SCR_SCRATCH + 8);
-
-		/*
-		 * Fetch and increment the number of packets to be
-		 * freed.
-		 */
-		cvmx_fau_async_fetch_and_add32(CVMX_SCR_SCRATCH + 8,
-					       FAU_NUM_PACKET_BUFFERS_TO_FREE,
-					       0);
-		cvmx_fau_async_fetch_and_add32(CVMX_SCR_SCRATCH,
-					       priv->fau + qos * 4,
-					       MAX_SKB_TO_FREE);
-	}
+	/* Save scratch in case userspace is using it */
+	CVMX_SYNCIOBDMA;
+	old_scratch = cvmx_scratch_read64(CVMX_SCR_SCRATCH);
+	old_scratch2 = cvmx_scratch_read64(CVMX_SCR_SCRATCH + 8);
+
+	/* Fetch and increment the number of packets to be freed. */
+	cvmx_fau_async_fetch_and_add32(CVMX_SCR_SCRATCH + 8,
+				       FAU_NUM_PACKET_BUFFERS_TO_FREE,
+				       0);
+	cvmx_fau_async_fetch_and_add32(CVMX_SCR_SCRATCH,
+				       priv->fau + qos * 4,
+				       MAX_SKB_TO_FREE);
 
 	/*
 	 * We have space for 6 segment pointers, If there will be more
@@ -201,22 +196,11 @@ int cvm_oct_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (unlikely(skb_shinfo(skb)->nr_frags > 5)) {
 		if (unlikely(__skb_linearize(skb))) {
 			queue_type = QUEUE_DROP;
-			if (USE_ASYNC_IOBDMA) {
-				/*
-				 * Get the number of skbuffs in use
-				 * by the hardware
-				 */
-				CVMX_SYNCIOBDMA;
-				skb_to_free =
-					cvmx_scratch_read64(CVMX_SCR_SCRATCH);
-			} else {
-				/*
-				 * Get the number of skbuffs in use
-				 * by the hardware
-				 */
-				skb_to_free = cvmx_fau_fetch_and_add32(
-					priv->fau + qos * 4, MAX_SKB_TO_FREE);
-			}
+			/* Get the number of skbuffs in use by the
+			 * hardware
+			 */
+			CVMX_SYNCIOBDMA;
+			skb_to_free = cvmx_scratch_read64(CVMX_SCR_SCRATCH);
 			skb_to_free = cvm_oct_adjust_skb_to_free(skb_to_free,
 								 priv->fau +
 								 qos * 4);
@@ -384,18 +368,10 @@ int cvm_oct_xmit(struct sk_buff *skb, struct net_device *dev)
 		pko_command.s.ipoffp1 = skb_network_offset(skb) + 1;
 	}
 
-	if (USE_ASYNC_IOBDMA) {
-		/* Get the number of skbuffs in use by the hardware */
-		CVMX_SYNCIOBDMA;
-		skb_to_free = cvmx_scratch_read64(CVMX_SCR_SCRATCH);
-		buffers_to_free = cvmx_scratch_read64(CVMX_SCR_SCRATCH + 8);
-	} else {
-		/* Get the number of skbuffs in use by the hardware */
-		skb_to_free = cvmx_fau_fetch_and_add32(priv->fau + qos * 4,
-						       MAX_SKB_TO_FREE);
-		buffers_to_free =
-		    cvmx_fau_fetch_and_add32(FAU_NUM_PACKET_BUFFERS_TO_FREE, 0);
-	}
+	/* Get the number of skbuffs in use by the hardware */
+	CVMX_SYNCIOBDMA;
+	skb_to_free = cvmx_scratch_read64(CVMX_SCR_SCRATCH);
+	buffers_to_free = cvmx_scratch_read64(CVMX_SCR_SCRATCH + 8);
 
 	skb_to_free = cvm_oct_adjust_skb_to_free(skb_to_free,
 						 priv->fau + qos * 4);
@@ -413,9 +389,7 @@ int cvm_oct_xmit(struct sk_buff *skb, struct net_device *dev)
 	} else {
 		queue_type = QUEUE_HW;
 	}
-	if (USE_ASYNC_IOBDMA)
-		cvmx_fau_async_fetch_and_add32(
-				CVMX_SCR_SCRATCH, FAU_TOTAL_TX_TO_CLEAN, 1);
+	cvmx_fau_async_fetch_and_add32(CVMX_SCR_SCRATCH, FAU_TOTAL_TX_TO_CLEAN, 1);
 
 	spin_lock_irqsave(&priv->tx_free_list[qos].lock, flags);
 
@@ -485,16 +459,11 @@ int cvm_oct_xmit(struct sk_buff *skb, struct net_device *dev)
 		dev_kfree_skb_any(t);
 	}
 
-	if (USE_ASYNC_IOBDMA) {
-		CVMX_SYNCIOBDMA;
-		total_to_clean = cvmx_scratch_read64(CVMX_SCR_SCRATCH);
-		/* Restore the scratch area */
-		cvmx_scratch_write64(CVMX_SCR_SCRATCH, old_scratch);
-		cvmx_scratch_write64(CVMX_SCR_SCRATCH + 8, old_scratch2);
-	} else {
-		total_to_clean = cvmx_fau_fetch_and_add32(
-						FAU_TOTAL_TX_TO_CLEAN, 1);
-	}
+	CVMX_SYNCIOBDMA;
+	total_to_clean = cvmx_scratch_read64(CVMX_SCR_SCRATCH);
+	/* Restore the scratch area */
+	cvmx_scratch_write64(CVMX_SCR_SCRATCH, old_scratch);
+	cvmx_scratch_write64(CVMX_SCR_SCRATCH + 8, old_scratch2);
 
 	if (total_to_clean & 0x3ff) {
 		/*
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next v10 0/4] netdev: octeon-ethernet: Add Cavium Octeon III support.
From: Steven J. Hill @ 2018-05-07 17:22 UTC (permalink / raw)
  To: netdev

Add the Cavium OCTEON III network driver. There are some corresponding
MIPS architecture support changes which will be upstreamed separately.

Changes in v10:

o Reorganization of patches to cleanly separate network and
  MIPS platform code.


Carlos Munoz (2):
  dt-bindings: Add Cavium Octeon Common Ethernet Interface.
  netdev: octeon-ethernet: Add Cavium Octeon III support.

David Daney (2):
  staging: octeon: Remove USE_ASYNC_IOBDMA macro.
  MAINTAINERS: Add entry for
    drivers/net/ethernet/cavium/octeon/octeon3-*

 .../devicetree/bindings/net/cavium-bgx.txt         |   61 +
 MAINTAINERS                                        |    6 +
 drivers/net/ethernet/cavium/Kconfig                |   59 +-
 drivers/net/ethernet/cavium/octeon/Makefile        |    7 +
 .../net/ethernet/cavium/octeon/octeon3-bgx-nexus.c |  417 ++++
 .../net/ethernet/cavium/octeon/octeon3-bgx-port.c  | 2003 +++++++++++++++++++
 drivers/net/ethernet/cavium/octeon/octeon3-core.c  | 2079 ++++++++++++++++++++
 drivers/net/ethernet/cavium/octeon/octeon3-fpa.c   |  358 ++++
 drivers/net/ethernet/cavium/octeon/octeon3-pki.c   |  823 ++++++++
 drivers/net/ethernet/cavium/octeon/octeon3-pko.c   | 1688 ++++++++++++++++
 drivers/net/ethernet/cavium/octeon/octeon3-sso.c   |  301 +++
 drivers/net/ethernet/cavium/octeon/octeon3.h       |  430 ++++
 drivers/staging/octeon/ethernet-defines.h          |    6 -
 drivers/staging/octeon/ethernet-rx.c               |   25 +-
 drivers/staging/octeon/ethernet-tx.c               |   85 +-
 15 files changed, 8259 insertions(+), 89 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/net/cavium-bgx.txt
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-bgx-nexus.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-bgx-port.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-core.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-fpa.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-pki.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-pko.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3-sso.c
 create mode 100644 drivers/net/ethernet/cavium/octeon/octeon3.h

-- 
2.1.4

^ permalink raw reply

* [PATCH bpf-next] xsk: fix 64-bit division
From: Björn Töpel @ 2018-05-07 17:43 UTC (permalink / raw)
  To: ast, daniel, netdev
  Cc: Björn Töpel, Randy Dunlap, Stephen Rothwell,
	magnus.karlsson, linux-next, linux-kernel
In-Reply-To: <e9890e95-16b4-4540-301b-43f17a824808@infradead.org>

From: Björn Töpel <bjorn.topel@intel.com>

i386 builds report:
  net/xdp/xdp_umem.o: In function `xdp_umem_reg':
  xdp_umem.c:(.text+0x47e): undefined reference to `__udivdi3'

This fix uses div_u64 instead of the GCC built-in.

Fixes: c0c77d8fb787 ("xsk: add user memory registration support sockopt")
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 net/xdp/xdp_umem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 881dfdefe235..2b47a1dd7c6c 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -209,7 +209,7 @@ int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 	if ((addr + size) < addr)
 		return -EINVAL;
 
-	nframes = size / frame_size;
+	nframes = (unsigned int)div_u64(size, frame_size);
 	if (nframes == 0 || nframes > UINT_MAX)
 		return -EINVAL;
 
-- 
2.14.1

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox