* Re: [PATCH] net: sched: Fix memory exposure from short TCA_U32_SEL
From: Al Viro @ 2018-08-26 22:57 UTC (permalink / raw)
To: Kees Cook
Cc: LKML, Jamal Hadi Salim, Cong Wang, Jiri Pirko, David S. Miller,
Network Development
In-Reply-To: <20180826173236.GU6515@ZenIV.linux.org.uk>
On Sun, Aug 26, 2018 at 06:32:37PM +0100, Al Viro wrote:
> As far as I can tell, the solution is
[snip long and painful reasoning]
> pointers, and not in provably opaque fashion. Theoretically, the three tcf_...
> inlines above need another look; fortunately, they don't use ->next at all, not to
> mention not being used anywhere outside of net/sched/*.c
>
> The 80 lines above prove that we only need to grep net/sched/*.c for
> tcf_proto_ops method calls. And only because we don't have (thank $DEITY)
> anything that could deconstruct types - as soon as some bastard grows means
> to say "type of the second argument of the function pointed to by p", this
> kind of analysis, painful as it is, goes out of window. Even as it is,
> do you really like the idea of newbies trying to get through the exercises
> like the one above?
BTW, would there be any problem if we took the definitions of tcf_proto and
tcf_proto_ops to e.g. net/sched/tcf_proto.h (along with the three inlines in
pkt_cls.h), left forwards in sch_generic.h and added includes of "tcf_proto.h"
where needed in net/sched/*.c?
That would make tcf_proto/tcf_proto_ops opaque outside of net/sched, reducing
the exposure of internals. Something like a diff below (against net/master,
builds clean, ought to result in identical binary):
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index ef727f71336e..35f8eec3f7c0 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -217,35 +217,6 @@ cls_set_class(struct Qdisc *q, unsigned long *clp, unsigned long cl)
return old_cl;
}
-static inline void
-tcf_bind_filter(struct tcf_proto *tp, struct tcf_result *r, unsigned long base)
-{
- struct Qdisc *q = tp->chain->block->q;
- unsigned long cl;
-
- /* Check q as it is not set for shared blocks. In that case,
- * setting class is not supported.
- */
- if (!q)
- return;
- cl = q->ops->cl_ops->bind_tcf(q, base, r->classid);
- cl = cls_set_class(q, &r->class, cl);
- if (cl)
- q->ops->cl_ops->unbind_tcf(q, cl);
-}
-
-static inline void
-tcf_unbind_filter(struct tcf_proto *tp, struct tcf_result *r)
-{
- struct Qdisc *q = tp->chain->block->q;
- unsigned long cl;
-
- if (!q)
- return;
- if ((cl = __cls_set_class(&r->class, 0)) != 0)
- q->ops->cl_ops->unbind_tcf(q, cl);
-}
-
struct tcf_exts {
#ifdef CONFIG_NET_CLS_ACT
__u32 type; /* for backward compat(TCA_OLD_COMPAT) */
@@ -708,18 +679,6 @@ static inline bool tc_in_hw(u32 flags)
return (flags & TCA_CLS_FLAGS_IN_HW) ? true : false;
}
-static inline void
-tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common,
- const struct tcf_proto *tp, u32 flags,
- struct netlink_ext_ack *extack)
-{
- cls_common->chain_index = tp->chain->index;
- cls_common->protocol = tp->protocol;
- cls_common->prio = tp->prio;
- if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE)
- cls_common->extack = extack;
-}
-
enum tc_fl_command {
TC_CLSFLOWER_REPLACE,
TC_CLSFLOWER_DESTROY,
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index a6d00093f35e..72dbb96fc549 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -246,65 +246,7 @@ struct tcf_result {
struct tcf_chain;
-struct tcf_proto_ops {
- struct list_head head;
- char kind[IFNAMSIZ];
-
- int (*classify)(struct sk_buff *,
- const struct tcf_proto *,
- struct tcf_result *);
- int (*init)(struct tcf_proto*);
- void (*destroy)(struct tcf_proto *tp,
- struct netlink_ext_ack *extack);
-
- void* (*get)(struct tcf_proto*, u32 handle);
- int (*change)(struct net *net, struct sk_buff *,
- struct tcf_proto*, unsigned long,
- u32 handle, struct nlattr **,
- void **, bool,
- struct netlink_ext_ack *);
- int (*delete)(struct tcf_proto *tp, void *arg,
- bool *last,
- struct netlink_ext_ack *);
- void (*walk)(struct tcf_proto*, struct tcf_walker *arg);
- int (*reoffload)(struct tcf_proto *tp, bool add,
- tc_setup_cb_t *cb, void *cb_priv,
- struct netlink_ext_ack *extack);
- void (*bind_class)(void *, u32, unsigned long);
- void * (*tmplt_create)(struct net *net,
- struct tcf_chain *chain,
- struct nlattr **tca,
- struct netlink_ext_ack *extack);
- void (*tmplt_destroy)(void *tmplt_priv);
-
- /* rtnetlink specific */
- int (*dump)(struct net*, struct tcf_proto*, void *,
- struct sk_buff *skb, struct tcmsg*);
- int (*tmplt_dump)(struct sk_buff *skb,
- struct net *net,
- void *tmplt_priv);
-
- struct module *owner;
-};
-
-struct tcf_proto {
- /* Fast access part */
- struct tcf_proto __rcu *next;
- void __rcu *root;
-
- /* called under RCU BH lock*/
- int (*classify)(struct sk_buff *,
- const struct tcf_proto *,
- struct tcf_result *);
- __be16 protocol;
-
- /* All the rest */
- u32 prio;
- void *data;
- const struct tcf_proto_ops *ops;
- struct tcf_chain *chain;
- struct rcu_head rcu;
-};
+struct tcf_proto_ops;
struct qdisc_skb_cb {
unsigned int pkt_len;
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 229d63c99be2..e946ada18299 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -25,11 +25,12 @@
#include <linux/list.h>
#include <net/net_namespace.h>
#include <net/sock.h>
-#include <net/sch_generic.h>
#include <net/pkt_cls.h>
#include <net/act_api.h>
#include <net/netlink.h>
+#include "tcf_proto.h"
+
static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp)
{
u32 chain_index = a->tcfa_action & TC_ACT_EXT_VAL_MASK;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 31bd1439cf60..be5fba6355c5 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -31,6 +31,8 @@
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
+#include "tcf_proto.h"
+
/* The list of all installed classifier types */
static LIST_HEAD(tcf_proto_base);
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index 6a5dce8baf19..3772432889f2 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -22,6 +22,8 @@
#include <net/act_api.h>
#include <net/pkt_cls.h>
+#include "tcf_proto.h"
+
struct basic_head {
struct list_head flist;
struct idr handle_idr;
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index fa6fe2fe0f32..fb2478e357cd 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -23,6 +23,8 @@
#include <net/pkt_cls.h>
#include <net/sock.h>
+#include "tcf_proto.h"
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
MODULE_DESCRIPTION("TC BPF based classifier");
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 3bc01bdde165..5638c711e53c 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -18,6 +18,8 @@
#include <net/sock.h>
#include <net/cls_cgroup.h>
+#include "tcf_proto.h"
+
struct cls_cgroup_head {
u32 handle;
struct tcf_exts exts;
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 2bb043cd436b..7e60e432e3a8 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -33,6 +33,8 @@
#include <net/netfilter/nf_conntrack.h>
#endif
+#include "tcf_proto.h"
+
struct flow_head {
struct list_head filters;
struct rcu_head rcu;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 6fd9bdd93796..b36c61f7ee44 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -20,7 +20,6 @@
#include <linux/ip.h>
#include <linux/mpls.h>
-#include <net/sch_generic.h>
#include <net/pkt_cls.h>
#include <net/ip.h>
#include <net/flow_dissector.h>
@@ -29,6 +28,8 @@
#include <net/dst.h>
#include <net/dst_metadata.h>
+#include "tcf_proto.h"
+
struct fl_flow_key {
int indev_ifindex;
struct flow_dissector_key_control control;
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 29eeeaf3ea44..be872b1653f5 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -28,7 +28,8 @@
#include <net/netlink.h>
#include <net/act_api.h>
#include <net/pkt_cls.h>
-#include <net/sch_generic.h>
+
+#include "tcf_proto.h"
#define HTSIZE 256
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 856fa79d4ffd..708faf62ecab 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -13,9 +13,10 @@
#include <linux/init.h>
#include <linux/module.h>
-#include <net/sch_generic.h>
#include <net/pkt_cls.h>
+#include "tcf_proto.h"
+
struct cls_mall_head {
struct tcf_exts exts;
struct tcf_result res;
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 0404aa5fa7cb..d40ae6d14b2d 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -22,6 +22,8 @@
#include <net/act_api.h>
#include <net/pkt_cls.h>
+#include "tcf_proto.h"
+
/*
* 1. For now we assume that route tags < 256.
* It allows to use direct table lookups, instead of hash tables.
diff --git a/net/sched/cls_rsvp.c b/net/sched/cls_rsvp.c
index cbb5e0d600f3..131a81aeaa4e 100644
--- a/net/sched/cls_rsvp.c
+++ b/net/sched/cls_rsvp.c
@@ -20,6 +20,8 @@
#include <net/act_api.h>
#include <net/pkt_cls.h>
+#include "tcf_proto.h"
+
#define RSVP_DST_LEN 1
#define RSVP_ID "rsvp"
#define RSVP_OPS cls_rsvp_ops
diff --git a/net/sched/cls_rsvp6.c b/net/sched/cls_rsvp6.c
index dd08aea2aee5..159dc01cf251 100644
--- a/net/sched/cls_rsvp6.c
+++ b/net/sched/cls_rsvp6.c
@@ -20,6 +20,8 @@
#include <net/pkt_cls.h>
#include <net/netlink.h>
+#include "tcf_proto.h"
+
#define RSVP_DST_LEN 4
#define RSVP_ID "rsvp6"
#define RSVP_OPS cls_rsvp6_ops
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index 9ccc93f257db..e7d06c3d40a3 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -13,7 +13,8 @@
#include <net/act_api.h>
#include <net/netlink.h>
#include <net/pkt_cls.h>
-#include <net/sch_generic.h>
+
+#include "tcf_proto.h"
/*
* Passing parameters to the root seems to be done more awkwardly than really
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index d5d2a6dc3921..7b3bdfd80001 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -47,6 +47,8 @@
#include <net/pkt_cls.h>
#include <linux/idr.h>
+#include "tcf_proto.h"
+
struct tc_u_knode {
struct tc_u_knode __rcu *next;
u32 handle;
diff --git a/net/sched/ematch.c b/net/sched/ematch.c
index 1331a4c2d8ff..b123880fbe07 100644
--- a/net/sched/ematch.c
+++ b/net/sched/ematch.c
@@ -90,6 +90,8 @@
#include <linux/skbuff.h>
#include <net/pkt_cls.h>
+#include "tcf_proto.h"
+
static LIST_HEAD(ematch_ops);
static DEFINE_RWLOCK(ematch_mod_lock);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 98541c6399db..d6ac218811d0 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -37,6 +37,8 @@
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
+#include "tcf_proto.h"
+
/*
Short review.
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index cd49afca9617..6bf259e55319 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -17,6 +17,8 @@
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
+#include "tcf_proto.h"
+
/*
* The ATM queuing discipline provides a framework for invoking classifiers
* (aka "filters"), which in turn select classes of this queuing discipline.
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 35fc7252187c..fcfd5f321447 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -75,6 +75,8 @@
#include <net/netfilter/nf_conntrack_core.h>
#endif
+#include "tcf_proto.h"
+
#define CAKE_SET_WAYS (8)
#define CAKE_MAX_TINS (8)
#define CAKE_QUEUES (1024)
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index f42025d53cfe..8021ba377dfd 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -21,6 +21,8 @@
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
+#include "tcf_proto.h"
+
/* Class-Based Queueing (CBQ) algorithm.
=======================================
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index e0b0cf8a9939..19a48fa95b9b 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -18,6 +18,8 @@
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
+#include "tcf_proto.h"
+
struct drr_class {
struct Qdisc_class_common common;
unsigned int filter_cnt;
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 049714c57075..b3a4537afbcb 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -18,6 +18,8 @@
#include <net/inet_ecn.h>
#include <asm/byteorder.h>
+#include "tcf_proto.h"
+
/*
* classid class marking
* ------- ----- -------
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 6c0a9d5dbf94..8868a8e1a81f 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -28,6 +28,8 @@
#include <net/codel_impl.h>
#include <net/codel_qdisc.h>
+#include "tcf_proto.h"
+
/* Fair Queue CoDel.
*
* Principles :
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 3278a76f6861..9c75b77da56e 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -68,6 +68,8 @@
#include <net/pkt_cls.h>
#include <asm/div64.h>
+#include "tcf_proto.h"
+
/*
* kernel internal service curve representation:
* coordinates are given by 64 bit unsigned integers.
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 43c4bfe625a9..c206b3cfdfb2 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -38,10 +38,10 @@
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <net/netlink.h>
-#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
+#include "tcf_proto.h"
/* HTB algorithm.
Author: devik@cdi.cz
========================================================================
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 1da7ea8de0ad..107563c14e24 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -27,6 +27,8 @@
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
+#include "tcf_proto.h"
+
struct multiq_sched_data {
u16 bands;
u16 max_bands;
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 222e53d3d27a..4fed3fd38dd3 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -22,6 +22,8 @@
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
+#include "tcf_proto.h"
+
struct prio_sched_data {
int bands;
struct tcf_proto __rcu *filter_list;
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index bb1a9c11fc54..32f68e639037 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -19,6 +19,8 @@
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
+#include "tcf_proto.h"
+
/* Quick Fair Queueing Plus
========================
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 7cbdad8419b7..5465249c600f 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -28,6 +28,8 @@
#include <net/pkt_cls.h>
#include <net/inet_ecn.h>
+#include "tcf_proto.h"
+
/*
* SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level)
* This implementation uses L = 8 and N = 16
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 2f2678197760..abc1598e87e7 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -26,6 +26,8 @@
#include <net/pkt_cls.h>
#include <net/red.h>
+#include "tcf_proto.h"
+
/* Stochastic Fairness Queuing algorithm.
=======================================
diff --git a/net/sched/tcf_proto.h b/net/sched/tcf_proto.h
new file mode 100644
index 000000000000..b8d0e15e7f26
--- /dev/null
+++ b/net/sched/tcf_proto.h
@@ -0,0 +1,104 @@
+/* struct tcf_proto internal details - outside of net/sched it's opaque */
+
+#include <net/sch_generic.h>
+
+struct tcf_proto {
+ /* Fast access part */
+ struct tcf_proto __rcu *next;
+ void __rcu *root;
+
+ /* called under RCU BH lock*/
+ int (*classify)(struct sk_buff *,
+ const struct tcf_proto *,
+ struct tcf_result *);
+ __be16 protocol;
+
+ /* All the rest */
+ u32 prio;
+ void *data;
+ const struct tcf_proto_ops *ops;
+ struct tcf_chain *chain;
+ struct rcu_head rcu;
+};
+
+struct tcf_proto_ops {
+ struct list_head head;
+ char kind[IFNAMSIZ];
+
+ int (*classify)(struct sk_buff *,
+ const struct tcf_proto *,
+ struct tcf_result *);
+ int (*init)(struct tcf_proto*);
+ void (*destroy)(struct tcf_proto *tp,
+ struct netlink_ext_ack *extack);
+
+ void* (*get)(struct tcf_proto*, u32 handle);
+ int (*change)(struct net *net, struct sk_buff *,
+ struct tcf_proto*, unsigned long,
+ u32 handle, struct nlattr **,
+ void **, bool,
+ struct netlink_ext_ack *);
+ int (*delete)(struct tcf_proto *tp, void *arg,
+ bool *last,
+ struct netlink_ext_ack *);
+ void (*walk)(struct tcf_proto*, struct tcf_walker *arg);
+ int (*reoffload)(struct tcf_proto *tp, bool add,
+ tc_setup_cb_t *cb, void *cb_priv,
+ struct netlink_ext_ack *extack);
+ void (*bind_class)(void *, u32, unsigned long);
+ void * (*tmplt_create)(struct net *net,
+ struct tcf_chain *chain,
+ struct nlattr **tca,
+ struct netlink_ext_ack *extack);
+ void (*tmplt_destroy)(void *tmplt_priv);
+
+ /* rtnetlink specific */
+ int (*dump)(struct net*, struct tcf_proto*, void *,
+ struct sk_buff *skb, struct tcmsg*);
+ int (*tmplt_dump)(struct sk_buff *skb,
+ struct net *net,
+ void *tmplt_priv);
+
+ struct module *owner;
+};
+
+static inline void
+tcf_bind_filter(struct tcf_proto *tp, struct tcf_result *r, unsigned long base)
+{
+ struct Qdisc *q = tp->chain->block->q;
+ unsigned long cl;
+
+ /* Check q as it is not set for shared blocks. In that case,
+ * setting class is not supported.
+ */
+ if (!q)
+ return;
+ cl = q->ops->cl_ops->bind_tcf(q, base, r->classid);
+ cl = cls_set_class(q, &r->class, cl);
+ if (cl)
+ q->ops->cl_ops->unbind_tcf(q, cl);
+}
+
+static inline void
+tcf_unbind_filter(struct tcf_proto *tp, struct tcf_result *r)
+{
+ struct Qdisc *q = tp->chain->block->q;
+ unsigned long cl;
+
+ if (!q)
+ return;
+ if ((cl = __cls_set_class(&r->class, 0)) != 0)
+ q->ops->cl_ops->unbind_tcf(q, cl);
+}
+
+static inline void
+tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common,
+ const struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
+{
+ cls_common->chain_index = tp->chain->index;
+ cls_common->protocol = tp->protocol;
+ cls_common->prio = tp->prio;
+ if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE)
+ cls_common->extack = extack;
+}
^ permalink raw reply related
* Re: [PATCH] net: sched: Fix memory exposure from short TCA_U32_SEL
From: Al Viro @ 2018-08-26 22:43 UTC (permalink / raw)
To: Joe Perches
Cc: Julia Lawall, Kees Cook, LKML, Jamal Hadi Salim, Cong Wang,
Jiri Pirko, David S. Miller, Network Development
In-Reply-To: <eca48539a3dede3bfaed9ab9a6c06794cf8160e0.camel@perches.com>
On Sun, Aug 26, 2018 at 03:26:54PM -0700, Joe Perches wrote:
> On Sun, 2018-08-26 at 22:24 +0100, Al Viro wrote:
> > On Sun, Aug 26, 2018 at 11:57:57AM -0700, Joe Perches wrote:
> >
> > > > That, BTW, is why I hate the use of sizeof(*p) in kmalloc, etc.
> > > > arguments. typeof is even worse in that respect.
> > >
> > > True. Semantic searches via tools like coccinelle could help here
> > > but those searches are quite a bit slower than straightforward greps.
> >
> > Those searches are .config-sensitive as well, which can be much more
> > unpleasant than being slow...
>
> Are they? Julia?
They work pretty much on preprocessor output level; if something is ifdef'ed
out on a given config, it won't be seen...
^ permalink raw reply
* [PATCH] iwlwifi: mvm: fix spelling mistake "Recieved" -> "Received"
From: Colin King @ 2018-08-26 22:31 UTC (permalink / raw)
To: Johannes Berg, Emmanuel Grumbach, Luca Coelho,
Intel Linux Wireless, Kalle Valo, David S . Miller,
linux-wireless, netdev
Cc: kernel-janitors, linux-kernel
From: Colin Ian King <colin.king@canonical.com>
Trivial fix to spelling mistake in debug message.
Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
index b15b0d84bb7e..5f32d3131d62 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
@@ -3320,7 +3320,7 @@ static bool iwl_mvm_rx_aux_roc(struct iwl_notif_wait_data *notif_wait,
resp = (void *)pkt->data;
IWL_DEBUG_TE(mvm,
- "Aux ROC: Recieved response from ucode: status=%d uid=%d\n",
+ "Aux ROC: Received response from ucode: status=%d uid=%d\n",
resp->status, resp->event_unique_id);
te_data->uid = le32_to_cpu(resp->event_unique_id);
--
2.17.1
^ permalink raw reply related
* Re: [PATCH] net: sched: Fix memory exposure from short TCA_U32_SEL
From: Joe Perches @ 2018-08-26 22:26 UTC (permalink / raw)
To: Al Viro, Julia Lawall
Cc: Kees Cook, LKML, Jamal Hadi Salim, Cong Wang, Jiri Pirko,
David S. Miller, Network Development
In-Reply-To: <20180826212421.GW6515@ZenIV.linux.org.uk>
On Sun, 2018-08-26 at 22:24 +0100, Al Viro wrote:
> On Sun, Aug 26, 2018 at 11:57:57AM -0700, Joe Perches wrote:
>
> > > That, BTW, is why I hate the use of sizeof(*p) in kmalloc, etc.
> > > arguments. typeof is even worse in that respect.
> >
> > True. Semantic searches via tools like coccinelle could help here
> > but those searches are quite a bit slower than straightforward greps.
>
> Those searches are .config-sensitive as well, which can be much more
> unpleasant than being slow...
Are they? Julia?
^ permalink raw reply
* Re: broken behaviour of TC filter delete
From: Jamal Hadi Salim @ 2018-08-26 17:48 UTC (permalink / raw)
To: Jiri Pirko, Cong Wang
Cc: Roman Mashak, Linux Kernel Network Developers, Jiri Pirko
In-Reply-To: <20180825130243.GE2931@nanopsycho>
On 2018-08-25 9:02 a.m., Jiri Pirko wrote:
> Fri, Aug 24, 2018 at 08:11:07PM CEST, xiyou.wangcong@gmail.com wrote:
>
>>> ENOENT seems to be more logical to return when there's no more filter to delete.
>>
>> Yeah, at least we should keep ENOENT for compatibility.
>>
>> The bug here is chain 0 is gone after the last filter is gone,
>> so when you delete the filter again, it treats it as you specify
>> chain 0 which does not exist, so it hits EINVAL before ENOENT.
>
> I understand. My concern is about consistency with other chains. Perhaps
> -ENOENT for all chains in this case would be doable. What do you think?
>
ENOENT with extack describing whether chain or filter not found.
cheers,
jamal
^ permalink raw reply
* Re: [PATCH] net: sched: Fix memory exposure from short TCA_U32_SEL
From: Al Viro @ 2018-08-26 21:24 UTC (permalink / raw)
To: Joe Perches
Cc: Kees Cook, LKML, Jamal Hadi Salim, Cong Wang, Jiri Pirko,
David S. Miller, Network Development
In-Reply-To: <d31fe59160e0b7d40e09536a3c74619ebb1f3b13.camel@perches.com>
On Sun, Aug 26, 2018 at 11:57:57AM -0700, Joe Perches wrote:
> > That, BTW, is why I hate the use of sizeof(*p) in kmalloc, etc.
> > arguments. typeof is even worse in that respect.
>
> True. Semantic searches via tools like coccinelle could help here
> but those searches are quite a bit slower than straightforward greps.
Those searches are .config-sensitive as well, which can be much more
unpleasant than being slow...
^ permalink raw reply
* Re: [PATCH] net: sched: Fix memory exposure from short TCA_U32_SEL
From: David Miller @ 2018-08-26 21:22 UTC (permalink / raw)
To: keescook; +Cc: linux-kernel, viro, jhs, xiyou.wangcong, jiri, netdev
In-Reply-To: <20180826055801.GA42063@beast>
From: Kees Cook <keescook@chromium.org>
Date: Sat, 25 Aug 2018 22:58:01 -0700
> Via u32_change(), TCA_U32_SEL has an unspecified type in the netlink
> policy, so max length isn't enforced, only minimum. This means nkeys
> (from userspace) was being trusted without checking the actual size of
> nla_len(), which could lead to a memory over-read, and ultimately an
> exposure via a call to u32_dump(). Reachability is CAP_NET_ADMIN within
> a namespace.
>
> Reported-by: Al Viro <viro@zeniv.linux.org.uk>
> Signed-off-by: Kees Cook <keescook@chromium.org>
I'll apply this as-is and queue it up for -stable.
If we want to avoid sizeof(*p) type stuff, it can be done as a follow-up.
Thanks.
^ permalink raw reply
* Re: [PATCH] net: sched: Fix memory exposure from short TCA_U32_SEL
From: Jamal Hadi Salim @ 2018-08-26 17:30 UTC (permalink / raw)
To: Kees Cook, Al Viro
Cc: LKML, Cong Wang, Jiri Pirko, David S. Miller, Network Development
In-Reply-To: <CAGXu5jK7VzayzZTcxgZBf-+YHWO+Hv7s8utj2rzTc3gFtA8pFQ@mail.gmail.com>
On 2018-08-26 2:19 a.m., Kees Cook wrote:
> On Sat, Aug 25, 2018 at 11:15 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>> On Sat, Aug 25, 2018 at 10:58:01PM -0700, Kees Cook wrote:
>> Saner approach would be sel_size = offsetof(struct tc_u32_sel, keys[s->nkeys])...
>
> Either is fine by me.
>
>>> + sel_size = struct_size(s, keys, s->nkeys);
>>> + if (nla_len(tb[TCA_U32_SEL]) < sel_size) {
>>> + err = -EINVAL;
>>> + goto erridr;
>>> + }
>>>
>>> - n = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL);
>>> + n = kzalloc(offsetof(typeof(*n), sel) + sel_size, GFP_KERNEL);
>>
>> ITYM
>> n = kzalloc(offsetof(struct tc_u_common, sel.keys[s->nkeys]), GFP_KERNEL);
>
> I prefer to reuse sel_size and keep typeof() to keep things tied to
> "n" more directly. *shrug*
Looks good to me.
We should add an nla_policy later.
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
cheers,
jamal
^ permalink raw reply
* bpfilter causes a leftover kernel process
From: Olivier Brunel @ 2018-08-26 16:08 UTC (permalink / raw)
To: netdev
Hi,
(Please cc me as I'm not subscribed to the list, thanks.)
I'm running an Arch Linux x86_64 system, and recently updated to a 4.18
kernel, which led me to encounter what I believe to be a kernel bug
related to the bpfilter framework.
What happens is that upon boot, there's a "leftover kernel process"
running (shown as "[none]" in ps), which doesn't seem to do anything
(anymore) but does have references/fds open to the root fs, and so when
trying to shutdown the system umounting the root fs fails (EBUSY)
because of it, leading to expected issues.
Simply killing that process allows umounting the root fs fine and
"resolves" all issues. This process/behavior wasn't there with any
previous kernel, and is there with all tried kernels from 4.18.0 to
4.18.4, without any other change to the system -- although this is due
to CONFIG_BPFILTER=y in the kernel config.
Indeed I managed to compile a kernel 4.18.4 myself using the Arch Linux
config[1] with a single change of unsetting CONFIG_BPFILTER, and with
the resulting kernel I don't have this "leftover kernel process"
anymore, everything is back to normal.
Now, about this process, here's a few outputs to try and describe what
it is:
rafus# pgrep none
920
rafus# cd /proc/920
rafus# readlink exe
/ (deleted)
rafus# ls -l fd
total 0
lr-x------ 1 root root 64 Aug 26 17:17 0 -> 'pipe:[13366]'
l-wx------ 1 root root 64 Aug 26 17:17 1 -> 'pipe:[13367]'
lrwx------ 1 root root 64 Aug 26 17:17 2 -> /dev/console
rafus# cat status
Name: none
Umask: 0022
State: S (sleeping)
Tgid: 920
Ngid: 0
Pid: 920
PPid: 2
TracerPid: 0
Uid: 0 0 0 0
Gid: 0 0 0 0
FDSize: 64
Groups:
NStgid: 920
NSpid: 920
NSpgid: 0
NSsid: 0
VmPeak: 2296 kB
VmSize: 2296 kB
VmLck: 0 kB
VmPin: 0 kB
VmHWM: 748 kB
VmRSS: 748 kB
RssAnon: 60 kB
RssFile: 684 kB
RssShmem: 4 kB
VmData: 176 kB
VmStk: 132 kB
VmExe: 8 kB
VmLib: 1452 kB
VmPTE: 44 kB
VmSwap: 0 kB
HugetlbPages: 0 kB
CoreDumping: 0
Threads: 1
SigQ: 0/7861
SigPnd: 0000000000000000
ShdPnd: 0000000000000000
SigBlk: 0000000000000000
SigIgn: 0000000000000000
SigCgt: 0000000000000000
CapInh: 0000000000000000
CapPrm: 0000003fffffffff
CapEff: 0000003fffffffff
CapBnd: 0000003fffffffff
CapAmb: 0000000000000000
NoNewPrivs: 0
Seccomp: 0
Speculation_Store_Bypass: vulnerable
Cpus_allowed: 1
Cpus_allowed_list: 0
Mems_allowed: 00000001
Mems_allowed_list: 0
voluntary_ctxt_switches: 65
nonvoluntary_ctxt_switches: 1
rafus# cat stack
[<0>] pipe_wait+0x6c/0xb0
[<0>] pipe_read+0x20a/0x2d0
[<0>] __vfs_read+0x13a/0x180
[<0>] vfs_read+0x8a/0x130
[<0>] ksys_read+0x4f/0xb0
[<0>] do_syscall_64+0x5b/0x170
[<0>] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[<0>] 0xffffffffffffffff
rafus# file -L exe
exe: ELF 64-bit LSB pie executable x86-64, version 1 (SYSV),
dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, for
GNU/Linux 3.2.0,
BuildID[sha1]=b247cedd3f8daaea3eee38477aa641d84b77f0ba, not stripped
rafus# stat -L exe
File: exe
Size: 16832 Blocks: 40 IO Block: 4096 regular file
Device: 1h/1d Inode: 13361 Links: 0
Access: (0777/-rwxrwxrwx) Uid: ( 0/ root) Gid: ( 0/ root)
Access: 2018-08-26 17:17:37.334261924 +0200
Modify: 2018-08-26 17:14:27.787595262 +0200
Change: 2018-08-26 17:14:27.787595262 +0200
Birth: -
rafus# sha1sum exe
723d59584abe5e1e9917f0ec41d7e9120514afe7 exe
rafus# strings exe|grep bpf
Started bpfilter
I'm not actually sure what the process is, I'm guessing some kind of
helper is spawned at some point during boot, and for some reason it
never ends.
Although I can reproduce it (it happens on every boot with a kernel
4.18 and CONFIG_BPFILTER=y), I'm unfortunately not sure what is
actually needed to be done in order to trigger it.
I don't use bpfilter myself/directly, as said this happens with the
exact same system as with previous kernels, but I obviously have some
network configuration (done using iptables/iproute2) set up during boot.
Let me know if you need more information or need me to test things, and
I'll do my best.
Thank you.
[1]
https://git.archlinux.org/svntogit/packages.git/tree/trunk?h=packages/linux
^ permalink raw reply
* Re: [PATCH v2 02/17] zinc: introduce minimal cryptography library
From: Jason A. Donenfeld @ 2018-08-26 15:59 UTC (permalink / raw)
To: Eric Biggers
Cc: LKML, Netdev, David Miller, Andrew Lutomirski, Greg Kroah-Hartman,
Samuel Neves, Jean-Philippe Aumasson, Linux Crypto Mailing List
In-Reply-To: <20180825062951.GC726@sol.localdomain>
On Sat, Aug 25, 2018 at 12:29 AM Eric Biggers <ebiggers@kernel.org> wrote:
> I thought you were going to wrap lines at 80 characters? It's hard to read the
> extremely long lines, and they encourage deep nesting.
> There are still some alignment bugs where integers are loaded from byte arrays
> without using the unaligned access macros, e.g. in chacha20_init(),
> hchacha20_generic(), and fe_frombytes_impl().
These fixes are now completed in the development tree.
^ permalink raw reply
* followup: what's responsible for setting netdev->operstate to IF_OPER_DOWN?
From: Robert P. J. Day @ 2018-08-26 15:14 UTC (permalink / raw)
To: Linux kernel netdev mailing list
apologies for the constant pleas for assistance, but i think i'm
zeroing in on the problem that started all this. recap: custom
FPGA-based linux box with multiple ports, where the current symptom is
that there is no userspace notification when someone simply unplugs
one of the ports ("ifconfig" shows that interface still RUNNING).
as i read it, an active ethernet interface should be both UP (the
administrative state) and RUNNING (the RFC 2863-defined operational
state). if i unplug, i've verified on a standard net port on my laptop
that the interface is still UP, but no longer RUNNING, which makes
perfect sense. i plug back in, interface starts RUNNING again. so
where's the problem?
i can see that whether ifconfig shows an interface RUNNING is
defined in net/core/dev.c:
unsigned int dev_get_flags(const struct net_device *dev)
{
unsigned int flags;
flags = (dev->flags & ~(IFF_PROMISC |
IFF_ALLMULTI |
IFF_RUNNING |
IFF_LOWER_UP |
IFF_DORMANT)) |
(dev->gflags & (IFF_PROMISC |
IFF_ALLMULTI));
if (netif_running(dev)) {
if (netif_oper_up(dev))
flags |= IFF_RUNNING; <---- THERE
if (netif_carrier_ok(dev))
flags |= IFF_LOWER_UP;
if (netif_dormant(dev))
flags |= IFF_DORMANT;
}
return flags;
}
where netif_oper_up() is defined as:
static inline bool netif_oper_up(const struct net_device *dev)
{
return (dev->operstate == IF_OPER_UP ||
dev->operstate == IF_OPER_UNKNOWN /* backward compat */);
}
so i am simply assuming that the underlying problem is that,
somewhere down below, the unplugging of a port is somehow not setting
dev->operstate to its proper value of IF_OPER_DOWN.
that would clearly explain everything, and i'm about to dig even
further to see where the event of unplugging a port *should* be
recognized, but does this sound like a reasonable diagnosis? there
have been other problems with the programming of the FPGA, so it would
surprise absolutely no one to learn that this aspect was
misprogrammed.
rday
--
========================================================================
Robert P. J. Day Ottawa, Ontario, CANADA
http://crashcourse.ca/dokuwiki
Twitter: http://twitter.com/rpjday
LinkedIn: http://ca.linkedin.com/in/rpjday
========================================================================
^ permalink raw reply
* Re: [PATCH] net: sched: Fix memory exposure from short TCA_U32_SEL
From: Joe Perches @ 2018-08-26 18:57 UTC (permalink / raw)
To: Al Viro, Kees Cook
Cc: LKML, Jamal Hadi Salim, Cong Wang, Jiri Pirko, David S. Miller,
Network Development
In-Reply-To: <20180826173236.GU6515@ZenIV.linux.org.uk>
On Sun, 2018-08-26 at 18:32 +0100, Al Viro wrote:
> On Sat, Aug 25, 2018 at 11:19:30PM -0700, Kees Cook wrote:
> > > > - n = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL);
> > > > + n = kzalloc(offsetof(typeof(*n), sel) + sel_size, GFP_KERNEL);
> > >
> > > ITYM
> > > n = kzalloc(offsetof(struct tc_u_common, sel.keys[s->nkeys]), GFP_KERNEL);
> >
> > I prefer to reuse sel_size and keep typeof() to keep things tied to
> > "n" more directly. *shrug*
>
> This is rather search-hostile, though. Fresh example from the same
> area: where are struct tcf_proto instances created? Is it true that
> each is followed by ->ops->init()? Is it true that ->ops->init()
> is never called twice for the same instance? Is it true that
> ->ops->destroy() is called exactly once between successful ->ops->init()
> and freeing the object?
>
> That's precisely the kind of questions you end up asking when learning
> a new area. Your variant makes those harder to answer; it does make
> it easier to catch local problems on casual grep, but it's hell both
> on the newbies trying to make sense of an area and on the old hands
> from different areas.
>
> That, BTW, is why I hate the use of sizeof(*p) in kmalloc, etc.
> arguments. typeof is even worse in that respect.
True. Semantic searches via tools like coccinelle could help here
but those searches are quite a bit slower than straightforward greps.
^ permalink raw reply
* [PATCH] r8169: set RxConfig after tx/rx is enabled for RTL8169sb/8110sb devices
From: Azat Khuzhin @ 2018-08-26 14:03 UTC (permalink / raw)
To: netdev
Cc: Azat Khuzhin, Heiner Kallweit, David S . Miller,
Realtek linux nic maintainers
I have two Ethernet adapters:
r8169 0000:03:01.0 eth0: RTL8169sb/8110sb, 00:14:d1:14:2d:49, XID 10000000, IRQ 18
r8169 0000:01:00.0 eth0: RTL8168e/8111e, 64:66:b3:11:14:5d, XID 2c200000, IRQ 30
And after upgrading from linux 4.15 [1] to linux 4.18+ [2] RTL8169sb failed to
receive any packets. tcpdump shows a lot of checksum mismatch.
[1]: a0f79386a4968b4925da6db2d1daffd0605a4402
[2]: 0519359784328bfa92bf0931bf0cff3b58c16932 (4.19 merge window opened)
I started bisecting and found that [3] breaks it. According to [4]:
"For 8110S, 8110SB, and 8110SC series, the initial value of RxConfig
needs to be set after the tx/rx is enabled."
So I moved rtl_init_rxcfg() after enabling tx/rx and now my adapter works
(RTL8168e works too).
[3]: 3559d81e76bfe3803e89f2e04cf6ef7ab4f3aace
[4]: e542a2269f232d61270ceddd42b73a4348dee2bb ("r8169: adjust the RxConfig
settings.")
Also drop "rx" from rtl_set_rx_tx_config_registers(), since it does nothing
with it already.
Fixes: 3559d81e76bfe3803e89f2e04cf6ef7ab4f3aace ("r8169: simplify
rtl_hw_start_8169")
Cc: Heiner Kallweit <hkallweit1@gmail.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: netdev@vger.kernel.org
Cc: Realtek linux nic maintainers <nic_swsd@realtek.com>
Signed-off-by: Azat Khuzhin <a3at.mail@gmail.com>
---
(It looks like calling rtl_init_rxcfg() the second time is fine, but I
can move it into rtl_hw_start_8169())
drivers/net/ethernet/realtek/r8169.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 0efa977c422d..ac306797590e 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -4522,7 +4522,7 @@ static void rtl8169_hw_reset(struct rtl8169_private *tp)
rtl_hw_reset(tp);
}
-static void rtl_set_rx_tx_config_registers(struct rtl8169_private *tp)
+static void rtl_set_tx_config_registers(struct rtl8169_private *tp)
{
/* Set DMA burst size and Interframe Gap Time */
RTL_W32(tp, TxConfig, (TX_DMA_BURST << TxDMAShift) |
@@ -4633,12 +4633,14 @@ static void rtl_hw_start(struct rtl8169_private *tp)
rtl_set_rx_max_size(tp);
rtl_set_rx_tx_desc_registers(tp);
- rtl_set_rx_tx_config_registers(tp);
+ rtl_set_tx_config_registers(tp);
RTL_W8(tp, Cfg9346, Cfg9346_Lock);
/* Initially a 10 us delay. Turned it into a PCI commit. - FR */
RTL_R8(tp, IntrMask);
RTL_W8(tp, ChipCmd, CmdTxEnb | CmdRxEnb);
+ rtl_init_rxcfg(tp);
+
rtl_set_rx_mode(tp->dev);
/* no early-rx interrupts */
RTL_W16(tp, MultiIntr, RTL_R16(tp, MultiIntr) & 0xf000);
--
2.18.0
^ permalink raw reply related
* Re: [PATCH] net: sched: Fix memory exposure from short TCA_U32_SEL
From: Al Viro @ 2018-08-26 17:32 UTC (permalink / raw)
To: Kees Cook
Cc: LKML, Jamal Hadi Salim, Cong Wang, Jiri Pirko, David S. Miller,
Network Development
In-Reply-To: <CAGXu5jK7VzayzZTcxgZBf-+YHWO+Hv7s8utj2rzTc3gFtA8pFQ@mail.gmail.com>
On Sat, Aug 25, 2018 at 11:19:30PM -0700, Kees Cook wrote:
> >> - n = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL);
> >> + n = kzalloc(offsetof(typeof(*n), sel) + sel_size, GFP_KERNEL);
> >
> > ITYM
> > n = kzalloc(offsetof(struct tc_u_common, sel.keys[s->nkeys]), GFP_KERNEL);
>
> I prefer to reuse sel_size and keep typeof() to keep things tied to
> "n" more directly. *shrug*
This is rather search-hostile, though. Fresh example from the same
area: where are struct tcf_proto instances created? Is it true that
each is followed by ->ops->init()? Is it true that ->ops->init()
is never called twice for the same instance? Is it true that
->ops->destroy() is called exactly once between successful ->ops->init()
and freeing the object?
That's precisely the kind of questions you end up asking when learning
a new area. Your variant makes those harder to answer; it does make
it easier to catch local problems on casual grep, but it's hell both
on the newbies trying to make sense of an area and on the old hands
from different areas.
That, BTW, is why I hate the use of sizeof(*p) in kmalloc, etc.
arguments. typeof is even worse in that respect.
As for the questions above... Do try to grep for ->init calls. Good
luck getting through the damn pile. And "it must see the definition
of tcf_proto_ops" doesn't narrow it - it's defined in net/sch_generic.h,
which gets pulled by linux/filter.h, which gets pulled by net/sock.h,
which gets pulled by arseloads of code.
As far as I can tell, the solution is
* outside of net/sched/*.c, tcf_proto_ops is mentioned only
in
include/net/pkt_cls.h:23:int register_tcf_proto_ops(struct tcf_proto_ops *ops);
include/net/pkt_cls.h:24:int unregister_tcf_proto_ops(struct tcf_proto_ops *ops);
and
include/net/sch_generic.h:304: const struct tcf_proto_ops *ops;
include/net/sch_generic.h:327: const struct tcf_proto_ops *tmplt_ops;
* the first two are irrelevant - externs don't get you any
access to the data structure
* tmplt_ops is only used in net/sched/*.c; for everything else
it could've been opaque pointer - it's not even looked at
* tcf_proto ->ops is, of course, ungreppable. However,
tcf_proto itself, outside of net/sched/*.c, is only mentioned in
include/net/act_api.h:175:int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
include/net/act_api.h:179:struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
include/net/pkt_cls.h:20: int (*fn)(struct tcf_proto *, void *node, struct tcf_walker *);
include/net/pkt_cls.h:48: struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q,
include/net/pkt_cls.h:90:int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
include/net/pkt_cls.h:96: struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q,
include/net/pkt_cls.h:196:static inline int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
include/net/pkt_cls.h:221:tcf_bind_filter(struct tcf_proto *tp, struct tcf_result *r, unsigned long base)
include/net/pkt_cls.h:238:tcf_unbind_filter(struct tcf_proto *tp, struct tcf_result *r)
include/net/pkt_cls.h:385:int tcf_exts_validate(struct net *net, struct tcf_proto *tp,
include/net/pkt_cls.h:497:int tcf_em_tree_validate(struct tcf_proto *, struct nlattr *,
include/net/pkt_cls.h:713: const struct tcf_proto *tp, u32 flags,
include/net/sch_generic.h:237: const struct tcf_proto *goto_tp;
include/net/sch_generic.h:254: const struct tcf_proto *,
include/net/sch_generic.h:256: int (*init)(struct tcf_proto*);
include/net/sch_generic.h:257: void (*destroy)(struct tcf_proto *tp,
include/net/sch_generic.h:260: void* (*get)(struct tcf_proto*, u32 handle);
include/net/sch_generic.h:262: struct tcf_proto*, unsigned long,
include/net/sch_generic.h:266: int (*delete)(struct tcf_proto *tp, void *arg,
include/net/sch_generic.h:269: void (*walk)(struct tcf_proto*, struct tcf_walker *arg);
include/net/sch_generic.h:270: int (*reoffload)(struct tcf_proto *tp, bool add,
include/net/sch_generic.h:281: int (*dump)(struct net*, struct tcf_proto*, void *,
include/net/sch_generic.h:290:struct tcf_proto {
include/net/sch_generic.h:292: struct tcf_proto __rcu *next;
include/net/sch_generic.h:297: const struct tcf_proto *,
include/net/sch_generic.h:317:typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv);
include/net/sch_generic.h:320: struct tcf_proto __rcu *filter_chain;
include/net/sch_generic.h:1098: struct tcf_proto *filter_list;
include/net/sch_generic.h:1122: struct tcf_proto *tp_head);
* excluding externs, arguments in function pointers or a typedef for
such (neither would give an access to thus typed pointer), we are left with
include/net/pkt_cls.h:96: struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q,
include/net/pkt_cls.h:196:static inline int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
include/net/pkt_cls.h:221:tcf_bind_filter(struct tcf_proto *tp, struct tcf_result *r, unsigned long base)
include/net/pkt_cls.h:238:tcf_unbind_filter(struct tcf_proto *tp, struct tcf_result *r)
include/net/pkt_cls.h:713: const struct tcf_proto *tp, u32 flags,
include/net/sch_generic.h:237: const struct tcf_proto *goto_tp;
include/net/sch_generic.h:290:struct tcf_proto {
include/net/sch_generic.h:292: struct tcf_proto __rcu *next;
include/net/sch_generic.h:320: struct tcf_proto __rcu *filter_chain;
include/net/sch_generic.h:1098: struct tcf_proto *filter_list;
* the first two are in arguments of static inlines which do not
use the arguments in question.
* the next three (tcf_bind_filter, tcf_unbind_filter and
tc_cls_common_offload_init) do not use ->ops or pass tcf_proto * to
anyone. Incidentally, they are only used in net/sched/*.c
* goto_tp is also used only in net/sched/*.c. Moreover,
all its uses anywhere could as well have been an opaque pointer.
* grepping for filter_chain catches an unrelated local field
of the same name in mellanox, a function with the same name in
uprobes.c and a bunch of uses in net/sched/*.c.
* search for filter_list gets false positives in trace_events_filter.c,
a bunch of uses in net/sched/*.c and
net/core/dev.c:3533: switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
net/core/dev.c:4593: switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
both of which are opaque.
* the last remaining source of such pointers is
include/net/sch_generic.h:292: struct tcf_proto __rcu *next;
Of course, that's ungreppable. However, it's within the struct tcf_proto itself, so
anyone accessing it as something non-opaque would already have to access tcf_proto
pointers, and not in provably opaque fashion. Theoretically, the three tcf_...
inlines above need another look; fortunately, they don't use ->next at all, not to
mention not being used anywhere outside of net/sched/*.c
The 80 lines above prove that we only need to grep net/sched/*.c for
tcf_proto_ops method calls. And only because we don't have (thank $DEITY)
anything that could deconstruct types - as soon as some bastard grows means
to say "type of the second argument of the function pointed to by p", this
kind of analysis, painful as it is, goes out the window. Even as it is,
do you really like the idea of newbies trying to get through the exercises
like the one above?
Incidentally, that's not the end -
git grep -n '[-]>[ ]*init\>' net/sched/
git grep -n '\.[ ]*init\>' net/sched/
does catch 93 hits. Excluding comparisons, assignments and initializers,
we are down to
net/sched/act_api.c:878: err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind,
net/sched/act_api.c:881: err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held,
net/sched/cls_api.c:173: err = tp->ops->init(tp);
net/sched/sch_api.c:1155: err = ops->init(sch, tca[TCA_OPTIONS], extack);
net/sched/sch_generic.c:901: if (!ops->init || ops->init(sch, NULL, extack) == 0)
Note that we have no less than 3 different methods of the same name
here, going just by the number of arguments. Fortunately, there is only one candidate
for the tcf_proto one, that in tcf_proto_create(). And there it's obviously done
on new object, with nothing else seeing it until the call.
Only that's not quite all I wanted to know - are there any other places
where tcf_proto instances get created? They are not members of any structs,
unions or arrays, thankfully (that we can see from grep) and there's no
variables of that type, be it auto or static duration. So it should all be
dynamically allocated. Moreover, from the above it looks like no twit could've
done such allocation in his/her/its misbegotten driver (they would have to
find the size somehow, and the search above would've spotted that). It has to
be somewhere in net/sched, if anywhere at all. And no, you can't rely upon
k.*alloc being used to allocate them - not a priori in unfamiliar code, not when
looking for a bug somewhere, etc. net/sched/*.c is 49KLoC; "read through and
see if it's done anywhere" is neither feasible nor supportable (what, do it
again each cycle?)
*IF* nobody plays games with sizeof(expression) (or typeof()), one
could look for mentionings of struct tcf_proto in there and exclude the
ones that actually mention pointers. That would've shown both the allocations
and places where container_of() gets used, etc. No such luck, thanks to
misguided souls preaching the "robust" uses of sizeof...
Sure, I can find all places in net/sched/*.c where we are declaring
pointers to tcf_proto or get those out of mentionings of fields of that
type. 256 hits total, and of course a lot of those are declarations of
function arguments, which means that the function needs to be read through,
thanks to the possibility of wonders like
tp->next = kmalloc(sizeof(*tp), GFP_KERNEL);
AFAICS, nothing exactly like that exists there, but...
Most of such arguments are thankfully called 'tp', so grepping for
that in there allows to drop such declarations from the list. The list,
of course, grows, but it no longer contains that number of "need to look
through the entire" function hits. That, and some judicious use of search
and replace reduces it to something I'd been able to get through in about
an hour. All instances *are* created by tcf_proto_create(). The same fun-filled
activity has proven (modulo misreadings in that fun, of course) that all
instances are either freed before getting returned by tcf_proto_create() (in
case of ->init() failure) or go out in tcf_proto_destroy() via kfree_rcu(),
after ->ops->destroy() call done to them. And apparently that's the only
caller of ->destroy(), so modulo locking questions (I hadn't even started
to look into that) the answers to all questions in the beginning are
"yes".
Now, I'm fairly used to that kind of digging (and have a bunch of
useful vi macros, search patterns, etc.), so I'd managed to get through all
that. Took me about an hour and a half total. Do you really expect the
newbies to get through that joy? Sure, they (and I) can ask the maintainers,
who would've answered those questions instantly (well, modulo the email
latency, etc.) And for newbies asking that kind of questions is certainly
the right thing to do (or noting the suspected answer down and moving along,
to verify it later). But then the same maintainers have to verify that
this answer doesn't rot - that changes there (and elsewhere - never underestimate
the amount of weirdness cropping up as one-off hack in the bowels of drivers/*)
do not invalidate it? Same search, more or less...
VFS-side I'm trying to enforce "no sizeof(expression), unless it's
sizeof(local_variable)". Not religiously so, but any new instances of
sizeof in there are checked for that (once I get around to that). typeof
is rare as hen's teeth in there, and should bloody remain so, TYVM.
It belongs inside very low-level macros and (almost) nowhere else.
There is a conflict of interests between "I don't give a damn what's
being allocated here, it does get sufficient size for resulting pointer
type and that's all I'm interested in" and "I'm looking for the places where
>this< is allocated". Your variant is firmly on the former side...
^ permalink raw reply
* Re: [PATCH v2 01/17] asm: simd context helper API
From: Jason A. Donenfeld @ 2018-08-26 13:45 UTC (permalink / raw)
To: Thomas Gleixner
Cc: LKML, Netdev, David Miller, Andrew Lutomirski, Greg Kroah-Hartman,
Samuel Neves, linux-arch, Rik van Riel
In-Reply-To: <alpine.DEB.2.21.1808261234240.1195@nanos.tec.linutronix.de>
Hey Thomas,
On Sun, Aug 26, 2018 at 6:10 AM Thomas Gleixner <tglx@linutronix.de> wrote:
> I'm not too fond of this simply because it requires that relax() step in
> all code pathes. I'd rather make that completely transparent by just
> marking the task as FPU using and let the context switch code deal with it
> in case that it gets preempted. I'll let one of my engineers look into
> that next week.
Do you mean to say you intend to make kernel_fpu_end() and
kernel_neon_end() only actually do something upon context switch, but
not when it's actually called? So that multiple calls to
kernel_fpu_begin() and kernel_neon_begin() can be made without
penalty? If so, that'd be great, and I'd certainly prefer this to the
simd_context_t passing. I consider the simd_get/put/relax API a
stopgap measure until something like that is implemented.
Jason
^ permalink raw reply
* Re: [PATCH net] net: sungem: fix rx checksum support
From: mroos @ 2018-08-26 13:14 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, edumazet, netdev, malat, schwab
In-Reply-To: <a043f344-77a7-75e6-ffbb-26f41ddc3108@gmail.com>
> BTW, removing the FCS also means GRO is going to work, finally on this NIC ;)
>
> GRO does not like packets with padding.
As a follow-up, I am seeing hw csum failures on Sun V440 that has
onboard Sun Cassini with sungem driver. First tested version was 4.18
(it happened there once) and now that I tried 4.18+git, it still
happens:
[ 21.563282] libphy: Fixed MDIO Bus: probed
[ 21.617116] cassini: cassini.c:v1.6 (21 May 2008)
[ 21.678962] cassini 0000:00:02.0: enabling device (0144 -> 0146)
[ 21.761931] cassini 0000:00:02.0 eth0: Sun Cassini+ (64bit/66MHz PCI/Cu) Ethernet[6] 00:03:ba:6f:14:39
[ 21.884952] cassini 0003:00:01.0: enabling device (0144 -> 0146)
[ 21.967868] cassini 0003:00:01.0 eth1: Sun Cassini+ (64bit/66MHz PCI/Cu) Ethernet[29] 00:03:ba:6f:14:3a
[...]
[ 54.341212] eth0: hw csum failure
[ 54.384725] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.18.0-12952-g2923b27 #1397
[ 54.483167] Call Trace:
[ 54.515209] [000000000077838c] __skb_checksum_complete+0xcc/0xe0
[ 54.595272] [000000000080fc84] igmp_rcv+0x224/0x920
[ 54.660475] [00000000007ca3d0] ip_local_deliver+0xb0/0x240
[ 54.733675] [00000000007ca5c0] ip_rcv+0x60/0xa0
[ 54.794304] [0000000000781a30] __netif_receive_skb_one_core+0x30/0x60
[ 54.880094] [0000000000782914] process_backlog+0x94/0x140
[ 54.952161] [0000000000788f6c] net_rx_action+0x1ec/0x320
[ 55.023083] [0000000000870de8] __do_softirq+0xc8/0x200
[ 55.091719] [000000000042c4cc] do_softirq_own_stack+0x2c/0x40
[ 55.168362] [00000000004662d8] irq_exit+0xb8/0xe0
[ 55.231266] [0000000000870ac0] handler_irq+0xc0/0x100
[ 55.298756] [00000000004208b4] tl0_irq5+0x14/0x20
[ 55.361670] [000000000042cafc] arch_cpu_idle+0x9c/0xa0
[ 55.447055] [000000000048a254] cpu_startup_entry+0x14/0x40
[ 55.536998] [000000000095f4b4] 0x95f4b4
[ 55.588471] [0000000040000000] 0x40000000
[ 179.780371] eth0: hw csum failure
[ 179.823878] CPU: 3 PID: 0 Comm: swapper/3 Not tainted 4.18.0-12952-g2923b27 #1397
[ 179.922230] Call Trace:
[ 179.954267] [000000000077838c] __skb_checksum_complete+0xcc/0xe0
[ 180.034335] [000000000080fc84] igmp_rcv+0x224/0x920
[ 180.099536] [00000000007ca3d0] ip_local_deliver+0xb0/0x240
[ 180.172740] [00000000007ca5c0] ip_rcv+0x60/0xa0
[ 180.233368] [0000000000781a30] __netif_receive_skb_one_core+0x30/0x60
[ 180.319159] [0000000000782914] process_backlog+0x94/0x140
[ 180.391225] [0000000000788f6c] net_rx_action+0x1ec/0x320
[ 180.462148] [0000000000870de8] __do_softirq+0xc8/0x200
[ 180.530782] [000000000042c4cc] do_softirq_own_stack+0x2c/0x40
[ 180.607422] [00000000004662d8] irq_exit+0xb8/0xe0
[ 180.670331] [0000000000870ac0] handler_irq+0xc0/0x100
[ 180.737822] [00000000004208b4] tl0_irq5+0x14/0x20
[ 180.800735] [000000000042caf8] arch_cpu_idle+0x98/0xa0
[ 180.869373] [0000000000489f60] do_idle+0xe0/0x1c0
[ 180.932281] [000000000048a25c] cpu_startup_entry+0x1c/0x40
[ 181.005491] [000000000098e9b4] start_kernel+0x3b8/0x3c8
--
Meelis Roos (mroos@linux.ee)
^ permalink raw reply
* Re: [PATCH v2 01/17] asm: simd context helper API
From: Rik van Riel @ 2018-08-26 16:53 UTC (permalink / raw)
To: Andy Lutomirski, Thomas Gleixner
Cc: Jason A. Donenfeld, LKML, Netdev, David Miller, Andrew Lutomirski,
Greg Kroah-Hartman, Samuel Neves, linux-arch
In-Reply-To: <2532E417-DDD2-4E2C-9F21-3B8D9B96370D@amacapital.net>
[-- Attachment #1: Type: text/plain, Size: 2105 bytes --]
On Sun, 2018-08-26 at 07:18 -0700, Andy Lutomirski wrote:
> > On Aug 26, 2018, at 7:06 AM, Thomas Gleixner <tglx@linutronix.de>
> > wrote:
> >
> > Jason,
> >
> > > On Sun, 26 Aug 2018, Jason A. Donenfeld wrote:
> > > > On Sun, Aug 26, 2018 at 6:10 AM Thomas Gleixner <
> > > > tglx@linutronix.de> wrote:
> > > > I'm not too fond of this simply because it requires that
> > > > relax() step in
> > > > all code pathes. I'd rather make that completely transparent by
> > > > just
> > > > marking the task as FPU using and let the context switch code
> > > > deal with it
> > > > in case that it gets preempted. I'll let one of my engineers
> > > > look into
> > > > that next week.
> > >
> > > Do you mean to say you intend to make kernel_fpu_end() and
> > > kernel_neon_end() only actually do something upon context switch,
> > > but
> > > not when it's actually called? So that multiple calls to
> > > kernel_fpu_begin() and kernel_neon_begin() can be made without
> > > penalty?
> >
> > On context switch and exit to user. That allows to keep those code
> > pathes
> > fully preemptible. Still twisting my brain around the details.
>
> I think you’ll have to treat exit to user and context switch as
> different things. For exit to user, we want to restore the *user*
> state, but, for context switch, we’ll need to restore *kernel* state.
For non-preemptible kernel_fpu_begin/end (which seems
like a good starting point since it gets the
code halfway to where Thomas would like it to go), the
rules would be a little simpler:
- For exit to userspace, restore the user FPU state.
- At kernel_fpu_begin(), save the user FPU state (if still loaded).
- At context switch time, save the user FPU state (if still loaded).
> Do user first as its own patch set. It’ll be less painful that way.
>
> And someone needs to rework PKRU for this to make sense. See previous
> threads.
I sent Thomas the patches I worked on in the past.
That series is likely incomplete, but should be a
reasonable starting point.
--
All Rights Reversed.
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 488 bytes --]
^ permalink raw reply
* Re: [PATCH v2 01/17] asm: simd context helper API
From: Thomas Gleixner @ 2018-08-26 12:10 UTC (permalink / raw)
To: Jason A. Donenfeld
Cc: LKML, netdev, David Miller, Andy Lutomirski, Greg KH,
Samuel Neves, linux-arch, Rik van Riel
In-Reply-To: <20180824213849.23647-2-Jason@zx2c4.com>
On Fri, 24 Aug 2018, Jason A. Donenfeld wrote:
> Sometimes it's useful to amortize calls to XSAVE/XRSTOR and the related
> FPU/SIMD functions over a number of calls, because FPU restoration is
> quite expensive. This adds a simple header for carrying out this pattern:
>
> simd_context_t simd_context = simd_get();
> while ((item = get_item_from_queue()) != NULL) {
> encrypt_item(item, simd_context);
> simd_context = simd_relax(simd_context);
> }
> simd_put(simd_context);
I'm not too fond of this simply because it requires that relax() step in
all code paths. I'd rather make that completely transparent by just
marking the task as FPU using and let the context switch code deal with it
in case that it gets preempted. I'll let one of my engineers look into
that next week.
Thanks,
tglx
^ permalink raw reply
* Re: [PATCH v2 01/17] asm: simd context helper API
From: Andy Lutomirski @ 2018-08-26 14:25 UTC (permalink / raw)
To: Jason A. Donenfeld
Cc: Thomas Gleixner, LKML, Netdev, David Miller, Andrew Lutomirski,
Greg Kroah-Hartman, Samuel Neves, linux-arch, Rik van Riel
In-Reply-To: <CAHmME9q+JcT9pnZ5jgowf15O4BkVF-2-QkHA2o1ZKbVe4nAg6g@mail.gmail.com>
> On Aug 26, 2018, at 7:18 AM, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
>
> On Sun, Aug 26, 2018 at 8:06 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>>> Do you mean to say you intend to make kernel_fpu_end() and
>>> kernel_neon_end() only actually do something upon context switch, but
>>> not when it's actually called? So that multiple calls to
>>> kernel_fpu_begin() and kernel_neon_begin() can be made without
>>> penalty?
>>
>> On context switch and exit to user. That allows to keep those code pathes
>> fully preemptible. Still twisting my brain around the details.
>
> Just to make sure we're on the same page, the goal is so that this code:
>
> kernel_fpu_begin();
> kernel_fpu_end();
> kernel_fpu_begin();
> kernel_fpu_end();
> kernel_fpu_begin();
> kernel_fpu_end();
> kernel_fpu_begin();
> kernel_fpu_end();
> kernel_fpu_begin();
> kernel_fpu_end();
> kernel_fpu_begin();
> kernel_fpu_end();
> ...
>
> has the same performance as this code:
>
> kernel_fpu_begin();
> kernel_fpu_end();
>
> (Unless of course the process is preempted or the like.)
>
> Currently the present situation makes the performance of the above
> wildly different, since kernel_fpu_end() does something immediately.
>
> What about something like this:
> - Add a tristate flag connected to task_struct (or in the global fpu
> struct in the case that this happens in irq and there isn't a valid
> current).
> - On kernel_fpu_begin(), if the flag is 0, do the usual expensive
> XSAVE stuff, and set the flag to 1.
> - On kernel_fpu_begin(), if the flag is non-0, just set the flag to 1
> and return.
> - On kernel_fpu_end(), if the flag is non-0, set the flag to 2.
> (Otherwise WARN() or BUG() or something.)
> - On context switch / preemption / etc away from the task, if the flag
> is non-0, XRSTOR and such.
It’s not that simple. First, these states need names, at least for thinking about. 0 is “user state in regs”. 1 is “kernel state active”. 2 is “nothing active”.
The actual encoding will be something like TIF_XSTATE_UNLOADED: user state is not in regs. TIF_KERNEL_XSTATE: kernel is using FPU. And this fundamentally doubles the size of struct fpu.
Tglx, that doubling-the-size-of-fpu makes me question the idea of letting the kernel use the fpu while preemptible.
> - On context switch / preemption / etc back to the task, if the flag
> is 1, XSAVE and such. If the flag is 2, set it to 0.
>
> Jason
^ permalink raw reply
* Re: [PATCH v2 01/17] asm: simd context helper API
From: Andy Lutomirski @ 2018-08-26 14:18 UTC (permalink / raw)
To: Thomas Gleixner
Cc: Jason A. Donenfeld, LKML, Netdev, David Miller, Andrew Lutomirski,
Greg Kroah-Hartman, Samuel Neves, linux-arch, Rik van Riel
In-Reply-To: <alpine.DEB.2.21.1808261602250.1195@nanos.tec.linutronix.de>
> On Aug 26, 2018, at 7:06 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
>
> Jason,
>
>> On Sun, 26 Aug 2018, Jason A. Donenfeld wrote:
>>> On Sun, Aug 26, 2018 at 6:10 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>>> I'm not too fond of this simply because it requires that relax() step in
>>> all code pathes. I'd rather make that completely transparent by just
>>> marking the task as FPU using and let the context switch code deal with it
>>> in case that it gets preempted. I'll let one of my engineers look into
>>> that next week.
>>
>> Do you mean to say you intend to make kernel_fpu_end() and
>> kernel_neon_end() only actually do something upon context switch, but
>> not when it's actually called? So that multiple calls to
>> kernel_fpu_begin() and kernel_neon_begin() can be made without
>> penalty?
>
> On context switch and exit to user. That allows to keep those code pathes
> fully preemptible. Still twisting my brain around the details.
I think you’ll have to treat exit to user and context switch as different things. For exit to user, we want to restore the *user* state, but, for context switch, we’ll need to restore *kernel* state.
Do user first as its own patch set. It’ll be less painful that way.
And someone needs to rework PKRU for this to make sense. See previous threads.
^ permalink raw reply
* Re: [PATCH v2 01/17] asm: simd context helper API
From: Jason A. Donenfeld @ 2018-08-26 14:18 UTC (permalink / raw)
To: Thomas Gleixner
Cc: LKML, Netdev, David Miller, Andrew Lutomirski, Greg Kroah-Hartman,
Samuel Neves, linux-arch, Rik van Riel
In-Reply-To: <alpine.DEB.2.21.1808261602250.1195@nanos.tec.linutronix.de>
On Sun, Aug 26, 2018 at 8:06 AM Thomas Gleixner <tglx@linutronix.de> wrote:
> > Do you mean to say you intend to make kernel_fpu_end() and
> > kernel_neon_end() only actually do something upon context switch, but
> > not when it's actually called? So that multiple calls to
> > kernel_fpu_begin() and kernel_neon_begin() can be made without
> > penalty?
>
> On context switch and exit to user. That allows to keep those code pathes
> fully preemptible. Still twisting my brain around the details.
Just to make sure we're on the same page, the goal is so that this code:
kernel_fpu_begin();
kernel_fpu_end();
kernel_fpu_begin();
kernel_fpu_end();
kernel_fpu_begin();
kernel_fpu_end();
kernel_fpu_begin();
kernel_fpu_end();
kernel_fpu_begin();
kernel_fpu_end();
kernel_fpu_begin();
kernel_fpu_end();
...
has the same performance as this code:
kernel_fpu_begin();
kernel_fpu_end();
(Unless of course the process is preempted or the like.)
Currently the present situation makes the performance of the above
wildly different, since kernel_fpu_end() does something immediately.
What about something like this:
- Add a tristate flag connected to task_struct (or in the global fpu
struct in the case that this happens in irq and there isn't a valid
current).
- On kernel_fpu_begin(), if the flag is 0, do the usual expensive
XSAVE stuff, and set the flag to 1.
- On kernel_fpu_begin(), if the flag is non-0, just set the flag to 1
and return.
- On kernel_fpu_end(), if the flag is non-0, set the flag to 2.
(Otherwise WARN() or BUG() or something.)
- On context switch / preemption / etc away from the task, if the flag
is non-0, XRSTOR and such.
- On context switch / preemption / etc back to the task, if the flag
is 1, XSAVE and such. If the flag is 2, set it to 0.
Jason
^ permalink raw reply
* Re: [PATCH v2 01/17] asm: simd context helper API
From: Thomas Gleixner @ 2018-08-26 14:06 UTC (permalink / raw)
To: Jason A. Donenfeld
Cc: LKML, Netdev, David Miller, Andrew Lutomirski, Greg Kroah-Hartman,
Samuel Neves, linux-arch, Rik van Riel
In-Reply-To: <CAHmME9qrc_f6uqGOrRVpnO_sTXHg2cg-WwWg4ik3yqu=HCOZSg@mail.gmail.com>
Jason,
On Sun, 26 Aug 2018, Jason A. Donenfeld wrote:
> On Sun, Aug 26, 2018 at 6:10 AM Thomas Gleixner <tglx@linutronix.de> wrote:
> > I'm not too fond of this simply because it requires that relax() step in
> > all code paths. I'd rather make that completely transparent by just
> > marking the task as FPU using and let the context switch code deal with it
> > in case that it gets preempted. I'll let one of my engineers look into
> > that next week.
>
> Do you mean to say you intend to make kernel_fpu_end() and
> kernel_neon_end() only actually do something upon context switch, but
> not when it's actually called? So that multiple calls to
> kernel_fpu_begin() and kernel_neon_begin() can be made without
> penalty?
On context switch and exit to user. That allows us to keep those code paths
fully preemptible. Still twisting my brain around the details.
> If so, that'd be great, and I'd certainly prefer this to the
> simd_context_t passing. I consider the simd_get/put/relax API a
> stopgap measure until something like that is implemented.
I really want to avoid this stopgap^Wducttape thing.
Thanks,
tglx
^ permalink raw reply
* KASAN: invalid-free in p9stat_free
From: syzbot @ 2018-08-26 13:50 UTC (permalink / raw)
To: asmadeus, davem, ericvh, linux-kernel, lucho, netdev,
syzkaller-bugs, v9fs-developer
Hello,
syzbot found the following crash on:
HEAD commit: e27bc174c9c6 Add linux-next specific files for 20180824
git tree: linux-next
console output: https://syzkaller.appspot.com/x/log.txt?x=15dc19a6400000
kernel config: https://syzkaller.appspot.com/x/.config?x=28446088176757ea
dashboard link: https://syzkaller.appspot.com/bug?extid=d4252148d198410b864f
compiler: gcc (GCC) 8.0.1 20180413 (experimental)
syz repro: https://syzkaller.appspot.com/x/repro.syz?x=15f8efba400000
C reproducer: https://syzkaller.appspot.com/x/repro.c?x=1178256a400000
IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+d4252148d198410b864f@syzkaller.appspotmail.com
random: sshd: uninitialized urandom read (32 bytes read)
random: sshd: uninitialized urandom read (32 bytes read)
random: sshd: uninitialized urandom read (32 bytes read)
random: sshd: uninitialized urandom read (32 bytes read)
==================================================================
BUG: KASAN: double-free or invalid-free in p9stat_free+0x35/0x100
net/9p/protocol.c:48
CPU: 0 PID: 4499 Comm: syz-executor922 Not tainted 4.18.0-next-20180824+ #47
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113
print_address_description+0x6c/0x20b mm/kasan/report.c:256
kasan_report_invalid_free+0x64/0xa0 mm/kasan/report.c:336
__kasan_slab_free+0x150/0x170 mm/kasan/kasan.c:501
kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
__cache_free mm/slab.c:3498 [inline]
kfree+0xd9/0x210 mm/slab.c:3813
p9stat_free+0x35/0x100 net/9p/protocol.c:48
v9fs_dir_readdir+0x68e/0xbc0 fs/9p/vfs_dir.c:153
iterate_dir+0x48b/0x5d0 fs/readdir.c:51
__do_sys_getdents fs/readdir.c:231 [inline]
__se_sys_getdents fs/readdir.c:212 [inline]
__x64_sys_getdents+0x29f/0x510 fs/readdir.c:212
do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x4406a9
Code: 18 89 d0 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 48 89 f8 48 89 f7
48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff
ff 0f 83 fb 13 fc ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007fffc1b13808 EFLAGS: 00000217 ORIG_RAX: 000000000000004e
RAX: ffffffffffffffda RBX: 0030656c69662f2e RCX: 00000000004406a9
RDX: 0000000000000008 RSI: 0000000020000180 RDI: 0000000000000005
RBP: 64663d736e617274 R08: 0000000000401f30 R09: 0000000000401f30
R10: 0000000000401f30 R11: 0000000000000217 R12: 0000000000401f30
R13: 0000000000401fc0 R14: 0000000000000000 R15: 0000000000000000
Allocated by task 4499:
save_stack+0x43/0xd0 mm/kasan/kasan.c:448
set_track mm/kasan/kasan.c:460 [inline]
kasan_kmalloc+0xc4/0xe0 mm/kasan/kasan.c:553
__do_kmalloc mm/slab.c:3718 [inline]
__kmalloc+0x14e/0x720 mm/slab.c:3727
kmalloc include/linux/slab.h:518 [inline]
p9pdu_vreadf net/9p/protocol.c:157 [inline]
p9pdu_readf+0x526/0x2170 net/9p/protocol.c:536
p9pdu_vreadf net/9p/protocol.c:208 [inline]
p9pdu_readf+0xd5c/0x2170 net/9p/protocol.c:536
p9stat_read+0x194/0x5d0 net/9p/protocol.c:565
v9fs_dir_readdir+0x63d/0xbc0 fs/9p/vfs_dir.c:149
iterate_dir+0x48b/0x5d0 fs/readdir.c:51
__do_sys_getdents fs/readdir.c:231 [inline]
__se_sys_getdents fs/readdir.c:212 [inline]
__x64_sys_getdents+0x29f/0x510 fs/readdir.c:212
do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
entry_SYSCALL_64_after_hwframe+0x49/0xbe
Freed by task 4499:
save_stack+0x43/0xd0 mm/kasan/kasan.c:448
set_track mm/kasan/kasan.c:460 [inline]
__kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:521
kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
__cache_free mm/slab.c:3498 [inline]
kfree+0xd9/0x210 mm/slab.c:3813
p9stat_free+0x35/0x100 net/9p/protocol.c:48
p9pdu_vreadf net/9p/protocol.c:220 [inline]
p9pdu_readf+0xd90/0x2170 net/9p/protocol.c:536
p9stat_read+0x194/0x5d0 net/9p/protocol.c:565
v9fs_dir_readdir+0x63d/0xbc0 fs/9p/vfs_dir.c:149
iterate_dir+0x48b/0x5d0 fs/readdir.c:51
__do_sys_getdents fs/readdir.c:231 [inline]
__se_sys_getdents fs/readdir.c:212 [inline]
__x64_sys_getdents+0x29f/0x510 fs/readdir.c:212
do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
entry_SYSCALL_64_after_hwframe+0x49/0xbe
The buggy address belongs to the object at ffff8801b3006700
which belongs to the cache kmalloc-32 of size 32
The buggy address is located 0 bytes inside of
32-byte region [ffff8801b3006700, ffff8801b3006720)
The buggy address belongs to the page:
page:ffffea0006cc0180 count:1 mapcount:0 mapping:ffff8801dac001c0
index:0xffff8801b3006fc1
flags: 0x2fffc0000000100(slab)
raw: 02fffc0000000100 ffff8801dac01238 ffffea0006cc6548 ffff8801dac001c0
raw: ffff8801b3006fc1 ffff8801b3006000 0000000100000037 0000000000000000
page dumped because: kasan: bad access detected
Memory state around the buggy address:
ffff8801b3006600: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc
ffff8801b3006680: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc
> ffff8801b3006700: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc
^
ffff8801b3006780: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc
ffff8801b3006800: fb fb fb fb fc fc fc fc 05 fc fc fc fc fc fc fc
==================================================================
---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.
syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#bug-status-tracking for how to communicate with
syzbot.
syzbot can test patches for this bug, for details see:
https://goo.gl/tpsmEJ#testing-patches
^ permalink raw reply
* Re: [PATCH v2 15/17] zinc: Curve25519 ARM implementation
From: Ard Biesheuvel @ 2018-08-26 13:18 UTC (permalink / raw)
To: Jason A. Donenfeld
Cc: Jean-Philippe Aumasson, <netdev@vger.kernel.org>,
Linux Kernel Mailing List, D . J . Bernstein, Samuel Neves,
open list:HARDWARE RANDOM NUMBER GENERATOR CORE, Andy Lutomirski,
Greg KH, Russell King, David S. Miller, linux-arm-kernel
In-Reply-To: <20180824213849.23647-16-Jason@zx2c4.com>
On 24 August 2018 at 22:38, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> This comes from Dan Bernstein and Peter Schwabe's public domain NEON
> code, and has been modified to be friendly for kernel space.
>
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
> Cc: Andy Lutomirski <luto@kernel.org>
> Cc: Greg KH <gregkh@linuxfoundation.org>
> Cc: Samuel Neves <sneves@dei.uc.pt>
> Cc: D. J. Bernstein <djb@cr.yp.to>
> Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
> Cc: Russell King <linux@armlinux.org.uk>
> Cc: linux-arm-kernel@lists.infradead.org
> Cc: linux-crypto@vger.kernel.org
> ---
> lib/zinc/Makefile | 4 +
> lib/zinc/curve25519/curve25519-arm-glue.h | 33 +
> lib/zinc/curve25519/curve25519-arm.S | 2110 +++++++++++++++++++++
As discussed in the context of patch #2, I have my doubts about the
general approach you are taking. I think this code belongs in arch/arm
somewhere, exposed in a way so it naturally supersedes a dummy
implementation by library search order.
In the meantime, let me comment on the code in this patch.
> 3 files changed, 2147 insertions(+)
> create mode 100644 lib/zinc/curve25519/curve25519-arm-glue.h
> create mode 100644 lib/zinc/curve25519/curve25519-arm.S
>
> diff --git a/lib/zinc/Makefile b/lib/zinc/Makefile
> index b6b0614f8743..13e403780ce0 100644
> --- a/lib/zinc/Makefile
> +++ b/lib/zinc/Makefile
> @@ -52,6 +52,10 @@ endif
>
> ifeq ($(CONFIG_ZINC_CURVE25519),y)
> zinc-y += curve25519/curve25519.o
> +ifeq ($(CONFIG_ARM)$(CONFIG_KERNEL_MODE_NEON),yy)
> +zinc-y += curve25519/curve25519-arm.o
> +CFLAGS_curve25519.o += -include $(srctree)/$(src)/curve25519/curve25519-arm-glue.h
If you put the code below in arch/arm/lib/curve25519.c, you don't need
this clunkiness.
> +endif
> endif
>
> ifeq ($(CONFIG_ZINC_BLAKE2S),y)
> diff --git a/lib/zinc/curve25519/curve25519-arm-glue.h b/lib/zinc/curve25519/curve25519-arm-glue.h
> new file mode 100644
> index 000000000000..1d5c029e9195
> --- /dev/null
> +++ b/lib/zinc/curve25519/curve25519-arm-glue.h
> @@ -0,0 +1,33 @@
> +/* SPDX-License-Identifier: GPL-2.0
> + *
> + * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
> + */
> +
> +#include <zinc/curve25519.h>
> +#include <asm/hwcap.h>
> +#include <asm/neon.h>
> +#include <asm/simd.h>
> +
> +asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE], const u8 basepoint[CURVE25519_POINT_SIZE]);
> +
Please follow the coding style of the kernel.
> +static bool curve25519_use_neon __ro_after_init;
> +
> +void __init curve25519_fpu_init(void)
> +{
> + curve25519_use_neon = elf_hwcap & HWCAP_NEON;
> +}
> +
> +static inline bool curve25519_arch(u8 mypublic[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE], const u8 basepoint[CURVE25519_POINT_SIZE])
> +{
> + if (curve25519_use_neon && may_use_simd()) {
> + kernel_neon_begin();
> + curve25519_neon(mypublic, secret, basepoint);
> + kernel_neon_end();
> + return true;
> + }
> + return false;
> +}
> +
> +static inline bool curve25519_base_arch(u8 pub[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE]) { return false; }
> +
> +#define HAVE_CURVE25519_ARCH_IMPLEMENTATION
> diff --git a/lib/zinc/curve25519/curve25519-arm.S b/lib/zinc/curve25519/curve25519-arm.S
> new file mode 100644
> index 000000000000..2c02e66cc87c
> --- /dev/null
> +++ b/lib/zinc/curve25519/curve25519-arm.S
> @@ -0,0 +1,2110 @@
> +/* SPDX-License-Identifier: GPL-2.0
> + *
> + * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
> + *
> + * Based on public domain code from Daniel J. Bernstein and Peter Schwabe.
> + */
> +
> +#if IS_ENABLED(CONFIG_KERNEL_MODE_NEON)
> +
This #if is redundant - you only build the file if it is true.
> +#include <linux/linkage.h>
> +
> + .text
> + .fpu neon
> + .align 4
> +
> +ENTRY(curve25519_neon)
Could you please indent the asm with two tabs for legibility?
> + vpush {q4,q5,q6,q7}
You can drop this
> + mov r12,sp
> + sub r3,sp,#736
> + and r3,r3,#0xffffffe0
I don't think you need 32 byte alignment, 16 bytes should be sufficient
> + mov sp,r3
> + strd r4,[sp,#0]
> + strd r6,[sp,#8]
> + strd r8,[sp,#16]
> + strd r10,[sp,#24]
> + str r12,[sp,#480]
> + str r14,[sp,#484]
This is a very odd way to create a stack frame.
> + mov r0,r0
> + mov r1,r1
> + mov r2,r2
Drop these
> + add r3,sp,#32
> + ldr r4,=0
> + ldr r5,=254
NEON implies v7, so you can use movw for any 16-bit immediate. No need for ldr.
> + vmov.i32 q0,#1
> + vshr.u64 q1,q0,#7
> + vshr.u64 q0,q0,#8
> + vmov.i32 d4,#19
> + vmov.i32 d5,#38
> + add r6,sp,#512
> + vst1.8 {d2-d3},[r6,: 128]
> + add r6,sp,#528
> + vst1.8 {d0-d1},[r6,: 128]
> + add r6,sp,#544
> + vst1.8 {d4-d5},[r6,: 128]
> + add r6,r3,#0
> + vmov.i32 q2,#0
> + vst1.8 {d4-d5},[r6,: 128]!
> + vst1.8 {d4-d5},[r6,: 128]!
> + vst1.8 d4,[r6,: 64]
> + add r6,r3,#0
> + ldr r7,=960
> + sub r7,r7,#2
> + neg r7,r7
> + sub r7,r7,r7,LSL #7
What is this little dance for?
> + str r7,[r6]
> + add r6,sp,#704
> + vld1.8 {d4-d5},[r1]!
> + vld1.8 {d6-d7},[r1]
> + vst1.8 {d4-d5},[r6,: 128]!
> + vst1.8 {d6-d7},[r6,: 128]
> + sub r1,r6,#16
> + ldrb r6,[r1]
> + and r6,r6,#248
> + strb r6,[r1]
> + ldrb r6,[r1,#31]
> + and r6,r6,#127
> + orr r6,r6,#64
> + strb r6,[r1,#31]
> + vmov.i64 q2,#0xffffffff
> + vshr.u64 q3,q2,#7
> + vshr.u64 q2,q2,#6
> + vld1.8 {d8},[r2]
> + vld1.8 {d10},[r2]
> + add r2,r2,#6
> + vld1.8 {d12},[r2]
> + vld1.8 {d14},[r2]
> + add r2,r2,#6
> + vld1.8 {d16},[r2]
> + add r2,r2,#4
> + vld1.8 {d18},[r2]
> + vld1.8 {d20},[r2]
> + add r2,r2,#6
> + vld1.8 {d22},[r2]
> + add r2,r2,#2
> + vld1.8 {d24},[r2]
> + vld1.8 {d26},[r2]
> + vshr.u64 q5,q5,#26
> + vshr.u64 q6,q6,#3
> + vshr.u64 q7,q7,#29
> + vshr.u64 q8,q8,#6
> + vshr.u64 q10,q10,#25
> + vshr.u64 q11,q11,#3
> + vshr.u64 q12,q12,#12
> + vshr.u64 q13,q13,#38
> + vand q4,q4,q2
> + vand q6,q6,q2
> + vand q8,q8,q2
> + vand q10,q10,q2
> + vand q2,q12,q2
> + vand q5,q5,q3
> + vand q7,q7,q3
> + vand q9,q9,q3
> + vand q11,q11,q3
> + vand q3,q13,q3
> + add r2,r3,#48
> + vadd.i64 q12,q4,q1
> + vadd.i64 q13,q10,q1
> + vshr.s64 q12,q12,#26
> + vshr.s64 q13,q13,#26
> + vadd.i64 q5,q5,q12
> + vshl.i64 q12,q12,#26
> + vadd.i64 q14,q5,q0
> + vadd.i64 q11,q11,q13
> + vshl.i64 q13,q13,#26
> + vadd.i64 q15,q11,q0
> + vsub.i64 q4,q4,q12
> + vshr.s64 q12,q14,#25
> + vsub.i64 q10,q10,q13
> + vshr.s64 q13,q15,#25
> + vadd.i64 q6,q6,q12
> + vshl.i64 q12,q12,#25
> + vadd.i64 q14,q6,q1
> + vadd.i64 q2,q2,q13
> + vsub.i64 q5,q5,q12
> + vshr.s64 q12,q14,#26
> + vshl.i64 q13,q13,#25
> + vadd.i64 q14,q2,q1
> + vadd.i64 q7,q7,q12
> + vshl.i64 q12,q12,#26
> + vadd.i64 q15,q7,q0
> + vsub.i64 q11,q11,q13
> + vshr.s64 q13,q14,#26
> + vsub.i64 q6,q6,q12
> + vshr.s64 q12,q15,#25
> + vadd.i64 q3,q3,q13
> + vshl.i64 q13,q13,#26
> + vadd.i64 q14,q3,q0
> + vadd.i64 q8,q8,q12
> + vshl.i64 q12,q12,#25
> + vadd.i64 q15,q8,q1
> + add r2,r2,#8
> + vsub.i64 q2,q2,q13
> + vshr.s64 q13,q14,#25
> + vsub.i64 q7,q7,q12
> + vshr.s64 q12,q15,#26
> + vadd.i64 q14,q13,q13
> + vadd.i64 q9,q9,q12
> + vtrn.32 d12,d14
> + vshl.i64 q12,q12,#26
> + vtrn.32 d13,d15
> + vadd.i64 q0,q9,q0
> + vadd.i64 q4,q4,q14
> + vst1.8 d12,[r2,: 64]!
> + vshl.i64 q6,q13,#4
> + vsub.i64 q7,q8,q12
> + vshr.s64 q0,q0,#25
> + vadd.i64 q4,q4,q6
> + vadd.i64 q6,q10,q0
> + vshl.i64 q0,q0,#25
> + vadd.i64 q8,q6,q1
> + vadd.i64 q4,q4,q13
> + vshl.i64 q10,q13,#25
> + vadd.i64 q1,q4,q1
> + vsub.i64 q0,q9,q0
> + vshr.s64 q8,q8,#26
> + vsub.i64 q3,q3,q10
> + vtrn.32 d14,d0
> + vshr.s64 q1,q1,#26
> + vtrn.32 d15,d1
> + vadd.i64 q0,q11,q8
> + vst1.8 d14,[r2,: 64]
> + vshl.i64 q7,q8,#26
> + vadd.i64 q5,q5,q1
> + vtrn.32 d4,d6
> + vshl.i64 q1,q1,#26
> + vtrn.32 d5,d7
> + vsub.i64 q3,q6,q7
> + add r2,r2,#16
> + vsub.i64 q1,q4,q1
> + vst1.8 d4,[r2,: 64]
> + vtrn.32 d6,d0
> + vtrn.32 d7,d1
> + sub r2,r2,#8
> + vtrn.32 d2,d10
> + vtrn.32 d3,d11
> + vst1.8 d6,[r2,: 64]
> + sub r2,r2,#24
> + vst1.8 d2,[r2,: 64]
> + add r2,r3,#96
> + vmov.i32 q0,#0
> + vmov.i64 d2,#0xff
> + vmov.i64 d3,#0
> + vshr.u32 q1,q1,#7
> + vst1.8 {d2-d3},[r2,: 128]!
> + vst1.8 {d0-d1},[r2,: 128]!
> + vst1.8 d0,[r2,: 64]
> + add r2,r3,#144
> + vmov.i32 q0,#0
> + vst1.8 {d0-d1},[r2,: 128]!
> + vst1.8 {d0-d1},[r2,: 128]!
> + vst1.8 d0,[r2,: 64]
> + add r2,r3,#240
> + vmov.i32 q0,#0
> + vmov.i64 d2,#0xff
> + vmov.i64 d3,#0
> + vshr.u32 q1,q1,#7
> + vst1.8 {d2-d3},[r2,: 128]!
> + vst1.8 {d0-d1},[r2,: 128]!
> + vst1.8 d0,[r2,: 64]
> + add r2,r3,#48
> + add r6,r3,#192
> + vld1.8 {d0-d1},[r2,: 128]!
> + vld1.8 {d2-d3},[r2,: 128]!
> + vld1.8 {d4},[r2,: 64]
> + vst1.8 {d0-d1},[r6,: 128]!
> + vst1.8 {d2-d3},[r6,: 128]!
> + vst1.8 d4,[r6,: 64]
> + .Lmainloop:
Please put labels in the first column.
> + mov r2,r5,LSR #3
> + and r6,r5,#7
> + ldrb r2,[r1,r2]
> + mov r2,r2,LSR r6
> + and r2,r2,#1
> + str r5,[sp,#488]
> + eor r4,r4,r2
> + str r2,[sp,#492]
> + neg r2,r4
> + add r4,r3,#96
> + add r5,r3,#192
> + add r6,r3,#144
> + vld1.8 {d8-d9},[r4,: 128]!
> + add r7,r3,#240
> + vld1.8 {d10-d11},[r5,: 128]!
> + veor q6,q4,q5
> + vld1.8 {d14-d15},[r6,: 128]!
> + vdup.i32 q8,r2
> + vld1.8 {d18-d19},[r7,: 128]!
> + veor q10,q7,q9
> + vld1.8 {d22-d23},[r4,: 128]!
> + vand q6,q6,q8
> + vld1.8 {d24-d25},[r5,: 128]!
> + vand q10,q10,q8
> + vld1.8 {d26-d27},[r6,: 128]!
> + veor q4,q4,q6
> + vld1.8 {d28-d29},[r7,: 128]!
> + veor q5,q5,q6
> + vld1.8 {d0},[r4,: 64]
> + veor q6,q7,q10
> + vld1.8 {d2},[r5,: 64]
> + veor q7,q9,q10
> + vld1.8 {d4},[r6,: 64]
> + veor q9,q11,q12
> + vld1.8 {d6},[r7,: 64]
> + veor q10,q0,q1
> + sub r2,r4,#32
> + vand q9,q9,q8
> + sub r4,r5,#32
> + vand q10,q10,q8
> + sub r5,r6,#32
> + veor q11,q11,q9
> + sub r6,r7,#32
> + veor q0,q0,q10
> + veor q9,q12,q9
> + veor q1,q1,q10
> + veor q10,q13,q14
> + veor q12,q2,q3
> + vand q10,q10,q8
> + vand q8,q12,q8
> + veor q12,q13,q10
> + veor q2,q2,q8
> + veor q10,q14,q10
> + veor q3,q3,q8
> + vadd.i32 q8,q4,q6
> + vsub.i32 q4,q4,q6
> + vst1.8 {d16-d17},[r2,: 128]!
> + vadd.i32 q6,q11,q12
> + vst1.8 {d8-d9},[r5,: 128]!
> + vsub.i32 q4,q11,q12
> + vst1.8 {d12-d13},[r2,: 128]!
> + vadd.i32 q6,q0,q2
> + vst1.8 {d8-d9},[r5,: 128]!
> + vsub.i32 q0,q0,q2
> + vst1.8 d12,[r2,: 64]
> + vadd.i32 q2,q5,q7
> + vst1.8 d0,[r5,: 64]
> + vsub.i32 q0,q5,q7
> + vst1.8 {d4-d5},[r4,: 128]!
> + vadd.i32 q2,q9,q10
> + vst1.8 {d0-d1},[r6,: 128]!
> + vsub.i32 q0,q9,q10
> + vst1.8 {d4-d5},[r4,: 128]!
> + vadd.i32 q2,q1,q3
> + vst1.8 {d0-d1},[r6,: 128]!
> + vsub.i32 q0,q1,q3
> + vst1.8 d4,[r4,: 64]
> + vst1.8 d0,[r6,: 64]
> + add r2,sp,#544
> + add r4,r3,#96
> + add r5,r3,#144
> + vld1.8 {d0-d1},[r2,: 128]
> + vld1.8 {d2-d3},[r4,: 128]!
> + vld1.8 {d4-d5},[r5,: 128]!
> + vzip.i32 q1,q2
> + vld1.8 {d6-d7},[r4,: 128]!
> + vld1.8 {d8-d9},[r5,: 128]!
> + vshl.i32 q5,q1,#1
> + vzip.i32 q3,q4
> + vshl.i32 q6,q2,#1
> + vld1.8 {d14},[r4,: 64]
> + vshl.i32 q8,q3,#1
> + vld1.8 {d15},[r5,: 64]
> + vshl.i32 q9,q4,#1
> + vmul.i32 d21,d7,d1
> + vtrn.32 d14,d15
> + vmul.i32 q11,q4,q0
> + vmul.i32 q0,q7,q0
> + vmull.s32 q12,d2,d2
> + vmlal.s32 q12,d11,d1
> + vmlal.s32 q12,d12,d0
> + vmlal.s32 q12,d13,d23
> + vmlal.s32 q12,d16,d22
> + vmlal.s32 q12,d7,d21
> + vmull.s32 q10,d2,d11
> + vmlal.s32 q10,d4,d1
> + vmlal.s32 q10,d13,d0
> + vmlal.s32 q10,d6,d23
> + vmlal.s32 q10,d17,d22
> + vmull.s32 q13,d10,d4
> + vmlal.s32 q13,d11,d3
> + vmlal.s32 q13,d13,d1
> + vmlal.s32 q13,d16,d0
> + vmlal.s32 q13,d17,d23
> + vmlal.s32 q13,d8,d22
> + vmull.s32 q1,d10,d5
> + vmlal.s32 q1,d11,d4
> + vmlal.s32 q1,d6,d1
> + vmlal.s32 q1,d17,d0
> + vmlal.s32 q1,d8,d23
> + vmull.s32 q14,d10,d6
> + vmlal.s32 q14,d11,d13
> + vmlal.s32 q14,d4,d4
> + vmlal.s32 q14,d17,d1
> + vmlal.s32 q14,d18,d0
> + vmlal.s32 q14,d9,d23
> + vmull.s32 q11,d10,d7
> + vmlal.s32 q11,d11,d6
> + vmlal.s32 q11,d12,d5
> + vmlal.s32 q11,d8,d1
> + vmlal.s32 q11,d19,d0
> + vmull.s32 q15,d10,d8
> + vmlal.s32 q15,d11,d17
> + vmlal.s32 q15,d12,d6
> + vmlal.s32 q15,d13,d5
> + vmlal.s32 q15,d19,d1
> + vmlal.s32 q15,d14,d0
> + vmull.s32 q2,d10,d9
> + vmlal.s32 q2,d11,d8
> + vmlal.s32 q2,d12,d7
> + vmlal.s32 q2,d13,d6
> + vmlal.s32 q2,d14,d1
> + vmull.s32 q0,d15,d1
> + vmlal.s32 q0,d10,d14
> + vmlal.s32 q0,d11,d19
> + vmlal.s32 q0,d12,d8
> + vmlal.s32 q0,d13,d17
> + vmlal.s32 q0,d6,d6
> + add r2,sp,#512
> + vld1.8 {d18-d19},[r2,: 128]
> + vmull.s32 q3,d16,d7
> + vmlal.s32 q3,d10,d15
> + vmlal.s32 q3,d11,d14
> + vmlal.s32 q3,d12,d9
> + vmlal.s32 q3,d13,d8
> + add r2,sp,#528
> + vld1.8 {d8-d9},[r2,: 128]
> + vadd.i64 q5,q12,q9
> + vadd.i64 q6,q15,q9
> + vshr.s64 q5,q5,#26
> + vshr.s64 q6,q6,#26
> + vadd.i64 q7,q10,q5
> + vshl.i64 q5,q5,#26
> + vadd.i64 q8,q7,q4
> + vadd.i64 q2,q2,q6
> + vshl.i64 q6,q6,#26
> + vadd.i64 q10,q2,q4
> + vsub.i64 q5,q12,q5
> + vshr.s64 q8,q8,#25
> + vsub.i64 q6,q15,q6
> + vshr.s64 q10,q10,#25
> + vadd.i64 q12,q13,q8
> + vshl.i64 q8,q8,#25
> + vadd.i64 q13,q12,q9
> + vadd.i64 q0,q0,q10
> + vsub.i64 q7,q7,q8
> + vshr.s64 q8,q13,#26
> + vshl.i64 q10,q10,#25
> + vadd.i64 q13,q0,q9
> + vadd.i64 q1,q1,q8
> + vshl.i64 q8,q8,#26
> + vadd.i64 q15,q1,q4
> + vsub.i64 q2,q2,q10
> + vshr.s64 q10,q13,#26
> + vsub.i64 q8,q12,q8
> + vshr.s64 q12,q15,#25
> + vadd.i64 q3,q3,q10
> + vshl.i64 q10,q10,#26
> + vadd.i64 q13,q3,q4
> + vadd.i64 q14,q14,q12
> + add r2,r3,#288
> + vshl.i64 q12,q12,#25
> + add r4,r3,#336
> + vadd.i64 q15,q14,q9
> + add r2,r2,#8
> + vsub.i64 q0,q0,q10
> + add r4,r4,#8
> + vshr.s64 q10,q13,#25
> + vsub.i64 q1,q1,q12
> + vshr.s64 q12,q15,#26
> + vadd.i64 q13,q10,q10
> + vadd.i64 q11,q11,q12
> + vtrn.32 d16,d2
> + vshl.i64 q12,q12,#26
> + vtrn.32 d17,d3
> + vadd.i64 q1,q11,q4
> + vadd.i64 q4,q5,q13
> + vst1.8 d16,[r2,: 64]!
> + vshl.i64 q5,q10,#4
> + vst1.8 d17,[r4,: 64]!
> + vsub.i64 q8,q14,q12
> + vshr.s64 q1,q1,#25
> + vadd.i64 q4,q4,q5
> + vadd.i64 q5,q6,q1
> + vshl.i64 q1,q1,#25
> + vadd.i64 q6,q5,q9
> + vadd.i64 q4,q4,q10
> + vshl.i64 q10,q10,#25
> + vadd.i64 q9,q4,q9
> + vsub.i64 q1,q11,q1
> + vshr.s64 q6,q6,#26
> + vsub.i64 q3,q3,q10
> + vtrn.32 d16,d2
> + vshr.s64 q9,q9,#26
> + vtrn.32 d17,d3
> + vadd.i64 q1,q2,q6
> + vst1.8 d16,[r2,: 64]
> + vshl.i64 q2,q6,#26
> + vst1.8 d17,[r4,: 64]
> + vadd.i64 q6,q7,q9
> + vtrn.32 d0,d6
> + vshl.i64 q7,q9,#26
> + vtrn.32 d1,d7
> + vsub.i64 q2,q5,q2
> + add r2,r2,#16
> + vsub.i64 q3,q4,q7
> + vst1.8 d0,[r2,: 64]
> + add r4,r4,#16
> + vst1.8 d1,[r4,: 64]
> + vtrn.32 d4,d2
> + vtrn.32 d5,d3
> + sub r2,r2,#8
> + sub r4,r4,#8
> + vtrn.32 d6,d12
> + vtrn.32 d7,d13
> + vst1.8 d4,[r2,: 64]
> + vst1.8 d5,[r4,: 64]
> + sub r2,r2,#24
> + sub r4,r4,#24
> + vst1.8 d6,[r2,: 64]
> + vst1.8 d7,[r4,: 64]
> + add r2,r3,#240
> + add r4,r3,#96
> + vld1.8 {d0-d1},[r4,: 128]!
> + vld1.8 {d2-d3},[r4,: 128]!
> + vld1.8 {d4},[r4,: 64]
> + add r4,r3,#144
> + vld1.8 {d6-d7},[r4,: 128]!
> + vtrn.32 q0,q3
> + vld1.8 {d8-d9},[r4,: 128]!
> + vshl.i32 q5,q0,#4
> + vtrn.32 q1,q4
> + vshl.i32 q6,q3,#4
> + vadd.i32 q5,q5,q0
> + vadd.i32 q6,q6,q3
> + vshl.i32 q7,q1,#4
> + vld1.8 {d5},[r4,: 64]
> + vshl.i32 q8,q4,#4
> + vtrn.32 d4,d5
> + vadd.i32 q7,q7,q1
> + vadd.i32 q8,q8,q4
> + vld1.8 {d18-d19},[r2,: 128]!
> + vshl.i32 q10,q2,#4
> + vld1.8 {d22-d23},[r2,: 128]!
> + vadd.i32 q10,q10,q2
> + vld1.8 {d24},[r2,: 64]
> + vadd.i32 q5,q5,q0
> + add r2,r3,#192
> + vld1.8 {d26-d27},[r2,: 128]!
> + vadd.i32 q6,q6,q3
> + vld1.8 {d28-d29},[r2,: 128]!
> + vadd.i32 q8,q8,q4
> + vld1.8 {d25},[r2,: 64]
> + vadd.i32 q10,q10,q2
> + vtrn.32 q9,q13
> + vadd.i32 q7,q7,q1
> + vadd.i32 q5,q5,q0
> + vtrn.32 q11,q14
> + vadd.i32 q6,q6,q3
> + add r2,sp,#560
> + vadd.i32 q10,q10,q2
> + vtrn.32 d24,d25
> + vst1.8 {d12-d13},[r2,: 128]
> + vshl.i32 q6,q13,#1
> + add r2,sp,#576
> + vst1.8 {d20-d21},[r2,: 128]
> + vshl.i32 q10,q14,#1
> + add r2,sp,#592
> + vst1.8 {d12-d13},[r2,: 128]
> + vshl.i32 q15,q12,#1
> + vadd.i32 q8,q8,q4
> + vext.32 d10,d31,d30,#0
> + vadd.i32 q7,q7,q1
> + add r2,sp,#608
> + vst1.8 {d16-d17},[r2,: 128]
> + vmull.s32 q8,d18,d5
> + vmlal.s32 q8,d26,d4
> + vmlal.s32 q8,d19,d9
> + vmlal.s32 q8,d27,d3
> + vmlal.s32 q8,d22,d8
> + vmlal.s32 q8,d28,d2
> + vmlal.s32 q8,d23,d7
> + vmlal.s32 q8,d29,d1
> + vmlal.s32 q8,d24,d6
> + vmlal.s32 q8,d25,d0
> + add r2,sp,#624
> + vst1.8 {d14-d15},[r2,: 128]
> + vmull.s32 q2,d18,d4
> + vmlal.s32 q2,d12,d9
> + vmlal.s32 q2,d13,d8
> + vmlal.s32 q2,d19,d3
> + vmlal.s32 q2,d22,d2
> + vmlal.s32 q2,d23,d1
> + vmlal.s32 q2,d24,d0
> + add r2,sp,#640
> + vst1.8 {d20-d21},[r2,: 128]
> + vmull.s32 q7,d18,d9
> + vmlal.s32 q7,d26,d3
> + vmlal.s32 q7,d19,d8
> + vmlal.s32 q7,d27,d2
> + vmlal.s32 q7,d22,d7
> + vmlal.s32 q7,d28,d1
> + vmlal.s32 q7,d23,d6
> + vmlal.s32 q7,d29,d0
> + add r2,sp,#656
> + vst1.8 {d10-d11},[r2,: 128]
> + vmull.s32 q5,d18,d3
> + vmlal.s32 q5,d19,d2
> + vmlal.s32 q5,d22,d1
> + vmlal.s32 q5,d23,d0
> + vmlal.s32 q5,d12,d8
> + add r2,sp,#672
> + vst1.8 {d16-d17},[r2,: 128]
> + vmull.s32 q4,d18,d8
> + vmlal.s32 q4,d26,d2
> + vmlal.s32 q4,d19,d7
> + vmlal.s32 q4,d27,d1
> + vmlal.s32 q4,d22,d6
> + vmlal.s32 q4,d28,d0
> + vmull.s32 q8,d18,d7
> + vmlal.s32 q8,d26,d1
> + vmlal.s32 q8,d19,d6
> + vmlal.s32 q8,d27,d0
> + add r2,sp,#576
> + vld1.8 {d20-d21},[r2,: 128]
> + vmlal.s32 q7,d24,d21
> + vmlal.s32 q7,d25,d20
> + vmlal.s32 q4,d23,d21
> + vmlal.s32 q4,d29,d20
> + vmlal.s32 q8,d22,d21
> + vmlal.s32 q8,d28,d20
> + vmlal.s32 q5,d24,d20
> + add r2,sp,#576
> + vst1.8 {d14-d15},[r2,: 128]
> + vmull.s32 q7,d18,d6
> + vmlal.s32 q7,d26,d0
> + add r2,sp,#656
> + vld1.8 {d30-d31},[r2,: 128]
> + vmlal.s32 q2,d30,d21
> + vmlal.s32 q7,d19,d21
> + vmlal.s32 q7,d27,d20
> + add r2,sp,#624
> + vld1.8 {d26-d27},[r2,: 128]
> + vmlal.s32 q4,d25,d27
> + vmlal.s32 q8,d29,d27
> + vmlal.s32 q8,d25,d26
> + vmlal.s32 q7,d28,d27
> + vmlal.s32 q7,d29,d26
> + add r2,sp,#608
> + vld1.8 {d28-d29},[r2,: 128]
> + vmlal.s32 q4,d24,d29
> + vmlal.s32 q8,d23,d29
> + vmlal.s32 q8,d24,d28
> + vmlal.s32 q7,d22,d29
> + vmlal.s32 q7,d23,d28
> + add r2,sp,#608
> + vst1.8 {d8-d9},[r2,: 128]
> + add r2,sp,#560
> + vld1.8 {d8-d9},[r2,: 128]
> + vmlal.s32 q7,d24,d9
> + vmlal.s32 q7,d25,d31
> + vmull.s32 q1,d18,d2
> + vmlal.s32 q1,d19,d1
> + vmlal.s32 q1,d22,d0
> + vmlal.s32 q1,d24,d27
> + vmlal.s32 q1,d23,d20
> + vmlal.s32 q1,d12,d7
> + vmlal.s32 q1,d13,d6
> + vmull.s32 q6,d18,d1
> + vmlal.s32 q6,d19,d0
> + vmlal.s32 q6,d23,d27
> + vmlal.s32 q6,d22,d20
> + vmlal.s32 q6,d24,d26
> + vmull.s32 q0,d18,d0
> + vmlal.s32 q0,d22,d27
> + vmlal.s32 q0,d23,d26
> + vmlal.s32 q0,d24,d31
> + vmlal.s32 q0,d19,d20
> + add r2,sp,#640
> + vld1.8 {d18-d19},[r2,: 128]
> + vmlal.s32 q2,d18,d7
> + vmlal.s32 q2,d19,d6
> + vmlal.s32 q5,d18,d6
> + vmlal.s32 q5,d19,d21
> + vmlal.s32 q1,d18,d21
> + vmlal.s32 q1,d19,d29
> + vmlal.s32 q0,d18,d28
> + vmlal.s32 q0,d19,d9
> + vmlal.s32 q6,d18,d29
> + vmlal.s32 q6,d19,d28
> + add r2,sp,#592
> + vld1.8 {d18-d19},[r2,: 128]
> + add r2,sp,#512
> + vld1.8 {d22-d23},[r2,: 128]
> + vmlal.s32 q5,d19,d7
> + vmlal.s32 q0,d18,d21
> + vmlal.s32 q0,d19,d29
> + vmlal.s32 q6,d18,d6
> + add r2,sp,#528
> + vld1.8 {d6-d7},[r2,: 128]
> + vmlal.s32 q6,d19,d21
> + add r2,sp,#576
> + vld1.8 {d18-d19},[r2,: 128]
> + vmlal.s32 q0,d30,d8
> + add r2,sp,#672
> + vld1.8 {d20-d21},[r2,: 128]
> + vmlal.s32 q5,d30,d29
> + add r2,sp,#608
> + vld1.8 {d24-d25},[r2,: 128]
> + vmlal.s32 q1,d30,d28
> + vadd.i64 q13,q0,q11
> + vadd.i64 q14,q5,q11
> + vmlal.s32 q6,d30,d9
> + vshr.s64 q4,q13,#26
> + vshr.s64 q13,q14,#26
> + vadd.i64 q7,q7,q4
> + vshl.i64 q4,q4,#26
> + vadd.i64 q14,q7,q3
> + vadd.i64 q9,q9,q13
> + vshl.i64 q13,q13,#26
> + vadd.i64 q15,q9,q3
> + vsub.i64 q0,q0,q4
> + vshr.s64 q4,q14,#25
> + vsub.i64 q5,q5,q13
> + vshr.s64 q13,q15,#25
> + vadd.i64 q6,q6,q4
> + vshl.i64 q4,q4,#25
> + vadd.i64 q14,q6,q11
> + vadd.i64 q2,q2,q13
> + vsub.i64 q4,q7,q4
> + vshr.s64 q7,q14,#26
> + vshl.i64 q13,q13,#25
> + vadd.i64 q14,q2,q11
> + vadd.i64 q8,q8,q7
> + vshl.i64 q7,q7,#26
> + vadd.i64 q15,q8,q3
> + vsub.i64 q9,q9,q13
> + vshr.s64 q13,q14,#26
> + vsub.i64 q6,q6,q7
> + vshr.s64 q7,q15,#25
> + vadd.i64 q10,q10,q13
> + vshl.i64 q13,q13,#26
> + vadd.i64 q14,q10,q3
> + vadd.i64 q1,q1,q7
> + add r2,r3,#144
> + vshl.i64 q7,q7,#25
> + add r4,r3,#96
> + vadd.i64 q15,q1,q11
> + add r2,r2,#8
> + vsub.i64 q2,q2,q13
> + add r4,r4,#8
> + vshr.s64 q13,q14,#25
> + vsub.i64 q7,q8,q7
> + vshr.s64 q8,q15,#26
> + vadd.i64 q14,q13,q13
> + vadd.i64 q12,q12,q8
> + vtrn.32 d12,d14
> + vshl.i64 q8,q8,#26
> + vtrn.32 d13,d15
> + vadd.i64 q3,q12,q3
> + vadd.i64 q0,q0,q14
> + vst1.8 d12,[r2,: 64]!
> + vshl.i64 q7,q13,#4
> + vst1.8 d13,[r4,: 64]!
> + vsub.i64 q1,q1,q8
> + vshr.s64 q3,q3,#25
> + vadd.i64 q0,q0,q7
> + vadd.i64 q5,q5,q3
> + vshl.i64 q3,q3,#25
> + vadd.i64 q6,q5,q11
> + vadd.i64 q0,q0,q13
> + vshl.i64 q7,q13,#25
> + vadd.i64 q8,q0,q11
> + vsub.i64 q3,q12,q3
> + vshr.s64 q6,q6,#26
> + vsub.i64 q7,q10,q7
> + vtrn.32 d2,d6
> + vshr.s64 q8,q8,#26
> + vtrn.32 d3,d7
> + vadd.i64 q3,q9,q6
> + vst1.8 d2,[r2,: 64]
> + vshl.i64 q6,q6,#26
> + vst1.8 d3,[r4,: 64]
> + vadd.i64 q1,q4,q8
> + vtrn.32 d4,d14
> + vshl.i64 q4,q8,#26
> + vtrn.32 d5,d15
> + vsub.i64 q5,q5,q6
> + add r2,r2,#16
> + vsub.i64 q0,q0,q4
> + vst1.8 d4,[r2,: 64]
> + add r4,r4,#16
> + vst1.8 d5,[r4,: 64]
> + vtrn.32 d10,d6
> + vtrn.32 d11,d7
> + sub r2,r2,#8
> + sub r4,r4,#8
> + vtrn.32 d0,d2
> + vtrn.32 d1,d3
> + vst1.8 d10,[r2,: 64]
> + vst1.8 d11,[r4,: 64]
> + sub r2,r2,#24
> + sub r4,r4,#24
> + vst1.8 d0,[r2,: 64]
> + vst1.8 d1,[r4,: 64]
> + add r2,r3,#288
> + add r4,r3,#336
> + vld1.8 {d0-d1},[r2,: 128]!
> + vld1.8 {d2-d3},[r4,: 128]!
> + vsub.i32 q0,q0,q1
> + vld1.8 {d2-d3},[r2,: 128]!
> + vld1.8 {d4-d5},[r4,: 128]!
> + vsub.i32 q1,q1,q2
> + add r5,r3,#240
> + vld1.8 {d4},[r2,: 64]
> + vld1.8 {d6},[r4,: 64]
> + vsub.i32 q2,q2,q3
> + vst1.8 {d0-d1},[r5,: 128]!
> + vst1.8 {d2-d3},[r5,: 128]!
> + vst1.8 d4,[r5,: 64]
> + add r2,r3,#144
> + add r4,r3,#96
> + add r5,r3,#144
> + add r6,r3,#192
> + vld1.8 {d0-d1},[r2,: 128]!
> + vld1.8 {d2-d3},[r4,: 128]!
> + vsub.i32 q2,q0,q1
> + vadd.i32 q0,q0,q1
> + vld1.8 {d2-d3},[r2,: 128]!
> + vld1.8 {d6-d7},[r4,: 128]!
> + vsub.i32 q4,q1,q3
> + vadd.i32 q1,q1,q3
> + vld1.8 {d6},[r2,: 64]
> + vld1.8 {d10},[r4,: 64]
> + vsub.i32 q6,q3,q5
> + vadd.i32 q3,q3,q5
> + vst1.8 {d4-d5},[r5,: 128]!
> + vst1.8 {d0-d1},[r6,: 128]!
> + vst1.8 {d8-d9},[r5,: 128]!
> + vst1.8 {d2-d3},[r6,: 128]!
> + vst1.8 d12,[r5,: 64]
> + vst1.8 d6,[r6,: 64]
> + add r2,r3,#0
> + add r4,r3,#240
> + vld1.8 {d0-d1},[r4,: 128]!
> + vld1.8 {d2-d3},[r4,: 128]!
> + vld1.8 {d4},[r4,: 64]
> + add r4,r3,#336
> + vld1.8 {d6-d7},[r4,: 128]!
> + vtrn.32 q0,q3
> + vld1.8 {d8-d9},[r4,: 128]!
> + vshl.i32 q5,q0,#4
> + vtrn.32 q1,q4
> + vshl.i32 q6,q3,#4
> + vadd.i32 q5,q5,q0
> + vadd.i32 q6,q6,q3
> + vshl.i32 q7,q1,#4
> + vld1.8 {d5},[r4,: 64]
> + vshl.i32 q8,q4,#4
> + vtrn.32 d4,d5
> + vadd.i32 q7,q7,q1
> + vadd.i32 q8,q8,q4
> + vld1.8 {d18-d19},[r2,: 128]!
> + vshl.i32 q10,q2,#4
> + vld1.8 {d22-d23},[r2,: 128]!
> + vadd.i32 q10,q10,q2
> + vld1.8 {d24},[r2,: 64]
> + vadd.i32 q5,q5,q0
> + add r2,r3,#288
> + vld1.8 {d26-d27},[r2,: 128]!
> + vadd.i32 q6,q6,q3
> + vld1.8 {d28-d29},[r2,: 128]!
> + vadd.i32 q8,q8,q4
> + vld1.8 {d25},[r2,: 64]
> + vadd.i32 q10,q10,q2
> + vtrn.32 q9,q13
> + vadd.i32 q7,q7,q1
> + vadd.i32 q5,q5,q0
> + vtrn.32 q11,q14
> + vadd.i32 q6,q6,q3
> + add r2,sp,#560
> + vadd.i32 q10,q10,q2
> + vtrn.32 d24,d25
> + vst1.8 {d12-d13},[r2,: 128]
> + vshl.i32 q6,q13,#1
> + add r2,sp,#576
> + vst1.8 {d20-d21},[r2,: 128]
> + vshl.i32 q10,q14,#1
> + add r2,sp,#592
> + vst1.8 {d12-d13},[r2,: 128]
> + vshl.i32 q15,q12,#1
> + vadd.i32 q8,q8,q4
> + vext.32 d10,d31,d30,#0
> + vadd.i32 q7,q7,q1
> + add r2,sp,#608
> + vst1.8 {d16-d17},[r2,: 128]
> + vmull.s32 q8,d18,d5
> + vmlal.s32 q8,d26,d4
> + vmlal.s32 q8,d19,d9
> + vmlal.s32 q8,d27,d3
> + vmlal.s32 q8,d22,d8
> + vmlal.s32 q8,d28,d2
> + vmlal.s32 q8,d23,d7
> + vmlal.s32 q8,d29,d1
> + vmlal.s32 q8,d24,d6
> + vmlal.s32 q8,d25,d0
> + add r2,sp,#624
> + vst1.8 {d14-d15},[r2,: 128]
> + vmull.s32 q2,d18,d4
> + vmlal.s32 q2,d12,d9
> + vmlal.s32 q2,d13,d8
> + vmlal.s32 q2,d19,d3
> + vmlal.s32 q2,d22,d2
> + vmlal.s32 q2,d23,d1
> + vmlal.s32 q2,d24,d0
> + add r2,sp,#640
> + vst1.8 {d20-d21},[r2,: 128]
> + vmull.s32 q7,d18,d9
> + vmlal.s32 q7,d26,d3
> + vmlal.s32 q7,d19,d8
> + vmlal.s32 q7,d27,d2
> + vmlal.s32 q7,d22,d7
> + vmlal.s32 q7,d28,d1
> + vmlal.s32 q7,d23,d6
> + vmlal.s32 q7,d29,d0
> + add r2,sp,#656
> + vst1.8 {d10-d11},[r2,: 128]
> + vmull.s32 q5,d18,d3
> + vmlal.s32 q5,d19,d2
> + vmlal.s32 q5,d22,d1
> + vmlal.s32 q5,d23,d0
> + vmlal.s32 q5,d12,d8
> + add r2,sp,#672
> + vst1.8 {d16-d17},[r2,: 128]
> + vmull.s32 q4,d18,d8
> + vmlal.s32 q4,d26,d2
> + vmlal.s32 q4,d19,d7
> + vmlal.s32 q4,d27,d1
> + vmlal.s32 q4,d22,d6
> + vmlal.s32 q4,d28,d0
> + vmull.s32 q8,d18,d7
> + vmlal.s32 q8,d26,d1
> + vmlal.s32 q8,d19,d6
> + vmlal.s32 q8,d27,d0
> + add r2,sp,#576
> + vld1.8 {d20-d21},[r2,: 128]
> + vmlal.s32 q7,d24,d21
> + vmlal.s32 q7,d25,d20
> + vmlal.s32 q4,d23,d21
> + vmlal.s32 q4,d29,d20
> + vmlal.s32 q8,d22,d21
> + vmlal.s32 q8,d28,d20
> + vmlal.s32 q5,d24,d20
> + add r2,sp,#576
> + vst1.8 {d14-d15},[r2,: 128]
> + vmull.s32 q7,d18,d6
> + vmlal.s32 q7,d26,d0
> + add r2,sp,#656
> + vld1.8 {d30-d31},[r2,: 128]
> + vmlal.s32 q2,d30,d21
> + vmlal.s32 q7,d19,d21
> + vmlal.s32 q7,d27,d20
> + add r2,sp,#624
> + vld1.8 {d26-d27},[r2,: 128]
> + vmlal.s32 q4,d25,d27
> + vmlal.s32 q8,d29,d27
> + vmlal.s32 q8,d25,d26
> + vmlal.s32 q7,d28,d27
> + vmlal.s32 q7,d29,d26
> + add r2,sp,#608
> + vld1.8 {d28-d29},[r2,: 128]
> + vmlal.s32 q4,d24,d29
> + vmlal.s32 q8,d23,d29
> + vmlal.s32 q8,d24,d28
> + vmlal.s32 q7,d22,d29
> + vmlal.s32 q7,d23,d28
> + add r2,sp,#608
> + vst1.8 {d8-d9},[r2,: 128]
> + add r2,sp,#560
> + vld1.8 {d8-d9},[r2,: 128]
> + vmlal.s32 q7,d24,d9
> + vmlal.s32 q7,d25,d31
> + vmull.s32 q1,d18,d2
> + vmlal.s32 q1,d19,d1
> + vmlal.s32 q1,d22,d0
> + vmlal.s32 q1,d24,d27
> + vmlal.s32 q1,d23,d20
> + vmlal.s32 q1,d12,d7
> + vmlal.s32 q1,d13,d6
> + vmull.s32 q6,d18,d1
> + vmlal.s32 q6,d19,d0
> + vmlal.s32 q6,d23,d27
> + vmlal.s32 q6,d22,d20
> + vmlal.s32 q6,d24,d26
> + vmull.s32 q0,d18,d0
> + vmlal.s32 q0,d22,d27
> + vmlal.s32 q0,d23,d26
> + vmlal.s32 q0,d24,d31
> + vmlal.s32 q0,d19,d20
> + add r2,sp,#640
> + vld1.8 {d18-d19},[r2,: 128]
> + vmlal.s32 q2,d18,d7
> + vmlal.s32 q2,d19,d6
> + vmlal.s32 q5,d18,d6
> + vmlal.s32 q5,d19,d21
> + vmlal.s32 q1,d18,d21
> + vmlal.s32 q1,d19,d29
> + vmlal.s32 q0,d18,d28
> + vmlal.s32 q0,d19,d9
> + vmlal.s32 q6,d18,d29
> + vmlal.s32 q6,d19,d28
> + add r2,sp,#592
> + vld1.8 {d18-d19},[r2,: 128]
> + add r2,sp,#512
> + vld1.8 {d22-d23},[r2,: 128]
> + vmlal.s32 q5,d19,d7
> + vmlal.s32 q0,d18,d21
> + vmlal.s32 q0,d19,d29
> + vmlal.s32 q6,d18,d6
> + add r2,sp,#528
> + vld1.8 {d6-d7},[r2,: 128]
> + vmlal.s32 q6,d19,d21
> + add r2,sp,#576
> + vld1.8 {d18-d19},[r2,: 128]
> + vmlal.s32 q0,d30,d8
> + add r2,sp,#672
> + vld1.8 {d20-d21},[r2,: 128]
> + vmlal.s32 q5,d30,d29
> + add r2,sp,#608
> + vld1.8 {d24-d25},[r2,: 128]
> + vmlal.s32 q1,d30,d28
> + vadd.i64 q13,q0,q11
> + vadd.i64 q14,q5,q11
> + vmlal.s32 q6,d30,d9
> + vshr.s64 q4,q13,#26
> + vshr.s64 q13,q14,#26
> + vadd.i64 q7,q7,q4
> + vshl.i64 q4,q4,#26
> + vadd.i64 q14,q7,q3
> + vadd.i64 q9,q9,q13
> + vshl.i64 q13,q13,#26
> + vadd.i64 q15,q9,q3
> + vsub.i64 q0,q0,q4
> + vshr.s64 q4,q14,#25
> + vsub.i64 q5,q5,q13
> + vshr.s64 q13,q15,#25
> + vadd.i64 q6,q6,q4
> + vshl.i64 q4,q4,#25
> + vadd.i64 q14,q6,q11
> + vadd.i64 q2,q2,q13
> + vsub.i64 q4,q7,q4
> + vshr.s64 q7,q14,#26
> + vshl.i64 q13,q13,#25
> + vadd.i64 q14,q2,q11
> + vadd.i64 q8,q8,q7
> + vshl.i64 q7,q7,#26
> + vadd.i64 q15,q8,q3
> + vsub.i64 q9,q9,q13
> + vshr.s64 q13,q14,#26
> + vsub.i64 q6,q6,q7
> + vshr.s64 q7,q15,#25
> + vadd.i64 q10,q10,q13
> + vshl.i64 q13,q13,#26
> + vadd.i64 q14,q10,q3
> + vadd.i64 q1,q1,q7
> + add r2,r3,#288
> + vshl.i64 q7,q7,#25
> + add r4,r3,#96
> + vadd.i64 q15,q1,q11
> + add r2,r2,#8
> + vsub.i64 q2,q2,q13
> + add r4,r4,#8
> + vshr.s64 q13,q14,#25
> + vsub.i64 q7,q8,q7
> + vshr.s64 q8,q15,#26
> + vadd.i64 q14,q13,q13
> + vadd.i64 q12,q12,q8
> + vtrn.32 d12,d14
> + vshl.i64 q8,q8,#26
> + vtrn.32 d13,d15
> + vadd.i64 q3,q12,q3
> + vadd.i64 q0,q0,q14
> + vst1.8 d12,[r2,: 64]!
> + vshl.i64 q7,q13,#4
> + vst1.8 d13,[r4,: 64]!
> + vsub.i64 q1,q1,q8
> + vshr.s64 q3,q3,#25
> + vadd.i64 q0,q0,q7
> + vadd.i64 q5,q5,q3
> + vshl.i64 q3,q3,#25
> + vadd.i64 q6,q5,q11
> + vadd.i64 q0,q0,q13
> + vshl.i64 q7,q13,#25
> + vadd.i64 q8,q0,q11
> + vsub.i64 q3,q12,q3
> + vshr.s64 q6,q6,#26
> + vsub.i64 q7,q10,q7
> + vtrn.32 d2,d6
> + vshr.s64 q8,q8,#26
> + vtrn.32 d3,d7
> + vadd.i64 q3,q9,q6
> + vst1.8 d2,[r2,: 64]
> + vshl.i64 q6,q6,#26
> + vst1.8 d3,[r4,: 64]
> + vadd.i64 q1,q4,q8
> + vtrn.32 d4,d14
> + vshl.i64 q4,q8,#26
> + vtrn.32 d5,d15
> + vsub.i64 q5,q5,q6
> + add r2,r2,#16
> + vsub.i64 q0,q0,q4
> + vst1.8 d4,[r2,: 64]
> + add r4,r4,#16
> + vst1.8 d5,[r4,: 64]
> + vtrn.32 d10,d6
> + vtrn.32 d11,d7
> + sub r2,r2,#8
> + sub r4,r4,#8
> + vtrn.32 d0,d2
> + vtrn.32 d1,d3
> + vst1.8 d10,[r2,: 64]
> + vst1.8 d11,[r4,: 64]
> + sub r2,r2,#24
> + sub r4,r4,#24
> + vst1.8 d0,[r2,: 64]
> + vst1.8 d1,[r4,: 64]
> + add r2,sp,#544
> + add r4,r3,#144
> + add r5,r3,#192
> + vld1.8 {d0-d1},[r2,: 128]
> + vld1.8 {d2-d3},[r4,: 128]!
> + vld1.8 {d4-d5},[r5,: 128]!
> + vzip.i32 q1,q2
> + vld1.8 {d6-d7},[r4,: 128]!
> + vld1.8 {d8-d9},[r5,: 128]!
> + vshl.i32 q5,q1,#1
> + vzip.i32 q3,q4
> + vshl.i32 q6,q2,#1
> + vld1.8 {d14},[r4,: 64]
> + vshl.i32 q8,q3,#1
> + vld1.8 {d15},[r5,: 64]
> + vshl.i32 q9,q4,#1
> + vmul.i32 d21,d7,d1
> + vtrn.32 d14,d15
> + vmul.i32 q11,q4,q0
> + vmul.i32 q0,q7,q0
> + vmull.s32 q12,d2,d2
> + vmlal.s32 q12,d11,d1
> + vmlal.s32 q12,d12,d0
> + vmlal.s32 q12,d13,d23
> + vmlal.s32 q12,d16,d22
> + vmlal.s32 q12,d7,d21
> + vmull.s32 q10,d2,d11
> + vmlal.s32 q10,d4,d1
> + vmlal.s32 q10,d13,d0
> + vmlal.s32 q10,d6,d23
> + vmlal.s32 q10,d17,d22
> + vmull.s32 q13,d10,d4
> + vmlal.s32 q13,d11,d3
> + vmlal.s32 q13,d13,d1
> + vmlal.s32 q13,d16,d0
> + vmlal.s32 q13,d17,d23
> + vmlal.s32 q13,d8,d22
> + vmull.s32 q1,d10,d5
> + vmlal.s32 q1,d11,d4
> + vmlal.s32 q1,d6,d1
> + vmlal.s32 q1,d17,d0
> + vmlal.s32 q1,d8,d23
> + vmull.s32 q14,d10,d6
> + vmlal.s32 q14,d11,d13
> + vmlal.s32 q14,d4,d4
> + vmlal.s32 q14,d17,d1
> + vmlal.s32 q14,d18,d0
> + vmlal.s32 q14,d9,d23
> + vmull.s32 q11,d10,d7
> + vmlal.s32 q11,d11,d6
> + vmlal.s32 q11,d12,d5
> + vmlal.s32 q11,d8,d1
> + vmlal.s32 q11,d19,d0
> + vmull.s32 q15,d10,d8
> + vmlal.s32 q15,d11,d17
> + vmlal.s32 q15,d12,d6
> + vmlal.s32 q15,d13,d5
> + vmlal.s32 q15,d19,d1
> + vmlal.s32 q15,d14,d0
> + vmull.s32 q2,d10,d9
> + vmlal.s32 q2,d11,d8
> + vmlal.s32 q2,d12,d7
> + vmlal.s32 q2,d13,d6
> + vmlal.s32 q2,d14,d1
> + vmull.s32 q0,d15,d1
> + vmlal.s32 q0,d10,d14
> + vmlal.s32 q0,d11,d19
> + vmlal.s32 q0,d12,d8
> + vmlal.s32 q0,d13,d17
> + vmlal.s32 q0,d6,d6
> + add r2,sp,#512
> + vld1.8 {d18-d19},[r2,: 128]
> + vmull.s32 q3,d16,d7
> + vmlal.s32 q3,d10,d15
> + vmlal.s32 q3,d11,d14
> + vmlal.s32 q3,d12,d9
> + vmlal.s32 q3,d13,d8
> + add r2,sp,#528
> + vld1.8 {d8-d9},[r2,: 128]
> + vadd.i64 q5,q12,q9
> + vadd.i64 q6,q15,q9
> + vshr.s64 q5,q5,#26
> + vshr.s64 q6,q6,#26
> + vadd.i64 q7,q10,q5
> + vshl.i64 q5,q5,#26
> + vadd.i64 q8,q7,q4
> + vadd.i64 q2,q2,q6
> + vshl.i64 q6,q6,#26
> + vadd.i64 q10,q2,q4
> + vsub.i64 q5,q12,q5
> + vshr.s64 q8,q8,#25
> + vsub.i64 q6,q15,q6
> + vshr.s64 q10,q10,#25
> + vadd.i64 q12,q13,q8
> + vshl.i64 q8,q8,#25
> + vadd.i64 q13,q12,q9
> + vadd.i64 q0,q0,q10
> + vsub.i64 q7,q7,q8
> + vshr.s64 q8,q13,#26
> + vshl.i64 q10,q10,#25
> + vadd.i64 q13,q0,q9
> + vadd.i64 q1,q1,q8
> + vshl.i64 q8,q8,#26
> + vadd.i64 q15,q1,q4
> + vsub.i64 q2,q2,q10
> + vshr.s64 q10,q13,#26
> + vsub.i64 q8,q12,q8
> + vshr.s64 q12,q15,#25
> + vadd.i64 q3,q3,q10
> + vshl.i64 q10,q10,#26
> + vadd.i64 q13,q3,q4
> + vadd.i64 q14,q14,q12
> + add r2,r3,#144
> + vshl.i64 q12,q12,#25
> + add r4,r3,#192
> + vadd.i64 q15,q14,q9
> + add r2,r2,#8
> + vsub.i64 q0,q0,q10
> + add r4,r4,#8
> + vshr.s64 q10,q13,#25
> + vsub.i64 q1,q1,q12
> + vshr.s64 q12,q15,#26
> + vadd.i64 q13,q10,q10
> + vadd.i64 q11,q11,q12
> + vtrn.32 d16,d2
> + vshl.i64 q12,q12,#26
> + vtrn.32 d17,d3
> + vadd.i64 q1,q11,q4
> + vadd.i64 q4,q5,q13
> + vst1.8 d16,[r2,: 64]!
> + vshl.i64 q5,q10,#4
> + vst1.8 d17,[r4,: 64]!
> + vsub.i64 q8,q14,q12
> + vshr.s64 q1,q1,#25
> + vadd.i64 q4,q4,q5
> + vadd.i64 q5,q6,q1
> + vshl.i64 q1,q1,#25
> + vadd.i64 q6,q5,q9
> + vadd.i64 q4,q4,q10
> + vshl.i64 q10,q10,#25
> + vadd.i64 q9,q4,q9
> + vsub.i64 q1,q11,q1
> + vshr.s64 q6,q6,#26
> + vsub.i64 q3,q3,q10
> + vtrn.32 d16,d2
> + vshr.s64 q9,q9,#26
> + vtrn.32 d17,d3
> + vadd.i64 q1,q2,q6
> + vst1.8 d16,[r2,: 64]
> + vshl.i64 q2,q6,#26
> + vst1.8 d17,[r4,: 64]
> + vadd.i64 q6,q7,q9
> + vtrn.32 d0,d6
> + vshl.i64 q7,q9,#26
> + vtrn.32 d1,d7
> + vsub.i64 q2,q5,q2
> + add r2,r2,#16
> + vsub.i64 q3,q4,q7
> + vst1.8 d0,[r2,: 64]
> + add r4,r4,#16
> + vst1.8 d1,[r4,: 64]
> + vtrn.32 d4,d2
> + vtrn.32 d5,d3
> + sub r2,r2,#8
> + sub r4,r4,#8
> + vtrn.32 d6,d12
> + vtrn.32 d7,d13
> + vst1.8 d4,[r2,: 64]
> + vst1.8 d5,[r4,: 64]
> + sub r2,r2,#24
> + sub r4,r4,#24
> + vst1.8 d6,[r2,: 64]
> + vst1.8 d7,[r4,: 64]
> + add r2,r3,#336
> + add r4,r3,#288
> + vld1.8 {d0-d1},[r2,: 128]!
> + vld1.8 {d2-d3},[r4,: 128]!
> + vadd.i32 q0,q0,q1
> + vld1.8 {d2-d3},[r2,: 128]!
> + vld1.8 {d4-d5},[r4,: 128]!
> + vadd.i32 q1,q1,q2
> + add r5,r3,#288
> + vld1.8 {d4},[r2,: 64]
> + vld1.8 {d6},[r4,: 64]
> + vadd.i32 q2,q2,q3
> + vst1.8 {d0-d1},[r5,: 128]!
> + vst1.8 {d2-d3},[r5,: 128]!
> + vst1.8 d4,[r5,: 64]
> + add r2,r3,#48
> + add r4,r3,#144
> + vld1.8 {d0-d1},[r4,: 128]!
> + vld1.8 {d2-d3},[r4,: 128]!
> + vld1.8 {d4},[r4,: 64]
> + add r4,r3,#288
> + vld1.8 {d6-d7},[r4,: 128]!
> + vtrn.32 q0,q3
> + vld1.8 {d8-d9},[r4,: 128]!
> + vshl.i32 q5,q0,#4
> + vtrn.32 q1,q4
> + vshl.i32 q6,q3,#4
> + vadd.i32 q5,q5,q0
> + vadd.i32 q6,q6,q3
> + vshl.i32 q7,q1,#4
> + vld1.8 {d5},[r4,: 64]
> + vshl.i32 q8,q4,#4
> + vtrn.32 d4,d5
> + vadd.i32 q7,q7,q1
> + vadd.i32 q8,q8,q4
> + vld1.8 {d18-d19},[r2,: 128]!
> + vshl.i32 q10,q2,#4
> + vld1.8 {d22-d23},[r2,: 128]!
> + vadd.i32 q10,q10,q2
> + vld1.8 {d24},[r2,: 64]
> + vadd.i32 q5,q5,q0
> + add r2,r3,#240
> + vld1.8 {d26-d27},[r2,: 128]!
> + vadd.i32 q6,q6,q3
> + vld1.8 {d28-d29},[r2,: 128]!
> + vadd.i32 q8,q8,q4
> + vld1.8 {d25},[r2,: 64]
> + vadd.i32 q10,q10,q2
> + vtrn.32 q9,q13
> + vadd.i32 q7,q7,q1
> + vadd.i32 q5,q5,q0
> + vtrn.32 q11,q14
> + vadd.i32 q6,q6,q3
> + add r2,sp,#560
> + vadd.i32 q10,q10,q2
> + vtrn.32 d24,d25
> + vst1.8 {d12-d13},[r2,: 128]
> + vshl.i32 q6,q13,#1
> + add r2,sp,#576
> + vst1.8 {d20-d21},[r2,: 128]
> + vshl.i32 q10,q14,#1
> + add r2,sp,#592
> + vst1.8 {d12-d13},[r2,: 128]
> + vshl.i32 q15,q12,#1
> + vadd.i32 q8,q8,q4
> + vext.32 d10,d31,d30,#0
> + vadd.i32 q7,q7,q1
> + add r2,sp,#608
> + vst1.8 {d16-d17},[r2,: 128]
> + vmull.s32 q8,d18,d5
> + vmlal.s32 q8,d26,d4
> + vmlal.s32 q8,d19,d9
> + vmlal.s32 q8,d27,d3
> + vmlal.s32 q8,d22,d8
> + vmlal.s32 q8,d28,d2
> + vmlal.s32 q8,d23,d7
> + vmlal.s32 q8,d29,d1
> + vmlal.s32 q8,d24,d6
> + vmlal.s32 q8,d25,d0
> + add r2,sp,#624
> + vst1.8 {d14-d15},[r2,: 128]
> + vmull.s32 q2,d18,d4
> + vmlal.s32 q2,d12,d9
> + vmlal.s32 q2,d13,d8
> + vmlal.s32 q2,d19,d3
> + vmlal.s32 q2,d22,d2
> + vmlal.s32 q2,d23,d1
> + vmlal.s32 q2,d24,d0
> + add r2,sp,#640
> + vst1.8 {d20-d21},[r2,: 128]
> + vmull.s32 q7,d18,d9
> + vmlal.s32 q7,d26,d3
> + vmlal.s32 q7,d19,d8
> + vmlal.s32 q7,d27,d2
> + vmlal.s32 q7,d22,d7
> + vmlal.s32 q7,d28,d1
> + vmlal.s32 q7,d23,d6
> + vmlal.s32 q7,d29,d0
> + add r2,sp,#656
> + vst1.8 {d10-d11},[r2,: 128]
> + vmull.s32 q5,d18,d3
> + vmlal.s32 q5,d19,d2
> + vmlal.s32 q5,d22,d1
> + vmlal.s32 q5,d23,d0
> + vmlal.s32 q5,d12,d8
> + add r2,sp,#672
> + vst1.8 {d16-d17},[r2,: 128]
> + vmull.s32 q4,d18,d8
> + vmlal.s32 q4,d26,d2
> + vmlal.s32 q4,d19,d7
> + vmlal.s32 q4,d27,d1
> + vmlal.s32 q4,d22,d6
> + vmlal.s32 q4,d28,d0
> + vmull.s32 q8,d18,d7
> + vmlal.s32 q8,d26,d1
> + vmlal.s32 q8,d19,d6
> + vmlal.s32 q8,d27,d0
> + add r2,sp,#576
> + vld1.8 {d20-d21},[r2,: 128]
> + vmlal.s32 q7,d24,d21
> + vmlal.s32 q7,d25,d20
> + vmlal.s32 q4,d23,d21
> + vmlal.s32 q4,d29,d20
> + vmlal.s32 q8,d22,d21
> + vmlal.s32 q8,d28,d20
> + vmlal.s32 q5,d24,d20
> + add r2,sp,#576
> + vst1.8 {d14-d15},[r2,: 128]
> + vmull.s32 q7,d18,d6
> + vmlal.s32 q7,d26,d0
> + add r2,sp,#656
> + vld1.8 {d30-d31},[r2,: 128]
> + vmlal.s32 q2,d30,d21
> + vmlal.s32 q7,d19,d21
> + vmlal.s32 q7,d27,d20
> + add r2,sp,#624
> + vld1.8 {d26-d27},[r2,: 128]
> + vmlal.s32 q4,d25,d27
> + vmlal.s32 q8,d29,d27
> + vmlal.s32 q8,d25,d26
> + vmlal.s32 q7,d28,d27
> + vmlal.s32 q7,d29,d26
> + add r2,sp,#608
> + vld1.8 {d28-d29},[r2,: 128]
> + vmlal.s32 q4,d24,d29
> + vmlal.s32 q8,d23,d29
> + vmlal.s32 q8,d24,d28
> + vmlal.s32 q7,d22,d29
> + vmlal.s32 q7,d23,d28
> + add r2,sp,#608
> + vst1.8 {d8-d9},[r2,: 128]
> + add r2,sp,#560
> + vld1.8 {d8-d9},[r2,: 128]
> + vmlal.s32 q7,d24,d9
> + vmlal.s32 q7,d25,d31
> + vmull.s32 q1,d18,d2
> + vmlal.s32 q1,d19,d1
> + vmlal.s32 q1,d22,d0
> + vmlal.s32 q1,d24,d27
> + vmlal.s32 q1,d23,d20
> + vmlal.s32 q1,d12,d7
> + vmlal.s32 q1,d13,d6
> + vmull.s32 q6,d18,d1
> + vmlal.s32 q6,d19,d0
> + vmlal.s32 q6,d23,d27
> + vmlal.s32 q6,d22,d20
> + vmlal.s32 q6,d24,d26
> + vmull.s32 q0,d18,d0
> + vmlal.s32 q0,d22,d27
> + vmlal.s32 q0,d23,d26
> + vmlal.s32 q0,d24,d31
> + vmlal.s32 q0,d19,d20
> + add r2,sp,#640
> + vld1.8 {d18-d19},[r2,: 128]
> + vmlal.s32 q2,d18,d7
> + vmlal.s32 q2,d19,d6
> + vmlal.s32 q5,d18,d6
> + vmlal.s32 q5,d19,d21
> + vmlal.s32 q1,d18,d21
> + vmlal.s32 q1,d19,d29
> + vmlal.s32 q0,d18,d28
> + vmlal.s32 q0,d19,d9
> + vmlal.s32 q6,d18,d29
> + vmlal.s32 q6,d19,d28
> + add r2,sp,#592
> + vld1.8 {d18-d19},[r2,: 128]
> + add r2,sp,#512
> + vld1.8 {d22-d23},[r2,: 128]
> + vmlal.s32 q5,d19,d7
> + vmlal.s32 q0,d18,d21
> + vmlal.s32 q0,d19,d29
> + vmlal.s32 q6,d18,d6
> + add r2,sp,#528
> + vld1.8 {d6-d7},[r2,: 128]
> + vmlal.s32 q6,d19,d21
> + add r2,sp,#576
> + vld1.8 {d18-d19},[r2,: 128]
> + vmlal.s32 q0,d30,d8
> + add r2,sp,#672
> + vld1.8 {d20-d21},[r2,: 128]
> + vmlal.s32 q5,d30,d29
> + add r2,sp,#608
> + vld1.8 {d24-d25},[r2,: 128]
> + vmlal.s32 q1,d30,d28
> + vadd.i64 q13,q0,q11
> + vadd.i64 q14,q5,q11
> + vmlal.s32 q6,d30,d9
> + vshr.s64 q4,q13,#26
> + vshr.s64 q13,q14,#26
> + vadd.i64 q7,q7,q4
> + vshl.i64 q4,q4,#26
> + vadd.i64 q14,q7,q3
> + vadd.i64 q9,q9,q13
> + vshl.i64 q13,q13,#26
> + vadd.i64 q15,q9,q3
> + vsub.i64 q0,q0,q4
> + vshr.s64 q4,q14,#25
> + vsub.i64 q5,q5,q13
> + vshr.s64 q13,q15,#25
> + vadd.i64 q6,q6,q4
> + vshl.i64 q4,q4,#25
> + vadd.i64 q14,q6,q11
> + vadd.i64 q2,q2,q13
> + vsub.i64 q4,q7,q4
> + vshr.s64 q7,q14,#26
> + vshl.i64 q13,q13,#25
> + vadd.i64 q14,q2,q11
> + vadd.i64 q8,q8,q7
> + vshl.i64 q7,q7,#26
> + vadd.i64 q15,q8,q3
> + vsub.i64 q9,q9,q13
> + vshr.s64 q13,q14,#26
> + vsub.i64 q6,q6,q7
> + vshr.s64 q7,q15,#25
> + vadd.i64 q10,q10,q13
> + vshl.i64 q13,q13,#26
> + vadd.i64 q14,q10,q3
> + vadd.i64 q1,q1,q7
> + add r2,r3,#240
> + vshl.i64 q7,q7,#25
> + add r4,r3,#144
> + vadd.i64 q15,q1,q11
> + add r2,r2,#8
> + vsub.i64 q2,q2,q13
> + add r4,r4,#8
> + vshr.s64 q13,q14,#25
> + vsub.i64 q7,q8,q7
> + vshr.s64 q8,q15,#26
> + vadd.i64 q14,q13,q13
> + vadd.i64 q12,q12,q8
> + vtrn.32 d12,d14
> + vshl.i64 q8,q8,#26
> + vtrn.32 d13,d15
> + vadd.i64 q3,q12,q3
> + vadd.i64 q0,q0,q14
> + vst1.8 d12,[r2,: 64]!
> + vshl.i64 q7,q13,#4
> + vst1.8 d13,[r4,: 64]!
> + vsub.i64 q1,q1,q8
> + vshr.s64 q3,q3,#25
> + vadd.i64 q0,q0,q7
> + vadd.i64 q5,q5,q3
> + vshl.i64 q3,q3,#25
> + vadd.i64 q6,q5,q11
> + vadd.i64 q0,q0,q13
> + vshl.i64 q7,q13,#25
> + vadd.i64 q8,q0,q11
> + vsub.i64 q3,q12,q3
> + vshr.s64 q6,q6,#26
> + vsub.i64 q7,q10,q7
> + vtrn.32 d2,d6
> + vshr.s64 q8,q8,#26
> + vtrn.32 d3,d7
> + vadd.i64 q3,q9,q6
> + vst1.8 d2,[r2,: 64]
> + vshl.i64 q6,q6,#26
> + vst1.8 d3,[r4,: 64]
> + vadd.i64 q1,q4,q8
> + vtrn.32 d4,d14
> + vshl.i64 q4,q8,#26
> + vtrn.32 d5,d15
> + vsub.i64 q5,q5,q6
> + add r2,r2,#16
> + vsub.i64 q0,q0,q4
> + vst1.8 d4,[r2,: 64]
> + add r4,r4,#16
> + vst1.8 d5,[r4,: 64]
> + vtrn.32 d10,d6
> + vtrn.32 d11,d7
> + sub r2,r2,#8
> + sub r4,r4,#8
> + vtrn.32 d0,d2
> + vtrn.32 d1,d3
> + vst1.8 d10,[r2,: 64]
> + vst1.8 d11,[r4,: 64]
> + sub r2,r2,#24
> + sub r4,r4,#24
> + vst1.8 d0,[r2,: 64]
> + vst1.8 d1,[r4,: 64]
> + ldr r2,[sp,#488]
> + ldr r4,[sp,#492]
> + subs r5,r2,#1
> + bge .Lmainloop
> + add r1,r3,#144
> + add r2,r3,#336
> + vld1.8 {d0-d1},[r1,: 128]!
> + vld1.8 {d2-d3},[r1,: 128]!
> + vld1.8 {d4},[r1,: 64]
> + vst1.8 {d0-d1},[r2,: 128]!
> + vst1.8 {d2-d3},[r2,: 128]!
> + vst1.8 d4,[r2,: 64]
> + ldr r1,=0
> + .Linvertloop:
> + add r2,r3,#144
> + ldr r4,=0
> + ldr r5,=2
> + cmp r1,#1
> + ldreq r5,=1
> + addeq r2,r3,#336
> + addeq r4,r3,#48
> + cmp r1,#2
> + ldreq r5,=1
> + addeq r2,r3,#48
> + cmp r1,#3
> + ldreq r5,=5
> + addeq r4,r3,#336
> + cmp r1,#4
> + ldreq r5,=10
> + cmp r1,#5
> + ldreq r5,=20
> + cmp r1,#6
> + ldreq r5,=10
> + addeq r2,r3,#336
> + addeq r4,r3,#336
> + cmp r1,#7
> + ldreq r5,=50
> + cmp r1,#8
> + ldreq r5,=100
> + cmp r1,#9
> + ldreq r5,=50
> + addeq r2,r3,#336
> + cmp r1,#10
> + ldreq r5,=5
> + addeq r2,r3,#48
> + cmp r1,#11
> + ldreq r5,=0
> + addeq r2,r3,#96
> + add r6,r3,#144
> + add r7,r3,#288
> + vld1.8 {d0-d1},[r6,: 128]!
> + vld1.8 {d2-d3},[r6,: 128]!
> + vld1.8 {d4},[r6,: 64]
> + vst1.8 {d0-d1},[r7,: 128]!
> + vst1.8 {d2-d3},[r7,: 128]!
> + vst1.8 d4,[r7,: 64]
> + cmp r5,#0
> + beq .Lskipsquaringloop
> + .Lsquaringloop:
> + add r6,r3,#288
> + add r7,r3,#288
> + add r8,r3,#288
> + vmov.i32 q0,#19
> + vmov.i32 q1,#0
> + vmov.i32 q2,#1
> + vzip.i32 q1,q2
> + vld1.8 {d4-d5},[r7,: 128]!
> + vld1.8 {d6-d7},[r7,: 128]!
> + vld1.8 {d9},[r7,: 64]
> + vld1.8 {d10-d11},[r6,: 128]!
> + add r7,sp,#416
> + vld1.8 {d12-d13},[r6,: 128]!
> + vmul.i32 q7,q2,q0
> + vld1.8 {d8},[r6,: 64]
> + vext.32 d17,d11,d10,#1
> + vmul.i32 q9,q3,q0
> + vext.32 d16,d10,d8,#1
> + vshl.u32 q10,q5,q1
> + vext.32 d22,d14,d4,#1
> + vext.32 d24,d18,d6,#1
> + vshl.u32 q13,q6,q1
> + vshl.u32 d28,d8,d2
> + vrev64.i32 d22,d22
> + vmul.i32 d1,d9,d1
> + vrev64.i32 d24,d24
> + vext.32 d29,d8,d13,#1
> + vext.32 d0,d1,d9,#1
> + vrev64.i32 d0,d0
> + vext.32 d2,d9,d1,#1
> + vext.32 d23,d15,d5,#1
> + vmull.s32 q4,d20,d4
> + vrev64.i32 d23,d23
> + vmlal.s32 q4,d21,d1
> + vrev64.i32 d2,d2
> + vmlal.s32 q4,d26,d19
> + vext.32 d3,d5,d15,#1
> + vmlal.s32 q4,d27,d18
> + vrev64.i32 d3,d3
> + vmlal.s32 q4,d28,d15
> + vext.32 d14,d12,d11,#1
> + vmull.s32 q5,d16,d23
> + vext.32 d15,d13,d12,#1
> + vmlal.s32 q5,d17,d4
> + vst1.8 d8,[r7,: 64]!
> + vmlal.s32 q5,d14,d1
> + vext.32 d12,d9,d8,#0
> + vmlal.s32 q5,d15,d19
> + vmov.i64 d13,#0
> + vmlal.s32 q5,d29,d18
> + vext.32 d25,d19,d7,#1
> + vmlal.s32 q6,d20,d5
> + vrev64.i32 d25,d25
> + vmlal.s32 q6,d21,d4
> + vst1.8 d11,[r7,: 64]!
> + vmlal.s32 q6,d26,d1
> + vext.32 d9,d10,d10,#0
> + vmlal.s32 q6,d27,d19
> + vmov.i64 d8,#0
> + vmlal.s32 q6,d28,d18
> + vmlal.s32 q4,d16,d24
> + vmlal.s32 q4,d17,d5
> + vmlal.s32 q4,d14,d4
> + vst1.8 d12,[r7,: 64]!
> + vmlal.s32 q4,d15,d1
> + vext.32 d10,d13,d12,#0
> + vmlal.s32 q4,d29,d19
> + vmov.i64 d11,#0
> + vmlal.s32 q5,d20,d6
> + vmlal.s32 q5,d21,d5
> + vmlal.s32 q5,d26,d4
> + vext.32 d13,d8,d8,#0
> + vmlal.s32 q5,d27,d1
> + vmov.i64 d12,#0
> + vmlal.s32 q5,d28,d19
> + vst1.8 d9,[r7,: 64]!
> + vmlal.s32 q6,d16,d25
> + vmlal.s32 q6,d17,d6
> + vst1.8 d10,[r7,: 64]
> + vmlal.s32 q6,d14,d5
> + vext.32 d8,d11,d10,#0
> + vmlal.s32 q6,d15,d4
> + vmov.i64 d9,#0
> + vmlal.s32 q6,d29,d1
> + vmlal.s32 q4,d20,d7
> + vmlal.s32 q4,d21,d6
> + vmlal.s32 q4,d26,d5
> + vext.32 d11,d12,d12,#0
> + vmlal.s32 q4,d27,d4
> + vmov.i64 d10,#0
> + vmlal.s32 q4,d28,d1
> + vmlal.s32 q5,d16,d0
> + sub r6,r7,#32
> + vmlal.s32 q5,d17,d7
> + vmlal.s32 q5,d14,d6
> + vext.32 d30,d9,d8,#0
> + vmlal.s32 q5,d15,d5
> + vld1.8 {d31},[r6,: 64]!
> + vmlal.s32 q5,d29,d4
> + vmlal.s32 q15,d20,d0
> + vext.32 d0,d6,d18,#1
> + vmlal.s32 q15,d21,d25
> + vrev64.i32 d0,d0
> + vmlal.s32 q15,d26,d24
> + vext.32 d1,d7,d19,#1
> + vext.32 d7,d10,d10,#0
> + vmlal.s32 q15,d27,d23
> + vrev64.i32 d1,d1
> + vld1.8 {d6},[r6,: 64]
> + vmlal.s32 q15,d28,d22
> + vmlal.s32 q3,d16,d4
> + add r6,r6,#24
> + vmlal.s32 q3,d17,d2
> + vext.32 d4,d31,d30,#0
> + vmov d17,d11
> + vmlal.s32 q3,d14,d1
> + vext.32 d11,d13,d13,#0
> + vext.32 d13,d30,d30,#0
> + vmlal.s32 q3,d15,d0
> + vext.32 d1,d8,d8,#0
> + vmlal.s32 q3,d29,d3
> + vld1.8 {d5},[r6,: 64]
> + sub r6,r6,#16
> + vext.32 d10,d6,d6,#0
> + vmov.i32 q1,#0xffffffff
> + vshl.i64 q4,q1,#25
> + add r7,sp,#512
> + vld1.8 {d14-d15},[r7,: 128]
> + vadd.i64 q9,q2,q7
> + vshl.i64 q1,q1,#26
> + vshr.s64 q10,q9,#26
> + vld1.8 {d0},[r6,: 64]!
> + vadd.i64 q5,q5,q10
> + vand q9,q9,q1
> + vld1.8 {d16},[r6,: 64]!
> + add r6,sp,#528
> + vld1.8 {d20-d21},[r6,: 128]
> + vadd.i64 q11,q5,q10
> + vsub.i64 q2,q2,q9
> + vshr.s64 q9,q11,#25
> + vext.32 d12,d5,d4,#0
> + vand q11,q11,q4
> + vadd.i64 q0,q0,q9
> + vmov d19,d7
> + vadd.i64 q3,q0,q7
> + vsub.i64 q5,q5,q11
> + vshr.s64 q11,q3,#26
> + vext.32 d18,d11,d10,#0
> + vand q3,q3,q1
> + vadd.i64 q8,q8,q11
> + vadd.i64 q11,q8,q10
> + vsub.i64 q0,q0,q3
> + vshr.s64 q3,q11,#25
> + vand q11,q11,q4
> + vadd.i64 q3,q6,q3
> + vadd.i64 q6,q3,q7
> + vsub.i64 q8,q8,q11
> + vshr.s64 q11,q6,#26
> + vand q6,q6,q1
> + vadd.i64 q9,q9,q11
> + vadd.i64 d25,d19,d21
> + vsub.i64 q3,q3,q6
> + vshr.s64 d23,d25,#25
> + vand q4,q12,q4
> + vadd.i64 d21,d23,d23
> + vshl.i64 d25,d23,#4
> + vadd.i64 d21,d21,d23
> + vadd.i64 d25,d25,d21
> + vadd.i64 d4,d4,d25
> + vzip.i32 q0,q8
> + vadd.i64 d12,d4,d14
> + add r6,r8,#8
> + vst1.8 d0,[r6,: 64]
> + vsub.i64 d19,d19,d9
> + add r6,r6,#16
> + vst1.8 d16,[r6,: 64]
> + vshr.s64 d22,d12,#26
> + vand q0,q6,q1
> + vadd.i64 d10,d10,d22
> + vzip.i32 q3,q9
> + vsub.i64 d4,d4,d0
> + sub r6,r6,#8
> + vst1.8 d6,[r6,: 64]
> + add r6,r6,#16
> + vst1.8 d18,[r6,: 64]
> + vzip.i32 q2,q5
> + sub r6,r6,#32
> + vst1.8 d4,[r6,: 64]
> + subs r5,r5,#1
> + bhi .Lsquaringloop
> + .Lskipsquaringloop:
> + mov r2,r2
> + add r5,r3,#288
> + add r6,r3,#144
> + vmov.i32 q0,#19
> + vmov.i32 q1,#0
> + vmov.i32 q2,#1
> + vzip.i32 q1,q2
> + vld1.8 {d4-d5},[r5,: 128]!
> + vld1.8 {d6-d7},[r5,: 128]!
> + vld1.8 {d9},[r5,: 64]
> + vld1.8 {d10-d11},[r2,: 128]!
> + add r5,sp,#416
> + vld1.8 {d12-d13},[r2,: 128]!
> + vmul.i32 q7,q2,q0
> + vld1.8 {d8},[r2,: 64]
> + vext.32 d17,d11,d10,#1
> + vmul.i32 q9,q3,q0
> + vext.32 d16,d10,d8,#1
> + vshl.u32 q10,q5,q1
> + vext.32 d22,d14,d4,#1
> + vext.32 d24,d18,d6,#1
> + vshl.u32 q13,q6,q1
> + vshl.u32 d28,d8,d2
> + vrev64.i32 d22,d22
> + vmul.i32 d1,d9,d1
> + vrev64.i32 d24,d24
> + vext.32 d29,d8,d13,#1
> + vext.32 d0,d1,d9,#1
> + vrev64.i32 d0,d0
> + vext.32 d2,d9,d1,#1
> + vext.32 d23,d15,d5,#1
> + vmull.s32 q4,d20,d4
> + vrev64.i32 d23,d23
> + vmlal.s32 q4,d21,d1
> + vrev64.i32 d2,d2
> + vmlal.s32 q4,d26,d19
> + vext.32 d3,d5,d15,#1
> + vmlal.s32 q4,d27,d18
> + vrev64.i32 d3,d3
> + vmlal.s32 q4,d28,d15
> + vext.32 d14,d12,d11,#1
> + vmull.s32 q5,d16,d23
> + vext.32 d15,d13,d12,#1
> + vmlal.s32 q5,d17,d4
> + vst1.8 d8,[r5,: 64]!
> + vmlal.s32 q5,d14,d1
> + vext.32 d12,d9,d8,#0
> + vmlal.s32 q5,d15,d19
> + vmov.i64 d13,#0
> + vmlal.s32 q5,d29,d18
> + vext.32 d25,d19,d7,#1
> + vmlal.s32 q6,d20,d5
> + vrev64.i32 d25,d25
> + vmlal.s32 q6,d21,d4
> + vst1.8 d11,[r5,: 64]!
> + vmlal.s32 q6,d26,d1
> + vext.32 d9,d10,d10,#0
> + vmlal.s32 q6,d27,d19
> + vmov.i64 d8,#0
> + vmlal.s32 q6,d28,d18
> + vmlal.s32 q4,d16,d24
> + vmlal.s32 q4,d17,d5
> + vmlal.s32 q4,d14,d4
> + vst1.8 d12,[r5,: 64]!
> + vmlal.s32 q4,d15,d1
> + vext.32 d10,d13,d12,#0
> + vmlal.s32 q4,d29,d19
> + vmov.i64 d11,#0
> + vmlal.s32 q5,d20,d6
> + vmlal.s32 q5,d21,d5
> + vmlal.s32 q5,d26,d4
> + vext.32 d13,d8,d8,#0
> + vmlal.s32 q5,d27,d1
> + vmov.i64 d12,#0
> + vmlal.s32 q5,d28,d19
> + vst1.8 d9,[r5,: 64]!
> + vmlal.s32 q6,d16,d25
> + vmlal.s32 q6,d17,d6
> + vst1.8 d10,[r5,: 64]
> + vmlal.s32 q6,d14,d5
> + vext.32 d8,d11,d10,#0
> + vmlal.s32 q6,d15,d4
> + vmov.i64 d9,#0
> + vmlal.s32 q6,d29,d1
> + vmlal.s32 q4,d20,d7
> + vmlal.s32 q4,d21,d6
> + vmlal.s32 q4,d26,d5
> + vext.32 d11,d12,d12,#0
> + vmlal.s32 q4,d27,d4
> + vmov.i64 d10,#0
> + vmlal.s32 q4,d28,d1
> + vmlal.s32 q5,d16,d0
> + sub r2,r5,#32
> + vmlal.s32 q5,d17,d7
> + vmlal.s32 q5,d14,d6
> + vext.32 d30,d9,d8,#0
> + vmlal.s32 q5,d15,d5
> + vld1.8 {d31},[r2,: 64]!
> + vmlal.s32 q5,d29,d4
> + vmlal.s32 q15,d20,d0
> + vext.32 d0,d6,d18,#1
> + vmlal.s32 q15,d21,d25
> + vrev64.i32 d0,d0
> + vmlal.s32 q15,d26,d24
> + vext.32 d1,d7,d19,#1
> + vext.32 d7,d10,d10,#0
> + vmlal.s32 q15,d27,d23
> + vrev64.i32 d1,d1
> + vld1.8 {d6},[r2,: 64]
> + vmlal.s32 q15,d28,d22
> + vmlal.s32 q3,d16,d4
> + add r2,r2,#24
> + vmlal.s32 q3,d17,d2
> + vext.32 d4,d31,d30,#0
> + vmov d17,d11
> + vmlal.s32 q3,d14,d1
> + vext.32 d11,d13,d13,#0
> + vext.32 d13,d30,d30,#0
> + vmlal.s32 q3,d15,d0
> + vext.32 d1,d8,d8,#0
> + vmlal.s32 q3,d29,d3
> + vld1.8 {d5},[r2,: 64]
> + sub r2,r2,#16
> + vext.32 d10,d6,d6,#0
> + vmov.i32 q1,#0xffffffff
> + vshl.i64 q4,q1,#25
> + add r5,sp,#512
> + vld1.8 {d14-d15},[r5,: 128]
> + vadd.i64 q9,q2,q7
> + vshl.i64 q1,q1,#26
> + vshr.s64 q10,q9,#26
> + vld1.8 {d0},[r2,: 64]!
> + vadd.i64 q5,q5,q10
> + vand q9,q9,q1
> + vld1.8 {d16},[r2,: 64]!
> + add r2,sp,#528
> + vld1.8 {d20-d21},[r2,: 128]
> + vadd.i64 q11,q5,q10
> + vsub.i64 q2,q2,q9
> + vshr.s64 q9,q11,#25
> + vext.32 d12,d5,d4,#0
> + vand q11,q11,q4
> + vadd.i64 q0,q0,q9
> + vmov d19,d7
> + vadd.i64 q3,q0,q7
> + vsub.i64 q5,q5,q11
> + vshr.s64 q11,q3,#26
> + vext.32 d18,d11,d10,#0
> + vand q3,q3,q1
> + vadd.i64 q8,q8,q11
> + vadd.i64 q11,q8,q10
> + vsub.i64 q0,q0,q3
> + vshr.s64 q3,q11,#25
> + vand q11,q11,q4
> + vadd.i64 q3,q6,q3
> + vadd.i64 q6,q3,q7
> + vsub.i64 q8,q8,q11
> + vshr.s64 q11,q6,#26
> + vand q6,q6,q1
> + vadd.i64 q9,q9,q11
> + vadd.i64 d25,d19,d21
> + vsub.i64 q3,q3,q6
> + vshr.s64 d23,d25,#25
> + vand q4,q12,q4
> + vadd.i64 d21,d23,d23
> + vshl.i64 d25,d23,#4
> + vadd.i64 d21,d21,d23
> + vadd.i64 d25,d25,d21
> + vadd.i64 d4,d4,d25
> + vzip.i32 q0,q8
> + vadd.i64 d12,d4,d14
> + add r2,r6,#8
> + vst1.8 d0,[r2,: 64]
> + vsub.i64 d19,d19,d9
> + add r2,r2,#16
> + vst1.8 d16,[r2,: 64]
> + vshr.s64 d22,d12,#26
> + vand q0,q6,q1
> + vadd.i64 d10,d10,d22
> + vzip.i32 q3,q9
> + vsub.i64 d4,d4,d0
> + sub r2,r2,#8
> + vst1.8 d6,[r2,: 64]
> + add r2,r2,#16
> + vst1.8 d18,[r2,: 64]
> + vzip.i32 q2,q5
> + sub r2,r2,#32
> + vst1.8 d4,[r2,: 64]
> + cmp r4,#0
> + beq .Lskippostcopy
> + add r2,r3,#144
> + mov r4,r4
> + vld1.8 {d0-d1},[r2,: 128]!
> + vld1.8 {d2-d3},[r2,: 128]!
> + vld1.8 {d4},[r2,: 64]
> + vst1.8 {d0-d1},[r4,: 128]!
> + vst1.8 {d2-d3},[r4,: 128]!
> + vst1.8 d4,[r4,: 64]
> + .Lskippostcopy:
> + cmp r1,#1
> + bne .Lskipfinalcopy
> + add r2,r3,#288
> + add r4,r3,#144
> + vld1.8 {d0-d1},[r2,: 128]!
> + vld1.8 {d2-d3},[r2,: 128]!
> + vld1.8 {d4},[r2,: 64]
> + vst1.8 {d0-d1},[r4,: 128]!
> + vst1.8 {d2-d3},[r4,: 128]!
> + vst1.8 d4,[r4,: 64]
> + .Lskipfinalcopy:
> + add r1,r1,#1
> + cmp r1,#12
> + blo .Linvertloop
> + add r1,r3,#144
> + ldr r2,[r1],#4
> + ldr r3,[r1],#4
> + ldr r4,[r1],#4
> + ldr r5,[r1],#4
> + ldr r6,[r1],#4
> + ldr r7,[r1],#4
> + ldr r8,[r1],#4
> + ldr r9,[r1],#4
> + ldr r10,[r1],#4
> + ldr r1,[r1]
> + add r11,r1,r1,LSL #4
> + add r11,r11,r1,LSL #1
> + add r11,r11,#16777216
> + mov r11,r11,ASR #25
> + add r11,r11,r2
> + mov r11,r11,ASR #26
> + add r11,r11,r3
> + mov r11,r11,ASR #25
> + add r11,r11,r4
> + mov r11,r11,ASR #26
> + add r11,r11,r5
> + mov r11,r11,ASR #25
> + add r11,r11,r6
> + mov r11,r11,ASR #26
> + add r11,r11,r7
> + mov r11,r11,ASR #25
> + add r11,r11,r8
> + mov r11,r11,ASR #26
> + add r11,r11,r9
> + mov r11,r11,ASR #25
> + add r11,r11,r10
> + mov r11,r11,ASR #26
> + add r11,r11,r1
> + mov r11,r11,ASR #25
> + add r2,r2,r11
> + add r2,r2,r11,LSL #1
> + add r2,r2,r11,LSL #4
> + mov r11,r2,ASR #26
> + add r3,r3,r11
> + sub r2,r2,r11,LSL #26
> + mov r11,r3,ASR #25
> + add r4,r4,r11
> + sub r3,r3,r11,LSL #25
> + mov r11,r4,ASR #26
> + add r5,r5,r11
> + sub r4,r4,r11,LSL #26
> + mov r11,r5,ASR #25
> + add r6,r6,r11
> + sub r5,r5,r11,LSL #25
> + mov r11,r6,ASR #26
> + add r7,r7,r11
> + sub r6,r6,r11,LSL #26
> + mov r11,r7,ASR #25
> + add r8,r8,r11
> + sub r7,r7,r11,LSL #25
> + mov r11,r8,ASR #26
> + add r9,r9,r11
> + sub r8,r8,r11,LSL #26
> + mov r11,r9,ASR #25
> + add r10,r10,r11
> + sub r9,r9,r11,LSL #25
> + mov r11,r10,ASR #26
> + add r1,r1,r11
> + sub r10,r10,r11,LSL #26
> + mov r11,r1,ASR #25
> + sub r1,r1,r11,LSL #25
> + add r2,r2,r3,LSL #26
> + mov r3,r3,LSR #6
> + add r3,r3,r4,LSL #19
> + mov r4,r4,LSR #13
> + add r4,r4,r5,LSL #13
> + mov r5,r5,LSR #19
> + add r5,r5,r6,LSL #6
> + add r6,r7,r8,LSL #25
> + mov r7,r8,LSR #7
> + add r7,r7,r9,LSL #19
> + mov r8,r9,LSR #13
> + add r8,r8,r10,LSL #12
> + mov r9,r10,LSR #20
> + add r1,r9,r1,LSL #6
> + str r2,[r0],#4
> + str r3,[r0],#4
> + str r4,[r0],#4
> + str r5,[r0],#4
> + str r6,[r0],#4
> + str r7,[r0],#4
> + str r8,[r0],#4
> + str r1,[r0]
Why the post-increment? Although it hardly matters here, this should
be more efficient:
str r2, [r0]
str r3, [r0, #4]
str r4, [r0, #8]
str r5, [r0, #12]
str r6, [r0, #16]
str r7, [r0, #20]
str r8, [r0, #24]
str r1, [r0, #28]
> + ldrd r4,[sp,#0]
> + ldrd r6,[sp,#8]
> + ldrd r8,[sp,#16]
> + ldrd r10,[sp,#24]
> + ldr r12,[sp,#480]
> + ldr r14,[sp,#484]
> + ldr r0,=0
> + mov sp,r12
> + vpop {q4,q5,q6,q7}
Drop this vpop
> + bx lr
> +ENDPROC(curve25519_neon)
> +#endif
> --
> 2.18.0
>
^ permalink raw reply
* confusing comment, explanation of @IFF_RUNNING in if.h
From: Robert P. J. Day @ 2018-08-26 8:13 UTC (permalink / raw)
To: Linux kernel netdev mailing list
more annoying pedantry ... from include/uapi/linux/if.h:
* @IFF_RUNNING: interface RFC2863 OPER_UP. Volatile.
however, both the code in net/core/dev.c:
/**
* netif_oper_up - test if device is operational
* @dev: network device
*
* Check if carrier is operational
*/
static inline bool netif_oper_up(const struct net_device *dev)
{
return (dev->operstate == IF_OPER_UP ||
dev->operstate == IF_OPER_UNKNOWN /* backward compat */);
}
and the explanation in operstates.txt:
ifinfomsg::if_flags & IFF_RUNNING:
Interface is in RFC2863 operational state UP or UNKNOWN.
suggest IFF_RUNNING represents *either* of the operational states UP
or UNKNOWN, not just UP as the comment in if.h claims. is this
misleading? or is this a deliberate explanation somehow taking into
account that the UNKNOWN state is for backward compatibility (whatever
that means)?
i ask since, in my testing, when the interface should have been up,
the attribute file "operstate" for that interface showed "unknown",
and i wondered how worried i should be about that.
rday
--
========================================================================
Robert P. J. Day Ottawa, Ontario, CANADA
http://crashcourse.ca/dokuwiki
Twitter: http://twitter.com/rpjday
LinkedIn: http://ca.linkedin.com/in/rpjday
========================================================================
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox